From 6283bf039d63be5b41a7b37d3e6b028ae67a1065 Mon Sep 17 00:00:00 2001
From: Douglas Lovell
Date: Tue, 27 Oct 2015 16:55:35 -0600
Subject: [PATCH 01/20] first pass refactoring for word transitions
---
src/main/java/org/ahocorasick/trie/State.java | 30 ++--
.../java/org/ahocorasick/trie/Transition.java | 36 ++++
src/main/java/org/ahocorasick/trie/Trie.java | 157 +++++++++++++-----
.../java/org/ahocorasick/trie/TrieConfig.java | 12 +-
4 files changed, 180 insertions(+), 55 deletions(-)
create mode 100644 src/main/java/org/ahocorasick/trie/Transition.java
diff --git a/src/main/java/org/ahocorasick/trie/State.java b/src/main/java/org/ahocorasick/trie/State.java
index 82d167b..e104732 100644
--- a/src/main/java/org/ahocorasick/trie/State.java
+++ b/src/main/java/org/ahocorasick/trie/State.java
@@ -10,8 +10,8 @@ import java.util.*;
*
*
*
- * - success; when a character points to another state, it must return that state
- * - failure; when a character has no matching state, the algorithm must be able to fall back on a
+ * - success; when a transition points to another state, it must return that state
+ * - failure; when a transition has no matching state, the algorithm must be able to fall back on a
* state with less depth
* - emits; when this state is passed and keywords have been matched, the matches must be
* 'emitted' so that they can be used later on.
@@ -19,7 +19,7 @@ import java.util.*;
*
*
* The root state is special in the sense that it has no failure state; it cannot fail. If it 'fails'
- * it will still parse the next character and start from the root node. This ensures that the algorithm
+ * it will still parse the next transition and start from the root node. This ensures that the algorithm
* always runs. All other states always have a fail state.
*
*
@@ -35,9 +35,9 @@ public class State {
/**
* referred to in the white paper as the 'goto' structure. From a state it is possible to go
- * to other states, depending on the character passed.
+ * to other states, depending on the transition passed.
*/
- private Map success = new HashMap();
+ private final Map success = new HashMap<>();
/** if no matching states are found, the failure state will be returned */
private State failure = null;
@@ -54,27 +54,27 @@ public class State {
this.rootState = depth == 0 ? this : null;
}
- private State nextState(Character character, boolean ignoreRootState) {
- State nextState = this.success.get(character);
+ private State nextState(Transition t, boolean ignoreRootState) {
+ State nextState = this.success.get(t);
if (!ignoreRootState && nextState == null && this.rootState != null) {
nextState = this.rootState;
}
return nextState;
}
- public State nextState(Character character) {
- return nextState(character, false);
+ public State nextState(Transition t) {
+ return nextState(t, false);
}
- public State nextStateIgnoreRootState(Character character) {
- return nextState(character, true);
+ public State nextStateIgnoreRootState(Transition t) {
+ return nextState(t, true);
}
- public State addState(Character character) {
- State nextState = nextStateIgnoreRootState(character);
+ public State addState(Transition t) {
+ State nextState = nextStateIgnoreRootState(t);
if (nextState == null) {
nextState = new State(this.depth+1);
- this.success.put(character, nextState);
+ this.success.put(t, nextState);
}
return nextState;
}
@@ -119,7 +119,7 @@ public class State {
return this.success.values();
}
- public Collection getTransitions() {
+ public Collection getTransitions() {
return this.success.keySet();
}
diff --git a/src/main/java/org/ahocorasick/trie/Transition.java b/src/main/java/org/ahocorasick/trie/Transition.java
new file mode 100644
index 0000000..99776b0
--- /dev/null
+++ b/src/main/java/org/ahocorasick/trie/Transition.java
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2015 Rogue Wave Software.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.ahocorasick.trie;
+
+/**
+ * Enables the trie to model transitions on whole words or characters
+ * ... or whatever!
+ * @author doug.lovell
+ * @param
+ */
+public class Transition {
+ private final T token;
+ public Transition(T token) {
+ this.token = token;
+ }
+ public T transitionToken() {
+ return token;
+ }
+ public boolean isWordSeparator() {
+ return (!(token instanceof Character) ||
+ Character.isSpaceChar((Character)token));
+ }
+}
diff --git a/src/main/java/org/ahocorasick/trie/Trie.java b/src/main/java/org/ahocorasick/trie/Trie.java
index 3c5403d..d2ada77 100644
--- a/src/main/java/org/ahocorasick/trie/Trie.java
+++ b/src/main/java/org/ahocorasick/trie/Trie.java
@@ -19,27 +19,112 @@ import java.util.concurrent.LinkedBlockingDeque;
*/
public class Trie {
- private TrieConfig trieConfig;
+ private final TrieConfig trieConfig;
- private State rootState;
+ private final State rootState;
private Trie(TrieConfig trieConfig) {
this.trieConfig = trieConfig;
this.rootState = new State();
}
+
+ private interface KeywordTokenizer {
+ public Transition nextTransition();
+ }
+
+ private class WordTokenizer implements KeywordTokenizer {
+ private final java.util.StringTokenizer st;
+ private boolean lastWasSpace = true;
+ public WordTokenizer(String keyword) {
+ st = new java.util.StringTokenizer(keyword);
+ }
+ @Override
+ public Transition nextTransition() {
+ Transition t;
+ if (lastWasSpace) {
+ t = new Transition<>(st.nextToken());
+ } else {
+ t = new Transition<>(' ');
+ }
+ lastWasSpace = !lastWasSpace;
+ return t;
+ }
+ }
+
+ private class CharacterTokenizer implements KeywordTokenizer {
+ private final java.text.StringCharacterIterator ct;
+ public CharacterTokenizer(String keyword) {
+ ct = new java.text.StringCharacterIterator(keyword);
+ }
+ @Override
+ public Transition nextTransition() {
+ return new Transition<>(ct.next());
+ }
+ }
+
+ private KeywordTokenizer keywordTokenizer(String keyword) {
+ KeywordTokenizer kwt;
+ if (trieConfig.hasOnlyWordNodes()) {
+ kwt = new WordTokenizer(keyword);
+ }
+ else {
+ kwt = new CharacterTokenizer(keyword);
+ }
+ return kwt;
+ }
+ private class TokenStream {
+ private final KeywordTokenizer kwt;
+ private Transition lookahead;
+ private final StringBuffer input = new StringBuffer();
+
+ public TokenStream(KeywordTokenizer kwt) {
+ this.kwt = kwt;
+ }
+
+ public Transition nextTransition() {
+ Transition next = lookahead;
+ if (next == null) {
+ next = kwt.nextTransition();
+ }
+ else {
+ lookahead = null;
+ }
+ if (next != null) {
+ input.append(next.transitionToken().toString());
+ }
+ return next;
+ }
+
+ public int position() {
+ return input.length();
+ }
+
+ public boolean isWholeWord(int start) {
+ if (lookahead == null) {
+ lookahead = kwt.nextTransition();
+ }
+ return ((start == 0 ||
+ Character.isSpaceChar(input.codePointAt(start))) &&
+ (lookahead == null || lookahead.isWordSeparator()));
+ }
+ }
+
private void addKeyword(String keyword) {
if (keyword == null || keyword.length() == 0) {
return;
}
- State currentState = this.rootState;
- for (Character character : keyword.toCharArray()) {
- if (trieConfig.isCaseInsensitive()) {
- character = Character.toLowerCase(character);
- }
- currentState = currentState.addState(character);
+ if (trieConfig.isCaseInsensitive()) {
+ keyword = keyword.toLowerCase();
}
- currentState.addEmit(trieConfig.isCaseInsensitive() ? keyword.toLowerCase() : keyword);
+ State currentState = this.rootState;
+ KeywordTokenizer tknz = keywordTokenizer(keyword);
+ Transition tn = tknz.nextTransition();
+ while (tn != null) {
+ currentState = currentState.addState(tn);
+ tn = tknz.nextTransition();
+ }
+ currentState.addEmit(keyword);
}
public Collection tokenize(String text) {
@@ -53,10 +138,10 @@ public class Trie {
return emitHandler.getEmits();
}
- public boolean containsMatch(CharSequence text) {
- Emit firstMatch = firstMatch(text);
- return firstMatch != null;
- }
+ public boolean containsMatch(CharSequence text) {
+ Emit firstMatch = firstMatch(text);
+ return firstMatch != null;
+ }
public Emit firstMatch(CharSequence text) {
FirstMatchHandler emitHandler = new FirstMatchHandler();
@@ -72,41 +157,35 @@ public class Trie {
final EmitCandidateFlushHandler flushHandler = new EmitCandidateFlushHandler(emitHandler, emitCandidateHolder);
+ String input = text.toString();
+ if (trieConfig.isCaseInsensitive()) {
+ input = input.toLowerCase();
+ }
+ TokenStream tknz = new TokenStream(keywordTokenizer(input));
+
State currentState = this.rootState;
- for (int position = 0; position < text.length(); position++) {
-
- if (flushHandler.stop()) {
- return;
- }
-
- Character character = text.charAt(position);
- if (trieConfig.isCaseInsensitive()) {
- character = Character.toLowerCase(character);
- }
- currentState = getState(currentState, character, flushHandler);
-
+ Transition tn = tknz.nextTransition();
+ while (tn != null) {
+ currentState = getState(currentState, tn, flushHandler);
+
Collection emits = currentState.emit();
for (String emit : emits) {
- int start = position - emit.length() + 1;
- if (!trieConfig.isOnlyWholeWords() || isWholeWord(text, start, position)) {
+ int position = tknz.position();
+ int start = tknz.position() - emit.length() + 1;
+ if (!trieConfig.isOnlyWholeWords() || tknz.isWholeWord(start)) {
emitCandidateHolder.addCandidate(new Emit(start, position, emit));
}
}
-
+ tn = tknz.nextTransition();
}
flushHandler.flush();
}
- public static boolean isWholeWord(CharSequence text, int start, int end) {
- return (start == 0 || Character.isWhitespace(text.charAt(start - 1))) &&
- (end == text.length() - 1 || Character.isWhitespace(text.charAt(end + 1)));
- }
-
- private State getState(State currentState, Character character, EmitCandidateFlushHandler flushHandler) {
- State newCurrentState = currentState.nextState(character);
+ private State getState(State currentState, Transition transition, EmitCandidateFlushHandler flushHandler) {
+ State newCurrentState = currentState.nextState(transition);
while (newCurrentState == null) {
currentState = currentState.failure(flushHandler);
- newCurrentState = currentState.nextState(character);
+ newCurrentState = currentState.nextState(transition);
}
return newCurrentState;
}
@@ -124,7 +203,7 @@ public class Trie {
while (!queue.isEmpty()) {
State currentState = queue.remove();
- for (Character transition : currentState.getTransitions()) {
+ for (Transition transition : currentState.getTransitions()) {
State targetState = currentState.nextState(transition);
queue.add(targetState);
@@ -145,9 +224,9 @@ public class Trie {
public static class TrieBuilder {
- private TrieConfig trieConfig = new TrieConfig();
+ private final TrieConfig trieConfig = new TrieConfig();
- private Trie trie = new Trie(trieConfig);
+ private final Trie trie = new Trie(trieConfig);
private TrieBuilder() {}
diff --git a/src/main/java/org/ahocorasick/trie/TrieConfig.java b/src/main/java/org/ahocorasick/trie/TrieConfig.java
index 6fa05c7..60a0b02 100644
--- a/src/main/java/org/ahocorasick/trie/TrieConfig.java
+++ b/src/main/java/org/ahocorasick/trie/TrieConfig.java
@@ -7,6 +7,8 @@ public class TrieConfig {
private boolean onlyWholeWords = false;
private boolean caseInsensitive = false;
+
+ private boolean wordNodes = false;
public boolean isAllowOverlaps() {
return allowOverlaps;
@@ -17,7 +19,7 @@ public class TrieConfig {
}
public boolean isOnlyWholeWords() {
- return onlyWholeWords;
+ return wordNodes || onlyWholeWords;
}
public void setOnlyWholeWords(boolean onlyWholeWords) {
@@ -31,4 +33,12 @@ public class TrieConfig {
public void setCaseInsensitive(boolean caseInsensitive) {
this.caseInsensitive = caseInsensitive;
}
+
+ public boolean hasOnlyWordNodes() {
+ return wordNodes;
+ }
+
+ public void setOnlyWordNodes(boolean wordNodes) {
+ this.wordNodes = wordNodes;
+ }
}
From f05026cf90b1fa51c0d54cc4d027eaf01a5cd78a Mon Sep 17 00:00:00 2001
From: Douglas Lovell
Date: Wed, 28 Oct 2015 10:59:45 -0600
Subject: [PATCH 02/20] added word transitions. all tests pass
---
.../ahocorasick/trie/CharacterTransition.java | 38 +++++++++++++++++
src/main/java/org/ahocorasick/trie/Emit.java | 13 +++++-
.../java/org/ahocorasick/trie/MatchToken.java | 7 +---
.../java/org/ahocorasick/trie/Tokenizer.java | 3 +-
.../java/org/ahocorasick/trie/Transition.java | 10 ++---
src/main/java/org/ahocorasick/trie/Trie.java | 31 +++++++-------
.../org/ahocorasick/trie/WordTransition.java | 41 +++++++++++++++++++
.../java/org/ahocorasick/trie/StateTest.java | 16 ++++----
8 files changed, 121 insertions(+), 38 deletions(-)
create mode 100644 src/main/java/org/ahocorasick/trie/CharacterTransition.java
create mode 100644 src/main/java/org/ahocorasick/trie/WordTransition.java
diff --git a/src/main/java/org/ahocorasick/trie/CharacterTransition.java b/src/main/java/org/ahocorasick/trie/CharacterTransition.java
new file mode 100644
index 0000000..acff751
--- /dev/null
+++ b/src/main/java/org/ahocorasick/trie/CharacterTransition.java
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2015 Rogue Wave Software.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.ahocorasick.trie;
+
+/**
+ * Model transitions on characters
+ * @author doug.lovell
+ */
+class CharacterTransition extends Transition {
+
+ public CharacterTransition(Character c) {
+ super(c);
+ }
+
+ @Override
+ public void updateMatch(StringBuffer match) {
+ match.append(token);
+ }
+
+ @Override
+ public boolean isWordSeparator() {
+ return Character.isSpaceChar(token);
+ }
+
+}
diff --git a/src/main/java/org/ahocorasick/trie/Emit.java b/src/main/java/org/ahocorasick/trie/Emit.java
index 60c1f9e..16b6ae3 100644
--- a/src/main/java/org/ahocorasick/trie/Emit.java
+++ b/src/main/java/org/ahocorasick/trie/Emit.java
@@ -6,15 +6,26 @@ import org.ahocorasick.interval.Intervalable;
public class Emit extends Interval implements Intervalable {
private final String keyword;
+ private final boolean isWholeWord;
- public Emit(final int start, final int end, final String keyword) {
+ public Emit(final int start, final int end,
+ final String keyword, boolean isWholeWord) {
super(start, end);
this.keyword = keyword;
+ this.isWholeWord = isWholeWord;
+ }
+
+ public Emit(final int start, final int end, final String keyword) {
+ this(start, end, keyword, true);
}
public String getKeyword() {
return this.keyword;
}
+
+ public boolean isWholeWord() {
+ return isWholeWord;
+ }
@Override
public String toString() {
diff --git a/src/main/java/org/ahocorasick/trie/MatchToken.java b/src/main/java/org/ahocorasick/trie/MatchToken.java
index 5becc64..d0ba56c 100644
--- a/src/main/java/org/ahocorasick/trie/MatchToken.java
+++ b/src/main/java/org/ahocorasick/trie/MatchToken.java
@@ -2,19 +2,16 @@ package org.ahocorasick.trie;
public class MatchToken extends Token {
- private final boolean wholeWord;
-
private final Emit emit;
- public MatchToken(String fragment, Emit emit, boolean wholeWord) {
+ public MatchToken(String fragment, Emit emit) {
super(fragment);
this.emit = emit;
- this.wholeWord = wholeWord;
}
@Override
public boolean isWholeWord() {
- return wholeWord;
+ return emit.isWholeWord();
}
@Override
diff --git a/src/main/java/org/ahocorasick/trie/Tokenizer.java b/src/main/java/org/ahocorasick/trie/Tokenizer.java
index b45c438..d0dbe43 100644
--- a/src/main/java/org/ahocorasick/trie/Tokenizer.java
+++ b/src/main/java/org/ahocorasick/trie/Tokenizer.java
@@ -40,8 +40,7 @@ public class Tokenizer {
private Token createMatch(Emit emit, String text) {
return new MatchToken(
text.substring(emit.getStart(), emit.getEnd()+1),
- emit,
- Trie.isWholeWord(this.text, emit.getStart(), emit.getEnd()));
+ emit);
}
}
diff --git a/src/main/java/org/ahocorasick/trie/Transition.java b/src/main/java/org/ahocorasick/trie/Transition.java
index 99776b0..9fa374c 100644
--- a/src/main/java/org/ahocorasick/trie/Transition.java
+++ b/src/main/java/org/ahocorasick/trie/Transition.java
@@ -21,16 +21,14 @@ package org.ahocorasick.trie;
* @author doug.lovell
* @param
*/
-public class Transition {
- private final T token;
+public abstract class Transition {
+ protected final T token;
public Transition(T token) {
this.token = token;
}
public T transitionToken() {
return token;
}
- public boolean isWordSeparator() {
- return (!(token instanceof Character) ||
- Character.isSpaceChar((Character)token));
- }
+ public abstract void updateMatch(StringBuffer match);
+ public abstract boolean isWordSeparator();
}
diff --git a/src/main/java/org/ahocorasick/trie/Trie.java b/src/main/java/org/ahocorasick/trie/Trie.java
index d2ada77..1ada938 100644
--- a/src/main/java/org/ahocorasick/trie/Trie.java
+++ b/src/main/java/org/ahocorasick/trie/Trie.java
@@ -32,22 +32,16 @@ public class Trie {
public Transition nextTransition();
}
+
+
private class WordTokenizer implements KeywordTokenizer {
private final java.util.StringTokenizer st;
- private boolean lastWasSpace = true;
public WordTokenizer(String keyword) {
st = new java.util.StringTokenizer(keyword);
}
@Override
public Transition nextTransition() {
- Transition t;
- if (lastWasSpace) {
- t = new Transition<>(st.nextToken());
- } else {
- t = new Transition<>(' ');
- }
- lastWasSpace = !lastWasSpace;
- return t;
+ return new WordTransition(st.nextToken());
}
}
@@ -58,7 +52,7 @@ public class Trie {
}
@Override
public Transition nextTransition() {
- return new Transition<>(ct.next());
+ return new CharacterTransition(ct.next());
}
}
@@ -76,7 +70,7 @@ public class Trie {
private class TokenStream {
private final KeywordTokenizer kwt;
private Transition lookahead;
- private final StringBuffer input = new StringBuffer();
+ private final StringBuffer match = new StringBuffer();
public TokenStream(KeywordTokenizer kwt) {
this.kwt = kwt;
@@ -91,13 +85,13 @@ public class Trie {
lookahead = null;
}
if (next != null) {
- input.append(next.transitionToken().toString());
+ next.updateMatch(match);
}
return next;
}
public int position() {
- return input.length();
+ return match.length();
}
public boolean isWholeWord(int start) {
@@ -105,7 +99,7 @@ public class Trie {
lookahead = kwt.nextTransition();
}
return ((start == 0 ||
- Character.isSpaceChar(input.codePointAt(start))) &&
+ Character.isSpaceChar(match.codePointAt(start))) &&
(lookahead == null || lookahead.isWordSeparator()));
}
}
@@ -155,7 +149,8 @@ public class Trie {
new OverlappingEmitCandidateHolder() :
new NonOverlappingEmitCandidateHolder();
- final EmitCandidateFlushHandler flushHandler = new EmitCandidateFlushHandler(emitHandler, emitCandidateHolder);
+ final EmitCandidateFlushHandler flushHandler =
+ new EmitCandidateFlushHandler(emitHandler, emitCandidateHolder);
String input = text.toString();
if (trieConfig.isCaseInsensitive()) {
@@ -172,8 +167,10 @@ public class Trie {
for (String emit : emits) {
int position = tknz.position();
int start = tknz.position() - emit.length() + 1;
- if (!trieConfig.isOnlyWholeWords() || tknz.isWholeWord(start)) {
- emitCandidateHolder.addCandidate(new Emit(start, position, emit));
+ boolean isWholeWord = tknz.isWholeWord(start);
+ if (isWholeWord || !trieConfig.isOnlyWholeWords()) {
+ emitCandidateHolder.addCandidate(
+ new Emit(start, position, emit, isWholeWord));
}
}
tn = tknz.nextTransition();
diff --git a/src/main/java/org/ahocorasick/trie/WordTransition.java b/src/main/java/org/ahocorasick/trie/WordTransition.java
new file mode 100644
index 0000000..335988f
--- /dev/null
+++ b/src/main/java/org/ahocorasick/trie/WordTransition.java
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2015 Rogue Wave Software.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.ahocorasick.trie;
+
+/**
+ * Model transitions on words
+ * @author doug.lovell
+ */
+public class WordTransition extends Transition {
+
+ public WordTransition(String s) {
+ super(s);
+ }
+
+ @Override
+ public void updateMatch(StringBuffer match) {
+ if (0 < match.length()) {
+ match.append(' ');
+ }
+ match.append(token);
+ }
+
+ @Override
+ public boolean isWordSeparator() {
+ return true;
+ }
+
+}
diff --git a/src/test/java/org/ahocorasick/trie/StateTest.java b/src/test/java/org/ahocorasick/trie/StateTest.java
index 2a64370..b088bb4 100644
--- a/src/test/java/org/ahocorasick/trie/StateTest.java
+++ b/src/test/java/org/ahocorasick/trie/StateTest.java
@@ -1,6 +1,5 @@
package org.ahocorasick.trie;
-import org.ahocorasick.trie.State;
import org.junit.Test;
import static junit.framework.Assert.assertEquals;
@@ -10,15 +9,18 @@ public class StateTest {
@Test
public void constructSequenceOfCharacters() {
State rootState = new State();
+ Transition a = new CharacterTransition('a');
+ Transition b = new CharacterTransition('b');
+ Transition c = new CharacterTransition('c');
rootState
- .addState('a')
- .addState('b')
- .addState('c');
- State currentState = rootState.nextState('a');
+ .addState(a)
+ .addState(b)
+ .addState(c);
+ State currentState = rootState.nextState(a);
assertEquals(1, currentState.getDepth());
- currentState = currentState.nextState('b');
+ currentState = currentState.nextState(b);
assertEquals(2, currentState.getDepth());
- currentState = currentState.nextState('c');
+ currentState = currentState.nextState(c);
assertEquals(3, currentState.getDepth());
}
From a646f233a5676a9c681b6050cf779a9e47a768b6 Mon Sep 17 00:00:00 2001
From: Douglas Lovell
Date: Wed, 28 Oct 2015 11:17:46 -0600
Subject: [PATCH 03/20] improve the option name
---
src/main/java/org/ahocorasick/trie/Trie.java | 4 +---
src/main/java/org/ahocorasick/trie/TrieConfig.java | 12 ++++++------
2 files changed, 7 insertions(+), 9 deletions(-)
diff --git a/src/main/java/org/ahocorasick/trie/Trie.java b/src/main/java/org/ahocorasick/trie/Trie.java
index 1ada938..deed616 100644
--- a/src/main/java/org/ahocorasick/trie/Trie.java
+++ b/src/main/java/org/ahocorasick/trie/Trie.java
@@ -32,8 +32,6 @@ public class Trie {
public Transition nextTransition();
}
-
-
private class WordTokenizer implements KeywordTokenizer {
private final java.util.StringTokenizer st;
public WordTokenizer(String keyword) {
@@ -58,7 +56,7 @@ public class Trie {
private KeywordTokenizer keywordTokenizer(String keyword) {
KeywordTokenizer kwt;
- if (trieConfig.hasOnlyWordNodes()) {
+ if (trieConfig.hasWordTransitions()) {
kwt = new WordTokenizer(keyword);
}
else {
diff --git a/src/main/java/org/ahocorasick/trie/TrieConfig.java b/src/main/java/org/ahocorasick/trie/TrieConfig.java
index 60a0b02..c7c6f72 100644
--- a/src/main/java/org/ahocorasick/trie/TrieConfig.java
+++ b/src/main/java/org/ahocorasick/trie/TrieConfig.java
@@ -8,7 +8,7 @@ public class TrieConfig {
private boolean caseInsensitive = false;
- private boolean wordNodes = false;
+ private boolean wordTransitions = false;
public boolean isAllowOverlaps() {
return allowOverlaps;
@@ -19,7 +19,7 @@ public class TrieConfig {
}
public boolean isOnlyWholeWords() {
- return wordNodes || onlyWholeWords;
+ return onlyWholeWords;
}
public void setOnlyWholeWords(boolean onlyWholeWords) {
@@ -34,11 +34,11 @@ public class TrieConfig {
this.caseInsensitive = caseInsensitive;
}
- public boolean hasOnlyWordNodes() {
- return wordNodes;
+ public boolean hasWordTransitions() {
+ return wordTransitions;
}
- public void setOnlyWordNodes(boolean wordNodes) {
- this.wordNodes = wordNodes;
+ public void setWordTransitions(boolean wordNodes) {
+ this.wordTransitions = wordNodes;
}
}
From 4be3e115b6e36b8750d1b90f224017d8ce9a6272 Mon Sep 17 00:00:00 2001
From: Douglas Lovell
Date: Wed, 28 Oct 2015 14:10:53 -0600
Subject: [PATCH 04/20] add builder method for setting word transitions
---
src/main/java/org/ahocorasick/trie/Trie.java | 16 ++++++++++++++--
1 file changed, 14 insertions(+), 2 deletions(-)
diff --git a/src/main/java/org/ahocorasick/trie/Trie.java b/src/main/java/org/ahocorasick/trie/Trie.java
index deed616..c8956d4 100644
--- a/src/main/java/org/ahocorasick/trie/Trie.java
+++ b/src/main/java/org/ahocorasick/trie/Trie.java
@@ -222,6 +222,8 @@ public class Trie {
private final TrieConfig trieConfig = new TrieConfig();
private final Trie trie = new Trie(trieConfig);
+
+ private boolean hasAddedKeyword = false;
private TrieBuilder() {}
@@ -240,11 +242,21 @@ public class Trie {
return this;
}
- public TrieBuilder addKeyword(String keyword) {
- trie.addKeyword(keyword);
+ public TrieBuilder wordTransitions() {
+ if (hasAddedKeyword) {
+ throw new IllegalStateException(
+ "Unable to switch to word transitions after keywords added");
+ }
+ this.trieConfig.setWordTransitions(true);
return this;
}
+ public TrieBuilder addKeyword(String keyword) {
+ trie.addKeyword(keyword);
+ hasAddedKeyword = true;
+ return this;
+ }
+
public Trie build() {
trie.constructFailureStates();
return trie;
From 1f63ae71d4f1ee258e83be8eea86242d70d563a7 Mon Sep 17 00:00:00 2001
From: Douglas Lovell
Date: Wed, 28 Oct 2015 16:56:27 -0600
Subject: [PATCH 05/20] StringBuffer, StringIterator are old school
---
.../ahocorasick/trie/CharacterTransition.java | 2 +-
.../java/org/ahocorasick/trie/Transition.java | 2 +-
src/main/java/org/ahocorasick/trie/Trie.java | 30 +++++++++++++++----
.../org/ahocorasick/trie/WordTransition.java | 2 +-
4 files changed, 27 insertions(+), 9 deletions(-)
diff --git a/src/main/java/org/ahocorasick/trie/CharacterTransition.java b/src/main/java/org/ahocorasick/trie/CharacterTransition.java
index acff751..31bf4e8 100644
--- a/src/main/java/org/ahocorasick/trie/CharacterTransition.java
+++ b/src/main/java/org/ahocorasick/trie/CharacterTransition.java
@@ -26,7 +26,7 @@ class CharacterTransition extends Transition {
}
@Override
- public void updateMatch(StringBuffer match) {
+ public void updateMatch(StringBuilder match) {
match.append(token);
}
diff --git a/src/main/java/org/ahocorasick/trie/Transition.java b/src/main/java/org/ahocorasick/trie/Transition.java
index 9fa374c..382213b 100644
--- a/src/main/java/org/ahocorasick/trie/Transition.java
+++ b/src/main/java/org/ahocorasick/trie/Transition.java
@@ -29,6 +29,6 @@ public abstract class Transition {
public T transitionToken() {
return token;
}
- public abstract void updateMatch(StringBuffer match);
+ public abstract void updateMatch(StringBuilder match);
public abstract boolean isWordSeparator();
}
diff --git a/src/main/java/org/ahocorasick/trie/Trie.java b/src/main/java/org/ahocorasick/trie/Trie.java
index c8956d4..1b5e49a 100644
--- a/src/main/java/org/ahocorasick/trie/Trie.java
+++ b/src/main/java/org/ahocorasick/trie/Trie.java
@@ -1,5 +1,7 @@
package org.ahocorasick.trie;
+import java.text.CharacterIterator;
+import java.util.Arrays;
import org.ahocorasick.trie.candidate.EmitCandidateFlushHandler;
import org.ahocorasick.trie.candidate.EmitCandidateHolder;
import org.ahocorasick.trie.candidate.NonOverlappingEmitCandidateHolder;
@@ -9,6 +11,7 @@ import org.ahocorasick.trie.handler.EmitHandler;
import org.ahocorasick.trie.handler.FirstMatchHandler;
import java.util.Collection;
+import java.util.Iterator;
import java.util.Queue;
import java.util.concurrent.LinkedBlockingDeque;
@@ -33,24 +36,37 @@ public class Trie {
}
private class WordTokenizer implements KeywordTokenizer {
- private final java.util.StringTokenizer st;
+ private final Iterator st;
public WordTokenizer(String keyword) {
- st = new java.util.StringTokenizer(keyword);
+ String[] tokens = keyword.split("\\s");
+ st = Arrays.asList(tokens).iterator();
}
@Override
public Transition nextTransition() {
- return new WordTransition(st.nextToken());
+ WordTransition t = null;
+ if (st.hasNext()) {
+ t = new WordTransition(st.next());
+ }
+ return t;
}
}
private class CharacterTokenizer implements KeywordTokenizer {
private final java.text.StringCharacterIterator ct;
+ private char cur;
public CharacterTokenizer(String keyword) {
ct = new java.text.StringCharacterIterator(keyword);
+ cur = ct.first();
}
@Override
public Transition nextTransition() {
- return new CharacterTransition(ct.next());
+ CharacterTransition t = null;
+ if (cur != CharacterIterator.DONE) {
+ t = new CharacterTransition(cur);
+ cur = ct.next();
+ }
+
+ return t;
}
}
@@ -68,7 +84,7 @@ public class Trie {
private class TokenStream {
private final KeywordTokenizer kwt;
private Transition lookahead;
- private final StringBuffer match = new StringBuffer();
+ private final StringBuilder match = new StringBuilder();
public TokenStream(KeywordTokenizer kwt) {
this.kwt = kwt;
@@ -159,8 +175,10 @@ public class Trie {
State currentState = this.rootState;
Transition tn = tknz.nextTransition();
while (tn != null) {
+ if (flushHandler.stop()) {
+ return;
+ }
currentState = getState(currentState, tn, flushHandler);
-
Collection emits = currentState.emit();
for (String emit : emits) {
int position = tknz.position();
diff --git a/src/main/java/org/ahocorasick/trie/WordTransition.java b/src/main/java/org/ahocorasick/trie/WordTransition.java
index 335988f..1d631b5 100644
--- a/src/main/java/org/ahocorasick/trie/WordTransition.java
+++ b/src/main/java/org/ahocorasick/trie/WordTransition.java
@@ -26,7 +26,7 @@ public class WordTransition extends Transition {
}
@Override
- public void updateMatch(StringBuffer match) {
+ public void updateMatch(StringBuilder match) {
if (0 < match.length()) {
match.append(' ');
}
From 51940af6e763b34d006c829b178e58b39224f748 Mon Sep 17 00:00:00 2001
From: Douglas Lovell
Date: Wed, 28 Oct 2015 16:57:37 -0600
Subject: [PATCH 06/20] test state with word transitions
---
.../java/org/ahocorasick/trie/StateTest.java | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)
diff --git a/src/test/java/org/ahocorasick/trie/StateTest.java b/src/test/java/org/ahocorasick/trie/StateTest.java
index b088bb4..7036e44 100644
--- a/src/test/java/org/ahocorasick/trie/StateTest.java
+++ b/src/test/java/org/ahocorasick/trie/StateTest.java
@@ -23,5 +23,23 @@ public class StateTest {
currentState = currentState.nextState(c);
assertEquals(3, currentState.getDepth());
}
+
+ @Test
+ public void constructSequenceOfWords() {
+ State rootState = new State();
+ Transition a = new WordTransition("Alpha");
+ Transition b = new WordTransition("Bravo");
+ Transition c = new WordTransition("Charlie");
+ rootState
+ .addState(a)
+ .addState(b)
+ .addState(c);
+ State currentState = rootState.nextState(a);
+ assertEquals(1, currentState.getDepth());
+ currentState = currentState.nextState(b);
+ assertEquals(2, currentState.getDepth());
+ currentState = currentState.nextState(c);
+ assertEquals(3, currentState.getDepth());
+ }
}
From 8560af8cce381239f2bb6b4e797079efad7c86b9 Mon Sep 17 00:00:00 2001
From: Douglas Lovell
Date: Wed, 28 Oct 2015 17:03:29 -0600
Subject: [PATCH 07/20] test trie in word transition mode
---
.../java/org/ahocorasick/trie/TrieTest.java | 38 ++++++++++++++++++-
1 file changed, 37 insertions(+), 1 deletion(-)
diff --git a/src/test/java/org/ahocorasick/trie/TrieTest.java b/src/test/java/org/ahocorasick/trie/TrieTest.java
index 03f7924..c0c4c97 100644
--- a/src/test/java/org/ahocorasick/trie/TrieTest.java
+++ b/src/test/java/org/ahocorasick/trie/TrieTest.java
@@ -10,8 +10,9 @@ import java.util.Iterator;
import java.util.List;
import static junit.framework.Assert.assertEquals;
-import static junit.framework.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
+import org.junit.Rule;
+import org.junit.rules.ExpectedException;
public class TrieTest {
@@ -395,6 +396,41 @@ public class TrieTest {
assertToken(tokensIt.next(), " in reserve", false, false, false);
}
+ @Test
+ public void tokenizeFullSentenceByWords() {
+ Trie trie = Trie.builder()
+ .wordTransitions()
+ .addKeyword("Alpha")
+ .addKeyword("Beta")
+ .addKeyword("Gamma")
+ .build();
+ Collection<Token> tokens = trie.tokenize("Hear: Alpha team first, Beta from the rear, Gamma in reserve");
+ assertEquals(7, tokens.size());
+ Iterator<Token> tokensIt = tokens.iterator();
+ assertToken(tokensIt.next(), "Hear: ", false, false, false);
+ assertToken(tokensIt.next(), "Alpha", true, true, false);
+ assertToken(tokensIt.next(), " team first, ", false, false, false);
+ assertToken(tokensIt.next(), "Beta", true, true, false);
+ assertToken(tokensIt.next(), " from the rear, ", false, false, false);
+ assertToken(tokensIt.next(), "Gamma", true, true, false);
+ assertToken(tokensIt.next(), " in reserve", false, false, false);
+ }
+
+ @Rule
+ public ExpectedException thrown = ExpectedException.none();
+
+ @Test
+ public void wordTransitionsThrowsExceptionAfterKeywordsAdded()
+ throws IllegalStateException {
+ thrown.expect(IllegalStateException.class);
+ thrown.expectMessage("Unable to switch to word transitions after keywords added");
+ Trie trie = Trie.builder()
+ .addKeyword("Happy for now")
+ .wordTransitions()
+ .addKeyword("Not so happy")
+ .build();
+ }
+
@Test
public void bug5InGithubReportedByXCurry() {
Trie trie = Trie.builder().caseInsensitive().onlyWholeWords()
From 7514478a65b5841d1ec9532f8d95d9863265b9c2 Mon Sep 17 00:00:00 2001
From: Douglas Lovell
Date: Thu, 29 Oct 2015 15:28:23 -0600
Subject: [PATCH 08/20] Make the transition token the hash key for a transition
---
.../java/org/ahocorasick/trie/Transition.java | 27 +++++++++++++++++++
1 file changed, 27 insertions(+)
diff --git a/src/main/java/org/ahocorasick/trie/Transition.java b/src/main/java/org/ahocorasick/trie/Transition.java
index 382213b..d20ecd6 100644
--- a/src/main/java/org/ahocorasick/trie/Transition.java
+++ b/src/main/java/org/ahocorasick/trie/Transition.java
@@ -15,6 +15,8 @@
*/
package org.ahocorasick.trie;
+import java.util.Objects;
+
/**
* Enables the trie to model transitions on whole words or characters
* ... or whatever!
@@ -23,12 +25,37 @@ package org.ahocorasick.trie;
*/
public abstract class Transition<T> {
protected final T token;
+
public Transition(T token) {
this.token = token;
}
+
public T transitionToken() {
return token;
}
+
public abstract void updateMatch(StringBuilder match);
public abstract boolean isWordSeparator();
+
+ @Override
+ public String toString() {
+ return "Transition on " + token;
+ }
+
+ @Override
+ public int hashCode() {
+ return token.hashCode();
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (obj == null) {
+ return false;
+ }
+ if (getClass() != obj.getClass()) {
+ return false;
+ }
+ final Transition<?> other = (Transition<?>) obj;
+ return Objects.equals(this.token, other.token);
+ }
}
From dd5f9b25fa4aff0851ccadd4051bec92c69daca6 Mon Sep 17 00:00:00 2001
From: Douglas Lovell
Date: Thu, 29 Oct 2015 15:29:44 -0600
Subject: [PATCH 09/20] fix the off by ones
---
src/main/java/org/ahocorasick/trie/Trie.java | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/src/main/java/org/ahocorasick/trie/Trie.java b/src/main/java/org/ahocorasick/trie/Trie.java
index 1b5e49a..b6d3ab0 100644
--- a/src/main/java/org/ahocorasick/trie/Trie.java
+++ b/src/main/java/org/ahocorasick/trie/Trie.java
@@ -65,7 +65,6 @@ public class Trie {
t = new CharacterTransition(cur);
cur = ct.next();
}
-
return t;
}
}
@@ -105,7 +104,7 @@ public class Trie {
}
public int position() {
- return match.length();
+ return match.length() - 1;
}
public boolean isWholeWord(int start) {
@@ -113,7 +112,7 @@ public class Trie {
lookahead = kwt.nextTransition();
}
return ((start == 0 ||
- Character.isSpaceChar(match.codePointAt(start))) &&
+ Character.isSpaceChar(match.codePointAt(start-1))) &&
(lookahead == null || lookahead.isWordSeparator()));
}
}
@@ -182,7 +181,7 @@ public class Trie {
Collection<String> emits = currentState.emit();
for (String emit : emits) {
int position = tknz.position();
- int start = tknz.position() - emit.length() + 1;
+ int start = position - emit.length() + 1;
boolean isWholeWord = tknz.isWholeWord(start);
if (isWholeWord || !trieConfig.isOnlyWholeWords()) {
emitCandidateHolder.addCandidate(
From b7bb0cbf5b229569872b23875b7177022e969e32 Mon Sep 17 00:00:00 2001
From: Douglas Lovell
Date: Fri, 30 Oct 2015 10:58:15 -0600
Subject: [PATCH 10/20] put text positions on the transitions and track in the
token stream
---
.../ahocorasick/trie/CharacterTransition.java | 9 ++--
.../java/org/ahocorasick/trie/Transition.java | 15 +++++-
src/main/java/org/ahocorasick/trie/Trie.java | 48 +++++++++++--------
.../org/ahocorasick/trie/WordTransition.java | 13 ++---
4 files changed, 50 insertions(+), 35 deletions(-)
diff --git a/src/main/java/org/ahocorasick/trie/CharacterTransition.java b/src/main/java/org/ahocorasick/trie/CharacterTransition.java
index 31bf4e8..b49019a 100644
--- a/src/main/java/org/ahocorasick/trie/CharacterTransition.java
+++ b/src/main/java/org/ahocorasick/trie/CharacterTransition.java
@@ -21,13 +21,12 @@ package org.ahocorasick.trie;
*/
class CharacterTransition extends Transition<Character> {
- public CharacterTransition(Character c) {
- super(c);
+ public CharacterTransition(Character c, int start) {
+ super(c, start, 1);
}
- @Override
- public void updateMatch(StringBuilder match) {
- match.append(token);
+ public CharacterTransition(Character c) {
+ this(c, 0);
}
@Override
diff --git a/src/main/java/org/ahocorasick/trie/Transition.java b/src/main/java/org/ahocorasick/trie/Transition.java
index d20ecd6..84e15ca 100644
--- a/src/main/java/org/ahocorasick/trie/Transition.java
+++ b/src/main/java/org/ahocorasick/trie/Transition.java
@@ -25,16 +25,27 @@ import java.util.Objects;
*/
public abstract class Transition<T> {
protected final T token;
+ protected final int start;
+ protected final int length;
- public Transition(T token) {
+ public Transition(T token, int start, int length) {
this.token = token;
+ this.start = start;
+ this.length = length;
}
public T transitionToken() {
return token;
}
- public abstract void updateMatch(StringBuilder match);
+ public int getStart() {
+ return start;
+ }
+
+ public int getLength() {
+ return length;
+ }
+
public abstract boolean isWordSeparator();
@Override
diff --git a/src/main/java/org/ahocorasick/trie/Trie.java b/src/main/java/org/ahocorasick/trie/Trie.java
index b6d3ab0..d285382 100644
--- a/src/main/java/org/ahocorasick/trie/Trie.java
+++ b/src/main/java/org/ahocorasick/trie/Trie.java
@@ -31,11 +31,15 @@ public class Trie {
this.rootState = new State();
}
- private interface KeywordTokenizer {
- public Transition nextTransition();
+ private abstract class KeywordTokenizer {
+ protected int position = 0;
+ public abstract Transition nextTransition();
+ public int getPosition() {
+ return position;
+ }
}
- private class WordTokenizer implements KeywordTokenizer {
+ private class WordTokenizer extends KeywordTokenizer {
private final Iterator<String> st;
public WordTokenizer(String keyword) {
String[] tokens = keyword.split("\\s");
@@ -45,13 +49,18 @@ public class Trie {
public Transition nextTransition() {
WordTransition t = null;
if (st.hasNext()) {
- t = new WordTransition(st.next());
+ String word = st.next();
+ t = new WordTransition(word, position);
+ if (0 < position) {
+ position += 1; // a space
+ }
+ position += word.length();
}
return t;
}
}
- private class CharacterTokenizer implements KeywordTokenizer {
+ private class CharacterTokenizer extends KeywordTokenizer {
private final java.text.StringCharacterIterator ct;
private char cur;
public CharacterTokenizer(String keyword) {
@@ -62,9 +71,10 @@ public class Trie {
public Transition nextTransition() {
CharacterTransition t = null;
if (cur != CharacterIterator.DONE) {
- t = new CharacterTransition(cur);
+ t = new CharacterTransition(cur, position);
cur = ct.next();
}
+ position += 1;
return t;
}
}
@@ -82,11 +92,18 @@ public class Trie {
private class TokenStream {
private final KeywordTokenizer kwt;
+ private final String input;
private Transition lookahead;
- private final StringBuilder match = new StringBuilder();
- public TokenStream(KeywordTokenizer kwt) {
- this.kwt = kwt;
+ public TokenStream(String input) {
+ this.input = input;
+ if (trieConfig.hasWordTransitions()) {
+ kwt = new WordTokenizer(input);
+ }
+ else {
+ kwt = new CharacterTokenizer(input);
+ }
+ lookahead = null;
}
public Transition nextTransition() {
@@ -97,22 +114,15 @@ public class Trie {
else {
lookahead = null;
}
- if (next != null) {
- next.updateMatch(match);
- }
return next;
}
- public int position() {
- return match.length() - 1;
- }
-
public boolean isWholeWord(int start) {
if (lookahead == null) {
lookahead = kwt.nextTransition();
}
return ((start == 0 ||
- Character.isSpaceChar(match.codePointAt(start-1))) &&
+ Character.isSpaceChar(input.codePointAt(start-1))) &&
(lookahead == null || lookahead.isWordSeparator()));
}
}
@@ -169,7 +179,7 @@ public class Trie {
if (trieConfig.isCaseInsensitive()) {
input = input.toLowerCase();
}
- TokenStream tknz = new TokenStream(keywordTokenizer(input));
+ TokenStream tknz = new TokenStream(input);
State currentState = this.rootState;
Transition tn = tknz.nextTransition();
@@ -180,7 +190,7 @@ public class Trie {
currentState = getState(currentState, tn, flushHandler);
Collection<String> emits = currentState.emit();
for (String emit : emits) {
- int position = tknz.position();
+ int position = tn.getStart() + tn.getLength() - 1;
int start = position - emit.length() + 1;
boolean isWholeWord = tknz.isWholeWord(start);
if (isWholeWord || !trieConfig.isOnlyWholeWords()) {
diff --git a/src/main/java/org/ahocorasick/trie/WordTransition.java b/src/main/java/org/ahocorasick/trie/WordTransition.java
index 1d631b5..e4f6436 100644
--- a/src/main/java/org/ahocorasick/trie/WordTransition.java
+++ b/src/main/java/org/ahocorasick/trie/WordTransition.java
@@ -21,21 +21,16 @@ package org.ahocorasick.trie;
*/
public class WordTransition extends Transition<String> {
- public WordTransition(String s) {
- super(s);
+ public WordTransition(String s, int start) {
+ super(s, start, s.length());
}
- @Override
- public void updateMatch(StringBuilder match) {
- if (0 < match.length()) {
- match.append(' ');
- }
- match.append(token);
+ public WordTransition(String s) {
+ this(s, 0);
}
@Override
public boolean isWordSeparator() {
return true;
}
-
}
From f9c2d9d4aa7094d5f69ec98c04fbb5e79734f90f Mon Sep 17 00:00:00 2001
From: Douglas Lovell
Date: Fri, 30 Oct 2015 15:03:06 -0600
Subject: [PATCH 11/20] all the tests pass
---
.../ahocorasick/trie/CharacterTransition.java | 2 +-
src/main/java/org/ahocorasick/trie/Trie.java | 96 +++++++++----------
2 files changed, 46 insertions(+), 52 deletions(-)
diff --git a/src/main/java/org/ahocorasick/trie/CharacterTransition.java b/src/main/java/org/ahocorasick/trie/CharacterTransition.java
index b49019a..830f211 100644
--- a/src/main/java/org/ahocorasick/trie/CharacterTransition.java
+++ b/src/main/java/org/ahocorasick/trie/CharacterTransition.java
@@ -31,7 +31,7 @@ class CharacterTransition extends Transition {
@Override
public boolean isWordSeparator() {
- return Character.isSpaceChar(token);
+ return Character.isWhitespace(token);
}
}
diff --git a/src/main/java/org/ahocorasick/trie/Trie.java b/src/main/java/org/ahocorasick/trie/Trie.java
index d285382..9ae8147 100644
--- a/src/main/java/org/ahocorasick/trie/Trie.java
+++ b/src/main/java/org/ahocorasick/trie/Trie.java
@@ -1,7 +1,5 @@
package org.ahocorasick.trie;
-import java.text.CharacterIterator;
-import java.util.Arrays;
import org.ahocorasick.trie.candidate.EmitCandidateFlushHandler;
import org.ahocorasick.trie.candidate.EmitCandidateHolder;
import org.ahocorasick.trie.candidate.NonOverlappingEmitCandidateHolder;
@@ -11,7 +9,6 @@ import org.ahocorasick.trie.handler.EmitHandler;
import org.ahocorasick.trie.handler.FirstMatchHandler;
import java.util.Collection;
-import java.util.Iterator;
import java.util.Queue;
import java.util.concurrent.LinkedBlockingDeque;
@@ -33,6 +30,15 @@ public class Trie {
private abstract class KeywordTokenizer {
protected int position = 0;
+ protected CharSequence input;
+ protected int length;
+ protected KeywordTokenizer(CharSequence input) {
+ this.input = input;
+ this.length = input.length();
+ }
+ protected char currentChar() {
+ return (position < length) ? input.charAt(position) : '\0';
+ }
public abstract Transition nextTransition();
public int getPosition() {
return position;
@@ -40,63 +46,54 @@ public class Trie {
}
private class WordTokenizer extends KeywordTokenizer {
- private final Iterator<String> st;
- public WordTokenizer(String keyword) {
- String[] tokens = keyword.split("\\s");
- st = Arrays.asList(tokens).iterator();
+ public WordTokenizer(CharSequence input) {
+ super(input);
}
@Override
public Transition nextTransition() {
WordTransition t = null;
- if (st.hasNext()) {
- String word = st.next();
- t = new WordTransition(word, position);
- if (0 < position) {
- position += 1; // a space
+ while (position < length && Character.isWhitespace(currentChar())) {
+ ++position;
+ }
+ int start = position;
+ if (start < length) {
+ while (position < length && !Character.isWhitespace(currentChar())) {
+ ++position;
}
- position += word.length();
+ String word = input.subSequence(start, position).toString();
+ t = new WordTransition(word, start);
}
return t;
}
}
private class CharacterTokenizer extends KeywordTokenizer {
- private final java.text.StringCharacterIterator ct;
- private char cur;
- public CharacterTokenizer(String keyword) {
- ct = new java.text.StringCharacterIterator(keyword);
- cur = ct.first();
+ public CharacterTokenizer(CharSequence input) {
+ super(input);
}
@Override
public Transition nextTransition() {
CharacterTransition t = null;
- if (cur != CharacterIterator.DONE) {
- t = new CharacterTransition(cur, position);
- cur = ct.next();
+ if (position < length) {
+ t = new CharacterTransition(currentChar(), position);
+ position += 1;
}
- position += 1;
return t;
}
}
- private KeywordTokenizer keywordTokenizer(String keyword) {
- KeywordTokenizer kwt;
- if (trieConfig.hasWordTransitions()) {
- kwt = new WordTokenizer(keyword);
- }
- else {
- kwt = new CharacterTokenizer(keyword);
- }
- return kwt;
- }
-
private class TokenStream {
private final KeywordTokenizer kwt;
- private final String input;
+ private final StringBuilder input;
private Transition lookahead;
- public TokenStream(String input) {
- this.input = input;
+ public TokenStream(CharSequence text) {
+ input = new StringBuilder(text.length());
+ for (int p = 0; p < text.length(); ++p) {
+ char ch = text.charAt(p);
+ input.append(trieConfig.isCaseInsensitive() ?
+ Character.toLowerCase(ch) : ch);
+ }
if (trieConfig.hasWordTransitions()) {
kwt = new WordTokenizer(input);
}
@@ -122,26 +119,27 @@ public class Trie {
lookahead = kwt.nextTransition();
}
return ((start == 0 ||
- Character.isSpaceChar(input.codePointAt(start-1))) &&
+ Character.isWhitespace(input.charAt(start-1))) &&
(lookahead == null || lookahead.isWordSeparator()));
}
+
+ public String input() {
+ return input.toString();
+ }
}
- private void addKeyword(String keyword) {
+ private void addKeyword(CharSequence keyword) {
if (keyword == null || keyword.length() == 0) {
return;
}
- if (trieConfig.isCaseInsensitive()) {
- keyword = keyword.toLowerCase();
- }
State currentState = this.rootState;
- KeywordTokenizer tknz = keywordTokenizer(keyword);
+ TokenStream tknz = new TokenStream(keyword);
Transition tn = tknz.nextTransition();
while (tn != null) {
currentState = currentState.addState(tn);
tn = tknz.nextTransition();
}
- currentState.addEmit(keyword);
+ currentState.addEmit(tknz.input());
}
public Collection tokenize(String text) {
@@ -175,11 +173,7 @@ public class Trie {
final EmitCandidateFlushHandler flushHandler =
new EmitCandidateFlushHandler(emitHandler, emitCandidateHolder);
- String input = text.toString();
- if (trieConfig.isCaseInsensitive()) {
- input = input.toLowerCase();
- }
- TokenStream tknz = new TokenStream(input);
+ TokenStream tknz = new TokenStream(text);
State currentState = this.rootState;
Transition tn = tknz.nextTransition();
@@ -190,12 +184,12 @@ public class Trie {
currentState = getState(currentState, tn, flushHandler);
Collection<String> emits = currentState.emit();
for (String emit : emits) {
- int position = tn.getStart() + tn.getLength() - 1;
- int start = position - emit.length() + 1;
+ int position = tn.getStart() + tn.getLength();
+ int start = position - emit.length();
boolean isWholeWord = tknz.isWholeWord(start);
if (isWholeWord || !trieConfig.isOnlyWholeWords()) {
emitCandidateHolder.addCandidate(
- new Emit(start, position, emit, isWholeWord));
+ new Emit(start, position - 1, emit, isWholeWord));
}
}
tn = tknz.nextTransition();
From c0d89cec2d0ee21c1ad55122e8d658710738cff4 Mon Sep 17 00:00:00 2001
From: Douglas Lovell
Date: Fri, 30 Oct 2015 16:25:04 -0600
Subject: [PATCH 12/20] add a few word transition tests
---
.../java/org/ahocorasick/trie/TrieTest.java | 39 ++++++++++++++++++-
1 file changed, 38 insertions(+), 1 deletion(-)
diff --git a/src/test/java/org/ahocorasick/trie/TrieTest.java b/src/test/java/org/ahocorasick/trie/TrieTest.java
index c0c4c97..c7b0599 100644
--- a/src/test/java/org/ahocorasick/trie/TrieTest.java
+++ b/src/test/java/org/ahocorasick/trie/TrieTest.java
@@ -79,8 +79,9 @@ public class TrieTest {
}
@Test
- public void variousKeywordsFirstMatch() {
+ public void variousKeywordsFirstMatchWordTransitions() {
Trie trie = Trie.builder()
+ .wordTransitions()
.addKeyword("abc")
.addKeyword("bcd")
.addKeyword("cde")
@@ -194,6 +195,23 @@ public class TrieTest {
checkEmit(iterator.next(), 51, 58, "broccoli");
}
+@Test
+ public void recipesWordTransitions() {
+ Trie trie = Trie.builder()
+ .wordTransitions()
+ .addKeyword("veal")
+ .addKeyword("cauliflower")
+ .addKeyword("broccoli")
+ .addKeyword("tomatoes")
+ .build();
+ Collection<Emit> emits = trie.parseText("2 cauliflower 3 tomatoes 4 slices of veal 100g broccoli");
+ Iterator<Emit> iterator = emits.iterator();
+ checkEmit(iterator.next(), 2, 12, "cauliflower");
+ checkEmit(iterator.next(), 16, 23, "tomatoes");
+ checkEmit(iterator.next(), 37, 40, "veal");
+ checkEmit(iterator.next(), 47, 54, "broccoli");
+ }
+
@Test
public void recipesFirstMatch() {
Trie trie = Trie.builder()
@@ -243,6 +261,25 @@ public class TrieTest {
checkEmit(iterator.next(), 41, 48, "wiel dop");
}
+ @Test
+ public void nonOverlappingWordTransitions() {
+ Trie trie = Trie.builder()
+ .removeOverlaps()
+ .wordTransitions()
+ .addKeyword("peper molen")
+ .addKeyword("molen wiel")
+ .addKeyword("wiel dop")
+ .addKeyword("dop")
+ .build();
+ Collection<Emit> emits = trie.parseText("peper molen wiel dop xwiel dop wiel dopx wiel dop");
+ assertEquals(4, emits.size());
+ Iterator<Emit> iterator = emits.iterator();
+ checkEmit(iterator.next(), 0, 10, "peper molen");
+ checkEmit(iterator.next(), 12, 19, "wiel dop");
+ checkEmit(iterator.next(), 27, 29, "dop");
+ checkEmit(iterator.next(), 41, 48, "wiel dop");
+ }
+
@Test
public void nonOverlappingWholeWordsWithCustomEmitHandler() {
Trie trie = Trie.builder()
From 9aa9695d382ca473fb1b04a1e0bb5c514e526cc1 Mon Sep 17 00:00:00 2001
From: Douglas Lovell
Date: Mon, 2 Nov 2015 11:49:14 -0700
Subject: [PATCH 13/20] version change for heartbeat changes
---
pom.xml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pom.xml b/pom.xml
index 9d964e4..ed7c40b 100644
--- a/pom.xml
+++ b/pom.xml
@@ -3,7 +3,7 @@
org.ahocorasick
ahocorasick
- 0.3.1-SNAPSHOT
+ 0.3.1-heartbeat
jar
Aho-CoraSick algorithm for efficient string matching
Java library for efficient string matching against a large set of keywords
@@ -104,4 +104,4 @@
-
\ No newline at end of file
+
From f89b000894c3080ca1a412026cf0ea9a034966f5 Mon Sep 17 00:00:00 2001
From: Douglas Lovell
Date: Mon, 2 Nov 2015 11:49:44 -0700
Subject: [PATCH 14/20] progress on leading and trailing white space problem
---
src/main/java/org/ahocorasick/trie/Trie.java | 31 +++++++++++
.../java/org/ahocorasick/trie/TrieTest.java | 51 +++++++++++++++++++
2 files changed, 82 insertions(+)
diff --git a/src/main/java/org/ahocorasick/trie/Trie.java b/src/main/java/org/ahocorasick/trie/Trie.java
index 9ae8147..985108c 100644
--- a/src/main/java/org/ahocorasick/trie/Trie.java
+++ b/src/main/java/org/ahocorasick/trie/Trie.java
@@ -48,10 +48,29 @@ public class Trie {
private class WordTokenizer extends KeywordTokenizer {
public WordTokenizer(CharSequence input) {
super(input);
+ System.out.println("WORDTOKENIZER input '" + input + "'");
+ // leading and trailing white space cannot be part of the
+ // search pattern
+ int start = 0;
+ while (start < length && Character.isWhitespace(input.charAt(start))) {
+ ++start;
+ }
+ int end = length - 1;
+ while (start < end && Character.isWhitespace(input.charAt(end))) {
+ --end;
+ }
+ this.input = input.subSequence(start, end + 1);
+ this.length = end - start + 1;
+ System.out.println("WORDTOKENIZER input '" + this.input + "'");
+ System.out.println("input.length " + this.input.length() + ", this.length " + this.length);
}
@Override
public Transition nextTransition() {
WordTransition t = null;
+ System.out.println("WORDTOKENIZER get next word transition");
+ System.out.println("Position: " + position);
+ System.out.println("Text under cursor, '" +
+ input.subSequence(Math.min(length-1, position), Math.min(length, position + 10)) + "'");
while (position < length && Character.isWhitespace(currentChar())) {
++position;
}
@@ -63,6 +82,8 @@ public class Trie {
String word = input.subSequence(start, position).toString();
t = new WordTransition(word, start);
}
+ System.out.println("New position: " + position);
+ System.out.println("Text in transition, '" + (t == null ? "null" : t.transitionToken()) + "'");
return t;
}
}
@@ -74,10 +95,15 @@ public class Trie {
@Override
public Transition nextTransition() {
CharacterTransition t = null;
+ System.out.println("CHARACTERTOKENIZER get next character transition");
+ System.out.println("Position: " + position);
+ System.out.println("Text under cursor, '" + input.subSequence(position, Math.min(length, position + 10)) + "'");
if (position < length) {
t = new CharacterTransition(currentChar(), position);
position += 1;
}
+ System.out.println("New position: " + position);
+ System.out.println("Text in transition, '" + (t == null ? "null" : t.transitionToken()) + "'");
return t;
}
}
@@ -186,6 +212,11 @@ public class Trie {
for (String emit : emits) {
int position = tn.getStart() + tn.getLength();
int start = position - emit.length();
+ if (start < 0) {
+ System.out.println("START < 0 !! at " + position + " on '" + emit + "', length " + emit.length());
+ System.out.println("TEXT is '" + text + "'");
+ System.out.println(tn);
+ }
boolean isWholeWord = tknz.isWholeWord(start);
if (isWholeWord || !trieConfig.isOnlyWholeWords()) {
emitCandidateHolder.addCandidate(
diff --git a/src/test/java/org/ahocorasick/trie/TrieTest.java b/src/test/java/org/ahocorasick/trie/TrieTest.java
index c7b0599..1a5b5f1 100644
--- a/src/test/java/org/ahocorasick/trie/TrieTest.java
+++ b/src/test/java/org/ahocorasick/trie/TrieTest.java
@@ -562,6 +562,22 @@ public class TrieTest {
checkEmit(firstMatch, 5, 8, "this");
}
+ @Test
+ public void unicodeInKeyword() {
+ // The upper case character ('İ') is Unicode,
+ // which was read by AC as a 2-byte char
+ String target = "it is so much LİKE Unicode to mess with Java";
+ Trie trie = Trie.builder()
+ .onlyWholeWords()
+ .addKeyword("so much LİKE Unicode")
+ .addKeyword("it is")
+ .build();
+ Collection<Emit> emits = trie.parseText(target);
+ Iterator<Emit> it = emits.iterator();
+ checkEmit(it.next(), 0, 4, "it is");
+ checkEmit(it.next(), 6, 25, "so much LİKE Unicode");
+ }
+
@Test
public void partialMatchWhiteSpaces() {
Trie trie = Trie.builder()
@@ -573,6 +589,41 @@ public class TrieTest {
checkEmit(emits.iterator().next(), 0, 9, "#sugar-123");
}
+ /*
+ What does "onlyWholeWords" mean when the keyword itself has spaces?
+ @Test
+ public void spacesAroundKeyword() {
+ String keyword = " lorem ipso facto genera linden pharma six 1 ";
+ Trie trie = Trie.builder()
+ .onlyWholeWords()
+ .caseInsensitive()
+ .addKeyword(keyword)
+ .build();
+ Collection < Emit > emits = trie.parseText(
+ "gravita conundrum" + keyword + "under addressed object ");
+ assertEquals(1, emits.size());
+ checkEmit(emits.iterator().next(), 0, keyword.length() + 1, keyword);
+ }
+ */
+
+ /*
+ For wordTransitions, we'll ignore leading and trailing white space
+ included on keywords
+ */
+ @Test
+ public void spacesAroundKeywordByWords() {
+ String keyword = "lorem ipso facto genera linden pharma six 1";
+ Trie trie = Trie.builder()
+ .wordTransitions()
+ .caseInsensitive()
+ .addKeyword(" " + keyword + " ")
+ .build();
+ Collection < Emit > emits = trie.parseText(
+ keyword + " under addressed object ");
+ assertEquals(1, emits.size());
+ checkEmit(emits.iterator().next(), 0, keyword.length(), keyword);
+ }
+
private void assertToken(Token token, String fragment, boolean match, boolean wholeWord, boolean whiteSpace) {
assertEquals(fragment, token.getFragment());
assertEquals(match, token.isMatch());
From d764017abee543edaf0c679a52326a7d7e06f000 Mon Sep 17 00:00:00 2001
From: Douglas Lovell
Date: Mon, 2 Nov 2015 17:14:28 -0700
Subject: [PATCH 15/20] use word transitions for whole word only mode
---
.../ahocorasick/trie/CharacterTransition.java | 5 -
src/main/java/org/ahocorasick/trie/Emit.java | 13 +--
.../org/ahocorasick/trie/FragmentToken.java | 13 ---
.../java/org/ahocorasick/trie/Keyword.java | 54 +++++++++++
.../java/org/ahocorasick/trie/MatchToken.java | 5 -
src/main/java/org/ahocorasick/trie/State.java | 16 +--
src/main/java/org/ahocorasick/trie/Token.java | 10 +-
.../java/org/ahocorasick/trie/Tokenizer.java | 2 +-
.../java/org/ahocorasick/trie/Transition.java | 7 +-
src/main/java/org/ahocorasick/trie/Trie.java | 97 ++++++++-----------
.../java/org/ahocorasick/trie/TrieConfig.java | 10 --
.../org/ahocorasick/trie/WordTransition.java | 5 -
.../java/org/ahocorasick/trie/TrieTest.java | 39 ++------
13 files changed, 120 insertions(+), 156 deletions(-)
create mode 100644 src/main/java/org/ahocorasick/trie/Keyword.java
diff --git a/src/main/java/org/ahocorasick/trie/CharacterTransition.java b/src/main/java/org/ahocorasick/trie/CharacterTransition.java
index 830f211..ca561ee 100644
--- a/src/main/java/org/ahocorasick/trie/CharacterTransition.java
+++ b/src/main/java/org/ahocorasick/trie/CharacterTransition.java
@@ -29,9 +29,4 @@ class CharacterTransition extends Transition {
this(c, 0);
}
- @Override
- public boolean isWordSeparator() {
- return Character.isWhitespace(token);
- }
-
}
diff --git a/src/main/java/org/ahocorasick/trie/Emit.java b/src/main/java/org/ahocorasick/trie/Emit.java
index 16b6ae3..20d73ff 100644
--- a/src/main/java/org/ahocorasick/trie/Emit.java
+++ b/src/main/java/org/ahocorasick/trie/Emit.java
@@ -6,27 +6,16 @@ import org.ahocorasick.interval.Intervalable;
public class Emit extends Interval implements Intervalable {
private final String keyword;
- private final boolean isWholeWord;
- public Emit(final int start, final int end,
- final String keyword, boolean isWholeWord) {
+ public Emit(final int start, final int end, final String keyword) {
super(start, end);
this.keyword = keyword;
- this.isWholeWord = isWholeWord;
}
- public Emit(final int start, final int end, final String keyword) {
- this(start, end, keyword, true);
- }
-
public String getKeyword() {
return this.keyword;
}
- public boolean isWholeWord() {
- return isWholeWord;
- }
-
@Override
public String toString() {
return super.toString() + "=" + this.keyword;
diff --git a/src/main/java/org/ahocorasick/trie/FragmentToken.java b/src/main/java/org/ahocorasick/trie/FragmentToken.java
index f0c899f..37e83d1 100644
--- a/src/main/java/org/ahocorasick/trie/FragmentToken.java
+++ b/src/main/java/org/ahocorasick/trie/FragmentToken.java
@@ -2,16 +2,8 @@ package org.ahocorasick.trie;
public class FragmentToken extends Token {
- private boolean whiteSpace;
-
public FragmentToken(String fragment) {
super(fragment);
- this.whiteSpace = true;
- for (int position = 0; position < fragment.length(); position++) {
- if (!Character.isWhitespace(fragment.charAt(position))) {
- whiteSpace = false;
- }
- }
}
@Override
@@ -24,9 +16,4 @@ public class FragmentToken extends Token {
return null;
}
- @Override
- public boolean isWhiteSpace() {
- return whiteSpace;
- }
-
}
diff --git a/src/main/java/org/ahocorasick/trie/Keyword.java b/src/main/java/org/ahocorasick/trie/Keyword.java
new file mode 100644
index 0000000..1d31728
--- /dev/null
+++ b/src/main/java/org/ahocorasick/trie/Keyword.java
@@ -0,0 +1,54 @@
+/*
+ * Copyright 2015 Rogue Wave Software.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.ahocorasick.trie;
+
+/**
+ *
+ * @author doug.lovell
+ */
+public class Keyword implements Comparable {
+ private final String text;
+ private int depth;
+
+ public Keyword(String text, int depth) {
+ this.text = text;
+ this.depth = depth;
+ }
+
+ public void setDepth(int depth) {
+ this.depth = depth;
+ }
+
+ public int getDepth() {
+ return depth;
+ }
+
+ public String getText() {
+ return text;
+ }
+
+ public String toString() {
+ return "Keyword '" + text + "' at depth " + depth;
+ }
+
+ @Override
+ public int compareTo(Object o) {
+ if (o instanceof Keyword) {
+ return text.compareTo(((Keyword) o).text);
+ }
+ throw new IllegalArgumentException("Only supports comparison with other keywords");
+ }
+}
diff --git a/src/main/java/org/ahocorasick/trie/MatchToken.java b/src/main/java/org/ahocorasick/trie/MatchToken.java
index d0ba56c..9d91693 100644
--- a/src/main/java/org/ahocorasick/trie/MatchToken.java
+++ b/src/main/java/org/ahocorasick/trie/MatchToken.java
@@ -9,11 +9,6 @@ public class MatchToken extends Token {
this.emit = emit;
}
- @Override
- public boolean isWholeWord() {
- return emit.isWholeWord();
- }
-
@Override
public boolean isMatch() {
return true;
diff --git a/src/main/java/org/ahocorasick/trie/State.java b/src/main/java/org/ahocorasick/trie/State.java
index e104732..215a652 100644
--- a/src/main/java/org/ahocorasick/trie/State.java
+++ b/src/main/java/org/ahocorasick/trie/State.java
@@ -43,7 +43,7 @@ public class State {
private State failure = null;
/** whenever this state is reached, it will emit the matches keywords for future reference */
- private Set<String> emits = null;
+ private Set<Keyword> emits = null;
public State() {
this(0);
@@ -83,21 +83,25 @@ public class State {
return this.depth;
}
- public void addEmit(String keyword) {
+ public void addEmit(Keyword keyword) {
if (this.emits == null) {
this.emits = new TreeSet<>();
}
this.emits.add(keyword);
}
- public void addEmit(Collection<String> emits) {
- for (String emit : emits) {
+ public void addEmit(Collection<Keyword> emits) {
+ for (Keyword emit : emits) {
addEmit(emit);
}
}
+
+ public void addEmitString(String key) {
+ addEmit(new Keyword(key, depth));
+ }
- public Collection<String> emit() {
- return this.emits == null ? Collections.<String> emptyList() : this.emits;
+ public Collection<Keyword> emit() {
+ return this.emits == null ? Collections.<Keyword> emptyList() : this.emits;
}
public State failure(EmitCandidateFlushHandler emitCandidateFlushHandler) {
diff --git a/src/main/java/org/ahocorasick/trie/Token.java b/src/main/java/org/ahocorasick/trie/Token.java
index 7ec280f..2e4c72f 100644
--- a/src/main/java/org/ahocorasick/trie/Token.java
+++ b/src/main/java/org/ahocorasick/trie/Token.java
@@ -2,7 +2,7 @@ package org.ahocorasick.trie;
public abstract class Token {
- private String fragment;
+ private final String fragment;
public Token(String fragment) {
this.fragment = fragment;
@@ -14,14 +14,6 @@ public abstract class Token {
public abstract boolean isMatch();
- public boolean isWholeWord() {
- return false;
- }
-
- public boolean isWhiteSpace() {
- return false;
- }
-
public abstract Emit getEmit();
}
diff --git a/src/main/java/org/ahocorasick/trie/Tokenizer.java b/src/main/java/org/ahocorasick/trie/Tokenizer.java
index d0dbe43..9f587c4 100644
--- a/src/main/java/org/ahocorasick/trie/Tokenizer.java
+++ b/src/main/java/org/ahocorasick/trie/Tokenizer.java
@@ -9,7 +9,7 @@ public class Tokenizer {
private final Collection emits;
private final String text;
-
+
public Tokenizer(Collection emits, String text) {
this.emits = emits;
this.text = text;
diff --git a/src/main/java/org/ahocorasick/trie/Transition.java b/src/main/java/org/ahocorasick/trie/Transition.java
index 84e15ca..aa1b863 100644
--- a/src/main/java/org/ahocorasick/trie/Transition.java
+++ b/src/main/java/org/ahocorasick/trie/Transition.java
@@ -23,7 +23,7 @@ import java.util.Objects;
* @author doug.lovell
* @param
*/
-public abstract class Transition {
+public class Transition {
protected final T token;
protected final int start;
protected final int length;
@@ -46,11 +46,10 @@ public abstract class Transition {
return length;
}
- public abstract boolean isWordSeparator();
-
@Override
public String toString() {
- return "Transition on " + token;
+ return "Transition on '" + token + "' start: " + start +
+ ", length: " + length;
}
@Override
diff --git a/src/main/java/org/ahocorasick/trie/Trie.java b/src/main/java/org/ahocorasick/trie/Trie.java
index 985108c..edc2181 100644
--- a/src/main/java/org/ahocorasick/trie/Trie.java
+++ b/src/main/java/org/ahocorasick/trie/Trie.java
@@ -10,6 +10,8 @@ import org.ahocorasick.trie.handler.FirstMatchHandler;
import java.util.Collection;
import java.util.Queue;
+import java.util.LinkedList;
+import java.util.ListIterator;
import java.util.concurrent.LinkedBlockingDeque;
/**
@@ -43,34 +45,16 @@ public class Trie {
public int getPosition() {
return position;
}
+ public abstract Emit match(Keyword kwd, int start, int position);
}
private class WordTokenizer extends KeywordTokenizer {
public WordTokenizer(CharSequence input) {
super(input);
- System.out.println("WORDTOKENIZER input '" + input + "'");
- // leading and trailing white space cannot be part of the
- // search pattern
- int start = 0;
- while (start < length && Character.isWhitespace(input.charAt(start))) {
- ++start;
- }
- int end = length - 1;
- while (start < end && Character.isWhitespace(input.charAt(end))) {
- --end;
- }
- this.input = input.subSequence(start, end + 1);
- this.length = end - start + 1;
- System.out.println("WORDTOKENIZER input '" + this.input + "'");
- System.out.println("input.length " + this.input.length() + ", this.length " + this.length);
}
@Override
public Transition nextTransition() {
WordTransition t = null;
- System.out.println("WORDTOKENIZER get next word transition");
- System.out.println("Position: " + position);
- System.out.println("Text under cursor, '" +
- input.subSequence(Math.min(length-1, position), Math.min(length, position + 10)) + "'");
while (position < length && Character.isWhitespace(currentChar())) {
++position;
}
@@ -82,10 +66,17 @@ public class Trie {
String word = input.subSequence(start, position).toString();
t = new WordTransition(word, start);
}
- System.out.println("New position: " + position);
- System.out.println("Text in transition, '" + (t == null ? "null" : t.transitionToken()) + "'");
return t;
}
+ /*
+ On word matching, we return the matched text, which can be of different
+ length that the keyword, due to whitespace differences.
+ */
+ @Override
+ public Emit match(Keyword kwd, int start, int position) {
+ String matchedText = input.subSequence(start, position).toString();
+ return new Emit(start, position - 1, matchedText);
+ }
}
private class CharacterTokenizer extends KeywordTokenizer {
@@ -95,17 +86,20 @@ public class Trie {
@Override
public Transition nextTransition() {
CharacterTransition t = null;
- System.out.println("CHARACTERTOKENIZER get next character transition");
- System.out.println("Position: " + position);
- System.out.println("Text under cursor, '" + input.subSequence(position, Math.min(length, position + 10)) + "'");
if (position < length) {
t = new CharacterTransition(currentChar(), position);
position += 1;
}
- System.out.println("New position: " + position);
- System.out.println("Text in transition, '" + (t == null ? "null" : t.transitionToken()) + "'");
return t;
}
+ /*
+ On character matching, the tests expect the implementation to
+ return the matched keyword.
+ */
+ @Override
+ public Emit match(Keyword kwd, int start, int position) {
+ return new Emit(start, position - 1, kwd.getText());
+ }
}
private class TokenStream {
@@ -120,7 +114,7 @@ public class Trie {
input.append(trieConfig.isCaseInsensitive() ?
Character.toLowerCase(ch) : ch);
}
- if (trieConfig.hasWordTransitions()) {
+ if (trieConfig.isOnlyWholeWords()) {
kwt = new WordTokenizer(input);
}
else {
@@ -140,18 +134,14 @@ public class Trie {
return next;
}
- public boolean isWholeWord(int start) {
- if (lookahead == null) {
- lookahead = kwt.nextTransition();
- }
- return ((start == 0 ||
- Character.isWhitespace(input.charAt(start-1))) &&
- (lookahead == null || lookahead.isWordSeparator()));
- }
-
public String input() {
return input.toString();
}
+
+ public Emit match(Keyword kwd, int start, int position) {
+ return kwt.match(kwd, start, position);
+ }
+
}
private void addKeyword(CharSequence keyword) {
@@ -165,7 +155,7 @@ public class Trie {
currentState = currentState.addState(tn);
tn = tknz.nextTransition();
}
- currentState.addEmit(tknz.input());
+ currentState.addEmitString(tknz.input());
}
public Collection tokenize(String text) {
@@ -201,27 +191,25 @@ public class Trie {
TokenStream tknz = new TokenStream(text);
+ LinkedList tknHistory = new LinkedList<>();
State currentState = this.rootState;
Transition tn = tknz.nextTransition();
while (tn != null) {
if (flushHandler.stop()) {
return;
}
+ tknHistory.add(tn);
currentState = getState(currentState, tn, flushHandler);
- Collection emits = currentState.emit();
- for (String emit : emits) {
+ Collection emits = currentState.emit();
+ int depth = currentState.getDepth();
+ while (depth < tknHistory.size()) {
+ tknHistory.remove();
+ }
+ for (Keyword emit : emits) {
int position = tn.getStart() + tn.getLength();
- int start = position - emit.length();
- if (start < 0) {
- System.out.println("START < 0 !! at " + position + " on '" + emit + "', length " + emit.length());
- System.out.println("TEXT is '" + text + "'");
- System.out.println(tn);
- }
- boolean isWholeWord = tknz.isWholeWord(start);
- if (isWholeWord || !trieConfig.isOnlyWholeWords()) {
- emitCandidateHolder.addCandidate(
- new Emit(start, position - 1, emit, isWholeWord));
- }
+ int start = tknHistory.get(depth - emit.getDepth()).getStart();
+ ListIterator tns = tknHistory.listIterator();
+ emitCandidateHolder.addCandidate(tknz.match(emit, start, position));
}
tn = tknz.nextTransition();
}
@@ -290,16 +278,11 @@ public class Trie {
}
public TrieBuilder onlyWholeWords() {
- this.trieConfig.setOnlyWholeWords(true);
- return this;
- }
-
- public TrieBuilder wordTransitions() {
if (hasAddedKeyword) {
throw new IllegalStateException(
- "Unable to switch to word transitions after keywords added");
+ "Unable to switch to only whole words after keywords added");
}
- this.trieConfig.setWordTransitions(true);
+ this.trieConfig.setOnlyWholeWords(true);
return this;
}
diff --git a/src/main/java/org/ahocorasick/trie/TrieConfig.java b/src/main/java/org/ahocorasick/trie/TrieConfig.java
index c7c6f72..c556dfa 100644
--- a/src/main/java/org/ahocorasick/trie/TrieConfig.java
+++ b/src/main/java/org/ahocorasick/trie/TrieConfig.java
@@ -8,8 +8,6 @@ public class TrieConfig {
private boolean caseInsensitive = false;
- private boolean wordTransitions = false;
-
public boolean isAllowOverlaps() {
return allowOverlaps;
}
@@ -33,12 +31,4 @@ public class TrieConfig {
public void setCaseInsensitive(boolean caseInsensitive) {
this.caseInsensitive = caseInsensitive;
}
-
- public boolean hasWordTransitions() {
- return wordTransitions;
- }
-
- public void setWordTransitions(boolean wordNodes) {
- this.wordTransitions = wordNodes;
- }
}
diff --git a/src/main/java/org/ahocorasick/trie/WordTransition.java b/src/main/java/org/ahocorasick/trie/WordTransition.java
index e4f6436..f12d0bc 100644
--- a/src/main/java/org/ahocorasick/trie/WordTransition.java
+++ b/src/main/java/org/ahocorasick/trie/WordTransition.java
@@ -28,9 +28,4 @@ public class WordTransition extends Transition {
public WordTransition(String s) {
this(s, 0);
}
-
- @Override
- public boolean isWordSeparator() {
- return true;
- }
}
diff --git a/src/test/java/org/ahocorasick/trie/TrieTest.java b/src/test/java/org/ahocorasick/trie/TrieTest.java
index 1a5b5f1..fcc254a 100644
--- a/src/test/java/org/ahocorasick/trie/TrieTest.java
+++ b/src/test/java/org/ahocorasick/trie/TrieTest.java
@@ -81,7 +81,7 @@ public class TrieTest {
@Test
public void variousKeywordsFirstMatchWordTransitions() {
Trie trie = Trie.builder()
- .wordTransitions()
+ .onlyWholeWords()
.addKeyword("abc")
.addKeyword("bcd")
.addKeyword("cde")
@@ -198,7 +198,7 @@ public class TrieTest {
@Test
public void recipesWordTransitions() {
Trie trie = Trie.builder()
- .wordTransitions()
+ .onlyWholeWords()
.addKeyword("veal")
.addKeyword("cauliflower")
.addKeyword("broccoli")
@@ -265,7 +265,7 @@ public class TrieTest {
public void nonOverlappingWordTransitions() {
Trie trie = Trie.builder()
.removeOverlaps()
- .wordTransitions()
+ .onlyWholeWords()
.addKeyword("peper molen")
.addKeyword("molen wiel")
.addKeyword("wiel dop")
@@ -436,7 +436,7 @@ public class TrieTest {
@Test
public void tokenizeFullSentenceByWords() {
Trie trie = Trie.builder()
- .wordTransitions()
+ .onlyWholeWords()
.addKeyword("Alpha")
.addKeyword("Beta")
.addKeyword("Gamma")
@@ -457,13 +457,13 @@ public class TrieTest {
public ExpectedException thrown = ExpectedException.none();
@Test
- public void wordTransitionsThrowsExceptionAfterKeywordsAdded()
+ public void onlyWholeWordsThrowsExceptionAfterKeywordsAdded()
throws IllegalStateException {
thrown.expect(IllegalStateException.class);
- thrown.expectMessage("Unable to switch to word transitions after keywords added");
+ thrown.expectMessage("Unable to switch to only whole words after keywords added");
Trie trie = Trie.builder()
.addKeyword("Happy for now")
- .wordTransitions()
+ .onlyWholeWords()
.addKeyword("Not so happy")
.build();
}
@@ -589,46 +589,27 @@ public class TrieTest {
checkEmit(emits.iterator().next(), 0, 9, "#sugar-123");
}
- /*
- What does "onlyWholeWords" mean when the keyword itself has spaces?
- @Test
- public void spacesAroundKeyword() {
- String keyword = " lorem ipso facto genera linden pharma six 1 ";
- Trie trie = Trie.builder()
- .onlyWholeWords()
- .caseInsensitive()
- .addKeyword(keyword)
- .build();
- Collection < Emit > emits = trie.parseText(
- "gravita conundrum" + keyword + "under addressed object ");
- assertEquals(1, emits.size());
- checkEmit(emits.iterator().next(), 0, keyword.length() + 1, keyword);
- }
- */
-
/*
- For wordTransitions, we'll ignore leading and trailing white space
+ For onlyWholeWords, we'll ignore leading and trailing white space
included on keywords
*/
@Test
public void spacesAroundKeywordByWords() {
String keyword = "lorem ipso facto genera linden pharma six 1";
Trie trie = Trie.builder()
- .wordTransitions()
+ .onlyWholeWords()
.caseInsensitive()
.addKeyword(" " + keyword + " ")
.build();
Collection < Emit > emits = trie.parseText(
keyword + " under addressed object ");
assertEquals(1, emits.size());
- checkEmit(emits.iterator().next(), 0, keyword.length(), keyword);
+ checkEmit(emits.iterator().next(), 0, keyword.length() - 1, keyword);
}
private void assertToken(Token token, String fragment, boolean match, boolean wholeWord, boolean whiteSpace) {
assertEquals(fragment, token.getFragment());
assertEquals(match, token.isMatch());
- assertEquals(wholeWord, token.isWholeWord());
- assertEquals(whiteSpace, token.isWhiteSpace());
}
private void checkEmit(Emit next, int expectedStart, int expectedEnd, String expectedKeyword) {
From 3839e406ce6ed230cf0eb1359fd9571bf4f56ba6 Mon Sep 17 00:00:00 2001
From: Douglas Lovell
Date: Mon, 2 Nov 2015 17:50:27 -0700
Subject: [PATCH 16/20] we want the keywords back in all cases after all, but
the start and end indexes refer to the text
---
src/main/java/org/ahocorasick/trie/Trie.java | 25 ++-----------------
.../java/org/ahocorasick/trie/TrieTest.java | 9 ++++---
2 files changed, 7 insertions(+), 27 deletions(-)
diff --git a/src/main/java/org/ahocorasick/trie/Trie.java b/src/main/java/org/ahocorasick/trie/Trie.java
index edc2181..3267c02 100644
--- a/src/main/java/org/ahocorasick/trie/Trie.java
+++ b/src/main/java/org/ahocorasick/trie/Trie.java
@@ -45,7 +45,6 @@ public class Trie {
public int getPosition() {
return position;
}
- public abstract Emit match(Keyword kwd, int start, int position);
}
private class WordTokenizer extends KeywordTokenizer {
@@ -68,15 +67,6 @@ public class Trie {
}
return t;
}
- /*
- On word matching, we return the matched text, which can be of different
- length that the keyword, due to whitespace differences.
- */
- @Override
- public Emit match(Keyword kwd, int start, int position) {
- String matchedText = input.subSequence(start, position).toString();
- return new Emit(start, position - 1, matchedText);
- }
}
private class CharacterTokenizer extends KeywordTokenizer {
@@ -92,14 +82,6 @@ public class Trie {
}
return t;
}
- /*
- On character matching, the tests expect the implementation to
- return the matched keyword.
- */
- @Override
- public Emit match(Keyword kwd, int start, int position) {
- return new Emit(start, position - 1, kwd.getText());
- }
}
private class TokenStream {
@@ -137,10 +119,6 @@ public class Trie {
public String input() {
return input.toString();
}
-
- public Emit match(Keyword kwd, int start, int position) {
- return kwt.match(kwd, start, position);
- }
}
@@ -209,7 +187,8 @@ public class Trie {
int position = tn.getStart() + tn.getLength();
int start = tknHistory.get(depth - emit.getDepth()).getStart();
ListIterator tns = tknHistory.listIterator();
- emitCandidateHolder.addCandidate(tknz.match(emit, start, position));
+ emitCandidateHolder.addCandidate(
+ new Emit(start, position - 1, emit.getText()));
}
tn = tknz.nextTransition();
}
diff --git a/src/test/java/org/ahocorasick/trie/TrieTest.java b/src/test/java/org/ahocorasick/trie/TrieTest.java
index fcc254a..43a8fef 100644
--- a/src/test/java/org/ahocorasick/trie/TrieTest.java
+++ b/src/test/java/org/ahocorasick/trie/TrieTest.java
@@ -595,16 +595,17 @@ public class TrieTest {
*/
@Test
public void spacesAroundKeywordByWords() {
- String keyword = "lorem ipso facto genera linden pharma six 1";
+ String text = "lorem ipso facto genera linden pharma six 1";
+ String keyword = " " + text + " ";
Trie trie = Trie.builder()
.onlyWholeWords()
.caseInsensitive()
- .addKeyword(" " + keyword + " ")
+ .addKeyword(keyword)
.build();
Collection < Emit > emits = trie.parseText(
- keyword + " under addressed object ");
+ text + " under addressed object ");
assertEquals(1, emits.size());
- checkEmit(emits.iterator().next(), 0, keyword.length() - 1, keyword);
+ checkEmit(emits.iterator().next(), 0, text.length() - 1, keyword);
}
private void assertToken(Token token, String fragment, boolean match, boolean wholeWord, boolean whiteSpace) {
From 06675a60745803c01b13c518c72b47a0dc6ae69a Mon Sep 17 00:00:00 2001
From: Benni
Date: Wed, 17 Feb 2016 17:56:11 +0100
Subject: [PATCH 17/20] Add support for punctuation in text: Those characters
will form a separate token now.
---
src/main/java/org/ahocorasick/trie/Trie.java | 4 +--
.../java/org/ahocorasick/trie/TrieTest.java | 34 +++++++++++++++++++
2 files changed, 36 insertions(+), 2 deletions(-)
diff --git a/src/main/java/org/ahocorasick/trie/Trie.java b/src/main/java/org/ahocorasick/trie/Trie.java
index 3267c02..9f6c5bb 100644
--- a/src/main/java/org/ahocorasick/trie/Trie.java
+++ b/src/main/java/org/ahocorasick/trie/Trie.java
@@ -59,9 +59,9 @@ public class Trie {
}
int start = position;
if (start < length) {
- while (position < length && !Character.isWhitespace(currentChar())) {
+ do {
++position;
- }
+ } while (position < length && Character.isLetterOrDigit(currentChar()));
String word = input.subSequence(start, position).toString();
t = new WordTransition(word, start);
}
diff --git a/src/test/java/org/ahocorasick/trie/TrieTest.java b/src/test/java/org/ahocorasick/trie/TrieTest.java
index 43a8fef..c4dde0e 100644
--- a/src/test/java/org/ahocorasick/trie/TrieTest.java
+++ b/src/test/java/org/ahocorasick/trie/TrieTest.java
@@ -2,6 +2,7 @@ package org.ahocorasick.trie;
import org.ahocorasick.trie.handler.EmitHandler;
import org.ahocorasick.trie.handler.SimpleEmitHandler;
+import org.junit.Assert;
import org.junit.Test;
import java.util.ArrayList;
@@ -608,6 +609,39 @@ public class TrieTest {
checkEmit(emits.iterator().next(), 0, text.length() - 1, keyword);
}
+ @Test
+ public void punctuationInText() {
+ Trie trie = Trie.builder()
+ .onlyWholeWords()
+ .addKeyword("pie")
+ .build();
+
+ Collection emits = trie.parseText("Want some pie? Gimme pie! pie, pie. The pie's revenge.");
+ Assert.assertEquals(5, emits.size());
+ Iterator it = emits.iterator();
+ checkEmit(it.next(), 10, 12, "pie");
+ checkEmit(it.next(), 21, 23, "pie");
+ checkEmit(it.next(), 26, 28, "pie");
+ checkEmit(it.next(), 31, 33, "pie");
+ checkEmit(it.next(), 40, 42, "pie");
+ }
+
+ @Test
+ public void punctuationInSearchTerm() {
+ Trie trie = Trie.builder()
+ .onlyWholeWords()
+ .addKeyword("Dr. Feelgood")
+ .addKeyword("Oi!")
+ .build();
+
+ Collection emits = trie
+ .parseText("The Oi! music genre is inspired by Dr. Feelgood and other bands. Oi or Dr Feelgood should not match.");
+
+ Assert.assertEquals(2, emits.size());
+
+
+ }
+
private void assertToken(Token token, String fragment, boolean match, boolean wholeWord, boolean whiteSpace) {
assertEquals(fragment, token.getFragment());
assertEquals(match, token.isMatch());
From 165f18e581851ddebe856f8d4f74e207df2414ba Mon Sep 17 00:00:00 2001
From: Benni
Date: Wed, 17 Feb 2016 18:05:42 +0100
Subject: [PATCH 18/20] Remove unused variables, remove duplicate tests (both
"nonOverlappingWordTransitions" and "nonOverlappingWholeWords" contain
exactly the same code)
---
.../java/org/ahocorasick/trie/Keyword.java | 4 ----
.../java/org/ahocorasick/trie/Transition.java | 4 ----
src/main/java/org/ahocorasick/trie/Trie.java | 5 -----
.../java/org/ahocorasick/trie/TrieTest.java | 19 -------------------
4 files changed, 32 deletions(-)
diff --git a/src/main/java/org/ahocorasick/trie/Keyword.java b/src/main/java/org/ahocorasick/trie/Keyword.java
index 1d31728..0a6db3d 100644
--- a/src/main/java/org/ahocorasick/trie/Keyword.java
+++ b/src/main/java/org/ahocorasick/trie/Keyword.java
@@ -28,10 +28,6 @@ public class Keyword implements Comparable {
this.depth = depth;
}
- public void setDepth(int depth) {
- this.depth = depth;
- }
-
public int getDepth() {
return depth;
}
diff --git a/src/main/java/org/ahocorasick/trie/Transition.java b/src/main/java/org/ahocorasick/trie/Transition.java
index aa1b863..f0b11b4 100644
--- a/src/main/java/org/ahocorasick/trie/Transition.java
+++ b/src/main/java/org/ahocorasick/trie/Transition.java
@@ -34,10 +34,6 @@ public class Transition {
this.length = length;
}
- public T transitionToken() {
- return token;
- }
-
public int getStart() {
return start;
}
diff --git a/src/main/java/org/ahocorasick/trie/Trie.java b/src/main/java/org/ahocorasick/trie/Trie.java
index 9f6c5bb..8c72734 100644
--- a/src/main/java/org/ahocorasick/trie/Trie.java
+++ b/src/main/java/org/ahocorasick/trie/Trie.java
@@ -11,7 +11,6 @@ import org.ahocorasick.trie.handler.FirstMatchHandler;
import java.util.Collection;
import java.util.Queue;
import java.util.LinkedList;
-import java.util.ListIterator;
import java.util.concurrent.LinkedBlockingDeque;
/**
@@ -42,9 +41,6 @@ public class Trie {
return (position < length) ? input.charAt(position) : '\0';
}
public abstract Transition nextTransition();
- public int getPosition() {
- return position;
- }
}
private class WordTokenizer extends KeywordTokenizer {
@@ -186,7 +182,6 @@ public class Trie {
for (Keyword emit : emits) {
int position = tn.getStart() + tn.getLength();
int start = tknHistory.get(depth - emit.getDepth()).getStart();
- ListIterator tns = tknHistory.listIterator();
emitCandidateHolder.addCandidate(
new Emit(start, position - 1, emit.getText()));
}
diff --git a/src/test/java/org/ahocorasick/trie/TrieTest.java b/src/test/java/org/ahocorasick/trie/TrieTest.java
index c4dde0e..b55ee07 100644
--- a/src/test/java/org/ahocorasick/trie/TrieTest.java
+++ b/src/test/java/org/ahocorasick/trie/TrieTest.java
@@ -262,25 +262,6 @@ public class TrieTest {
checkEmit(iterator.next(), 41, 48, "wiel dop");
}
- @Test
- public void nonOverlappingWordTransitions() {
- Trie trie = Trie.builder()
- .removeOverlaps()
- .onlyWholeWords()
- .addKeyword("peper molen")
- .addKeyword("molen wiel")
- .addKeyword("wiel dop")
- .addKeyword("dop")
- .build();
- Collection emits = trie.parseText("peper molen wiel dop xwiel dop wiel dopx wiel dop");
- assertEquals(4, emits.size());
- Iterator iterator = emits.iterator();
- checkEmit(iterator.next(), 0, 10, "peper molen");
- checkEmit(iterator.next(), 12, 19, "wiel dop");
- checkEmit(iterator.next(), 27, 29, "dop");
- checkEmit(iterator.next(), 41, 48, "wiel dop");
- }
-
@Test
public void nonOverlappingWholeWordsWithCustomEmitHandler() {
Trie trie = Trie.builder()
From 82d36386fd4ec7f28a055d73a97cef936943908a Mon Sep 17 00:00:00 2001
From: Douglas Lovell
Date: Fri, 13 Oct 2017 20:47:21 -0300
Subject: [PATCH 19/20] Add comments, rename variables, remove unused lookahead
---
.../ahocorasick/trie/CharacterTransition.java | 9 ++++
.../java/org/ahocorasick/trie/Keyword.java | 14 ++++++-
src/main/java/org/ahocorasick/trie/State.java | 22 +++++-----
.../java/org/ahocorasick/trie/Transition.java | 6 ++-
src/main/java/org/ahocorasick/trie/Trie.java | 41 ++++++++-----------
.../org/ahocorasick/trie/WordTransition.java | 17 ++++++--
6 files changed, 65 insertions(+), 44 deletions(-)
diff --git a/src/main/java/org/ahocorasick/trie/CharacterTransition.java b/src/main/java/org/ahocorasick/trie/CharacterTransition.java
index ca561ee..5b19499 100644
--- a/src/main/java/org/ahocorasick/trie/CharacterTransition.java
+++ b/src/main/java/org/ahocorasick/trie/CharacterTransition.java
@@ -21,10 +21,19 @@ package org.ahocorasick.trie;
*/
class CharacterTransition extends Transition {
+ /**
+ * Create a character transition from a position in the source string
+ * @param c character to match
+ * @param start position of character in source string
+ */
public CharacterTransition(Character c, int start) {
super(c, start, 1);
}
+ /**
+ * Create a character transition without regard for position
+ * @param c character to match
+ */
public CharacterTransition(Character c) {
this(c, 0);
}
diff --git a/src/main/java/org/ahocorasick/trie/Keyword.java b/src/main/java/org/ahocorasick/trie/Keyword.java
index 0a6db3d..d280b0a 100644
--- a/src/main/java/org/ahocorasick/trie/Keyword.java
+++ b/src/main/java/org/ahocorasick/trie/Keyword.java
@@ -16,13 +16,20 @@
package org.ahocorasick.trie;
/**
+ * Keyword encapsulates part of a potential match along with the count
+ * of prior source tokens consumed to create the potential match.
*
* @author doug.lovell
*/
public class Keyword implements Comparable {
private final String text;
private int depth;
-
+
+ /**
+ * Create portion of potential match
+ * @param text content that matches
+ * @param depth count of prior source tokens that comprise the match
+ */
public Keyword(String text, int depth) {
this.text = text;
this.depth = depth;
@@ -37,7 +44,10 @@ public class Keyword implements Comparable {
}
public String toString() {
- return "Keyword '" + text + "' at depth " + depth;
+ final String t = getText();
+ final int d = getDepth();
+
+ return "Keyword '" + t + "' at depth " + d;
}
@Override
diff --git a/src/main/java/org/ahocorasick/trie/State.java b/src/main/java/org/ahocorasick/trie/State.java
index 215a652..fd9780f 100644
--- a/src/main/java/org/ahocorasick/trie/State.java
+++ b/src/main/java/org/ahocorasick/trie/State.java
@@ -27,7 +27,7 @@ import java.util.*;
*/
public class State {
- /** effective the size of the keyword */
+ /** effectively the size of the keyword */
private final int depth;
/** only used for the root state to refer to itself in case no matches have been found */
@@ -54,27 +54,27 @@ public class State {
this.rootState = depth == 0 ? this : null;
}
- private State nextState(Transition t, boolean ignoreRootState) {
- State nextState = this.success.get(t);
+ private State nextState(Transition transition, boolean ignoreRootState) {
+ State nextState = this.success.get(transition);
if (!ignoreRootState && nextState == null && this.rootState != null) {
nextState = this.rootState;
}
return nextState;
}
- public State nextState(Transition t) {
- return nextState(t, false);
+ public State nextState(Transition transition) {
+ return nextState(transition, false);
}
- public State nextStateIgnoreRootState(Transition t) {
- return nextState(t, true);
+ public State nextStateIgnoreRootState(Transition transition) {
+ return nextState(transition, true);
}
- public State addState(Transition t) {
- State nextState = nextStateIgnoreRootState(t);
+ public State addState(Transition transition) {
+ State nextState = nextStateIgnoreRootState(transition);
if (nextState == null) {
nextState = new State(this.depth+1);
- this.success.put(t, nextState);
+ this.success.put(transition, nextState);
}
return nextState;
}
@@ -97,7 +97,7 @@ public class State {
}
public void addEmitString(String key) {
- addEmit(new Keyword(key, depth));
+ addEmit(new Keyword(key, getDepth()));
}
public Collection emit() {
diff --git a/src/main/java/org/ahocorasick/trie/Transition.java b/src/main/java/org/ahocorasick/trie/Transition.java
index f0b11b4..c4dfe9b 100644
--- a/src/main/java/org/ahocorasick/trie/Transition.java
+++ b/src/main/java/org/ahocorasick/trie/Transition.java
@@ -44,8 +44,10 @@ public class Transition {
@Override
public String toString() {
- return "Transition on '" + token + "' start: " + start +
- ", length: " + length;
+ final int s = getStart();
+ final int len = getLength();
+
+ return "Transition on '" + token + "' start: " + s + ", length: " + len;
}
@Override
diff --git a/src/main/java/org/ahocorasick/trie/Trie.java b/src/main/java/org/ahocorasick/trie/Trie.java
index 8c72734..7c6dbab 100644
--- a/src/main/java/org/ahocorasick/trie/Trie.java
+++ b/src/main/java/org/ahocorasick/trie/Trie.java
@@ -81,9 +81,8 @@ public class Trie {
}
private class TokenStream {
- private final KeywordTokenizer kwt;
+ private final KeywordTokenizer tokenizer;
private final StringBuilder input;
- private Transition lookahead;
public TokenStream(CharSequence text) {
input = new StringBuilder(text.length());
@@ -93,23 +92,15 @@ public class Trie {
Character.toLowerCase(ch) : ch);
}
if (trieConfig.isOnlyWholeWords()) {
- kwt = new WordTokenizer(input);
+ tokenizer = new WordTokenizer(input);
}
else {
- kwt = new CharacterTokenizer(input);
+ tokenizer = new CharacterTokenizer(input);
}
- lookahead = null;
}
public Transition nextTransition() {
- Transition next = lookahead;
- if (next == null) {
- next = kwt.nextTransition();
- }
- else {
- lookahead = null;
- }
- return next;
+ return tokenizer.nextTransition();
}
public String input() {
@@ -123,13 +114,13 @@ public class Trie {
return;
}
State currentState = this.rootState;
- TokenStream tknz = new TokenStream(keyword);
- Transition tn = tknz.nextTransition();
- while (tn != null) {
- currentState = currentState.addState(tn);
- tn = tknz.nextTransition();
+ TokenStream tokenStream = new TokenStream(keyword);
+ Transition transition = tokenStream.nextTransition();
+ while (transition != null) {
+ currentState = currentState.addState(transition);
+ transition = tokenStream.nextTransition();
}
- currentState.addEmitString(tknz.input());
+ currentState.addEmitString(tokenStream.input());
}
public Collection tokenize(String text) {
@@ -167,25 +158,25 @@ public class Trie {
LinkedList tknHistory = new LinkedList<>();
State currentState = this.rootState;
- Transition tn = tknz.nextTransition();
- while (tn != null) {
+ Transition nextTransition = tknz.nextTransition();
+ while (nextTransition != null) {
if (flushHandler.stop()) {
return;
}
- tknHistory.add(tn);
- currentState = getState(currentState, tn, flushHandler);
+ tknHistory.add(nextTransition);
+ currentState = getState(currentState, nextTransition, flushHandler);
Collection emits = currentState.emit();
int depth = currentState.getDepth();
while (depth < tknHistory.size()) {
tknHistory.remove();
}
+ int position = nextTransition.getStart() + nextTransition.getLength();
for (Keyword emit : emits) {
- int position = tn.getStart() + tn.getLength();
int start = tknHistory.get(depth - emit.getDepth()).getStart();
emitCandidateHolder.addCandidate(
new Emit(start, position - 1, emit.getText()));
}
- tn = tknz.nextTransition();
+ nextTransition = tknz.nextTransition();
}
flushHandler.flush();
}
diff --git a/src/main/java/org/ahocorasick/trie/WordTransition.java b/src/main/java/org/ahocorasick/trie/WordTransition.java
index f12d0bc..21b5836 100644
--- a/src/main/java/org/ahocorasick/trie/WordTransition.java
+++ b/src/main/java/org/ahocorasick/trie/WordTransition.java
@@ -21,11 +21,20 @@ package org.ahocorasick.trie;
*/
public class WordTransition extends Transition {
- public WordTransition(String s, int start) {
- super(s, start, s.length());
+ /**
+ * Create a transition from a position in the source string
+ * @param word to match
+ * @param start position of first character within the source string
+ */
+ public WordTransition(String word, int start) {
+ super(word, start, word.length());
}
- public WordTransition(String s) {
- this(s, 0);
+ /**
+ * Create a transition without regard for position
+ * @param word to match
+ */
+ public WordTransition(String word) {
+ this(word, 0);
}
}
From 360d76193acf04f264f3b1d0c18dfe4d53317030 Mon Sep 17 00:00:00 2001
From: Douglas Lovell
Date: Fri, 13 Oct 2017 21:17:04 -0300
Subject: [PATCH 20/20] Express parameters as final
---
.../org/ahocorasick/trie/FragmentToken.java | 2 +-
.../java/org/ahocorasick/trie/Keyword.java | 6 ++--
.../java/org/ahocorasick/trie/MatchToken.java | 2 +-
src/main/java/org/ahocorasick/trie/State.java | 18 +++++------
src/main/java/org/ahocorasick/trie/Token.java | 2 +-
.../java/org/ahocorasick/trie/Tokenizer.java | 6 ++--
.../java/org/ahocorasick/trie/Transition.java | 4 +--
src/main/java/org/ahocorasick/trie/Trie.java | 31 ++++++++++---------
.../org/ahocorasick/trie/WordTransition.java | 4 +--
9 files changed, 40 insertions(+), 35 deletions(-)
diff --git a/src/main/java/org/ahocorasick/trie/FragmentToken.java b/src/main/java/org/ahocorasick/trie/FragmentToken.java
index 37e83d1..be77a16 100644
--- a/src/main/java/org/ahocorasick/trie/FragmentToken.java
+++ b/src/main/java/org/ahocorasick/trie/FragmentToken.java
@@ -2,7 +2,7 @@ package org.ahocorasick.trie;
public class FragmentToken extends Token {
- public FragmentToken(String fragment) {
+ public FragmentToken(final String fragment) {
super(fragment);
}
diff --git a/src/main/java/org/ahocorasick/trie/Keyword.java b/src/main/java/org/ahocorasick/trie/Keyword.java
index d280b0a..dcfcaf9 100644
--- a/src/main/java/org/ahocorasick/trie/Keyword.java
+++ b/src/main/java/org/ahocorasick/trie/Keyword.java
@@ -23,14 +23,14 @@ package org.ahocorasick.trie;
*/
public class Keyword implements Comparable {
private final String text;
- private int depth;
+ private final int depth;
/**
* Create portion of potential match
* @param text content that matches
* @param depth count of prior source tokens that comprise the match
*/
- public Keyword(String text, int depth) {
+ public Keyword(final String text, final int depth) {
this.text = text;
this.depth = depth;
}
@@ -51,7 +51,7 @@ public class Keyword implements Comparable {
}
@Override
- public int compareTo(Object o) {
+ public int compareTo(final Object o) {
if (o instanceof Keyword) {
return text.compareTo(((Keyword) o).text);
}
diff --git a/src/main/java/org/ahocorasick/trie/MatchToken.java b/src/main/java/org/ahocorasick/trie/MatchToken.java
index 9d91693..8ec9b0d 100644
--- a/src/main/java/org/ahocorasick/trie/MatchToken.java
+++ b/src/main/java/org/ahocorasick/trie/MatchToken.java
@@ -4,7 +4,7 @@ public class MatchToken extends Token {
private final Emit emit;
- public MatchToken(String fragment, Emit emit) {
+ public MatchToken(final String fragment, final Emit emit) {
super(fragment);
this.emit = emit;
}
diff --git a/src/main/java/org/ahocorasick/trie/State.java b/src/main/java/org/ahocorasick/trie/State.java
index fd9780f..01349aa 100644
--- a/src/main/java/org/ahocorasick/trie/State.java
+++ b/src/main/java/org/ahocorasick/trie/State.java
@@ -54,7 +54,7 @@ public class State {
this.rootState = depth == 0 ? this : null;
}
- private State nextState(Transition transition, boolean ignoreRootState) {
+ private State nextState(final Transition transition, boolean ignoreRootState) {
State nextState = this.success.get(transition);
if (!ignoreRootState && nextState == null && this.rootState != null) {
nextState = this.rootState;
@@ -62,15 +62,15 @@ public class State {
return nextState;
}
- public State nextState(Transition transition) {
+ public State nextState(final Transition transition) {
return nextState(transition, false);
}
- public State nextStateIgnoreRootState(Transition transition) {
+ public State nextStateIgnoreRootState(final Transition transition) {
return nextState(transition, true);
}
- public State addState(Transition transition) {
+ public State addState(final Transition transition) {
State nextState = nextStateIgnoreRootState(transition);
if (nextState == null) {
nextState = new State(this.depth+1);
@@ -83,20 +83,20 @@ public class State {
return this.depth;
}
- public void addEmit(Keyword keyword) {
+ public void addEmit(final Keyword keyword) {
if (this.emits == null) {
this.emits = new TreeSet<>();
}
this.emits.add(keyword);
}
- public void addEmit(Collection emits) {
+ public void addEmit(final Collection emits) {
for (Keyword emit : emits) {
addEmit(emit);
}
}
- public void addEmitString(String key) {
+ public void addEmitString(final String key) {
addEmit(new Keyword(key, getDepth()));
}
@@ -104,7 +104,7 @@ public class State {
return this.emits == null ? Collections. emptyList() : this.emits;
}
- public State failure(EmitCandidateFlushHandler emitCandidateFlushHandler) {
+ public State failure(final EmitCandidateFlushHandler emitCandidateFlushHandler) {
if (emitCandidateFlushHandler != null && this.failure.isRootState()) {
emitCandidateFlushHandler.flush();
}
@@ -115,7 +115,7 @@ public class State {
return failure(null);
}
- public void setFailure(State failState) {
+ public void setFailure(final State failState) {
this.failure = failState;
}
diff --git a/src/main/java/org/ahocorasick/trie/Token.java b/src/main/java/org/ahocorasick/trie/Token.java
index 2e4c72f..c6380b5 100644
--- a/src/main/java/org/ahocorasick/trie/Token.java
+++ b/src/main/java/org/ahocorasick/trie/Token.java
@@ -4,7 +4,7 @@ public abstract class Token {
private final String fragment;
- public Token(String fragment) {
+ public Token(final String fragment) {
this.fragment = fragment;
}
diff --git a/src/main/java/org/ahocorasick/trie/Tokenizer.java b/src/main/java/org/ahocorasick/trie/Tokenizer.java
index 9f587c4..33c96ef 100644
--- a/src/main/java/org/ahocorasick/trie/Tokenizer.java
+++ b/src/main/java/org/ahocorasick/trie/Tokenizer.java
@@ -10,7 +10,7 @@ public class Tokenizer {
private final String text;
- public Tokenizer(Collection emits, String text) {
+ public Tokenizer(final Collection emits, final String text) {
this.emits = emits;
this.text = text;
}
@@ -34,7 +34,9 @@ public class Tokenizer {
}
private Token createFragment(Emit emit, String text, int lastCollectedPosition) {
- return new FragmentToken(text.substring(lastCollectedPosition+1, emit == null ? text.length() : emit.getStart()));
+ return new FragmentToken(text.substring(
+ lastCollectedPosition+1,
+ emit == null ? text.length() : emit.getStart()));
}
private Token createMatch(Emit emit, String text) {
diff --git a/src/main/java/org/ahocorasick/trie/Transition.java b/src/main/java/org/ahocorasick/trie/Transition.java
index c4dfe9b..c5ad7d5 100644
--- a/src/main/java/org/ahocorasick/trie/Transition.java
+++ b/src/main/java/org/ahocorasick/trie/Transition.java
@@ -28,7 +28,7 @@ public class Transition {
protected final int start;
protected final int length;
- public Transition(T token, int start, int length) {
+ public Transition(final T token, int start, int length) {
this.token = token;
this.start = start;
this.length = length;
@@ -56,7 +56,7 @@ public class Transition {
}
@Override
- public boolean equals(Object obj) {
+ public boolean equals(final Object obj) {
if (obj == null) {
return false;
}
diff --git a/src/main/java/org/ahocorasick/trie/Trie.java b/src/main/java/org/ahocorasick/trie/Trie.java
index 7c6dbab..23d12b3 100644
--- a/src/main/java/org/ahocorasick/trie/Trie.java
+++ b/src/main/java/org/ahocorasick/trie/Trie.java
@@ -24,16 +24,16 @@ public class Trie {
private final State rootState;
- private Trie(TrieConfig trieConfig) {
+ private Trie(final TrieConfig trieConfig) {
this.trieConfig = trieConfig;
this.rootState = new State();
}
private abstract class KeywordTokenizer {
protected int position = 0;
- protected CharSequence input;
+ protected final CharSequence input;
protected int length;
- protected KeywordTokenizer(CharSequence input) {
+ protected KeywordTokenizer(final CharSequence input) {
this.input = input;
this.length = input.length();
}
@@ -44,7 +44,7 @@ public class Trie {
}
private class WordTokenizer extends KeywordTokenizer {
- public WordTokenizer(CharSequence input) {
+ public WordTokenizer(final CharSequence input) {
super(input);
}
@Override
@@ -66,7 +66,7 @@ public class Trie {
}
private class CharacterTokenizer extends KeywordTokenizer {
- public CharacterTokenizer(CharSequence input) {
+ public CharacterTokenizer(final CharSequence input) {
super(input);
}
@Override
@@ -84,7 +84,7 @@ public class Trie {
private final KeywordTokenizer tokenizer;
private final StringBuilder input;
- public TokenStream(CharSequence text) {
+ public TokenStream(final CharSequence text) {
input = new StringBuilder(text.length());
for (int p = 0; p < text.length(); ++p) {
char ch = text.charAt(p);
@@ -109,7 +109,7 @@ public class Trie {
}
- private void addKeyword(CharSequence keyword) {
+ private void addKeyword(final CharSequence keyword) {
if (keyword == null || keyword.length() == 0) {
return;
}
@@ -128,24 +128,24 @@ public class Trie {
}
@SuppressWarnings("unchecked")
- public Collection parseText(CharSequence text) {
+ public Collection parseText(final CharSequence text) {
DefaultEmitHandler emitHandler = new DefaultEmitHandler();
parseText(text, emitHandler);
return emitHandler.getEmits();
}
- public boolean containsMatch(CharSequence text) {
+ public boolean containsMatch(final CharSequence text) {
Emit firstMatch = firstMatch(text);
return firstMatch != null;
}
- public Emit firstMatch(CharSequence text) {
+ public Emit firstMatch(final CharSequence text) {
FirstMatchHandler emitHandler = new FirstMatchHandler();
parseText(text, emitHandler);
return emitHandler.getFirstMatch();
}
- public void parseText(CharSequence text, EmitHandler emitHandler) {
+ public void parseText(final CharSequence text, final EmitHandler emitHandler) {
final EmitCandidateHolder emitCandidateHolder = this.trieConfig.isAllowOverlaps() ?
new OverlappingEmitCandidateHolder() :
@@ -181,11 +181,14 @@ public class Trie {
flushHandler.flush();
}
- private State getState(State currentState, Transition transition, EmitCandidateFlushHandler flushHandler) {
+ private State getState(final State currentState,
+ final Transition transition,
+ final EmitCandidateFlushHandler flushHandler) {
+ State failState = currentState;
State newCurrentState = currentState.nextState(transition);
while (newCurrentState == null) {
- currentState = currentState.failure(flushHandler);
- newCurrentState = currentState.nextState(transition);
+ failState = failState.failure(flushHandler);
+ newCurrentState = failState.nextState(transition);
}
return newCurrentState;
}
diff --git a/src/main/java/org/ahocorasick/trie/WordTransition.java b/src/main/java/org/ahocorasick/trie/WordTransition.java
index 21b5836..c1dbd7b 100644
--- a/src/main/java/org/ahocorasick/trie/WordTransition.java
+++ b/src/main/java/org/ahocorasick/trie/WordTransition.java
@@ -26,7 +26,7 @@ public class WordTransition extends Transition {
* @param word to match
* @param start position of first character within the source string
*/
- public WordTransition(String word, int start) {
+ public WordTransition(final String word, int start) {
super(word, start, word.length());
}
@@ -34,7 +34,7 @@ public class WordTransition extends Transition {
* Create a transition without regard for position
* @param word to match
*/
- public WordTransition(String word) {
+ public WordTransition(final String word) {
this(word, 0);
}
}