diff --git a/pom.xml b/pom.xml
index 9d964e4..ed7c40b 100644
--- a/pom.xml
+++ b/pom.xml
@@ -3,7 +3,7 @@
org.ahocorasick
ahocorasick
- 0.3.1-SNAPSHOT
+ 0.3.1-heartbeat
jar
Aho-CoraSick algorithm for efficient string matching
Java library for efficient string matching against a large set of keywords
@@ -104,4 +104,4 @@
-
\ No newline at end of file
+
diff --git a/src/main/java/org/ahocorasick/trie/CharacterTransition.java b/src/main/java/org/ahocorasick/trie/CharacterTransition.java
new file mode 100644
index 0000000..5b19499
--- /dev/null
+++ b/src/main/java/org/ahocorasick/trie/CharacterTransition.java
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2015 Rogue Wave Software.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.ahocorasick.trie;
+
+/**
+ * Model transitions on single characters; every instance has a fixed length of 1.
+ * @author doug.lovell
+ */
+class CharacterTransition extends Transition {
+
+ /**
+ * Create a character transition from a position in the source string.
+ * @param c character to match
+ * @param start position of the character in the source string
+ */
+ public CharacterTransition(Character c, int start) {
+ super(c, start, 1); // a single character always spans exactly one position
+ }
+
+ /**
+ * Create a character transition without regard for position (start defaults to 0).
+ * @param c character to match
+ */
+ public CharacterTransition(Character c) {
+ this(c, 0);
+ }
+
+}
diff --git a/src/main/java/org/ahocorasick/trie/Emit.java b/src/main/java/org/ahocorasick/trie/Emit.java
index 60c1f9e..20d73ff 100644
--- a/src/main/java/org/ahocorasick/trie/Emit.java
+++ b/src/main/java/org/ahocorasick/trie/Emit.java
@@ -11,11 +11,11 @@ public class Emit extends Interval implements Intervalable {
super(start, end);
this.keyword = keyword;
}
-
+
public String getKeyword() {
return this.keyword;
}
-
+
@Override
public String toString() {
return super.toString() + "=" + this.keyword;
diff --git a/src/main/java/org/ahocorasick/trie/FragmentToken.java b/src/main/java/org/ahocorasick/trie/FragmentToken.java
index f0c899f..be77a16 100644
--- a/src/main/java/org/ahocorasick/trie/FragmentToken.java
+++ b/src/main/java/org/ahocorasick/trie/FragmentToken.java
@@ -2,16 +2,8 @@ package org.ahocorasick.trie;
public class FragmentToken extends Token {
- private boolean whiteSpace;
-
- public FragmentToken(String fragment) {
+ public FragmentToken(final String fragment) {
super(fragment);
- this.whiteSpace = true;
- for (int position = 0; position < fragment.length(); position++) {
- if (!Character.isWhitespace(fragment.charAt(position))) {
- whiteSpace = false;
- }
- }
}
@Override
@@ -24,9 +16,4 @@ public class FragmentToken extends Token {
return null;
}
- @Override
- public boolean isWhiteSpace() {
- return whiteSpace;
- }
-
}
diff --git a/src/main/java/org/ahocorasick/trie/Keyword.java b/src/main/java/org/ahocorasick/trie/Keyword.java
new file mode 100644
index 0000000..dcfcaf9
--- /dev/null
+++ b/src/main/java/org/ahocorasick/trie/Keyword.java
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2015 Rogue Wave Software.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.ahocorasick.trie;
+
+/**
+ * Keyword encapsulates part of a potential match along with the count
+ * of prior source tokens consumed to create the potential match.
+ * Instances are immutable; ordering (compareTo) looks at the text only, never the depth.
+ * @author doug.lovell
+ */
+public class Keyword implements Comparable {
+ private final String text; // matched content, stored exactly as passed to the constructor
+ private final int depth; // count of source tokens that comprise the match
+
+ /**
+ * Create portion of potential match. Neither argument is validated.
+ * @param text content that matches
+ * @param depth count of prior source tokens that comprise the match
+ */
+ public Keyword(final String text, final int depth) {
+ this.text = text;
+ this.depth = depth;
+ }
+
+ public int getDepth() { // token count supplied at construction
+ return depth;
+ }
+
+ public String getText() { // matched content supplied at construction
+ return text;
+ }
+
+ public String toString() { // human-readable description combining text and depth
+ final String t = getText();
+ final int d = getDepth();
+
+ return "Keyword '" + t + "' at depth " + d;
+ }
+
+ @Override
+ public int compareTo(final Object o) { // orders by text alone, so equal text with a different depth compares as 0
+ if (o instanceof Keyword) {
+ return text.compareTo(((Keyword) o).text); // NOTE(review): NullPointerException if either text is null
+ }
+ throw new IllegalArgumentException("Only supports comparison with other keywords"); // also reached for null, since null fails instanceof
+ }
+}
diff --git a/src/main/java/org/ahocorasick/trie/MatchToken.java b/src/main/java/org/ahocorasick/trie/MatchToken.java
index 5becc64..8ec9b0d 100644
--- a/src/main/java/org/ahocorasick/trie/MatchToken.java
+++ b/src/main/java/org/ahocorasick/trie/MatchToken.java
@@ -2,19 +2,11 @@ package org.ahocorasick.trie;
public class MatchToken extends Token {
- private final boolean wholeWord;
-
private final Emit emit;
- public MatchToken(String fragment, Emit emit, boolean wholeWord) {
+ public MatchToken(final String fragment, final Emit emit) {
super(fragment);
this.emit = emit;
- this.wholeWord = wholeWord;
- }
-
- @Override
- public boolean isWholeWord() {
- return wholeWord;
}
@Override
diff --git a/src/main/java/org/ahocorasick/trie/State.java b/src/main/java/org/ahocorasick/trie/State.java
index 82d167b..01349aa 100644
--- a/src/main/java/org/ahocorasick/trie/State.java
+++ b/src/main/java/org/ahocorasick/trie/State.java
@@ -10,8 +10,8 @@ import java.util.*;
*
*
*
- * - success; when a character points to another state, it must return that state
- * - failure; when a character has no matching state, the algorithm must be able to fall back on a
+ * - success; when a transition points to another state, it must return that state
+ * - failure; when a transition has no matching state, the algorithm must be able to fall back on a
* state with less depth
* - emits; when this state is passed and keywords have been matched, the matches must be
* 'emitted' so that they can be used later on.
@@ -19,7 +19,7 @@ import java.util.*;
*
*
* The root state is special in the sense that it has no failure state; it cannot fail. If it 'fails'
- * it will still parse the next character and start from the root node. This ensures that the algorithm
+ * it will still parse the next transition and start from the root node. This ensures that the algorithm
* always runs. All other states always have a fail state.
*
*
@@ -27,7 +27,7 @@ import java.util.*;
*/
public class State {
- /** effective the size of the keyword */
+ /** effectively the size of the keyword */
private final int depth;
/** only used for the root state to refer to itself in case no matches have been found */
@@ -35,15 +35,15 @@ public class State {
/**
* referred to in the white paper as the 'goto' structure. From a state it is possible to go
- * to other states, depending on the character passed.
+ * to other states, depending on the transition passed.
*/
- private Map success = new HashMap();
+ private final Map success = new HashMap<>();
/** if no matching states are found, the failure state will be returned */
private State failure = null;
/** whenever this state is reached, it will emit the matches keywords for future reference */
- private Set emits = null;
+ private Set emits = null;
public State() {
this(0);
@@ -54,27 +54,27 @@ public class State {
this.rootState = depth == 0 ? this : null;
}
- private State nextState(Character character, boolean ignoreRootState) {
- State nextState = this.success.get(character);
+ private State nextState(final Transition transition, boolean ignoreRootState) {
+ State nextState = this.success.get(transition);
if (!ignoreRootState && nextState == null && this.rootState != null) {
nextState = this.rootState;
}
return nextState;
}
- public State nextState(Character character) {
- return nextState(character, false);
+ public State nextState(final Transition transition) {
+ return nextState(transition, false);
}
- public State nextStateIgnoreRootState(Character character) {
- return nextState(character, true);
+ public State nextStateIgnoreRootState(final Transition transition) {
+ return nextState(transition, true);
}
- public State addState(Character character) {
- State nextState = nextStateIgnoreRootState(character);
+ public State addState(final Transition transition) {
+ State nextState = nextStateIgnoreRootState(transition);
if (nextState == null) {
nextState = new State(this.depth+1);
- this.success.put(character, nextState);
+ this.success.put(transition, nextState);
}
return nextState;
}
@@ -83,24 +83,28 @@ public class State {
return this.depth;
}
- public void addEmit(String keyword) {
+ public void addEmit(final Keyword keyword) {
if (this.emits == null) {
this.emits = new TreeSet<>();
}
this.emits.add(keyword);
}
- public void addEmit(Collection emits) {
- for (String emit : emits) {
+ public void addEmit(final Collection emits) {
+ for (Keyword emit : emits) {
addEmit(emit);
}
}
-
- public Collection emit() {
- return this.emits == null ? Collections. emptyList() : this.emits;
+
+ public void addEmitString(final String key) {
+ addEmit(new Keyword(key, getDepth()));
}
- public State failure(EmitCandidateFlushHandler emitCandidateFlushHandler) {
+ public Collection emit() {
+ return this.emits == null ? Collections. emptyList() : this.emits;
+ }
+
+ public State failure(final EmitCandidateFlushHandler emitCandidateFlushHandler) {
if (emitCandidateFlushHandler != null && this.failure.isRootState()) {
emitCandidateFlushHandler.flush();
}
@@ -111,7 +115,7 @@ public class State {
return failure(null);
}
- public void setFailure(State failState) {
+ public void setFailure(final State failState) {
this.failure = failState;
}
@@ -119,7 +123,7 @@ public class State {
return this.success.values();
}
- public Collection getTransitions() {
+ public Collection getTransitions() {
return this.success.keySet();
}
diff --git a/src/main/java/org/ahocorasick/trie/Token.java b/src/main/java/org/ahocorasick/trie/Token.java
index 7ec280f..c6380b5 100644
--- a/src/main/java/org/ahocorasick/trie/Token.java
+++ b/src/main/java/org/ahocorasick/trie/Token.java
@@ -2,9 +2,9 @@ package org.ahocorasick.trie;
public abstract class Token {
- private String fragment;
+ private final String fragment;
- public Token(String fragment) {
+ public Token(final String fragment) {
this.fragment = fragment;
}
@@ -14,14 +14,6 @@ public abstract class Token {
public abstract boolean isMatch();
- public boolean isWholeWord() {
- return false;
- }
-
- public boolean isWhiteSpace() {
- return false;
- }
-
public abstract Emit getEmit();
}
diff --git a/src/main/java/org/ahocorasick/trie/Tokenizer.java b/src/main/java/org/ahocorasick/trie/Tokenizer.java
index b45c438..33c96ef 100644
--- a/src/main/java/org/ahocorasick/trie/Tokenizer.java
+++ b/src/main/java/org/ahocorasick/trie/Tokenizer.java
@@ -9,8 +9,8 @@ public class Tokenizer {
private final Collection emits;
private final String text;
-
- public Tokenizer(Collection emits, String text) {
+
+ public Tokenizer(final Collection emits, final String text) {
this.emits = emits;
this.text = text;
}
@@ -34,14 +34,15 @@ public class Tokenizer {
}
private Token createFragment(Emit emit, String text, int lastCollectedPosition) {
- return new FragmentToken(text.substring(lastCollectedPosition+1, emit == null ? text.length() : emit.getStart()));
+ return new FragmentToken(text.substring(
+ lastCollectedPosition+1,
+ emit == null ? text.length() : emit.getStart()));
}
private Token createMatch(Emit emit, String text) {
return new MatchToken(
text.substring(emit.getStart(), emit.getEnd()+1),
- emit,
- Trie.isWholeWord(this.text, emit.getStart(), emit.getEnd()));
+ emit);
}
}
diff --git a/src/main/java/org/ahocorasick/trie/Transition.java b/src/main/java/org/ahocorasick/trie/Transition.java
new file mode 100644
index 0000000..c5ad7d5
--- /dev/null
+++ b/src/main/java/org/ahocorasick/trie/Transition.java
@@ -0,0 +1,69 @@
+/*
+ * Copyright 2015 Rogue Wave Software.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.ahocorasick.trie;
+
+import java.util.Objects;
+
+/**
+ * Enables the trie to model transitions on whole words or characters
+ * ... or whatever! Equality and hashing use the token only, not the position.
+ * @author doug.lovell
+ * @param <T> type of the token matched by this transition
+ */
+public class Transition {
+ protected final T token; // value matched against; the sole input to equals() and hashCode()
+ protected final int start; // position of the token in the source string
+ protected final int length; // number of source characters the token covers
+
+ public Transition(final T token, int start, int length) { // stores the arguments as-is, no validation
+ this.token = token;
+ this.start = start;
+ this.length = length;
+ }
+
+ public int getStart() { // position supplied at construction
+ return start;
+ }
+
+ public int getLength() { // length supplied at construction
+ return length;
+ }
+
+ @Override
+ public String toString() { // human-readable description of token, start and length
+ final int s = getStart();
+ final int len = getLength();
+
+ return "Transition on '" + token + "' start: " + s + ", length: " + len;
+ }
+
+ @Override
+ public int hashCode() { // NOTE(review): throws NullPointerException for a null token, while equals() is null-safe
+ return token.hashCode();
+ }
+
+ @Override
+ public boolean equals(final Object obj) {
+ if (obj == null) {
+ return false;
+ }
+ if (getClass() != obj.getClass()) { // exact runtime class must match, so different subclasses are never equal
+ return false;
+ }
+ final Transition> other = (Transition>) obj;
+ return Objects.equals(this.token, other.token); // start and length play no part in equality
+ }
+}
diff --git a/src/main/java/org/ahocorasick/trie/Trie.java b/src/main/java/org/ahocorasick/trie/Trie.java
index 3c5403d..23d12b3 100644
--- a/src/main/java/org/ahocorasick/trie/Trie.java
+++ b/src/main/java/org/ahocorasick/trie/Trie.java
@@ -10,6 +10,7 @@ import org.ahocorasick.trie.handler.FirstMatchHandler;
import java.util.Collection;
import java.util.Queue;
+import java.util.LinkedList;
import java.util.concurrent.LinkedBlockingDeque;
/**
@@ -19,27 +20,107 @@ import java.util.concurrent.LinkedBlockingDeque;
*/
public class Trie {
- private TrieConfig trieConfig;
+ private final TrieConfig trieConfig;
- private State rootState;
+ private final State rootState;
- private Trie(TrieConfig trieConfig) {
+ private Trie(final TrieConfig trieConfig) {
this.trieConfig = trieConfig;
this.rootState = new State();
}
+
+ private abstract class KeywordTokenizer { // base for tokenizers that walk an input sequence and hand out transitions
+ protected int position = 0; // index of the next unread character in input
+ protected final CharSequence input;
+ protected int length; // cached input.length()
+ protected KeywordTokenizer(final CharSequence input) {
+ this.input = input;
+ this.length = input.length();
+ }
+ protected char currentChar() { // character at position, or NUL once position is past the end
+ return (position < length) ? input.charAt(position) : '\0';
+ }
+ public abstract Transition nextTransition(); // the two subclasses in this class return null once the input is exhausted
+ }
+
+ private class WordTokenizer extends KeywordTokenizer { // splits the input into word-level transitions
+ public WordTokenizer(final CharSequence input) {
+ super(input);
+ }
+ @Override
+ public Transition nextTransition() { // next word transition, or null when only whitespace (or nothing) remains
+ WordTransition t = null;
+ while (position < length && Character.isWhitespace(currentChar())) { // skip whitespace before the token
+ ++position;
+ }
+ int start = position;
+ if (start < length) {
+ do { // the first non-whitespace character is always consumed, even if it is punctuation
+ ++position;
+ } while (position < length && Character.isLetterOrDigit(currentChar())); // then extend over following letters and digits only
+ String word = input.subSequence(start, position).toString();
+ t = new WordTransition(word, start); // start is the offset of the token's first character in input
+ }
+ return t;
+ }
+ }
+
+ private class CharacterTokenizer extends KeywordTokenizer { // emits one transition per input character, whitespace included
+ public CharacterTokenizer(final CharSequence input) {
+ super(input);
+ }
+ @Override
+ public Transition nextTransition() { // transition for the character at position, or null at end of input
+ CharacterTransition t = null;
+ if (position < length) {
+ t = new CharacterTransition(currentChar(), position);
+ position += 1; // advance past the character just emitted
+ }
+ return t;
+ }
+ }
+
+ private class TokenStream { // normalizes text per trieConfig and tokenizes it by word or by character
+ private final KeywordTokenizer tokenizer;
+ private final StringBuilder input; // normalized copy of the text, same length as the original
+
+ public TokenStream(final CharSequence text) {
+ input = new StringBuilder(text.length());
+ for (int p = 0; p < text.length(); ++p) { // copy one char per source char so offsets stay aligned with text
+ char ch = text.charAt(p);
+ input.append(trieConfig.isCaseInsensitive() ?
+ Character.toLowerCase(ch) : ch);
+ }
+ if (trieConfig.isOnlyWholeWords()) { // whole-word mode tokenizes by word, otherwise by single character
+ tokenizer = new WordTokenizer(input);
+ }
+ else {
+ tokenizer = new CharacterTokenizer(input);
+ }
+ }
+
+ public Transition nextTransition() { // delegates to the tokenizer chosen in the constructor
+ return tokenizer.nextTransition();
+ }
+
+ public String input() { // the normalized text as a String
+ return input.toString();
+ }
- private void addKeyword(String keyword) {
+ }
+
+ private void addKeyword(final CharSequence keyword) {
if (keyword == null || keyword.length() == 0) {
return;
}
State currentState = this.rootState;
- for (Character character : keyword.toCharArray()) {
- if (trieConfig.isCaseInsensitive()) {
- character = Character.toLowerCase(character);
- }
- currentState = currentState.addState(character);
+ TokenStream tokenStream = new TokenStream(keyword);
+ Transition transition = tokenStream.nextTransition();
+ while (transition != null) {
+ currentState = currentState.addState(transition);
+ transition = tokenStream.nextTransition();
}
- currentState.addEmit(trieConfig.isCaseInsensitive() ? keyword.toLowerCase() : keyword);
+ currentState.addEmitString(tokenStream.input());
}
public Collection tokenize(String text) {
@@ -47,66 +128,67 @@ public class Trie {
}
@SuppressWarnings("unchecked")
- public Collection parseText(CharSequence text) {
+ public Collection parseText(final CharSequence text) {
DefaultEmitHandler emitHandler = new DefaultEmitHandler();
parseText(text, emitHandler);
return emitHandler.getEmits();
}
- public boolean containsMatch(CharSequence text) {
- Emit firstMatch = firstMatch(text);
- return firstMatch != null;
- }
+ public boolean containsMatch(final CharSequence text) {
+ Emit firstMatch = firstMatch(text);
+ return firstMatch != null;
+ }
- public Emit firstMatch(CharSequence text) {
+ public Emit firstMatch(final CharSequence text) {
FirstMatchHandler emitHandler = new FirstMatchHandler();
parseText(text, emitHandler);
return emitHandler.getFirstMatch();
}
- public void parseText(CharSequence text, EmitHandler emitHandler) {
+ public void parseText(final CharSequence text, final EmitHandler emitHandler) {
final EmitCandidateHolder emitCandidateHolder = this.trieConfig.isAllowOverlaps() ?
new OverlappingEmitCandidateHolder() :
new NonOverlappingEmitCandidateHolder();
- final EmitCandidateFlushHandler flushHandler = new EmitCandidateFlushHandler(emitHandler, emitCandidateHolder);
+ final EmitCandidateFlushHandler flushHandler =
+ new EmitCandidateFlushHandler(emitHandler, emitCandidateHolder);
+ TokenStream tknz = new TokenStream(text);
+
+ LinkedList tknHistory = new LinkedList<>();
State currentState = this.rootState;
- for (int position = 0; position < text.length(); position++) {
-
+ Transition nextTransition = tknz.nextTransition();
+ while (nextTransition != null) {
if (flushHandler.stop()) {
return;
}
-
- Character character = text.charAt(position);
- if (trieConfig.isCaseInsensitive()) {
- character = Character.toLowerCase(character);
+ tknHistory.add(nextTransition);
+ currentState = getState(currentState, nextTransition, flushHandler);
+ Collection emits = currentState.emit();
+ int depth = currentState.getDepth();
+ while (depth < tknHistory.size()) {
+ tknHistory.remove();
}
- currentState = getState(currentState, character, flushHandler);
-
- Collection emits = currentState.emit();
- for (String emit : emits) {
- int start = position - emit.length() + 1;
- if (!trieConfig.isOnlyWholeWords() || isWholeWord(text, start, position)) {
- emitCandidateHolder.addCandidate(new Emit(start, position, emit));
- }
+ int position = nextTransition.getStart() + nextTransition.getLength();
+ for (Keyword emit : emits) {
+ int start = tknHistory.get(depth - emit.getDepth()).getStart();
+ emitCandidateHolder.addCandidate(
+ new Emit(start, position - 1, emit.getText()));
}
-
+ nextTransition = tknz.nextTransition();
}
flushHandler.flush();
}
- public static boolean isWholeWord(CharSequence text, int start, int end) {
- return (start == 0 || Character.isWhitespace(text.charAt(start - 1))) &&
- (end == text.length() - 1 || Character.isWhitespace(text.charAt(end + 1)));
- }
-
- private State getState(State currentState, Character character, EmitCandidateFlushHandler flushHandler) {
- State newCurrentState = currentState.nextState(character);
+ private State getState(final State currentState,
+ final Transition transition,
+ final EmitCandidateFlushHandler flushHandler) {
+ State failState = currentState;
+ State newCurrentState = currentState.nextState(transition);
while (newCurrentState == null) {
- currentState = currentState.failure(flushHandler);
- newCurrentState = currentState.nextState(character);
+ failState = failState.failure(flushHandler);
+ newCurrentState = failState.nextState(transition);
}
return newCurrentState;
}
@@ -124,7 +206,7 @@ public class Trie {
while (!queue.isEmpty()) {
State currentState = queue.remove();
- for (Character transition : currentState.getTransitions()) {
+ for (Transition transition : currentState.getTransitions()) {
State targetState = currentState.nextState(transition);
queue.add(targetState);
@@ -145,9 +227,11 @@ public class Trie {
public static class TrieBuilder {
- private TrieConfig trieConfig = new TrieConfig();
+ private final TrieConfig trieConfig = new TrieConfig();
- private Trie trie = new Trie(trieConfig);
+ private final Trie trie = new Trie(trieConfig);
+
+ private boolean hasAddedKeyword = false;
private TrieBuilder() {}
@@ -162,15 +246,20 @@ public class Trie {
}
public TrieBuilder onlyWholeWords() {
+ if (hasAddedKeyword) {
+ throw new IllegalStateException(
+ "Unable to switch to only whole words after keywords added");
+ }
this.trieConfig.setOnlyWholeWords(true);
return this;
}
public TrieBuilder addKeyword(String keyword) {
trie.addKeyword(keyword);
+ hasAddedKeyword = true;
return this;
}
-
+
public Trie build() {
trie.constructFailureStates();
return trie;
diff --git a/src/main/java/org/ahocorasick/trie/TrieConfig.java b/src/main/java/org/ahocorasick/trie/TrieConfig.java
index 6fa05c7..c556dfa 100644
--- a/src/main/java/org/ahocorasick/trie/TrieConfig.java
+++ b/src/main/java/org/ahocorasick/trie/TrieConfig.java
@@ -7,7 +7,7 @@ public class TrieConfig {
private boolean onlyWholeWords = false;
private boolean caseInsensitive = false;
-
+
public boolean isAllowOverlaps() {
return allowOverlaps;
}
diff --git a/src/main/java/org/ahocorasick/trie/WordTransition.java b/src/main/java/org/ahocorasick/trie/WordTransition.java
new file mode 100644
index 0000000..c1dbd7b
--- /dev/null
+++ b/src/main/java/org/ahocorasick/trie/WordTransition.java
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2015 Rogue Wave Software.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.ahocorasick.trie;
+
+/**
+ * Model transitions on words; the length of a transition is the length of its word.
+ * @author doug.lovell
+ */
+public class WordTransition extends Transition {
+
+ /**
+ * Create a transition from a position in the source string
+ * @param word to match; must not be null because its length is read here
+ * @param start position of first character within the source string
+ */
+ public WordTransition(final String word, int start) {
+ super(word, start, word.length());
+ }
+
+ /**
+ * Create a transition without regard for position (start defaults to 0)
+ * @param word to match
+ */
+ public WordTransition(final String word) {
+ this(word, 0);
+ }
+}
diff --git a/src/test/java/org/ahocorasick/trie/StateTest.java b/src/test/java/org/ahocorasick/trie/StateTest.java
index 2a64370..7036e44 100644
--- a/src/test/java/org/ahocorasick/trie/StateTest.java
+++ b/src/test/java/org/ahocorasick/trie/StateTest.java
@@ -1,6 +1,5 @@
package org.ahocorasick.trie;
-import org.ahocorasick.trie.State;
import org.junit.Test;
import static junit.framework.Assert.assertEquals;
@@ -10,15 +9,36 @@ public class StateTest {
@Test
public void constructSequenceOfCharacters() {
State rootState = new State();
+ Transition a = new CharacterTransition('a');
+ Transition b = new CharacterTransition('b');
+ Transition c = new CharacterTransition('c');
rootState
- .addState('a')
- .addState('b')
- .addState('c');
- State currentState = rootState.nextState('a');
+ .addState(a)
+ .addState(b)
+ .addState(c);
+ State currentState = rootState.nextState(a);
assertEquals(1, currentState.getDepth());
- currentState = currentState.nextState('b');
+ currentState = currentState.nextState(b);
assertEquals(2, currentState.getDepth());
- currentState = currentState.nextState('c');
+ currentState = currentState.nextState(c);
+ assertEquals(3, currentState.getDepth());
+ }
+
+ @Test
+ public void constructSequenceOfWords() {
+ State rootState = new State();
+ Transition a = new WordTransition("Alpha");
+ Transition b = new WordTransition("Bravo");
+ Transition c = new WordTransition("Charlie");
+ rootState
+ .addState(a)
+ .addState(b)
+ .addState(c);
+ State currentState = rootState.nextState(a);
+ assertEquals(1, currentState.getDepth());
+ currentState = currentState.nextState(b);
+ assertEquals(2, currentState.getDepth());
+ currentState = currentState.nextState(c);
assertEquals(3, currentState.getDepth());
}
diff --git a/src/test/java/org/ahocorasick/trie/TrieTest.java b/src/test/java/org/ahocorasick/trie/TrieTest.java
index 03f7924..b55ee07 100644
--- a/src/test/java/org/ahocorasick/trie/TrieTest.java
+++ b/src/test/java/org/ahocorasick/trie/TrieTest.java
@@ -2,6 +2,7 @@ package org.ahocorasick.trie;
import org.ahocorasick.trie.handler.EmitHandler;
import org.ahocorasick.trie.handler.SimpleEmitHandler;
+import org.junit.Assert;
import org.junit.Test;
import java.util.ArrayList;
@@ -10,8 +11,9 @@ import java.util.Iterator;
import java.util.List;
import static junit.framework.Assert.assertEquals;
-import static junit.framework.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
+import org.junit.Rule;
+import org.junit.rules.ExpectedException;
public class TrieTest {
@@ -78,8 +80,9 @@ public class TrieTest {
}
@Test
- public void variousKeywordsFirstMatch() {
+ public void variousKeywordsFirstMatchWordTransitions() {
Trie trie = Trie.builder()
+ .onlyWholeWords()
.addKeyword("abc")
.addKeyword("bcd")
.addKeyword("cde")
@@ -193,6 +196,23 @@ public class TrieTest {
checkEmit(iterator.next(), 51, 58, "broccoli");
}
+@Test
+ public void recipesWordTransitions() {
+ Trie trie = Trie.builder()
+ .onlyWholeWords()
+ .addKeyword("veal")
+ .addKeyword("cauliflower")
+ .addKeyword("broccoli")
+ .addKeyword("tomatoes")
+ .build();
+ Collection emits = trie.parseText("2 cauliflower 3 tomatoes 4 slices of veal 100g broccoli");
+ Iterator iterator = emits.iterator();
+ checkEmit(iterator.next(), 2, 12, "cauliflower");
+ checkEmit(iterator.next(), 16, 23, "tomatoes");
+ checkEmit(iterator.next(), 37, 40, "veal");
+ checkEmit(iterator.next(), 47, 54, "broccoli");
+ }
+
@Test
public void recipesFirstMatch() {
Trie trie = Trie.builder()
@@ -395,6 +415,41 @@ public class TrieTest {
assertToken(tokensIt.next(), " in reserve", false, false, false);
}
+ @Test
+ public void tokenizeFullSentenceByWords() {
+ Trie trie = Trie.builder()
+ .onlyWholeWords()
+ .addKeyword("Alpha")
+ .addKeyword("Beta")
+ .addKeyword("Gamma")
+ .build();
+ Collection tokens = trie.tokenize("Hear: Alpha team first, Beta from the rear, Gamma in reserve");
+ assertEquals(7, tokens.size());
+ Iterator tokensIt = tokens.iterator();
+ assertToken(tokensIt.next(), "Hear: ", false, false, false);
+ assertToken(tokensIt.next(), "Alpha", true, true, false);
+ assertToken(tokensIt.next(), " team first, ", false, false, false);
+ assertToken(tokensIt.next(), "Beta", true, true, false);
+ assertToken(tokensIt.next(), " from the rear, ", false, false, false);
+ assertToken(tokensIt.next(), "Gamma", true, true, false);
+ assertToken(tokensIt.next(), " in reserve", false, false, false);
+ }
+
+ @Rule
+ public ExpectedException thrown = ExpectedException.none();
+
+ @Test
+ public void onlyWholeWordsThrowsExceptionAfterKeywordsAdded()
+ throws IllegalStateException {
+ thrown.expect(IllegalStateException.class);
+ thrown.expectMessage("Unable to switch to only whole words after keywords added");
+ Trie trie = Trie.builder()
+ .addKeyword("Happy for now")
+ .onlyWholeWords()
+ .addKeyword("Not so happy")
+ .build();
+ }
+
@Test
public void bug5InGithubReportedByXCurry() {
Trie trie = Trie.builder().caseInsensitive().onlyWholeWords()
@@ -489,6 +544,22 @@ public class TrieTest {
checkEmit(firstMatch, 5, 8, "this");
}
+ @Test
+ public void unicodeInKeyword() {
+ // The upper case character ('İ') is Unicode,
+ // which was read by AC as a 2-byte char
+ String target = "it is so much LİKE Unicode to mess with Java";
+ Trie trie = Trie.builder()
+ .onlyWholeWords()
+ .addKeyword("so much LİKE Unicode")
+ .addKeyword("it is")
+ .build();
+ Collection emits = trie.parseText(target);
+ Iterator it = emits.iterator();
+ checkEmit(it.next(), 0, 4, "it is");
+ checkEmit(it.next(), 6, 25, "so much LİKE Unicode");
+ }
+
@Test
public void partialMatchWhiteSpaces() {
Trie trie = Trie.builder()
@@ -500,11 +571,61 @@ public class TrieTest {
checkEmit(emits.iterator().next(), 0, 9, "#sugar-123");
}
+ /*
+ For onlyWholeWords, we'll ignore leading and trailing white space
+ included on keywords
+ */
+ @Test
+ public void spacesAroundKeywordByWords() {
+ String text = "lorem ipso facto genera linden pharma six 1";
+ String keyword = " " + text + " ";
+ Trie trie = Trie.builder()
+ .onlyWholeWords()
+ .caseInsensitive()
+ .addKeyword(keyword)
+ .build();
+ Collection < Emit > emits = trie.parseText(
+ text + " under addressed object ");
+ assertEquals(1, emits.size());
+ checkEmit(emits.iterator().next(), 0, text.length() - 1, keyword);
+ }
+
+ @Test
+ public void punctuationInText() {
+ Trie trie = Trie.builder()
+ .onlyWholeWords()
+ .addKeyword("pie")
+ .build();
+
+ Collection emits = trie.parseText("Want some pie? Gimme pie! pie, pie. The pie's revenge.");
+ Assert.assertEquals(5, emits.size());
+ Iterator it = emits.iterator();
+ checkEmit(it.next(), 10, 12, "pie");
+ checkEmit(it.next(), 21, 23, "pie");
+ checkEmit(it.next(), 26, 28, "pie");
+ checkEmit(it.next(), 31, 33, "pie");
+ checkEmit(it.next(), 40, 42, "pie");
+ }
+
+ @Test
+ public void punctuationInSearchTerm() {
+ Trie trie = Trie.builder()
+ .onlyWholeWords()
+ .addKeyword("Dr. Feelgood")
+ .addKeyword("Oi!")
+ .build();
+
+ Collection emits = trie
+ .parseText("The Oi! music genre is inspired by Dr. Feelgood and other bands. Oi or Dr Feelgood should not match.");
+
+ Assert.assertEquals(2, emits.size());
+
+
+ }
+
private void assertToken(Token token, String fragment, boolean match, boolean wholeWord, boolean whiteSpace) {
assertEquals(fragment, token.getFragment());
assertEquals(match, token.isMatch());
- assertEquals(wholeWord, token.isWholeWord());
- assertEquals(whiteSpace, token.isWhiteSpace());
}
private void checkEmit(Emit next, int expectedStart, int expectedEnd, String expectedKeyword) {