Added final modifier. Added helper methods for adding keywords using arrays and collections. Added test for large character strings. Simplified code for adding keywords. Renamed a few methods for consistency. Some code formatting. Updated unit tests with constant arrays, as a first step to reducing the duplication in the unit tests; migrated away from deprecated methods.
This commit is contained in:
parent
8c422583b5
commit
f6a7103f5f
1
.gitignore
vendored
1
.gitignore
vendored
@ -3,3 +3,4 @@
|
||||
src/main/java/Main.java
|
||||
*.txt
|
||||
docs
|
||||
/target/
|
||||
1
pom.xml
1
pom.xml
@ -77,6 +77,7 @@
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-compiler-plugin</artifactId>
|
||||
<version>3.6.0</version>
|
||||
<configuration>
|
||||
<source>1.7</source>
|
||||
<target>1.7</target>
|
||||
|
||||
@ -35,32 +35,34 @@ public class State {
|
||||
* referred to in the white paper as the 'goto' structure. From a state it is possible to go
|
||||
* to other states, depending on the character passed.
|
||||
*/
|
||||
private Map<Character,State> success = new HashMap<Character, State>();
|
||||
private final Map<Character,State> success = new HashMap<>();
|
||||
|
||||
/** if no matching states are found, the failure state will be returned */
|
||||
private State failure = null;
|
||||
private State failure;
|
||||
|
||||
/** whenever this state is reached, it will emit the matched keywords for future reference */
|
||||
private Set<String> emits = null;
|
||||
private Set<String> emits;
|
||||
|
||||
public State() {
|
||||
this(0);
|
||||
}
|
||||
|
||||
public State(int depth) {
|
||||
public State(final int depth) {
|
||||
this.depth = depth;
|
||||
this.rootState = depth == 0 ? this : null;
|
||||
}
|
||||
|
||||
private State nextState(Character character, boolean ignoreRootState) {
|
||||
private State nextState(final Character character, final boolean ignoreRootState) {
|
||||
State nextState = this.success.get(character);
|
||||
|
||||
if (!ignoreRootState && nextState == null && this.rootState != null) {
|
||||
nextState = this.rootState;
|
||||
}
|
||||
|
||||
return nextState;
|
||||
}
|
||||
|
||||
public State nextState(Character character) {
|
||||
public State nextState(final Character character) {
|
||||
return nextState(character, false);
|
||||
}
|
||||
|
||||
@ -68,6 +70,16 @@ public class State {
|
||||
return nextState(character, true);
|
||||
}
|
||||
|
||||
public State addState( String keyword ) {
|
||||
State state = this;
|
||||
|
||||
for (final Character character : keyword.toCharArray()) {
|
||||
state = state.addState(character);
|
||||
}
|
||||
|
||||
return state;
|
||||
}
|
||||
|
||||
public State addState(Character character) {
|
||||
State nextState = nextStateIgnoreRootState(character);
|
||||
if (nextState == null) {
|
||||
@ -113,5 +125,4 @@ public class State {
|
||||
public Collection<Character> getTransitions() {
|
||||
return this.success.keySet();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,59 +1,92 @@
|
||||
package org.ahocorasick.trie;
|
||||
|
||||
import org.ahocorasick.interval.IntervalTree;
|
||||
import org.ahocorasick.interval.Intervalable;
|
||||
import org.ahocorasick.trie.handler.DefaultEmitHandler;
|
||||
import org.ahocorasick.trie.handler.EmitHandler;
|
||||
|
||||
import static java.lang.Character.isWhitespace;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Queue;
|
||||
import java.util.concurrent.LinkedBlockingDeque;
|
||||
import org.ahocorasick.interval.IntervalTree;
|
||||
import org.ahocorasick.interval.Intervalable;
|
||||
import org.ahocorasick.trie.handler.DefaultEmitHandler;
|
||||
import org.ahocorasick.trie.handler.EmitHandler;
|
||||
|
||||
/**
|
||||
* Based on the Aho-Corasick white paper, Bell technologies:
|
||||
* http://cr.yp.to/bib/1975/aho.pdf
|
||||
*
|
||||
* Based on the Aho-Corasick white paper, Bell technologies: http://cr.yp.to/bib/1975/aho.pdf
|
||||
* @author Robert Bor
|
||||
*/
|
||||
public class Trie {
|
||||
|
||||
private TrieConfig trieConfig;
|
||||
private final TrieConfig trieConfig;
|
||||
|
||||
private State rootState;
|
||||
private final State rootState;
|
||||
|
||||
private Trie(TrieConfig trieConfig) {
|
||||
private Trie(final TrieConfig trieConfig) {
|
||||
this.trieConfig = trieConfig;
|
||||
this.rootState = new State();
|
||||
}
|
||||
|
||||
/**
|
||||
* Used by the builder to add a text search keyword.
|
||||
*
|
||||
* @param keyword The search term to add to the list of search terms.
|
||||
*
|
||||
* @throws NullPointerException if the keyword is null.
|
||||
*/
|
||||
private void addKeyword(String keyword) {
|
||||
if (keyword == null || keyword.length() == 0) {
|
||||
return;
|
||||
if( keyword.isEmpty() ) {
|
||||
return;
|
||||
}
|
||||
State currentState = this.rootState;
|
||||
for (Character character : keyword.toCharArray()) {
|
||||
if (trieConfig.isCaseInsensitive()) {
|
||||
character = Character.toLowerCase(character);
|
||||
}
|
||||
currentState = currentState.addState(character);
|
||||
|
||||
if( isCaseInsensitive() ) {
|
||||
keyword = keyword.toLowerCase();
|
||||
}
|
||||
currentState.addEmit(trieConfig.isCaseInsensitive() ? keyword.toLowerCase() : keyword);
|
||||
|
||||
addState(keyword).addEmit(keyword);
|
||||
}
|
||||
|
||||
public Collection<Token> tokenize(String text) {
|
||||
/**
|
||||
* Delegates to addKeyword.
|
||||
*
|
||||
* @param keywords List of search terms to add to the list of search terms.
|
||||
*/
|
||||
private void addKeywords( final String[] keywords ) {
|
||||
for( final String keyword : keywords ) {
|
||||
addKeyword( keyword );
|
||||
}
|
||||
}
|
||||
|
||||
Collection<Token> tokens = new ArrayList<>();
|
||||
/**
|
||||
* Delegates to addKeyword.
|
||||
*
|
||||
* @param keywords List of search terms to add to the list of search terms.
|
||||
*/
|
||||
private void addKeywords( final Collection<String> keywords ) {
|
||||
for( final String keyword : keywords ) {
|
||||
addKeyword( keyword );
|
||||
}
|
||||
}
|
||||
|
||||
Collection<Emit> collectedEmits = parseText(text);
|
||||
private State addState(final String keyword) {
|
||||
return getRootState().addState(keyword);
|
||||
}
|
||||
|
||||
public Collection<Token> tokenize(final String text) {
|
||||
final Collection<Token> tokens = new ArrayList<>();
|
||||
final Collection<Emit> collectedEmits = parseText(text);
|
||||
int lastCollectedPosition = -1;
|
||||
for (Emit emit : collectedEmits) {
|
||||
|
||||
for (final Emit emit : collectedEmits) {
|
||||
if (emit.getStart() - lastCollectedPosition > 1) {
|
||||
tokens.add(createFragment(emit, text, lastCollectedPosition));
|
||||
}
|
||||
|
||||
tokens.add(createMatch(emit, text));
|
||||
lastCollectedPosition = emit.getEnd();
|
||||
}
|
||||
|
||||
if (text.length() - lastCollectedPosition > 1) {
|
||||
tokens.add(createFragment(null, text, lastCollectedPosition));
|
||||
}
|
||||
@ -61,7 +94,7 @@ public class Trie {
|
||||
return tokens;
|
||||
}
|
||||
|
||||
private Token createFragment(Emit emit, String text, int lastCollectedPosition) {
|
||||
private Token createFragment(final Emit emit, final String text, final int lastCollectedPosition) {
|
||||
return new FragmentToken(text.substring(lastCollectedPosition+1, emit == null ? text.length() : emit.getStart()));
|
||||
}
|
||||
|
||||
@ -70,11 +103,11 @@ public class Trie {
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public Collection<Emit> parseText(CharSequence text) {
|
||||
DefaultEmitHandler emitHandler = new DefaultEmitHandler();
|
||||
public Collection<Emit> parseText(final CharSequence text) {
|
||||
final DefaultEmitHandler emitHandler = new DefaultEmitHandler();
|
||||
parseText(text, emitHandler);
|
||||
|
||||
List<Emit> collectedEmits = emitHandler.getEmits();
|
||||
final List<Emit> collectedEmits = emitHandler.getEmits();
|
||||
|
||||
if (trieConfig.isOnlyWholeWords()) {
|
||||
removePartialMatches(text, collectedEmits);
|
||||
@ -92,117 +125,132 @@ public class Trie {
|
||||
return collectedEmits;
|
||||
}
|
||||
|
||||
public boolean containsMatch(CharSequence text) {
|
||||
Emit firstMatch = firstMatch(text);
|
||||
return firstMatch != null;
|
||||
}
|
||||
public boolean containsMatch(final CharSequence text) {
|
||||
return firstMatch(text) != null;
|
||||
}
|
||||
|
||||
public void parseText(final CharSequence text, final EmitHandler emitHandler) {
|
||||
State currentState = getRootState();
|
||||
|
||||
public void parseText(CharSequence text, EmitHandler emitHandler) {
|
||||
State currentState = this.rootState;
|
||||
for (int position = 0; position < text.length(); position++) {
|
||||
Character character = text.charAt(position);
|
||||
|
||||
// TODO: Maybe lowercase the entire string at once?
|
||||
if (trieConfig.isCaseInsensitive()) {
|
||||
character = Character.toLowerCase(character);
|
||||
}
|
||||
|
||||
currentState = getState(currentState, character);
|
||||
if (storeEmits(position, currentState, emitHandler) && trieConfig.isStopOnHit()) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public Emit firstMatch(CharSequence text) {
|
||||
if (!trieConfig.isAllowOverlaps()) {
|
||||
// Slow path. Needs to find all the matches to detect overlaps.
|
||||
Collection<Emit> parseText = parseText(text);
|
||||
if (parseText != null && !parseText.isEmpty()) {
|
||||
return parseText.iterator().next();
|
||||
}
|
||||
} else {
|
||||
// Fast path. Returns first match found.
|
||||
State currentState = this.rootState;
|
||||
public Emit firstMatch(final CharSequence text) {
|
||||
if (!trieConfig.isAllowOverlaps()) {
|
||||
// Slow path. Needs to find all the matches to detect overlaps.
|
||||
Collection<Emit> parseText = parseText(text);
|
||||
if (parseText != null && !parseText.isEmpty()) {
|
||||
return parseText.iterator().next();
|
||||
}
|
||||
} else {
|
||||
// Fast path. Returns first match found.
|
||||
State currentState = getRootState();
|
||||
|
||||
for (int position = 0; position < text.length(); position++) {
|
||||
Character character = text.charAt(position);
|
||||
if (trieConfig.isCaseInsensitive()) {
|
||||
character = Character.toLowerCase(character);
|
||||
}
|
||||
currentState = getState(currentState, character);
|
||||
Collection<String> emitStrs = currentState.emit();
|
||||
if (emitStrs != null && !emitStrs.isEmpty()) {
|
||||
for (String emitStr : emitStrs) {
|
||||
final Emit emit = new Emit(position - emitStr.length() + 1, position, emitStr);
|
||||
if (trieConfig.isOnlyWholeWords()) {
|
||||
if (!isPartialMatch(text, emit)) {
|
||||
return emit;
|
||||
}
|
||||
} else {
|
||||
return emit;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private boolean isPartialMatch(CharSequence searchText, Emit emit) {
|
||||
return (emit.getStart() != 0 &&
|
||||
Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) ||
|
||||
(emit.getEnd() + 1 != searchText.length() &&
|
||||
Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1)));
|
||||
}
|
||||
// TODO: Lowercase the entire string at once?
|
||||
if (trieConfig.isCaseInsensitive()) {
|
||||
character = Character.toLowerCase(character);
|
||||
}
|
||||
|
||||
private void removePartialMatches(CharSequence searchText, List<Emit> collectedEmits) {
|
||||
List<Emit> removeEmits = new ArrayList<>();
|
||||
for (Emit emit : collectedEmits) {
|
||||
if (isPartialMatch(searchText, emit)) {
|
||||
removeEmits.add(emit);
|
||||
}
|
||||
}
|
||||
for (Emit removeEmit : removeEmits) {
|
||||
collectedEmits.remove(removeEmit);
|
||||
}
|
||||
}
|
||||
currentState = getState(currentState, character);
|
||||
Collection<String> emitStrs = currentState.emit();
|
||||
|
||||
private void removePartialMatchesWhiteSpaceSeparated(CharSequence searchText, List<Emit> collectedEmits) {
|
||||
long size = searchText.length();
|
||||
List<Emit> removeEmits = new ArrayList<>();
|
||||
for (Emit emit : collectedEmits) {
|
||||
if ((emit.getStart() == 0 || Character.isWhitespace(searchText.charAt(emit.getStart() - 1))) &&
|
||||
(emit.getEnd() + 1 == size || Character.isWhitespace(searchText.charAt(emit.getEnd() + 1)))) {
|
||||
continue;
|
||||
if (emitStrs != null && !emitStrs.isEmpty()) {
|
||||
for (String emitStr : emitStrs) {
|
||||
final Emit emit = new Emit(position - emitStr.length() + 1, position, emitStr);
|
||||
if (trieConfig.isOnlyWholeWords()) {
|
||||
if (!isPartialMatch(text, emit)) {
|
||||
return emit;
|
||||
}
|
||||
} else {
|
||||
return emit;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
removeEmits.add(emit);
|
||||
}
|
||||
for (Emit removeEmit : removeEmits) {
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
private boolean isPartialMatch(final CharSequence searchText, final Emit emit) {
|
||||
return (emit.getStart() != 0 &&
|
||||
Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) ||
|
||||
(emit.getEnd() + 1 != searchText.length() &&
|
||||
Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1)));
|
||||
}
|
||||
|
||||
private void removePartialMatches(final CharSequence searchText, final List<Emit> collectedEmits) {
|
||||
final List<Emit> removeEmits = new ArrayList<>();
|
||||
|
||||
for (final Emit emit : collectedEmits) {
|
||||
if (isPartialMatch(searchText, emit)) {
|
||||
removeEmits.add(emit);
|
||||
}
|
||||
}
|
||||
|
||||
for (final Emit removeEmit : removeEmits) {
|
||||
collectedEmits.remove(removeEmit);
|
||||
}
|
||||
}
|
||||
|
||||
private State getState(State currentState, Character character) {
|
||||
private void removePartialMatchesWhiteSpaceSeparated(final CharSequence searchText, final List<Emit> collectedEmits) {
|
||||
final long size = searchText.length();
|
||||
final List<Emit> removeEmits = new ArrayList<>();
|
||||
|
||||
for (final Emit emit : collectedEmits) {
|
||||
if ((emit.getStart() == 0 || isWhitespace(searchText.charAt(emit.getStart() - 1))) &&
|
||||
(emit.getEnd() + 1 == size || isWhitespace(searchText.charAt(emit.getEnd() + 1)))) {
|
||||
continue;
|
||||
}
|
||||
removeEmits.add(emit);
|
||||
}
|
||||
|
||||
for (final Emit removeEmit : removeEmits) {
|
||||
collectedEmits.remove(removeEmit);
|
||||
}
|
||||
}
|
||||
|
||||
private State getState(State currentState, final Character character) {
|
||||
State newCurrentState = currentState.nextState(character);
|
||||
|
||||
while (newCurrentState == null) {
|
||||
currentState = currentState.failure();
|
||||
newCurrentState = currentState.nextState(character);
|
||||
}
|
||||
|
||||
return newCurrentState;
|
||||
}
|
||||
|
||||
private void constructFailureStates() {
|
||||
Queue<State> queue = new LinkedBlockingDeque<>();
|
||||
final Queue<State> queue = new LinkedBlockingDeque<>();
|
||||
final State startState = getRootState();
|
||||
|
||||
// First, set the fail state of all depth 1 states to the root state
|
||||
for (State depthOneState : this.rootState.getStates()) {
|
||||
depthOneState.setFailure(this.rootState);
|
||||
for (State depthOneState : startState.getStates()) {
|
||||
depthOneState.setFailure(startState);
|
||||
queue.add(depthOneState);
|
||||
}
|
||||
|
||||
// Second, determine the fail state for all depth > 1 state
|
||||
while (!queue.isEmpty()) {
|
||||
State currentState = queue.remove();
|
||||
final State currentState = queue.remove();
|
||||
|
||||
for (Character transition : currentState.getTransitions()) {
|
||||
for (final Character transition : currentState.getTransitions()) {
|
||||
State targetState = currentState.nextState(transition);
|
||||
queue.add(targetState);
|
||||
|
||||
@ -210,70 +258,174 @@ public class Trie {
|
||||
while (traceFailureState.nextState(transition) == null) {
|
||||
traceFailureState = traceFailureState.failure();
|
||||
}
|
||||
State newFailureState = traceFailureState.nextState(transition);
|
||||
|
||||
final State newFailureState = traceFailureState.nextState(transition);
|
||||
targetState.setFailure(newFailureState);
|
||||
targetState.addEmit(newFailureState.emit());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private boolean storeEmits(int position, State currentState, EmitHandler emitHandler) {
|
||||
private boolean storeEmits(final int position, final State currentState, final EmitHandler emitHandler) {
|
||||
boolean emitted = false;
|
||||
Collection<String> emits = currentState.emit();
|
||||
final Collection<String> emits = currentState.emit();
|
||||
|
||||
// TODO: The check for empty might be superfluous.
|
||||
if (emits != null && !emits.isEmpty()) {
|
||||
for (String emit : emits) {
|
||||
for (final String emit : emits) {
|
||||
emitHandler.emit(new Emit(position - emit.length() + 1, position, emit));
|
||||
emitted = true;
|
||||
}
|
||||
}
|
||||
|
||||
return emitted;
|
||||
}
|
||||
|
||||
private boolean isCaseInsensitive() {
|
||||
return trieConfig.isCaseInsensitive();
|
||||
}
|
||||
|
||||
private State getRootState() {
|
||||
return this.rootState;
|
||||
}
|
||||
|
||||
/**
|
||||
* Provides a fluent interface for constructing Trie instances.
|
||||
*
|
||||
* @return The builder used to configure its Trie.
|
||||
*/
|
||||
public static TrieBuilder builder() {
|
||||
return new TrieBuilder();
|
||||
}
|
||||
|
||||
public static class TrieBuilder {
|
||||
|
||||
private TrieConfig trieConfig = new TrieConfig();
|
||||
private final TrieConfig trieConfig = new TrieConfig();
|
||||
|
||||
private Trie trie = new Trie(trieConfig);
|
||||
private final Trie trie = new Trie(trieConfig);
|
||||
|
||||
/**
|
||||
* Default (empty) constructor.
|
||||
*/
|
||||
private TrieBuilder() {}
|
||||
|
||||
public TrieBuilder caseInsensitive() {
|
||||
/**
|
||||
* Adds a keyword to the Trie's list of text search keywords.
|
||||
*
|
||||
* @param keyword The keyword to add to the list.
|
||||
*
|
||||
* @return This builder.
|
||||
* @throws NullPointerException if the keyword is null.
|
||||
*/
|
||||
public TrieBuilder addKeyword(final String keyword) {
|
||||
this.trie.addKeyword(keyword);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a list of keywords to the Trie's list of text search keywords.
|
||||
*
|
||||
* @param keywords The keywords to add to the list.
|
||||
*
|
||||
* @return This builder.
|
||||
*/
|
||||
public TrieBuilder addKeywords(final String... keywords) {
|
||||
this.trie.addKeywords(keywords);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a list of keywords to the Trie's list of text search keywords.
|
||||
*
|
||||
* @param keywords The keywords to add to the list.
|
||||
*
|
||||
* @return This builder.
|
||||
*/
|
||||
public TrieBuilder addKeywords(final Collection<String> keywords) {
|
||||
this.trie.addKeywords(keywords);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Configure the Trie to ignore case when searching for keywords in
|
||||
* the text.
|
||||
*
|
||||
* @return This builder.
|
||||
*/
|
||||
public TrieBuilder ignoreCase() {
|
||||
this.trieConfig.setCaseInsensitive(true);
|
||||
return this;
|
||||
}
|
||||
|
||||
public TrieBuilder removeOverlaps() {
|
||||
/**
|
||||
* Configure the Trie to ignore overlapping keywords.
|
||||
*
|
||||
* @return This builder.
|
||||
*/
|
||||
public TrieBuilder ignoreOverlaps() {
|
||||
this.trieConfig.setAllowOverlaps(false);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Configure the Trie to match whole keywords in the text.
|
||||
*
|
||||
* @return This builder.
|
||||
*/
|
||||
public TrieBuilder onlyWholeWords() {
|
||||
this.trieConfig.setOnlyWholeWords(true);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Configure the Trie to match whole keywords that are separated by
|
||||
* whitespace in the text. For example, "this keyword thatkeyword"
|
||||
* would only match the first occurrence of "keyword".
|
||||
*
|
||||
* @return This builder.
|
||||
*/
|
||||
public TrieBuilder onlyWholeWordsWhiteSpaceSeparated() {
|
||||
this.trieConfig.setOnlyWholeWordsWhiteSpaceSeparated(true);
|
||||
return this;
|
||||
}
|
||||
|
||||
public TrieBuilder addKeyword(String keyword) {
|
||||
trie.addKeyword(keyword);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Configure the Trie to stop after the first keyword is found in the
|
||||
* text.
|
||||
*
|
||||
* @return This builder.
|
||||
*/
|
||||
public TrieBuilder stopOnHit() {
|
||||
trie.trieConfig.setStopOnHit(true);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Configure the Trie based on the builder settings.
|
||||
*
|
||||
* @return The configured Trie.
|
||||
*/
|
||||
public Trie build() {
|
||||
trie.constructFailureStates();
|
||||
return trie;
|
||||
this.trie.constructFailureStates();
|
||||
return this.trie;
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated Use ignoreCase()
|
||||
*
|
||||
* @return This builder.
|
||||
*/
|
||||
public TrieBuilder caseInsensitive() {
|
||||
return ignoreCase();
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated Use ignoreOverlaps()
|
||||
*
|
||||
* @return This builder.
|
||||
*/
|
||||
public TrieBuilder removeOverlaps() {
|
||||
return ignoreOverlaps();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,62 +1,78 @@
|
||||
package org.ahocorasick.trie;
|
||||
|
||||
import org.ahocorasick.trie.handler.EmitHandler;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import java.util.concurrent.ThreadLocalRandom;
|
||||
import static junit.framework.Assert.assertEquals;
|
||||
import org.ahocorasick.trie.handler.EmitHandler;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import org.junit.Test;
|
||||
|
||||
public class TrieTest {
|
||||
private final static String[] ALPHABET = new String[]{
|
||||
"abc", "bcd", "cde"
|
||||
};
|
||||
|
||||
private final static String[] PRONOUNS = new String[]{
|
||||
"hers", "his", "she", "he"
|
||||
};
|
||||
|
||||
private final static String[] FOOD = new String[]{
|
||||
"veal", "cauliflower", "broccoli", "tomatoes"
|
||||
};
|
||||
|
||||
private final static String[] GREEK_LETTERS = new String[]{
|
||||
"Alpha", "Beta", "Gamma"
|
||||
};
|
||||
|
||||
private final static String[] UNICODE = new String[]{
|
||||
"turning", "once", "again", "börkü"
|
||||
};
|
||||
|
||||
@Test
|
||||
public void keywordAndTextAreTheSame() {
|
||||
Trie trie = Trie.builder()
|
||||
.addKeyword("abc")
|
||||
.addKeyword(ALPHABET[0])
|
||||
.build();
|
||||
Collection<Emit> emits = trie.parseText("abc");
|
||||
Collection<Emit> emits = trie.parseText(ALPHABET[0]);
|
||||
Iterator<Emit> iterator = emits.iterator();
|
||||
checkEmit(iterator.next(), 0, 2, "abc");
|
||||
checkEmit(iterator.next(), 0, 2, ALPHABET[0]);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void keywordAndTextAreTheSameFirstMatch() {
|
||||
Trie trie = Trie.builder()
|
||||
.addKeyword("abc")
|
||||
.addKeyword(ALPHABET[0])
|
||||
.build();
|
||||
Emit firstMatch = trie.firstMatch("abc");
|
||||
checkEmit(firstMatch, 0, 2, "abc");
|
||||
Emit firstMatch = trie.firstMatch(ALPHABET[0]);
|
||||
checkEmit(firstMatch, 0, 2, ALPHABET[0]);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void textIsLongerThanKeyword() {
|
||||
Trie trie = Trie.builder()
|
||||
.addKeyword("abc")
|
||||
.addKeyword(ALPHABET[0])
|
||||
.build();
|
||||
Collection<Emit> emits = trie.parseText(" abc");
|
||||
Collection<Emit> emits = trie.parseText(" " + ALPHABET[0]);
|
||||
Iterator<Emit> iterator = emits.iterator();
|
||||
checkEmit(iterator.next(), 1, 3, "abc");
|
||||
checkEmit(iterator.next(), 1, 3, ALPHABET[0]);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void textIsLongerThanKeywordFirstMatch() {
|
||||
Trie trie = Trie.builder()
|
||||
.addKeyword("abc")
|
||||
.addKeyword(ALPHABET[0])
|
||||
.build();
|
||||
Emit firstMatch = trie.firstMatch(" abc");
|
||||
checkEmit(firstMatch, 1, 3, "abc");
|
||||
Emit firstMatch = trie.firstMatch(" " + ALPHABET[0]);
|
||||
checkEmit(firstMatch, 1, 3, ALPHABET[0]);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void variousKeywordsOneMatch() {
|
||||
Trie trie = Trie.builder()
|
||||
.addKeyword("abc")
|
||||
.addKeyword("bcd")
|
||||
.addKeyword("cde")
|
||||
.addKeywords(ALPHABET)
|
||||
.build();
|
||||
Collection<Emit> emits = trie.parseText("bcd");
|
||||
Iterator<Emit> iterator = emits.iterator();
|
||||
@ -66,9 +82,7 @@ public class TrieTest {
|
||||
@Test
|
||||
public void variousKeywordsFirstMatch() {
|
||||
Trie trie = Trie.builder()
|
||||
.addKeyword("abc")
|
||||
.addKeyword("bcd")
|
||||
.addKeyword("cde")
|
||||
.addKeywords(ALPHABET)
|
||||
.build();
|
||||
Emit firstMatch = trie.firstMatch("bcd");
|
||||
checkEmit(firstMatch, 0, 2, "bcd");
|
||||
@ -77,10 +91,7 @@ public class TrieTest {
|
||||
@Test
|
||||
public void ushersTestAndStopOnHit() {
|
||||
Trie trie = Trie.builder()
|
||||
.addKeyword("hers")
|
||||
.addKeyword("his")
|
||||
.addKeyword("she")
|
||||
.addKeyword("he")
|
||||
.addKeywords(PRONOUNS)
|
||||
.stopOnHit()
|
||||
.build();
|
||||
Collection<Emit> emits = trie.parseText("ushers");
|
||||
@ -93,10 +104,7 @@ public class TrieTest {
|
||||
@Test
|
||||
public void ushersTest() {
|
||||
Trie trie = Trie.builder()
|
||||
.addKeyword("hers")
|
||||
.addKeyword("his")
|
||||
.addKeyword("she")
|
||||
.addKeyword("he")
|
||||
.addKeywords(PRONOUNS)
|
||||
.build();
|
||||
Collection<Emit> emits = trie.parseText("ushers");
|
||||
assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5
|
||||
@ -109,7 +117,7 @@ public class TrieTest {
|
||||
@Test
|
||||
public void ushersTestWithCapitalKeywords() {
|
||||
Trie trie = Trie.builder()
|
||||
.caseInsensitive()
|
||||
.ignoreCase()
|
||||
.addKeyword("HERS")
|
||||
.addKeyword("HIS")
|
||||
.addKeyword("SHE")
|
||||
@ -126,10 +134,7 @@ public class TrieTest {
|
||||
@Test
|
||||
public void ushersTestFirstMatch() {
|
||||
Trie trie = Trie.builder()
|
||||
.addKeyword("hers")
|
||||
.addKeyword("his")
|
||||
.addKeyword("she")
|
||||
.addKeyword("he")
|
||||
.addKeywords(PRONOUNS)
|
||||
.build();
|
||||
Emit firstMatch = trie.firstMatch("ushers");
|
||||
checkEmit(firstMatch, 2, 3, "he");
|
||||
@ -138,10 +143,7 @@ public class TrieTest {
|
||||
@Test
|
||||
public void ushersTestByCallback() {
|
||||
Trie trie = Trie.builder()
|
||||
.addKeyword("hers")
|
||||
.addKeyword("his")
|
||||
.addKeyword("she")
|
||||
.addKeyword("he")
|
||||
.addKeywords(PRONOUNS)
|
||||
.build();
|
||||
|
||||
final List<Emit> emits = new ArrayList<>();
|
||||
@ -182,10 +184,7 @@ public class TrieTest {
|
||||
@Test
|
||||
public void recipes() {
|
||||
Trie trie = Trie.builder()
|
||||
.addKeyword("veal")
|
||||
.addKeyword("cauliflower")
|
||||
.addKeyword("broccoli")
|
||||
.addKeyword("tomatoes")
|
||||
.addKeywords(FOOD)
|
||||
.build();
|
||||
Collection<Emit> emits = trie.parseText("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli");
|
||||
Iterator<Emit> iterator = emits.iterator();
|
||||
@ -198,10 +197,7 @@ public class TrieTest {
|
||||
@Test
|
||||
public void recipesFirstMatch() {
|
||||
Trie trie = Trie.builder()
|
||||
.addKeyword("veal")
|
||||
.addKeyword("cauliflower")
|
||||
.addKeyword("broccoli")
|
||||
.addKeyword("tomatoes")
|
||||
.addKeywords(FOOD)
|
||||
.build();
|
||||
Emit firstMatch = trie.firstMatch("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli");
|
||||
|
||||
@ -305,9 +301,7 @@ public class TrieTest {
|
||||
@Test
|
||||
public void tokenizeFullSentence() {
|
||||
Trie trie = Trie.builder()
|
||||
.addKeyword("Alpha")
|
||||
.addKeyword("Beta")
|
||||
.addKeyword("Gamma")
|
||||
.addKeywords(GREEK_LETTERS)
|
||||
.build();
|
||||
Collection<Token> tokens = trie.tokenize("Hear: Alpha team first, Beta from the rear, Gamma in reserve");
|
||||
assertEquals(7, tokens.size());
|
||||
@ -321,13 +315,11 @@ public class TrieTest {
|
||||
assertEquals(" in reserve", tokensIt.next().getFragment());
|
||||
}
|
||||
|
||||
// @see https://github.com/robert-bor/aho-corasick/issues/5
|
||||
@Test
|
||||
public void bug5InGithubReportedByXCurry() {
|
||||
Trie trie = Trie.builder().caseInsensitive().onlyWholeWords()
|
||||
.addKeyword("turning")
|
||||
.addKeyword("once")
|
||||
.addKeyword("again")
|
||||
.addKeyword("börkü")
|
||||
public void testStringIndexOutOfBoundsException() {
|
||||
Trie trie = Trie.builder().ignoreCase().onlyWholeWords()
|
||||
.addKeywords(UNICODE)
|
||||
.build();
|
||||
Collection<Emit> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ");
|
||||
assertEquals(4, emits.size()); // Match must not be made
|
||||
@ -339,12 +331,9 @@ public class TrieTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
public void caseInsensitive() {
|
||||
Trie trie = Trie.builder().caseInsensitive()
|
||||
.addKeyword("turning")
|
||||
.addKeyword("once")
|
||||
.addKeyword("again")
|
||||
.addKeyword("börkü")
|
||||
public void testIgnoreCase() {
|
||||
Trie trie = Trie.builder().ignoreCase()
|
||||
.addKeywords(UNICODE)
|
||||
.build();
|
||||
Collection<Emit> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ");
|
||||
assertEquals(4, emits.size()); // Match must not be made
|
||||
@ -356,12 +345,9 @@ public class TrieTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
public void caseInsensitiveFirstMatch() {
|
||||
Trie trie = Trie.builder().caseInsensitive()
|
||||
.addKeyword("turning")
|
||||
.addKeyword("once")
|
||||
.addKeyword("again")
|
||||
.addKeyword("börkü")
|
||||
public void testIgnoreCaseFirstMatch() {
|
||||
Trie trie = Trie.builder().ignoreCase()
|
||||
.addKeywords(UNICODE)
|
||||
.build();
|
||||
Emit firstMatch = trie.firstMatch("TurninG OnCe AgAiN BÖRKÜ");
|
||||
|
||||
@ -371,29 +357,27 @@ public class TrieTest {
|
||||
@Test
|
||||
public void tokenizeTokensInSequence() {
|
||||
Trie trie = Trie.builder()
|
||||
.addKeyword("Alpha")
|
||||
.addKeyword("Beta")
|
||||
.addKeyword("Gamma")
|
||||
.addKeywords(GREEK_LETTERS)
|
||||
.build();
|
||||
Collection<Token> tokens = trie.tokenize("Alpha Beta Gamma");
|
||||
assertEquals(5, tokens.size());
|
||||
}
|
||||
|
||||
// Test offered by XCurry, https://github.com/robert-bor/aho-corasick/issues/7
|
||||
// @see https://github.com/robert-bor/aho-corasick/issues/7
|
||||
@Test
|
||||
public void zeroLengthTestBug7InGithubReportedByXCurry() {
|
||||
Trie trie = Trie.builder().removeOverlaps().onlyWholeWords().caseInsensitive()
|
||||
public void testZeroLength() {
|
||||
Trie trie = Trie.builder().ignoreOverlaps().onlyWholeWords().ignoreCase()
|
||||
.addKeyword("")
|
||||
.build();
|
||||
trie.tokenize("Try a natural lip and subtle bronzer to keep all the focus on those big bright eyes with NARS Eyeshadow Duo in Rated R And the winner is... Boots No7 Advanced Renewal Anti-ageing Glycolic Peel Kit ($25 amazon.com) won most-appealing peel.");
|
||||
}
|
||||
|
||||
// Test offered by dwyerk, https://github.com/robert-bor/aho-corasick/issues/8
|
||||
// @see https://github.com/robert-bor/aho-corasick/issues/8
|
||||
@Test
|
||||
public void unicodeIssueBug8ReportedByDwyerk() {
|
||||
public void testUnicode1() {
|
||||
String target = "LİKE THIS"; // The second character ('İ') is Unicode, which was read by AC as a 2-byte char
|
||||
assertEquals("THIS", target.substring(5, 9)); // Java does it the right way
|
||||
Trie trie = Trie.builder().caseInsensitive().onlyWholeWords()
|
||||
Trie trie = Trie.builder().ignoreCase().onlyWholeWords()
|
||||
.addKeyword("this")
|
||||
.build();
|
||||
Collection<Emit> emits = trie.parseText(target);
|
||||
@ -402,11 +386,12 @@ public class TrieTest {
|
||||
checkEmit(it.next(), 5, 8, "this");
|
||||
}
|
||||
|
||||
// @see https://github.com/robert-bor/aho-corasick/issues/8
|
||||
@Test
|
||||
public void unicodeIssueBug8ReportedByDwyerkFirstMatch() {
|
||||
public void testUnicode2() {
|
||||
String target = "LİKE THIS"; // The second character ('İ') is Unicode, which was read by AC as a 2-byte char
|
||||
Trie trie = Trie.builder()
|
||||
.caseInsensitive()
|
||||
.ignoreCase()
|
||||
.onlyWholeWords()
|
||||
.addKeyword("this")
|
||||
.build();
|
||||
@ -416,7 +401,7 @@ public class TrieTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
public void partialMatchWhiteSpaces() {
|
||||
public void testPartialMatchWhiteSpaces() {
|
||||
Trie trie = Trie.builder()
|
||||
.onlyWholeWordsWhiteSpaceSeparated()
|
||||
.addKeyword("#sugar-123")
|
||||
@ -426,10 +411,66 @@ public class TrieTest {
|
||||
checkEmit(emits.iterator().next(), 0, 9, "#sugar-123");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLargeString() {
|
||||
final int interval = 100;
|
||||
final int textSize = 1000000;
|
||||
final String keyword = FOOD[ 1 ];
|
||||
final StringBuilder text = randomNumbers( textSize );
|
||||
|
||||
injectKeyword( text, keyword, interval );
|
||||
|
||||
Trie trie = Trie.builder()
|
||||
.onlyWholeWords()
|
||||
.addKeyword( keyword )
|
||||
.build();
|
||||
|
||||
final Collection<Emit> emits = trie.parseText( text );
|
||||
|
||||
assertEquals( textSize / interval, emits.size() );
|
||||
}
|
||||
|
||||
/**
|
||||
* Generates a random sequence of ASCII numbers.
|
||||
*
|
||||
* @param count The number of numbers to generate.
|
||||
* @return A character sequence filled with random digits.
|
||||
*/
|
||||
private StringBuilder randomNumbers( int count ) {
|
||||
final StringBuilder sb = new StringBuilder( count );
|
||||
|
||||
while( --count > 0 ) {
|
||||
sb.append( randomInt( 0, 10 ) );
|
||||
}
|
||||
|
||||
return sb;
|
||||
}
|
||||
|
||||
/**
|
||||
* Injects keywords into a string builder.
|
||||
*
|
||||
* @param source Should contain a bunch of random data that cannot match
|
||||
* any keyword.
|
||||
* @param keyword A keyword to inject repeatedly in the text.
|
||||
* @param interval How often to inject the keyword.
|
||||
*/
|
||||
private void injectKeyword(
|
||||
final StringBuilder source,
|
||||
final String keyword,
|
||||
final int interval ) {
|
||||
final int length = source.length();
|
||||
for( int i = 0; i < length; i += interval ) {
|
||||
source.replace( i, i + keyword.length(), keyword );
|
||||
}
|
||||
}
|
||||
|
||||
private int randomInt( final int min, final int max ) {
|
||||
return ThreadLocalRandom.current().nextInt( min, max );
|
||||
}
|
||||
|
||||
private void checkEmit(Emit next, int expectedStart, int expectedEnd, String expectedKeyword) {
|
||||
assertEquals("Start of emit should have been " + expectedStart, expectedStart, next.getStart());
|
||||
assertEquals("End of emit should have been " + expectedEnd, expectedEnd, next.getEnd());
|
||||
assertEquals(expectedKeyword, next.getKeyword());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user