diff --git a/.gitignore b/.gitignore index 7b0cbc7..8ba608e 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,5 @@ *.iml src/main/java/Main.java *.txt -docs \ No newline at end of file +docs +/target/ \ No newline at end of file diff --git a/pom.xml b/pom.xml index 9d964e4..919b66d 100644 --- a/pom.xml +++ b/pom.xml @@ -77,6 +77,7 @@ org.apache.maven.plugins maven-compiler-plugin + 3.6.0 1.7 1.7 diff --git a/src/main/java/org/ahocorasick/trie/State.java b/src/main/java/org/ahocorasick/trie/State.java index 5220f72..0055d91 100644 --- a/src/main/java/org/ahocorasick/trie/State.java +++ b/src/main/java/org/ahocorasick/trie/State.java @@ -35,38 +35,50 @@ public class State { * referred to in the white paper as the 'goto' structure. From a state it is possible to go * to other states, depending on the character passed. */ - private Map success = new HashMap(); + private final Map success = new HashMap<>(); /** if no matching states are found, the failure state will be returned */ - private State failure = null; + private State failure; /** whenever this state is reached, it will emit the matches keywords for future reference */ - private Set emits = null; + private Set emits; public State() { this(0); } - public State(int depth) { + public State(final int depth) { this.depth = depth; this.rootState = depth == 0 ? this : null; } - private State nextState(Character character, boolean ignoreRootState) { + private State nextState(final Character character, final boolean ignoreRootState) { State nextState = this.success.get(character); + if (!ignoreRootState && nextState == null && this.rootState != null) { nextState = this.rootState; } + return nextState; } - public State nextState(Character character) { + public State nextState(final Character character) { return nextState(character, false); } public State nextStateIgnoreRootState(Character character) { return nextState(character, true); } + + public State addState( String keyword ) { + State state = this; + + for (final Character character : keyword.toCharArray()) { + state = state.addState(character); + } + + return state; + } public State addState(Character character) { State nextState = nextStateIgnoreRootState(character); @@ -113,5 +125,4 @@ public class State { public Collection getTransitions() { return this.success.keySet(); } - } diff --git a/src/main/java/org/ahocorasick/trie/Trie.java b/src/main/java/org/ahocorasick/trie/Trie.java index 0b62c82..88097a5 100644 --- a/src/main/java/org/ahocorasick/trie/Trie.java +++ b/src/main/java/org/ahocorasick/trie/Trie.java @@ -1,59 +1,92 @@ package org.ahocorasick.trie; -import org.ahocorasick.interval.IntervalTree; -import org.ahocorasick.interval.Intervalable; -import org.ahocorasick.trie.handler.DefaultEmitHandler; -import org.ahocorasick.trie.handler.EmitHandler; - +import static java.lang.Character.isWhitespace; import java.util.ArrayList; import java.util.Collection; import java.util.List; import java.util.Queue; import java.util.concurrent.LinkedBlockingDeque; +import org.ahocorasick.interval.IntervalTree; +import org.ahocorasick.interval.Intervalable; +import org.ahocorasick.trie.handler.DefaultEmitHandler; +import org.ahocorasick.trie.handler.EmitHandler; /** - * - * Based on the Aho-Corasick white paper, Bell technologies: http://cr.yp.to/bib/1975/aho.pdf + * Based on the Aho-Corasick white paper, Bell technologies: + * http://cr.yp.to/bib/1975/aho.pdf + * * @author Robert Bor */ public class Trie { - private TrieConfig trieConfig; + private final TrieConfig trieConfig; - private State rootState; + private final State rootState; - private Trie(TrieConfig trieConfig) { + private Trie(final TrieConfig trieConfig) { this.trieConfig = trieConfig; this.rootState = new State(); } - + + /** + * Used by the builder to add a text search keyword. + * + * @param keyword The search term to add to the list of search terms. + * + * @throws NullPointerException if the keyword is null. + */ private void addKeyword(String keyword) { - if (keyword == null || keyword.length() == 0) { - return; + if( keyword.isEmpty() ) { + return; } - State currentState = this.rootState; - for (Character character : keyword.toCharArray()) { - if (trieConfig.isCaseInsensitive()) { - character = Character.toLowerCase(character); - } - currentState = currentState.addState(character); + + if( isCaseInsensitive() ) { + keyword = keyword.toLowerCase(); } - currentState.addEmit(trieConfig.isCaseInsensitive() ? keyword.toLowerCase() : keyword); + + addState(keyword).addEmit(keyword); } - public Collection tokenize(String text) { + /** + * Delegates to addKeyword. + * + * @param keywords List of search term to add to the list of search terms. + */ + private void addKeywords( final String[] keywords ) { + for( final String keyword : keywords ) { + addKeyword( keyword ); + } + } + + /** + * Delegates to addKeyword. + * + * @param keywords List of search term to add to the list of search terms. + */ + private void addKeywords( final Collection keywords ) { + for( final String keyword : keywords ) { + addKeyword( keyword ); + } + } - Collection tokens = new ArrayList<>(); - - Collection collectedEmits = parseText(text); + private State addState(final String keyword) { + return getRootState().addState(keyword); + } + + public Collection tokenize(final String text) { + final Collection tokens = new ArrayList<>(); + final Collection collectedEmits = parseText(text); int lastCollectedPosition = -1; - for (Emit emit : collectedEmits) { + + for (final Emit emit : collectedEmits) { if (emit.getStart() - lastCollectedPosition > 1) { tokens.add(createFragment(emit, text, lastCollectedPosition)); } + tokens.add(createMatch(emit, text)); lastCollectedPosition = emit.getEnd(); } + if (text.length() - lastCollectedPosition > 1) { tokens.add(createFragment(null, text, lastCollectedPosition)); } @@ -61,7 +94,7 @@ public class Trie { return tokens; } - private Token createFragment(Emit emit, String text, int lastCollectedPosition) { + private Token createFragment(final Emit emit, final String text, final int lastCollectedPosition) { return new FragmentToken(text.substring(lastCollectedPosition+1, emit == null ? text.length() : emit.getStart())); } @@ -70,11 +103,11 @@ public class Trie { } @SuppressWarnings("unchecked") - public Collection parseText(CharSequence text) { - DefaultEmitHandler emitHandler = new DefaultEmitHandler(); + public Collection parseText(final CharSequence text) { + final DefaultEmitHandler emitHandler = new DefaultEmitHandler(); parseText(text, emitHandler); - List collectedEmits = emitHandler.getEmits(); + final List collectedEmits = emitHandler.getEmits(); if (trieConfig.isOnlyWholeWords()) { removePartialMatches(text, collectedEmits); @@ -92,117 +125,132 @@ public class Trie { return collectedEmits; } - public boolean containsMatch(CharSequence text) { - Emit firstMatch = firstMatch(text); - return firstMatch != null; - } + public boolean containsMatch(final CharSequence text) { + return firstMatch(text) != null; + } - public void parseText(CharSequence text, EmitHandler emitHandler) { - State currentState = this.rootState; + public void parseText(final CharSequence text, final EmitHandler emitHandler) { + State currentState = getRootState(); + for (int position = 0; position < text.length(); position++) { Character character = text.charAt(position); + + // TODO: Maybe lowercase the entire string at once? if (trieConfig.isCaseInsensitive()) { character = Character.toLowerCase(character); } + currentState = getState(currentState, character); if (storeEmits(position, currentState, emitHandler) && trieConfig.isStopOnHit()) { return; } } - } - public Emit firstMatch(CharSequence text) { - if (!trieConfig.isAllowOverlaps()) { - // Slow path. Needs to find all the matches to detect overlaps. - Collection parseText = parseText(text); - if (parseText != null && !parseText.isEmpty()) { - return parseText.iterator().next(); - } - } else { - // Fast path. Returns first match found. - State currentState = this.rootState; + public Emit firstMatch(final CharSequence text) { + if (!trieConfig.isAllowOverlaps()) { + // Slow path. Needs to find all the matches to detect overlaps. + Collection parseText = parseText(text); + if (parseText != null && !parseText.isEmpty()) { + return parseText.iterator().next(); + } + } else { + // Fast path. Returns first match found. + State currentState = getRootState(); + for (int position = 0; position < text.length(); position++) { Character character = text.charAt(position); - if (trieConfig.isCaseInsensitive()) { - character = Character.toLowerCase(character); - } - currentState = getState(currentState, character); - Collection emitStrs = currentState.emit(); - if (emitStrs != null && !emitStrs.isEmpty()) { - for (String emitStr : emitStrs) { - final Emit emit = new Emit(position - emitStr.length() + 1, position, emitStr); - if (trieConfig.isOnlyWholeWords()) { - if (!isPartialMatch(text, emit)) { - return emit; - } - } else { - return emit; - } - } - } - } - } - return null; - } - - private boolean isPartialMatch(CharSequence searchText, Emit emit) { - return (emit.getStart() != 0 && - Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) || - (emit.getEnd() + 1 != searchText.length() && - Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1))); - } - - private void removePartialMatches(CharSequence searchText, List collectedEmits) { - List removeEmits = new ArrayList<>(); - for (Emit emit : collectedEmits) { - if (isPartialMatch(searchText, emit)) { - removeEmits.add(emit); - } - } - for (Emit removeEmit : removeEmits) { - collectedEmits.remove(removeEmit); - } - } - - private void removePartialMatchesWhiteSpaceSeparated(CharSequence searchText, List collectedEmits) { - long size = searchText.length(); - List removeEmits = new ArrayList<>(); - for (Emit emit : collectedEmits) { - if ((emit.getStart() == 0 || Character.isWhitespace(searchText.charAt(emit.getStart() - 1))) && - (emit.getEnd() + 1 == size || Character.isWhitespace(searchText.charAt(emit.getEnd() + 1)))) { - continue; + + // TODO: Lowercase the entire string at once? + if (trieConfig.isCaseInsensitive()) { + character = Character.toLowerCase(character); + } + + currentState = getState(currentState, character); + Collection emitStrs = currentState.emit(); + + if (emitStrs != null && !emitStrs.isEmpty()) { + for (String emitStr : emitStrs) { + final Emit emit = new Emit(position - emitStr.length() + 1, position, emitStr); + if (trieConfig.isOnlyWholeWords()) { + if (!isPartialMatch(text, emit)) { + return emit; + } + } else { + return emit; + } + } + } } - removeEmits.add(emit); } - for (Emit removeEmit : removeEmits) { + + return null; + } + + private boolean isPartialMatch(final CharSequence searchText, final Emit emit) { + return (emit.getStart() != 0 && + Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) || + (emit.getEnd() + 1 != searchText.length() && + Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1))); + } + + private void removePartialMatches(final CharSequence searchText, final List collectedEmits) { + final List removeEmits = new ArrayList<>(); + + for (final Emit emit : collectedEmits) { + if (isPartialMatch(searchText, emit)) { + removeEmits.add(emit); + } + } + + for (final Emit removeEmit : removeEmits) { collectedEmits.remove(removeEmit); } } - private State getState(State currentState, Character character) { + private void removePartialMatchesWhiteSpaceSeparated(final CharSequence searchText, final List collectedEmits) { + final long size = searchText.length(); + final List removeEmits = new ArrayList<>(); + + for (final Emit emit : collectedEmits) { + if ((emit.getStart() == 0 || isWhitespace(searchText.charAt(emit.getStart() - 1))) && + (emit.getEnd() + 1 == size || isWhitespace(searchText.charAt(emit.getEnd() + 1)))) { + continue; + } + removeEmits.add(emit); + } + + for (final Emit removeEmit : removeEmits) { + collectedEmits.remove(removeEmit); + } + } + + private State getState(State currentState, final Character character) { State newCurrentState = currentState.nextState(character); + while (newCurrentState == null) { currentState = currentState.failure(); newCurrentState = currentState.nextState(character); } + return newCurrentState; } private void constructFailureStates() { - Queue queue = new LinkedBlockingDeque<>(); + final Queue queue = new LinkedBlockingDeque<>(); + final State startState = getRootState(); // First, set the fail state of all depth 1 states to the root state - for (State depthOneState : this.rootState.getStates()) { - depthOneState.setFailure(this.rootState); + for (State depthOneState : startState.getStates()) { + depthOneState.setFailure(startState); queue.add(depthOneState); } // Second, determine the fail state for all depth > 1 state while (!queue.isEmpty()) { - State currentState = queue.remove(); + final State currentState = queue.remove(); - for (Character transition : currentState.getTransitions()) { + for (final Character transition : currentState.getTransitions()) { State targetState = currentState.nextState(transition); queue.add(targetState); @@ -210,70 +258,174 @@ public class Trie { while (traceFailureState.nextState(transition) == null) { traceFailureState = traceFailureState.failure(); } - State newFailureState = traceFailureState.nextState(transition); + + final State newFailureState = traceFailureState.nextState(transition); targetState.setFailure(newFailureState); targetState.addEmit(newFailureState.emit()); } } } - private boolean storeEmits(int position, State currentState, EmitHandler emitHandler) { + private boolean storeEmits(final int position, final State currentState, final EmitHandler emitHandler) { boolean emitted = false; - Collection emits = currentState.emit(); + final Collection emits = currentState.emit(); + + // TODO: The check for empty might be superfluous. if (emits != null && !emits.isEmpty()) { - for (String emit : emits) { + for (final String emit : emits) { emitHandler.emit(new Emit(position - emit.length() + 1, position, emit)); emitted = true; } } + return emitted; } + private boolean isCaseInsensitive() { + return trieConfig.isCaseInsensitive(); + } + + private State getRootState() { + return this.rootState; + } + + /** + * Provides a fluent interface for constructing Trie instances. + * + * @return The builder used to configure its Trie. + */ public static TrieBuilder builder() { return new TrieBuilder(); } public static class TrieBuilder { - private TrieConfig trieConfig = new TrieConfig(); + private final TrieConfig trieConfig = new TrieConfig(); - private Trie trie = new Trie(trieConfig); + private final Trie trie = new Trie(trieConfig); + /** + * Default (empty) constructor. + */ private TrieBuilder() {} - public TrieBuilder caseInsensitive() { + /** + * Adds a keyword to the Trie's list of text search keywords. + * + * @param keyword The keyword to add to the list. + * + * @return This builder. + * @throws NullPointerException if the keyword is null. + */ + public TrieBuilder addKeyword(final String keyword) { + this.trie.addKeyword(keyword); + return this; + } + + /** + * Adds a list of keywords to the Trie's list of text search keywords. + * + * @param keywords The keywords to add to the list. + * + * @return This builder. + */ + public TrieBuilder addKeywords(final String... keywords) { + this.trie.addKeywords(keywords); + return this; + } + + /** + * Adds a list of keywords to the Trie's list of text search keywords. + * + * @param keywords The keywords to add to the list. + * + * @return This builder. + */ + public TrieBuilder addKeywords(final Collection keywords) { + this.trie.addKeywords(keywords); + return this; + } + + /** + * Configure the Trie to ignore case when searching for keywords in + * the text. + * + * @return This builder. + */ + public TrieBuilder ignoreCase() { this.trieConfig.setCaseInsensitive(true); return this; } - public TrieBuilder removeOverlaps() { + /** + * Configure the Trie to ignore overlapping keywords. + * + * @return This builder. + */ + public TrieBuilder ignoreOverlaps() { this.trieConfig.setAllowOverlaps(false); return this; } + /** + * Configure the Trie to match whole keywords in the text. + * + * @return This builder. + */ public TrieBuilder onlyWholeWords() { this.trieConfig.setOnlyWholeWords(true); return this; } + /** + * Configure the Trie to match whole keywords that are separated by + * whitespace in the text. For example, "this keyword thatkeyword" + * would only match the first occurrence of "keyword". + * + * @return This builder. + */ public TrieBuilder onlyWholeWordsWhiteSpaceSeparated() { this.trieConfig.setOnlyWholeWordsWhiteSpaceSeparated(true); return this; } - public TrieBuilder addKeyword(String keyword) { - trie.addKeyword(keyword); - return this; - } - + /** + * Configure the Trie to stop after the first keyword is found in the + * text. + * + * @return This builder. + */ public TrieBuilder stopOnHit() { trie.trieConfig.setStopOnHit(true); return this; } + /** + * Configure the Trie based on the builder settings. + * + * @return The configured Trie. + */ public Trie build() { - trie.constructFailureStates(); - return trie; + this.trie.constructFailureStates(); + return this.trie; + } + + /** + * @deprecated Use ignoreCase() + * + * @return This builder. + */ + public TrieBuilder caseInsensitive() { + return ignoreCase(); + } + + /** + * @deprecated Use ignoreOverlaps() + * + * @return This builder. + */ + public TrieBuilder removeOverlaps() { + return ignoreOverlaps(); } } } diff --git a/src/test/java/org/ahocorasick/trie/TrieTest.java b/src/test/java/org/ahocorasick/trie/TrieTest.java index 6a620f0..c4c780b 100644 --- a/src/test/java/org/ahocorasick/trie/TrieTest.java +++ b/src/test/java/org/ahocorasick/trie/TrieTest.java @@ -1,62 +1,78 @@ package org.ahocorasick.trie; -import org.ahocorasick.trie.handler.EmitHandler; -import org.junit.Test; - import java.util.ArrayList; import java.util.Collection; import java.util.Iterator; import java.util.List; - +import java.util.concurrent.ThreadLocalRandom; import static junit.framework.Assert.assertEquals; +import org.ahocorasick.trie.handler.EmitHandler; import static org.junit.Assert.assertTrue; +import org.junit.Test; public class TrieTest { + private final static String[] ALPHABET = new String[]{ + "abc", "bcd", "cde" + }; + + private final static String[] PRONOUNS = new String[]{ + "hers", "his", "she", "he" + }; + + private final static String[] FOOD = new String[]{ + "veal", "cauliflower", "broccoli", "tomatoes" + }; + + private final static String[] GREEK_LETTERS = new String[]{ + "Alpha", "Beta", "Gamma" + }; + + private final static String[] UNICODE = new String[]{ + "turning", "once", "again", "börkü" + }; @Test public void keywordAndTextAreTheSame() { Trie trie = Trie.builder() - .addKeyword("abc") + .addKeyword(ALPHABET[0]) .build(); - Collection emits = trie.parseText("abc"); + Collection emits = trie.parseText(ALPHABET[0]); Iterator iterator = emits.iterator(); - checkEmit(iterator.next(), 0, 2, "abc"); + checkEmit(iterator.next(), 0, 2, ALPHABET[0]); } @Test public void keywordAndTextAreTheSameFirstMatch() { Trie trie = Trie.builder() - .addKeyword("abc") + .addKeyword(ALPHABET[0]) .build(); - Emit firstMatch = trie.firstMatch("abc"); - checkEmit(firstMatch, 0, 2, "abc"); + Emit firstMatch = trie.firstMatch(ALPHABET[0]); + checkEmit(firstMatch, 0, 2, ALPHABET[0]); } @Test public void textIsLongerThanKeyword() { Trie trie = Trie.builder() - .addKeyword("abc") + .addKeyword(ALPHABET[0]) .build(); - Collection emits = trie.parseText(" abc"); + Collection emits = trie.parseText(" " + ALPHABET[0]); Iterator iterator = emits.iterator(); - checkEmit(iterator.next(), 1, 3, "abc"); + checkEmit(iterator.next(), 1, 3, ALPHABET[0]); } @Test public void textIsLongerThanKeywordFirstMatch() { Trie trie = Trie.builder() - .addKeyword("abc") + .addKeyword(ALPHABET[0]) .build(); - Emit firstMatch = trie.firstMatch(" abc"); - checkEmit(firstMatch, 1, 3, "abc"); + Emit firstMatch = trie.firstMatch(" " + ALPHABET[0]); + checkEmit(firstMatch, 1, 3, ALPHABET[0]); } @Test public void variousKeywordsOneMatch() { Trie trie = Trie.builder() - .addKeyword("abc") - .addKeyword("bcd") - .addKeyword("cde") + .addKeywords(ALPHABET) .build(); Collection emits = trie.parseText("bcd"); Iterator iterator = emits.iterator(); @@ -66,9 +82,7 @@ public class TrieTest { @Test public void variousKeywordsFirstMatch() { Trie trie = Trie.builder() - .addKeyword("abc") - .addKeyword("bcd") - .addKeyword("cde") + .addKeywords(ALPHABET) .build(); Emit firstMatch = trie.firstMatch("bcd"); checkEmit(firstMatch, 0, 2, "bcd"); @@ -77,10 +91,7 @@ public class TrieTest { @Test public void ushersTestAndStopOnHit() { Trie trie = Trie.builder() - .addKeyword("hers") - .addKeyword("his") - .addKeyword("she") - .addKeyword("he") + .addKeywords(PRONOUNS) .stopOnHit() .build(); Collection emits = trie.parseText("ushers"); @@ -93,10 +104,7 @@ public class TrieTest { @Test public void ushersTest() { Trie trie = Trie.builder() - .addKeyword("hers") - .addKeyword("his") - .addKeyword("she") - .addKeyword("he") + .addKeywords(PRONOUNS) .build(); Collection emits = trie.parseText("ushers"); assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5 @@ -109,7 +117,7 @@ public class TrieTest { @Test public void ushersTestWithCapitalKeywords() { Trie trie = Trie.builder() - .caseInsensitive() + .ignoreCase() .addKeyword("HERS") .addKeyword("HIS") .addKeyword("SHE") @@ -126,10 +134,7 @@ public class TrieTest { @Test public void ushersTestFirstMatch() { Trie trie = Trie.builder() - .addKeyword("hers") - .addKeyword("his") - .addKeyword("she") - .addKeyword("he") + .addKeywords(PRONOUNS) .build(); Emit firstMatch = trie.firstMatch("ushers"); checkEmit(firstMatch, 2, 3, "he"); @@ -138,10 +143,7 @@ public class TrieTest { @Test public void ushersTestByCallback() { Trie trie = Trie.builder() - .addKeyword("hers") - .addKeyword("his") - .addKeyword("she") - .addKeyword("he") + .addKeywords(PRONOUNS) .build(); final List emits = new ArrayList<>(); @@ -182,10 +184,7 @@ public class TrieTest { @Test public void recipes() { Trie trie = Trie.builder() - .addKeyword("veal") - .addKeyword("cauliflower") - .addKeyword("broccoli") - .addKeyword("tomatoes") + .addKeywords(FOOD) .build(); Collection emits = trie.parseText("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli"); Iterator iterator = emits.iterator(); @@ -198,10 +197,7 @@ public class TrieTest { @Test public void recipesFirstMatch() { Trie trie = Trie.builder() - .addKeyword("veal") - .addKeyword("cauliflower") - .addKeyword("broccoli") - .addKeyword("tomatoes") + .addKeywords(FOOD) .build(); Emit firstMatch = trie.firstMatch("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli"); @@ -305,9 +301,7 @@ public class TrieTest { @Test public void tokenizeFullSentence() { Trie trie = Trie.builder() - .addKeyword("Alpha") - .addKeyword("Beta") - .addKeyword("Gamma") + .addKeywords(GREEK_LETTERS) .build(); Collection tokens = trie.tokenize("Hear: Alpha team first, Beta from the rear, Gamma in reserve"); assertEquals(7, tokens.size()); @@ -321,13 +315,11 @@ public class TrieTest { assertEquals(" in reserve", tokensIt.next().getFragment()); } + // @see https://github.com/robert-bor/aho-corasick/issues/5 @Test - public void bug5InGithubReportedByXCurry() { - Trie trie = Trie.builder().caseInsensitive().onlyWholeWords() - .addKeyword("turning") - .addKeyword("once") - .addKeyword("again") - .addKeyword("börkü") + public void testStringIndexOutOfBoundsException() { + Trie trie = Trie.builder().ignoreCase().onlyWholeWords() + .addKeywords(UNICODE) .build(); Collection emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ"); assertEquals(4, emits.size()); // Match must not be made @@ -339,12 +331,9 @@ public class TrieTest { } @Test - public void caseInsensitive() { - Trie trie = Trie.builder().caseInsensitive() - .addKeyword("turning") - .addKeyword("once") - .addKeyword("again") - .addKeyword("börkü") + public void testIgnoreCase() { + Trie trie = Trie.builder().ignoreCase() + .addKeywords(UNICODE) .build(); Collection emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ"); assertEquals(4, emits.size()); // Match must not be made @@ -356,12 +345,9 @@ public class TrieTest { } @Test - public void caseInsensitiveFirstMatch() { - Trie trie = Trie.builder().caseInsensitive() - .addKeyword("turning") - .addKeyword("once") - .addKeyword("again") - .addKeyword("börkü") + public void testIgnoreCaseFirstMatch() { + Trie trie = Trie.builder().ignoreCase() + .addKeywords(UNICODE) .build(); Emit firstMatch = trie.firstMatch("TurninG OnCe AgAiN BÖRKÜ"); @@ -371,29 +357,27 @@ public class TrieTest { @Test public void tokenizeTokensInSequence() { Trie trie = Trie.builder() - .addKeyword("Alpha") - .addKeyword("Beta") - .addKeyword("Gamma") + .addKeywords(GREEK_LETTERS) .build(); Collection tokens = trie.tokenize("Alpha Beta Gamma"); assertEquals(5, tokens.size()); } - // Test offered by XCurry, https://github.com/robert-bor/aho-corasick/issues/7 + // @see https://github.com/robert-bor/aho-corasick/issues/7 @Test - public void zeroLengthTestBug7InGithubReportedByXCurry() { - Trie trie = Trie.builder().removeOverlaps().onlyWholeWords().caseInsensitive() + public void testZeroLength() { + Trie trie = Trie.builder().ignoreOverlaps().onlyWholeWords().ignoreCase() .addKeyword("") .build(); trie.tokenize("Try a natural lip and subtle bronzer to keep all the focus on those big bright eyes with NARS Eyeshadow Duo in Rated R And the winner is... Boots No7 Advanced Renewal Anti-ageing Glycolic Peel Kit ($25 amazon.com) won most-appealing peel."); } - // Test offered by dwyerk, https://github.com/robert-bor/aho-corasick/issues/8 + // @see https://github.com/robert-bor/aho-corasick/issues/8 @Test - public void unicodeIssueBug8ReportedByDwyerk() { + public void testUnicode1() { String target = "LİKE THIS"; // The second character ('İ') is Unicode, which was read by AC as a 2-byte char assertEquals("THIS", target.substring(5, 9)); // Java does it the right way - Trie trie = Trie.builder().caseInsensitive().onlyWholeWords() + Trie trie = Trie.builder().ignoreCase().onlyWholeWords() .addKeyword("this") .build(); Collection emits = trie.parseText(target); @@ -402,11 +386,12 @@ public class TrieTest { checkEmit(it.next(), 5, 8, "this"); } + // @see https://github.com/robert-bor/aho-corasick/issues/8 @Test - public void unicodeIssueBug8ReportedByDwyerkFirstMatch() { + public void testUnicode2() { String target = "LİKE THIS"; // The second character ('İ') is Unicode, which was read by AC as a 2-byte char Trie trie = Trie.builder() - .caseInsensitive() + .ignoreCase() .onlyWholeWords() .addKeyword("this") .build(); @@ -416,7 +401,7 @@ public class TrieTest { } @Test - public void partialMatchWhiteSpaces() { + public void testPartialMatchWhiteSpaces() { Trie trie = Trie.builder() .onlyWholeWordsWhiteSpaceSeparated() .addKeyword("#sugar-123") @@ -426,10 +411,66 @@ public class TrieTest { checkEmit(emits.iterator().next(), 0, 9, "#sugar-123"); } + @Test + public void testLargeString() { + final int interval = 100; + final int textSize = 1000000; + final String keyword = FOOD[ 1 ]; + final StringBuilder text = randomNumbers( textSize ); + + injectKeyword( text, keyword, interval ); + + Trie trie = Trie.builder() + .onlyWholeWords() + .addKeyword( keyword ) + .build(); + + final Collection emits = trie.parseText( text ); + + assertEquals( textSize / interval, emits.size() ); + } + + /** + * Generates a random sequence of ASCII numbers. + * + * @param count The number of numbers to generate. + * @return A character sequence filled with random digits. + */ + private StringBuilder randomNumbers( int count ) { + final StringBuilder sb = new StringBuilder( count ); + + while( --count > 0 ) { + sb.append( randomInt( 0, 10 ) ); + } + + return sb; + } + + /** + * Injects keywords into a string builder. + * + * @param source Should contain a bunch of random data that cannot match + * any keyword. + * @param keyword A keyword to inject repeatedly in the text. + * @param interval How often to inject the keyword. + */ + private void injectKeyword( + final StringBuilder source, + final String keyword, + final int interval ) { + final int length = source.length(); + for( int i = 0; i < length; i += interval ) { + source.replace( i, i + keyword.length(), keyword ); + } + } + + private int randomInt( final int min, final int max ) { + return ThreadLocalRandom.current().nextInt( min, max ); + } + private void checkEmit(Emit next, int expectedStart, int expectedEnd, String expectedKeyword) { assertEquals("Start of emit should have been " + expectedStart, expectedStart, next.getStart()); assertEquals("End of emit should have been " + expectedEnd, expectedEnd, next.getEnd()); assertEquals(expectedKeyword, next.getKeyword()); } - }