Added final modifier. Added helper methods for adding keywords using arrays and collections. Added test for large character strings. Simplified code for adding keywords. Renamed a few methods for consistency. Some code formatting. Updated unit tests with constant arrays, as a first step to reducing the duplication in the unit tests; migrated away from deprecated methods.

2016-11-28 21:20:57 -08:00 · 2016-11-28 21:20:57 -08:00 · f6a7103f5f
commit f6a7103f5f
parent 8c422583b5
5 changed files with 412 additions and 206 deletions
--- a/.gitignore
+++ b/.gitignore
@ -2,4 +2,5 @@
 *.iml
 src/main/java/Main.java
 *.txt
-docs
+docs
+/target/
--- a/pom.xml
+++ b/pom.xml
@ -77,6 +77,7 @@
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
+                <version>3.6.0</version>
                <configuration>
                    <source>1.7</source>
                    <target>1.7</target>
--- a/src/main/java/org/ahocorasick/trie/State.java
+++ b/src/main/java/org/ahocorasick/trie/State.java
@ -35,38 +35,50 @@ public class State {
     * referred to in the white paper as the 'goto' structure. From a state it is possible to go
     * to other states, depending on the character passed.
     */
-    private Map<Character,State> success = new HashMap<Character, State>();
+    private final Map<Character,State> success = new HashMap<>();

    /** if no matching states are found, the failure state will be returned */
-    private State failure = null;
+    private State failure;

    /** whenever this state is reached, it will emit the matches keywords for future reference */
-    private Set<String> emits = null;
+    private Set<String> emits;

    public State() {
        this(0);
    }

-    public State(int depth) {
+    public State(final int depth) {
        this.depth = depth;
        this.rootState = depth == 0 ? this : null;
    }

-    private State nextState(Character character, boolean ignoreRootState) {
+    private State nextState(final Character character, final boolean ignoreRootState) {
        State nextState = this.success.get(character);
+        
        if (!ignoreRootState && nextState == null && this.rootState != null) {
            nextState = this.rootState;
        }
+        
        return nextState;
    }

-    public State nextState(Character character) {
+    public State nextState(final Character character) {
        return nextState(character, false);
    }

    public State nextStateIgnoreRootState(Character character) {
        return nextState(character, true);
    }
+    
+    public State addState( String keyword ) {
+      State state = this;
+          
+      for (final Character character : keyword.toCharArray()) {
+          state = state.addState(character);
+      }
+      
+      return state;
+    }

    public State addState(Character character) {
        State nextState = nextStateIgnoreRootState(character);
@ -113,5 +125,4 @@ public class State {
    public Collection<Character> getTransitions() {
        return this.success.keySet();
    }
-
 }
--- a/src/main/java/org/ahocorasick/trie/Trie.java
+++ b/src/main/java/org/ahocorasick/trie/Trie.java
@ -1,59 +1,92 @@
 package org.ahocorasick.trie;

-import org.ahocorasick.interval.IntervalTree;
-import org.ahocorasick.interval.Intervalable;
-import org.ahocorasick.trie.handler.DefaultEmitHandler;
-import org.ahocorasick.trie.handler.EmitHandler;
-
+import static java.lang.Character.isWhitespace;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
 import java.util.Queue;
 import java.util.concurrent.LinkedBlockingDeque;
+import org.ahocorasick.interval.IntervalTree;
+import org.ahocorasick.interval.Intervalable;
+import org.ahocorasick.trie.handler.DefaultEmitHandler;
+import org.ahocorasick.trie.handler.EmitHandler;

 /**
- *
- * Based on the Aho-Corasick white paper, Bell technologies: http://cr.yp.to/bib/1975/aho.pdf
+ * Based on the Aho-Corasick white paper, Bell technologies:
+ * http://cr.yp.to/bib/1975/aho.pdf
+ * 
 * @author Robert Bor
 */
 public class Trie {

-    private TrieConfig trieConfig;
+    private final TrieConfig trieConfig;

-    private State rootState;
+    private final State rootState;

-    private Trie(TrieConfig trieConfig) {
+    private Trie(final TrieConfig trieConfig) {
        this.trieConfig = trieConfig;
        this.rootState = new State();
    }
-
+    
+    /**
+     * Used by the builder to add a text search keyword.
+     * 
+     * @param keyword The search term to add to the list of search terms.
+     * 
+     * @throws NullPointerException if the keyword is null.
+     */
    private void addKeyword(String keyword) {
-        if (keyword == null || keyword.length() == 0) {
-            return;
+        if( keyword.isEmpty() ) {
+          return;
        }
-        State currentState = this.rootState;
-        for (Character character : keyword.toCharArray()) {
-            if (trieConfig.isCaseInsensitive()) {
-                character = Character.toLowerCase(character);
-            }
-            currentState = currentState.addState(character);
+      
+        if( isCaseInsensitive() ) {
+          keyword = keyword.toLowerCase();
        }
-        currentState.addEmit(trieConfig.isCaseInsensitive() ? keyword.toLowerCase() : keyword);
+
+        addState(keyword).addEmit(keyword);
    }

-    public Collection<Token> tokenize(String text) {
+    /**
+     * Delegates to addKeyword.
+     * 
+     * @param keywords List of search terms to add to the list of search terms.
+     */
+    private void addKeywords( final String[] keywords ) {
+      for( final String keyword : keywords ) {
+        addKeyword( keyword );
+      }
+    }
+    
+    /**
+     * Delegates to addKeyword.
+     * 
+     * @param keywords List of search terms to add to the list of search terms.
+     */
+    private void addKeywords( final Collection<String> keywords ) {
+      for( final String keyword : keywords ) {
+        addKeyword( keyword );
+      }
+    }

-        Collection<Token> tokens = new ArrayList<>();
-
-        Collection<Emit> collectedEmits = parseText(text);
+    private State addState(final String keyword) {
+        return getRootState().addState(keyword);
+    }
+    
+    public Collection<Token> tokenize(final String text) {
+        final Collection<Token> tokens = new ArrayList<>();
+        final Collection<Emit> collectedEmits = parseText(text);
        int lastCollectedPosition = -1;
-        for (Emit emit : collectedEmits) {
+        
+        for (final Emit emit : collectedEmits) {
            if (emit.getStart() - lastCollectedPosition > 1) {
                tokens.add(createFragment(emit, text, lastCollectedPosition));
            }
+            
            tokens.add(createMatch(emit, text));
            lastCollectedPosition = emit.getEnd();
        }
+        
        if (text.length() - lastCollectedPosition > 1) {
            tokens.add(createFragment(null, text, lastCollectedPosition));
        }
@ -61,7 +94,7 @@ public class Trie {
        return tokens;
    }

-    private Token createFragment(Emit emit, String text, int lastCollectedPosition) {
+    private Token createFragment(final Emit emit, final String text, final int lastCollectedPosition) {
        return new FragmentToken(text.substring(lastCollectedPosition+1, emit == null ? text.length() : emit.getStart()));
    }

@ -70,11 +103,11 @@ public class Trie {
    }

    @SuppressWarnings("unchecked")
-    public Collection<Emit> parseText(CharSequence text) {
-        DefaultEmitHandler emitHandler = new DefaultEmitHandler();
+    public Collection<Emit> parseText(final CharSequence text) {
+        final DefaultEmitHandler emitHandler = new DefaultEmitHandler();
        parseText(text, emitHandler);

-        List<Emit> collectedEmits = emitHandler.getEmits();
+        final List<Emit> collectedEmits = emitHandler.getEmits();

        if (trieConfig.isOnlyWholeWords()) {
            removePartialMatches(text, collectedEmits);
@ -92,117 +125,132 @@ public class Trie {
        return collectedEmits;
    }

-	public boolean containsMatch(CharSequence text) {
-		Emit firstMatch = firstMatch(text);
-		return firstMatch != null;
-	}
+    public boolean containsMatch(final CharSequence text) {
+        return firstMatch(text) != null;
+    }

-    public void parseText(CharSequence text, EmitHandler emitHandler) {
-        State currentState = this.rootState;
+    public void parseText(final CharSequence text, final EmitHandler emitHandler) {
+        State currentState = getRootState();
+        
        for (int position = 0; position < text.length(); position++) {
            Character character = text.charAt(position);
+            
+            // TODO: Maybe lowercase the entire string at once?
            if (trieConfig.isCaseInsensitive()) {
                character = Character.toLowerCase(character);
            }
+            
            currentState = getState(currentState, character);
            if (storeEmits(position, currentState, emitHandler) && trieConfig.isStopOnHit()) {
                return;
            }
        }
-
    }

-	public Emit firstMatch(CharSequence text) {
-		if (!trieConfig.isAllowOverlaps()) {
-			// Slow path. Needs to find all the matches to detect overlaps.
-			Collection<Emit> parseText = parseText(text);
-			if (parseText != null && !parseText.isEmpty()) {
-				return parseText.iterator().next();
-			}
-		} else {
-			// Fast path. Returns first match found.
-			State currentState = this.rootState;
+    public Emit firstMatch(final CharSequence text) {
+        if (!trieConfig.isAllowOverlaps()) {
+            // Slow path. Needs to find all the matches to detect overlaps.
+            Collection<Emit> parseText = parseText(text);
+            if (parseText != null && !parseText.isEmpty()) {
+                return parseText.iterator().next();
+            }
+        } else {
+            // Fast path. Returns first match found.
+            State currentState = getRootState();
+            
            for (int position = 0; position < text.length(); position++) {
                Character character = text.charAt(position);
-				if (trieConfig.isCaseInsensitive()) {
-					character = Character.toLowerCase(character);
-				}
-				currentState = getState(currentState, character);
-				Collection<String> emitStrs = currentState.emit();
-				if (emitStrs != null && !emitStrs.isEmpty()) {
-					for (String emitStr : emitStrs) {
-						final Emit emit = new Emit(position - emitStr.length() + 1, position, emitStr);
-						if (trieConfig.isOnlyWholeWords()) {
-							if (!isPartialMatch(text, emit)) {
-								return emit;
-							}
-						} else {
-							return emit;
-						}
-					}
-				}
-			}
-		}
-		return null;
-	}
-
-	private boolean isPartialMatch(CharSequence searchText, Emit emit) {
-		return (emit.getStart() != 0 &&
-			Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) ||
-			(emit.getEnd() + 1 != searchText.length() &&
-			Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1)));
-	}
-
-	private void removePartialMatches(CharSequence searchText, List<Emit> collectedEmits) {
-		List<Emit> removeEmits = new ArrayList<>();
-		for (Emit emit : collectedEmits) {
-			if (isPartialMatch(searchText, emit)) {
-				removeEmits.add(emit);
-			}
-		}
-		for (Emit removeEmit : removeEmits) {
-			collectedEmits.remove(removeEmit);
-		}
-	}
-
-    private void removePartialMatchesWhiteSpaceSeparated(CharSequence searchText, List<Emit> collectedEmits) {
-        long size = searchText.length();
-        List<Emit> removeEmits = new ArrayList<>();
-        for (Emit emit : collectedEmits) {
-            if ((emit.getStart() == 0 || Character.isWhitespace(searchText.charAt(emit.getStart() - 1))) &&
-                (emit.getEnd() + 1 == size || Character.isWhitespace(searchText.charAt(emit.getEnd() + 1)))) {
-                continue;
+                
+                // TODO: Lowercase the entire string at once?
+                if (trieConfig.isCaseInsensitive()) {
+                    character = Character.toLowerCase(character);
+                }
+                
+                currentState = getState(currentState, character);
+                Collection<String> emitStrs = currentState.emit();
+                
+                if (emitStrs != null && !emitStrs.isEmpty()) {
+                    for (String emitStr : emitStrs) {
+                        final Emit emit = new Emit(position - emitStr.length() + 1, position, emitStr);
+                        if (trieConfig.isOnlyWholeWords()) {
+                            if (!isPartialMatch(text, emit)) {
+                                return emit;
+                            }
+                        } else {
+                            return emit;
+                        }
+                    }
+                }
            }
-            removeEmits.add(emit);
        }
-        for (Emit removeEmit : removeEmits) {
+    
+        return null;
+    }
+
+    private boolean isPartialMatch(final CharSequence searchText, final Emit emit) {
+        return (emit.getStart() != 0 &&
+            Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) ||
+            (emit.getEnd() + 1 != searchText.length() &&
+            Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1)));
+    }
+
+    private void removePartialMatches(final CharSequence searchText, final List<Emit> collectedEmits) {
+        final List<Emit> removeEmits = new ArrayList<>();
+        
+        for (final Emit emit : collectedEmits) {
+            if (isPartialMatch(searchText, emit)) {
+                removeEmits.add(emit);
+            }
+        }
+        
+        for (final Emit removeEmit : removeEmits) {
            collectedEmits.remove(removeEmit);
        }
    }

-    private State getState(State currentState, Character character) {
+    private void removePartialMatchesWhiteSpaceSeparated(final CharSequence searchText, final List<Emit> collectedEmits) {
+        final long size = searchText.length();
+        final List<Emit> removeEmits = new ArrayList<>();
+        
+        for (final Emit emit : collectedEmits) {
+            if ((emit.getStart() == 0 || isWhitespace(searchText.charAt(emit.getStart() - 1))) &&
+                (emit.getEnd() + 1 == size || isWhitespace(searchText.charAt(emit.getEnd() + 1)))) {
+                continue;
+            }
+            removeEmits.add(emit);
+        }
+        
+        for (final Emit removeEmit : removeEmits) {
+            collectedEmits.remove(removeEmit);
+        }
+    }
+
+    private State getState(State currentState, final Character character) {
        State newCurrentState = currentState.nextState(character);
+        
        while (newCurrentState == null) {
            currentState = currentState.failure();
            newCurrentState = currentState.nextState(character);
        }
+        
        return newCurrentState;
    }

    private void constructFailureStates() {
-        Queue<State> queue = new LinkedBlockingDeque<>();
+        final Queue<State> queue = new LinkedBlockingDeque<>();
+        final State startState = getRootState();

        // First, set the fail state of all depth 1 states to the root state
-        for (State depthOneState : this.rootState.getStates()) {
-            depthOneState.setFailure(this.rootState);
+        for (State depthOneState : startState.getStates()) {
+            depthOneState.setFailure(startState);
            queue.add(depthOneState);
        }

        // Second, determine the fail state for all depth > 1 state
        while (!queue.isEmpty()) {
-            State currentState = queue.remove();
+            final State currentState = queue.remove();

-            for (Character transition : currentState.getTransitions()) {
+            for (final Character transition : currentState.getTransitions()) {
                State targetState = currentState.nextState(transition);
                queue.add(targetState);

@ -210,70 +258,174 @@ public class Trie {
                while (traceFailureState.nextState(transition) == null) {
                    traceFailureState = traceFailureState.failure();
                }
-                State newFailureState = traceFailureState.nextState(transition);
+
+                final State newFailureState = traceFailureState.nextState(transition);
                targetState.setFailure(newFailureState);
                targetState.addEmit(newFailureState.emit());
            }
        }
    }

-    private boolean storeEmits(int position, State currentState, EmitHandler emitHandler) {
+    private boolean storeEmits(final int position, final State currentState, final EmitHandler emitHandler) {
        boolean emitted = false;
-        Collection<String> emits = currentState.emit();
+        final Collection<String> emits = currentState.emit();
+        
+        // TODO: The check for empty might be superfluous.
        if (emits != null && !emits.isEmpty()) {
-            for (String emit : emits) {
+            for (final String emit : emits) {
                emitHandler.emit(new Emit(position - emit.length() + 1, position, emit));
                emitted = true;
            }
        }
+        
        return emitted;
    }

+    private boolean isCaseInsensitive() {
+      return trieConfig.isCaseInsensitive();
+    }
+    
+    private State getRootState() {
+      return this.rootState;
+    }
+
+    /**
+     * Provides a fluent interface for constructing Trie instances.
+     * 
+     * @return The builder used to configure its Trie.
+     */
    public static TrieBuilder builder() {
        return new TrieBuilder();
    }

    public static class TrieBuilder {

-        private TrieConfig trieConfig = new TrieConfig();
+        private final TrieConfig trieConfig = new TrieConfig();

-        private Trie trie = new Trie(trieConfig);
+        private final Trie trie = new Trie(trieConfig);

+        /**
+         * Default (empty) constructor.
+         */
        private TrieBuilder() {}

-        public TrieBuilder caseInsensitive() {
+        /**
+         * Adds a keyword to the Trie's list of text search keywords.
+         * 
+         * @param keyword The keyword to add to the list.
+         * 
+         * @return This builder.
+         * @throws NullPointerException if the keyword is null.
+         */
+        public TrieBuilder addKeyword(final String keyword) {
+            this.trie.addKeyword(keyword);
+            return this;
+        }
+        
+        /**
+         * Adds a list of keywords to the Trie's list of text search keywords.
+         * 
+         * @param keywords The keywords to add to the list.
+         * 
+         * @return This builder.
+         */
+        public TrieBuilder addKeywords(final String... keywords) {
+          this.trie.addKeywords(keywords);
+          return this;
+        }
+
+        /**
+         * Adds a list of keywords to the Trie's list of text search keywords.
+         * 
+         * @param keywords The keywords to add to the list.
+         * 
+         * @return This builder.
+         */
+        public TrieBuilder addKeywords(final Collection<String> keywords) {
+          this.trie.addKeywords(keywords);
+          return this;
+        }
+
+        /**
+         * Configure the Trie to ignore case when searching for keywords in
+         * the text.
+         * 
+         * @return This builder.
+         */
+        public TrieBuilder ignoreCase() {
            this.trieConfig.setCaseInsensitive(true);
            return this;
        }

-        public TrieBuilder removeOverlaps() {
+        /**
+         * Configure the Trie to ignore overlapping keywords.
+         * 
+         * @return This builder.
+         */
+        public TrieBuilder ignoreOverlaps() {
            this.trieConfig.setAllowOverlaps(false);
            return this;
        }

+        /**
+         * Configure the Trie to match whole keywords in the text.
+         * 
+         * @return This builder.
+         */
        public TrieBuilder onlyWholeWords() {
            this.trieConfig.setOnlyWholeWords(true);
            return this;
        }

+        /**
+         * Configure the Trie to match whole keywords that are separated by
+         * whitespace in the text. For example, "this keyword thatkeyword"
+         * would only match the first occurrence of "keyword".
+         * 
+         * @return This builder.
+         */
        public TrieBuilder onlyWholeWordsWhiteSpaceSeparated() {
            this.trieConfig.setOnlyWholeWordsWhiteSpaceSeparated(true);
            return this;
        }

-        public TrieBuilder addKeyword(String keyword) {
-            trie.addKeyword(keyword);
-            return this;
-        }
-
+        /**
+         * Configure the Trie to stop after the first keyword is found in the
+         * text.
+         * 
+         * @return This builder.
+         */
        public TrieBuilder stopOnHit() {
            trie.trieConfig.setStopOnHit(true);
            return this;
        }

+        /**
+         * Configure the Trie based on the builder settings.
+         * 
+         * @return The configured Trie.
+         */
        public Trie build() {
-            trie.constructFailureStates();
-            return trie;
+            this.trie.constructFailureStates();
+            return this.trie;
+        }
+        
+        /**
+         * @deprecated Use ignoreCase()
+         * 
+         * @return This builder.
+         */
+        public TrieBuilder caseInsensitive() {
+            return ignoreCase();
+        }
+
+        /**
+         * @deprecated Use ignoreOverlaps()
+         * 
+         * @return This builder.
+         */
+        public TrieBuilder removeOverlaps() {
+            return ignoreOverlaps();
        }
    }
 }
--- a/src/test/java/org/ahocorasick/trie/TrieTest.java
+++ b/src/test/java/org/ahocorasick/trie/TrieTest.java
@ -1,62 +1,78 @@
 package org.ahocorasick.trie;

-import org.ahocorasick.trie.handler.EmitHandler;
-import org.junit.Test;
-
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Iterator;
 import java.util.List;
-
+import java.util.concurrent.ThreadLocalRandom;
 import static junit.framework.Assert.assertEquals;
+import org.ahocorasick.trie.handler.EmitHandler;
 import static org.junit.Assert.assertTrue;
+import org.junit.Test;

 public class TrieTest {
+    private final static String[] ALPHABET = new String[]{
+      "abc", "bcd", "cde"
+    };
+    
+    private final static String[] PRONOUNS = new String[]{
+      "hers", "his", "she", "he"
+    };
+
+    private final static String[] FOOD = new String[]{
+      "veal", "cauliflower", "broccoli", "tomatoes"
+    };
+
+    private final static String[] GREEK_LETTERS = new String[]{
+      "Alpha", "Beta", "Gamma"
+    };
+    
+    private final static String[] UNICODE = new String[]{
+      "turning", "once", "again", "börkü"
+    };

    @Test
    public void keywordAndTextAreTheSame() {
        Trie trie = Trie.builder()
-                .addKeyword("abc")
+                .addKeyword(ALPHABET[0])
                .build();
-        Collection<Emit> emits = trie.parseText("abc");
+        Collection<Emit> emits = trie.parseText(ALPHABET[0]);
        Iterator<Emit> iterator = emits.iterator();
-        checkEmit(iterator.next(), 0, 2, "abc");
+        checkEmit(iterator.next(), 0, 2, ALPHABET[0]);
    }

    @Test
    public void keywordAndTextAreTheSameFirstMatch() {
        Trie trie = Trie.builder()
-                .addKeyword("abc")
+                .addKeyword(ALPHABET[0])
                .build();
-        Emit firstMatch = trie.firstMatch("abc");
-        checkEmit(firstMatch, 0, 2, "abc");
+        Emit firstMatch = trie.firstMatch(ALPHABET[0]);
+        checkEmit(firstMatch, 0, 2, ALPHABET[0]);
    }

    @Test
    public void textIsLongerThanKeyword() {
        Trie trie = Trie.builder()
-                .addKeyword("abc")
+                .addKeyword(ALPHABET[0])
                .build();
-        Collection<Emit> emits = trie.parseText(" abc");
+        Collection<Emit> emits = trie.parseText(" " + ALPHABET[0]);
        Iterator<Emit> iterator = emits.iterator();
-        checkEmit(iterator.next(), 1, 3, "abc");
+        checkEmit(iterator.next(), 1, 3, ALPHABET[0]);
    }

    @Test
    public void textIsLongerThanKeywordFirstMatch() {
        Trie trie = Trie.builder()
-                .addKeyword("abc")
+                .addKeyword(ALPHABET[0])
                .build();
-        Emit firstMatch = trie.firstMatch(" abc");
-        checkEmit(firstMatch, 1, 3, "abc");
+        Emit firstMatch = trie.firstMatch(" " + ALPHABET[0]);
+        checkEmit(firstMatch, 1, 3, ALPHABET[0]);
    }

    @Test
    public void variousKeywordsOneMatch() {
        Trie trie = Trie.builder()
-                .addKeyword("abc")
-                .addKeyword("bcd")
-                .addKeyword("cde")
+                .addKeywords(ALPHABET)
                .build();
        Collection<Emit> emits = trie.parseText("bcd");
        Iterator<Emit> iterator = emits.iterator();
@ -66,9 +82,7 @@ public class TrieTest {
    @Test
    public void variousKeywordsFirstMatch() {
        Trie trie = Trie.builder()
-                .addKeyword("abc")
-                .addKeyword("bcd")
-                .addKeyword("cde")
+                .addKeywords(ALPHABET)
                .build();
        Emit firstMatch = trie.firstMatch("bcd");
        checkEmit(firstMatch, 0, 2, "bcd");
@ -77,10 +91,7 @@ public class TrieTest {
    @Test
    public void ushersTestAndStopOnHit() {
        Trie trie = Trie.builder()
-                .addKeyword("hers")
-                .addKeyword("his")
-                .addKeyword("she")
-                .addKeyword("he")
+                .addKeywords(PRONOUNS)
                .stopOnHit()
                .build();
        Collection<Emit> emits = trie.parseText("ushers");
@ -93,10 +104,7 @@ public class TrieTest {
    @Test
    public void ushersTest() {
        Trie trie = Trie.builder()
-                .addKeyword("hers")
-                .addKeyword("his")
-                .addKeyword("she")
-                .addKeyword("he")
+                .addKeywords(PRONOUNS)
                .build();
        Collection<Emit> emits = trie.parseText("ushers");
        assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5
@ -109,7 +117,7 @@ public class TrieTest {
    @Test
    public void ushersTestWithCapitalKeywords() {
        Trie trie = Trie.builder()
-                .caseInsensitive()
+                .ignoreCase()
                .addKeyword("HERS")
                .addKeyword("HIS")
                .addKeyword("SHE")
@ -126,10 +134,7 @@ public class TrieTest {
    @Test
    public void ushersTestFirstMatch() {
        Trie trie = Trie.builder()
-                .addKeyword("hers")
-                .addKeyword("his")
-                .addKeyword("she")
-                .addKeyword("he")
+                .addKeywords(PRONOUNS)
                .build();
        Emit firstMatch = trie.firstMatch("ushers");
        checkEmit(firstMatch, 2, 3, "he");
@ -138,10 +143,7 @@ public class TrieTest {
    @Test
    public void ushersTestByCallback() {
        Trie trie = Trie.builder()
-                .addKeyword("hers")
-                .addKeyword("his")
-                .addKeyword("she")
-                .addKeyword("he")
+                .addKeywords(PRONOUNS)
                .build();

        final List<Emit> emits = new ArrayList<>();
@ -182,10 +184,7 @@ public class TrieTest {
    @Test
    public void recipes() {
        Trie trie = Trie.builder()
-                .addKeyword("veal")
-                .addKeyword("cauliflower")
-                .addKeyword("broccoli")
-                .addKeyword("tomatoes")
+                .addKeywords(FOOD)
                .build();
        Collection<Emit> emits = trie.parseText("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli");
        Iterator<Emit> iterator = emits.iterator();
@ -198,10 +197,7 @@ public class TrieTest {
    @Test
    public void recipesFirstMatch() {
        Trie trie = Trie.builder()
-                .addKeyword("veal")
-                .addKeyword("cauliflower")
-                .addKeyword("broccoli")
-                .addKeyword("tomatoes")
+                .addKeywords(FOOD)
                .build();
        Emit firstMatch = trie.firstMatch("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli");

@ -305,9 +301,7 @@ public class TrieTest {
    @Test
    public void tokenizeFullSentence() {
        Trie trie = Trie.builder()
-                .addKeyword("Alpha")
-                .addKeyword("Beta")
-                .addKeyword("Gamma")
+                .addKeywords(GREEK_LETTERS)
                .build();
        Collection<Token> tokens = trie.tokenize("Hear: Alpha team first, Beta from the rear, Gamma in reserve");
        assertEquals(7, tokens.size());
@ -321,13 +315,11 @@ public class TrieTest {
        assertEquals(" in reserve", tokensIt.next().getFragment());
    }

+    // @see https://github.com/robert-bor/aho-corasick/issues/5
    @Test
-    public void bug5InGithubReportedByXCurry() {
-        Trie trie = Trie.builder().caseInsensitive().onlyWholeWords()
-                .addKeyword("turning")
-                .addKeyword("once")
-                .addKeyword("again")
-                .addKeyword("börkü")
+    public void testStringIndexOutOfBoundsException() {
+        Trie trie = Trie.builder().ignoreCase().onlyWholeWords()
+                .addKeywords(UNICODE)
                .build();
        Collection<Emit> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ");
        assertEquals(4, emits.size()); // Match must not be made
@ -339,12 +331,9 @@ public class TrieTest {
    }

    @Test
-    public void caseInsensitive() {
-        Trie trie = Trie.builder().caseInsensitive()
-                .addKeyword("turning")
-                .addKeyword("once")
-                .addKeyword("again")
-                .addKeyword("börkü")
+    public void testIgnoreCase() {
+        Trie trie = Trie.builder().ignoreCase()
+                .addKeywords(UNICODE)
                .build();
        Collection<Emit> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ");
        assertEquals(4, emits.size()); // Match must not be made
@ -356,12 +345,9 @@ public class TrieTest {
    }

    @Test
-    public void caseInsensitiveFirstMatch() {
-        Trie trie = Trie.builder().caseInsensitive()
-                .addKeyword("turning")
-                .addKeyword("once")
-                .addKeyword("again")
-                .addKeyword("börkü")
+    public void testIgnoreCaseFirstMatch() {
+        Trie trie = Trie.builder().ignoreCase()
+                .addKeywords(UNICODE)
                .build();
        Emit firstMatch = trie.firstMatch("TurninG OnCe AgAiN BÖRKÜ");

@ -371,29 +357,27 @@ public class TrieTest {
    @Test
    public void tokenizeTokensInSequence() {
        Trie trie = Trie.builder()
-                .addKeyword("Alpha")
-                .addKeyword("Beta")
-                .addKeyword("Gamma")
+                .addKeywords(GREEK_LETTERS)
                .build();
        Collection<Token> tokens = trie.tokenize("Alpha Beta Gamma");
        assertEquals(5, tokens.size());
    }

-    // Test offered by XCurry, https://github.com/robert-bor/aho-corasick/issues/7
+    // @see https://github.com/robert-bor/aho-corasick/issues/7
    @Test
-    public void zeroLengthTestBug7InGithubReportedByXCurry() {
-        Trie trie = Trie.builder().removeOverlaps().onlyWholeWords().caseInsensitive()
+    public void testZeroLength() {
+        Trie trie = Trie.builder().ignoreOverlaps().onlyWholeWords().ignoreCase()
                .addKeyword("")
                .build();
        trie.tokenize("Try a natural lip and subtle bronzer to keep all the focus on those big bright eyes with NARS Eyeshadow Duo in Rated R And the winner is... Boots No7 Advanced Renewal Anti-ageing Glycolic Peel Kit ($25 amazon.com) won most-appealing peel.");
    }

-    // Test offered by dwyerk, https://github.com/robert-bor/aho-corasick/issues/8
+    // @see https://github.com/robert-bor/aho-corasick/issues/8
    @Test
-    public void unicodeIssueBug8ReportedByDwyerk() {
+    public void testUnicode1() {
        String target = "LİKE THIS"; // The second character ('İ') is Unicode, which was read by AC as a 2-byte char
        assertEquals("THIS", target.substring(5, 9)); // Java does it the right way
-        Trie trie = Trie.builder().caseInsensitive().onlyWholeWords()
+        Trie trie = Trie.builder().ignoreCase().onlyWholeWords()
                .addKeyword("this")
                .build();
        Collection<Emit> emits = trie.parseText(target);
@ -402,11 +386,12 @@ public class TrieTest {
        checkEmit(it.next(), 5, 8, "this");
    }

+    // @see https://github.com/robert-bor/aho-corasick/issues/8
    @Test
-    public void unicodeIssueBug8ReportedByDwyerkFirstMatch() {
+    public void testUnicode2() {
        String target = "LİKE THIS"; // The second character ('İ') is Unicode, which was read by AC as a 2-byte char
        Trie trie = Trie.builder()
-                .caseInsensitive()
+                .ignoreCase()
                .onlyWholeWords()
                .addKeyword("this")
                .build();
@ -416,7 +401,7 @@ public class TrieTest {
    }

    @Test
-    public void partialMatchWhiteSpaces() {
+    public void testPartialMatchWhiteSpaces() {
        Trie trie = Trie.builder()
                .onlyWholeWordsWhiteSpaceSeparated()
                .addKeyword("#sugar-123")
@ -426,10 +411,66 @@ public class TrieTest {
        checkEmit(emits.iterator().next(), 0, 9, "#sugar-123");
    }

+    /**
+     * Plants one keyword at a fixed spacing throughout a million-character
+     * buffer of random digits and checks that the trie reports one match per
+     * planted occurrence.
+     */
+    @Test
+    public void testLargeString() {
+        final int spacing = 100;
+        final int length = 1000000;
+        final String needle = FOOD[ 1 ];
+
+        // The haystack holds digits only; presumably the keyword contains no
+        // digits, so it can only occur where it is injected -- TODO confirm
+        // against the FOOD constant.
+        final StringBuilder haystack = randomNumbers( length );
+        injectKeyword( haystack, needle, spacing );
+
+        final Trie trie = Trie.builder()
+            .onlyWholeWords()
+            .addKeyword( needle )
+            .build();
+
+        final int expectedMatches = length / spacing;
+        final int actualMatches = trie.parseText( haystack ).size();
+
+        assertEquals( expectedMatches, actualMatches );
+    }
+    
+    /**
+     * Generates a random sequence of ASCII digits.
+     *
+     * @param count The number of digits to generate.
+     * @return A character sequence containing exactly {@code count} random
+     * digits, each in the range 0 through 9.
+     */
+    private StringBuilder randomNumbers( final int count ) {
+        final StringBuilder sb = new StringBuilder( count );
+
+        // Count upward so that exactly `count` digits are appended; a
+        // pre-decrement loop of the form (--count > 0) stops one short.
+        for( int i = 0; i < count; i++ ) {
+            // The upper bound passed to randomInt is exclusive, so 10 yields 0-9.
+            sb.append( randomInt( 0, 10 ) );
+        }
+
+        return sb;
+    }
+    
+    /**
+     * Injects a keyword into a string builder at regular intervals, starting
+     * at index 0 and overwriting the characters already there.
+     *
+     * @param source Should contain a bunch of random data that cannot match
+     * any keyword.
+     * @param keyword A keyword to inject repeatedly in the text.
+     * @param interval How often to inject the keyword, as the distance in
+     * characters between the starts of consecutive injections; must be
+     * positive.
+     * @throws IllegalArgumentException if {@code interval} is not positive.
+     */
+    private void injectKeyword(
+        final StringBuilder source,
+        final String keyword,
+        final int interval ) {
+        // A zero interval would never advance the loop below, and a negative
+        // one would step to an invalid index, so reject both up front.
+        if( interval <= 0 ) {
+            throw new IllegalArgumentException(
+                "interval must be positive: " + interval );
+        }
+
+        // Capture the length once, before any replacement can change it.
+        final int length = source.length();
+        final int keywordLength = keyword.length();
+
+        for( int i = 0; i < length; i += interval ) {
+            // StringBuilder.replace clamps the end index to the current
+            // length, so an injection that overruns the end still writes the
+            // whole keyword and lengthens the buffer.
+            source.replace( i, i + keywordLength, keyword );
+        }
+    }
+    
+    /**
+     * Returns a pseudorandom integer drawn from the current thread's
+     * generator.
+     *
+     * @param min The smallest value that may be returned (inclusive).
+     * @param max The upper bound (exclusive).
+     * @return A value v such that min <= v < max.
+     */
+    private int randomInt( final int min, final int max ) {
+        final ThreadLocalRandom generator = ThreadLocalRandom.current();
+        return generator.nextInt( min, max );
+    }
+
    /**
     * Asserts that an emit has the expected start index, end index and keyword.
     *
     * @param next The emit to verify.
     * @param expectedStart The value next.getStart() must return.
     * @param expectedEnd The value next.getEnd() must return.
     * @param expectedKeyword The value next.getKeyword() must return.
     */
    private void checkEmit(Emit next, int expectedStart, int expectedEnd, String expectedKeyword) {
        assertEquals("Start of emit should have been " + expectedStart, expectedStart, next.getStart());
        assertEquals("End of emit should have been " + expectedEnd, expectedEnd, next.getEnd());
        assertEquals(expectedKeyword, next.getKeyword());
    }
-
 }