From df503bae438d6b9f728c0c18d3884a131441be9a Mon Sep 17 00:00:00 2001 From: ryan Date: Mon, 6 Oct 2014 10:52:35 -0700 Subject: [PATCH 1/3] Added method and tests for a faster path to return the first match. --- src/main/java/org/ahocorasick/trie/Trie.java | 354 +++++++++++------- .../java/org/ahocorasick/trie/TrieTest.java | 111 +++++- 2 files changed, 316 insertions(+), 149 deletions(-) diff --git a/src/main/java/org/ahocorasick/trie/Trie.java b/src/main/java/org/ahocorasick/trie/Trie.java index 155698b..c24af01 100644 --- a/src/main/java/org/ahocorasick/trie/Trie.java +++ b/src/main/java/org/ahocorasick/trie/Trie.java @@ -11,177 +11,251 @@ import java.util.concurrent.LinkedBlockingDeque; /** * - * Based on the Aho-Corasick white paper, Bell technologies: ftp://163.13.200.222/assistant/bearhero/prog/%A8%E4%A5%A6/ac_bm.pdf + * Based on the Aho-Corasick white paper, Bell technologies: + * ftp://163.13.200.222/assistant/bearhero/prog/%A8%E4%A5%A6/ac_bm.pdf + * * @author Robert Bor */ -public class Trie { +public class Trie +{ - private TrieConfig trieConfig; + private TrieConfig trieConfig; - private State rootState; + private State rootState; - private boolean failureStatesConstructed = false; + private boolean failureStatesConstructed = false; - public Trie(TrieConfig trieConfig) { - this.trieConfig = trieConfig; - this.rootState = new State(); - } + public Trie(TrieConfig trieConfig) + { + this.trieConfig = trieConfig; + this.rootState = new State(); + } - public Trie() { - this(new TrieConfig()); - } + public Trie() + { + this(new TrieConfig()); + } - public Trie caseInsensitive() { - this.trieConfig.setCaseInsensitive(true); - return this; - } + public Trie caseInsensitive() + { + this.trieConfig.setCaseInsensitive(true); + return this; + } - public Trie removeOverlaps() { - this.trieConfig.setAllowOverlaps(false); - return this; - } + public Trie removeOverlaps() + { + this.trieConfig.setAllowOverlaps(false); + return this; + } - public Trie onlyWholeWords() { 
- this.trieConfig.setOnlyWholeWords(true); - return this; - } + public Trie onlyWholeWords() + { + this.trieConfig.setOnlyWholeWords(true); + return this; + } - public void addKeyword(String keyword) { - if (keyword == null || keyword.length() == 0) { - return; - } - State currentState = this.rootState; - for (Character character : keyword.toCharArray()) { - currentState = currentState.addState(character); - } - currentState.addEmit(keyword); - } + public void addKeyword(String keyword) + { + if (keyword == null || keyword.length() == 0) { + return; + } + State currentState = this.rootState; + for (Character character : keyword.toCharArray()) { + currentState = currentState.addState(character); + } + currentState.addEmit(keyword); + } - public Collection tokenize(String text) { + public Collection tokenize(String text) + { - Collection tokens = new ArrayList(); + Collection tokens = new ArrayList(); - Collection collectedEmits = parseText(text); - int lastCollectedPosition = -1; - for (Emit emit : collectedEmits) { - if (emit.getStart() - lastCollectedPosition > 1) { - tokens.add(createFragment(emit, text, lastCollectedPosition)); - } - tokens.add(createMatch(emit, text)); - lastCollectedPosition = emit.getEnd(); - } - if (text.length() - lastCollectedPosition > 1) { - tokens.add(createFragment(null, text, lastCollectedPosition)); - } + Collection collectedEmits = parseText(text); + int lastCollectedPosition = -1; + for (Emit emit : collectedEmits) { + if (emit.getStart() - lastCollectedPosition > 1) { + tokens.add(createFragment(emit, text, lastCollectedPosition)); + } + tokens.add(createMatch(emit, text)); + lastCollectedPosition = emit.getEnd(); + } + if (text.length() - lastCollectedPosition > 1) { + tokens.add(createFragment(null, text, lastCollectedPosition)); + } - return tokens; - } + return tokens; + } - private Token createFragment(Emit emit, String text, int lastCollectedPosition) { - return new FragmentToken(text.substring(lastCollectedPosition+1, emit 
== null ? text.length() : emit.getStart())); - } + private Token createFragment(Emit emit, String text, int lastCollectedPosition) + { + return new FragmentToken(text.substring(lastCollectedPosition + 1, emit == null ? text.length() : emit. + getStart())); + } - private Token createMatch(Emit emit, String text) { - return new MatchToken(text.substring(emit.getStart(), emit.getEnd()+1), emit); - } + private Token createMatch(Emit emit, String text) + { + return new MatchToken(text.substring(emit.getStart(), emit.getEnd() + 1), emit); + } - @SuppressWarnings("unchecked") - public Collection parseText(String text) { - checkForConstructedFailureStates(); + @SuppressWarnings("unchecked") + public Collection parseText(String text) + { + checkForConstructedFailureStates(); - int position = 0; - State currentState = this.rootState; - List collectedEmits = new ArrayList(); - for (Character character : text.toCharArray()) { - if (trieConfig.isCaseInsensitive()) { - character = Character.toLowerCase(character); - } - currentState = getState(currentState, character); - storeEmits(position, currentState, collectedEmits); - position++; - } + int position = 0; + State currentState = this.rootState; + List collectedEmits = new ArrayList(); + for (Character character : text.toCharArray()) { + if (trieConfig.isCaseInsensitive()) { + character = Character.toLowerCase(character); + } + currentState = getState(currentState, character); + storeEmits(position, currentState, collectedEmits); + position++; + } - if (trieConfig.isOnlyWholeWords()) { - removePartialMatches(text, collectedEmits); - } + if (trieConfig.isOnlyWholeWords()) { + removePartialMatches(text, collectedEmits); + } - if (!trieConfig.isAllowOverlaps()) { - IntervalTree intervalTree = new IntervalTree((List)(List)collectedEmits); - intervalTree.removeOverlaps((List) (List) collectedEmits); - } + if (!trieConfig.isAllowOverlaps()) { + IntervalTree intervalTree = new IntervalTree((List) (List) collectedEmits); + 
intervalTree.removeOverlaps((List) (List) collectedEmits); + } - return collectedEmits; - } + return collectedEmits; + } - private void removePartialMatches(String searchText, List collectedEmits) { - long size = searchText.length(); - List removeEmits = new ArrayList(); - for (Emit emit : collectedEmits) { - if ((emit.getStart() == 0 || - !Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) && - (emit.getEnd() + 1 == size || - !Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1)))) { - continue; - } - removeEmits.add(emit); - } + public boolean matches(String text) + { + Emit firstMatch = firstMatch(text); + return firstMatch != null; + } - for (Emit removeEmit : removeEmits) { - collectedEmits.remove(removeEmit); - } - } + public Emit firstMatch(String text) + { - private State getState(State currentState, Character character) { - State newCurrentState = currentState.nextState(character); - while (newCurrentState == null) { - currentState = currentState.failure(); - newCurrentState = currentState.nextState(character); - } - return newCurrentState; - } + if (!trieConfig.isAllowOverlaps()) { + // Slow path. Needs to find all the matches to detect overlaps. + Collection parseText = parseText(text); + if (parseText != null && !parseText.isEmpty()) { + return parseText.iterator().next(); + } + } else { + // Fast path. Returs first match found. 
- private void checkForConstructedFailureStates() { - if (!this.failureStatesConstructed) { - constructFailureStates(); - } - } + checkForConstructedFailureStates(); - private void constructFailureStates() { - Queue queue = new LinkedBlockingDeque(); + int position = 0; + State currentState = this.rootState; + for (Character character : text.toCharArray()) { + if (trieConfig.isCaseInsensitive()) { + character = Character.toLowerCase(character); + } + currentState = getState(currentState, character); - // First, set the fail state of all depth 1 states to the root state - for (State depthOneState : this.rootState.getStates()) { - depthOneState.setFailure(this.rootState); - queue.add(depthOneState); - } - this.failureStatesConstructed = true; + Collection emitStrs = currentState.emit(); + if (emitStrs != null && !emitStrs.isEmpty()) { + for (String emitStr : emitStrs) { + final Emit emit = new Emit(position - emitStr.length() + 1, position, emitStr); - // Second, determine the fail state for all depth > 1 state - while (!queue.isEmpty()) { - State currentState = queue.remove(); + if (trieConfig.isOnlyWholeWords()) { + if (!isPartialMatch(text, emit)) { + return emit; + } + } else { + return emit; + } + } + } - for (Character transition : currentState.getTransitions()) { - State targetState = currentState.nextState(transition); - queue.add(targetState); + position++; + } - State traceFailureState = currentState.failure(); - while (traceFailureState.nextState(transition) == null) { - traceFailureState = traceFailureState.failure(); - } - State newFailureState = traceFailureState.nextState(transition); - targetState.setFailure(newFailureState); - targetState.addEmit(newFailureState.emit()); - } - } - } + } - private void storeEmits(int position, State currentState, List collectedEmits) { - Collection emits = currentState.emit(); - if (emits != null && !emits.isEmpty()) { - for (String emit : emits) { - collectedEmits.add(new Emit(position-emit.length()+1, position, 
emit)); - } - } - } + return null; + } + + private boolean isPartialMatch(String searchText, Emit emit) + { + return (emit.getStart() != 0 && + Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) || + (emit.getEnd() + 1 != searchText.length() && + Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1))); + } + + private void removePartialMatches(String searchText, List collectedEmits) + { + long size = searchText.length(); + List removeEmits = new ArrayList(); + for (Emit emit : collectedEmits) { + if (isPartialMatch(searchText, emit)) { + removeEmits.add(emit); + } + } + + for (Emit removeEmit : removeEmits) { + collectedEmits.remove(removeEmit); + } + } + + private State getState(State currentState, Character character) + { + State newCurrentState = currentState.nextState(character); + while (newCurrentState == null) { + currentState = currentState.failure(); + newCurrentState = currentState.nextState(character); + } + return newCurrentState; + } + + private void checkForConstructedFailureStates() + { + if (!this.failureStatesConstructed) { + constructFailureStates(); + } + } + + private void constructFailureStates() + { + Queue queue = new LinkedBlockingDeque(); + + // First, set the fail state of all depth 1 states to the root state + for (State depthOneState : this.rootState.getStates()) { + depthOneState.setFailure(this.rootState); + queue.add(depthOneState); + } + this.failureStatesConstructed = true; + + // Second, determine the fail state for all depth > 1 state + while (!queue.isEmpty()) { + State currentState = queue.remove(); + + for (Character transition : currentState.getTransitions()) { + State targetState = currentState.nextState(transition); + queue.add(targetState); + + State traceFailureState = currentState.failure(); + while (traceFailureState.nextState(transition) == null) { + traceFailureState = traceFailureState.failure(); + } + State newFailureState = traceFailureState.nextState(transition); + 
targetState.setFailure(newFailureState); + targetState.addEmit(newFailureState.emit()); + } + } + } + + private void storeEmits(int position, State currentState, List collectedEmits) + { + Collection emits = currentState.emit(); + if (emits != null && !emits.isEmpty()) { + for (String emit : emits) { + collectedEmits.add(new Emit(position - emit.length() + 1, position, emit)); + } + } + } } diff --git a/src/test/java/org/ahocorasick/trie/TrieTest.java b/src/test/java/org/ahocorasick/trie/TrieTest.java index 6cc7ff7..d0d20a9 100644 --- a/src/test/java/org/ahocorasick/trie/TrieTest.java +++ b/src/test/java/org/ahocorasick/trie/TrieTest.java @@ -18,6 +18,14 @@ public class TrieTest { checkEmit(iterator.next(), 0, 2, "abc"); } + @Test + public void keywordAndTextAreTheSameFirstMatch() { + Trie trie = new Trie(); + trie.addKeyword("abc"); + Emit firstMatch = trie.firstMatch("abc"); + checkEmit(firstMatch, 0, 2, "abc"); + } + @Test public void textIsLongerThanKeyword() { Trie trie = new Trie(); @@ -27,6 +35,14 @@ public class TrieTest { checkEmit(iterator.next(), 1, 3, "abc"); } + @Test + public void textIsLongerThanKeywordFirstMatch() { + Trie trie = new Trie(); + trie.addKeyword("abc"); + Emit firstMatch = trie.firstMatch(" abc"); + checkEmit(firstMatch, 1, 3, "abc"); + } + @Test public void variousKeywordsOneMatch() { Trie trie = new Trie(); @@ -38,6 +54,16 @@ public class TrieTest { checkEmit(iterator.next(), 0, 2, "bcd"); } + @Test + public void variousKeywordsFirstMatch() { + Trie trie = new Trie(); + trie.addKeyword("abc"); + trie.addKeyword("bcd"); + trie.addKeyword("cde"); + Emit firstMatch = trie.firstMatch("bcd"); + checkEmit(firstMatch, 0, 2, "bcd"); + } + @Test public void ushersTest() { Trie trie = new Trie(); @@ -53,6 +79,17 @@ public class TrieTest { checkEmit(iterator.next(), 2, 5, "hers"); } + @Test + public void ushersTestFirstMatch() { + Trie trie = new Trie(); + trie.addKeyword("hers"); + trie.addKeyword("his"); + trie.addKeyword("she"); + 
trie.addKeyword("he"); + Emit firstMatch = trie.firstMatch("ushers"); + checkEmit(firstMatch, 2, 3, "he"); + } + @Test public void misleadingTest() { Trie trie = new Trie(); @@ -62,6 +99,14 @@ public class TrieTest { checkEmit(iterator.next(), 9, 12, "hers"); } + @Test + public void misleadingTestFirstMatch() { + Trie trie = new Trie(); + trie.addKeyword("hers"); + Emit firstMatch = trie.firstMatch("h he her hers"); + checkEmit(firstMatch, 9, 12, "hers"); + } + @Test public void recipes() { Trie trie = new Trie(); @@ -77,20 +122,26 @@ public class TrieTest { checkEmit(iterator.next(), 51, 58, "broccoli"); } + @Test + public void recipesFirstMatch() { + Trie trie = new Trie(); + trie.addKeyword("veal"); + trie.addKeyword("cauliflower"); + trie.addKeyword("broccoli"); + trie.addKeyword("tomatoes"); + Emit firstMatch = trie.firstMatch("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli"); + + checkEmit(firstMatch, 2, 12, "cauliflower"); + } + @Test public void longAndShortOverlappingMatch() { Trie trie = new Trie(); trie.addKeyword("he"); trie.addKeyword("hehehehe"); - Collection emits = trie.parseText("hehehehehe"); - Iterator iterator = emits.iterator(); - checkEmit(iterator.next(), 0, 1, "he"); - checkEmit(iterator.next(), 2, 3, "he"); - checkEmit(iterator.next(), 4, 5, "he"); - checkEmit(iterator.next(), 6, 7, "he"); - checkEmit(iterator.next(), 0, 7, "hehehehe"); - checkEmit(iterator.next(), 8, 9, "he"); - checkEmit(iterator.next(), 2, 9, "hehehehe"); + Emit firstMatch = trie.firstMatch("hehehehehe"); + + checkEmit(firstMatch, 0, 1, "he"); } @Test @@ -107,6 +158,17 @@ public class TrieTest { checkEmit(iterator.next(), 6, 7, "ab"); } + @Test + public void nonOverlappingFirstMatch() { + Trie trie = new Trie().removeOverlaps(); + trie.addKeyword("ab"); + trie.addKeyword("cba"); + trie.addKeyword("ababc"); + Emit firstMatch = trie.firstMatch("ababcbab"); + + checkEmit(firstMatch, 0, 4, "ababc"); + } + @Test public void startOfChurchillSpeech() { Trie trie = 
new Trie().removeOverlaps(); @@ -133,6 +195,15 @@ public class TrieTest { checkEmit(emits.iterator().next(), 20, 24, "sugar"); } + @Test + public void partialMatchFirstMatch() { + Trie trie = new Trie().onlyWholeWords(); + trie.addKeyword("sugar"); + Emit firstMatch = trie.firstMatch("sugarcane sugarcane sugar canesugar"); // left, middle, right test + + checkEmit(firstMatch, 20, 24, "sugar"); + } + @Test public void tokenizeFullSentence() { Trie trie = new Trie(); @@ -183,6 +254,18 @@ public class TrieTest { checkEmit(it.next(), 19, 23, "börkü"); } + @Test + public void caseInsensitiveFirstMatch() { + Trie trie = new Trie().caseInsensitive(); + trie.addKeyword("turning"); + trie.addKeyword("once"); + trie.addKeyword("again"); + trie.addKeyword("börkü"); + Emit firstMatch = trie.firstMatch("TurninG OnCe AgAiN BÖRKÜ"); + + checkEmit(firstMatch, 0, 6, "turning"); + } + @Test public void tokenizeTokensInSequence() { Trie trie = new Trie(); @@ -214,6 +297,16 @@ public class TrieTest { checkEmit(it.next(), 5, 8, "this"); } + @Test + public void unicodeIssueBug8ReportedByDwyerkFirstMatch() { + String target = "LİKE THIS"; // The second character ('İ') is Unicode, which was read by AC as a 2-byte char + Trie trie = new Trie().caseInsensitive().onlyWholeWords(); + assertEquals("THIS", target.substring(5,9)); // Java does it the right way + trie.addKeyword("this"); + Emit firstMatch = trie.firstMatch(target); + checkEmit(firstMatch, 5, 8, "this"); + } + private void checkEmit(Emit next, int expectedStart, int expectedEnd, String expectedKeyword) { assertEquals("Start of emit should have been "+expectedStart, expectedStart, next.getStart()); assertEquals("End of emit should have been "+expectedEnd, expectedEnd, next.getEnd()); From a46e7dfe1db54196209e1f38a56c0aa30b78a582 Mon Sep 17 00:00:00 2001 From: ryan Date: Mon, 6 Oct 2014 11:02:01 -0700 Subject: [PATCH 2/3] Fixed formatting changes. 
--- src/main/java/org/ahocorasick/trie/Trie.java | 277 +++++++++---------- 1 file changed, 125 insertions(+), 152 deletions(-) diff --git a/src/main/java/org/ahocorasick/trie/Trie.java b/src/main/java/org/ahocorasick/trie/Trie.java index c24af01..8d8f0eb 100644 --- a/src/main/java/org/ahocorasick/trie/Trie.java +++ b/src/main/java/org/ahocorasick/trie/Trie.java @@ -12,120 +12,107 @@ import java.util.concurrent.LinkedBlockingDeque; /** * * Based on the Aho-Corasick white paper, Bell technologies: - * ftp://163.13.200.222/assistant/bearhero/prog/%A8%E4%A5%A6/ac_bm.pdf - * + * ftp://163.13.200.222/assistant/bearhero/prog/%A8%E4%A5%A6/ac_bm.pdf * @author Robert Bor */ -public class Trie -{ +public class Trie { - private TrieConfig trieConfig; + private TrieConfig trieConfig; - private State rootState; + private State rootState; - private boolean failureStatesConstructed = false; + private boolean failureStatesConstructed = false; - public Trie(TrieConfig trieConfig) - { - this.trieConfig = trieConfig; - this.rootState = new State(); - } + public Trie(TrieConfig trieConfig) { + this.trieConfig = trieConfig; + this.rootState = new State(); + } - public Trie() - { - this(new TrieConfig()); - } + public Trie() { + this(new TrieConfig()); + } - public Trie caseInsensitive() - { - this.trieConfig.setCaseInsensitive(true); - return this; - } + public Trie caseInsensitive() { + this.trieConfig.setCaseInsensitive(true); + return this; + } - public Trie removeOverlaps() - { - this.trieConfig.setAllowOverlaps(false); - return this; - } + public Trie removeOverlaps() { + this.trieConfig.setAllowOverlaps(false); + return this; + } - public Trie onlyWholeWords() - { - this.trieConfig.setOnlyWholeWords(true); - return this; - } + public Trie onlyWholeWords() { + this.trieConfig.setOnlyWholeWords(true); + return this; + } - public void addKeyword(String keyword) - { - if (keyword == null || keyword.length() == 0) { - return; - } - State currentState = this.rootState; - for (Character 
character : keyword.toCharArray()) { - currentState = currentState.addState(character); - } - currentState.addEmit(keyword); - } + public void addKeyword(String keyword) { + if (keyword == null || keyword.length() == 0) { + return; + } + State currentState = this.rootState; + for (Character character : keyword.toCharArray()) { + currentState = currentState.addState(character); + } + currentState.addEmit(keyword); + } - public Collection tokenize(String text) - { + public Collection tokenize(String text) { - Collection tokens = new ArrayList(); + Collection tokens = new ArrayList(); - Collection collectedEmits = parseText(text); - int lastCollectedPosition = -1; - for (Emit emit : collectedEmits) { - if (emit.getStart() - lastCollectedPosition > 1) { - tokens.add(createFragment(emit, text, lastCollectedPosition)); - } - tokens.add(createMatch(emit, text)); - lastCollectedPosition = emit.getEnd(); - } - if (text.length() - lastCollectedPosition > 1) { - tokens.add(createFragment(null, text, lastCollectedPosition)); - } + Collection collectedEmits = parseText(text); + int lastCollectedPosition = -1; + for (Emit emit : collectedEmits) { + if (emit.getStart() - lastCollectedPosition > 1) { + tokens.add(createFragment(emit, text, lastCollectedPosition)); + } + tokens.add(createMatch(emit, text)); + lastCollectedPosition = emit.getEnd(); + } + if (text.length() - lastCollectedPosition > 1) { + tokens.add(createFragment(null, text, lastCollectedPosition)); + } - return tokens; - } + return tokens; + } - private Token createFragment(Emit emit, String text, int lastCollectedPosition) - { - return new FragmentToken(text.substring(lastCollectedPosition + 1, emit == null ? text.length() : emit. - getStart())); - } + private Token createFragment(Emit emit, String text, int lastCollectedPosition) { + return new FragmentToken(text.substring(lastCollectedPosition+1, emit == null ? 
text.length() : emit.getStart())); + } - private Token createMatch(Emit emit, String text) - { - return new MatchToken(text.substring(emit.getStart(), emit.getEnd() + 1), emit); - } + private Token createMatch(Emit emit, String text) { + return new MatchToken(text.substring(emit.getStart(), emit.getEnd()+1), emit); + } - @SuppressWarnings("unchecked") - public Collection parseText(String text) - { - checkForConstructedFailureStates(); + @SuppressWarnings("unchecked") + public Collection parseText(String text) { + checkForConstructedFailureStates(); - int position = 0; - State currentState = this.rootState; - List collectedEmits = new ArrayList(); - for (Character character : text.toCharArray()) { - if (trieConfig.isCaseInsensitive()) { - character = Character.toLowerCase(character); - } - currentState = getState(currentState, character); - storeEmits(position, currentState, collectedEmits); - position++; - } + int position = 0; + State currentState = this.rootState; + List collectedEmits = new ArrayList(); + for (Character character : text.toCharArray()) { + if (trieConfig.isCaseInsensitive()) { + character = Character.toLowerCase(character); + } + currentState = getState(currentState, character); + storeEmits(position, currentState, collectedEmits); + position++; + } - if (trieConfig.isOnlyWholeWords()) { - removePartialMatches(text, collectedEmits); - } + if (trieConfig.isOnlyWholeWords()) { + removePartialMatches(text, collectedEmits); + } - if (!trieConfig.isAllowOverlaps()) { - IntervalTree intervalTree = new IntervalTree((List) (List) collectedEmits); - intervalTree.removeOverlaps((List) (List) collectedEmits); - } + if (!trieConfig.isAllowOverlaps()) { + IntervalTree intervalTree = new IntervalTree((List)(List)collectedEmits); + intervalTree.removeOverlaps((List) (List) collectedEmits); + } - return collectedEmits; - } + return collectedEmits; + } public boolean matches(String text) { @@ -135,7 +122,6 @@ public class Trie public Emit firstMatch(String text) 
{ - if (!trieConfig.isAllowOverlaps()) { // Slow path. Needs to find all the matches to detect overlaps. Collection parseText = parseText(text); @@ -143,10 +129,8 @@ public class Trie return parseText.iterator().next(); } } else { - // Fast path. Returs first match found. - + // Fast path. Returns first match found. checkForConstructedFailureStates(); - int position = 0; State currentState = this.rootState; for (Character character : text.toCharArray()) { @@ -154,12 +138,10 @@ public class Trie character = Character.toLowerCase(character); } currentState = getState(currentState, character); - Collection emitStrs = currentState.emit(); if (emitStrs != null && !emitStrs.isEmpty()) { for (String emitStr : emitStrs) { final Emit emit = new Emit(position - emitStr.length() + 1, position, emitStr); - if (trieConfig.isOnlyWholeWords()) { if (!isPartialMatch(text, emit)) { return emit; @@ -169,12 +151,9 @@ public class Trie } } } - position++; } - } - return null; } @@ -188,74 +167,68 @@ public class Trie private void removePartialMatches(String searchText, List collectedEmits) { - long size = searchText.length(); List removeEmits = new ArrayList(); for (Emit emit : collectedEmits) { if (isPartialMatch(searchText, emit)) { removeEmits.add(emit); } } - for (Emit removeEmit : removeEmits) { collectedEmits.remove(removeEmit); } } - private State getState(State currentState, Character character) - { - State newCurrentState = currentState.nextState(character); - while (newCurrentState == null) { - currentState = currentState.failure(); - newCurrentState = currentState.nextState(character); - } - return newCurrentState; - } + private State getState(State currentState, Character character) { + State newCurrentState = currentState.nextState(character); + while (newCurrentState == null) { + currentState = currentState.failure(); + newCurrentState = currentState.nextState(character); + } + return newCurrentState; + } - private void checkForConstructedFailureStates() - { - if 
(!this.failureStatesConstructed) { - constructFailureStates(); - } - } + private void checkForConstructedFailureStates() { + if (!this.failureStatesConstructed) { + constructFailureStates(); + } + } - private void constructFailureStates() - { - Queue queue = new LinkedBlockingDeque(); + private void constructFailureStates() { + Queue queue = new LinkedBlockingDeque(); - // First, set the fail state of all depth 1 states to the root state - for (State depthOneState : this.rootState.getStates()) { - depthOneState.setFailure(this.rootState); - queue.add(depthOneState); - } - this.failureStatesConstructed = true; + // First, set the fail state of all depth 1 states to the root state + for (State depthOneState : this.rootState.getStates()) { + depthOneState.setFailure(this.rootState); + queue.add(depthOneState); + } + this.failureStatesConstructed = true; - // Second, determine the fail state for all depth > 1 state - while (!queue.isEmpty()) { - State currentState = queue.remove(); + // Second, determine the fail state for all depth > 1 state + while (!queue.isEmpty()) { + State currentState = queue.remove(); - for (Character transition : currentState.getTransitions()) { - State targetState = currentState.nextState(transition); - queue.add(targetState); + for (Character transition : currentState.getTransitions()) { + State targetState = currentState.nextState(transition); + queue.add(targetState); - State traceFailureState = currentState.failure(); - while (traceFailureState.nextState(transition) == null) { - traceFailureState = traceFailureState.failure(); - } - State newFailureState = traceFailureState.nextState(transition); - targetState.setFailure(newFailureState); - targetState.addEmit(newFailureState.emit()); - } - } - } + State traceFailureState = currentState.failure(); + while (traceFailureState.nextState(transition) == null) { + traceFailureState = traceFailureState.failure(); + } + State newFailureState = traceFailureState.nextState(transition); + 
targetState.setFailure(newFailureState); + targetState.addEmit(newFailureState.emit()); + } + } + } - private void storeEmits(int position, State currentState, List collectedEmits) - { - Collection emits = currentState.emit(); - if (emits != null && !emits.isEmpty()) { - for (String emit : emits) { - collectedEmits.add(new Emit(position - emit.length() + 1, position, emit)); - } - } - } + private void storeEmits(int position, State currentState, List collectedEmits) { + Collection emits = currentState.emit(); + if (emits != null && !emits.isEmpty()) { + for (String emit : emits) { + collectedEmits.add(new Emit(position-emit.length()+1, position, emit)); + } + } + } } From d1478c7480f07ce00c12cda3f02c213de59e8aae Mon Sep 17 00:00:00 2001 From: ryan Date: Mon, 6 Oct 2014 13:34:03 -0700 Subject: [PATCH 3/3] HashMap has better performance in my test cases. --- src/main/java/org/ahocorasick/trie/State.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/ahocorasick/trie/State.java b/src/main/java/org/ahocorasick/trie/State.java index 1173fe9..5220f72 100644 --- a/src/main/java/org/ahocorasick/trie/State.java +++ b/src/main/java/org/ahocorasick/trie/State.java @@ -35,7 +35,7 @@ public class State { * referred to in the white paper as the 'goto' structure. From a state it is possible to go * to other states, depending on the character passed. */ - private Map success = new TreeMap(); + private Map success = new HashMap(); /** if no matching states are found, the failure state will be returned */ private State failure = null;