diff --git a/src/main/java/org/ahocorasick/trie/State.java b/src/main/java/org/ahocorasick/trie/State.java index 5220f72..82d167b 100644 --- a/src/main/java/org/ahocorasick/trie/State.java +++ b/src/main/java/org/ahocorasick/trie/State.java @@ -1,5 +1,7 @@ package org.ahocorasick.trie; +import org.ahocorasick.trie.candidate.EmitCandidateFlushHandler; + import java.util.*; /** @@ -98,10 +100,17 @@ public class State { return this.emits == null ? Collections. emptyList() : this.emits; } - public State failure() { + public State failure(EmitCandidateFlushHandler emitCandidateFlushHandler) { + if (emitCandidateFlushHandler != null && this.failure.isRootState()) { + emitCandidateFlushHandler.flush(); + } return this.failure; } + public State failure() { + return failure(null); + } + public void setFailure(State failState) { this.failure = failState; } @@ -114,4 +123,8 @@ public class State { return this.success.keySet(); } + public boolean isRootState() { + return this.depth == 0; + } + } diff --git a/src/main/java/org/ahocorasick/trie/Trie.java b/src/main/java/org/ahocorasick/trie/Trie.java index b223805..2b21334 100644 --- a/src/main/java/org/ahocorasick/trie/Trie.java +++ b/src/main/java/org/ahocorasick/trie/Trie.java @@ -1,13 +1,14 @@ package org.ahocorasick.trie; -import org.ahocorasick.interval.IntervalTree; -import org.ahocorasick.interval.Intervalable; +import org.ahocorasick.trie.candidate.EmitCandidateFlushHandler; +import org.ahocorasick.trie.candidate.EmitCandidateHolder; +import org.ahocorasick.trie.candidate.NonOverlappingEmitCandidateHolder; +import org.ahocorasick.trie.candidate.OverlappingEmitCandidateHolder; import org.ahocorasick.trie.handler.DefaultEmitHandler; import org.ahocorasick.trie.handler.EmitHandler; +import org.ahocorasick.trie.handler.FirstMatchHandler; -import java.util.ArrayList; import java.util.Collection; -import java.util.List; import java.util.Queue; import java.util.concurrent.LinkedBlockingDeque; @@ -49,23 +50,7 @@ public class Trie { public Collection parseText(CharSequence text) { DefaultEmitHandler emitHandler = new DefaultEmitHandler(); parseText(text, emitHandler); - - List collectedEmits = emitHandler.getEmits(); - - if (trieConfig.isOnlyWholeWords()) { - removePartialMatches(text, collectedEmits); - } - - if (trieConfig.isOnlyWholeWordsWhiteSpaceSeparated()) { - removePartialMatchesWhiteSpaceSeparated(text, collectedEmits); - } - - if (!trieConfig.isAllowOverlaps()) { - IntervalTree intervalTree = new IntervalTree((List)(List)collectedEmits); - intervalTree.removeOverlaps((List) (List) collectedEmits); - } - - return collectedEmits; + return emitHandler.getEmits(); } public boolean containsMatch(CharSequence text) { @@ -73,91 +58,56 @@ public class Trie { return firstMatch != null; } + public Emit firstMatch(CharSequence text) { + FirstMatchHandler emitHandler = new FirstMatchHandler(); + parseText(text, emitHandler); + return emitHandler.getFirstMatch(); + } + public void parseText(CharSequence text, EmitHandler emitHandler) { + + final EmitCandidateHolder emitCandidateHolder = this.trieConfig.isAllowOverlaps() ? + new OverlappingEmitCandidateHolder() : + new NonOverlappingEmitCandidateHolder(); + + final EmitCandidateFlushHandler flushHandler = new EmitCandidateFlushHandler(emitHandler, emitCandidateHolder); + State currentState = this.rootState; for (int position = 0; position < text.length(); position++) { + + if (flushHandler.stop()) { + return; + } + Character character = text.charAt(position); if (trieConfig.isCaseInsensitive()) { character = Character.toLowerCase(character); } - currentState = getState(currentState, character); - storeEmits(position, currentState, emitHandler); - } + currentState = getState(currentState, character, flushHandler); - } - - public Emit firstMatch(CharSequence text) { - if (!trieConfig.isAllowOverlaps()) { - // Slow path. Needs to find all the matches to detect overlaps. - Collection parseText = parseText(text); - if (parseText != null && !parseText.isEmpty()) { - return parseText.iterator().next(); - } - } else { - // Fast path. Returns first match found. - State currentState = this.rootState; - for (int position = 0; position < text.length(); position++) { - Character character = text.charAt(position); - if (trieConfig.isCaseInsensitive()) { - character = Character.toLowerCase(character); - } - currentState = getState(currentState, character); - Collection emitStrs = currentState.emit(); - if (emitStrs != null && !emitStrs.isEmpty()) { - for (String emitStr : emitStrs) { - final Emit emit = new Emit(position - emitStr.length() + 1, position, emitStr); - if (trieConfig.isOnlyWholeWords()) { - if (!isPartialMatch(text, emit)) { - return emit; - } - } else { - return emit; - } - } - } - } - } - return null; - } - - private boolean isPartialMatch(CharSequence searchText, Emit emit) { - return (emit.getStart() != 0 && - Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) || - (emit.getEnd() + 1 != searchText.length() && - Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1))); - } - - private void removePartialMatches(CharSequence searchText, List collectedEmits) { - List removeEmits = new ArrayList<>(); - for (Emit emit : collectedEmits) { - if (isPartialMatch(searchText, emit)) { - removeEmits.add(emit); - } - } - for (Emit removeEmit : removeEmits) { - collectedEmits.remove(removeEmit); - } - } - - private void removePartialMatchesWhiteSpaceSeparated(CharSequence searchText, List collectedEmits) { - long size = searchText.length(); - List removeEmits = new ArrayList<>(); - for (Emit emit : collectedEmits) { - if ((emit.getStart() == 0 || Character.isWhitespace(searchText.charAt(emit.getStart() - 1))) && - (emit.getEnd() + 1 == size || Character.isWhitespace(searchText.charAt(emit.getEnd() + 1)))) { - continue; + Collection emits = currentState.emit(); + if (emits != null && !emits.isEmpty()) { + for (String emit : emits) { + int start = position - emit.length() + 1; + if (!trieConfig.isOnlyWholeWords() || isWholeWord(text, start, position)) { + emitCandidateHolder.addCandidate(new Emit(start, position, emit)); + } + } } - removeEmits.add(emit); - } - for (Emit removeEmit : removeEmits) { - collectedEmits.remove(removeEmit); + } + flushHandler.flush(); } - private State getState(State currentState, Character character) { + private boolean isWholeWord(CharSequence text, int start, int end) { + return (start == 0 || Character.isWhitespace(text.charAt(start - 1))) && + (end == text.length() - 1 || Character.isWhitespace(text.charAt(end + 1))); + } + + private State getState(State currentState, Character character, EmitCandidateFlushHandler flushHandler) { State newCurrentState = currentState.nextState(character); while (newCurrentState == null) { - currentState = currentState.failure(); + currentState = currentState.failure(flushHandler); newCurrentState = currentState.nextState(character); } return newCurrentState; @@ -191,15 +141,6 @@ public class Trie { } } - private void storeEmits(int position, State currentState, EmitHandler emitHandler) { - Collection emits = currentState.emit(); - if (emits != null && !emits.isEmpty()) { - for (String emit : emits) { - emitHandler.emit(new Emit(position - emit.length() + 1, position, emit)); - } - } - } - public static TrieBuilder builder() { return new TrieBuilder(); } @@ -227,11 +168,6 @@ public class Trie { return this; } - public TrieBuilder onlyWholeWordsWhiteSpaceSeparated() { - this.trieConfig.setOnlyWholeWordsWhiteSpaceSeparated(true); - return this; - } - public TrieBuilder addKeyword(String keyword) { trie.addKeyword(keyword); return this; diff --git a/src/main/java/org/ahocorasick/trie/TrieConfig.java b/src/main/java/org/ahocorasick/trie/TrieConfig.java index 1bd365e..6fa05c7 100644 --- a/src/main/java/org/ahocorasick/trie/TrieConfig.java +++ b/src/main/java/org/ahocorasick/trie/TrieConfig.java @@ -6,8 +6,6 @@ public class TrieConfig { private boolean onlyWholeWords = false; - private boolean onlyWholeWordsWhiteSpaceSeparated = false; - private boolean caseInsensitive = false; public boolean isAllowOverlaps() { @@ -26,12 +24,6 @@ public class TrieConfig { this.onlyWholeWords = onlyWholeWords; } - public boolean isOnlyWholeWordsWhiteSpaceSeparated() { return onlyWholeWordsWhiteSpaceSeparated; } - - public void setOnlyWholeWordsWhiteSpaceSeparated(boolean onlyWholeWordsWhiteSpaceSeparated) { - this.onlyWholeWordsWhiteSpaceSeparated = onlyWholeWordsWhiteSpaceSeparated; - } - public boolean isCaseInsensitive() { return caseInsensitive; } diff --git a/src/main/java/org/ahocorasick/trie/candidate/EmitCandidateFlushHandler.java b/src/main/java/org/ahocorasick/trie/candidate/EmitCandidateFlushHandler.java new file mode 100644 index 0000000..319f96d --- /dev/null +++ b/src/main/java/org/ahocorasick/trie/candidate/EmitCandidateFlushHandler.java @@ -0,0 +1,27 @@ +package org.ahocorasick.trie.candidate; + +import org.ahocorasick.trie.Emit; +import org.ahocorasick.trie.handler.EmitHandler; + +public class EmitCandidateFlushHandler { + + private final EmitHandler emitHandler; + + private final EmitCandidateHolder emitCandidateHolder; + + public EmitCandidateFlushHandler(EmitHandler emitHandler, EmitCandidateHolder emitCandidateHolder) { + this.emitHandler = emitHandler; + this.emitCandidateHolder = emitCandidateHolder; + } + + public void flush() { + for (Emit emit : emitCandidateHolder.flush()) { + emitHandler.emit(emit); + } + } + + public boolean stop() { + return emitHandler.stop(); + } + +} diff --git a/src/main/java/org/ahocorasick/trie/candidate/EmitCandidateHolder.java b/src/main/java/org/ahocorasick/trie/candidate/EmitCandidateHolder.java new file mode 100644 index 0000000..e89e01f --- /dev/null +++ b/src/main/java/org/ahocorasick/trie/candidate/EmitCandidateHolder.java @@ -0,0 +1,12 @@ +package org.ahocorasick.trie.candidate; + +import org.ahocorasick.trie.Emit; + +import java.util.List; + +public interface EmitCandidateHolder { + + void addCandidate(Emit emitCandidate); + List flush(); + +} diff --git a/src/main/java/org/ahocorasick/trie/candidate/NonOverlappingEmitCandidateHolder.java b/src/main/java/org/ahocorasick/trie/candidate/NonOverlappingEmitCandidateHolder.java new file mode 100644 index 0000000..6d6ff97 --- /dev/null +++ b/src/main/java/org/ahocorasick/trie/candidate/NonOverlappingEmitCandidateHolder.java @@ -0,0 +1,18 @@ +package org.ahocorasick.trie.candidate; + +import org.ahocorasick.interval.IntervalTree; +import org.ahocorasick.interval.Intervalable; +import org.ahocorasick.trie.Emit; + +import java.util.*; + +public class NonOverlappingEmitCandidateHolder extends OverlappingEmitCandidateHolder { + + @Override + public List flush() { + IntervalTree intervalTree = new IntervalTree((List)(List)emitCandidates); + intervalTree.removeOverlaps((List) (List) emitCandidates); + return super.flush(); + } + +} diff --git a/src/main/java/org/ahocorasick/trie/candidate/OverlappingEmitCandidateHolder.java b/src/main/java/org/ahocorasick/trie/candidate/OverlappingEmitCandidateHolder.java new file mode 100644 index 0000000..512caa0 --- /dev/null +++ b/src/main/java/org/ahocorasick/trie/candidate/OverlappingEmitCandidateHolder.java @@ -0,0 +1,28 @@ +package org.ahocorasick.trie.candidate; + +import org.ahocorasick.trie.Emit; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +public class OverlappingEmitCandidateHolder implements EmitCandidateHolder { + + protected List emitCandidates = new ArrayList<>(); + + @Override + public void addCandidate(Emit emitCandidate) { + this.emitCandidates.add(emitCandidate); + } + + @Override + public List flush() { + return reset(emitCandidates); + } + + private List reset(List emitCandidates) { + this.emitCandidates = new ArrayList<>(); + return emitCandidates; + } + +} diff --git a/src/main/java/org/ahocorasick/trie/handler/DefaultEmitHandler.java b/src/main/java/org/ahocorasick/trie/handler/DefaultEmitHandler.java index 656d1e2..22f998b 100644 --- a/src/main/java/org/ahocorasick/trie/handler/DefaultEmitHandler.java +++ b/src/main/java/org/ahocorasick/trie/handler/DefaultEmitHandler.java @@ -5,7 +5,7 @@ import org.ahocorasick.trie.Emit; import java.util.ArrayList; import java.util.List; -public class DefaultEmitHandler implements EmitHandler { +public class DefaultEmitHandler extends SimpleEmitHandler { private List emits = new ArrayList<>(); diff --git a/src/main/java/org/ahocorasick/trie/handler/EmitHandler.java b/src/main/java/org/ahocorasick/trie/handler/EmitHandler.java index 74fd71e..40791d9 100644 --- a/src/main/java/org/ahocorasick/trie/handler/EmitHandler.java +++ b/src/main/java/org/ahocorasick/trie/handler/EmitHandler.java @@ -3,5 +3,16 @@ package org.ahocorasick.trie.handler; import org.ahocorasick.trie.Emit; public interface EmitHandler { + + /** + * Callback handler that deals with an emit it gets from the parser + * @param emit the current emit that must be dealt with + */ void emit(Emit emit); + + /** + * Force the parse process to stop + * @return true if the process must stop + */ + boolean stop(); } diff --git a/src/main/java/org/ahocorasick/trie/handler/FirstMatchHandler.java b/src/main/java/org/ahocorasick/trie/handler/FirstMatchHandler.java new file mode 100644 index 0000000..2852c69 --- /dev/null +++ b/src/main/java/org/ahocorasick/trie/handler/FirstMatchHandler.java @@ -0,0 +1,28 @@ +package org.ahocorasick.trie.handler; + +import org.ahocorasick.trie.Emit; + +public class FirstMatchHandler extends SimpleEmitHandler { + + private Emit firstMatch; + + private boolean stop = false; + + @Override + public void emit(Emit emit) { + if (!stop) { + firstMatch = emit; + stop = true; + } + } + + public Emit getFirstMatch() { + return firstMatch; + } + + @Override + public boolean stop() { + return this.stop; + } + +} diff --git a/src/main/java/org/ahocorasick/trie/handler/SimpleEmitHandler.java b/src/main/java/org/ahocorasick/trie/handler/SimpleEmitHandler.java new file mode 100644 index 0000000..82ba4c5 --- /dev/null +++ b/src/main/java/org/ahocorasick/trie/handler/SimpleEmitHandler.java @@ -0,0 +1,14 @@ +package org.ahocorasick.trie.handler; + +import org.ahocorasick.trie.Emit; + +public abstract class SimpleEmitHandler implements EmitHandler { + + @Override + public abstract void emit(Emit emit); + + @Override + public boolean stop() { + return false; + } +} diff --git a/src/test/java/org/ahocorasick/trie/TrieTest.java b/src/test/java/org/ahocorasick/trie/TrieTest.java index 4f931c8..0be9708 100644 --- a/src/test/java/org/ahocorasick/trie/TrieTest.java +++ b/src/test/java/org/ahocorasick/trie/TrieTest.java @@ -1,6 +1,7 @@ package org.ahocorasick.trie; import org.ahocorasick.trie.handler.EmitHandler; +import org.ahocorasick.trie.handler.SimpleEmitHandler; import org.junit.Test; import java.util.ArrayList; @@ -129,7 +130,7 @@ public class TrieTest { .build(); final List emits = new ArrayList<>(); - EmitHandler emitHandler = new EmitHandler() { + EmitHandler emitHandler = new SimpleEmitHandler() { @Override public void emit(Emit emit) { @@ -209,6 +210,51 @@ public class TrieTest { checkEmit(iterator.next(), 2, 9, "hehehehe"); } + @Test + public void nonOverlappingWholeWords() { + Trie trie = Trie.builder() + .removeOverlaps() + .onlyWholeWords() + .addKeyword("peper molen") + .addKeyword("molen wiel") + .addKeyword("wiel dop") + .addKeyword("dop") + .build(); + Collection emits = trie.parseText("peper molen wiel dop xwiel dop wiel dopx wiel dop"); + assertEquals(4, emits.size()); + Iterator iterator = emits.iterator(); + checkEmit(iterator.next(), 0, 10, "peper molen"); + checkEmit(iterator.next(), 12, 19, "wiel dop"); + checkEmit(iterator.next(), 27, 29, "dop"); + checkEmit(iterator.next(), 41, 48, "wiel dop"); + } + + @Test + public void nonOverlappingWholeWordsWithCustomEmitHandler() { + Trie trie = Trie.builder() + .removeOverlaps() + .onlyWholeWords() + .addKeyword("peper molen") + .addKeyword("molen wiel") + .addKeyword("wiel dop") + .addKeyword("dop") + .build(); + final List emits = new ArrayList<>(); + EmitHandler emitHandler = new SimpleEmitHandler() { + @Override + public void emit(Emit emit) { + emits.add(emit); + } + }; + trie.parseText("peper molen wiel dop xwiel dop wiel dopx wiel dop", emitHandler); + assertEquals(4, emits.size()); + Iterator iterator = emits.iterator(); + checkEmit(iterator.next(), 0, 10, "peper molen"); + checkEmit(iterator.next(), 12, 19, "wiel dop"); + checkEmit(iterator.next(), 27, 29, "dop"); + checkEmit(iterator.next(), 41, 48, "wiel dop"); + } + @Test public void nonOverlapping() { Trie trie = Trie.builder().removeOverlaps() @@ -402,7 +448,7 @@ public class TrieTest { @Test public void partialMatchWhiteSpaces() { Trie trie = Trie.builder() - .onlyWholeWordsWhiteSpaceSeparated() + .onlyWholeWords() .addKeyword("#sugar-123") .build(); Collection < Emit > emits = trie.parseText("#sugar-123 #sugar-1234"); // left, middle, right test diff --git a/src/test/java/org/ahocorasick/trie/candidate/EmitCandidateFlushHandlerTest.java b/src/test/java/org/ahocorasick/trie/candidate/EmitCandidateFlushHandlerTest.java new file mode 100644 index 0000000..94a1ece --- /dev/null +++ b/src/test/java/org/ahocorasick/trie/candidate/EmitCandidateFlushHandlerTest.java @@ -0,0 +1,21 @@ +package org.ahocorasick.trie.candidate; + +import org.ahocorasick.trie.Emit; +import org.ahocorasick.trie.handler.EmitHandler; +import org.ahocorasick.trie.handler.FirstMatchHandler; +import org.junit.Test; + +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +public class EmitCandidateFlushHandlerTest { + + @Test + public void stop() { + EmitHandler emitHandler = new FirstMatchHandler(); + EmitCandidateFlushHandler flushHandler = new EmitCandidateFlushHandler(emitHandler, null); + assertFalse(flushHandler.stop()); + emitHandler.emit(new Emit(0, 2, "bla")); + assertTrue(flushHandler.stop()); + } +} diff --git a/src/test/java/org/ahocorasick/trie/candidate/NonOverlappingEmitCandidateHolderTest.java b/src/test/java/org/ahocorasick/trie/candidate/NonOverlappingEmitCandidateHolderTest.java new file mode 100644 index 0000000..176b2ed --- /dev/null +++ b/src/test/java/org/ahocorasick/trie/candidate/NonOverlappingEmitCandidateHolderTest.java @@ -0,0 +1,35 @@ +package org.ahocorasick.trie.candidate; + +import org.ahocorasick.trie.Emit; +import org.junit.Test; + +import java.util.Collection; +import java.util.List; + +import static org.junit.Assert.assertEquals; + +public class NonOverlappingEmitCandidateHolderTest { + + @Test + public void retainLongestEmit() { + EmitCandidateHolder holder = new NonOverlappingEmitCandidateHolder(); + holder.addCandidate(new Emit(0, 2, "she")); + holder.addCandidate(new Emit(1, 2, "he")); + List emits = holder.flush(); + assertEquals(1, emits.size()); + assertEquals("she", emits.get(0).getKeyword()); + } + + @Test + public void multipleOverlaps() { + EmitCandidateHolder holder = new NonOverlappingEmitCandidateHolder(); + holder.addCandidate(new Emit(0, 4, "ababc")); + holder.addCandidate(new Emit(4, 6, "cba")); + holder.addCandidate(new Emit(6, 7, "ab")); + List emits = holder.flush(); + assertEquals(2, emits.size()); + assertEquals("ababc", emits.get(0).getKeyword()); + assertEquals("ab", emits.get(1).getKeyword()); + } + +} diff --git a/src/test/java/org/ahocorasick/trie/candidate/OverlappingEmitCandidateHolderTest.java b/src/test/java/org/ahocorasick/trie/candidate/OverlappingEmitCandidateHolderTest.java new file mode 100644 index 0000000..ad14c8d --- /dev/null +++ b/src/test/java/org/ahocorasick/trie/candidate/OverlappingEmitCandidateHolderTest.java @@ -0,0 +1,22 @@ +package org.ahocorasick.trie.candidate; + +import org.ahocorasick.trie.Emit; +import org.junit.Test; + +import java.util.List; + +import static org.junit.Assert.assertEquals; + +public class OverlappingEmitCandidateHolderTest { + + @Test + public void addAndFlush() { + EmitCandidateHolder holder = new OverlappingEmitCandidateHolder(); + holder.addCandidate(new Emit(0, 2, "ABC")); + holder.addCandidate(new Emit(2, 4, "CDE")); + List emits = holder.flush(); + assertEquals(2, emits.size()); + assertEquals("ABC", emits.get(0).getKeyword()); + assertEquals("CDE", emits.get(1).getKeyword()); + } +}