diff --git a/src/main/java/org/ahocorasick/trie/PayloadTrie.java b/src/main/java/org/ahocorasick/trie/PayloadTrie.java index d1de158..a5ddde0 100644 --- a/src/main/java/org/ahocorasick/trie/PayloadTrie.java +++ b/src/main/java/org/ahocorasick/trie/PayloadTrie.java @@ -2,7 +2,7 @@ package org.ahocorasick.trie; import static java.lang.Character.isWhitespace; -import java.util.ArrayList; +import java.util.LinkedList; import java.util.Collection; import java.util.List; import java.util.Queue; @@ -13,8 +13,6 @@ import org.ahocorasick.interval.Intervalable; import org.ahocorasick.trie.handler.DefaultPayloadEmitHandler; import org.ahocorasick.trie.handler.PayloadEmitHandler; import org.ahocorasick.trie.handler.StatefulPayloadEmitHandler; -import org.ahocorasick.util.ListElementRemoval; -import org.ahocorasick.util.ListElementRemoval.RemoveElementPredicate; /** * A trie implementation that carries a payload. See {@link Trie} for @@ -24,7 +22,7 @@ import org.ahocorasick.util.ListElementRemoval.RemoveElementPredicate; * The payload trie adds the possibility to specify emitted payloads for each * added keyword. *

- * + * * @author Daniel Beck * @param The type of the supplied of the payload. */ @@ -79,12 +77,12 @@ public class PayloadTrie { /** * Tokenizes the specified text and returns the emitted outputs. - * + * * @param text The text to tokenize. * @return the emitted outputs */ public Collection> tokenize(final String text) { - final Collection> tokens = new ArrayList<>(); + final Collection> tokens = new LinkedList<>(); final Collection> collectedEmits = parseText(text); int lastCollectedPosition = -1; @@ -118,7 +116,7 @@ public class PayloadTrie { /** * Tokenizes a specified text and returns the emitted outputs. - * + * * @param text The character sequence to tokenize. * @return A collection of emits. */ @@ -129,7 +127,7 @@ public class PayloadTrie { /** * Tokenizes the specified text by using a custom EmitHandler and returns the * emitted outputs. - * + * * @param text The character sequence to tokenize. * @param emitHandler The emit handler that will be used to parse the text. * @return A collection of emits. @@ -140,14 +138,6 @@ public class PayloadTrie { final List> collectedEmits = emitHandler.getEmits(); - if (trieConfig.isOnlyWholeWords()) { - removePartialMatches(text, collectedEmits); - } - - if (trieConfig.isOnlyWholeWordsWhiteSpaceSeparated()) { - removePartialMatchesWhiteSpaceSeparated(text, collectedEmits); - } - if (!trieConfig.isAllowOverlaps()) { IntervalTree intervalTree = new IntervalTree((List) (List) collectedEmits); intervalTree.removeOverlaps((List) (List) collectedEmits); @@ -159,7 +149,7 @@ public class PayloadTrie { /** * Returns true if the text contains contains one of the search terms. Else, * returns false. - * + * * @param text Specified text. * @return true if the text contains one of the search terms. Else, returns * false. @@ -171,7 +161,7 @@ public class PayloadTrie { /** * Tokenizes the specified text by using a custom EmitHandler and returns the * emitted outputs. - * + * * @param text The character sequence to tokenize. * @param emitHandler The emit handler that will be used to parse the text. */ @@ -181,13 +171,13 @@ public class PayloadTrie { for (int position = 0; position < text.length(); position++) { char character = text.charAt( position); - // TODO: Maybe lowercase the entire string at once? if (trieConfig.isCaseInsensitive()) { character = Character.toLowerCase(character); } currentState = getState(currentState, character); - if (storeEmits(position, currentState, emitHandler) && trieConfig.isStopOnHit()) { + final Collection> payloads = currentState.emit(); + if (processEmits(text, position, payloads, emitHandler) && trieConfig.isStopOnHit()) { return; } } @@ -214,7 +204,6 @@ public class PayloadTrie { for (int position = 0; position < text.length(); position++) { char character = text.charAt( position); - // TODO: Lowercase the entire string at once? if (trieConfig.isCaseInsensitive()) { character = Character.toLowerCase(character); } @@ -246,29 +235,10 @@ public class PayloadTrie { || (emit.getEnd() + 1 != searchText.length() && Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1))); } - private void removePartialMatches(final CharSequence searchText, final List> collectedEmits) { - - final RemoveElementPredicate> predicate = emit -> isPartialMatch( searchText, emit); - - ListElementRemoval.removeIf(collectedEmits, predicate); - } - - private void removePartialMatchesWhiteSpaceSeparated(final CharSequence searchText, - final List> collectedEmits) { + private boolean isPartialMatchWhiteSpaceSeparated(final CharSequence searchText, final PayloadEmit emit) { final long size = searchText.length(); - final List> removeEmits = new ArrayList<>(); - - for (final PayloadEmit emit : collectedEmits) { - if ((emit.getStart() == 0 || isWhitespace(searchText.charAt(emit.getStart() - 1))) - && (emit.getEnd() + 1 == size || isWhitespace(searchText.charAt(emit.getEnd() + 1)))) { - continue; - } - removeEmits.add(emit); - } - - for (final PayloadEmit removeEmit : removeEmits) { - collectedEmits.remove(removeEmit); - } + return (emit.getStart() != 0 && !isWhitespace(searchText.charAt(emit.getStart() - 1))) + || (emit.getEnd() + 1 != size && !isWhitespace(searchText.charAt(emit.getEnd() + 1))); } private PayloadState getState(PayloadState currentState, final Character character) { @@ -312,16 +282,14 @@ public class PayloadTrie { } } - private boolean storeEmits(final int position, final PayloadState currentState, final PayloadEmitHandler emitHandler) { + private boolean processEmits(final CharSequence text, final int position, final Collection> payloads, final PayloadEmitHandler emitHandler) { boolean emitted = false; - final Collection> payloads = currentState.emit(); - - // TODO: The check for empty might be superfluous. - if (payloads != null && !payloads.isEmpty()) { - for (final Payload payload : payloads) { - emitted = emitHandler.emit(new PayloadEmit<>(position - payload.getKeyword().length() + 1, position, - payload.getKeyword(), payload.getData())) || emitted; - + for (final Payload payload : payloads) { + final PayloadEmit payloadEmit = new PayloadEmit<>(position - payload.getKeyword().length() + 1, + position, payload.getKeyword(), payload.getData()); + if (!(trieConfig.isOnlyWholeWords() && isPartialMatch(text, payloadEmit)) && + !(trieConfig.isOnlyWholeWordsWhiteSpaceSeparated() && isPartialMatchWhiteSpaceSeparated(text, payloadEmit))) { + emitted = emitHandler.emit(payloadEmit) || emitted; if (emitted && trieConfig.isStopOnHit()) { break; } @@ -351,7 +319,7 @@ public class PayloadTrie { /** * Builder class to create a PayloadTrie instance. - * + * * @param The type of the emitted payload. */ public static class PayloadTrieBuilder { @@ -475,6 +443,7 @@ public class PayloadTrie { * @return This builder. * @deprecated Use ignoreCase() */ + @Deprecated public PayloadTrieBuilder caseInsensitive() { return ignoreCase(); } @@ -483,6 +452,7 @@ public class PayloadTrie { * @return This builder. * @deprecated Use ignoreOverlaps() */ + @Deprecated public PayloadTrieBuilder removeOverlaps() { return ignoreOverlaps(); } diff --git a/src/main/java/org/ahocorasick/util/ListElementRemoval.java b/src/main/java/org/ahocorasick/util/ListElementRemoval.java deleted file mode 100644 index e229633..0000000 --- a/src/main/java/org/ahocorasick/util/ListElementRemoval.java +++ /dev/null @@ -1,51 +0,0 @@ -package org.ahocorasick.util; - -import java.util.ArrayList; -import java.util.List; - -/** - * Helps removes elements from a list in a efficient way. - * - *

Removing elements from an ArrayList in a naive way can lead to O(n^3) - * running time. If the algorithm first creates a list of all the elements - * to remove, then we for each element in this list (assume n elements) we look - * for the element in the original list (against n elements) and when found we need - * to remove the element and move the elements to the right (of the removed element) - * to the left by one, the size of this operation is at worst n hence O(n^3).

- * - *

This instead makes a new list and copies over only elements we want to keep, - * we then clear the original list and then add all of the elements to the original - * list. This gives us (for ArrayList) a running time of O(n).

- * - *

The performance of this has not been thoroughly tested for linked list.

- * - *

This can be completely removed in java 8 as the List#removeIf() method can be used instead - * as this already is optimised for each list implementation. - * - */ -public class ListElementRemoval { - - public interface RemoveElementPredicate { - boolean remove( T t ); - } - - /** - * Removes all elements from the list matching the given predicate. - * - * @param list the list from which to remove - * @param predicate to test for removal - * @param type of list - */ - public static void removeIf(final List list, final RemoveElementPredicate predicate) { - final List newList = new ArrayList<>(list.size()); - - for(final T element : list) { - if (!predicate.remove(element)) { - newList.add(element); - } - } - - list.clear(); - list.addAll(newList); - } -} diff --git a/src/test/java/org/ahocorasick/interval/IntervalTest.java b/src/test/java/org/ahocorasick/interval/IntervalTest.java index 4a3598e..328b902 100644 --- a/src/test/java/org/ahocorasick/interval/IntervalTest.java +++ b/src/test/java/org/ahocorasick/interval/IntervalTest.java @@ -6,52 +6,62 @@ import java.util.Iterator; import java.util.Set; import java.util.TreeSet; -import static junit.framework.Assert.*; +import static org.junit.Assert.*; public class IntervalTest { @Test - public void construct() { - Interval i = new Interval(1, 3); + public void test_construct() { + final Interval i = new Interval(1, 3); assertEquals(1, i.getStart()); assertEquals(3, i.getEnd()); } @Test - public void size() { + public void test_size() { assertEquals(3, new Interval(0, 2).size()); } @Test - public void intervaloverlaps() { + public void test_intervaloverlaps() { assertTrue(new Interval(1, 3).overlapsWith(new Interval(2, 4))); } @Test - public void intervalDoesNotOverlap() { + public void test_intervalDoesNotOverlap() { assertFalse(new Interval(1, 13).overlapsWith(new Interval(27, 42))); } @Test - public void pointOverlaps() { + public void test_pointOverlaps() { assertTrue(new Interval(1, 3).overlapsWith(2)); } @Test - public void pointDoesNotOverlap() { + public void test_pointDoesNotOverlap() { assertFalse(new Interval(1, 13).overlapsWith(42)); } @Test - public void comparable() { - Set intervals = new TreeSet<>(); + public void test_comparable() { + final Set intervals = new TreeSet<>(); intervals.add(new Interval(4, 6)); intervals.add(new Interval(2, 7)); intervals.add(new Interval(3, 4)); - Iterator it = intervals.iterator(); + final Iterator it = intervals.iterator(); assertEquals(2, it.next().getStart()); assertEquals(3, it.next().getStart()); assertEquals(4, it.next().getStart()); } + @Test + public void test_checkToString() { + assertEquals("4:6", new Interval(4, 6).toString()); + } + + @Test + public void test_compareToNegativeTest() { + assertEquals(-1, new Interval(4, 6).compareTo(new Object())); + } + } diff --git a/src/test/java/org/ahocorasick/trie/PayloadTrieTest.java b/src/test/java/org/ahocorasick/trie/PayloadTrieTest.java index 556fa8e..a69e462 100644 --- a/src/test/java/org/ahocorasick/trie/PayloadTrieTest.java +++ b/src/test/java/org/ahocorasick/trie/PayloadTrieTest.java @@ -5,15 +5,14 @@ import org.ahocorasick.trie.handler.PayloadEmitHandler; import org.ahocorasick.trie.handler.StatefulPayloadEmitHandler; import org.junit.Test; -import java.util.ArrayList; +import java.util.LinkedList; import java.util.Collection; import java.util.Iterator; import java.util.List; import java.util.concurrent.ThreadLocalRandom; import static java.util.Arrays.asList; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.Assert.*; public class PayloadTrieTest { @@ -216,7 +215,7 @@ public class PayloadTrieTest { public void ushersTestByCallback() { PayloadTrie trie = PayloadTrie.builder().addKeywords(PRONOUNS_WITH_PAYLOADS).build(); - final List> emits = new ArrayList<>(); + final List> emits = new LinkedList<>(); PayloadEmitHandler emitHandler = emit -> { emits.add(emit); return true; @@ -448,6 +447,42 @@ public class PayloadTrieTest { assertEquals(textSize / interval, emits.size()); } + @Test + public void test_containsMatchWithCaseInsensitive() { + PayloadTrie trie = PayloadTrie.builder().caseInsensitive().addKeyword("foo", "bar").build(); + + assertTrue(trie.containsMatch("FOOBAR")); + assertFalse(trie.containsMatch("FO!?AR")); + } + + // @see https://github.com/robert-bor/aho-corasick/issues/85 + @Test + public void test_wholeWords() { + PayloadTrie trie = PayloadTrie.builder().addKeyword("foo", "bar").onlyWholeWords().build(); + // access via PayloadTrie.parseText(CharSequence) + Collection> result1 = trie.parseText("foobar"); + // access via PayloadTrie.parseText(CharSequence, PayloadEmitHandler) + Collection> result2 = new LinkedList<>(); + trie.parseText("foobar", result2::add); + + assertTrue(result1.isEmpty()); + assertEquals(result1, result2); + } + + // @see https://github.com/robert-bor/aho-corasick/issues/85 + @Test + public void test_wholeWordsWhiteSpaceSeparated() { + PayloadTrie trie = PayloadTrie.builder().addKeyword("foo", "bar").onlyWholeWordsWhiteSpaceSeparated().build(); + // access via PayloadTrie.parseText(CharSequence) + Collection> result1 = trie.parseText("foo#bar"); + // access via PayloadTrie.parseText(CharSequence, PayloadEmitHandler) + Collection> result2 = new LinkedList<>(); + trie.parseText("foo#bar", result2::add); + + assertTrue(result1.isEmpty()); + assertEquals(result1, result2); + } + /** * Generates a random sequence of ASCII numbers. * diff --git a/src/test/java/org/ahocorasick/trie/StateTest.java b/src/test/java/org/ahocorasick/trie/StateTest.java index 0834d52..02840d9 100644 --- a/src/test/java/org/ahocorasick/trie/StateTest.java +++ b/src/test/java/org/ahocorasick/trie/StateTest.java @@ -2,12 +2,15 @@ package org.ahocorasick.trie; import org.junit.Test; -import static junit.framework.Assert.assertEquals; +import java.util.Collection; +import java.util.Collections; + +import static org.junit.Assert.*; public class StateTest { @Test - public void constructSequenceOfCharacters() { + public void test_constructSequenceOfCharacters() { final State rootState = new State(); rootState .addState('a') @@ -19,5 +22,50 @@ public class StateTest { assertEquals(2, currentState.getDepth()); currentState = currentState.nextState('c'); assertEquals(3, currentState.getDepth()); + currentState = currentState.nextState('F'); + assertNull(currentState); } + + @Test + public void test_getStates() { + final State rootState = new State(); + rootState.addState("foo"); + final State currentState = rootState.nextState('f'); + final Collection states = rootState.getStates(); + + assertEquals(1, states.size()); + assertEquals(currentState, states.iterator().next()); + } + + @Test + public void test_getTransitions() { + final State rootState = new State(); + rootState.addState("foo"); + final State currentState = rootState.nextState('f'); + final Collection transitions = rootState.getTransitions(); + + assertEquals(1, transitions.size()); + assertEquals(Character.valueOf('f'), transitions.iterator().next()); + } + + @Test + public void test_failure() { + final State failureState = new State(); + final State rootState = new State(); + rootState.setFailure(failureState); + + assertEquals(failureState, rootState.failure()); + } + + @Test + public void test_checkEmits() { + final State rootState = new State(); + rootState.addState('a') + .addEmit(Collections.singleton("tag")); + final Collection actual = rootState.nextState('a').emit(); + + assertEquals(1, actual.size()); + assertEquals("tag", actual.iterator().next()); + } + } diff --git a/src/test/java/org/ahocorasick/util/ListElementRemovalTest.java b/src/test/java/org/ahocorasick/util/ListElementRemovalTest.java deleted file mode 100644 index 6fc508a..0000000 --- a/src/test/java/org/ahocorasick/util/ListElementRemovalTest.java +++ /dev/null @@ -1,52 +0,0 @@ -package org.ahocorasick.util; - -import org.ahocorasick.util.ListElementRemoval.RemoveElementPredicate; -import org.junit.Test; - -import java.util.ArrayList; -import java.util.List; - -import static java.util.Arrays.asList; -import static org.junit.Assert.assertEquals; - -/** - * Responsible for testing that elements can be removed efficiently. - */ -public class ListElementRemovalTest { - - @Test - public void test_RemoveNone() { - final List list = createList(); - RemoveElementPredicate matchNothing = t -> false; - - ListElementRemoval.removeIf( list, matchNothing ); - - assertEquals( 3, list.size() ); - } - - @Test - public void test_RemoveAll() { - final List list = createList(); - RemoveElementPredicate matchNothing = t -> true; - - ListElementRemoval.removeIf( list, matchNothing ); - - assertEquals( 0, list.size() ); - } - - @Test - public void test_RemoveSome() { - final List list = createList(); - RemoveElementPredicate matchNothing = - t -> "a".equals( t ) || "c".equals( t ); - - ListElementRemoval.removeIf( list, matchNothing ); - - assertEquals( 1, list.size() ); - assertEquals( "b", list.get( 0 ) ); - } - - private List createList() { - return new ArrayList<>( asList( "a", "b", "c" ) ); - } -}