PayloadTrie.parseText method inconsistencies (#86)

* make PayloadTrie.parseText(CharSequence) consistent with PayloadTrie.parseText(CharSequence, PayloadEmitHandler<T>)

* added IntervalTest, StateTest to increase code coverage

Co-authored-by: omarshibli <omar.shibli@personetics.com>
This commit is contained in:
Omar Shibli 2020-11-10 19:01:49 +02:00 committed by GitHub
parent 73ad827b1f
commit 66eef7b76f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 133 additions and 173 deletions

View File

@ -2,7 +2,7 @@ package org.ahocorasick.trie;
import static java.lang.Character.isWhitespace;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.Collection;
import java.util.List;
import java.util.Queue;
@ -13,8 +13,6 @@ import org.ahocorasick.interval.Intervalable;
import org.ahocorasick.trie.handler.DefaultPayloadEmitHandler;
import org.ahocorasick.trie.handler.PayloadEmitHandler;
import org.ahocorasick.trie.handler.StatefulPayloadEmitHandler;
import org.ahocorasick.util.ListElementRemoval;
import org.ahocorasick.util.ListElementRemoval.RemoveElementPredicate;
/**
* A trie implementation that carries a payload. See {@link Trie} for
@ -24,7 +22,7 @@ import org.ahocorasick.util.ListElementRemoval.RemoveElementPredicate;
* The payload trie adds the possibility to specify emitted payloads for each
* added keyword.
* </p>
*
*
* @author Daniel Beck
* @param <T> The type of the supplied payload.
*/
@ -79,12 +77,12 @@ public class PayloadTrie<T> {
/**
* Tokenizes the specified text and returns the emitted outputs.
*
*
* @param text The text to tokenize.
* @return the emitted outputs
*/
public Collection<PayloadToken<T>> tokenize(final String text) {
final Collection<PayloadToken<T>> tokens = new ArrayList<>();
final Collection<PayloadToken<T>> tokens = new LinkedList<>();
final Collection<PayloadEmit<T>> collectedEmits = parseText(text);
int lastCollectedPosition = -1;
@ -118,7 +116,7 @@ public class PayloadTrie<T> {
/**
* Tokenizes a specified text and returns the emitted outputs.
*
*
* @param text The character sequence to tokenize.
* @return A collection of emits.
*/
@ -129,7 +127,7 @@ public class PayloadTrie<T> {
/**
* Tokenizes the specified text by using a custom EmitHandler and returns the
* emitted outputs.
*
*
* @param text The character sequence to tokenize.
* @param emitHandler The emit handler that will be used to parse the text.
* @return A collection of emits.
@ -140,14 +138,6 @@ public class PayloadTrie<T> {
final List<PayloadEmit<T>> collectedEmits = emitHandler.getEmits();
if (trieConfig.isOnlyWholeWords()) {
removePartialMatches(text, collectedEmits);
}
if (trieConfig.isOnlyWholeWordsWhiteSpaceSeparated()) {
removePartialMatchesWhiteSpaceSeparated(text, collectedEmits);
}
if (!trieConfig.isAllowOverlaps()) {
IntervalTree intervalTree = new IntervalTree((List<Intervalable>) (List<?>) collectedEmits);
intervalTree.removeOverlaps((List<Intervalable>) (List<?>) collectedEmits);
@ -159,7 +149,7 @@ public class PayloadTrie<T> {
/**
* Returns true if the text contains one of the search terms. Else,
* returns false.
*
*
* @param text Specified text.
* @return true if the text contains one of the search terms. Else, returns
* false.
@ -171,7 +161,7 @@ public class PayloadTrie<T> {
/**
* Tokenizes the specified text by using a custom EmitHandler and returns the
* emitted outputs.
*
*
* @param text The character sequence to tokenize.
* @param emitHandler The emit handler that will be used to parse the text.
*/
@ -181,13 +171,13 @@ public class PayloadTrie<T> {
for (int position = 0; position < text.length(); position++) {
char character = text.charAt( position);
// TODO: Maybe lowercase the entire string at once?
if (trieConfig.isCaseInsensitive()) {
character = Character.toLowerCase(character);
}
currentState = getState(currentState, character);
if (storeEmits(position, currentState, emitHandler) && trieConfig.isStopOnHit()) {
final Collection<Payload<T>> payloads = currentState.emit();
if (processEmits(text, position, payloads, emitHandler) && trieConfig.isStopOnHit()) {
return;
}
}
@ -214,7 +204,6 @@ public class PayloadTrie<T> {
for (int position = 0; position < text.length(); position++) {
char character = text.charAt( position);
// TODO: Lowercase the entire string at once?
if (trieConfig.isCaseInsensitive()) {
character = Character.toLowerCase(character);
}
@ -246,29 +235,10 @@ public class PayloadTrie<T> {
|| (emit.getEnd() + 1 != searchText.length() && Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1)));
}
private void removePartialMatches(final CharSequence searchText, final List<PayloadEmit<T>> collectedEmits) {
final RemoveElementPredicate<PayloadEmit<T>> predicate = emit -> isPartialMatch( searchText, emit);
ListElementRemoval.removeIf(collectedEmits, predicate);
}
private void removePartialMatchesWhiteSpaceSeparated(final CharSequence searchText,
final List<PayloadEmit<T>> collectedEmits) {
private boolean isPartialMatchWhiteSpaceSeparated(final CharSequence searchText, final PayloadEmit<T> emit) {
final long size = searchText.length();
final List<PayloadEmit<T>> removeEmits = new ArrayList<>();
for (final PayloadEmit<T> emit : collectedEmits) {
if ((emit.getStart() == 0 || isWhitespace(searchText.charAt(emit.getStart() - 1)))
&& (emit.getEnd() + 1 == size || isWhitespace(searchText.charAt(emit.getEnd() + 1)))) {
continue;
}
removeEmits.add(emit);
}
for (final PayloadEmit<T> removeEmit : removeEmits) {
collectedEmits.remove(removeEmit);
}
return (emit.getStart() != 0 && !isWhitespace(searchText.charAt(emit.getStart() - 1)))
|| (emit.getEnd() + 1 != size && !isWhitespace(searchText.charAt(emit.getEnd() + 1)));
}
private PayloadState<T> getState(PayloadState<T> currentState, final Character character) {
@ -312,16 +282,14 @@ public class PayloadTrie<T> {
}
}
private boolean storeEmits(final int position, final PayloadState<T> currentState, final PayloadEmitHandler<T> emitHandler) {
private boolean processEmits(final CharSequence text, final int position, final Collection<Payload<T>> payloads, final PayloadEmitHandler<T> emitHandler) {
boolean emitted = false;
final Collection<Payload<T>> payloads = currentState.emit();
// TODO: The check for empty might be superfluous.
if (payloads != null && !payloads.isEmpty()) {
for (final Payload<T> payload : payloads) {
emitted = emitHandler.emit(new PayloadEmit<>(position - payload.getKeyword().length() + 1, position,
payload.getKeyword(), payload.getData())) || emitted;
for (final Payload<T> payload : payloads) {
final PayloadEmit<T> payloadEmit = new PayloadEmit<>(position - payload.getKeyword().length() + 1,
position, payload.getKeyword(), payload.getData());
if (!(trieConfig.isOnlyWholeWords() && isPartialMatch(text, payloadEmit)) &&
!(trieConfig.isOnlyWholeWordsWhiteSpaceSeparated() && isPartialMatchWhiteSpaceSeparated(text, payloadEmit))) {
emitted = emitHandler.emit(payloadEmit) || emitted;
if (emitted && trieConfig.isStopOnHit()) {
break;
}
@ -351,7 +319,7 @@ public class PayloadTrie<T> {
/**
* Builder class to create a PayloadTrie instance.
*
*
* @param <T> The type of the emitted payload.
*/
public static class PayloadTrieBuilder<T> {
@ -475,6 +443,7 @@ public class PayloadTrie<T> {
* @return This builder.
* @deprecated Use ignoreCase()
*/
@Deprecated
public PayloadTrieBuilder<T> caseInsensitive() {
// Deprecated alias: forwards directly to ignoreCase() and returns its result.
return ignoreCase();
}
@ -483,6 +452,7 @@ public class PayloadTrie<T> {
* @return This builder.
* @deprecated Use ignoreOverlaps()
*/
@Deprecated
public PayloadTrieBuilder<T> removeOverlaps() {
// Deprecated alias: forwards directly to ignoreOverlaps() and returns its result.
return ignoreOverlaps();
}

View File

@ -1,51 +0,0 @@
package org.ahocorasick.util;
import java.util.ArrayList;
import java.util.List;
/**
 * Helps remove elements from a list in an efficient way.
 *
 * <p>Removing elements from an {@code ArrayList} in a naive way can be needlessly
 * slow. If the algorithm first collects all the elements to remove and then
 * removes them one by one, every removal has to search the original list for the
 * element and then shift the elements to its right one position to the left.
 * With n elements to remove from a list of n elements that adds up to quadratic
 * work.</p>
 *
 * <p>This helper delegates to
 * {@link java.util.Collection#removeIf(java.util.function.Predicate)}, which
 * every list implementation is free to optimise for itself ({@code ArrayList}
 * does so), so no temporary copy of the list is built here.</p>
 *
 * <p>New code can call {@code List#removeIf} directly; this class only adapts the
 * {@link RemoveElementPredicate} type for existing callers.</p>
 */
public class ListElementRemoval {

    /**
     * Decides whether an element has to be removed from a list.
     *
     * @param <T> type of the elements being tested
     */
    @FunctionalInterface
    public interface RemoveElementPredicate<T> {

        /**
         * Tests a single element.
         *
         * @param t the element to test
         * @return {@code true} if the element must be removed, {@code false} to keep it
         */
        boolean remove(T t);
    }

    /**
     * Removes all elements from the list matching the given predicate. The list
     * is modified in place and the relative order of the remaining elements is
     * kept.
     *
     * @param list the list from which to remove
     * @param predicate to test for removal
     * @param <T> type of list
     * @throws NullPointerException if {@code list} or {@code predicate} is {@code null}
     */
    public static <T> void removeIf(final List<T> list, final RemoveElementPredicate<T> predicate) {
        // Adapt the project-specific predicate to java.util.function.Predicate
        // and let the list pick its own removal strategy.
        list.removeIf(predicate::remove);
    }
}

View File

@ -6,52 +6,62 @@ import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;
import static junit.framework.Assert.*;
import static org.junit.Assert.*;
public class IntervalTest {
@Test
public void construct() {
Interval i = new Interval(1, 3);
public void test_construct() {
final Interval i = new Interval(1, 3);
assertEquals(1, i.getStart());
assertEquals(3, i.getEnd());
}
@Test
public void size() {
public void test_size() {
assertEquals(3, new Interval(0, 2).size());
}
@Test
public void intervaloverlaps() {
public void test_intervaloverlaps() {
assertTrue(new Interval(1, 3).overlapsWith(new Interval(2, 4)));
}
@Test
public void intervalDoesNotOverlap() {
public void test_intervalDoesNotOverlap() {
assertFalse(new Interval(1, 13).overlapsWith(new Interval(27, 42)));
}
@Test
public void pointOverlaps() {
public void test_pointOverlaps() {
assertTrue(new Interval(1, 3).overlapsWith(2));
}
@Test
public void pointDoesNotOverlap() {
public void test_pointDoesNotOverlap() {
assertFalse(new Interval(1, 13).overlapsWith(42));
}
@Test
public void comparable() {
Set<Interval> intervals = new TreeSet<>();
public void test_comparable() {
final Set<Interval> intervals = new TreeSet<>();
intervals.add(new Interval(4, 6));
intervals.add(new Interval(2, 7));
intervals.add(new Interval(3, 4));
Iterator<Interval> it = intervals.iterator();
final Iterator<Interval> it = intervals.iterator();
assertEquals(2, it.next().getStart());
assertEquals(3, it.next().getStart());
assertEquals(4, it.next().getStart());
}
@Test
public void test_checkToString() {
    // The string form of an interval is asserted to be "start:end".
    final Interval interval = new Interval(4, 6);
    final String rendered = interval.toString();
    assertEquals("4:6", rendered);
}
@Test
public void test_compareToNegativeTest() {
    // Comparing against something that is not an interval is asserted to yield -1.
    final Interval interval = new Interval(4, 6);
    final Object notAnInterval = new Object();
    assertEquals(-1, interval.compareTo(notAnInterval));
}
}

View File

@ -5,15 +5,14 @@ import org.ahocorasick.trie.handler.PayloadEmitHandler;
import org.ahocorasick.trie.handler.StatefulPayloadEmitHandler;
import org.junit.Test;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.ThreadLocalRandom;
import static java.util.Arrays.asList;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.*;
public class PayloadTrieTest {
@ -216,7 +215,7 @@ public class PayloadTrieTest {
public void ushersTestByCallback() {
PayloadTrie<Integer> trie = PayloadTrie.<Integer>builder().addKeywords(PRONOUNS_WITH_PAYLOADS).build();
final List<PayloadEmit<Integer>> emits = new ArrayList<>();
final List<PayloadEmit<Integer>> emits = new LinkedList<>();
PayloadEmitHandler<Integer> emitHandler = emit -> {
emits.add(emit);
return true;
@ -448,6 +447,42 @@ public class PayloadTrieTest {
assertEquals(textSize / interval, emits.size());
}
@Test
public void test_containsMatchWithCaseInsensitive() {
    // Built through the deprecated caseInsensitive() alias of the builder.
    final PayloadTrie<String> caseInsensitiveTrie = PayloadTrie.<String>builder()
            .caseInsensitive()
            .addKeyword("foo", "bar")
            .build();

    // Upper-case text still matches the lower-case keyword ...
    final boolean matchesUpperCase = caseInsensitiveTrie.containsMatch("FOOBAR");
    assertTrue(matchesUpperCase);

    // ... while text that does not contain the keyword at all does not.
    final boolean matchesUnrelated = caseInsensitiveTrie.containsMatch("FO!?AR");
    assertFalse(matchesUnrelated);
}
// @see https://github.com/robert-bor/aho-corasick/issues/85
@Test
public void test_wholeWords() {
    final PayloadTrie<String> wholeWordTrie = PayloadTrie.<String>builder()
            .addKeyword("foo", "bar")
            .onlyWholeWords()
            .build();

    // Emits returned by PayloadTrie.parseText(CharSequence).
    final Collection<PayloadEmit<String>> returnedEmits = wholeWordTrie.parseText("foobar");

    // Emits delivered to a handler via
    // PayloadTrie.parseText(CharSequence, PayloadEmitHandler<String>).
    final Collection<PayloadEmit<String>> handledEmits = new LinkedList<>();
    wholeWordTrie.parseText("foobar", handledEmits::add);

    // "foo" is only a prefix of "foobar", so both access paths must report nothing.
    assertTrue(returnedEmits.isEmpty());
    assertEquals(returnedEmits, handledEmits);
}
// @see https://github.com/robert-bor/aho-corasick/issues/85
@Test
public void test_wholeWordsWhiteSpaceSeparated() {
    final PayloadTrie<String> whiteSpaceTrie = PayloadTrie.<String>builder()
            .addKeyword("foo", "bar")
            .onlyWholeWordsWhiteSpaceSeparated()
            .build();

    // Emits returned by PayloadTrie.parseText(CharSequence).
    final Collection<PayloadEmit<String>> returnedEmits = whiteSpaceTrie.parseText("foo#bar");

    // Emits delivered to a handler via
    // PayloadTrie.parseText(CharSequence, PayloadEmitHandler<String>).
    final Collection<PayloadEmit<String>> handledEmits = new LinkedList<>();
    whiteSpaceTrie.parseText("foo#bar", handledEmits::add);

    // "foo" is followed by '#', not by whitespace, so both access paths must report nothing.
    assertTrue(returnedEmits.isEmpty());
    assertEquals(returnedEmits, handledEmits);
}
/**
* Generates a random sequence of ASCII numbers.
*

View File

@ -2,12 +2,15 @@ package org.ahocorasick.trie;
import org.junit.Test;
import static junit.framework.Assert.assertEquals;
import java.util.Collection;
import java.util.Collections;
import static org.junit.Assert.*;
public class StateTest {
@Test
public void constructSequenceOfCharacters() {
public void test_constructSequenceOfCharacters() {
final State rootState = new State();
rootState
.addState('a')
@ -19,5 +22,50 @@ public class StateTest {
assertEquals(2, currentState.getDepth());
currentState = currentState.nextState('c');
assertEquals(3, currentState.getDepth());
currentState = currentState.nextState('F');
assertNull(currentState);
}
@Test
public void test_getStates() {
    final State root = new State();
    root.addState("foo");

    // The state reached from the root via 'f' ...
    final State expectedChild = root.nextState('f');

    // ... must be the one and only state the root reports.
    final Collection<State> children = root.getStates();
    assertEquals(1, children.size());
    final State onlyChild = children.iterator().next();
    assertEquals(expectedChild, onlyChild);
}
@Test
public void test_getTransitions() {
    final State rootState = new State();
    rootState.addState("foo");

    // After adding "foo" the root must expose exactly one outgoing transition: 'f'.
    final Collection<Character> transitions = rootState.getTransitions();
    assertEquals(1, transitions.size());
    assertEquals(Character.valueOf('f'), transitions.iterator().next());
}
@Test
public void test_failure() {
    final State expectedFailure = new State();
    final State state = new State();

    // The failure state handed to setFailure must be returned by failure().
    state.setFailure(expectedFailure);
    final State actualFailure = state.failure();

    assertEquals(expectedFailure, actualFailure);
}
@Test
public void test_checkEmits() {
    final State root = new State();
    root.addState('a').addEmit(Collections.singleton("tag"));

    // The emit registered on the 'a' state must be the only one it reports.
    final State stateForA = root.nextState('a');
    final Collection<String> emitted = stateForA.emit();

    assertEquals(1, emitted.size());
    final String onlyEmit = emitted.iterator().next();
    assertEquals("tag", onlyEmit);
}
}

View File

@ -1,52 +0,0 @@
package org.ahocorasick.util;
import org.ahocorasick.util.ListElementRemoval.RemoveElementPredicate;
import org.junit.Test;
import java.util.ArrayList;
import java.util.List;
import static java.util.Arrays.asList;
import static org.junit.Assert.assertEquals;
/**
 * Checks that {@link ListElementRemoval#removeIf} removes exactly the elements
 * selected by the supplied predicate.
 */
public class ListElementRemovalTest {

    @Test
    public void test_RemoveNone() {
        final List<String> elements = createList();
        final RemoveElementPredicate<String> keepEverything = element -> false;

        ListElementRemoval.removeIf(elements, keepEverything);

        assertEquals(3, elements.size());
    }

    @Test
    public void test_RemoveAll() {
        final List<String> elements = createList();
        final RemoveElementPredicate<String> dropEverything = element -> true;

        ListElementRemoval.removeIf(elements, dropEverything);

        assertEquals(0, elements.size());
    }

    @Test
    public void test_RemoveSome() {
        final List<String> elements = createList();
        final RemoveElementPredicate<String> dropAOrC =
                element -> "a".equals(element) || "c".equals(element);

        ListElementRemoval.removeIf(elements, dropAOrC);

        // Only the middle element survives.
        assertEquals(1, elements.size());
        assertEquals("b", elements.get(0));
    }

    /**
     * Creates a fresh, mutable three-element list for each test.
     *
     * @return a new list holding "a", "b" and "c" in that order
     */
    private List<String> createList() {
        final List<String> letters = new ArrayList<>();
        letters.addAll(asList("a", "b", "c"));
        return letters;
    }
}