PayloadTrie.parseText method inconsistencies (#86)
* make PayloadTrie.parseText(CharSequence) consistent with PayloadTrie.parseText(CharSequence, PayloadEmitHandler<T>) * added IntervalTest, StateTest to increase code coverage Co-authored-by: omarshibli <omar.shibli@personetics.com>
This commit is contained in:
parent
73ad827b1f
commit
66eef7b76f
@ -2,7 +2,7 @@ package org.ahocorasick.trie;
|
||||
|
||||
import static java.lang.Character.isWhitespace;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Queue;
|
||||
@ -13,8 +13,6 @@ import org.ahocorasick.interval.Intervalable;
|
||||
import org.ahocorasick.trie.handler.DefaultPayloadEmitHandler;
|
||||
import org.ahocorasick.trie.handler.PayloadEmitHandler;
|
||||
import org.ahocorasick.trie.handler.StatefulPayloadEmitHandler;
|
||||
import org.ahocorasick.util.ListElementRemoval;
|
||||
import org.ahocorasick.util.ListElementRemoval.RemoveElementPredicate;
|
||||
|
||||
/**
|
||||
* A trie implementation that carries a payload. See {@link Trie} for
|
||||
@ -24,7 +22,7 @@ import org.ahocorasick.util.ListElementRemoval.RemoveElementPredicate;
|
||||
* The payload trie adds the possibility to specify emitted payloads for each
|
||||
* added keyword.
|
||||
* </p>
|
||||
*
|
||||
*
|
||||
* @author Daniel Beck
|
||||
* @param <T> The type of the supplied payload.
|
||||
*/
|
||||
@ -79,12 +77,12 @@ public class PayloadTrie<T> {
|
||||
|
||||
/**
|
||||
* Tokenizes the specified text and returns the emitted outputs.
|
||||
*
|
||||
*
|
||||
* @param text The text to tokenize.
|
||||
* @return the emitted outputs
|
||||
*/
|
||||
public Collection<PayloadToken<T>> tokenize(final String text) {
|
||||
final Collection<PayloadToken<T>> tokens = new ArrayList<>();
|
||||
final Collection<PayloadToken<T>> tokens = new LinkedList<>();
|
||||
final Collection<PayloadEmit<T>> collectedEmits = parseText(text);
|
||||
int lastCollectedPosition = -1;
|
||||
|
||||
@ -118,7 +116,7 @@ public class PayloadTrie<T> {
|
||||
|
||||
/**
|
||||
* Tokenizes a specified text and returns the emitted outputs.
|
||||
*
|
||||
*
|
||||
* @param text The character sequence to tokenize.
|
||||
* @return A collection of emits.
|
||||
*/
|
||||
@ -129,7 +127,7 @@ public class PayloadTrie<T> {
|
||||
/**
|
||||
* Tokenizes the specified text by using a custom EmitHandler and returns the
|
||||
* emitted outputs.
|
||||
*
|
||||
*
|
||||
* @param text The character sequence to tokenize.
|
||||
* @param emitHandler The emit handler that will be used to parse the text.
|
||||
* @return A collection of emits.
|
||||
@ -140,14 +138,6 @@ public class PayloadTrie<T> {
|
||||
|
||||
final List<PayloadEmit<T>> collectedEmits = emitHandler.getEmits();
|
||||
|
||||
if (trieConfig.isOnlyWholeWords()) {
|
||||
removePartialMatches(text, collectedEmits);
|
||||
}
|
||||
|
||||
if (trieConfig.isOnlyWholeWordsWhiteSpaceSeparated()) {
|
||||
removePartialMatchesWhiteSpaceSeparated(text, collectedEmits);
|
||||
}
|
||||
|
||||
if (!trieConfig.isAllowOverlaps()) {
|
||||
IntervalTree intervalTree = new IntervalTree((List<Intervalable>) (List<?>) collectedEmits);
|
||||
intervalTree.removeOverlaps((List<Intervalable>) (List<?>) collectedEmits);
|
||||
@ -159,7 +149,7 @@ public class PayloadTrie<T> {
|
||||
/**
|
||||
* Returns true if the text contains one of the search terms. Else,
|
||||
* returns false.
|
||||
*
|
||||
*
|
||||
* @param text Specified text.
|
||||
* @return true if the text contains one of the search terms. Else, returns
|
||||
* false.
|
||||
@ -171,7 +161,7 @@ public class PayloadTrie<T> {
|
||||
/**
|
||||
* Tokenizes the specified text by using a custom EmitHandler and returns the
|
||||
* emitted outputs.
|
||||
*
|
||||
*
|
||||
* @param text The character sequence to tokenize.
|
||||
* @param emitHandler The emit handler that will be used to parse the text.
|
||||
*/
|
||||
@ -181,13 +171,13 @@ public class PayloadTrie<T> {
|
||||
for (int position = 0; position < text.length(); position++) {
|
||||
char character = text.charAt( position);
|
||||
|
||||
// TODO: Maybe lowercase the entire string at once?
|
||||
if (trieConfig.isCaseInsensitive()) {
|
||||
character = Character.toLowerCase(character);
|
||||
}
|
||||
|
||||
currentState = getState(currentState, character);
|
||||
if (storeEmits(position, currentState, emitHandler) && trieConfig.isStopOnHit()) {
|
||||
final Collection<Payload<T>> payloads = currentState.emit();
|
||||
if (processEmits(text, position, payloads, emitHandler) && trieConfig.isStopOnHit()) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
@ -214,7 +204,6 @@ public class PayloadTrie<T> {
|
||||
for (int position = 0; position < text.length(); position++) {
|
||||
char character = text.charAt( position);
|
||||
|
||||
// TODO: Lowercase the entire string at once?
|
||||
if (trieConfig.isCaseInsensitive()) {
|
||||
character = Character.toLowerCase(character);
|
||||
}
|
||||
@ -246,29 +235,10 @@ public class PayloadTrie<T> {
|
||||
|| (emit.getEnd() + 1 != searchText.length() && Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1)));
|
||||
}
|
||||
|
||||
private void removePartialMatches(final CharSequence searchText, final List<PayloadEmit<T>> collectedEmits) {
|
||||
|
||||
final RemoveElementPredicate<PayloadEmit<T>> predicate = emit -> isPartialMatch( searchText, emit);
|
||||
|
||||
ListElementRemoval.removeIf(collectedEmits, predicate);
|
||||
}
|
||||
|
||||
private void removePartialMatchesWhiteSpaceSeparated(final CharSequence searchText,
|
||||
final List<PayloadEmit<T>> collectedEmits) {
|
||||
private boolean isPartialMatchWhiteSpaceSeparated(final CharSequence searchText, final PayloadEmit<T> emit) {
|
||||
final long size = searchText.length();
|
||||
final List<PayloadEmit<T>> removeEmits = new ArrayList<>();
|
||||
|
||||
for (final PayloadEmit<T> emit : collectedEmits) {
|
||||
if ((emit.getStart() == 0 || isWhitespace(searchText.charAt(emit.getStart() - 1)))
|
||||
&& (emit.getEnd() + 1 == size || isWhitespace(searchText.charAt(emit.getEnd() + 1)))) {
|
||||
continue;
|
||||
}
|
||||
removeEmits.add(emit);
|
||||
}
|
||||
|
||||
for (final PayloadEmit<T> removeEmit : removeEmits) {
|
||||
collectedEmits.remove(removeEmit);
|
||||
}
|
||||
return (emit.getStart() != 0 && !isWhitespace(searchText.charAt(emit.getStart() - 1)))
|
||||
|| (emit.getEnd() + 1 != size && !isWhitespace(searchText.charAt(emit.getEnd() + 1)));
|
||||
}
|
||||
|
||||
private PayloadState<T> getState(PayloadState<T> currentState, final Character character) {
|
||||
@ -312,16 +282,14 @@ public class PayloadTrie<T> {
|
||||
}
|
||||
}
|
||||
|
||||
private boolean storeEmits(final int position, final PayloadState<T> currentState, final PayloadEmitHandler<T> emitHandler) {
|
||||
private boolean processEmits(final CharSequence text, final int position, final Collection<Payload<T>> payloads, final PayloadEmitHandler<T> emitHandler) {
|
||||
boolean emitted = false;
|
||||
final Collection<Payload<T>> payloads = currentState.emit();
|
||||
|
||||
// TODO: The check for empty might be superfluous.
|
||||
if (payloads != null && !payloads.isEmpty()) {
|
||||
for (final Payload<T> payload : payloads) {
|
||||
emitted = emitHandler.emit(new PayloadEmit<>(position - payload.getKeyword().length() + 1, position,
|
||||
payload.getKeyword(), payload.getData())) || emitted;
|
||||
|
||||
for (final Payload<T> payload : payloads) {
|
||||
final PayloadEmit<T> payloadEmit = new PayloadEmit<>(position - payload.getKeyword().length() + 1,
|
||||
position, payload.getKeyword(), payload.getData());
|
||||
if (!(trieConfig.isOnlyWholeWords() && isPartialMatch(text, payloadEmit)) &&
|
||||
!(trieConfig.isOnlyWholeWordsWhiteSpaceSeparated() && isPartialMatchWhiteSpaceSeparated(text, payloadEmit))) {
|
||||
emitted = emitHandler.emit(payloadEmit) || emitted;
|
||||
if (emitted && trieConfig.isStopOnHit()) {
|
||||
break;
|
||||
}
|
||||
@ -351,7 +319,7 @@ public class PayloadTrie<T> {
|
||||
|
||||
/**
|
||||
* Builder class to create a PayloadTrie instance.
|
||||
*
|
||||
*
|
||||
* @param <T> The type of the emitted payload.
|
||||
*/
|
||||
public static class PayloadTrieBuilder<T> {
|
||||
@ -475,6 +443,7 @@ public class PayloadTrie<T> {
|
||||
* @return This builder.
|
||||
* @deprecated Use ignoreCase()
|
||||
*/
|
||||
@Deprecated
|
||||
public PayloadTrieBuilder<T> caseInsensitive() {
|
||||
return ignoreCase();
|
||||
}
|
||||
@ -483,6 +452,7 @@ public class PayloadTrie<T> {
|
||||
* @return This builder.
|
||||
* @deprecated Use ignoreOverlaps()
|
||||
*/
|
||||
@Deprecated
|
||||
public PayloadTrieBuilder<T> removeOverlaps() {
|
||||
return ignoreOverlaps();
|
||||
}
|
||||
|
||||
@ -1,51 +0,0 @@
|
||||
package org.ahocorasick.util;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Helps remove elements from a list in an efficient way.
|
||||
*
|
||||
* <p>Removing elements from an ArrayList in a naive way can lead to O(n^2)
|
||||
* running time. If the algorithm first creates a list of all the elements
|
||||
* to remove, then for each element in this list (assume n elements) we look
|
||||
* for the element in the original list (against n elements) and, when found, we need
|
||||
* to remove the element and move the elements to the right (of the removed element)
|
||||
* to the left by one. The search and the shift each cost at worst n per removal, hence O(n^2).</p>
|
||||
*
|
||||
* <p>This instead makes a new list and copies over only elements we want to keep,
|
||||
* we then clear the original list and then add all of the elements to the original
|
||||
* list. This gives us (for ArrayList) a running time of O(n).</p>
|
||||
*
|
||||
* <p>The performance of this has not been thoroughly tested for linked list.</p>
|
||||
*
|
||||
* <p>This can be completely removed in Java 8, as the List#removeIf() method can be used instead;
|
||||
* it is already optimised for each list implementation.</p>
|
||||
*
|
||||
*/
|
||||
public class ListElementRemoval {
|
||||
|
||||
public interface RemoveElementPredicate<T> {
|
||||
boolean remove( T t );
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes all elements from the list matching the given predicate.
|
||||
*
|
||||
* @param list the list from which to remove
|
||||
* @param predicate to test for removal
|
||||
* @param <T> type of list
|
||||
*/
|
||||
public static <T> void removeIf(final List<T> list, final RemoveElementPredicate<T> predicate) {
|
||||
final List<T> newList = new ArrayList<>(list.size());
|
||||
|
||||
for(final T element : list) {
|
||||
if (!predicate.remove(element)) {
|
||||
newList.add(element);
|
||||
}
|
||||
}
|
||||
|
||||
list.clear();
|
||||
list.addAll(newList);
|
||||
}
|
||||
}
|
||||
@ -6,52 +6,62 @@ import java.util.Iterator;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import static junit.framework.Assert.*;
|
||||
import static org.junit.Assert.*;
|
||||
|
||||
public class IntervalTest {
|
||||
|
||||
@Test
|
||||
public void construct() {
|
||||
Interval i = new Interval(1, 3);
|
||||
public void test_construct() {
|
||||
final Interval i = new Interval(1, 3);
|
||||
assertEquals(1, i.getStart());
|
||||
assertEquals(3, i.getEnd());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void size() {
|
||||
public void test_size() {
|
||||
assertEquals(3, new Interval(0, 2).size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void intervaloverlaps() {
|
||||
public void test_intervaloverlaps() {
|
||||
assertTrue(new Interval(1, 3).overlapsWith(new Interval(2, 4)));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void intervalDoesNotOverlap() {
|
||||
public void test_intervalDoesNotOverlap() {
|
||||
assertFalse(new Interval(1, 13).overlapsWith(new Interval(27, 42)));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void pointOverlaps() {
|
||||
public void test_pointOverlaps() {
|
||||
assertTrue(new Interval(1, 3).overlapsWith(2));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void pointDoesNotOverlap() {
|
||||
public void test_pointDoesNotOverlap() {
|
||||
assertFalse(new Interval(1, 13).overlapsWith(42));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void comparable() {
|
||||
Set<Interval> intervals = new TreeSet<>();
|
||||
public void test_comparable() {
|
||||
final Set<Interval> intervals = new TreeSet<>();
|
||||
intervals.add(new Interval(4, 6));
|
||||
intervals.add(new Interval(2, 7));
|
||||
intervals.add(new Interval(3, 4));
|
||||
Iterator<Interval> it = intervals.iterator();
|
||||
final Iterator<Interval> it = intervals.iterator();
|
||||
assertEquals(2, it.next().getStart());
|
||||
assertEquals(3, it.next().getStart());
|
||||
assertEquals(4, it.next().getStart());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test_checkToString() {
|
||||
assertEquals("4:6", new Interval(4, 6).toString());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test_compareToNegativeTest() {
|
||||
assertEquals(-1, new Interval(4, 6).compareTo(new Object()));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -5,15 +5,14 @@ import org.ahocorasick.trie.handler.PayloadEmitHandler;
|
||||
import org.ahocorasick.trie.handler.StatefulPayloadEmitHandler;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedList;
|
||||
import java.util.Collection;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.ThreadLocalRandom;
|
||||
|
||||
import static java.util.Arrays.asList;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import static org.junit.Assert.*;
|
||||
|
||||
public class PayloadTrieTest {
|
||||
|
||||
@ -216,7 +215,7 @@ public class PayloadTrieTest {
|
||||
public void ushersTestByCallback() {
|
||||
PayloadTrie<Integer> trie = PayloadTrie.<Integer>builder().addKeywords(PRONOUNS_WITH_PAYLOADS).build();
|
||||
|
||||
final List<PayloadEmit<Integer>> emits = new ArrayList<>();
|
||||
final List<PayloadEmit<Integer>> emits = new LinkedList<>();
|
||||
PayloadEmitHandler<Integer> emitHandler = emit -> {
|
||||
emits.add(emit);
|
||||
return true;
|
||||
@ -448,6 +447,42 @@ public class PayloadTrieTest {
|
||||
assertEquals(textSize / interval, emits.size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test_containsMatchWithCaseInsensitive() {
|
||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().caseInsensitive().addKeyword("foo", "bar").build();
|
||||
|
||||
assertTrue(trie.containsMatch("FOOBAR"));
|
||||
assertFalse(trie.containsMatch("FO!?AR"));
|
||||
}
|
||||
|
||||
// @see https://github.com/robert-bor/aho-corasick/issues/85
|
||||
@Test
|
||||
public void test_wholeWords() {
|
||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword("foo", "bar").onlyWholeWords().build();
|
||||
// access via PayloadTrie.parseText(CharSequence)
|
||||
Collection<PayloadEmit<String>> result1 = trie.parseText("foobar");
|
||||
// access via PayloadTrie.parseText(CharSequence, PayloadEmitHandler<String>)
|
||||
Collection<PayloadEmit<String>> result2 = new LinkedList<>();
|
||||
trie.parseText("foobar", result2::add);
|
||||
|
||||
assertTrue(result1.isEmpty());
|
||||
assertEquals(result1, result2);
|
||||
}
|
||||
|
||||
// @see https://github.com/robert-bor/aho-corasick/issues/85
|
||||
@Test
|
||||
public void test_wholeWordsWhiteSpaceSeparated() {
|
||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword("foo", "bar").onlyWholeWordsWhiteSpaceSeparated().build();
|
||||
// access via PayloadTrie.parseText(CharSequence)
|
||||
Collection<PayloadEmit<String>> result1 = trie.parseText("foo#bar");
|
||||
// access via PayloadTrie.parseText(CharSequence, PayloadEmitHandler<String>)
|
||||
Collection<PayloadEmit<String>> result2 = new LinkedList<>();
|
||||
trie.parseText("foo#bar", result2::add);
|
||||
|
||||
assertTrue(result1.isEmpty());
|
||||
assertEquals(result1, result2);
|
||||
}
|
||||
|
||||
/**
|
||||
* Generates a random sequence of ASCII numbers.
|
||||
*
|
||||
|
||||
@ -2,12 +2,15 @@ package org.ahocorasick.trie;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
import static junit.framework.Assert.assertEquals;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
|
||||
import static org.junit.Assert.*;
|
||||
|
||||
public class StateTest {
|
||||
|
||||
@Test
|
||||
public void constructSequenceOfCharacters() {
|
||||
public void test_constructSequenceOfCharacters() {
|
||||
final State rootState = new State();
|
||||
rootState
|
||||
.addState('a')
|
||||
@ -19,5 +22,50 @@ public class StateTest {
|
||||
assertEquals(2, currentState.getDepth());
|
||||
currentState = currentState.nextState('c');
|
||||
assertEquals(3, currentState.getDepth());
|
||||
currentState = currentState.nextState('F');
|
||||
assertNull(currentState);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test_getStates() {
|
||||
final State rootState = new State();
|
||||
rootState.addState("foo");
|
||||
final State currentState = rootState.nextState('f');
|
||||
final Collection<State> states = rootState.getStates();
|
||||
|
||||
assertEquals(1, states.size());
|
||||
assertEquals(currentState, states.iterator().next());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test_getTransitions() {
|
||||
final State rootState = new State();
|
||||
rootState.addState("foo");
|
||||
final State currentState = rootState.nextState('f');
|
||||
final Collection<Character> transitions = rootState.getTransitions();
|
||||
|
||||
assertEquals(1, transitions.size());
|
||||
assertEquals(Character.valueOf('f'), transitions.iterator().next());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test_failure() {
|
||||
final State failureState = new State();
|
||||
final State rootState = new State();
|
||||
rootState.setFailure(failureState);
|
||||
|
||||
assertEquals(failureState, rootState.failure());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test_checkEmits() {
|
||||
final State rootState = new State();
|
||||
rootState.addState('a')
|
||||
.addEmit(Collections.singleton("tag"));
|
||||
final Collection<String> actual = rootState.nextState('a').emit();
|
||||
|
||||
assertEquals(1, actual.size());
|
||||
assertEquals("tag", actual.iterator().next());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,52 +0,0 @@
|
||||
package org.ahocorasick.util;
|
||||
|
||||
import org.ahocorasick.util.ListElementRemoval.RemoveElementPredicate;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import static java.util.Arrays.asList;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
/**
|
||||
* Responsible for testing that elements can be removed efficiently.
|
||||
*/
|
||||
public class ListElementRemovalTest {
|
||||
|
||||
@Test
|
||||
public void test_RemoveNone() {
|
||||
final List<String> list = createList();
|
||||
RemoveElementPredicate<String> matchNothing = t -> false;
|
||||
|
||||
ListElementRemoval.removeIf( list, matchNothing );
|
||||
|
||||
assertEquals( 3, list.size() );
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test_RemoveAll() {
|
||||
final List<String> list = createList();
|
||||
RemoveElementPredicate<String> matchNothing = t -> true;
|
||||
|
||||
ListElementRemoval.removeIf( list, matchNothing );
|
||||
|
||||
assertEquals( 0, list.size() );
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test_RemoveSome() {
|
||||
final List<String> list = createList();
|
||||
RemoveElementPredicate<String> matchNothing =
|
||||
t -> "a".equals( t ) || "c".equals( t );
|
||||
|
||||
ListElementRemoval.removeIf( list, matchNothing );
|
||||
|
||||
assertEquals( 1, list.size() );
|
||||
assertEquals( "b", list.get( 0 ) );
|
||||
}
|
||||
|
||||
private List<String> createList() {
|
||||
return new ArrayList<>( asList( "a", "b", "c" ) );
|
||||
}
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user