#49 Allow to specify Payload with Keyword (#68)

* #49: Allow to fix Payload with Keyword
This commit is contained in:
Daniel Beck 2019-08-20 05:16:46 +02:00 committed by Dave Jarvis
parent b7cc1136e5
commit 9f80565b53
25 changed files with 1575 additions and 295 deletions

View File

@ -1,6 +1,6 @@
language: java
install: mvn install -DskipTests=true -Dgpg.skip=true
jdk:
- oraclejdk8
- openjdk8
after_success:
- bash <(curl -s https://codecov.io/bash)
- bash <(curl -s https://codecov.io/bash)

View File

@ -182,6 +182,26 @@ matches as soon as you encounter them. Let's look at an example where we want to
System.out.println(html);
```
You can also emit custom outputs. This might for example be useful to implement a trivial named entity
recognizer. In this case use a PayloadTrie instead of a Trie:
```java
class Word {
private final String gender;
public Word(String gender) {
this.gender = gender;
}
}
PayloadTrie<Word> trie = PayloadTrie.<Word>builder()
.addKeyword("hers", new Word("f"))
.addKeyword("his", new Word("m"))
.addKeyword("she", new Word("f"))
.addKeyword("he", new Word("m"))
.build();
Collection<PayloadEmit<Word>> emits = trie.parseText("ushers");
```
Releases
--------
Information on the aho-corasick [releases](https://github.com/robert-bor/aho-corasick/releases).

View File

@ -0,0 +1,21 @@
package org.ahocorasick.trie;
public class DefaultToken extends Token {
private PayloadToken<String> payloadToken;
public DefaultToken(PayloadToken<String> payloadToken) {
super(payloadToken.getFragment());
this.payloadToken = payloadToken;
}
public boolean isMatch() {
return payloadToken.isMatch();
}
public Emit getEmit() {
PayloadEmit<String> emit = payloadToken.getEmit();
return new Emit(emit.getStart(), emit.getEnd(), emit.getKeyword());
}
}

View File

@ -4,10 +4,9 @@ import org.ahocorasick.interval.Interval;
import org.ahocorasick.interval.Intervalable;
public class Emit extends Interval implements Intervalable {
private final String keyword;
public Emit(final int start, final int end, final String keyword) {
public Emit(final int start, final int end, String keyword) {
super(start, end);
this.keyword = keyword;
}
@ -20,4 +19,5 @@ public class Emit extends Interval implements Intervalable {
public String toString() {
return super.toString() + "=" + this.keyword;
}
}

View File

@ -15,4 +15,5 @@ public class FragmentToken extends Token {
public Emit getEmit() {
return null;
}
}

View File

@ -7,6 +7,7 @@ public class MatchToken extends Token {
public MatchToken(final String fragment, final Emit emit) {
super(fragment);
this.emit = emit;
}
@Override

View File

@ -0,0 +1,33 @@
package org.ahocorasick.trie;
/**
* Payload holds the matched keyword and some payload-data.
*
* @author Daniel Beck
*
* @param <T> The type of the wrapped payload data.
*/
public class Payload<T> implements Comparable<Payload<T>> {
private final String keyword;
private final T data;
public Payload(final String keyword, final T data) {
super();
this.keyword = keyword;
this.data = data;
}
public String getKeyword() {
return keyword;
}
public T getData() {
return data;
}
@Override
public int compareTo(Payload<T> other) {
return keyword.compareTo(other.getKeyword());
}
}

View File

@ -0,0 +1,50 @@
package org.ahocorasick.trie;
import org.ahocorasick.interval.Interval;
import org.ahocorasick.interval.Intervalable;
/**
 * A match found in the searched text: the interval it covers, the keyword
 * that matched and the payload data registered for that keyword.
 *
 * @param <T> Type of the wrapped payload-data.
 * @author Daniel Beck
 */
public class PayloadEmit<T> extends Interval implements Intervalable {

    private final String keyword;

    private final T payload;

    /**
     * Creates a PayloadEmit.
     *
     * @param start   Start of the matched search term.
     * @param end     End of the matched search term.
     * @param keyword Keyword that matched.
     * @param payload Emitted payload data; may be null.
     */
    public PayloadEmit(final int start, final int end, String keyword, T payload) {
        super(start, end);
        this.payload = payload;
        this.keyword = keyword;
    }

    /**
     * Returns the keyword that matched.
     *
     * @return the matched keyword
     */
    public String getKeyword() {
        return this.keyword;
    }

    /**
     * Returns the payload associated to this emit.
     *
     * @return the associated payload
     */
    public T getPayload() {
        return this.payload;
    }

    /**
     * Renders the interval text, "=", the keyword and, only when a payload is
     * present, "->" followed by the payload.
     */
    @Override
    public String toString() {
        final String intervalAndKeyword = super.toString() + "=" + this.keyword;
        if (this.payload == null) {
            return intervalAndKeyword;
        }
        return intervalAndKeyword + "->" + this.payload;
    }
}

View File

@ -0,0 +1,31 @@
package org.ahocorasick.trie;
/**
 * PayloadFragmentToken holds a piece of text ("the fragment") that did not
 * match any search term.
 * <p>
 * Its <code>isMatch</code> method always returns false and its
 * <code>getEmit</code> method always returns null.
 *
 * @author Daniel Beck
 *
 * @param <T> The Type of the emitted payloads.
 */
public class PayloadFragmentToken<T> extends PayloadToken<T> {

    /**
     * Creates a token for unmatched text.
     *
     * @param fragment The unmatched text.
     */
    public PayloadFragmentToken(final String fragment) {
        super(fragment);
    }

    /**
     * Returns null, because a fragment carries no emit.
     */
    @Override
    public PayloadEmit<T> getEmit() {
        return null;
    }

    /**
     * Returns false, because a fragment is not a match.
     */
    @Override
    public boolean isMatch() {
        return false;
    }
}

View File

@ -0,0 +1,31 @@
package org.ahocorasick.trie;
/**
 * PayloadMatchToken holds a piece of text ("the fragment") together with the
 * emit produced for it.
 * <p>
 * It represents a matched search term, so its <code>isMatch</code> method
 * always returns true.
 *
 * @author Daniel Beck
 *
 * @param <T> The Type of the emitted payloads.
 */
public class PayloadMatchToken<T> extends PayloadToken<T> {

    private final PayloadEmit<T> emit;

    /**
     * Creates a token for matched text.
     *
     * @param fragment The matched text.
     * @param emit     The emit describing the match.
     */
    public PayloadMatchToken(final String fragment, final PayloadEmit<T> emit) {
        super(fragment);
        this.emit = emit;
    }

    /**
     * Returns the emit passed to the constructor.
     */
    @Override
    public PayloadEmit<T> getEmit() {
        return this.emit;
    }

    /**
     * Returns true, because this token always represents a match.
     */
    @Override
    public boolean isMatch() {
        return true;
    }
}

View File

@ -0,0 +1,156 @@
package org.ahocorasick.trie;
import java.util.*;
/**
 * A single state (node) of the payload-aware automaton.
 * <p>
 * A state has three tasks it must attend to:
 * </p>
 * <ul>
 * <li>success; when a character points to another state, it must return that
 * state</li>
 * <li>failure; when a character has no matching state, the algorithm must be
 * able to fall back on a state with less depth</li>
 * <li>emits; when this state is passed and keywords have been matched, the
 * matches and their payloads must be 'emitted' so that they can be used later
 * on.</li>
 * </ul>
 * <p>
 * The root state (depth 0) is special: when it has no transition for a
 * character, {@link #nextState(Character)} returns the root state itself
 * rather than null, so scanning can always continue from the root. States of
 * greater depth return null in that situation and rely on the failure state
 * assigned through {@link #setFailure(PayloadState)}.
 * </p>
 *
 * @param <T> The type of the payload data carried by the emitted payloads.
 * @author Daniel Beck
 */
public class PayloadState<T> {

    /**
     * effectively the length of the keyword prefix that leads to this state
     */
    private final int depth;

    /**
     * set to this state itself for the root state (depth 0) so it can refer to
     * itself when no transition matches; null for every other state
     */
    private final PayloadState<T> rootState;

    /**
     * referred to in the white paper as the 'goto' structure. From a state it is
     * possible to go to other states, depending on the character passed.
     */
    private final Map<Character, PayloadState<T>> success = new HashMap<>();

    /**
     * the state to fall back on when no transition matches
     */
    private PayloadState<T> failure;

    /**
     * the payloads emitted whenever this state is reached; created lazily on
     * the first call to addEmit
     */
    private Set<Payload<T>> emits;

    /**
     * Creates a root state (depth 0).
     */
    public PayloadState() {
        this(0);
    }

    /**
     * Creates a state at the given depth. A depth of 0 marks the root state.
     *
     * @param depth The depth of this state.
     */
    public PayloadState(final int depth) {
        this.depth = depth;
        if (depth == 0) {
            this.rootState = this;
        } else {
            this.rootState = null;
        }
    }

    /**
     * Looks up the transition for a character.
     *
     * @param character       The character to follow.
     * @param ignoreRootState When true, a missing transition always yields
     *                        null, even on the root state.
     * @return the target state, the root state as fallback (root only, and
     *         only when ignoreRootState is false), or null.
     */
    private PayloadState<T> nextState(final Character character, final boolean ignoreRootState) {
        final PayloadState<T> target = this.success.get(character);
        if (target != null || ignoreRootState) {
            return target;
        }
        // No transition: this is the state itself for the root, null otherwise.
        return this.rootState;
    }

    /**
     * Returns the state reached by the character; on the root state a missing
     * transition yields the root state itself.
     */
    public PayloadState<T> nextState(final Character character) {
        return nextState(character, false);
    }

    /**
     * Returns the state reached by the character, or null when this state has
     * no transition for it (no root fallback).
     */
    public PayloadState<T> nextStateIgnoreRootState(Character character) {
        return nextState(character, true);
    }

    /**
     * Follows the keyword character by character from this state, creating
     * missing states along the way.
     *
     * @param keyword The keyword to add.
     * @return the state reached after the last character.
     */
    public PayloadState<T> addState(String keyword) {
        PayloadState<T> cursor = this;
        final int length = keyword.length();
        for (int index = 0; index < length; index++) {
            cursor = cursor.addState(Character.valueOf(keyword.charAt(index)));
        }
        return cursor;
    }

    /**
     * Returns the existing transition target for the character or creates a new
     * state one level deeper and registers it.
     *
     * @param character The character to add a transition for.
     * @return the existing or newly created target state.
     */
    public PayloadState<T> addState(Character character) {
        final PayloadState<T> existing = nextStateIgnoreRootState(character);
        if (existing != null) {
            return existing;
        }
        final PayloadState<T> created = new PayloadState<T>(this.depth + 1);
        this.success.put(character, created);
        return created;
    }

    /**
     * @return the depth of this state.
     */
    public int getDepth() {
        return this.depth;
    }

    /**
     * Adds a payload to be emitted for this state.
     *
     * @param payload Payload to be emitted.
     */
    public void addEmit(Payload<T> payload) {
        if (this.emits == null) {
            // A TreeSet keeps the payloads ordered by Payload.compareTo.
            this.emits = new TreeSet<>();
        }
        this.emits.add(payload);
    }

    /**
     * Adds a collection of payloads to be emitted for this state.
     *
     * @param emits Collection of payloads to be emitted.
     */
    public void addEmit(Collection<Payload<T>> emits) {
        for (final Payload<T> payload : emits) {
            addEmit(payload);
        }
    }

    /**
     * Returns a collection of emitted payloads for this state.
     *
     * @return Collection of emitted payloads; an empty list when none were
     *         added.
     */
    public Collection<Payload<T>> emit() {
        if (this.emits == null) {
            return Collections.<Payload<T>>emptyList();
        }
        return this.emits;
    }

    /**
     * @return the failure state, or null when none has been set.
     */
    public PayloadState<T> failure() {
        return this.failure;
    }

    /**
     * @param failState The state to fall back on.
     */
    public void setFailure(PayloadState<T> failState) {
        this.failure = failState;
    }

    /**
     * @return the states directly reachable from this state.
     */
    public Collection<PayloadState<T>> getStates() {
        return this.success.values();
    }

    /**
     * @return the characters for which this state has a transition.
     */
    public Collection<Character> getTransitions() {
        return this.success.keySet();
    }
}

View File

@ -0,0 +1,28 @@
package org.ahocorasick.trie;
/**
 * PayloadToken holds a piece of text ("the fragment") and may carry an emit.
 * If <code>isMatch</code> returns true, the token matched a search term.
 *
 * @author Daniel Beck
 *
 * @param <T> The Type of the emitted payloads.
 */
public abstract class PayloadToken<T> {

    private final String fragment;

    /**
     * @param fragment The text this token stands for.
     */
    public PayloadToken(final String fragment) {
        this.fragment = fragment;
    }

    /**
     * @return the text this token stands for.
     */
    public String getFragment() {
        return this.fragment;
    }

    /**
     * Return true if a search term matched.
     */
    public abstract boolean isMatch();

    /**
     * @return the emit belonging to this token.
     */
    public abstract PayloadEmit<T> getEmit();
}

View File

@ -0,0 +1,495 @@
package org.ahocorasick.trie;
import static java.lang.Character.isWhitespace;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Queue;
import java.util.concurrent.LinkedBlockingDeque;
import org.ahocorasick.interval.IntervalTree;
import org.ahocorasick.interval.Intervalable;
import org.ahocorasick.trie.handler.DefaultPayloadEmitHandler;
import org.ahocorasick.trie.handler.PayloadEmitHandler;
import org.ahocorasick.trie.handler.StatefulPayloadEmitHandler;
import org.ahocorasick.util.ListElementRemoval;
import org.ahocorasick.util.ListElementRemoval.RemoveElementPredicate;
/**
 * A trie implementation, based on the Aho-Corasick white paper, Bell
 * technologies: http://cr.yp.to/bib/1975/aho.pdf
 * <p>
 * The payload trie adds the possibility to specify emitted payloads for each
 * added keyword.
 *
 * @author Daniel Beck
 * @param <T> The type of the supplied payload.
 */
public class PayloadTrie<T> {

    private final TrieConfig trieConfig;

    private final PayloadState<T> rootState;

    protected PayloadTrie(final TrieConfig trieConfig) {
        this.trieConfig = trieConfig;
        this.rootState = new PayloadState<>();
    }

    /**
     * Used by the builder to add a text search keyword with an emit payload.
     * Empty keywords are ignored.
     *
     * @param keyword The search term to add to the list of search terms.
     * @param emit    the payload to emit for this search term.
     * @throws NullPointerException if the keyword is null.
     */
    private void addKeyword(String keyword, T emit) {
        if (keyword.isEmpty()) {
            return;
        }

        if (isCaseInsensitive()) {
            keyword = lowerCaseByChar(keyword);
        }

        addState(keyword).addEmit(new Payload<T>(keyword, emit));
    }

    /**
     * Used by the builder to add a text search keyword without payload data
     * (the payload is null).
     *
     * @param keyword The search term to add to the list of search terms.
     * @throws NullPointerException if the keyword is null.
     */
    private void addKeyword(String keyword) {
        addKeyword(keyword, null);
    }

    /**
     * Lower-cases a keyword one char at a time with
     * {@link Character#toLowerCase(char)}, exactly as the scanned text is
     * lower-cased. The locale-sensitive {@code String.toLowerCase()} can
     * produce different characters (default locale, context-sensitive
     * mappings), which would keep such a keyword from ever matching.
     */
    private static String lowerCaseByChar(final String keyword) {
        final char[] chars = keyword.toCharArray();
        for (int i = 0; i < chars.length; i++) {
            chars[i] = Character.toLowerCase(chars[i]);
        }
        return new String(chars);
    }

    private PayloadState<T> addState(final String keyword) {
        return getRootState().addState(keyword);
    }

    /**
     * Tokenizes the specified text into matched tokens and the unmatched
     * fragments between them.
     *
     * @param text The text to tokenize.
     * @return the tokens in the order the emits are returned by parseText.
     */
    public Collection<PayloadToken<T>> tokenize(final String text) {
        final Collection<PayloadToken<T>> tokens = new ArrayList<>();
        final Collection<PayloadEmit<T>> collectedEmits = parseText(text);
        int lastCollectedPosition = -1;

        for (final PayloadEmit<T> emit : collectedEmits) {
            // Text between the previous match and this one becomes a fragment.
            if (emit.getStart() - lastCollectedPosition > 1) {
                tokens.add(createFragment(emit, text, lastCollectedPosition));
            }

            tokens.add(createMatch(emit, text));
            lastCollectedPosition = emit.getEnd();
        }

        // Trailing text after the last match.
        if (text.length() - lastCollectedPosition > 1) {
            tokens.add(createFragment(null, text, lastCollectedPosition));
        }

        return tokens;
    }

    /**
     * Creates a fragment token for the text after lastCollectedPosition up to
     * the start of the given emit, or up to the end of the text when the emit
     * is null.
     */
    private PayloadToken<T> createFragment(final PayloadEmit<T> emit, final String text, final int lastCollectedPosition) {
        final int fragmentEnd = emit == null ? text.length() : emit.getStart();
        return new PayloadFragmentToken<T>(text.substring(lastCollectedPosition + 1, fragmentEnd));
    }

    private PayloadToken<T> createMatch(PayloadEmit<T> emit, String text) {
        return new PayloadMatchToken<T>(text.substring(emit.getStart(), emit.getEnd() + 1), emit);
    }

    /**
     * Parses the specified text and returns the emitted outputs.
     *
     * @param text The character sequence to parse.
     * @return A collection of emits.
     */
    public Collection<PayloadEmit<T>> parseText(final CharSequence text) {
        return parseText(text, new DefaultPayloadEmitHandler<T>());
    }

    /**
     * Parses the specified text by using a custom stateful emit handler, then
     * filters the handler's collected emits according to the trie
     * configuration (whole words, whitespace-separated words, overlaps).
     *
     * @param text        The character sequence to parse.
     * @param emitHandler The emit handler that will be used to parse the text.
     * @return A collection of emits.
     */
    @SuppressWarnings("unchecked")
    public Collection<PayloadEmit<T>> parseText(final CharSequence text, final StatefulPayloadEmitHandler<T> emitHandler) {
        // The cast selects the handler-only overload that does the scanning.
        parseText(text, (PayloadEmitHandler<T>) emitHandler);

        final List<PayloadEmit<T>> collectedEmits = emitHandler.getEmits();

        if (trieConfig.isOnlyWholeWords()) {
            removePartialMatches(text, collectedEmits);
        }

        if (trieConfig.isOnlyWholeWordsWhiteSpaceSeparated()) {
            removePartialMatchesWhiteSpaceSeparated(text, collectedEmits);
        }

        if (!trieConfig.isAllowOverlaps()) {
            IntervalTree intervalTree = new IntervalTree((List<Intervalable>) (List<?>) collectedEmits);
            intervalTree.removeOverlaps((List<Intervalable>) (List<?>) collectedEmits);
        }

        return collectedEmits;
    }

    /**
     * Returns true if the text contains one of the search terms. Else,
     * returns false.
     *
     * @param text Specified text.
     * @return true if the text contains one of the search terms. Else, returns
     *         false.
     */
    public boolean containsMatch(final CharSequence text) {
        return firstMatch(text) != null;
    }

    /**
     * Scans the specified text and hands every emit to the given handler. When
     * the trie is configured to stop on hit, scanning ends as soon as the
     * handler has accepted an emit.
     *
     * @param text        The character sequence to scan.
     * @param emitHandler The emit handler that receives the emits.
     */
    public void parseText(final CharSequence text, final PayloadEmitHandler<T> emitHandler) {
        PayloadState<T> currentState = getRootState();

        for (int position = 0; position < text.length(); position++) {
            Character character = text.charAt(position);

            // TODO: Maybe lowercase the entire string at once?
            if (trieConfig.isCaseInsensitive()) {
                character = Character.toLowerCase(character);
            }

            currentState = getState(currentState, character);
            if (storeEmits(position, currentState, emitHandler) && trieConfig.isStopOnHit()) {
                return;
            }
        }
    }

    /**
     * The first matching text sequence.
     *
     * @param text The text to search for keywords.
     * @return null if no matches found.
     */
    public PayloadEmit<T> firstMatch(final CharSequence text) {
        if (!trieConfig.isAllowOverlaps()) {
            // Slow path. Needs to find all the matches to detect overlaps.
            final Collection<PayloadEmit<T>> parseText = parseText(text);

            if (parseText != null && !parseText.isEmpty()) {
                return parseText.iterator().next();
            }
            return null;
        }

        // Fast path. Returns first match found.
        PayloadState<T> currentState = getRootState();

        for (int position = 0; position < text.length(); position++) {
            Character character = text.charAt(position);

            // TODO: Lowercase the entire string at once?
            if (trieConfig.isCaseInsensitive()) {
                character = Character.toLowerCase(character);
            }

            currentState = getState(currentState, character);

            for (final Payload<T> payload : currentState.emit()) {
                final PayloadEmit<T> emit = new PayloadEmit<>(position - payload.getKeyword().length() + 1, position,
                        payload.getKeyword(), payload.getData());

                // Apply the same word-boundary filters parseText applies, so a
                // match rejected there is not reported here.
                if (passesWordFilters(text, emit)) {
                    return emit;
                }
            }
        }

        return null;
    }

    /**
     * Returns true when the emit survives the configured whole-word filters
     * (alphabetic boundaries and/or whitespace boundaries).
     */
    private boolean passesWordFilters(final CharSequence searchText, final PayloadEmit<T> emit) {
        if (trieConfig.isOnlyWholeWords() && isPartialMatch(searchText, emit)) {
            return false;
        }
        if (trieConfig.isOnlyWholeWordsWhiteSpaceSeparated() && isPartialMatchWhiteSpaceSeparated(searchText, emit)) {
            return false;
        }
        return true;
    }

    /**
     * Returns true when the emit is directly preceded or followed by an
     * alphabetic character, i.e. it is only part of a longer word.
     */
    private boolean isPartialMatch(final CharSequence searchText, final PayloadEmit<T> emit) {
        return (emit.getStart() != 0 && Character.isAlphabetic(searchText.charAt(emit.getStart() - 1)))
                || (emit.getEnd() + 1 != searchText.length() && Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1)));
    }

    /**
     * Returns true unless the emit is delimited on both sides by whitespace or
     * by the start/end of the text.
     */
    private boolean isPartialMatchWhiteSpaceSeparated(final CharSequence searchText, final PayloadEmit<T> emit) {
        final int start = emit.getStart();
        final int afterEnd = emit.getEnd() + 1;

        return !((start == 0 || isWhitespace(searchText.charAt(start - 1)))
                && (afterEnd == searchText.length() || isWhitespace(searchText.charAt(afterEnd))));
    }

    private void removePartialMatches(final CharSequence searchText, final List<PayloadEmit<T>> collectedEmits) {
        final RemoveElementPredicate<PayloadEmit<T>> predicate = new RemoveElementPredicate<PayloadEmit<T>>() {
            @Override
            public boolean remove(PayloadEmit<T> emit) {
                return isPartialMatch(searchText, emit);
            }
        };

        ListElementRemoval.removeIf(collectedEmits, predicate);
    }

    private void removePartialMatchesWhiteSpaceSeparated(final CharSequence searchText,
            final List<PayloadEmit<T>> collectedEmits) {
        // Same removal mechanism as removePartialMatches, instead of repeated
        // equality-based List.remove(Object) calls.
        final RemoveElementPredicate<PayloadEmit<T>> predicate = new RemoveElementPredicate<PayloadEmit<T>>() {
            @Override
            public boolean remove(PayloadEmit<T> emit) {
                return isPartialMatchWhiteSpaceSeparated(searchText, emit);
            }
        };

        ListElementRemoval.removeIf(collectedEmits, predicate);
    }

    /**
     * Follows the transition for the character, walking the failure links
     * until some state offers a next state.
     */
    private PayloadState<T> getState(PayloadState<T> currentState, final Character character) {
        PayloadState<T> newCurrentState = currentState.nextState(character);

        while (newCurrentState == null) {
            currentState = currentState.failure();
            newCurrentState = currentState.nextState(character);
        }

        return newCurrentState;
    }

    /**
     * Assigns a failure state to every state reachable from the root
     * (breadth-first) and copies the failure state's emits onto each state.
     */
    private void constructFailureStates() {
        final Queue<PayloadState<T>> queue = new LinkedBlockingDeque<>();
        final PayloadState<T> startState = getRootState();

        // First, set the fail state of all depth 1 states to the root state
        for (PayloadState<T> depthOneState : startState.getStates()) {
            depthOneState.setFailure(startState);
            queue.add(depthOneState);
        }

        // Second, determine the fail state for all depth > 1 state
        while (!queue.isEmpty()) {
            final PayloadState<T> currentState = queue.remove();

            for (final Character transition : currentState.getTransitions()) {
                PayloadState<T> targetState = currentState.nextState(transition);
                queue.add(targetState);

                PayloadState<T> traceFailureState = currentState.failure();
                while (traceFailureState.nextState(transition) == null) {
                    traceFailureState = traceFailureState.failure();
                }

                final PayloadState<T> newFailureState = traceFailureState.nextState(transition);
                targetState.setFailure(newFailureState);
                targetState.addEmit(newFailureState.emit());
            }
        }
    }

    /**
     * Passes the payloads of the current state to the handler as emits ending
     * at the given position.
     *
     * @return true if the handler accepted at least one emit.
     */
    private boolean storeEmits(final int position, final PayloadState<T> currentState, final PayloadEmitHandler<T> emitHandler) {
        boolean emitted = false;
        final Collection<Payload<T>> payloads = currentState.emit();

        // TODO: The check for empty might be superfluous.
        if (payloads != null && !payloads.isEmpty()) {
            for (final Payload<T> payload : payloads) {
                emitted = emitHandler.emit(new PayloadEmit<T>(position - payload.getKeyword().length() + 1, position,
                        payload.getKeyword(), payload.getData())) || emitted;

                if (emitted && trieConfig.isStopOnHit()) {
                    break;
                }
            }
        }

        return emitted;
    }

    private boolean isCaseInsensitive() {
        return trieConfig.isCaseInsensitive();
    }

    private PayloadState<T> getRootState() {
        return this.rootState;
    }

    /**
     * Provides a fluent interface for constructing Trie instances with payloads.
     *
     * @param <T> The type of the emitted payload.
     * @return The builder used to configure its Trie.
     */
    public static <T> PayloadTrieBuilder<T> builder() {
        return new PayloadTrieBuilder<T>();
    }

    /**
     * Builder class to create a PayloadTrie instance.
     *
     * @param <T> The type of the emitted payload.
     */
    public static class PayloadTrieBuilder<T> {

        private final TrieConfig trieConfig = new TrieConfig();

        private final PayloadTrie<T> trie = new PayloadTrie<>(trieConfig);

        /**
         * Default (empty) constructor.
         */
        private PayloadTrieBuilder() {
        }

        /**
         * Configure the Trie to ignore case when searching for keywords in the text.
         * This must be called before calling addKeyword because the algorithm converts
         * keywords to lowercase as they are added, depending on this case sensitivity
         * setting.
         *
         * @return This builder.
         */
        public PayloadTrieBuilder<T> ignoreCase() {
            this.trieConfig.setCaseInsensitive(true);
            return this;
        }

        /**
         * Configure the Trie to ignore overlapping keywords.
         *
         * @return This builder.
         */
        public PayloadTrieBuilder<T> ignoreOverlaps() {
            this.trieConfig.setAllowOverlaps(false);
            return this;
        }

        /**
         * Adds a keyword to the Trie's list of text search keywords. No Payload is
         * supplied.
         *
         * @param keyword The keyword to add to the list.
         * @return This builder.
         * @throws NullPointerException if the keyword is null.
         */
        public PayloadTrieBuilder<T> addKeyword(final String keyword) {
            this.trie.addKeyword(keyword);
            return this;
        }

        /**
         * Adds a keyword and a payload to the Trie's list of text search keywords.
         *
         * @param keyword The keyword to add to the list.
         * @param payload The payload to emit when the keyword matches.
         * @return This builder.
         * @throws NullPointerException if the keyword is null.
         */
        public PayloadTrieBuilder<T> addKeyword(final String keyword, final T payload) {
            this.trie.addKeyword(keyword, payload);
            return this;
        }

        /**
         * Adds a list of keywords and payloads to the Trie's list of text search
         * keywords.
         *
         * @param keywords The keywords to add to the list.
         * @return This builder.
         */
        public PayloadTrieBuilder<T> addKeywords(final Collection<Payload<T>> keywords) {
            for (Payload<T> payload : keywords) {
                this.trie.addKeyword(payload.getKeyword(), payload.getData());
            }
            return this;
        }

        /**
         * Configure the Trie to match whole keywords in the text.
         *
         * @return This builder.
         */
        public PayloadTrieBuilder<T> onlyWholeWords() {
            this.trieConfig.setOnlyWholeWords(true);
            return this;
        }

        /**
         * Configure the Trie to match whole keywords that are separated by whitespace
         * in the text. For example, "this keyword thatkeyword" would only match the
         * first occurrence of "keyword".
         *
         * @return This builder.
         */
        public PayloadTrieBuilder<T> onlyWholeWordsWhiteSpaceSeparated() {
            this.trieConfig.setOnlyWholeWordsWhiteSpaceSeparated(true);
            return this;
        }

        /**
         * Configure the Trie to stop after the first keyword is found in the text.
         *
         * @return This builder.
         */
        public PayloadTrieBuilder<T> stopOnHit() {
            this.trieConfig.setStopOnHit(true);
            return this;
        }

        /**
         * Configure the PayloadTrie based on the builder settings.
         *
         * @return The configured PayloadTrie.
         */
        public PayloadTrie<T> build() {
            this.trie.constructFailureStates();
            return this.trie;
        }

        /**
         * @return This builder.
         * @deprecated Use ignoreCase()
         */
        @Deprecated
        public PayloadTrieBuilder<T> caseInsensitive() {
            return ignoreCase();
        }

        /**
         * @return This builder.
         * @deprecated Use ignoreOverlaps()
         */
        @Deprecated
        public PayloadTrieBuilder<T> removeOverlaps() {
            return ignoreOverlaps();
        }
    }
}

View File

@ -133,4 +133,4 @@ public class State {
public Collection<Character> getTransitions() {
return this.success.keySet();
}
}
}

View File

@ -1,7 +1,6 @@
package org.ahocorasick.trie;
public abstract class Token {
private String fragment;
public Token(String fragment) {
@ -15,5 +14,4 @@ public abstract class Token {
public abstract boolean isMatch();
public abstract Emit getEmit();
}

View File

@ -1,20 +1,13 @@
package org.ahocorasick.trie;
import org.ahocorasick.interval.IntervalTree;
import org.ahocorasick.interval.Intervalable;
import org.ahocorasick.trie.handler.DefaultEmitHandler;
import org.ahocorasick.trie.handler.EmitHandler;
import org.ahocorasick.trie.handler.StatefulEmitHandler;
import org.ahocorasick.util.ListElementRemoval;
import org.ahocorasick.util.ListElementRemoval.RemoveElementPredicate;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Queue;
import java.util.concurrent.LinkedBlockingDeque;
import static java.lang.Character.isWhitespace;
import org.ahocorasick.trie.PayloadTrie.PayloadTrieBuilder;
import org.ahocorasick.trie.handler.EmitHandler;
import org.ahocorasick.trie.handler.StatefulPayloadEmitDelegateHandler;
import org.ahocorasick.trie.handler.PayloadEmitDelegateHandler;
import org.ahocorasick.trie.handler.StatefulEmitHandler;
/**
* Based on the Aho-Corasick white paper, Bell technologies:
@ -24,112 +17,47 @@ import static java.lang.Character.isWhitespace;
*/
public class Trie {
private final TrieConfig trieConfig;
private final PayloadTrie<String> payloadTrie;
private final State rootState;
private Trie(final TrieConfig trieConfig) {
this.trieConfig = trieConfig;
this.rootState = new State();
}
/**
* Used by the builder to add a text search keyword.
*
* @param keyword The search term to add to the list of search terms.
* @throws NullPointerException if the keyword is null.
*/
private void addKeyword(String keyword) {
if (keyword.isEmpty()) {
return;
}
if (isCaseInsensitive()) {
keyword = keyword.toLowerCase();
}
addState(keyword).addEmit(keyword);
}
/**
* Delegates to addKeyword.
*
* @param keywords List of search term to add to the list of search terms.
*/
private void addKeywords(final String[] keywords) {
for (final String keyword : keywords) {
addKeyword(keyword);
}
}
/**
* Delegates to addKeyword.
*
* @param keywords List of search term to add to the list of search terms.
*/
private void addKeywords(final Collection<String> keywords) {
for (final String keyword : keywords) {
addKeyword(keyword);
}
}
private State addState(final String keyword) {
return getRootState().addState(keyword);
private Trie(final PayloadTrie<String> payloadTrie) {
this.payloadTrie = payloadTrie;
}
public Collection<Token> tokenize(final String text) {
final Collection<Token> tokens = new ArrayList<>();
final Collection<Emit> collectedEmits = parseText(text);
int lastCollectedPosition = -1;
for (final Emit emit : collectedEmits) {
if (emit.getStart() - lastCollectedPosition > 1) {
tokens.add(createFragment(emit, text, lastCollectedPosition));
}
tokens.add(createMatch(emit, text));
lastCollectedPosition = emit.getEnd();
}
if (text.length() - lastCollectedPosition > 1) {
tokens.add(createFragment(null, text, lastCollectedPosition));
}
return tokens;
Collection<PayloadToken<String>> tokens = this.payloadTrie.tokenize(text);
return asTokens(tokens);
}
private Token createFragment(final Emit emit, final String text, final int lastCollectedPosition) {
return new FragmentToken(text.substring(lastCollectedPosition + 1, emit == null ? text.length() : emit.getStart()));
private static Collection<Token> asTokens(Collection<PayloadToken<String>> tokens) {
Collection<Token> result = new ArrayList<>();
for (PayloadToken<String> payloadToken : tokens) {
result.add(new DefaultToken(payloadToken));
}
return result;
}
private Token createMatch(Emit emit, String text) {
return new MatchToken(text.substring(emit.getStart(), emit.getEnd() + 1), emit);
private static Collection<Emit> asEmits(Collection<PayloadEmit<String>> emits) {
Collection<Emit> result = new ArrayList<>();
for (PayloadEmit<String> emit : emits) {
result.add(asEmit(emit));
}
return result;
}
private static Emit asEmit(PayloadEmit<String> payloadEmit) {
return new Emit(payloadEmit.getStart(), payloadEmit.getEnd(), payloadEmit.getKeyword());
}
public Collection<Emit> parseText(final CharSequence text) {
return parseText(text, new DefaultEmitHandler());
Collection<PayloadEmit<String>> parsedText = this.payloadTrie.parseText(text);
return asEmits(parsedText);
}
@SuppressWarnings("unchecked")
public Collection<Emit> parseText(final CharSequence text, final StatefulEmitHandler emitHandler) {
parseText(text, (EmitHandler) emitHandler);
final List<Emit> collectedEmits = emitHandler.getEmits();
if (trieConfig.isOnlyWholeWords()) {
removePartialMatches(text, collectedEmits);
}
if (trieConfig.isOnlyWholeWordsWhiteSpaceSeparated()) {
removePartialMatchesWhiteSpaceSeparated(text, collectedEmits);
}
if (!trieConfig.isAllowOverlaps()) {
IntervalTree intervalTree = new IntervalTree((List<Intervalable>) (List<?>) collectedEmits);
intervalTree.removeOverlaps((List<Intervalable>) (List<?>) collectedEmits);
}
return collectedEmits;
Collection<PayloadEmit<String>> parsedText = this.payloadTrie.parseText(text,
new StatefulPayloadEmitDelegateHandler(emitHandler));
return asEmits(parsedText);
}
public boolean containsMatch(final CharSequence text) {
@ -137,21 +65,7 @@ public class Trie {
}
public void parseText(final CharSequence text, final EmitHandler emitHandler) {
State currentState = getRootState();
for (int position = 0; position < text.length(); position++) {
Character character = text.charAt(position);
// TODO: Maybe lowercase the entire string at once?
if (trieConfig.isCaseInsensitive()) {
character = Character.toLowerCase(character);
}
currentState = getState(currentState, character);
if (storeEmits(position, currentState, emitHandler) && trieConfig.isStopOnHit()) {
return;
}
}
this.payloadTrie.parseText(text, new PayloadEmitDelegateHandler(emitHandler));
}
/**
@ -161,148 +75,8 @@ public class Trie {
* @return null if no matches found.
*/
public Emit firstMatch(final CharSequence text) {
if (!trieConfig.isAllowOverlaps()) {
// Slow path. Needs to find all the matches to detect overlaps.
final Collection<Emit> parseText = parseText(text);
if (parseText != null && !parseText.isEmpty()) {
return parseText.iterator().next();
}
} else {
// Fast path. Returns first match found.
State currentState = getRootState();
for (int position = 0; position < text.length(); position++) {
Character character = text.charAt(position);
// TODO: Lowercase the entire string at once?
if (trieConfig.isCaseInsensitive()) {
character = Character.toLowerCase(character);
}
currentState = getState(currentState, character);
Collection<String> emitStrs = currentState.emit();
if (emitStrs != null && !emitStrs.isEmpty()) {
for (final String emitStr : emitStrs) {
final Emit emit = new Emit(position - emitStr.length() + 1, position, emitStr);
if (trieConfig.isOnlyWholeWords()) {
if (!isPartialMatch(text, emit)) {
return emit;
}
} else {
return emit;
}
}
}
}
}
return null;
}
private boolean isPartialMatch(final CharSequence searchText, final Emit emit) {
return (emit.getStart() != 0 &&
Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) ||
(emit.getEnd() + 1 != searchText.length() &&
Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1)));
}
private void removePartialMatches(final CharSequence searchText, final List<Emit> collectedEmits) {
final RemoveElementPredicate<Emit> predicate = new RemoveElementPredicate<Emit>() {
@Override
public boolean remove(Emit emit) {
return isPartialMatch(searchText, emit);
}
};
ListElementRemoval.removeIf(collectedEmits, predicate);
}
private void removePartialMatchesWhiteSpaceSeparated(final CharSequence searchText, final List<Emit> collectedEmits) {
final long size = searchText.length();
final List<Emit> removeEmits = new ArrayList<>();
for (final Emit emit : collectedEmits) {
if ((emit.getStart() == 0 || isWhitespace(searchText.charAt(emit.getStart() - 1))) &&
(emit.getEnd() + 1 == size || isWhitespace(searchText.charAt(emit.getEnd() + 1)))) {
continue;
}
removeEmits.add(emit);
}
for (final Emit removeEmit : removeEmits) {
collectedEmits.remove(removeEmit);
}
}
private State getState(State currentState, final Character character) {
State newCurrentState = currentState.nextState(character);
while (newCurrentState == null) {
currentState = currentState.failure();
newCurrentState = currentState.nextState(character);
}
return newCurrentState;
}
/**
 * Assigns a failure state to every state reachable from the root, processing
 * the states in first-in/first-out order, and copies the emits of each failure
 * state onto the state that points to it.
 */
private void constructFailureStates() {
    final Queue<State> pending = new LinkedBlockingDeque<>();
    final State root = getRootState();

    // Direct children of the root always fall back to the root itself.
    for (final State child : root.getStates()) {
        child.setFailure(root);
        pending.add(child);
    }

    // Every deeper state derives its failure link from its parent's link.
    State parent;
    while ((parent = pending.poll()) != null) {
        for (final Character symbol : parent.getTransitions()) {
            final State target = parent.nextState(symbol);
            pending.add(target);

            // Walk the parent's failure chain until a state accepts the symbol.
            State fallback = parent.failure();
            while (fallback.nextState(symbol) == null) {
                fallback = fallback.failure();
            }

            final State failureTarget = fallback.nextState(symbol);
            target.setFailure(failureTarget);
            target.addEmit(failureTarget.emit());
        }
    }
}
/**
 * Reports every keyword attached to {@code currentState} to the handler as an
 * {@code Emit} ending at {@code position}.
 *
 * @param position     index of the last character of the match in the text
 * @param currentState the state whose keywords are reported
 * @param emitHandler  receiver of the emits
 * @return {@code true} if the handler returned {@code true} for at least one emit
 */
private boolean storeEmits(final int position, final State currentState, final EmitHandler emitHandler) {
    final Collection<String> keywords = currentState.emit();
    if (keywords == null || keywords.isEmpty()) {
        return false;
    }

    boolean accepted = false;
    for (final String keyword : keywords) {
        final int start = position - keyword.length() + 1;
        if (emitHandler.emit(new Emit(start, position, keyword))) {
            accepted = true;
        }
        // Once something was accepted, stop-on-hit ends the reporting early.
        if (accepted && trieConfig.isStopOnHit()) {
            break;
        }
    }
    return accepted;
}
/**
 * @return the case-insensitivity flag of this trie's configuration
 */
private boolean isCaseInsensitive() {
    final boolean ignoreCase = this.trieConfig.isCaseInsensitive();
    return ignoreCase;
}
private State getRootState() {
return this.rootState;
PayloadEmit<String> firstMatch = this.payloadTrie.firstMatch(text);
return new Emit(firstMatch.getStart(), firstMatch.getEnd(), firstMatch.getKeyword());
}
/**
@ -318,7 +92,9 @@ public class Trie {
private final TrieConfig trieConfig = new TrieConfig();
private final Trie trie = new Trie(trieConfig);
private final PayloadTrie<String> trie = new PayloadTrie<>(trieConfig);
private final PayloadTrieBuilder<String> delegate = PayloadTrie.builder();
/**
* Default (empty) constructor.
@ -327,15 +103,16 @@ public class Trie {
}
/**
* Configure the Trie to ignore case when searching for keywords in
* the text. This must be called before calling addKeyword because
* the algorithm converts keywords to lowercase as they are added,
* depending on this case sensitivity setting.
* Configure the Trie to ignore case when searching for keywords in the text.
* This must be called before calling addKeyword because the algorithm converts
* keywords to lowercase as they are added, depending on this case sensitivity
* setting.
*
* @return This builder.
*/
public TrieBuilder ignoreCase() {
this.trieConfig.setCaseInsensitive(true);
delegate.ignoreCase();
// this.trieConfig.setCaseInsensitive(true);
return this;
}
@ -345,7 +122,7 @@ public class Trie {
* @return This builder.
*/
public TrieBuilder ignoreOverlaps() {
this.trieConfig.setAllowOverlaps(false);
delegate.ignoreOverlaps();
return this;
}
@ -357,7 +134,7 @@ public class Trie {
* @throws NullPointerException if the keyword is null.
*/
public TrieBuilder addKeyword(final String keyword) {
this.trie.addKeyword(keyword);
delegate.addKeyword(keyword, null);
return this;
}
@ -368,7 +145,9 @@ public class Trie {
* @return This builder.
*/
public TrieBuilder addKeywords(final String... keywords) {
this.trie.addKeywords(keywords);
for (String keyword : keywords) {
delegate.addKeyword(keyword, null);
}
return this;
}
@ -379,7 +158,9 @@ public class Trie {
* @return This builder.
*/
public TrieBuilder addKeywords(final Collection<String> keywords) {
this.trie.addKeywords(keywords);
for (String keyword : keywords) {
this.delegate.addKeyword(keyword, null);
}
return this;
}
@ -389,30 +170,29 @@ public class Trie {
* @return This builder.
*/
public TrieBuilder onlyWholeWords() {
this.trieConfig.setOnlyWholeWords(true);
this.delegate.onlyWholeWords();
return this;
}
/**
* Configure the Trie to match whole keywords that are separated by
* whitespace in the text. For example, "this keyword thatkeyword"
* would only match the first occurrence of "keyword".
* Configure the Trie to match whole keywords that are separated by whitespace
* in the text. For example, "this keyword thatkeyword" would only match the
* first occurrence of "keyword".
*
* @return This builder.
*/
public TrieBuilder onlyWholeWordsWhiteSpaceSeparated() {
this.trieConfig.setOnlyWholeWordsWhiteSpaceSeparated(true);
this.delegate.onlyWholeWordsWhiteSpaceSeparated();
return this;
}
/**
* Configure the Trie to stop after the first keyword is found in the
* text.
* Configure the Trie to stop after the first keyword is found in the text.
*
* @return This builder.
*/
public TrieBuilder stopOnHit() {
trie.trieConfig.setStopOnHit(true);
this.delegate.stopOnHit();
return this;
}
@ -422,8 +202,8 @@ public class Trie {
* @return The configured Trie.
*/
public Trie build() {
this.trie.constructFailureStates();
return this.trie;
PayloadTrie<String> payloadTrie = this.delegate.build();
return new Trie(payloadTrie);
}
/**

View File

@ -0,0 +1,21 @@
package org.ahocorasick.trie.handler;
import java.util.ArrayList;
import java.util.List;
import org.ahocorasick.trie.PayloadEmit;
/**
 * Base class for payload emit handlers that remember emits. Subclasses supply
 * {@code emit} and decide which emits to record through {@link #addEmit}.
 *
 * @param <T> the payload type carried by the emits
 */
public abstract class AbstractStatefulPayloadEmitHandler<T> implements StatefulPayloadEmitHandler<T> {

    /** Emits recorded so far, in the order they were added. */
    private final List<PayloadEmit<T>> emits = new ArrayList<>();

    /**
     * Records an emit.
     *
     * @param emit the emit to append to the recorded list
     */
    public void addEmit(final PayloadEmit<T> emit) {
        emits.add(emit);
    }

    /**
     * @return the internal list of recorded emits (not a copy)
     */
    @Override
    public List<PayloadEmit<T>> getEmits() {
        return emits;
    }
}

View File

@ -1,10 +1,10 @@
package org.ahocorasick.trie.handler;
import org.ahocorasick.trie.Emit;
import java.util.ArrayList;
import java.util.List;
import org.ahocorasick.trie.Emit;
public class DefaultEmitHandler implements StatefulEmitHandler {
private final List<Emit> emits = new ArrayList<>();

View File

@ -0,0 +1,22 @@
package org.ahocorasick.trie.handler;
import java.util.ArrayList;
import java.util.List;
import org.ahocorasick.trie.PayloadEmit;
/**
 * Payload emit handler that records every emit it receives.
 *
 * @param <T> the payload type carried by the emits
 */
public class DefaultPayloadEmitHandler<T> implements StatefulPayloadEmitHandler<T> {

    /** Every emit received so far, in arrival order. */
    private final List<PayloadEmit<T>> emits = new ArrayList<>();

    /**
     * Stores the emit.
     *
     * @param emit the emit to record
     * @return always {@code true}
     */
    @Override
    public boolean emit(final PayloadEmit<T> emit) {
        emits.add(emit);
        return true;
    }

    /**
     * @return the internal list of recorded emits (not a copy)
     */
    @Override
    public List<PayloadEmit<T>> getEmits() {
        return emits;
    }
}

View File

@ -0,0 +1,24 @@
package org.ahocorasick.trie.handler;
import org.ahocorasick.trie.Emit;
import org.ahocorasick.trie.PayloadEmit;
/**
 * Convenience wrapper that adapts an {@link EmitHandler} to the
 * {@link PayloadEmitHandler} interface: each payload emit is converted to a
 * plain {@link Emit} (start, end and keyword; the payload is dropped) and
 * forwarded to the wrapped handler.
 */
public class PayloadEmitDelegateHandler implements PayloadEmitHandler<String> {

    /** The wrapped handler; set once at construction time. */
    private final EmitHandler handler;

    /**
     * @param handler the handler that receives the converted emits
     */
    public PayloadEmitDelegateHandler(EmitHandler handler) {
        this.handler = handler;
    }

    /**
     * Converts the payload emit and forwards it.
     *
     * @param emit the payload emit to forward
     * @return whatever the wrapped handler returns for the converted emit
     */
    @Override
    public boolean emit(PayloadEmit<String> emit) {
        Emit newEmit = new Emit(emit.getStart(), emit.getEnd(), emit.getKeyword());
        return handler.emit(newEmit);
    }
}

View File

@ -0,0 +1,7 @@
package org.ahocorasick.trie.handler;
import org.ahocorasick.trie.PayloadEmit;
/**
 * Callback that receives the emits produced while a text is parsed.
 *
 * @param <T> the payload type carried by the emits
 */
public interface PayloadEmitHandler<T> {
/**
 * Called with an emit.
 *
 * @param emit the emit being reported
 * @return a flag for the caller; NOTE(review): presumably {@code true} means
 *         the emit was accepted (relevant for stop-on-hit) - confirm against
 *         PayloadTrie
 */
boolean emit(PayloadEmit<T> emit);
}

View File

@ -0,0 +1,42 @@
package org.ahocorasick.trie.handler;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import org.ahocorasick.trie.Emit;
import org.ahocorasick.trie.PayloadEmit;
/**
 * Convenience wrapper that adapts a {@link StatefulEmitHandler} to the
 * {@link StatefulPayloadEmitHandler} interface. Emits are converted to plain
 * {@link Emit} instances on the way in and back to payload emits (with a
 * {@code null} payload) on the way out.
 */
public class StatefulPayloadEmitDelegateHandler implements StatefulPayloadEmitHandler<String> {

    /** The wrapped handler; set once at construction time. */
    private final StatefulEmitHandler handler;

    /**
     * @param handler the handler that receives the converted emits
     */
    public StatefulPayloadEmitDelegateHandler(StatefulEmitHandler handler) {
        this.handler = handler;
    }

    /**
     * Converts plain emits to payload emits. A plain emit carries no payload,
     * so every resulting payload is {@code null}.
     */
    private static List<PayloadEmit<String>> asEmits(Collection<Emit> emits) {
        // Size is known up front, so avoid intermediate array growth.
        List<PayloadEmit<String>> result = new ArrayList<>(emits.size());
        for (Emit emit : emits) {
            result.add(new PayloadEmit<String>(emit.getStart(), emit.getEnd(), emit.getKeyword(), null));
        }
        return result;
    }

    /**
     * Converts the payload emit (dropping its payload) and forwards it.
     *
     * @param emit the payload emit to forward
     * @return whatever the wrapped handler returns for the converted emit
     */
    @Override
    public boolean emit(PayloadEmit<String> emit) {
        Emit newEmit = new Emit(emit.getStart(), emit.getEnd(), emit.getKeyword());
        return handler.emit(newEmit);
    }

    /**
     * @return a new list holding the wrapped handler's emits converted to
     *         payload emits with {@code null} payloads
     */
    @Override
    public List<PayloadEmit<String>> getEmits() {
        List<Emit> emits = this.handler.getEmits();
        return asEmits(emits);
    }
}

View File

@ -0,0 +1,9 @@
package org.ahocorasick.trie.handler;
import java.util.List;
import org.ahocorasick.trie.PayloadEmit;
/**
 * A {@link PayloadEmitHandler} that can also hand back a list of emits.
 *
 * @param <T> the payload type carried by the emits
 */
public interface StatefulPayloadEmitHandler<T> extends PayloadEmitHandler<T>{
/**
 * @return the emits held by this handler
 */
List<PayloadEmit<T>> getEmits();
}

View File

@ -0,0 +1,510 @@
package org.ahocorasick.trie;
import static junit.framework.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.ThreadLocalRandom;
import org.ahocorasick.trie.handler.AbstractStatefulPayloadEmitHandler;
import org.ahocorasick.trie.handler.PayloadEmitHandler;
import org.ahocorasick.trie.handler.StatefulPayloadEmitHandler;
import org.junit.Test;
public class PayloadTrieTest {
// --- Fixtures: each keyword array has a parallel payload array and a list pairing the two. ---

private static final String[] ALPHABET = { "abc", "bcd", "cde" };
private static final String[] ALPHABET_PAYLOAD = { "alpha:abc", "alpha:bcd", "alpha:cde" };
private static final List<Payload<String>> ALPHABET_WITH_PAYLOADS = Arrays.asList(
    new Payload<String>(ALPHABET[0], ALPHABET_PAYLOAD[0]),
    new Payload<String>(ALPHABET[1], ALPHABET_PAYLOAD[1]),
    new Payload<String>(ALPHABET[2], ALPHABET_PAYLOAD[2]));

private static final String[] PRONOUNS = { "hers", "his", "she", "he" };
private static final int[] PRONOUNS_PAYLOAD_ID = { 9, 12, 4, 20 };
private static final List<Payload<Integer>> PRONOUNS_WITH_PAYLOADS = Arrays.asList(
    new Payload<Integer>(PRONOUNS[0], PRONOUNS_PAYLOAD_ID[0]),
    new Payload<Integer>(PRONOUNS[1], PRONOUNS_PAYLOAD_ID[1]),
    new Payload<Integer>(PRONOUNS[2], PRONOUNS_PAYLOAD_ID[2]),
    new Payload<Integer>(PRONOUNS[3], PRONOUNS_PAYLOAD_ID[3]));

private static final String[] FOOD = { "veal", "cauliflower", "broccoli", "tomatoes" };
private static final Food[] FOOD_PAYLOAD = {
    new Food("veal"), new Food("cauliflower"), new Food("broccoli"), new Food("tomatoes") };
private static final List<Payload<Food>> FOOD_WITH_PAYLOADS = Arrays.asList(
    new Payload<Food>(FOOD[0], FOOD_PAYLOAD[0]),
    new Payload<Food>(FOOD[1], FOOD_PAYLOAD[1]),
    new Payload<Food>(FOOD[2], FOOD_PAYLOAD[2]),
    new Payload<Food>(FOOD[3], FOOD_PAYLOAD[3]));

private static final String[] GREEK_LETTERS = { "Alpha", "Beta", "Gamma" };
private static final String[] GREEK_LETTERS_PAYLOAD = { "greek:Alpha", "greek:Beta", "greek:Gamma" };
private static final List<Payload<String>> GREEK_LETTERS_WITH_PAYLOADS = Arrays.asList(
    new Payload<String>(GREEK_LETTERS[0], GREEK_LETTERS_PAYLOAD[0]),
    new Payload<String>(GREEK_LETTERS[1], GREEK_LETTERS_PAYLOAD[1]),
    new Payload<String>(GREEK_LETTERS[2], GREEK_LETTERS_PAYLOAD[2]));

private static final String[] UNICODE = { "turning", "once", "again", "börkü" };
private static final String[] UNICODE_PAYLOAD = { "uni:turning", "uni:once", "uni:again", "uni:börkü" };
private static final List<Payload<String>> UNICODE_WITH_PAYLOADS = Arrays.asList(
    new Payload<String>(UNICODE[0], UNICODE_PAYLOAD[0]),
    new Payload<String>(UNICODE[1], UNICODE_PAYLOAD[1]),
    new Payload<String>(UNICODE[2], UNICODE_PAYLOAD[2]),
    new Payload<String>(UNICODE[3], UNICODE_PAYLOAD[3]));
/**
 * Simple value object used as a non-String payload; two instances are equal
 * when their names are equal (a {@code null} name only equals a {@code null} name).
 */
public static class Food {

    private final String name;

    public Food(String name) {
        this.name = name;
    }

    @Override
    public int hashCode() {
        final int nameHash = (name == null) ? 0 : name.hashCode();
        return 31 + nameHash;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        final Food that = (Food) obj;
        return (name == null) ? that.name == null : name.equals(that.name);
    }
}
/** A keyword identical to the text yields one emit covering the whole text. */
@Test
public void keywordAndTextAreTheSame() {
    final String keyword = ALPHABET[0];
    final String payload = ALPHABET_PAYLOAD[0];
    final PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword(keyword, payload).build();
    final Collection<PayloadEmit<String>> matches = trie.parseText(keyword);
    checkEmit(matches.iterator().next(), 0, 2, keyword, payload);
}
/** firstMatch variant: a keyword identical to the text is found at 0..2. */
@Test
public void keywordAndTextAreTheSameFirstMatch() {
    final String keyword = ALPHABET[0];
    final String payload = ALPHABET_PAYLOAD[0];
    final PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword(keyword, payload).build();
    final PayloadEmit<String> match = trie.firstMatch(keyword);
    checkEmit(match, 0, 2, keyword, payload);
}
/** A leading space in the text shifts the emit by one position. */
@Test
public void textIsLongerThanKeyword() {
    final String keyword = ALPHABET[0];
    final String payload = ALPHABET_PAYLOAD[0];
    final PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword(keyword, payload).build();
    final Collection<PayloadEmit<String>> matches = trie.parseText(" " + keyword);
    checkEmit(matches.iterator().next(), 1, 3, keyword, payload);
}
/** firstMatch variant: a leading space in the text shifts the emit by one position. */
@Test
public void textIsLongerThanKeywordFirstMatch() {
    final String keyword = ALPHABET[0];
    final String payload = ALPHABET_PAYLOAD[0];
    final PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword(keyword, payload).build();
    final PayloadEmit<String> match = trie.firstMatch(" " + keyword);
    checkEmit(match, 1, 3, keyword, payload);
}
/** With several keywords registered, the text "bcd" yields the "bcd" emit first. */
@Test
public void variousKeywordsOneMatch() {
    final PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeywords(ALPHABET_WITH_PAYLOADS).build();
    final PayloadEmit<String> first = trie.parseText("bcd").iterator().next();
    checkEmit(first, 0, 2, "bcd", "alpha:bcd");
}
/** firstMatch variant: with several keywords registered, "bcd" is found at 0..2. */
@Test
public void variousKeywordsFirstMatch() {
    final PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeywords(ALPHABET_WITH_PAYLOADS).build();
    final PayloadEmit<String> match = trie.firstMatch("bcd");
    checkEmit(match, 0, 2, "bcd", "alpha:bcd");
}
/** With stopOnHit, parsing "ushers" yields only the first emit ("he"). */
@Test
public void ushersTestAndStopOnHit() {
    final PayloadTrie<Integer> trie =
        PayloadTrie.<Integer>builder().addKeywords(PRONOUNS_WITH_PAYLOADS).stopOnHit().build();
    final Collection<PayloadEmit<Integer>> matches = trie.parseText("ushers");
    // Without stopOnHit there would be three: she @ 3, he @ 3, hers @ 5
    assertEquals(1, matches.size());
    checkEmit(matches.iterator().next(), 2, 3, "he", 20);
}
/**
 * With stopOnHit, a handler that answers {@code false} for its first emit is
 * offered the next one; that second emit ("she") is the only one recorded.
 */
@Test
public void ushersTestStopOnHitSkipOne() {
    final PayloadTrie<Integer> trie =
        PayloadTrie.<Integer>builder().addKeywords(PRONOUNS_WITH_PAYLOADS).stopOnHit().build();

    final StatefulPayloadEmitHandler<Integer> skipFirstHandler = new AbstractStatefulPayloadEmitHandler<Integer>() {
        private boolean firstSeen;

        @Override
        public boolean emit(final PayloadEmit<Integer> emit) {
            if (!firstSeen) {
                // Reject the very first emit without recording it.
                firstSeen = true;
                return false;
            }
            addEmit(emit);
            return true;
        }
    };

    trie.parseText("ushers", skipFirstHandler);

    final Collection<PayloadEmit<Integer>> recorded = skipFirstHandler.getEmits();
    assertEquals(1, recorded.size());
    checkEmit(recorded.iterator().next(), 1, 3, "she", 4);
}
/** Parsing "ushers" reports the overlapping matches he, she and hers, in that order. */
@Test
public void ushersTest() {
    final PayloadTrie<Integer> trie = PayloadTrie.<Integer>builder().addKeywords(PRONOUNS_WITH_PAYLOADS).build();
    final Collection<PayloadEmit<Integer>> matches = trie.parseText("ushers");
    assertEquals(3, matches.size()); // she @ 3, he @ 3, hers @ 5
    final Iterator<PayloadEmit<Integer>> it = matches.iterator();
    checkEmit(it.next(), 2, 3, "he", 20);
    checkEmit(it.next(), 1, 3, "she", 4);
    checkEmit(it.next(), 2, 5, "hers", 9);
}
/** With ignoreCase, upper-case keywords match lower-case text and are reported in lower case. */
@Test
public void ushersTestWithCapitalKeywords() {
    final PayloadTrie<String> trie = PayloadTrie.<String>builder()
        .ignoreCase()
        .addKeyword("HERS", "hers")
        .addKeyword("HIS", "his")
        .addKeyword("SHE", "she")
        .addKeyword("HE", "he")
        .build();
    final Collection<PayloadEmit<String>> matches = trie.parseText("ushers");
    assertEquals(3, matches.size()); // she @ 3, he @ 3, hers @ 5
    final Iterator<PayloadEmit<String>> it = matches.iterator();
    checkEmit(it.next(), 2, 3, "he", "he");
    checkEmit(it.next(), 1, 3, "she", "she");
    checkEmit(it.next(), 2, 5, "hers", "hers");
}
/** firstMatch on "ushers" returns the "he" emit. */
@Test
public void ushersTestFirstMatch() {
    final PayloadTrie<Integer> trie = PayloadTrie.<Integer>builder().addKeywords(PRONOUNS_WITH_PAYLOADS).build();
    final PayloadEmit<Integer> match = trie.firstMatch("ushers");
    checkEmit(match, 2, 3, "he", 20);
}
/** Parsing through a callback handler delivers the same three emits as the collection API. */
@Test
public void ushersTestByCallback() {
    final PayloadTrie<Integer> trie = PayloadTrie.<Integer>builder().addKeywords(PRONOUNS_WITH_PAYLOADS).build();

    final List<PayloadEmit<Integer>> received = new ArrayList<>();
    final PayloadEmitHandler<Integer> collector = new PayloadEmitHandler<Integer>() {
        @Override
        public boolean emit(PayloadEmit<Integer> emit) {
            received.add(emit);
            return true;
        }
    };

    trie.parseText("ushers", collector);

    assertEquals(3, received.size()); // she @ 3, he @ 3, hers @ 5
    final Iterator<PayloadEmit<Integer>> it = received.iterator();
    checkEmit(it.next(), 2, 3, "he", 20);
    checkEmit(it.next(), 1, 3, "she", 4);
    checkEmit(it.next(), 2, 5, "hers", 9);
}
/** Prefixes of the keyword ("h", "he", "her") must not be reported; only the full "hers" is. */
@Test
public void misleadingTest() {
    final PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword("hers", "pronon:hers").build();
    final PayloadEmit<String> first = trie.parseText("h he her hers").iterator().next();
    checkEmit(first, 9, 12, "hers", "pronon:hers");
}
/** firstMatch variant: prefixes of the keyword are skipped and "hers" is found at 9..12. */
@Test
public void misleadingTestFirstMatch() {
    final PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword("hers", "pronon:hers").build();
    final PayloadEmit<String> match = trie.firstMatch("h he her hers");
    checkEmit(match, 9, 12, "hers", "pronon:hers");
}
/** Object payloads are returned with their keywords, in the order the keywords occur in the text. */
@Test
public void recipes() {
    final String text = "2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli";
    final PayloadTrie<Food> trie = PayloadTrie.<Food>builder().addKeywords(FOOD_WITH_PAYLOADS).build();
    final Iterator<PayloadEmit<Food>> it = trie.parseText(text).iterator();
    checkEmit(it.next(), 2, 12, "cauliflower", new Food("cauliflower"));
    checkEmit(it.next(), 18, 25, "tomatoes", new Food("tomatoes"));
    checkEmit(it.next(), 40, 43, "veal", new Food("veal"));
    checkEmit(it.next(), 51, 58, "broccoli", new Food("broccoli"));
}
/** firstMatch variant: the first food keyword in the text is "cauliflower". */
@Test
public void recipesFirstMatch() {
    final String text = "2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli";
    final PayloadTrie<Food> trie = PayloadTrie.<Food>builder().addKeywords(FOOD_WITH_PAYLOADS).build();
    final PayloadEmit<Food> match = trie.firstMatch(text);
    checkEmit(match, 2, 12, "cauliflower", new Food("cauliflower"));
}
/** A short keyword and a long keyword built from repetitions of it are both reported, interleaved. */
@Test
public void longAndShortOverlappingMatch() {
    final PayloadTrie<String> trie = PayloadTrie.<String>builder()
        .addKeyword("he", "pronon:he")
        .addKeyword("hehehehe", "garbage")
        .build();
    final Iterator<PayloadEmit<String>> it = trie.parseText("hehehehehe").iterator();
    checkEmit(it.next(), 0, 1, "he", "pronon:he");
    checkEmit(it.next(), 2, 3, "he", "pronon:he");
    checkEmit(it.next(), 4, 5, "he", "pronon:he");
    checkEmit(it.next(), 6, 7, "he", "pronon:he");
    checkEmit(it.next(), 0, 7, "hehehehe", "garbage");
    checkEmit(it.next(), 8, 9, "he", "pronon:he");
    checkEmit(it.next(), 2, 9, "hehehehe", "garbage");
}
/** With removeOverlaps, only non-overlapping emits remain. */
@Test
public void nonOverlapping() {
    final PayloadTrie<String> trie = PayloadTrie.<String>builder()
        .removeOverlaps()
        .addKeyword("ab", "alpha:ab")
        .addKeyword("cba", "alpha:cba")
        .addKeyword("ababc", "alpha:ababc")
        .build();
    final Collection<PayloadEmit<String>> matches = trie.parseText("ababcbab");
    assertEquals(2, matches.size());
    // With overlaps: ab@1, ab@3, ababc@4, cba@6, ab@7
    final Iterator<PayloadEmit<String>> it = matches.iterator();
    checkEmit(it.next(), 0, 4, "ababc", "alpha:ababc");
    checkEmit(it.next(), 6, 7, "ab", "alpha:ab");
}
/** firstMatch variant with removeOverlaps: "ababc" at 0..4 is returned. */
@Test
public void nonOverlappingFirstMatch() {
    final PayloadTrie<String> trie = PayloadTrie.<String>builder()
        .removeOverlaps()
        .addKeyword("ab", "alpha:ab")
        .addKeyword("cba", "alpha:cba")
        .addKeyword("ababc", "alpha:ababc")
        .build();
    final PayloadEmit<String> match = trie.firstMatch("ababcbab");
    checkEmit(match, 0, 4, "ababc", "alpha:ababc");
}
/** containsMatch reports {@code true} when the text contains a keyword. */
@Test
public void containsMatch() {
    final PayloadTrie<String> trie = PayloadTrie.<String>builder()
        .removeOverlaps()
        .addKeyword("ab", "alpha:ab")
        .addKeyword("cba", "alpha:cba")
        .addKeyword("ababc", "alpha:ababc")
        .build();
    final boolean found = trie.containsMatch("ababcbab");
    assertTrue(found);
}
/** Many overlapping keywords without payloads collapse to two emits once overlaps are removed. */
@Test
public void startOfChurchillSpeech() {
    final PayloadTrie<String> trie = PayloadTrie.<String>builder()
        .removeOverlaps()
        .addKeyword("T")
        .addKeyword("u")
        .addKeyword("ur")
        .addKeyword("r")
        .addKeyword("urn")
        .addKeyword("ni")
        .addKeyword("i")
        .addKeyword("in")
        .addKeyword("n")
        .addKeyword("urning")
        .build();
    final Collection<PayloadEmit<String>> matches = trie.parseText("Turning");
    assertEquals(2, matches.size());
}
/** With onlyWholeWords, a keyword embedded in a longer word (left, middle or right) is not reported. */
@Test
public void partialMatch() {
    final PayloadTrie<String> trie =
        PayloadTrie.<String>builder().onlyWholeWords().addKeyword("sugar", "food:sugar").build();
    // "sugar" appears as prefix, as a standalone word, and as suffix.
    final Collection<PayloadEmit<String>> matches = trie.parseText("sugarcane sugarcane sugar canesugar");
    assertEquals(1, matches.size()); // only the standalone occurrence counts
    checkEmit(matches.iterator().next(), 20, 24, "sugar", "food:sugar");
}
/** firstMatch variant with onlyWholeWords: the standalone "sugar" at 20..24 is returned. */
@Test
public void partialMatchFirstMatch() {
    final PayloadTrie<String> trie =
        PayloadTrie.<String>builder().onlyWholeWords().addKeyword("sugar", "food:sugar").build();
    // "sugar" appears as prefix, as a standalone word, and as suffix.
    final PayloadEmit<String> match = trie.firstMatch("sugarcane sugarcane sugar canesugar");
    checkEmit(match, 20, 24, "sugar", "food:sugar");
}
/** tokenize splits a sentence into alternating non-matching fragments and keyword fragments. */
@Test
public void tokenizeFullSentence() {
    final String[] expectedFragments = {
        "Hear: ", "Alpha", " team first, ", "Beta", " from the rear, ", "Gamma", " in reserve" };
    final PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeywords(GREEK_LETTERS_WITH_PAYLOADS).build();
    final Collection<PayloadToken<String>> tokens =
        trie.tokenize("Hear: Alpha team first, Beta from the rear, Gamma in reserve");
    assertEquals(7, tokens.size());
    int index = 0;
    for (final PayloadToken<String> token : tokens) {
        assertEquals(expectedFragments[index], token.getFragment());
        index++;
    }
}
// @see https://github.com/robert-bor/aho-corasick/issues/5
/** ignoreCase combined with onlyWholeWords finds all four keywords, including one at the very end of the text. */
@Test
public void testStringIndexOutOfBoundsException() {
    final PayloadTrie<String> trie = PayloadTrie.<String>builder()
        .ignoreCase()
        .onlyWholeWords()
        .addKeywords(UNICODE_WITH_PAYLOADS)
        .build();
    final Collection<PayloadEmit<String>> matches = trie.parseText("TurninG OnCe AgAiN BÖRKÜ");
    assertEquals(4, matches.size()); // every keyword matches exactly once
    final Iterator<PayloadEmit<String>> cursor = matches.iterator();
    checkEmit(cursor.next(), 0, 6, "turning", "uni:turning");
    checkEmit(cursor.next(), 8, 11, "once", "uni:once");
    checkEmit(cursor.next(), 13, 17, "again", "uni:again");
    checkEmit(cursor.next(), 19, 23, "börkü", "uni:börkü");
}
/** With ignoreCase, mixed-case text matches the lower-case keywords, including non-ASCII letters. */
@Test
public void testIgnoreCase() {
    final PayloadTrie<String> trie =
        PayloadTrie.<String>builder().ignoreCase().addKeywords(UNICODE_WITH_PAYLOADS).build();
    final Collection<PayloadEmit<String>> matches = trie.parseText("TurninG OnCe AgAiN BÖRKÜ");
    assertEquals(4, matches.size()); // every keyword matches exactly once
    final Iterator<PayloadEmit<String>> cursor = matches.iterator();
    checkEmit(cursor.next(), 0, 6, "turning", "uni:turning");
    checkEmit(cursor.next(), 8, 11, "once", "uni:once");
    checkEmit(cursor.next(), 13, 17, "again", "uni:again");
    checkEmit(cursor.next(), 19, 23, "börkü", "uni:börkü");
}
/** firstMatch variant with ignoreCase: "turning" is found at 0..6. */
@Test
public void testIgnoreCaseFirstMatch() {
    final PayloadTrie<String> trie =
        PayloadTrie.<String>builder().ignoreCase().addKeywords(UNICODE_WITH_PAYLOADS).build();
    final PayloadEmit<String> match = trie.firstMatch("TurninG OnCe AgAiN BÖRKÜ");
    checkEmit(match, 0, 6, "turning", "uni:turning");
}
/** Three keywords separated by single spaces tokenize into five tokens. */
@Test
public void tokenizeTokensInSequence() {
    final PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeywords(GREEK_LETTERS_WITH_PAYLOADS).build();
    final int tokenCount = trie.tokenize("Alpha Beta Gamma").size();
    assertEquals(5, tokenCount);
}
// @see https://github.com/robert-bor/aho-corasick/issues/7
/** Tokenizing with an empty keyword registered must complete without throwing. */
@Test
public void testZeroLength() {
    final String text =
        "Try a natural lip and subtle bronzer to keep all the focus on those big bright eyes with NARS Eyeshadow Duo in Rated R And the winner is... Boots No7 Advanced Renewal Anti-ageing Glycolic Peel Kit ($25 amazon.com) won most-appealing peel.";
    final PayloadTrie<String> trie = PayloadTrie.<String>builder()
        .ignoreOverlaps()
        .onlyWholeWords()
        .ignoreCase()
        .addKeyword("")
        .build();
    trie.tokenize(text);
}
// @see https://github.com/robert-bor/aho-corasick/issues/8
/** Emit positions after a non-ASCII character must line up with Java string indices. */
@Test
public void testUnicode1() {
    // The second character ('İ') is Unicode, which was read by AC as a 2-byte char
    final String target = "LİKE THIS";
    // Sanity check: Java itself indexes the string the right way.
    assertEquals("THIS", target.substring(5, 9));

    final PayloadTrie<String> trie = PayloadTrie.<String>builder()
        .ignoreCase()
        .onlyWholeWords()
        .addKeyword("this", "pronon:this")
        .build();
    final Collection<PayloadEmit<String>> matches = trie.parseText(target);
    assertEquals(1, matches.size());
    checkEmit(matches.iterator().next(), 5, 8, "this", "pronon:this");
}
// @see https://github.com/robert-bor/aho-corasick/issues/8
/** firstMatch variant: positions after a non-ASCII character line up with Java string indices. */
@Test
public void testUnicode2() {
    // The second character ('İ') is Unicode, which was read by AC as a 2-byte char
    final String target = "LİKE THIS";
    final PayloadTrie<String> trie = PayloadTrie.<String>builder()
        .ignoreCase()
        .onlyWholeWords()
        .addKeyword("this", "pronon:this")
        .build();
    // Sanity check: Java itself indexes the string the right way.
    assertEquals("THIS", target.substring(5, 9));
    final PayloadEmit<String> match = trie.firstMatch(target);
    checkEmit(match, 5, 8, "this", "pronon:this");
}
/** With onlyWholeWordsWhiteSpaceSeparated, a keyword followed directly by another character is rejected. */
@Test
public void testPartialMatchWhiteSpaces() {
    final PayloadTrie<String> trie = PayloadTrie.<String>builder()
        .onlyWholeWordsWhiteSpaceSeparated()
        .addKeyword("#sugar-123", "sugar")
        .build();
    // The second occurrence is followed by '4' and therefore not whitespace-delimited.
    final Collection<PayloadEmit<String>> matches = trie.parseText("#sugar-123 #sugar-1234");
    assertEquals(1, matches.size());
    checkEmit(matches.iterator().next(), 0, 9, "#sugar-123", "sugar");
}
/** A keyword injected at fixed intervals into a large digit string is found once per injection. */
@Test
public void testLargeString() {
    final int textSize = 1000000;
    final int interval = 100;
    final String keyword = FOOD[1];
    final Food payload = FOOD_PAYLOAD[1];

    final StringBuilder haystack = randomNumbers(textSize);
    injectKeyword(haystack, keyword, interval);

    final PayloadTrie<Food> trie =
        PayloadTrie.<Food>builder().onlyWholeWords().addKeyword(keyword, payload).build();
    final Collection<PayloadEmit<Food>> matches = trie.parseText(haystack);

    assertEquals(textSize / interval, matches.size());
}
/**
 * Generates a random sequence of ASCII digits.
 *
 * @param count The number of digits to generate.
 * @return A character sequence holding exactly {@code count} random digits
 *         (empty when {@code count} is zero).
 */
private StringBuilder randomNumbers(int count) {
    final StringBuilder sb = new StringBuilder(count);
    // A pre-decrement loop (--count > 0) would stop one digit short.
    for (int i = 0; i < count; i++) {
        sb.append(randomInt(0, 10));
    }
    return sb;
}
/**
 * Overwrites the text with the keyword at regular offsets.
 *
 * @param source   Should contain a bunch of random data that cannot match any
 *                 keyword; modified in place.
 * @param keyword  A keyword to inject repeatedly in the text.
 * @param interval Distance between the start offsets of two injections.
 */
private void injectKeyword(final StringBuilder source, final String keyword, final int interval) {
    // The bound is the length before any injection takes place.
    final int originalLength = source.length();
    int offset = 0;
    while (offset < originalLength) {
        source.replace(offset, offset + keyword.length(), keyword);
        offset += interval;
    }
}
/**
 * @param min lower bound, inclusive
 * @param max upper bound, exclusive
 * @return a random int in {@code [min, max)}
 */
private int randomInt(final int min, final int max) {
    final ThreadLocalRandom random = ThreadLocalRandom.current();
    return random.nextInt(min, max);
}
/**
 * Asserts that an emit has the expected position, keyword and payload.
 *
 * One generic method replaces three overloads (Food, Integer, String) whose
 * bodies were identical; every existing call site still resolves against it,
 * including calls that pass an {@code int} literal as an {@code Integer}
 * payload.
 *
 * @param <T>             the payload type
 * @param next            the emit under test
 * @param expectedStart   expected start index (inclusive)
 * @param expectedEnd     expected end index (inclusive)
 * @param expectedKeyword expected keyword
 * @param expectedPayload expected payload, compared with {@code equals}
 */
private <T> void checkEmit(PayloadEmit<T> next, int expectedStart, int expectedEnd, String expectedKeyword,
        T expectedPayload) {
    assertEquals("Start of emit should have been " + expectedStart, expectedStart, next.getStart());
    assertEquals("End of emit should have been " + expectedEnd, expectedEnd, next.getEnd());
    assertEquals("Keyword of emit shoud be " + expectedKeyword, expectedKeyword, next.getKeyword());
    assertEquals("Payload of emit shoud be " + expectedPayload, expectedPayload, next.getPayload());
}
}

View File

@ -1,9 +1,7 @@
package org.ahocorasick.trie;
import org.ahocorasick.trie.handler.AbstractStatefulEmitHandler;
import org.ahocorasick.trie.handler.EmitHandler;
import org.ahocorasick.trie.handler.StatefulEmitHandler;
import org.junit.Test;
import static junit.framework.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.util.ArrayList;
import java.util.Collection;
@ -11,8 +9,10 @@ import java.util.Iterator;
import java.util.List;
import java.util.concurrent.ThreadLocalRandom;
import static junit.framework.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import org.ahocorasick.trie.handler.AbstractStatefulEmitHandler;
import org.ahocorasick.trie.handler.EmitHandler;
import org.ahocorasick.trie.handler.StatefulEmitHandler;
import org.junit.Test;
public class TrieTest {
private final static String[] ALPHABET = new String[]{