* #49: Allow to fix Payload with Keyword
This commit is contained in:
parent
b7cc1136e5
commit
9f80565b53
@ -1,6 +1,6 @@
|
||||
language: java
|
||||
install: mvn install -DskipTests=true -Dgpg.skip=true
|
||||
jdk:
|
||||
- oraclejdk8
|
||||
- openjdk8
|
||||
after_success:
|
||||
- bash <(curl -s https://codecov.io/bash)
|
||||
- bash <(curl -s https://codecov.io/bash)
|
||||
|
||||
20
README.md
20
README.md
@ -182,6 +182,26 @@ matches as soon as you encounter them. Let's look at an example where we want to
|
||||
System.out.println(html);
|
||||
```
|
||||
|
||||
You can also emit custom outputs. This might for example be useful to implement a trivial named entity
|
||||
recognizer. In this case use a PayloadTrie instead of a Trie:
|
||||
|
||||
```java
|
||||
class Word {
|
||||
private final String gender;
|
||||
public Word(String gender) {
|
||||
this.gender = gender;
|
||||
}
|
||||
}
|
||||
|
||||
PayloadTrie<Word> trie = PayloadTrie.<Word>builder()
|
||||
.addKeyword("hers", new Word("f"))
|
||||
.addKeyword("his", new Word("m"))
|
||||
.addKeyword("she", new Word("f"))
|
||||
.addKeyword("he", new Word("m"))
|
||||
.build();
|
||||
Collection<PayloadEmit<Word>> emits = trie.parseText("ushers");
|
||||
```
|
||||
|
||||
Releases
|
||||
--------
|
||||
Information on the aho-corasick [releases](https://github.com/robert-bor/aho-corasick/releases).
|
||||
|
||||
21
src/main/java/org/ahocorasick/trie/DefaultToken.java
Normal file
21
src/main/java/org/ahocorasick/trie/DefaultToken.java
Normal file
@ -0,0 +1,21 @@
|
||||
package org.ahocorasick.trie;
|
||||
|
||||
public class DefaultToken extends Token {
|
||||
|
||||
private PayloadToken<String> payloadToken;
|
||||
|
||||
public DefaultToken(PayloadToken<String> payloadToken) {
|
||||
super(payloadToken.getFragment());
|
||||
this.payloadToken = payloadToken;
|
||||
}
|
||||
|
||||
public boolean isMatch() {
|
||||
return payloadToken.isMatch();
|
||||
}
|
||||
|
||||
public Emit getEmit() {
|
||||
PayloadEmit<String> emit = payloadToken.getEmit();
|
||||
return new Emit(emit.getStart(), emit.getEnd(), emit.getKeyword());
|
||||
}
|
||||
|
||||
}
|
||||
@ -4,10 +4,9 @@ import org.ahocorasick.interval.Interval;
|
||||
import org.ahocorasick.interval.Intervalable;
|
||||
|
||||
public class Emit extends Interval implements Intervalable {
|
||||
|
||||
private final String keyword;
|
||||
|
||||
public Emit(final int start, final int end, final String keyword) {
|
||||
public Emit(final int start, final int end, String keyword) {
|
||||
super(start, end);
|
||||
this.keyword = keyword;
|
||||
}
|
||||
@ -20,4 +19,5 @@ public class Emit extends Interval implements Intervalable {
|
||||
public String toString() {
|
||||
return super.toString() + "=" + this.keyword;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -15,4 +15,5 @@ public class FragmentToken extends Token {
|
||||
public Emit getEmit() {
|
||||
return null;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -7,6 +7,7 @@ public class MatchToken extends Token {
|
||||
public MatchToken(final String fragment, final Emit emit) {
|
||||
super(fragment);
|
||||
this.emit = emit;
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
|
||||
33
src/main/java/org/ahocorasick/trie/Payload.java
Normal file
33
src/main/java/org/ahocorasick/trie/Payload.java
Normal file
@ -0,0 +1,33 @@
|
||||
package org.ahocorasick.trie;
|
||||
|
||||
/**
|
||||
* Payload holds the matched keyword and some payload-data.
|
||||
*
|
||||
* @author Daniel Beck
|
||||
*
|
||||
* @param <T> The type of the wrapped payload data.
|
||||
*/
|
||||
public class Payload<T> implements Comparable<Payload<T>> {
|
||||
|
||||
private final String keyword;
|
||||
private final T data;
|
||||
|
||||
public Payload(final String keyword, final T data) {
|
||||
super();
|
||||
this.keyword = keyword;
|
||||
this.data = data;
|
||||
}
|
||||
|
||||
public String getKeyword() {
|
||||
return keyword;
|
||||
}
|
||||
|
||||
public T getData() {
|
||||
return data;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(Payload<T> other) {
|
||||
return keyword.compareTo(other.getKeyword());
|
||||
}
|
||||
}
|
||||
50
src/main/java/org/ahocorasick/trie/PayloadEmit.java
Normal file
50
src/main/java/org/ahocorasick/trie/PayloadEmit.java
Normal file
@ -0,0 +1,50 @@
|
||||
package org.ahocorasick.trie;
|
||||
|
||||
import org.ahocorasick.interval.Interval;
|
||||
import org.ahocorasick.interval.Intervalable;
|
||||
|
||||
/**
|
||||
* PayloadEmit contains a matched term and its associated payload data.
|
||||
*
|
||||
* @param <T> Type of the wrapped payload-data.
|
||||
* @author Daniel Beck
|
||||
*
|
||||
*/
|
||||
public class PayloadEmit<T> extends Interval implements Intervalable {
|
||||
|
||||
private final String keyword;
|
||||
|
||||
private final T payload;
|
||||
|
||||
/**
|
||||
* Created a PayloadEmit
|
||||
*
|
||||
* @param start Start of the matched search term.
|
||||
* @param end End of the matched search term.
|
||||
* @param keyword Keyword that matched.
|
||||
* @param payload Emitted payload data.
|
||||
*/
|
||||
public PayloadEmit(final int start, final int end, String keyword, T payload) {
|
||||
super(start, end);
|
||||
this.keyword = keyword;
|
||||
this.payload = payload;
|
||||
}
|
||||
|
||||
public String getKeyword() {
|
||||
return this.keyword;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the payload associated to this emit.
|
||||
*
|
||||
* @return the associated payload
|
||||
*/
|
||||
public T getPayload() {
|
||||
return this.payload;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return super.toString() + "=" + this.keyword + (this.payload != null ? "->" + this.payload : "");
|
||||
}
|
||||
}
|
||||
31
src/main/java/org/ahocorasick/trie/PayloadFragmentToken.java
Normal file
31
src/main/java/org/ahocorasick/trie/PayloadFragmentToken.java
Normal file
@ -0,0 +1,31 @@
|
||||
package org.ahocorasick.trie;
|
||||
|
||||
/***
|
||||
* PayloadFragmentToken holds a text ("the fragment").
|
||||
* <p>
|
||||
* It does not matches a search term - so its <code>isMatch</code>-method
|
||||
* returns always false. <code>getEmits</code> returns not Emits.
|
||||
*
|
||||
* @author Daniel Beck
|
||||
*
|
||||
* @param <T> The Type of the emitted payloads.
|
||||
*/
|
||||
public class PayloadFragmentToken<T> extends PayloadToken<T> {
|
||||
|
||||
public PayloadFragmentToken(String fragment) {
|
||||
super(fragment);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isMatch() {
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns null.
|
||||
*/
|
||||
@Override
|
||||
public PayloadEmit<T> getEmit() {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
31
src/main/java/org/ahocorasick/trie/PayloadMatchToken.java
Normal file
31
src/main/java/org/ahocorasick/trie/PayloadMatchToken.java
Normal file
@ -0,0 +1,31 @@
|
||||
package org.ahocorasick.trie;
|
||||
|
||||
/**
|
||||
* PayloadMatchToken holds a text ("the fragment") an emits some output.
|
||||
* <p>
|
||||
* It matches a search term - so its <code>isMatch</code>-method returns always
|
||||
* true..
|
||||
*
|
||||
* @author Daniel Beck
|
||||
*
|
||||
* @param <T> The Type of the emitted payloads.
|
||||
*/
|
||||
public class PayloadMatchToken<T> extends PayloadToken<T> {
|
||||
|
||||
private final PayloadEmit<T> emit;
|
||||
|
||||
public PayloadMatchToken(final String fragment, final PayloadEmit<T> emit) {
|
||||
super(fragment);
|
||||
this.emit = emit;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isMatch() {
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public PayloadEmit<T> getEmit() {
|
||||
return this.emit;
|
||||
}
|
||||
}
|
||||
156
src/main/java/org/ahocorasick/trie/PayloadState.java
Normal file
156
src/main/java/org/ahocorasick/trie/PayloadState.java
Normal file
@ -0,0 +1,156 @@
|
||||
package org.ahocorasick.trie;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* A state has various important tasks it must attend to:
|
||||
* </p>
|
||||
* <p>
|
||||
* <ul>
|
||||
* <li>success; when a character points to another state, it must return that
|
||||
* state</li>
|
||||
* <li>failure; when a character has no matching state, the algorithm must be
|
||||
* able to fall back on a state with less depth</li>
|
||||
* <li>emits; when this state is passed and keywords have been matched, the
|
||||
* matches and their payloads must be 'emitted' so that they can be used later
|
||||
* on.</li>
|
||||
* </ul>
|
||||
* <p>
|
||||
* <p>
|
||||
* The root state is special in the sense that it has no failure state; it
|
||||
* cannot fail. If it 'fails' it will still parse the next character and start
|
||||
* from the root node. This ensures that the algorithm always runs. All other
|
||||
* states always have a fail state.
|
||||
* </p>
|
||||
*
|
||||
* @author Daniel Beck
|
||||
*/
|
||||
public class PayloadState<T> {
|
||||
|
||||
/**
|
||||
* effective the size of the keyword
|
||||
*/
|
||||
private final int depth;
|
||||
|
||||
/**
|
||||
* only used for the root state to refer to itself in case no matches have been
|
||||
* found
|
||||
*/
|
||||
private final PayloadState<T> rootState;
|
||||
|
||||
/**
|
||||
* referred to in the white paper as the 'goto' structure. From a state it is
|
||||
* possible to go to other states, depending on the character passed.
|
||||
*/
|
||||
private final Map<Character, PayloadState<T>> success = new HashMap<>();
|
||||
|
||||
/**
|
||||
* if no matching states are found, the failure state will be returned
|
||||
*/
|
||||
private PayloadState<T> failure;
|
||||
|
||||
/**
|
||||
* whenever this state is reached, it will emit the matches keywords for future
|
||||
* reference
|
||||
*/
|
||||
private Set<Payload<T>> emits;
|
||||
|
||||
public PayloadState() {
|
||||
this(0);
|
||||
}
|
||||
|
||||
public PayloadState(final int depth) {
|
||||
this.depth = depth;
|
||||
this.rootState = depth == 0 ? this : null;
|
||||
}
|
||||
|
||||
private PayloadState<T> nextState(final Character character, final boolean ignoreRootState) {
|
||||
PayloadState<T> nextState = this.success.get(character);
|
||||
|
||||
if (!ignoreRootState && nextState == null && this.rootState != null) {
|
||||
nextState = this.rootState;
|
||||
}
|
||||
|
||||
return nextState;
|
||||
}
|
||||
|
||||
public PayloadState<T> nextState(final Character character) {
|
||||
return nextState(character, false);
|
||||
}
|
||||
|
||||
public PayloadState<T> nextStateIgnoreRootState(Character character) {
|
||||
return nextState(character, true);
|
||||
}
|
||||
|
||||
public PayloadState<T> addState(String keyword) {
|
||||
PayloadState<T> state = this;
|
||||
|
||||
for (final Character character : keyword.toCharArray()) {
|
||||
state = state.addState(character);
|
||||
}
|
||||
|
||||
return state;
|
||||
}
|
||||
|
||||
public PayloadState<T> addState(Character character) {
|
||||
PayloadState<T> nextState = nextStateIgnoreRootState(character);
|
||||
if (nextState == null) {
|
||||
nextState = new PayloadState<T>(this.depth + 1);
|
||||
this.success.put(character, nextState);
|
||||
}
|
||||
return nextState;
|
||||
}
|
||||
|
||||
public int getDepth() {
|
||||
return this.depth;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a payload to be emitted for this state.
|
||||
*
|
||||
* @param emit Payload to be emitted.
|
||||
*/
|
||||
public void addEmit(Payload<T> payload) {
|
||||
if (this.emits == null) {
|
||||
this.emits = new TreeSet<>();
|
||||
}
|
||||
this.emits.add(payload);
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a collection of payloads to be emitted for this state.
|
||||
*
|
||||
* @param emits Collection of payloads to be emitted.
|
||||
*/
|
||||
public void addEmit(Collection<Payload<T>> emits) {
|
||||
for (Payload<T> emit : emits) {
|
||||
addEmit(emit);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a collection of emitted payloads for this state.
|
||||
*
|
||||
* @return Collection of emitted payloads.
|
||||
*/
|
||||
public Collection<Payload<T>> emit() {
|
||||
return this.emits == null ? Collections.<Payload<T>>emptyList() : this.emits;
|
||||
}
|
||||
|
||||
public PayloadState<T> failure() {
|
||||
return this.failure;
|
||||
}
|
||||
|
||||
public void setFailure(PayloadState<T> failState) {
|
||||
this.failure = failState;
|
||||
}
|
||||
|
||||
public Collection<PayloadState<T>> getStates() {
|
||||
return this.success.values();
|
||||
}
|
||||
|
||||
public Collection<Character> getTransitions() {
|
||||
return this.success.keySet();
|
||||
}
|
||||
}
|
||||
28
src/main/java/org/ahocorasick/trie/PayloadToken.java
Normal file
28
src/main/java/org/ahocorasick/trie/PayloadToken.java
Normal file
@ -0,0 +1,28 @@
|
||||
package org.ahocorasick.trie;
|
||||
|
||||
/***
|
||||
* PayloadToken holds a text ("the fragment") an emits some output. If
|
||||
* <code>isMatch</code> returns true, the token matched a search term.
|
||||
*
|
||||
* @author Daniel Beck
|
||||
*
|
||||
* @param <T> The Type of the emitted payloads.
|
||||
*/
|
||||
public abstract class PayloadToken<T> {
|
||||
private String fragment;
|
||||
|
||||
public PayloadToken(String fragment) {
|
||||
this.fragment = fragment;
|
||||
}
|
||||
|
||||
public String getFragment() {
|
||||
return this.fragment;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if a search term matched.
|
||||
*/
|
||||
public abstract boolean isMatch();
|
||||
|
||||
public abstract PayloadEmit<T> getEmit();
|
||||
}
|
||||
495
src/main/java/org/ahocorasick/trie/PayloadTrie.java
Normal file
495
src/main/java/org/ahocorasick/trie/PayloadTrie.java
Normal file
@ -0,0 +1,495 @@
|
||||
package org.ahocorasick.trie;
|
||||
|
||||
import static java.lang.Character.isWhitespace;
|
||||
|
||||
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.Queue;
import java.util.concurrent.LinkedBlockingDeque;
|
||||
|
||||
import org.ahocorasick.interval.IntervalTree;
|
||||
import org.ahocorasick.interval.Intervalable;
|
||||
import org.ahocorasick.trie.handler.DefaultPayloadEmitHandler;
|
||||
import org.ahocorasick.trie.handler.PayloadEmitHandler;
|
||||
import org.ahocorasick.trie.handler.StatefulPayloadEmitHandler;
|
||||
import org.ahocorasick.util.ListElementRemoval;
|
||||
import org.ahocorasick.util.ListElementRemoval.RemoveElementPredicate;
|
||||
|
||||
/**
|
||||
* A trie implementation, based on the Aho-Corasick white paper, Bell
|
||||
* technologies: http://cr.yp.to/bib/1975/aho.pdf
|
||||
* <p>
|
||||
*
|
||||
* The payload trie adds the possibility to specify emitted payloads for each
|
||||
* added keyword.
|
||||
*
|
||||
* @author Daniel Beck
|
||||
* @param <T> The type of the supplied of the payload
|
||||
*/
|
||||
public class PayloadTrie<T> {
|
||||
|
||||
private final TrieConfig trieConfig;
|
||||
|
||||
private final PayloadState<T> rootState;
|
||||
|
||||
protected PayloadTrie(final TrieConfig trieConfig) {
|
||||
this.trieConfig = trieConfig;
|
||||
this.rootState = new PayloadState<>();
|
||||
}
|
||||
|
||||
/**
|
||||
* Used by the builder to add a text search keyword with a emit payload.
|
||||
*
|
||||
* @param keyword The search term to add to the list of search terms.
|
||||
* @param emit the payload to emit for this search term.
|
||||
* @throws NullPointerException if the keyword is null.
|
||||
*/
|
||||
private void addKeyword(String keyword, T emit) {
|
||||
if (keyword.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (isCaseInsensitive()) {
|
||||
keyword = keyword.toLowerCase();
|
||||
}
|
||||
|
||||
addState(keyword).addEmit(new Payload<T>(keyword, emit));
|
||||
}
|
||||
|
||||
/**
|
||||
* Used by the builder to add a text search keyword.
|
||||
*
|
||||
* @param keyword The search term to add to the list of search terms.
|
||||
* @throws NullPointerException if the keyword is null.
|
||||
*/
|
||||
private void addKeyword(String keyword) {
|
||||
if (keyword.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (isCaseInsensitive()) {
|
||||
keyword = keyword.toLowerCase();
|
||||
}
|
||||
|
||||
addState(keyword).addEmit(new Payload<T>(keyword, null));
|
||||
}
|
||||
|
||||
private PayloadState<T> addState(final String keyword) {
|
||||
return getRootState().addState(keyword);
|
||||
}
|
||||
|
||||
/**
|
||||
* Tokenizes the specified text and returns the emitted outputs.
|
||||
*
|
||||
* @param text The text to tokenize.
|
||||
*/
|
||||
public Collection<PayloadToken<T>> tokenize(final String text) {
|
||||
final Collection<PayloadToken<T>> tokens = new ArrayList<>();
|
||||
final Collection<PayloadEmit<T>> collectedEmits = parseText(text);
|
||||
int lastCollectedPosition = -1;
|
||||
|
||||
for (final PayloadEmit<T> emit : collectedEmits) {
|
||||
if (emit.getStart() - lastCollectedPosition > 1) {
|
||||
tokens.add((PayloadToken<T>) createFragment(emit, text, lastCollectedPosition));
|
||||
}
|
||||
|
||||
tokens.add(createMatch(emit, text));
|
||||
lastCollectedPosition = emit.getEnd();
|
||||
}
|
||||
|
||||
if (text.length() - lastCollectedPosition > 1) {
|
||||
tokens.add((PayloadToken<T>) createFragment(null, text, lastCollectedPosition));
|
||||
}
|
||||
|
||||
return tokens;
|
||||
}
|
||||
|
||||
private PayloadToken<T> createFragment(final PayloadEmit<T> emit, final String text, final int lastCollectedPosition) {
|
||||
return new PayloadFragmentToken<T>(
|
||||
text.substring(lastCollectedPosition + 1, emit == null ? text.length() : emit.getStart()));
|
||||
}
|
||||
|
||||
private PayloadToken<T> createMatch(PayloadEmit<T> emit, String text) {
|
||||
return new PayloadMatchToken<T>(text.substring(emit.getStart(), emit.getEnd() + 1), emit);
|
||||
}
|
||||
|
||||
/**
|
||||
* Tokenizes a specified text and returns the emitted outputs.
|
||||
*
|
||||
* @param text The character sequence to tokenize.
|
||||
* @return A collection of emits.
|
||||
*/
|
||||
public Collection<PayloadEmit<T>> parseText(final CharSequence text) {
|
||||
return parseText(text, new DefaultPayloadEmitHandler<T>());
|
||||
}
|
||||
|
||||
/**
|
||||
* Tokenizes the specified text by using a custom EmitHandler and returns the
|
||||
* emitted outputs.
|
||||
*
|
||||
* @param text The character sequence to tokenize.
|
||||
* @param emitHandler The emit handler that will be used to parse the text.
|
||||
* @return A collection of emits.
|
||||
*/
|
||||
@SuppressWarnings("unchecked")
|
||||
public Collection<PayloadEmit<T>> parseText(final CharSequence text, final StatefulPayloadEmitHandler<T> emitHandler) {
|
||||
parseText(text, (PayloadEmitHandler<T>) emitHandler);
|
||||
|
||||
final List<PayloadEmit<T>> collectedEmits = emitHandler.getEmits();
|
||||
|
||||
if (trieConfig.isOnlyWholeWords()) {
|
||||
removePartialMatches(text, collectedEmits);
|
||||
}
|
||||
|
||||
if (trieConfig.isOnlyWholeWordsWhiteSpaceSeparated()) {
|
||||
removePartialMatchesWhiteSpaceSeparated(text, collectedEmits);
|
||||
}
|
||||
|
||||
if (!trieConfig.isAllowOverlaps()) {
|
||||
IntervalTree intervalTree = new IntervalTree((List<Intervalable>) (List<?>) collectedEmits);
|
||||
intervalTree.removeOverlaps((List<Intervalable>) (List<?>) collectedEmits);
|
||||
}
|
||||
|
||||
return collectedEmits;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if the text contains contains one of the search terms. Else,
|
||||
* returns false.
|
||||
*
|
||||
* @param Text Specified text.
|
||||
* @return true if the text contains one of the search terms. Else, returns
|
||||
* false.
|
||||
*/
|
||||
public boolean containsMatch(final CharSequence text) {
|
||||
return firstMatch(text) != null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Tokenizes the specified text by using a custom EmitHandler and returns the
|
||||
* emitted outputs.
|
||||
*
|
||||
* @param text The character sequence to tokenize.
|
||||
* @param emitHandler The emit handler that will be used to parse the text.
|
||||
* @return A collection of emits.
|
||||
*/
|
||||
|
||||
public void parseText(final CharSequence text, final PayloadEmitHandler<T> emitHandler) {
|
||||
PayloadState<T> currentState = getRootState();
|
||||
|
||||
for (int position = 0; position < text.length(); position++) {
|
||||
Character character = text.charAt(position);
|
||||
|
||||
// TODO: Maybe lowercase the entire string at once?
|
||||
if (trieConfig.isCaseInsensitive()) {
|
||||
character = Character.toLowerCase(character);
|
||||
}
|
||||
|
||||
currentState = getState(currentState, character);
|
||||
if (storeEmits(position, currentState, emitHandler) && trieConfig.isStopOnHit()) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* The first matching text sequence.
|
||||
*
|
||||
* @param text The text to search for keywords.
|
||||
* @return null if no matches found.
|
||||
*/
|
||||
public PayloadEmit<T> firstMatch(final CharSequence text) {
|
||||
if (!trieConfig.isAllowOverlaps()) {
|
||||
// Slow path. Needs to find all the matches to detect overlaps.
|
||||
final Collection<PayloadEmit<T>> parseText = parseText(text);
|
||||
|
||||
if (parseText != null && !parseText.isEmpty()) {
|
||||
return parseText.iterator().next();
|
||||
}
|
||||
} else {
|
||||
// Fast path. Returns first match found.
|
||||
PayloadState<T> currentState = getRootState();
|
||||
|
||||
for (int position = 0; position < text.length(); position++) {
|
||||
Character character = text.charAt(position);
|
||||
|
||||
// TODO: Lowercase the entire string at once?
|
||||
if (trieConfig.isCaseInsensitive()) {
|
||||
character = Character.toLowerCase(character);
|
||||
}
|
||||
|
||||
currentState = getState(currentState, character);
|
||||
Collection<Payload<T>> payloads = currentState.emit();
|
||||
|
||||
if (payloads != null && !payloads.isEmpty()) {
|
||||
for (final Payload<T> payload : payloads) {
|
||||
final PayloadEmit<T> emit = new PayloadEmit<>(position - payload.getKeyword().length() + 1, position,
|
||||
payload.getKeyword(), payload.getData());
|
||||
if (trieConfig.isOnlyWholeWords()) {
|
||||
if (!isPartialMatch(text, emit)) {
|
||||
return emit;
|
||||
}
|
||||
} else {
|
||||
return emit;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
private boolean isPartialMatch(final CharSequence searchText, final PayloadEmit<T> emit) {
|
||||
return (emit.getStart() != 0 && Character.isAlphabetic(searchText.charAt(emit.getStart() - 1)))
|
||||
|| (emit.getEnd() + 1 != searchText.length() && Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1)));
|
||||
}
|
||||
|
||||
private void removePartialMatches(final CharSequence searchText, final List<PayloadEmit<T>> collectedEmits) {
|
||||
|
||||
final RemoveElementPredicate<PayloadEmit<T>> predicate = new RemoveElementPredicate<PayloadEmit<T>>() {
|
||||
|
||||
@Override
|
||||
public boolean remove(PayloadEmit<T> emit) {
|
||||
return isPartialMatch(searchText, emit);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
ListElementRemoval.removeIf(collectedEmits, predicate);
|
||||
}
|
||||
|
||||
private void removePartialMatchesWhiteSpaceSeparated(final CharSequence searchText,
|
||||
final List<PayloadEmit<T>> collectedEmits) {
|
||||
final long size = searchText.length();
|
||||
final List<PayloadEmit<T>> removeEmits = new ArrayList<>();
|
||||
|
||||
for (final PayloadEmit<T> emit : collectedEmits) {
|
||||
if ((emit.getStart() == 0 || isWhitespace(searchText.charAt(emit.getStart() - 1)))
|
||||
&& (emit.getEnd() + 1 == size || isWhitespace(searchText.charAt(emit.getEnd() + 1)))) {
|
||||
continue;
|
||||
}
|
||||
removeEmits.add(emit);
|
||||
}
|
||||
|
||||
for (final PayloadEmit<T> removeEmit : removeEmits) {
|
||||
collectedEmits.remove(removeEmit);
|
||||
}
|
||||
}
|
||||
|
||||
private PayloadState<T> getState(PayloadState<T> currentState, final Character character) {
|
||||
PayloadState<T> newCurrentState = currentState.nextState(character);
|
||||
|
||||
while (newCurrentState == null) {
|
||||
currentState = currentState.failure();
|
||||
newCurrentState = currentState.nextState(character);
|
||||
}
|
||||
|
||||
return newCurrentState;
|
||||
}
|
||||
|
||||
private void constructFailureStates() {
|
||||
final Queue<PayloadState<T>> queue = new LinkedBlockingDeque<>();
|
||||
final PayloadState<T> startState = getRootState();
|
||||
|
||||
// First, set the fail state of all depth 1 states to the root state
|
||||
for (PayloadState<T> depthOneState : startState.getStates()) {
|
||||
depthOneState.setFailure(startState);
|
||||
queue.add(depthOneState);
|
||||
}
|
||||
|
||||
// Second, determine the fail state for all depth > 1 state
|
||||
while (!queue.isEmpty()) {
|
||||
final PayloadState<T> currentState = queue.remove();
|
||||
|
||||
for (final Character transition : currentState.getTransitions()) {
|
||||
PayloadState<T> targetState = currentState.nextState(transition);
|
||||
queue.add(targetState);
|
||||
|
||||
PayloadState<T> traceFailureState = currentState.failure();
|
||||
while (traceFailureState.nextState(transition) == null) {
|
||||
traceFailureState = traceFailureState.failure();
|
||||
}
|
||||
|
||||
final PayloadState<T> newFailureState = traceFailureState.nextState(transition);
|
||||
targetState.setFailure(newFailureState);
|
||||
targetState.addEmit(newFailureState.emit());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private boolean storeEmits(final int position, final PayloadState<T> currentState, final PayloadEmitHandler<T> emitHandler) {
|
||||
boolean emitted = false;
|
||||
final Collection<Payload<T>> payloads = currentState.emit();
|
||||
|
||||
// TODO: The check for empty might be superfluous.
|
||||
if (payloads != null && !payloads.isEmpty()) {
|
||||
for (final Payload<T> payload : payloads) {
|
||||
emitted = emitHandler.emit(new PayloadEmit<T>(position - payload.getKeyword().length() + 1, position,
|
||||
payload.getKeyword(), payload.getData())) || emitted;
|
||||
|
||||
if (emitted && trieConfig.isStopOnHit()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return emitted;
|
||||
}
|
||||
|
||||
private boolean isCaseInsensitive() {
|
||||
return trieConfig.isCaseInsensitive();
|
||||
}
|
||||
|
||||
private PayloadState<T> getRootState() {
|
||||
return this.rootState;
|
||||
}
|
||||
|
||||
/**
|
||||
* Provides a fluent interface for constructing Trie instances with payloads.
|
||||
*
|
||||
* @return The builder used to configure its Trie.
|
||||
*/
|
||||
public static <T> PayloadTrieBuilder<T> builder() {
|
||||
return new PayloadTrieBuilder<T>();
|
||||
}
|
||||
|
||||
/**
|
||||
* Builder class to create a PayloadTrie instance.
|
||||
*
|
||||
* @param <T> The type of the emitted payload.
|
||||
*/
|
||||
public static class PayloadTrieBuilder<T> {
|
||||
|
||||
private final TrieConfig trieConfig = new TrieConfig();
|
||||
|
||||
private final PayloadTrie<T> trie = new PayloadTrie<>(trieConfig);
|
||||
|
||||
/**
|
||||
* Default (empty) constructor.
|
||||
*/
|
||||
private PayloadTrieBuilder() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Configure the Trie to ignore case when searching for keywords in the text.
|
||||
* This must be called before calling addKeyword because the algorithm converts
|
||||
* keywords to lowercase as they are added, depending on this case sensitivity
|
||||
* setting.
|
||||
*
|
||||
* @return This builder.
|
||||
*/
|
||||
public PayloadTrieBuilder<T> ignoreCase() {
|
||||
this.trieConfig.setCaseInsensitive(true);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Configure the Trie to ignore overlapping keywords.
|
||||
*
|
||||
* @return This builder.
|
||||
*/
|
||||
public PayloadTrieBuilder<T> ignoreOverlaps() {
|
||||
this.trieConfig.setAllowOverlaps(false);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a keyword to the Trie's list of text search keywords. No Payload is
|
||||
* supplied.
|
||||
*
|
||||
* @param keyword The keyword to add to the list.
|
||||
* @return This builder.
|
||||
* @throws NullPointerException if the keyword is null.
|
||||
*/
|
||||
public PayloadTrieBuilder<T> addKeyword(final String keyword) {
|
||||
this.trie.addKeyword(keyword);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a keyword and a payload to the Trie's list of text search keywords.
|
||||
*
|
||||
* @param keyword The keyword to add to the list.
|
||||
* @return This builder.
|
||||
* @throws NullPointerException if the keyword is null.
|
||||
*/
|
||||
public PayloadTrieBuilder<T> addKeyword(final String keyword, final T payload) {
|
||||
this.trie.addKeyword(keyword, payload);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a list of keywords and payloads to the Trie's list of text search
|
||||
* keywords.
|
||||
*
|
||||
* @param keywords The keywords to add to the list.
|
||||
* @return This builder.
|
||||
*/
|
||||
public PayloadTrieBuilder<T> addKeywords(final Collection<Payload<T>> keywords) {
|
||||
for (Payload<T> payload : keywords) {
|
||||
this.trie.addKeyword(payload.getKeyword(), payload.getData());
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Configure the Trie to match whole keywords in the text.
|
||||
*
|
||||
* @return This builder.
|
||||
*/
|
||||
public PayloadTrieBuilder<T> onlyWholeWords() {
|
||||
this.trieConfig.setOnlyWholeWords(true);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Configure the Trie to match whole keywords that are separated by whitespace
|
||||
* in the text. For example, "this keyword thatkeyword" would only match the
|
||||
* first occurrence of "keyword".
|
||||
*
|
||||
* @return This builder.
|
||||
*/
|
||||
public PayloadTrieBuilder<T> onlyWholeWordsWhiteSpaceSeparated() {
|
||||
this.trieConfig.setOnlyWholeWordsWhiteSpaceSeparated(true);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Configure the Trie to stop after the first keyword is found in the text.
|
||||
*
|
||||
* @return This builder.
|
||||
*/
|
||||
public PayloadTrieBuilder<T> stopOnHit() {
|
||||
trie.trieConfig.setStopOnHit(true);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Configure the PayloadTrie based on the builder settings.
|
||||
*
|
||||
* @return The configured PayloadTrie.
|
||||
*/
|
||||
public PayloadTrie<T> build() {
|
||||
this.trie.constructFailureStates();
|
||||
return this.trie;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return This builder.
|
||||
* @deprecated Use ignoreCase()
|
||||
*/
|
||||
public PayloadTrieBuilder<T> caseInsensitive() {
|
||||
return ignoreCase();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return This builder.
|
||||
* @deprecated Use ignoreOverlaps()
|
||||
*/
|
||||
public PayloadTrieBuilder<T> removeOverlaps() {
|
||||
return ignoreOverlaps();
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -133,4 +133,4 @@ public class State {
|
||||
public Collection<Character> getTransitions() {
|
||||
return this.success.keySet();
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1,7 +1,6 @@
|
||||
package org.ahocorasick.trie;
|
||||
|
||||
public abstract class Token {
|
||||
|
||||
private String fragment;
|
||||
|
||||
public Token(String fragment) {
|
||||
@ -15,5 +14,4 @@ public abstract class Token {
|
||||
public abstract boolean isMatch();
|
||||
|
||||
public abstract Emit getEmit();
|
||||
|
||||
}
|
||||
|
||||
@ -1,20 +1,13 @@
|
||||
package org.ahocorasick.trie;
|
||||
|
||||
import org.ahocorasick.interval.IntervalTree;
|
||||
import org.ahocorasick.interval.Intervalable;
|
||||
import org.ahocorasick.trie.handler.DefaultEmitHandler;
|
||||
import org.ahocorasick.trie.handler.EmitHandler;
|
||||
import org.ahocorasick.trie.handler.StatefulEmitHandler;
|
||||
import org.ahocorasick.util.ListElementRemoval;
|
||||
import org.ahocorasick.util.ListElementRemoval.RemoveElementPredicate;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Queue;
|
||||
import java.util.concurrent.LinkedBlockingDeque;
|
||||
|
||||
import static java.lang.Character.isWhitespace;
|
||||
import org.ahocorasick.trie.PayloadTrie.PayloadTrieBuilder;
|
||||
import org.ahocorasick.trie.handler.EmitHandler;
|
||||
import org.ahocorasick.trie.handler.StatefulPayloadEmitDelegateHandler;
|
||||
import org.ahocorasick.trie.handler.PayloadEmitDelegateHandler;
|
||||
import org.ahocorasick.trie.handler.StatefulEmitHandler;
|
||||
|
||||
/**
|
||||
* Based on the Aho-Corasick white paper, Bell technologies:
|
||||
@ -24,112 +17,47 @@ import static java.lang.Character.isWhitespace;
|
||||
*/
|
||||
public class Trie {
|
||||
|
||||
private final TrieConfig trieConfig;
|
||||
private final PayloadTrie<String> payloadTrie;
|
||||
|
||||
private final State rootState;
|
||||
|
||||
private Trie(final TrieConfig trieConfig) {
|
||||
this.trieConfig = trieConfig;
|
||||
this.rootState = new State();
|
||||
}
|
||||
|
||||
/**
|
||||
* Used by the builder to add a text search keyword.
|
||||
*
|
||||
* @param keyword The search term to add to the list of search terms.
|
||||
* @throws NullPointerException if the keyword is null.
|
||||
*/
|
||||
private void addKeyword(String keyword) {
|
||||
if (keyword.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (isCaseInsensitive()) {
|
||||
keyword = keyword.toLowerCase();
|
||||
}
|
||||
|
||||
addState(keyword).addEmit(keyword);
|
||||
}
|
||||
|
||||
/**
|
||||
* Delegates to addKeyword.
|
||||
*
|
||||
* @param keywords List of search term to add to the list of search terms.
|
||||
*/
|
||||
private void addKeywords(final String[] keywords) {
|
||||
for (final String keyword : keywords) {
|
||||
addKeyword(keyword);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Delegates to addKeyword.
|
||||
*
|
||||
* @param keywords List of search term to add to the list of search terms.
|
||||
*/
|
||||
private void addKeywords(final Collection<String> keywords) {
|
||||
for (final String keyword : keywords) {
|
||||
addKeyword(keyword);
|
||||
}
|
||||
}
|
||||
|
||||
private State addState(final String keyword) {
|
||||
return getRootState().addState(keyword);
|
||||
private Trie(final PayloadTrie<String> payloadTrie) {
|
||||
this.payloadTrie = payloadTrie;
|
||||
}
|
||||
|
||||
public Collection<Token> tokenize(final String text) {
|
||||
final Collection<Token> tokens = new ArrayList<>();
|
||||
final Collection<Emit> collectedEmits = parseText(text);
|
||||
int lastCollectedPosition = -1;
|
||||
|
||||
for (final Emit emit : collectedEmits) {
|
||||
if (emit.getStart() - lastCollectedPosition > 1) {
|
||||
tokens.add(createFragment(emit, text, lastCollectedPosition));
|
||||
}
|
||||
|
||||
tokens.add(createMatch(emit, text));
|
||||
lastCollectedPosition = emit.getEnd();
|
||||
}
|
||||
|
||||
if (text.length() - lastCollectedPosition > 1) {
|
||||
tokens.add(createFragment(null, text, lastCollectedPosition));
|
||||
}
|
||||
|
||||
return tokens;
|
||||
Collection<PayloadToken<String>> tokens = this.payloadTrie.tokenize(text);
|
||||
return asTokens(tokens);
|
||||
}
|
||||
|
||||
private Token createFragment(final Emit emit, final String text, final int lastCollectedPosition) {
|
||||
return new FragmentToken(text.substring(lastCollectedPosition + 1, emit == null ? text.length() : emit.getStart()));
|
||||
private static Collection<Token> asTokens(Collection<PayloadToken<String>> tokens) {
|
||||
Collection<Token> result = new ArrayList<>();
|
||||
for (PayloadToken<String> payloadToken : tokens) {
|
||||
result.add(new DefaultToken(payloadToken));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private Token createMatch(Emit emit, String text) {
|
||||
return new MatchToken(text.substring(emit.getStart(), emit.getEnd() + 1), emit);
|
||||
private static Collection<Emit> asEmits(Collection<PayloadEmit<String>> emits) {
|
||||
Collection<Emit> result = new ArrayList<>();
|
||||
for (PayloadEmit<String> emit : emits) {
|
||||
result.add(asEmit(emit));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private static Emit asEmit(PayloadEmit<String> payloadEmit) {
|
||||
return new Emit(payloadEmit.getStart(), payloadEmit.getEnd(), payloadEmit.getKeyword());
|
||||
}
|
||||
|
||||
public Collection<Emit> parseText(final CharSequence text) {
|
||||
return parseText(text, new DefaultEmitHandler());
|
||||
Collection<PayloadEmit<String>> parsedText = this.payloadTrie.parseText(text);
|
||||
return asEmits(parsedText);
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public Collection<Emit> parseText(final CharSequence text, final StatefulEmitHandler emitHandler) {
|
||||
parseText(text, (EmitHandler) emitHandler);
|
||||
|
||||
final List<Emit> collectedEmits = emitHandler.getEmits();
|
||||
|
||||
if (trieConfig.isOnlyWholeWords()) {
|
||||
removePartialMatches(text, collectedEmits);
|
||||
}
|
||||
|
||||
if (trieConfig.isOnlyWholeWordsWhiteSpaceSeparated()) {
|
||||
removePartialMatchesWhiteSpaceSeparated(text, collectedEmits);
|
||||
}
|
||||
|
||||
if (!trieConfig.isAllowOverlaps()) {
|
||||
IntervalTree intervalTree = new IntervalTree((List<Intervalable>) (List<?>) collectedEmits);
|
||||
intervalTree.removeOverlaps((List<Intervalable>) (List<?>) collectedEmits);
|
||||
}
|
||||
|
||||
return collectedEmits;
|
||||
Collection<PayloadEmit<String>> parsedText = this.payloadTrie.parseText(text,
|
||||
new StatefulPayloadEmitDelegateHandler(emitHandler));
|
||||
return asEmits(parsedText);
|
||||
}
|
||||
|
||||
public boolean containsMatch(final CharSequence text) {
|
||||
@ -137,21 +65,7 @@ public class Trie {
|
||||
}
|
||||
|
||||
public void parseText(final CharSequence text, final EmitHandler emitHandler) {
|
||||
State currentState = getRootState();
|
||||
|
||||
for (int position = 0; position < text.length(); position++) {
|
||||
Character character = text.charAt(position);
|
||||
|
||||
// TODO: Maybe lowercase the entire string at once?
|
||||
if (trieConfig.isCaseInsensitive()) {
|
||||
character = Character.toLowerCase(character);
|
||||
}
|
||||
|
||||
currentState = getState(currentState, character);
|
||||
if (storeEmits(position, currentState, emitHandler) && trieConfig.isStopOnHit()) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
this.payloadTrie.parseText(text, new PayloadEmitDelegateHandler(emitHandler));
|
||||
}
|
||||
|
||||
/**
|
||||
@ -161,148 +75,8 @@ public class Trie {
|
||||
* @return null if no matches found.
|
||||
*/
|
||||
public Emit firstMatch(final CharSequence text) {
|
||||
if (!trieConfig.isAllowOverlaps()) {
|
||||
// Slow path. Needs to find all the matches to detect overlaps.
|
||||
final Collection<Emit> parseText = parseText(text);
|
||||
|
||||
if (parseText != null && !parseText.isEmpty()) {
|
||||
return parseText.iterator().next();
|
||||
}
|
||||
} else {
|
||||
// Fast path. Returns first match found.
|
||||
State currentState = getRootState();
|
||||
|
||||
for (int position = 0; position < text.length(); position++) {
|
||||
Character character = text.charAt(position);
|
||||
|
||||
// TODO: Lowercase the entire string at once?
|
||||
if (trieConfig.isCaseInsensitive()) {
|
||||
character = Character.toLowerCase(character);
|
||||
}
|
||||
|
||||
currentState = getState(currentState, character);
|
||||
Collection<String> emitStrs = currentState.emit();
|
||||
|
||||
if (emitStrs != null && !emitStrs.isEmpty()) {
|
||||
for (final String emitStr : emitStrs) {
|
||||
final Emit emit = new Emit(position - emitStr.length() + 1, position, emitStr);
|
||||
if (trieConfig.isOnlyWholeWords()) {
|
||||
if (!isPartialMatch(text, emit)) {
|
||||
return emit;
|
||||
}
|
||||
} else {
|
||||
return emit;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
private boolean isPartialMatch(final CharSequence searchText, final Emit emit) {
|
||||
return (emit.getStart() != 0 &&
|
||||
Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) ||
|
||||
(emit.getEnd() + 1 != searchText.length() &&
|
||||
Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1)));
|
||||
}
|
||||
|
||||
private void removePartialMatches(final CharSequence searchText, final List<Emit> collectedEmits) {
|
||||
|
||||
final RemoveElementPredicate<Emit> predicate = new RemoveElementPredicate<Emit>() {
|
||||
|
||||
@Override
|
||||
public boolean remove(Emit emit) {
|
||||
return isPartialMatch(searchText, emit);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
ListElementRemoval.removeIf(collectedEmits, predicate);
|
||||
}
|
||||
|
||||
private void removePartialMatchesWhiteSpaceSeparated(final CharSequence searchText, final List<Emit> collectedEmits) {
|
||||
final long size = searchText.length();
|
||||
final List<Emit> removeEmits = new ArrayList<>();
|
||||
|
||||
for (final Emit emit : collectedEmits) {
|
||||
if ((emit.getStart() == 0 || isWhitespace(searchText.charAt(emit.getStart() - 1))) &&
|
||||
(emit.getEnd() + 1 == size || isWhitespace(searchText.charAt(emit.getEnd() + 1)))) {
|
||||
continue;
|
||||
}
|
||||
removeEmits.add(emit);
|
||||
}
|
||||
|
||||
for (final Emit removeEmit : removeEmits) {
|
||||
collectedEmits.remove(removeEmit);
|
||||
}
|
||||
}
|
||||
|
||||
private State getState(State currentState, final Character character) {
|
||||
State newCurrentState = currentState.nextState(character);
|
||||
|
||||
while (newCurrentState == null) {
|
||||
currentState = currentState.failure();
|
||||
newCurrentState = currentState.nextState(character);
|
||||
}
|
||||
|
||||
return newCurrentState;
|
||||
}
|
||||
|
||||
private void constructFailureStates() {
|
||||
final Queue<State> queue = new LinkedBlockingDeque<>();
|
||||
final State startState = getRootState();
|
||||
|
||||
// First, set the fail state of all depth 1 states to the root state
|
||||
for (State depthOneState : startState.getStates()) {
|
||||
depthOneState.setFailure(startState);
|
||||
queue.add(depthOneState);
|
||||
}
|
||||
|
||||
// Second, determine the fail state for all depth > 1 state
|
||||
while (!queue.isEmpty()) {
|
||||
final State currentState = queue.remove();
|
||||
|
||||
for (final Character transition : currentState.getTransitions()) {
|
||||
State targetState = currentState.nextState(transition);
|
||||
queue.add(targetState);
|
||||
|
||||
State traceFailureState = currentState.failure();
|
||||
while (traceFailureState.nextState(transition) == null) {
|
||||
traceFailureState = traceFailureState.failure();
|
||||
}
|
||||
|
||||
final State newFailureState = traceFailureState.nextState(transition);
|
||||
targetState.setFailure(newFailureState);
|
||||
targetState.addEmit(newFailureState.emit());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private boolean storeEmits(final int position, final State currentState, final EmitHandler emitHandler) {
|
||||
boolean emitted = false;
|
||||
final Collection<String> emits = currentState.emit();
|
||||
|
||||
// TODO: The check for empty might be superfluous.
|
||||
if (emits != null && !emits.isEmpty()) {
|
||||
for (final String emit : emits) {
|
||||
emitted = emitHandler.emit(new Emit(position - emit.length() + 1, position, emit)) || emitted;
|
||||
if (emitted && trieConfig.isStopOnHit()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return emitted;
|
||||
}
|
||||
|
||||
private boolean isCaseInsensitive() {
|
||||
return trieConfig.isCaseInsensitive();
|
||||
}
|
||||
|
||||
private State getRootState() {
|
||||
return this.rootState;
|
||||
// This method is documented to return null when nothing matches, so a null
// result from the delegate is passed through instead of being dereferenced.
final PayloadEmit<String> firstMatch = this.payloadTrie.firstMatch(text);
if (firstMatch == null) {
    return null;
}
return new Emit(firstMatch.getStart(), firstMatch.getEnd(), firstMatch.getKeyword());
|
||||
}
|
||||
|
||||
/**
|
||||
@ -318,7 +92,9 @@ public class Trie {
|
||||
|
||||
private final TrieConfig trieConfig = new TrieConfig();
|
||||
|
||||
private final Trie trie = new Trie(trieConfig);
|
||||
private final PayloadTrie<String> trie = new PayloadTrie<>(trieConfig);
|
||||
|
||||
private final PayloadTrieBuilder<String> delegate = PayloadTrie.builder();
|
||||
|
||||
/**
|
||||
* Default (empty) constructor.
|
||||
@ -327,15 +103,16 @@ public class Trie {
|
||||
}
|
||||
|
||||
/**
|
||||
* Configure the Trie to ignore case when searching for keywords in
|
||||
* the text. This must be called before calling addKeyword because
|
||||
* the algorithm converts keywords to lowercase as they are added,
|
||||
* depending on this case sensitivity setting.
|
||||
* Configure the Trie to ignore case when searching for keywords in the text.
|
||||
* This must be called before calling addKeyword because the algorithm converts
|
||||
* keywords to lowercase as they are added, depending on this case sensitivity
|
||||
* setting.
|
||||
*
|
||||
* @return This builder.
|
||||
*/
|
||||
public TrieBuilder ignoreCase() {
|
||||
this.trieConfig.setCaseInsensitive(true);
|
||||
delegate.ignoreCase();
|
||||
// this.trieConfig.setCaseInsensitive(true);
|
||||
return this;
|
||||
}
|
||||
|
||||
@ -345,7 +122,7 @@ public class Trie {
|
||||
* @return This builder.
|
||||
*/
|
||||
public TrieBuilder ignoreOverlaps() {
|
||||
this.trieConfig.setAllowOverlaps(false);
|
||||
delegate.ignoreOverlaps();
|
||||
return this;
|
||||
}
|
||||
|
||||
@ -357,7 +134,7 @@ public class Trie {
|
||||
* @throws NullPointerException if the keyword is null.
|
||||
*/
|
||||
public TrieBuilder addKeyword(final String keyword) {
|
||||
this.trie.addKeyword(keyword);
|
||||
delegate.addKeyword(keyword, null);
|
||||
return this;
|
||||
}
|
||||
|
||||
@ -368,7 +145,9 @@ public class Trie {
|
||||
* @return This builder.
|
||||
*/
|
||||
public TrieBuilder addKeywords(final String... keywords) {
|
||||
this.trie.addKeywords(keywords);
|
||||
for (String keyword : keywords) {
|
||||
delegate.addKeyword(keyword, null);
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
@ -379,7 +158,9 @@ public class Trie {
|
||||
* @return This builder.
|
||||
*/
|
||||
public TrieBuilder addKeywords(final Collection<String> keywords) {
|
||||
this.trie.addKeywords(keywords);
|
||||
for (String keyword : keywords) {
|
||||
this.delegate.addKeyword(keyword, null);
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
@ -389,30 +170,29 @@ public class Trie {
|
||||
* @return This builder.
|
||||
*/
|
||||
public TrieBuilder onlyWholeWords() {
|
||||
this.trieConfig.setOnlyWholeWords(true);
|
||||
this.delegate.onlyWholeWords();
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Configure the Trie to match whole keywords that are separated by
|
||||
* whitespace in the text. For example, "this keyword thatkeyword"
|
||||
* would only match the first occurrence of "keyword".
|
||||
* Configure the Trie to match whole keywords that are separated by whitespace
|
||||
* in the text. For example, "this keyword thatkeyword" would only match the
|
||||
* first occurrence of "keyword".
|
||||
*
|
||||
* @return This builder.
|
||||
*/
|
||||
public TrieBuilder onlyWholeWordsWhiteSpaceSeparated() {
|
||||
this.trieConfig.setOnlyWholeWordsWhiteSpaceSeparated(true);
|
||||
this.delegate.onlyWholeWordsWhiteSpaceSeparated();
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Configure the Trie to stop after the first keyword is found in the
|
||||
* text.
|
||||
* Configure the Trie to stop after the first keyword is found in the text.
|
||||
*
|
||||
* @return This builder.
|
||||
*/
|
||||
public TrieBuilder stopOnHit() {
|
||||
trie.trieConfig.setStopOnHit(true);
|
||||
this.delegate.stopOnHit();
|
||||
return this;
|
||||
}
|
||||
|
||||
@ -422,8 +202,8 @@ public class Trie {
|
||||
* @return The configured Trie.
|
||||
*/
|
||||
public Trie build() {
|
||||
this.trie.constructFailureStates();
|
||||
return this.trie;
|
||||
PayloadTrie<String> payloadTrie = this.delegate.build();
|
||||
return new Trie(payloadTrie);
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@ -0,0 +1,21 @@
|
||||
package org.ahocorasick.trie.handler;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.ahocorasick.trie.PayloadEmit;
|
||||
|
||||
public abstract class AbstractStatefulPayloadEmitHandler<T> implements StatefulPayloadEmitHandler<T> {
|
||||
|
||||
private final List<PayloadEmit<T>> emits = new ArrayList<>();
|
||||
|
||||
public void addEmit(final PayloadEmit<T> emit) {
|
||||
this.emits.add(emit);
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<PayloadEmit<T>> getEmits() {
|
||||
return this.emits;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,10 +1,10 @@
|
||||
package org.ahocorasick.trie.handler;
|
||||
|
||||
import org.ahocorasick.trie.Emit;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.ahocorasick.trie.Emit;
|
||||
|
||||
public class DefaultEmitHandler implements StatefulEmitHandler {
|
||||
|
||||
private final List<Emit> emits = new ArrayList<>();
|
||||
|
||||
@ -0,0 +1,22 @@
|
||||
package org.ahocorasick.trie.handler;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.ahocorasick.trie.PayloadEmit;
|
||||
|
||||
public class DefaultPayloadEmitHandler<T> implements StatefulPayloadEmitHandler<T> {
|
||||
|
||||
private final List<PayloadEmit<T>> emits = new ArrayList<>();
|
||||
|
||||
@Override
|
||||
public boolean emit(final PayloadEmit<T> emit) {
|
||||
this.emits.add(emit);
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<PayloadEmit<T>> getEmits() {
|
||||
return this.emits;
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,24 @@
|
||||
package org.ahocorasick.trie.handler;
|
||||
|
||||
import org.ahocorasick.trie.Emit;
|
||||
import org.ahocorasick.trie.PayloadEmit;
|
||||
|
||||
/**
|
||||
* Convenience wrapper class that delegates every method to a EmitHandler.
|
||||
*/
|
||||
public class PayloadEmitDelegateHandler implements PayloadEmitHandler<String> {
|
||||
|
||||
private EmitHandler handler;
|
||||
|
||||
public PayloadEmitDelegateHandler(EmitHandler handler) {
|
||||
this.handler = handler;
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean emit(PayloadEmit<String> emit) {
|
||||
Emit newEmit = new Emit(emit.getStart(), emit.getEnd(), emit.getKeyword());
|
||||
return handler.emit(newEmit);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,7 @@
|
||||
package org.ahocorasick.trie.handler;
|
||||
|
||||
import org.ahocorasick.trie.PayloadEmit;
|
||||
|
||||
public interface PayloadEmitHandler<T> {
|
||||
boolean emit(PayloadEmit<T> emit);
|
||||
}
|
||||
@ -0,0 +1,42 @@
|
||||
package org.ahocorasick.trie.handler;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
||||
import org.ahocorasick.trie.Emit;
|
||||
import org.ahocorasick.trie.PayloadEmit;
|
||||
|
||||
/**
|
||||
* Convenience wrapper class that delegates every method to a
|
||||
* StatefullPayloadEmitHandler.
|
||||
*/
|
||||
public class StatefulPayloadEmitDelegateHandler implements StatefulPayloadEmitHandler<String> {
|
||||
|
||||
private StatefulEmitHandler handler;
|
||||
|
||||
public StatefulPayloadEmitDelegateHandler(StatefulEmitHandler handler) {
|
||||
this.handler = handler;
|
||||
|
||||
}
|
||||
|
||||
private static List<PayloadEmit<String>> asEmits(Collection<Emit> emits) {
|
||||
List<PayloadEmit<String>> result = new ArrayList<>();
|
||||
for (Emit emit : emits) {
|
||||
result.add(new PayloadEmit<String>(emit.getStart(), emit.getEnd(), emit.getKeyword(), null));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean emit(PayloadEmit<String> emit) {
|
||||
Emit newEmit = new Emit(emit.getStart(), emit.getEnd(), emit.getKeyword());
|
||||
return handler.emit(newEmit);
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<PayloadEmit<String>> getEmits() {
|
||||
List<Emit> emits = this.handler.getEmits();
|
||||
return asEmits(emits);
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,9 @@
|
||||
package org.ahocorasick.trie.handler;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.ahocorasick.trie.PayloadEmit;
|
||||
|
||||
public interface StatefulPayloadEmitHandler<T> extends PayloadEmitHandler<T>{
|
||||
List<PayloadEmit<T>> getEmits();
|
||||
}
|
||||
510
src/test/java/org/ahocorasick/trie/PayloadTrieTest.java
Normal file
510
src/test/java/org/ahocorasick/trie/PayloadTrieTest.java
Normal file
@ -0,0 +1,510 @@
|
||||
package org.ahocorasick.trie;
|
||||
|
||||
import static junit.framework.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.ThreadLocalRandom;
|
||||
|
||||
import org.ahocorasick.trie.handler.AbstractStatefulPayloadEmitHandler;
|
||||
import org.ahocorasick.trie.handler.PayloadEmitHandler;
|
||||
import org.ahocorasick.trie.handler.StatefulPayloadEmitHandler;
|
||||
import org.junit.Test;
|
||||
|
||||
public class PayloadTrieTest {
|
||||
|
||||
private final static String[] ALPHABET = new String[] { "abc", "bcd", "cde" };
|
||||
private final static String[] ALPHABET_PAYLOAD = new String[] { "alpha:abc", "alpha:bcd", "alpha:cde" };
|
||||
|
||||
private final static List<Payload<String>> ALPHABET_WITH_PAYLOADS = Arrays.asList(//
|
||||
new Payload<String>(ALPHABET[0], ALPHABET_PAYLOAD[0]), //
|
||||
new Payload<String>(ALPHABET[1], ALPHABET_PAYLOAD[1]), //
|
||||
new Payload<String>(ALPHABET[2], ALPHABET_PAYLOAD[2]));
|
||||
|
||||
private final static String[] PRONOUNS = new String[] { "hers", "his", "she", "he" };
|
||||
private final static int[] PRONOUNS_PAYLOAD_ID = new int[] { 9, 12, 4, 20 };
|
||||
|
||||
private final static List<Payload<Integer>> PRONOUNS_WITH_PAYLOADS = Arrays.asList(//
|
||||
new Payload<Integer>(PRONOUNS[0], PRONOUNS_PAYLOAD_ID[0]), //
|
||||
new Payload<Integer>(PRONOUNS[1], PRONOUNS_PAYLOAD_ID[1]), //
|
||||
new Payload<Integer>(PRONOUNS[2], PRONOUNS_PAYLOAD_ID[2]), //
|
||||
new Payload<Integer>(PRONOUNS[3], PRONOUNS_PAYLOAD_ID[3]) //
|
||||
);
|
||||
|
||||
private final static String[] FOOD = new String[] { "veal", "cauliflower", "broccoli", "tomatoes" };
|
||||
private final static Food[] FOOD_PAYLOAD = new Food[] { new Food("veal"), new Food("cauliflower"), new Food("broccoli"),
|
||||
new Food("tomatoes") };
|
||||
|
||||
private final static List<Payload<Food>> FOOD_WITH_PAYLOADS = Arrays.asList(//
|
||||
new Payload<Food>(FOOD[0], FOOD_PAYLOAD[0]), //
|
||||
new Payload<Food>(FOOD[1], FOOD_PAYLOAD[1]), //
|
||||
new Payload<Food>(FOOD[2], FOOD_PAYLOAD[2]), //
|
||||
new Payload<Food>(FOOD[3], FOOD_PAYLOAD[3]) //
|
||||
);
|
||||
|
||||
private final static String[] GREEK_LETTERS = new String[] { "Alpha", "Beta", "Gamma" };
|
||||
private final static String[] GREEK_LETTERS_PAYLOAD = new String[] { "greek:Alpha", "greek:Beta", "greek:Gamma" };
|
||||
|
||||
private final static List<Payload<String>> GREEK_LETTERS_WITH_PAYLOADS = Arrays.asList(//
|
||||
new Payload<String>(GREEK_LETTERS[0], GREEK_LETTERS_PAYLOAD[0]), //
|
||||
new Payload<String>(GREEK_LETTERS[1], GREEK_LETTERS_PAYLOAD[1]), //
|
||||
new Payload<String>(GREEK_LETTERS[2], GREEK_LETTERS_PAYLOAD[2]));
|
||||
|
||||
private final static String[] UNICODE = new String[] { "turning", "once", "again", "börkü" };
|
||||
private final static String[] UNICODE_PAYLOAD = new String[] { "uni:turning", "uni:once", "uni:again", "uni:börkü" };
|
||||
|
||||
private final static List<Payload<String>> UNICODE_WITH_PAYLOADS = Arrays.asList(//
|
||||
new Payload<String>(UNICODE[0], UNICODE_PAYLOAD[0]), //
|
||||
new Payload<String>(UNICODE[1], UNICODE_PAYLOAD[1]), //
|
||||
new Payload<String>(UNICODE[2], UNICODE_PAYLOAD[2]), //
|
||||
new Payload<String>(UNICODE[3], UNICODE_PAYLOAD[3]));
|
||||
|
||||
public static class Food {
|
||||
private final String name;
|
||||
|
||||
public Food(String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
final int prime = 31;
|
||||
int result = 1;
|
||||
result = prime * result + ((name == null) ? 0 : name.hashCode());
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj)
|
||||
return true;
|
||||
if (obj == null)
|
||||
return false;
|
||||
if (getClass() != obj.getClass())
|
||||
return false;
|
||||
Food other = (Food) obj;
|
||||
if (name == null) {
|
||||
if (other.name != null)
|
||||
return false;
|
||||
} else if (!name.equals(other.name))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void keywordAndTextAreTheSame() {
|
||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword(ALPHABET[0], ALPHABET_PAYLOAD[0]).build();
|
||||
Collection<PayloadEmit<String>> emits = trie.parseText(ALPHABET[0]);
|
||||
Iterator<PayloadEmit<String>> iterator = emits.iterator();
|
||||
checkEmit(iterator.next(), 0, 2, ALPHABET[0], ALPHABET_PAYLOAD[0]);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void keywordAndTextAreTheSameFirstMatch() {
|
||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword(ALPHABET[0], ALPHABET_PAYLOAD[0]).build();
|
||||
PayloadEmit<String> firstMatch = trie.firstMatch(ALPHABET[0]);
|
||||
checkEmit(firstMatch, 0, 2, ALPHABET[0], ALPHABET_PAYLOAD[0]);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void textIsLongerThanKeyword() {
|
||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword(ALPHABET[0], ALPHABET_PAYLOAD[0]).build();
|
||||
Collection<PayloadEmit<String>> emits = trie.parseText(" " + ALPHABET[0]);
|
||||
Iterator<PayloadEmit<String>> iterator = emits.iterator();
|
||||
checkEmit(iterator.next(), 1, 3, ALPHABET[0], ALPHABET_PAYLOAD[0]);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void textIsLongerThanKeywordFirstMatch() {
|
||||
|
||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword(ALPHABET[0], ALPHABET_PAYLOAD[0]).build();
|
||||
PayloadEmit<String> firstMatch = trie.firstMatch(" " + ALPHABET[0]);
|
||||
checkEmit(firstMatch, 1, 3, ALPHABET[0], ALPHABET_PAYLOAD[0]);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void variousKeywordsOneMatch() {
|
||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeywords(ALPHABET_WITH_PAYLOADS).build();
|
||||
Collection<PayloadEmit<String>> emits = trie.parseText("bcd");
|
||||
Iterator<PayloadEmit<String>> iterator = emits.iterator();
|
||||
checkEmit(iterator.next(), 0, 2, "bcd", "alpha:bcd");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void variousKeywordsFirstMatch() {
|
||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeywords(ALPHABET_WITH_PAYLOADS).build();
|
||||
PayloadEmit<String> firstMatch = trie.firstMatch("bcd");
|
||||
checkEmit(firstMatch, 0, 2, "bcd", "alpha:bcd");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void ushersTestAndStopOnHit() {
|
||||
PayloadTrie<Integer> trie = PayloadTrie.<Integer>builder().addKeywords(PRONOUNS_WITH_PAYLOADS).stopOnHit().build();
|
||||
Collection<PayloadEmit<Integer>> emits = trie.parseText("ushers");
|
||||
assertEquals(1, emits.size()); // she @ 3, he @ 3, hers @ 5
|
||||
Iterator<PayloadEmit<Integer>> iterator = emits.iterator();
|
||||
checkEmit(iterator.next(), 2, 3, "he", 20);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void ushersTestStopOnHitSkipOne() {
|
||||
PayloadTrie<Integer> trie = PayloadTrie.<Integer>builder().addKeywords(PRONOUNS_WITH_PAYLOADS).stopOnHit().build();
|
||||
|
||||
StatefulPayloadEmitHandler<Integer> testEmitHandler = new AbstractStatefulPayloadEmitHandler<Integer>() {
|
||||
boolean first = true;
|
||||
|
||||
@Override
|
||||
public boolean emit(final PayloadEmit<Integer> emit) {
|
||||
if (first) {
|
||||
// return false for the first element
|
||||
first = false;
|
||||
return false;
|
||||
}
|
||||
addEmit(emit);
|
||||
return true;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
trie.parseText("ushers", testEmitHandler);
|
||||
Collection<PayloadEmit<Integer>> emits = testEmitHandler.getEmits();
|
||||
assertEquals(1, emits.size()); // she @ 3, he @ 3, hers @ 5
|
||||
Iterator<PayloadEmit<Integer>> iterator = emits.iterator();
|
||||
checkEmit(iterator.next(), 1, 3, "she", 4);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void ushersTest() {
|
||||
PayloadTrie<Integer> trie = PayloadTrie.<Integer>builder().addKeywords(PRONOUNS_WITH_PAYLOADS).build();
|
||||
Collection<PayloadEmit<Integer>> emits = trie.parseText("ushers");
|
||||
assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5
|
||||
Iterator<PayloadEmit<Integer>> iterator = emits.iterator();
|
||||
|
||||
checkEmit(iterator.next(), 2, 3, "he", 20);
|
||||
checkEmit(iterator.next(), 1, 3, "she", 4);
|
||||
checkEmit(iterator.next(), 2, 5, "hers", 9);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void ushersTestWithCapitalKeywords() {
|
||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreCase().addKeyword("HERS", "hers").addKeyword("HIS", "his")
|
||||
.addKeyword("SHE", "she").addKeyword("HE", "he").build();
|
||||
Collection<PayloadEmit<String>> emits = trie.parseText("ushers");
|
||||
assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5
|
||||
Iterator<PayloadEmit<String>> iterator = emits.iterator();
|
||||
checkEmit(iterator.next(), 2, 3, "he", "he");
|
||||
checkEmit(iterator.next(), 1, 3, "she", "she");
|
||||
checkEmit(iterator.next(), 2, 5, "hers", "hers");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void ushersTestFirstMatch() {
|
||||
PayloadTrie<Integer> trie = PayloadTrie.<Integer>builder().addKeywords(PRONOUNS_WITH_PAYLOADS).build();
|
||||
PayloadEmit<Integer> firstMatch = trie.firstMatch("ushers");
|
||||
checkEmit(firstMatch, 2, 3, "he", 20);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void ushersTestByCallback() {
|
||||
PayloadTrie<Integer> trie = PayloadTrie.<Integer>builder().addKeywords(PRONOUNS_WITH_PAYLOADS).build();
|
||||
|
||||
final List<PayloadEmit<Integer>> emits = new ArrayList<>();
|
||||
PayloadEmitHandler<Integer> emitHandler = new PayloadEmitHandler<Integer>() {
|
||||
|
||||
@Override
|
||||
public boolean emit(PayloadEmit<Integer> emit) {
|
||||
emits.add(emit);
|
||||
return true;
|
||||
}
|
||||
};
|
||||
trie.parseText("ushers", emitHandler);
|
||||
assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5
|
||||
Iterator<PayloadEmit<Integer>> iterator = emits.iterator();
|
||||
|
||||
checkEmit(iterator.next(), 2, 3, "he", 20);
|
||||
checkEmit(iterator.next(), 1, 3, "she", 4);
|
||||
checkEmit(iterator.next(), 2, 5, "hers", 9);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void misleadingTest() {
|
||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword("hers", "pronon:hers").build();
|
||||
Collection<PayloadEmit<String>> emits = trie.parseText("h he her hers");
|
||||
Iterator<PayloadEmit<String>> iterator = emits.iterator();
|
||||
checkEmit(iterator.next(), 9, 12, "hers", "pronon:hers");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void misleadingTestFirstMatch() {
|
||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword("hers", "pronon:hers").build();
|
||||
PayloadEmit<String> firstMatch = trie.firstMatch("h he her hers");
|
||||
checkEmit(firstMatch, 9, 12, "hers", "pronon:hers");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void recipes() {
|
||||
PayloadTrie<Food> trie = PayloadTrie.<Food>builder().addKeywords(FOOD_WITH_PAYLOADS).build();
|
||||
Collection<PayloadEmit<Food>> emits = trie.parseText("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli");
|
||||
Iterator<PayloadEmit<Food>> iterator = emits.iterator();
|
||||
checkEmit(iterator.next(), 2, 12, "cauliflower", new Food("cauliflower"));
|
||||
checkEmit(iterator.next(), 18, 25, "tomatoes", new Food("tomatoes"));
|
||||
checkEmit(iterator.next(), 40, 43, "veal", new Food("veal"));
|
||||
checkEmit(iterator.next(), 51, 58, "broccoli", new Food("broccoli"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void recipesFirstMatch() {
|
||||
PayloadTrie<Food> trie = PayloadTrie.<Food>builder().addKeywords(FOOD_WITH_PAYLOADS).build();
|
||||
PayloadEmit<Food> firstMatch = trie.firstMatch("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli");
|
||||
checkEmit(firstMatch, 2, 12, "cauliflower", new Food("cauliflower"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void longAndShortOverlappingMatch() {
|
||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword("he", "pronon:he").addKeyword("hehehehe", "garbage")
|
||||
.build();
|
||||
Collection<PayloadEmit<String>> emits = trie.parseText("hehehehehe");
|
||||
Iterator<PayloadEmit<String>> iterator = emits.iterator();
|
||||
checkEmit(iterator.next(), 0, 1, "he", "pronon:he");
|
||||
checkEmit(iterator.next(), 2, 3, "he", "pronon:he");
|
||||
checkEmit(iterator.next(), 4, 5, "he", "pronon:he");
|
||||
checkEmit(iterator.next(), 6, 7, "he", "pronon:he");
|
||||
checkEmit(iterator.next(), 0, 7, "hehehehe", "garbage");
|
||||
checkEmit(iterator.next(), 8, 9, "he", "pronon:he");
|
||||
checkEmit(iterator.next(), 2, 9, "hehehehe", "garbage");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void nonOverlapping() {
|
||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().removeOverlaps().addKeyword("ab", "alpha:ab")
|
||||
.addKeyword("cba", "alpha:cba").addKeyword("ababc", "alpha:ababc").build();
|
||||
Collection<PayloadEmit<String>> emits = trie.parseText("ababcbab");
|
||||
assertEquals(2, emits.size());
|
||||
Iterator<PayloadEmit<String>> iterator = emits.iterator();
|
||||
// With overlaps: ab@1, ab@3, ababc@4, cba@6, ab@7
|
||||
checkEmit(iterator.next(), 0, 4, "ababc", "alpha:ababc");
|
||||
checkEmit(iterator.next(), 6, 7, "ab", "alpha:ab");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void nonOverlappingFirstMatch() {
|
||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().removeOverlaps().addKeyword("ab", "alpha:ab")
|
||||
.addKeyword("cba", "alpha:cba").addKeyword("ababc", "alpha:ababc").build();
|
||||
PayloadEmit<String> firstMatch = trie.firstMatch("ababcbab");
|
||||
|
||||
checkEmit(firstMatch, 0, 4, "ababc", "alpha:ababc");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void containsMatch() {
|
||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().removeOverlaps().addKeyword("ab", "alpha:ab")
|
||||
.addKeyword("cba", "alpha:cba").addKeyword("ababc", "alpha:ababc").build();
|
||||
assertTrue(trie.containsMatch("ababcbab"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void startOfChurchillSpeech() {
|
||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().removeOverlaps().addKeyword("T").addKeyword("u").addKeyword("ur")
|
||||
.addKeyword("r").addKeyword("urn").addKeyword("ni").addKeyword("i").addKeyword("in").addKeyword("n")
|
||||
.addKeyword("urning").build();
|
||||
Collection<PayloadEmit<String>> emits = trie.parseText("Turning");
|
||||
assertEquals(2, emits.size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void partialMatch() {
|
||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().onlyWholeWords().addKeyword("sugar", "food:sugar").build();
|
||||
Collection<PayloadEmit<String>> emits = trie.parseText("sugarcane sugarcane sugar canesugar"); // left, middle, right test
|
||||
assertEquals(1, emits.size()); // Match must not be made
|
||||
checkEmit(emits.iterator().next(), 20, 24, "sugar", "food:sugar");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void partialMatchFirstMatch() {
|
||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().onlyWholeWords().addKeyword("sugar", "food:sugar").build();
|
||||
PayloadEmit<String> firstMatch = trie.firstMatch("sugarcane sugarcane sugar canesugar"); // left, middle, right test
|
||||
|
||||
checkEmit(firstMatch, 20, 24, "sugar", "food:sugar");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void tokenizeFullSentence() {
|
||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeywords(GREEK_LETTERS_WITH_PAYLOADS).build();
|
||||
Collection<PayloadToken<String>> tokens = trie.tokenize("Hear: Alpha team first, Beta from the rear, Gamma in reserve");
|
||||
assertEquals(7, tokens.size());
|
||||
Iterator<PayloadToken<String>> tokensIt = tokens.iterator();
|
||||
assertEquals("Hear: ", tokensIt.next().getFragment());
|
||||
assertEquals("Alpha", tokensIt.next().getFragment());
|
||||
assertEquals(" team first, ", tokensIt.next().getFragment());
|
||||
assertEquals("Beta", tokensIt.next().getFragment());
|
||||
assertEquals(" from the rear, ", tokensIt.next().getFragment());
|
||||
assertEquals("Gamma", tokensIt.next().getFragment());
|
||||
assertEquals(" in reserve", tokensIt.next().getFragment());
|
||||
}
|
||||
|
||||
// @see https://github.com/robert-bor/aho-corasick/issues/5
|
||||
@Test
|
||||
public void testStringIndexOutOfBoundsException() {
|
||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreCase().onlyWholeWords().addKeywords(UNICODE_WITH_PAYLOADS)
|
||||
.build();
|
||||
Collection<PayloadEmit<String>> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ");
|
||||
assertEquals(4, emits.size()); // Match must not be made
|
||||
Iterator<PayloadEmit<String>> it = emits.iterator();
|
||||
|
||||
checkEmit(it.next(), 0, 6, "turning", "uni:turning");
|
||||
checkEmit(it.next(), 8, 11, "once", "uni:once");
|
||||
checkEmit(it.next(), 13, 17, "again", "uni:again");
|
||||
checkEmit(it.next(), 19, 23, "börkü", "uni:börkü");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIgnoreCase() {
|
||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreCase().addKeywords(UNICODE_WITH_PAYLOADS).build();
|
||||
Collection<PayloadEmit<String>> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ");
|
||||
assertEquals(4, emits.size()); // Match must not be made
|
||||
Iterator<PayloadEmit<String>> it = emits.iterator();
|
||||
|
||||
checkEmit(it.next(), 0, 6, "turning", "uni:turning");
|
||||
checkEmit(it.next(), 8, 11, "once", "uni:once");
|
||||
checkEmit(it.next(), 13, 17, "again", "uni:again");
|
||||
checkEmit(it.next(), 19, 23, "börkü", "uni:börkü");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIgnoreCaseFirstMatch() {
|
||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreCase().addKeywords(UNICODE_WITH_PAYLOADS).build();
|
||||
PayloadEmit<String> firstMatch = trie.firstMatch("TurninG OnCe AgAiN BÖRKÜ");
|
||||
|
||||
checkEmit(firstMatch, 0, 6, "turning", "uni:turning");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void tokenizeTokensInSequence() {
|
||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeywords(GREEK_LETTERS_WITH_PAYLOADS).build();
|
||||
Collection<PayloadToken<String>> tokens = trie.tokenize("Alpha Beta Gamma");
|
||||
assertEquals(5, tokens.size());
|
||||
}
|
||||
|
||||
// @see https://github.com/robert-bor/aho-corasick/issues/7
|
||||
@Test
|
||||
public void testZeroLength() {
|
||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreOverlaps().onlyWholeWords().ignoreCase().addKeyword("")
|
||||
.build();
|
||||
trie.tokenize(
|
||||
"Try a natural lip and subtle bronzer to keep all the focus on those big bright eyes with NARS Eyeshadow Duo in Rated R And the winner is... Boots No7 Advanced Renewal Anti-ageing Glycolic Peel Kit ($25 amazon.com) won most-appealing peel.");
|
||||
}
|
||||
|
||||
// @see https://github.com/robert-bor/aho-corasick/issues/8
|
||||
@Test
|
||||
public void testUnicode1() {
|
||||
String target = "LİKE THIS"; // The second character ('İ') is Unicode, which was read by AC as a 2-byte char
|
||||
assertEquals("THIS", target.substring(5, 9)); // Java does it the right way
|
||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreCase().onlyWholeWords().addKeyword("this", "pronon:this")
|
||||
.build();
|
||||
Collection<PayloadEmit<String>> emits = trie.parseText(target);
|
||||
assertEquals(1, emits.size());
|
||||
Iterator<PayloadEmit<String>> it = emits.iterator();
|
||||
checkEmit(it.next(), 5, 8, "this", "pronon:this");
|
||||
}
|
||||
|
||||
// @see https://github.com/robert-bor/aho-corasick/issues/8
|
||||
@Test
|
||||
public void testUnicode2() {
|
||||
String target = "LİKE THIS"; // The second character ('İ') is Unicode, which was read by AC as a 2-byte char
|
||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreCase().onlyWholeWords().addKeyword("this", "pronon:this")
|
||||
.build();
|
||||
assertEquals("THIS", target.substring(5, 9)); // Java does it the right way
|
||||
PayloadEmit<String> firstMatch = trie.firstMatch(target);
|
||||
checkEmit(firstMatch, 5, 8, "this", "pronon:this");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPartialMatchWhiteSpaces() {
|
||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().onlyWholeWordsWhiteSpaceSeparated()
|
||||
.addKeyword("#sugar-123", "sugar").build();
|
||||
Collection<PayloadEmit<String>> emits = trie.parseText("#sugar-123 #sugar-1234"); // left, middle, right test
|
||||
assertEquals(1, emits.size()); // Match must not be made
|
||||
checkEmit(emits.iterator().next(), 0, 9, "#sugar-123", "sugar");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLargeString() {
|
||||
final int interval = 100;
|
||||
final int textSize = 1000000;
|
||||
final String keyword = FOOD[1];
|
||||
final Food payload = FOOD_PAYLOAD[1];
|
||||
final StringBuilder text = randomNumbers(textSize);
|
||||
|
||||
injectKeyword(text, keyword, interval);
|
||||
|
||||
PayloadTrie<Food> trie = PayloadTrie.<Food>builder().onlyWholeWords().addKeyword(keyword, payload).build();
|
||||
|
||||
final Collection<PayloadEmit<Food>> emits = trie.parseText(text);
|
||||
|
||||
assertEquals(textSize / interval, emits.size());
|
||||
}
|
||||
|
||||
/**
|
||||
* Generates a random sequence of ASCII numbers.
|
||||
*
|
||||
* @param count The number of numbers to generate.
|
||||
* @return A character sequence filled with random digits.
|
||||
*/
|
||||
private StringBuilder randomNumbers(int count) {
|
||||
final StringBuilder sb = new StringBuilder(count);
|
||||
|
||||
while (--count > 0) {
|
||||
sb.append(randomInt(0, 10));
|
||||
}
|
||||
|
||||
return sb;
|
||||
}
|
||||
|
||||
/**
|
||||
* Injects keywords into a string builder.
|
||||
*
|
||||
* @param source Should contain a bunch of random data that cannot match any
|
||||
* keyword.
|
||||
* @param keyword A keyword to inject repeatedly in the text.
|
||||
* @param interval How often to inject the keyword.
|
||||
*/
|
||||
private void injectKeyword(final StringBuilder source, final String keyword, final int interval) {
|
||||
final int length = source.length();
|
||||
for (int i = 0; i < length; i += interval) {
|
||||
source.replace(i, i + keyword.length(), keyword);
|
||||
}
|
||||
}
|
||||
|
||||
private int randomInt(final int min, final int max) {
|
||||
return ThreadLocalRandom.current().nextInt(min, max);
|
||||
}
|
||||
|
||||
private void checkEmit(PayloadEmit<Food> next, int expectedStart, int expectedEnd, String expectedKeyword,
|
||||
Food expectedPayload) {
|
||||
assertEquals("Start of emit should have been " + expectedStart, expectedStart, next.getStart());
|
||||
assertEquals("End of emit should have been " + expectedEnd, expectedEnd, next.getEnd());
|
||||
assertEquals("Keyword of emit shoud be " + expectedKeyword, expectedKeyword, next.getKeyword());
|
||||
assertEquals("Payload of emit shoud be " + expectedPayload, expectedPayload, next.getPayload());
|
||||
}
|
||||
|
||||
private void checkEmit(PayloadEmit<Integer> next, int expectedStart, int expectedEnd, String expectedKeyword,
|
||||
Integer expectedPayload) {
|
||||
assertEquals("Start of emit should have been " + expectedStart, expectedStart, next.getStart());
|
||||
assertEquals("End of emit should have been " + expectedEnd, expectedEnd, next.getEnd());
|
||||
assertEquals("Keyword of emit shoud be " + expectedKeyword, expectedKeyword, next.getKeyword());
|
||||
assertEquals("Payload of emit shoud be " + expectedPayload, expectedPayload, next.getPayload());
|
||||
}
|
||||
|
||||
private void checkEmit(PayloadEmit<String> next, int expectedStart, int expectedEnd, String expectedKeyword,
|
||||
String expectedPayload) {
|
||||
assertEquals("Start of emit should have been " + expectedStart, expectedStart, next.getStart());
|
||||
assertEquals("End of emit should have been " + expectedEnd, expectedEnd, next.getEnd());
|
||||
assertEquals("Keyword of emit shoud be " + expectedKeyword, expectedKeyword, next.getKeyword());
|
||||
assertEquals("Payload of emit shoud be " + expectedPayload, expectedPayload, next.getPayload());
|
||||
}
|
||||
}
|
||||
@ -1,9 +1,7 @@
|
||||
package org.ahocorasick.trie;
|
||||
|
||||
import org.ahocorasick.trie.handler.AbstractStatefulEmitHandler;
|
||||
import org.ahocorasick.trie.handler.EmitHandler;
|
||||
import org.ahocorasick.trie.handler.StatefulEmitHandler;
|
||||
import org.junit.Test;
|
||||
import static junit.framework.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
@ -11,8 +9,10 @@ import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.ThreadLocalRandom;
|
||||
|
||||
import static junit.framework.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import org.ahocorasick.trie.handler.AbstractStatefulEmitHandler;
|
||||
import org.ahocorasick.trie.handler.EmitHandler;
|
||||
import org.ahocorasick.trie.handler.StatefulEmitHandler;
|
||||
import org.junit.Test;
|
||||
|
||||
public class TrieTest {
|
||||
private final static String[] ALPHABET = new String[]{
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user