diff --git a/.travis.yml b/.travis.yml index 510d771..751303d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,6 @@ language: java install: mvn install -DskipTests=true -Dgpg.skip=true jdk: - - oraclejdk8 + - openjdk8 after_success: - - bash <(curl -s https://codecov.io/bash) \ No newline at end of file + - bash <(curl -s https://codecov.io/bash) diff --git a/README.md b/README.md index 3ebbb50..51985c5 100644 --- a/README.md +++ b/README.md @@ -182,6 +182,26 @@ matches as soon as you encounter them. Let's look at an example where we want to System.out.println(html); ``` +You can also emit custom outputs. This might, for example, be useful to implement a trivial named entity +recognizer. In this case use a PayloadTrie instead of a Trie: + +```java + class Word { + private final String gender; + public Word(String gender) { + this.gender = gender; + } + } + + PayloadTrie<Word> trie = PayloadTrie.<Word>builder() + .addKeyword("hers", new Word("f")) + .addKeyword("his", new Word("m")) + .addKeyword("she", new Word("f")) + .addKeyword("he", new Word("m")) + .build(); + Collection<PayloadEmit<Word>> emits = trie.parseText("ushers"); +``` + Releases -------- Information on the aho-corasick [releases](https://github.com/robert-bor/aho-corasick/releases). 
diff --git a/src/main/java/org/ahocorasick/trie/DefaultToken.java b/src/main/java/org/ahocorasick/trie/DefaultToken.java new file mode 100644 index 0000000..b83942d --- /dev/null +++ b/src/main/java/org/ahocorasick/trie/DefaultToken.java @@ -0,0 +1,21 @@ +package org.ahocorasick.trie; + +public class DefaultToken extends Token { + + private PayloadToken payloadToken; + + public DefaultToken(PayloadToken payloadToken) { + super(payloadToken.getFragment()); + this.payloadToken = payloadToken; + } + + public boolean isMatch() { + return payloadToken.isMatch(); + } + + public Emit getEmit() { + PayloadEmit emit = payloadToken.getEmit(); + return new Emit(emit.getStart(), emit.getEnd(), emit.getKeyword()); + } + +} diff --git a/src/main/java/org/ahocorasick/trie/Emit.java b/src/main/java/org/ahocorasick/trie/Emit.java index 8c17253..ed2594a 100644 --- a/src/main/java/org/ahocorasick/trie/Emit.java +++ b/src/main/java/org/ahocorasick/trie/Emit.java @@ -4,10 +4,9 @@ import org.ahocorasick.interval.Interval; import org.ahocorasick.interval.Intervalable; public class Emit extends Interval implements Intervalable { - private final String keyword; - public Emit(final int start, final int end, final String keyword) { + public Emit(final int start, final int end, String keyword) { super(start, end); this.keyword = keyword; } @@ -20,4 +19,5 @@ public class Emit extends Interval implements Intervalable { public String toString() { return super.toString() + "=" + this.keyword; } + } diff --git a/src/main/java/org/ahocorasick/trie/FragmentToken.java b/src/main/java/org/ahocorasick/trie/FragmentToken.java index 85c498a..37e83d1 100644 --- a/src/main/java/org/ahocorasick/trie/FragmentToken.java +++ b/src/main/java/org/ahocorasick/trie/FragmentToken.java @@ -15,4 +15,5 @@ public class FragmentToken extends Token { public Emit getEmit() { return null; } + } diff --git a/src/main/java/org/ahocorasick/trie/MatchToken.java b/src/main/java/org/ahocorasick/trie/MatchToken.java index 
851472c..ff499b3 100644 --- a/src/main/java/org/ahocorasick/trie/MatchToken.java +++ b/src/main/java/org/ahocorasick/trie/MatchToken.java @@ -7,6 +7,7 @@ public class MatchToken extends Token { public MatchToken(final String fragment, final Emit emit) { super(fragment); this.emit = emit; + } @Override diff --git a/src/main/java/org/ahocorasick/trie/Payload.java b/src/main/java/org/ahocorasick/trie/Payload.java new file mode 100644 index 0000000..111a043 --- /dev/null +++ b/src/main/java/org/ahocorasick/trie/Payload.java @@ -0,0 +1,33 @@ +package org.ahocorasick.trie; + +/** + * Payload holds the matched keyword and some payload-data. + * + * @author Daniel Beck + * + * @param The type of the wrapped payload data. + */ +public class Payload implements Comparable> { + + private final String keyword; + private final T data; + + public Payload(final String keyword, final T data) { + super(); + this.keyword = keyword; + this.data = data; + } + + public String getKeyword() { + return keyword; + } + + public T getData() { + return data; + } + + @Override + public int compareTo(Payload other) { + return keyword.compareTo(other.getKeyword()); + } +} diff --git a/src/main/java/org/ahocorasick/trie/PayloadEmit.java b/src/main/java/org/ahocorasick/trie/PayloadEmit.java new file mode 100644 index 0000000..a456e61 --- /dev/null +++ b/src/main/java/org/ahocorasick/trie/PayloadEmit.java @@ -0,0 +1,50 @@ +package org.ahocorasick.trie; + +import org.ahocorasick.interval.Interval; +import org.ahocorasick.interval.Intervalable; + +/** + * PayloadEmit contains a matched term and its associated payload data. + * + * @param Type of the wrapped payload-data. + * @author Daniel Beck + * + */ +public class PayloadEmit extends Interval implements Intervalable { + + private final String keyword; + + private final T payload; + + /** + * Created a PayloadEmit + * + * @param start Start of the matched search term. + * @param end End of the matched search term. 
+ * @param keyword Keyword that matched. + * @param payload Emitted payload data. + */ + public PayloadEmit(final int start, final int end, String keyword, T payload) { + super(start, end); + this.keyword = keyword; + this.payload = payload; + } + + public String getKeyword() { + return this.keyword; + } + + /** + * Returns the payload associated to this emit. + * + * @return the associated payload + */ + public T getPayload() { + return this.payload; + } + + @Override + public String toString() { + return super.toString() + "=" + this.keyword + (this.payload != null ? "->" + this.payload : ""); + } +} diff --git a/src/main/java/org/ahocorasick/trie/PayloadFragmentToken.java b/src/main/java/org/ahocorasick/trie/PayloadFragmentToken.java new file mode 100644 index 0000000..b1809dd --- /dev/null +++ b/src/main/java/org/ahocorasick/trie/PayloadFragmentToken.java @@ -0,0 +1,31 @@ +package org.ahocorasick.trie; + +/*** + * PayloadFragmentToken holds a text ("the fragment"). + *

+ * It does not match a search term, so its isMatch method + * always returns false and getEmit returns null. + * + * @author Daniel Beck + * + * @param <T> The type of the emitted payloads. + */ +public class PayloadFragmentToken extends PayloadToken { + + public PayloadFragmentToken(String fragment) { + super(fragment); + } + + @Override + public boolean isMatch() { + return false; + } + + /** + * Returns null. + */ + @Override + public PayloadEmit getEmit() { + return null; + } +} diff --git a/src/main/java/org/ahocorasick/trie/PayloadMatchToken.java b/src/main/java/org/ahocorasick/trie/PayloadMatchToken.java new file mode 100644 index 0000000..ab7d8ac --- /dev/null +++ b/src/main/java/org/ahocorasick/trie/PayloadMatchToken.java @@ -0,0 +1,31 @@ +package org.ahocorasick.trie; + +/** + * PayloadMatchToken holds a text ("the fragment") and emits some output. + *

+ * It matches a search term, so its isMatch method always returns + * true. + * + * @author Daniel Beck + * + * @param <T> The type of the emitted payloads. + */ +public class PayloadMatchToken extends PayloadToken { + + private final PayloadEmit emit; + + public PayloadMatchToken(final String fragment, final PayloadEmit emit) { + super(fragment); + this.emit = emit; + } + + @Override + public boolean isMatch() { + return true; + } + + @Override + public PayloadEmit getEmit() { + return this.emit; + } +} diff --git a/src/main/java/org/ahocorasick/trie/PayloadState.java b/src/main/java/org/ahocorasick/trie/PayloadState.java new file mode 100644 index 0000000..effc49d --- /dev/null +++ b/src/main/java/org/ahocorasick/trie/PayloadState.java @@ -0,0 +1,156 @@ +package org.ahocorasick.trie; + +import java.util.*; + +/** + *

+ * A state has various important tasks it must attend to: + *

+ *

+ *

    + *
  • success; when a character points to another state, it must return that + * state
  • + *
  • failure; when a character has no matching state, the algorithm must be + * able to fall back on a state with less depth
  • + *
  • emits; when this state is passed and keywords have been matched, the + * matches and their payloads must be 'emitted' so that they can be used later + * on.
  • + *
+ *

+ *

+ * The root state is special in the sense that it has no failure state; it + * cannot fail. If it 'fails' it will still parse the next character and start + * from the root node. This ensures that the algorithm always runs. All other + * states always have a fail state. + *

+ * + * @author Daniel Beck + */ +public class PayloadState { + + /** + * effective the size of the keyword + */ + private final int depth; + + /** + * only used for the root state to refer to itself in case no matches have been + * found + */ + private final PayloadState rootState; + + /** + * referred to in the white paper as the 'goto' structure. From a state it is + * possible to go to other states, depending on the character passed. + */ + private final Map> success = new HashMap<>(); + + /** + * if no matching states are found, the failure state will be returned + */ + private PayloadState failure; + + /** + * whenever this state is reached, it will emit the matches keywords for future + * reference + */ + private Set> emits; + + public PayloadState() { + this(0); + } + + public PayloadState(final int depth) { + this.depth = depth; + this.rootState = depth == 0 ? this : null; + } + + private PayloadState nextState(final Character character, final boolean ignoreRootState) { + PayloadState nextState = this.success.get(character); + + if (!ignoreRootState && nextState == null && this.rootState != null) { + nextState = this.rootState; + } + + return nextState; + } + + public PayloadState nextState(final Character character) { + return nextState(character, false); + } + + public PayloadState nextStateIgnoreRootState(Character character) { + return nextState(character, true); + } + + public PayloadState addState(String keyword) { + PayloadState state = this; + + for (final Character character : keyword.toCharArray()) { + state = state.addState(character); + } + + return state; + } + + public PayloadState addState(Character character) { + PayloadState nextState = nextStateIgnoreRootState(character); + if (nextState == null) { + nextState = new PayloadState(this.depth + 1); + this.success.put(character, nextState); + } + return nextState; + } + + public int getDepth() { + return this.depth; + } + + /** + * Adds a payload to be emitted for this state. 
+ * + * @param payload Payload to be emitted. + */ + public void addEmit(Payload payload) { + if (this.emits == null) { + this.emits = new TreeSet<>(); + } + this.emits.add(payload); + } + + /** + * Adds a collection of payloads to be emitted for this state. + * + * @param emits Collection of payloads to be emitted. + */ + public void addEmit(Collection> emits) { + for (Payload emit : emits) { + addEmit(emit); + } + } + + /** + * Returns a collection of emitted payloads for this state. + * + * @return Collection of emitted payloads. + */ + public Collection> emit() { + return this.emits == null ? Collections.>emptyList() : this.emits; + } + + public PayloadState failure() { + return this.failure; + } + + public void setFailure(PayloadState failState) { + this.failure = failState; + } + + public Collection> getStates() { + return this.success.values(); + } + + public Collection getTransitions() { + return this.success.keySet(); + } +} \ No newline at end of file diff --git a/src/main/java/org/ahocorasick/trie/PayloadToken.java b/src/main/java/org/ahocorasick/trie/PayloadToken.java new file mode 100644 index 0000000..4350e75 --- /dev/null +++ b/src/main/java/org/ahocorasick/trie/PayloadToken.java @@ -0,0 +1,28 @@ +package org.ahocorasick.trie; + +/*** + * PayloadToken holds a text ("the fragment") and emits some output. If + * isMatch returns true, the token matched a search term. + * + * @author Daniel Beck + * + * @param <T> The type of the emitted payloads. + */ +public abstract class PayloadToken { + private String fragment; + + public PayloadToken(String fragment) { + this.fragment = fragment; + } + + public String getFragment() { + return this.fragment; + } + + /** + * Returns true if a search term matched. 
+ */ + public abstract boolean isMatch(); + + public abstract PayloadEmit getEmit(); +} diff --git a/src/main/java/org/ahocorasick/trie/PayloadTrie.java b/src/main/java/org/ahocorasick/trie/PayloadTrie.java new file mode 100644 index 0000000..19db935 --- /dev/null +++ b/src/main/java/org/ahocorasick/trie/PayloadTrie.java @@ -0,0 +1,495 @@ +package org.ahocorasick.trie; + +import static java.lang.Character.isWhitespace; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.Queue; +import java.util.concurrent.LinkedBlockingDeque; + +import org.ahocorasick.interval.IntervalTree; +import org.ahocorasick.interval.Intervalable; +import org.ahocorasick.trie.handler.DefaultPayloadEmitHandler; +import org.ahocorasick.trie.handler.PayloadEmitHandler; +import org.ahocorasick.trie.handler.StatefulPayloadEmitHandler; +import org.ahocorasick.util.ListElementRemoval; +import org.ahocorasick.util.ListElementRemoval.RemoveElementPredicate; + +/** + * A trie implementation, based on the Aho-Corasick white paper, Bell + * technologies: http://cr.yp.to/bib/1975/aho.pdf + *

+ * + * The payload trie adds the possibility to specify emitted payloads for each + * added keyword. + * + * @author Daniel Beck + * @param <T> The type of the supplied payload + */ +public class PayloadTrie { + + private final TrieConfig trieConfig; + + private final PayloadState rootState; + + protected PayloadTrie(final TrieConfig trieConfig) { + this.trieConfig = trieConfig; + this.rootState = new PayloadState<>(); + } + + /** + * Used by the builder to add a text search keyword with an emit payload. + * + * @param keyword The search term to add to the list of search terms. + * @param emit the payload to emit for this search term. + * @throws NullPointerException if the keyword is null. + */ + private void addKeyword(String keyword, T emit) { + if (keyword.isEmpty()) { + return; + } + + if (isCaseInsensitive()) { + keyword = keyword.toLowerCase(); + } + + addState(keyword).addEmit(new Payload(keyword, emit)); + } + + /** + * Used by the builder to add a text search keyword. + * + * @param keyword The search term to add to the list of search terms. + * @throws NullPointerException if the keyword is null. + */ + private void addKeyword(String keyword) { + if (keyword.isEmpty()) { + return; + } + + if (isCaseInsensitive()) { + keyword = keyword.toLowerCase(); + } + + addState(keyword).addEmit(new Payload(keyword, null)); + } + + private PayloadState addState(final String keyword) { + return getRootState().addState(keyword); + } + + /** + * Tokenizes the specified text and returns the emitted outputs. + * + * @param text The text to tokenize. 
+ */ + public Collection> tokenize(final String text) { + final Collection> tokens = new ArrayList<>(); + final Collection> collectedEmits = parseText(text); + int lastCollectedPosition = -1; + + for (final PayloadEmit emit : collectedEmits) { + if (emit.getStart() - lastCollectedPosition > 1) { + tokens.add((PayloadToken) createFragment(emit, text, lastCollectedPosition)); + } + + tokens.add(createMatch(emit, text)); + lastCollectedPosition = emit.getEnd(); + } + + if (text.length() - lastCollectedPosition > 1) { + tokens.add((PayloadToken) createFragment(null, text, lastCollectedPosition)); + } + + return tokens; + } + + private PayloadToken createFragment(final PayloadEmit emit, final String text, final int lastCollectedPosition) { + return new PayloadFragmentToken( + text.substring(lastCollectedPosition + 1, emit == null ? text.length() : emit.getStart())); + } + + private PayloadToken createMatch(PayloadEmit emit, String text) { + return new PayloadMatchToken(text.substring(emit.getStart(), emit.getEnd() + 1), emit); + } + + /** + * Tokenizes a specified text and returns the emitted outputs. + * + * @param text The character sequence to tokenize. + * @return A collection of emits. + */ + public Collection> parseText(final CharSequence text) { + return parseText(text, new DefaultPayloadEmitHandler()); + } + + /** + * Tokenizes the specified text by using a custom EmitHandler and returns the + * emitted outputs. + * + * @param text The character sequence to tokenize. + * @param emitHandler The emit handler that will be used to parse the text. + * @return A collection of emits. 
+ */ + @SuppressWarnings("unchecked") + public Collection> parseText(final CharSequence text, final StatefulPayloadEmitHandler emitHandler) { + parseText(text, (PayloadEmitHandler) emitHandler); + + final List> collectedEmits = emitHandler.getEmits(); + + if (trieConfig.isOnlyWholeWords()) { + removePartialMatches(text, collectedEmits); + } + + if (trieConfig.isOnlyWholeWordsWhiteSpaceSeparated()) { + removePartialMatchesWhiteSpaceSeparated(text, collectedEmits); + } + + if (!trieConfig.isAllowOverlaps()) { + IntervalTree intervalTree = new IntervalTree((List) (List) collectedEmits); + intervalTree.removeOverlaps((List) (List) collectedEmits); + } + + return collectedEmits; + } + + /** + * Returns true if the text contains one of the search terms. Else, + * returns false. + * + * @param text Specified text. + * @return true if the text contains one of the search terms. Else, returns + * false. + */ + public boolean containsMatch(final CharSequence text) { + return firstMatch(text) != null; + } + + /** + * Tokenizes the specified text and passes the emitted outputs to the given + * emit handler. + * + * @param text The character sequence to tokenize. + * @param emitHandler The emit handler that will be used to parse the text; + * the emits found are passed to it. + */ + + public void parseText(final CharSequence text, final PayloadEmitHandler emitHandler) { + PayloadState currentState = getRootState(); + + for (int position = 0; position < text.length(); position++) { + Character character = text.charAt(position); + + // TODO: Maybe lowercase the entire string at once? + if (trieConfig.isCaseInsensitive()) { + character = Character.toLowerCase(character); + } + + currentState = getState(currentState, character); + if (storeEmits(position, currentState, emitHandler) && trieConfig.isStopOnHit()) { + return; + } + } + } + + /** + * The first matching text sequence. + * + * @param text The text to search for keywords. + * @return null if no matches found. 
+ */ + public PayloadEmit firstMatch(final CharSequence text) { + if (!trieConfig.isAllowOverlaps()) { + // Slow path. Needs to find all the matches to detect overlaps. + final Collection> parseText = parseText(text); + + if (parseText != null && !parseText.isEmpty()) { + return parseText.iterator().next(); + } + } else { + // Fast path. Returns first match found. + PayloadState currentState = getRootState(); + + for (int position = 0; position < text.length(); position++) { + Character character = text.charAt(position); + + // TODO: Lowercase the entire string at once? + if (trieConfig.isCaseInsensitive()) { + character = Character.toLowerCase(character); + } + + currentState = getState(currentState, character); + Collection> payloads = currentState.emit(); + + if (payloads != null && !payloads.isEmpty()) { + for (final Payload payload : payloads) { + final PayloadEmit emit = new PayloadEmit<>(position - payload.getKeyword().length() + 1, position, + payload.getKeyword(), payload.getData()); + if (trieConfig.isOnlyWholeWords()) { + if (!isPartialMatch(text, emit)) { + return emit; + } + } else { + return emit; + } + } + } + } + } + + return null; + } + + private boolean isPartialMatch(final CharSequence searchText, final PayloadEmit emit) { + return (emit.getStart() != 0 && Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) + || (emit.getEnd() + 1 != searchText.length() && Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1))); + } + + private void removePartialMatches(final CharSequence searchText, final List> collectedEmits) { + + final RemoveElementPredicate> predicate = new RemoveElementPredicate>() { + + @Override + public boolean remove(PayloadEmit emit) { + return isPartialMatch(searchText, emit); + } + + }; + + ListElementRemoval.removeIf(collectedEmits, predicate); + } + + private void removePartialMatchesWhiteSpaceSeparated(final CharSequence searchText, + final List> collectedEmits) { + final long size = searchText.length(); + final 
List> removeEmits = new ArrayList<>(); + + for (final PayloadEmit emit : collectedEmits) { + if ((emit.getStart() == 0 || isWhitespace(searchText.charAt(emit.getStart() - 1))) + && (emit.getEnd() + 1 == size || isWhitespace(searchText.charAt(emit.getEnd() + 1)))) { + continue; + } + removeEmits.add(emit); + } + + for (final PayloadEmit removeEmit : removeEmits) { + collectedEmits.remove(removeEmit); + } + } + + private PayloadState getState(PayloadState currentState, final Character character) { + PayloadState newCurrentState = currentState.nextState(character); + + while (newCurrentState == null) { + currentState = currentState.failure(); + newCurrentState = currentState.nextState(character); + } + + return newCurrentState; + } + + private void constructFailureStates() { + final Queue> queue = new LinkedBlockingDeque<>(); + final PayloadState startState = getRootState(); + + // First, set the fail state of all depth 1 states to the root state + for (PayloadState depthOneState : startState.getStates()) { + depthOneState.setFailure(startState); + queue.add(depthOneState); + } + + // Second, determine the fail state for all depth > 1 state + while (!queue.isEmpty()) { + final PayloadState currentState = queue.remove(); + + for (final Character transition : currentState.getTransitions()) { + PayloadState targetState = currentState.nextState(transition); + queue.add(targetState); + + PayloadState traceFailureState = currentState.failure(); + while (traceFailureState.nextState(transition) == null) { + traceFailureState = traceFailureState.failure(); + } + + final PayloadState newFailureState = traceFailureState.nextState(transition); + targetState.setFailure(newFailureState); + targetState.addEmit(newFailureState.emit()); + } + } + } + + private boolean storeEmits(final int position, final PayloadState currentState, final PayloadEmitHandler emitHandler) { + boolean emitted = false; + final Collection> payloads = currentState.emit(); + + // TODO: The check for empty 
might be superfluous. + if (payloads != null && !payloads.isEmpty()) { + for (final Payload payload : payloads) { + emitted = emitHandler.emit(new PayloadEmit(position - payload.getKeyword().length() + 1, position, + payload.getKeyword(), payload.getData())) || emitted; + + if (emitted && trieConfig.isStopOnHit()) { + break; + } + } + } + + return emitted; + } + + private boolean isCaseInsensitive() { + return trieConfig.isCaseInsensitive(); + } + + private PayloadState getRootState() { + return this.rootState; + } + + /** + * Provides a fluent interface for constructing Trie instances with payloads. + * + * @return The builder used to configure its Trie. + */ + public static PayloadTrieBuilder builder() { + return new PayloadTrieBuilder(); + } + + /** + * Builder class to create a PayloadTrie instance. + * + * @param The type of the emitted payload. + */ + public static class PayloadTrieBuilder { + + private final TrieConfig trieConfig = new TrieConfig(); + + private final PayloadTrie trie = new PayloadTrie<>(trieConfig); + + /** + * Default (empty) constructor. + */ + private PayloadTrieBuilder() { + } + + /** + * Configure the Trie to ignore case when searching for keywords in the text. + * This must be called before calling addKeyword because the algorithm converts + * keywords to lowercase as they are added, depending on this case sensitivity + * setting. + * + * @return This builder. + */ + public PayloadTrieBuilder ignoreCase() { + this.trieConfig.setCaseInsensitive(true); + return this; + } + + /** + * Configure the Trie to ignore overlapping keywords. + * + * @return This builder. + */ + public PayloadTrieBuilder ignoreOverlaps() { + this.trieConfig.setAllowOverlaps(false); + return this; + } + + /** + * Adds a keyword to the Trie's list of text search keywords. No Payload is + * supplied. + * + * @param keyword The keyword to add to the list. + * @return This builder. + * @throws NullPointerException if the keyword is null. 
+ */ + public PayloadTrieBuilder addKeyword(final String keyword) { + this.trie.addKeyword(keyword); + return this; + } + + /** + * Adds a keyword and a payload to the Trie's list of text search keywords. + * + * @param keyword The keyword to add to the list. + * @return This builder. + * @throws NullPointerException if the keyword is null. + */ + public PayloadTrieBuilder addKeyword(final String keyword, final T payload) { + this.trie.addKeyword(keyword, payload); + return this; + } + + /** + * Adds a list of keywords and payloads to the Trie's list of text search + * keywords. + * + * @param keywords The keywords to add to the list. + * @return This builder. + */ + public PayloadTrieBuilder addKeywords(final Collection> keywords) { + for (Payload payload : keywords) { + this.trie.addKeyword(payload.getKeyword(), payload.getData()); + } + return this; + } + + /** + * Configure the Trie to match whole keywords in the text. + * + * @return This builder. + */ + public PayloadTrieBuilder onlyWholeWords() { + this.trieConfig.setOnlyWholeWords(true); + return this; + } + + /** + * Configure the Trie to match whole keywords that are separated by whitespace + * in the text. For example, "this keyword thatkeyword" would only match the + * first occurrence of "keyword". + * + * @return This builder. + */ + public PayloadTrieBuilder onlyWholeWordsWhiteSpaceSeparated() { + this.trieConfig.setOnlyWholeWordsWhiteSpaceSeparated(true); + return this; + } + + /** + * Configure the Trie to stop after the first keyword is found in the text. + * + * @return This builder. + */ + public PayloadTrieBuilder stopOnHit() { + trie.trieConfig.setStopOnHit(true); + return this; + } + + /** + * Configure the PayloadTrie based on the builder settings. + * + * @return The configured PayloadTrie. + */ + public PayloadTrie build() { + this.trie.constructFailureStates(); + return this.trie; + } + + /** + * @return This builder. 
+ * @deprecated Use ignoreCase() + */ + public PayloadTrieBuilder caseInsensitive() { + return ignoreCase(); + } + + /** + * @return This builder. + * @deprecated Use ignoreOverlaps() + */ + public PayloadTrieBuilder removeOverlaps() { + return ignoreOverlaps(); + } + } +} diff --git a/src/main/java/org/ahocorasick/trie/State.java b/src/main/java/org/ahocorasick/trie/State.java index e192207..e5d763a 100644 --- a/src/main/java/org/ahocorasick/trie/State.java +++ b/src/main/java/org/ahocorasick/trie/State.java @@ -133,4 +133,4 @@ public class State { public Collection getTransitions() { return this.success.keySet(); } -} +} \ No newline at end of file diff --git a/src/main/java/org/ahocorasick/trie/Token.java b/src/main/java/org/ahocorasick/trie/Token.java index 65c1fac..4e79a35 100644 --- a/src/main/java/org/ahocorasick/trie/Token.java +++ b/src/main/java/org/ahocorasick/trie/Token.java @@ -1,7 +1,6 @@ package org.ahocorasick.trie; public abstract class Token { - private String fragment; public Token(String fragment) { @@ -15,5 +14,4 @@ public abstract class Token { public abstract boolean isMatch(); public abstract Emit getEmit(); - } diff --git a/src/main/java/org/ahocorasick/trie/Trie.java b/src/main/java/org/ahocorasick/trie/Trie.java index 25922db..315d749 100644 --- a/src/main/java/org/ahocorasick/trie/Trie.java +++ b/src/main/java/org/ahocorasick/trie/Trie.java @@ -1,20 +1,13 @@ package org.ahocorasick.trie; -import org.ahocorasick.interval.IntervalTree; -import org.ahocorasick.interval.Intervalable; -import org.ahocorasick.trie.handler.DefaultEmitHandler; -import org.ahocorasick.trie.handler.EmitHandler; -import org.ahocorasick.trie.handler.StatefulEmitHandler; -import org.ahocorasick.util.ListElementRemoval; -import org.ahocorasick.util.ListElementRemoval.RemoveElementPredicate; - import java.util.ArrayList; import java.util.Collection; -import java.util.List; -import java.util.Queue; -import java.util.concurrent.LinkedBlockingDeque; -import static 
java.lang.Character.isWhitespace; +import org.ahocorasick.trie.PayloadTrie.PayloadTrieBuilder; +import org.ahocorasick.trie.handler.EmitHandler; +import org.ahocorasick.trie.handler.StatefulPayloadEmitDelegateHandler; +import org.ahocorasick.trie.handler.PayloadEmitDelegateHandler; +import org.ahocorasick.trie.handler.StatefulEmitHandler; /** * Based on the Aho-Corasick white paper, Bell technologies: @@ -24,112 +17,47 @@ import static java.lang.Character.isWhitespace; */ public class Trie { - private final TrieConfig trieConfig; + private final PayloadTrie payloadTrie; - private final State rootState; - - private Trie(final TrieConfig trieConfig) { - this.trieConfig = trieConfig; - this.rootState = new State(); - } - - /** - * Used by the builder to add a text search keyword. - * - * @param keyword The search term to add to the list of search terms. - * @throws NullPointerException if the keyword is null. - */ - private void addKeyword(String keyword) { - if (keyword.isEmpty()) { - return; - } - - if (isCaseInsensitive()) { - keyword = keyword.toLowerCase(); - } - - addState(keyword).addEmit(keyword); - } - - /** - * Delegates to addKeyword. - * - * @param keywords List of search term to add to the list of search terms. - */ - private void addKeywords(final String[] keywords) { - for (final String keyword : keywords) { - addKeyword(keyword); - } - } - - /** - * Delegates to addKeyword. - * - * @param keywords List of search term to add to the list of search terms. 
- */ - private void addKeywords(final Collection keywords) { - for (final String keyword : keywords) { - addKeyword(keyword); - } - } - - private State addState(final String keyword) { - return getRootState().addState(keyword); + private Trie(final PayloadTrie payloadTrie) { + this.payloadTrie = payloadTrie; } public Collection tokenize(final String text) { - final Collection tokens = new ArrayList<>(); - final Collection collectedEmits = parseText(text); - int lastCollectedPosition = -1; - - for (final Emit emit : collectedEmits) { - if (emit.getStart() - lastCollectedPosition > 1) { - tokens.add(createFragment(emit, text, lastCollectedPosition)); - } - - tokens.add(createMatch(emit, text)); - lastCollectedPosition = emit.getEnd(); - } - - if (text.length() - lastCollectedPosition > 1) { - tokens.add(createFragment(null, text, lastCollectedPosition)); - } - - return tokens; + Collection> tokens = this.payloadTrie.tokenize(text); + return asTokens(tokens); } - private Token createFragment(final Emit emit, final String text, final int lastCollectedPosition) { - return new FragmentToken(text.substring(lastCollectedPosition + 1, emit == null ? 
text.length() : emit.getStart())); + private static Collection asTokens(Collection> tokens) { + Collection result = new ArrayList<>(); + for (PayloadToken payloadToken : tokens) { + result.add(new DefaultToken(payloadToken)); + } + return result; } - private Token createMatch(Emit emit, String text) { - return new MatchToken(text.substring(emit.getStart(), emit.getEnd() + 1), emit); + private static Collection asEmits(Collection> emits) { + Collection result = new ArrayList<>(); + for (PayloadEmit emit : emits) { + result.add(asEmit(emit)); + } + return result; + } + + private static Emit asEmit(PayloadEmit payloadEmit) { + return new Emit(payloadEmit.getStart(), payloadEmit.getEnd(), payloadEmit.getKeyword()); } public Collection parseText(final CharSequence text) { - return parseText(text, new DefaultEmitHandler()); + Collection> parsedText = this.payloadTrie.parseText(text); + return asEmits(parsedText); } @SuppressWarnings("unchecked") public Collection parseText(final CharSequence text, final StatefulEmitHandler emitHandler) { - parseText(text, (EmitHandler) emitHandler); - - final List collectedEmits = emitHandler.getEmits(); - - if (trieConfig.isOnlyWholeWords()) { - removePartialMatches(text, collectedEmits); - } - - if (trieConfig.isOnlyWholeWordsWhiteSpaceSeparated()) { - removePartialMatchesWhiteSpaceSeparated(text, collectedEmits); - } - - if (!trieConfig.isAllowOverlaps()) { - IntervalTree intervalTree = new IntervalTree((List) (List) collectedEmits); - intervalTree.removeOverlaps((List) (List) collectedEmits); - } - - return collectedEmits; + Collection> parsedText = this.payloadTrie.parseText(text, + new StatefulPayloadEmitDelegateHandler(emitHandler)); + return asEmits(parsedText); } public boolean containsMatch(final CharSequence text) { @@ -137,21 +65,7 @@ public class Trie { } public void parseText(final CharSequence text, final EmitHandler emitHandler) { - State currentState = getRootState(); - - for (int position = 0; position < text.length(); 
position++) { - Character character = text.charAt(position); - - // TODO: Maybe lowercase the entire string at once? - if (trieConfig.isCaseInsensitive()) { - character = Character.toLowerCase(character); - } - - currentState = getState(currentState, character); - if (storeEmits(position, currentState, emitHandler) && trieConfig.isStopOnHit()) { - return; - } - } + this.payloadTrie.parseText(text, new PayloadEmitDelegateHandler(emitHandler)); } /** @@ -161,148 +75,8 @@ public class Trie { * @return null if no matches found. */ public Emit firstMatch(final CharSequence text) { - if (!trieConfig.isAllowOverlaps()) { - // Slow path. Needs to find all the matches to detect overlaps. - final Collection parseText = parseText(text); - - if (parseText != null && !parseText.isEmpty()) { - return parseText.iterator().next(); - } - } else { - // Fast path. Returns first match found. - State currentState = getRootState(); - - for (int position = 0; position < text.length(); position++) { - Character character = text.charAt(position); - - // TODO: Lowercase the entire string at once? 
- if (trieConfig.isCaseInsensitive()) { - character = Character.toLowerCase(character); - } - - currentState = getState(currentState, character); - Collection emitStrs = currentState.emit(); - - if (emitStrs != null && !emitStrs.isEmpty()) { - for (final String emitStr : emitStrs) { - final Emit emit = new Emit(position - emitStr.length() + 1, position, emitStr); - if (trieConfig.isOnlyWholeWords()) { - if (!isPartialMatch(text, emit)) { - return emit; - } - } else { - return emit; - } - } - } - } - } - - return null; - } - - private boolean isPartialMatch(final CharSequence searchText, final Emit emit) { - return (emit.getStart() != 0 && - Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) || - (emit.getEnd() + 1 != searchText.length() && - Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1))); - } - - private void removePartialMatches(final CharSequence searchText, final List collectedEmits) { - - final RemoveElementPredicate predicate = new RemoveElementPredicate() { - - @Override - public boolean remove(Emit emit) { - return isPartialMatch(searchText, emit); - } - - }; - - ListElementRemoval.removeIf(collectedEmits, predicate); - } - - private void removePartialMatchesWhiteSpaceSeparated(final CharSequence searchText, final List collectedEmits) { - final long size = searchText.length(); - final List removeEmits = new ArrayList<>(); - - for (final Emit emit : collectedEmits) { - if ((emit.getStart() == 0 || isWhitespace(searchText.charAt(emit.getStart() - 1))) && - (emit.getEnd() + 1 == size || isWhitespace(searchText.charAt(emit.getEnd() + 1)))) { - continue; - } - removeEmits.add(emit); - } - - for (final Emit removeEmit : removeEmits) { - collectedEmits.remove(removeEmit); - } - } - - private State getState(State currentState, final Character character) { - State newCurrentState = currentState.nextState(character); - - while (newCurrentState == null) { - currentState = currentState.failure(); - newCurrentState = 
currentState.nextState(character); - } - - return newCurrentState; - } - - private void constructFailureStates() { - final Queue queue = new LinkedBlockingDeque<>(); - final State startState = getRootState(); - - // First, set the fail state of all depth 1 states to the root state - for (State depthOneState : startState.getStates()) { - depthOneState.setFailure(startState); - queue.add(depthOneState); - } - - // Second, determine the fail state for all depth > 1 state - while (!queue.isEmpty()) { - final State currentState = queue.remove(); - - for (final Character transition : currentState.getTransitions()) { - State targetState = currentState.nextState(transition); - queue.add(targetState); - - State traceFailureState = currentState.failure(); - while (traceFailureState.nextState(transition) == null) { - traceFailureState = traceFailureState.failure(); - } - - final State newFailureState = traceFailureState.nextState(transition); - targetState.setFailure(newFailureState); - targetState.addEmit(newFailureState.emit()); - } - } - } - - private boolean storeEmits(final int position, final State currentState, final EmitHandler emitHandler) { - boolean emitted = false; - final Collection emits = currentState.emit(); - - // TODO: The check for empty might be superfluous. 
- if (emits != null && !emits.isEmpty()) { - for (final String emit : emits) { - emitted = emitHandler.emit(new Emit(position - emit.length() + 1, position, emit)) || emitted; - if (emitted && trieConfig.isStopOnHit()) { - break; - } - } - } - - return emitted; - } - - private boolean isCaseInsensitive() { - return trieConfig.isCaseInsensitive(); - } - - private State getRootState() { - return this.rootState; + PayloadEmit firstMatch = this.payloadTrie.firstMatch(text); + return new Emit(firstMatch.getStart(), firstMatch.getEnd(), firstMatch.getKeyword()); } /** @@ -318,7 +92,9 @@ public class Trie { private final TrieConfig trieConfig = new TrieConfig(); - private final Trie trie = new Trie(trieConfig); + private final PayloadTrie trie = new PayloadTrie<>(trieConfig); + + private final PayloadTrieBuilder delegate = PayloadTrie.builder(); /** * Default (empty) constructor. @@ -327,15 +103,16 @@ public class Trie { } /** - * Configure the Trie to ignore case when searching for keywords in - * the text. This must be called before calling addKeyword because - * the algorithm converts keywords to lowercase as they are added, - * depending on this case sensitivity setting. + * Configure the Trie to ignore case when searching for keywords in the text. + * This must be called before calling addKeyword because the algorithm converts + * keywords to lowercase as they are added, depending on this case sensitivity + * setting. * * @return This builder. */ public TrieBuilder ignoreCase() { - this.trieConfig.setCaseInsensitive(true); + delegate.ignoreCase(); +// this.trieConfig.setCaseInsensitive(true); return this; } @@ -345,7 +122,7 @@ public class Trie { * @return This builder. */ public TrieBuilder ignoreOverlaps() { - this.trieConfig.setAllowOverlaps(false); + delegate.ignoreOverlaps(); return this; } @@ -357,7 +134,7 @@ public class Trie { * @throws NullPointerException if the keyword is null. 
*/ public TrieBuilder addKeyword(final String keyword) { - this.trie.addKeyword(keyword); + delegate.addKeyword(keyword, null); return this; } @@ -368,7 +145,9 @@ public class Trie { * @return This builder. */ public TrieBuilder addKeywords(final String... keywords) { - this.trie.addKeywords(keywords); + for (String keyword : keywords) { + delegate.addKeyword(keyword, null); + } return this; } @@ -379,7 +158,9 @@ public class Trie { * @return This builder. */ public TrieBuilder addKeywords(final Collection keywords) { - this.trie.addKeywords(keywords); + for (String keyword : keywords) { + this.delegate.addKeyword(keyword, null); + } return this; } @@ -389,30 +170,29 @@ public class Trie { * @return This builder. */ public TrieBuilder onlyWholeWords() { - this.trieConfig.setOnlyWholeWords(true); + this.delegate.onlyWholeWords(); return this; } /** - * Configure the Trie to match whole keywords that are separated by - * whitespace in the text. For example, "this keyword thatkeyword" - * would only match the first occurrence of "keyword". + * Configure the Trie to match whole keywords that are separated by whitespace + * in the text. For example, "this keyword thatkeyword" would only match the + * first occurrence of "keyword". * * @return This builder. */ public TrieBuilder onlyWholeWordsWhiteSpaceSeparated() { - this.trieConfig.setOnlyWholeWordsWhiteSpaceSeparated(true); + this.delegate.onlyWholeWordsWhiteSpaceSeparated(); return this; } /** - * Configure the Trie to stop after the first keyword is found in the - * text. + * Configure the Trie to stop after the first keyword is found in the text. * * @return This builder. */ public TrieBuilder stopOnHit() { - trie.trieConfig.setStopOnHit(true); + this.delegate.stopOnHit(); return this; } @@ -422,8 +202,8 @@ public class Trie { * @return The configured Trie. 
*/ public Trie build() { - this.trie.constructFailureStates(); - return this.trie; + PayloadTrie payloadTrie = this.delegate.build(); + return new Trie(payloadTrie); } /** diff --git a/src/main/java/org/ahocorasick/trie/handler/AbstractStatefulPayloadEmitHandler.java b/src/main/java/org/ahocorasick/trie/handler/AbstractStatefulPayloadEmitHandler.java new file mode 100644 index 0000000..6d5d088 --- /dev/null +++ b/src/main/java/org/ahocorasick/trie/handler/AbstractStatefulPayloadEmitHandler.java @@ -0,0 +1,21 @@ +package org.ahocorasick.trie.handler; + +import java.util.ArrayList; +import java.util.List; + +import org.ahocorasick.trie.PayloadEmit; + +public abstract class AbstractStatefulPayloadEmitHandler implements StatefulPayloadEmitHandler { + + private final List> emits = new ArrayList<>(); + + public void addEmit(final PayloadEmit emit) { + this.emits.add(emit); + } + + @Override + public List> getEmits() { + return this.emits; + } + +} diff --git a/src/main/java/org/ahocorasick/trie/handler/DefaultEmitHandler.java b/src/main/java/org/ahocorasick/trie/handler/DefaultEmitHandler.java index 0e9236f..80a18c1 100644 --- a/src/main/java/org/ahocorasick/trie/handler/DefaultEmitHandler.java +++ b/src/main/java/org/ahocorasick/trie/handler/DefaultEmitHandler.java @@ -1,10 +1,10 @@ package org.ahocorasick.trie.handler; -import org.ahocorasick.trie.Emit; - import java.util.ArrayList; import java.util.List; +import org.ahocorasick.trie.Emit; + public class DefaultEmitHandler implements StatefulEmitHandler { private final List emits = new ArrayList<>(); diff --git a/src/main/java/org/ahocorasick/trie/handler/DefaultPayloadEmitHandler.java b/src/main/java/org/ahocorasick/trie/handler/DefaultPayloadEmitHandler.java new file mode 100644 index 0000000..8e7b1c3 --- /dev/null +++ b/src/main/java/org/ahocorasick/trie/handler/DefaultPayloadEmitHandler.java @@ -0,0 +1,22 @@ +package org.ahocorasick.trie.handler; + +import java.util.ArrayList; +import java.util.List; + +import 
org.ahocorasick.trie.PayloadEmit; + +public class DefaultPayloadEmitHandler implements StatefulPayloadEmitHandler { + + private final List> emits = new ArrayList<>(); + + @Override + public boolean emit(final PayloadEmit emit) { + this.emits.add(emit); + return true; + } + + @Override + public List> getEmits() { + return this.emits; + } +} diff --git a/src/main/java/org/ahocorasick/trie/handler/PayloadEmitDelegateHandler.java b/src/main/java/org/ahocorasick/trie/handler/PayloadEmitDelegateHandler.java new file mode 100644 index 0000000..3ec3a34 --- /dev/null +++ b/src/main/java/org/ahocorasick/trie/handler/PayloadEmitDelegateHandler.java @@ -0,0 +1,24 @@ +package org.ahocorasick.trie.handler; + +import org.ahocorasick.trie.Emit; +import org.ahocorasick.trie.PayloadEmit; + +/** + * Convenience wrapper class that delegates every method to a EmitHandler. + */ +public class PayloadEmitDelegateHandler implements PayloadEmitHandler { + + private EmitHandler handler; + + public PayloadEmitDelegateHandler(EmitHandler handler) { + this.handler = handler; + + } + + @Override + public boolean emit(PayloadEmit emit) { + Emit newEmit = new Emit(emit.getStart(), emit.getEnd(), emit.getKeyword()); + return handler.emit(newEmit); + } + +} diff --git a/src/main/java/org/ahocorasick/trie/handler/PayloadEmitHandler.java b/src/main/java/org/ahocorasick/trie/handler/PayloadEmitHandler.java new file mode 100644 index 0000000..173c712 --- /dev/null +++ b/src/main/java/org/ahocorasick/trie/handler/PayloadEmitHandler.java @@ -0,0 +1,7 @@ +package org.ahocorasick.trie.handler; + +import org.ahocorasick.trie.PayloadEmit; + +public interface PayloadEmitHandler { + boolean emit(PayloadEmit emit); +} diff --git a/src/main/java/org/ahocorasick/trie/handler/StatefulPayloadEmitDelegateHandler.java b/src/main/java/org/ahocorasick/trie/handler/StatefulPayloadEmitDelegateHandler.java new file mode 100644 index 0000000..395517b --- /dev/null +++ 
b/src/main/java/org/ahocorasick/trie/handler/StatefulPayloadEmitDelegateHandler.java @@ -0,0 +1,42 @@ +package org.ahocorasick.trie.handler; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +import org.ahocorasick.trie.Emit; +import org.ahocorasick.trie.PayloadEmit; + +/** + * Convenience wrapper class that delegates every method to a + * StatefullPayloadEmitHandler. + */ +public class StatefulPayloadEmitDelegateHandler implements StatefulPayloadEmitHandler { + + private StatefulEmitHandler handler; + + public StatefulPayloadEmitDelegateHandler(StatefulEmitHandler handler) { + this.handler = handler; + + } + + private static List> asEmits(Collection emits) { + List> result = new ArrayList<>(); + for (Emit emit : emits) { + result.add(new PayloadEmit(emit.getStart(), emit.getEnd(), emit.getKeyword(), null)); + } + return result; + } + + @Override + public boolean emit(PayloadEmit emit) { + Emit newEmit = new Emit(emit.getStart(), emit.getEnd(), emit.getKeyword()); + return handler.emit(newEmit); + } + + @Override + public List> getEmits() { + List emits = this.handler.getEmits(); + return asEmits(emits); + } +} diff --git a/src/main/java/org/ahocorasick/trie/handler/StatefulPayloadEmitHandler.java b/src/main/java/org/ahocorasick/trie/handler/StatefulPayloadEmitHandler.java new file mode 100644 index 0000000..bb42049 --- /dev/null +++ b/src/main/java/org/ahocorasick/trie/handler/StatefulPayloadEmitHandler.java @@ -0,0 +1,9 @@ +package org.ahocorasick.trie.handler; + +import java.util.List; + +import org.ahocorasick.trie.PayloadEmit; + +public interface StatefulPayloadEmitHandler extends PayloadEmitHandler{ + List> getEmits(); +} diff --git a/src/test/java/org/ahocorasick/trie/PayloadTrieTest.java b/src/test/java/org/ahocorasick/trie/PayloadTrieTest.java new file mode 100644 index 0000000..ecb8cdc --- /dev/null +++ b/src/test/java/org/ahocorasick/trie/PayloadTrieTest.java @@ -0,0 +1,510 @@ +package org.ahocorasick.trie; + +import 
static junit.framework.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Iterator; +import java.util.List; +import java.util.concurrent.ThreadLocalRandom; + +import org.ahocorasick.trie.handler.AbstractStatefulPayloadEmitHandler; +import org.ahocorasick.trie.handler.PayloadEmitHandler; +import org.ahocorasick.trie.handler.StatefulPayloadEmitHandler; +import org.junit.Test; + +public class PayloadTrieTest { + + private final static String[] ALPHABET = new String[] { "abc", "bcd", "cde" }; + private final static String[] ALPHABET_PAYLOAD = new String[] { "alpha:abc", "alpha:bcd", "alpha:cde" }; + + private final static List> ALPHABET_WITH_PAYLOADS = Arrays.asList(// + new Payload(ALPHABET[0], ALPHABET_PAYLOAD[0]), // + new Payload(ALPHABET[1], ALPHABET_PAYLOAD[1]), // + new Payload(ALPHABET[2], ALPHABET_PAYLOAD[2])); + + private final static String[] PRONOUNS = new String[] { "hers", "his", "she", "he" }; + private final static int[] PRONOUNS_PAYLOAD_ID = new int[] { 9, 12, 4, 20 }; + + private final static List> PRONOUNS_WITH_PAYLOADS = Arrays.asList(// + new Payload(PRONOUNS[0], PRONOUNS_PAYLOAD_ID[0]), // + new Payload(PRONOUNS[1], PRONOUNS_PAYLOAD_ID[1]), // + new Payload(PRONOUNS[2], PRONOUNS_PAYLOAD_ID[2]), // + new Payload(PRONOUNS[3], PRONOUNS_PAYLOAD_ID[3]) // + ); + + private final static String[] FOOD = new String[] { "veal", "cauliflower", "broccoli", "tomatoes" }; + private final static Food[] FOOD_PAYLOAD = new Food[] { new Food("veal"), new Food("cauliflower"), new Food("broccoli"), + new Food("tomatoes") }; + + private final static List> FOOD_WITH_PAYLOADS = Arrays.asList(// + new Payload(FOOD[0], FOOD_PAYLOAD[0]), // + new Payload(FOOD[1], FOOD_PAYLOAD[1]), // + new Payload(FOOD[2], FOOD_PAYLOAD[2]), // + new Payload(FOOD[3], FOOD_PAYLOAD[3]) // + ); + + private final static String[] GREEK_LETTERS = new String[] { "Alpha", "Beta", 
"Gamma" }; + private final static String[] GREEK_LETTERS_PAYLOAD = new String[] { "greek:Alpha", "greek:Beta", "greek:Gamma" }; + + private final static List> GREEK_LETTERS_WITH_PAYLOADS = Arrays.asList(// + new Payload(GREEK_LETTERS[0], GREEK_LETTERS_PAYLOAD[0]), // + new Payload(GREEK_LETTERS[1], GREEK_LETTERS_PAYLOAD[1]), // + new Payload(GREEK_LETTERS[2], GREEK_LETTERS_PAYLOAD[2])); + + private final static String[] UNICODE = new String[] { "turning", "once", "again", "börkü" }; + private final static String[] UNICODE_PAYLOAD = new String[] { "uni:turning", "uni:once", "uni:again", "uni:börkü" }; + + private final static List> UNICODE_WITH_PAYLOADS = Arrays.asList(// + new Payload(UNICODE[0], UNICODE_PAYLOAD[0]), // + new Payload(UNICODE[1], UNICODE_PAYLOAD[1]), // + new Payload(UNICODE[2], UNICODE_PAYLOAD[2]), // + new Payload(UNICODE[3], UNICODE_PAYLOAD[3])); + + public static class Food { + private final String name; + + public Food(String name) { + this.name = name; + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + ((name == null) ? 
0 : name.hashCode()); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + Food other = (Food) obj; + if (name == null) { + if (other.name != null) + return false; + } else if (!name.equals(other.name)) + return false; + return true; + } + } + + @Test + public void keywordAndTextAreTheSame() { + PayloadTrie trie = PayloadTrie.builder().addKeyword(ALPHABET[0], ALPHABET_PAYLOAD[0]).build(); + Collection> emits = trie.parseText(ALPHABET[0]); + Iterator> iterator = emits.iterator(); + checkEmit(iterator.next(), 0, 2, ALPHABET[0], ALPHABET_PAYLOAD[0]); + } + + @Test + public void keywordAndTextAreTheSameFirstMatch() { + PayloadTrie trie = PayloadTrie.builder().addKeyword(ALPHABET[0], ALPHABET_PAYLOAD[0]).build(); + PayloadEmit firstMatch = trie.firstMatch(ALPHABET[0]); + checkEmit(firstMatch, 0, 2, ALPHABET[0], ALPHABET_PAYLOAD[0]); + } + + @Test + public void textIsLongerThanKeyword() { + PayloadTrie trie = PayloadTrie.builder().addKeyword(ALPHABET[0], ALPHABET_PAYLOAD[0]).build(); + Collection> emits = trie.parseText(" " + ALPHABET[0]); + Iterator> iterator = emits.iterator(); + checkEmit(iterator.next(), 1, 3, ALPHABET[0], ALPHABET_PAYLOAD[0]); + } + + @Test + public void textIsLongerThanKeywordFirstMatch() { + + PayloadTrie trie = PayloadTrie.builder().addKeyword(ALPHABET[0], ALPHABET_PAYLOAD[0]).build(); + PayloadEmit firstMatch = trie.firstMatch(" " + ALPHABET[0]); + checkEmit(firstMatch, 1, 3, ALPHABET[0], ALPHABET_PAYLOAD[0]); + } + + @Test + public void variousKeywordsOneMatch() { + PayloadTrie trie = PayloadTrie.builder().addKeywords(ALPHABET_WITH_PAYLOADS).build(); + Collection> emits = trie.parseText("bcd"); + Iterator> iterator = emits.iterator(); + checkEmit(iterator.next(), 0, 2, "bcd", "alpha:bcd"); + } + + @Test + public void variousKeywordsFirstMatch() { + PayloadTrie trie = 
PayloadTrie.builder().addKeywords(ALPHABET_WITH_PAYLOADS).build(); + PayloadEmit firstMatch = trie.firstMatch("bcd"); + checkEmit(firstMatch, 0, 2, "bcd", "alpha:bcd"); + } + + @Test + public void ushersTestAndStopOnHit() { + PayloadTrie trie = PayloadTrie.builder().addKeywords(PRONOUNS_WITH_PAYLOADS).stopOnHit().build(); + Collection> emits = trie.parseText("ushers"); + assertEquals(1, emits.size()); // she @ 3, he @ 3, hers @ 5 + Iterator> iterator = emits.iterator(); + checkEmit(iterator.next(), 2, 3, "he", 20); + } + + @Test + public void ushersTestStopOnHitSkipOne() { + PayloadTrie trie = PayloadTrie.builder().addKeywords(PRONOUNS_WITH_PAYLOADS).stopOnHit().build(); + + StatefulPayloadEmitHandler testEmitHandler = new AbstractStatefulPayloadEmitHandler() { + boolean first = true; + + @Override + public boolean emit(final PayloadEmit emit) { + if (first) { + // return false for the first element + first = false; + return false; + } + addEmit(emit); + return true; + } + + }; + + trie.parseText("ushers", testEmitHandler); + Collection> emits = testEmitHandler.getEmits(); + assertEquals(1, emits.size()); // she @ 3, he @ 3, hers @ 5 + Iterator> iterator = emits.iterator(); + checkEmit(iterator.next(), 1, 3, "she", 4); + } + + @Test + public void ushersTest() { + PayloadTrie trie = PayloadTrie.builder().addKeywords(PRONOUNS_WITH_PAYLOADS).build(); + Collection> emits = trie.parseText("ushers"); + assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5 + Iterator> iterator = emits.iterator(); + + checkEmit(iterator.next(), 2, 3, "he", 20); + checkEmit(iterator.next(), 1, 3, "she", 4); + checkEmit(iterator.next(), 2, 5, "hers", 9); + } + + @Test + public void ushersTestWithCapitalKeywords() { + PayloadTrie trie = PayloadTrie.builder().ignoreCase().addKeyword("HERS", "hers").addKeyword("HIS", "his") + .addKeyword("SHE", "she").addKeyword("HE", "he").build(); + Collection> emits = trie.parseText("ushers"); + assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 
5 + Iterator> iterator = emits.iterator(); + checkEmit(iterator.next(), 2, 3, "he", "he"); + checkEmit(iterator.next(), 1, 3, "she", "she"); + checkEmit(iterator.next(), 2, 5, "hers", "hers"); + } + + @Test + public void ushersTestFirstMatch() { + PayloadTrie trie = PayloadTrie.builder().addKeywords(PRONOUNS_WITH_PAYLOADS).build(); + PayloadEmit firstMatch = trie.firstMatch("ushers"); + checkEmit(firstMatch, 2, 3, "he", 20); + } + + @Test + public void ushersTestByCallback() { + PayloadTrie trie = PayloadTrie.builder().addKeywords(PRONOUNS_WITH_PAYLOADS).build(); + + final List> emits = new ArrayList<>(); + PayloadEmitHandler emitHandler = new PayloadEmitHandler() { + + @Override + public boolean emit(PayloadEmit emit) { + emits.add(emit); + return true; + } + }; + trie.parseText("ushers", emitHandler); + assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5 + Iterator> iterator = emits.iterator(); + + checkEmit(iterator.next(), 2, 3, "he", 20); + checkEmit(iterator.next(), 1, 3, "she", 4); + checkEmit(iterator.next(), 2, 5, "hers", 9); + } + + @Test + public void misleadingTest() { + PayloadTrie trie = PayloadTrie.builder().addKeyword("hers", "pronon:hers").build(); + Collection> emits = trie.parseText("h he her hers"); + Iterator> iterator = emits.iterator(); + checkEmit(iterator.next(), 9, 12, "hers", "pronon:hers"); + } + + @Test + public void misleadingTestFirstMatch() { + PayloadTrie trie = PayloadTrie.builder().addKeyword("hers", "pronon:hers").build(); + PayloadEmit firstMatch = trie.firstMatch("h he her hers"); + checkEmit(firstMatch, 9, 12, "hers", "pronon:hers"); + } + + @Test + public void recipes() { + PayloadTrie trie = PayloadTrie.builder().addKeywords(FOOD_WITH_PAYLOADS).build(); + Collection> emits = trie.parseText("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli"); + Iterator> iterator = emits.iterator(); + checkEmit(iterator.next(), 2, 12, "cauliflower", new Food("cauliflower")); + checkEmit(iterator.next(), 18, 25, "tomatoes", 
new Food("tomatoes")); + checkEmit(iterator.next(), 40, 43, "veal", new Food("veal")); + checkEmit(iterator.next(), 51, 58, "broccoli", new Food("broccoli")); + } + + @Test + public void recipesFirstMatch() { + PayloadTrie trie = PayloadTrie.builder().addKeywords(FOOD_WITH_PAYLOADS).build(); + PayloadEmit firstMatch = trie.firstMatch("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli"); + checkEmit(firstMatch, 2, 12, "cauliflower", new Food("cauliflower")); + } + + @Test + public void longAndShortOverlappingMatch() { + PayloadTrie trie = PayloadTrie.builder().addKeyword("he", "pronon:he").addKeyword("hehehehe", "garbage") + .build(); + Collection> emits = trie.parseText("hehehehehe"); + Iterator> iterator = emits.iterator(); + checkEmit(iterator.next(), 0, 1, "he", "pronon:he"); + checkEmit(iterator.next(), 2, 3, "he", "pronon:he"); + checkEmit(iterator.next(), 4, 5, "he", "pronon:he"); + checkEmit(iterator.next(), 6, 7, "he", "pronon:he"); + checkEmit(iterator.next(), 0, 7, "hehehehe", "garbage"); + checkEmit(iterator.next(), 8, 9, "he", "pronon:he"); + checkEmit(iterator.next(), 2, 9, "hehehehe", "garbage"); + } + + @Test + public void nonOverlapping() { + PayloadTrie trie = PayloadTrie.builder().removeOverlaps().addKeyword("ab", "alpha:ab") + .addKeyword("cba", "alpha:cba").addKeyword("ababc", "alpha:ababc").build(); + Collection> emits = trie.parseText("ababcbab"); + assertEquals(2, emits.size()); + Iterator> iterator = emits.iterator(); + // With overlaps: ab@1, ab@3, ababc@4, cba@6, ab@7 + checkEmit(iterator.next(), 0, 4, "ababc", "alpha:ababc"); + checkEmit(iterator.next(), 6, 7, "ab", "alpha:ab"); + } + + @Test + public void nonOverlappingFirstMatch() { + PayloadTrie trie = PayloadTrie.builder().removeOverlaps().addKeyword("ab", "alpha:ab") + .addKeyword("cba", "alpha:cba").addKeyword("ababc", "alpha:ababc").build(); + PayloadEmit firstMatch = trie.firstMatch("ababcbab"); + + checkEmit(firstMatch, 0, 4, "ababc", "alpha:ababc"); + } + + @Test + 
public void containsMatch() { + PayloadTrie trie = PayloadTrie.builder().removeOverlaps().addKeyword("ab", "alpha:ab") + .addKeyword("cba", "alpha:cba").addKeyword("ababc", "alpha:ababc").build(); + assertTrue(trie.containsMatch("ababcbab")); + } + + @Test + public void startOfChurchillSpeech() { + PayloadTrie trie = PayloadTrie.builder().removeOverlaps().addKeyword("T").addKeyword("u").addKeyword("ur") + .addKeyword("r").addKeyword("urn").addKeyword("ni").addKeyword("i").addKeyword("in").addKeyword("n") + .addKeyword("urning").build(); + Collection> emits = trie.parseText("Turning"); + assertEquals(2, emits.size()); + } + + @Test + public void partialMatch() { + PayloadTrie trie = PayloadTrie.builder().onlyWholeWords().addKeyword("sugar", "food:sugar").build(); + Collection> emits = trie.parseText("sugarcane sugarcane sugar canesugar"); // left, middle, right test + assertEquals(1, emits.size()); // Match must not be made + checkEmit(emits.iterator().next(), 20, 24, "sugar", "food:sugar"); + } + + @Test + public void partialMatchFirstMatch() { + PayloadTrie trie = PayloadTrie.builder().onlyWholeWords().addKeyword("sugar", "food:sugar").build(); + PayloadEmit firstMatch = trie.firstMatch("sugarcane sugarcane sugar canesugar"); // left, middle, right test + + checkEmit(firstMatch, 20, 24, "sugar", "food:sugar"); + } + + @Test + public void tokenizeFullSentence() { + PayloadTrie trie = PayloadTrie.builder().addKeywords(GREEK_LETTERS_WITH_PAYLOADS).build(); + Collection> tokens = trie.tokenize("Hear: Alpha team first, Beta from the rear, Gamma in reserve"); + assertEquals(7, tokens.size()); + Iterator> tokensIt = tokens.iterator(); + assertEquals("Hear: ", tokensIt.next().getFragment()); + assertEquals("Alpha", tokensIt.next().getFragment()); + assertEquals(" team first, ", tokensIt.next().getFragment()); + assertEquals("Beta", tokensIt.next().getFragment()); + assertEquals(" from the rear, ", tokensIt.next().getFragment()); + assertEquals("Gamma", 
tokensIt.next().getFragment()); + assertEquals(" in reserve", tokensIt.next().getFragment()); + } + + // @see https://github.com/robert-bor/aho-corasick/issues/5 + @Test + public void testStringIndexOutOfBoundsException() { + PayloadTrie trie = PayloadTrie.builder().ignoreCase().onlyWholeWords().addKeywords(UNICODE_WITH_PAYLOADS) + .build(); + Collection> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ"); + assertEquals(4, emits.size()); // Match must not be made + Iterator> it = emits.iterator(); + + checkEmit(it.next(), 0, 6, "turning", "uni:turning"); + checkEmit(it.next(), 8, 11, "once", "uni:once"); + checkEmit(it.next(), 13, 17, "again", "uni:again"); + checkEmit(it.next(), 19, 23, "börkü", "uni:börkü"); + } + + @Test + public void testIgnoreCase() { + PayloadTrie trie = PayloadTrie.builder().ignoreCase().addKeywords(UNICODE_WITH_PAYLOADS).build(); + Collection> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ"); + assertEquals(4, emits.size()); // Match must not be made + Iterator> it = emits.iterator(); + + checkEmit(it.next(), 0, 6, "turning", "uni:turning"); + checkEmit(it.next(), 8, 11, "once", "uni:once"); + checkEmit(it.next(), 13, 17, "again", "uni:again"); + checkEmit(it.next(), 19, 23, "börkü", "uni:börkü"); + } + + @Test + public void testIgnoreCaseFirstMatch() { + PayloadTrie trie = PayloadTrie.builder().ignoreCase().addKeywords(UNICODE_WITH_PAYLOADS).build(); + PayloadEmit firstMatch = trie.firstMatch("TurninG OnCe AgAiN BÖRKÜ"); + + checkEmit(firstMatch, 0, 6, "turning", "uni:turning"); + } + + @Test + public void tokenizeTokensInSequence() { + PayloadTrie trie = PayloadTrie.builder().addKeywords(GREEK_LETTERS_WITH_PAYLOADS).build(); + Collection> tokens = trie.tokenize("Alpha Beta Gamma"); + assertEquals(5, tokens.size()); + } + + // @see https://github.com/robert-bor/aho-corasick/issues/7 + @Test + public void testZeroLength() { + PayloadTrie trie = PayloadTrie.builder().ignoreOverlaps().onlyWholeWords().ignoreCase().addKeyword("") + 
.build(); + trie.tokenize( + "Try a natural lip and subtle bronzer to keep all the focus on those big bright eyes with NARS Eyeshadow Duo in Rated R And the winner is... Boots No7 Advanced Renewal Anti-ageing Glycolic Peel Kit ($25 amazon.com) won most-appealing peel."); + } + + // @see https://github.com/robert-bor/aho-corasick/issues/8 + @Test + public void testUnicode1() { + String target = "LİKE THIS"; // The second character ('İ') is Unicode, which was read by AC as a 2-byte char + assertEquals("THIS", target.substring(5, 9)); // Java does it the right way + PayloadTrie trie = PayloadTrie.builder().ignoreCase().onlyWholeWords().addKeyword("this", "pronon:this") + .build(); + Collection> emits = trie.parseText(target); + assertEquals(1, emits.size()); + Iterator> it = emits.iterator(); + checkEmit(it.next(), 5, 8, "this", "pronon:this"); + } + + // @see https://github.com/robert-bor/aho-corasick/issues/8 + @Test + public void testUnicode2() { + String target = "LİKE THIS"; // The second character ('İ') is Unicode, which was read by AC as a 2-byte char + PayloadTrie trie = PayloadTrie.builder().ignoreCase().onlyWholeWords().addKeyword("this", "pronon:this") + .build(); + assertEquals("THIS", target.substring(5, 9)); // Java does it the right way + PayloadEmit firstMatch = trie.firstMatch(target); + checkEmit(firstMatch, 5, 8, "this", "pronon:this"); + } + + @Test + public void testPartialMatchWhiteSpaces() { + PayloadTrie trie = PayloadTrie.builder().onlyWholeWordsWhiteSpaceSeparated() + .addKeyword("#sugar-123", "sugar").build(); + Collection> emits = trie.parseText("#sugar-123 #sugar-1234"); // left, middle, right test + assertEquals(1, emits.size()); // Match must not be made + checkEmit(emits.iterator().next(), 0, 9, "#sugar-123", "sugar"); + } + + @Test + public void testLargeString() { + final int interval = 100; + final int textSize = 1000000; + final String keyword = FOOD[1]; + final Food payload = FOOD_PAYLOAD[1]; + final StringBuilder text = 
randomNumbers(textSize); + + injectKeyword(text, keyword, interval); + + PayloadTrie trie = PayloadTrie.builder().onlyWholeWords().addKeyword(keyword, payload).build(); + + final Collection> emits = trie.parseText(text); + + assertEquals(textSize / interval, emits.size()); + } + + /** + * Generates a random sequence of ASCII numbers. + * + * @param count The number of numbers to generate. + * @return A character sequence filled with random digits. + */ + private StringBuilder randomNumbers(int count) { + final StringBuilder sb = new StringBuilder(count); + + while (--count > 0) { + sb.append(randomInt(0, 10)); + } + + return sb; + } + + /** + * Injects keywords into a string builder. + * + * @param source Should contain a bunch of random data that cannot match any + * keyword. + * @param keyword A keyword to inject repeatedly in the text. + * @param interval How often to inject the keyword. + */ + private void injectKeyword(final StringBuilder source, final String keyword, final int interval) { + final int length = source.length(); + for (int i = 0; i < length; i += interval) { + source.replace(i, i + keyword.length(), keyword); + } + } + + private int randomInt(final int min, final int max) { + return ThreadLocalRandom.current().nextInt(min, max); + } + + private void checkEmit(PayloadEmit next, int expectedStart, int expectedEnd, String expectedKeyword, + Food expectedPayload) { + assertEquals("Start of emit should have been " + expectedStart, expectedStart, next.getStart()); + assertEquals("End of emit should have been " + expectedEnd, expectedEnd, next.getEnd()); + assertEquals("Keyword of emit shoud be " + expectedKeyword, expectedKeyword, next.getKeyword()); + assertEquals("Payload of emit shoud be " + expectedPayload, expectedPayload, next.getPayload()); + } + + private void checkEmit(PayloadEmit next, int expectedStart, int expectedEnd, String expectedKeyword, + Integer expectedPayload) { + assertEquals("Start of emit should have been " + expectedStart, 
expectedStart, next.getStart()); + assertEquals("End of emit should have been " + expectedEnd, expectedEnd, next.getEnd()); + assertEquals("Keyword of emit shoud be " + expectedKeyword, expectedKeyword, next.getKeyword()); + assertEquals("Payload of emit shoud be " + expectedPayload, expectedPayload, next.getPayload()); + } + + private void checkEmit(PayloadEmit next, int expectedStart, int expectedEnd, String expectedKeyword, + String expectedPayload) { + assertEquals("Start of emit should have been " + expectedStart, expectedStart, next.getStart()); + assertEquals("End of emit should have been " + expectedEnd, expectedEnd, next.getEnd()); + assertEquals("Keyword of emit shoud be " + expectedKeyword, expectedKeyword, next.getKeyword()); + assertEquals("Payload of emit shoud be " + expectedPayload, expectedPayload, next.getPayload()); + } +} diff --git a/src/test/java/org/ahocorasick/trie/TrieTest.java b/src/test/java/org/ahocorasick/trie/TrieTest.java index bf01589..bb4227a 100644 --- a/src/test/java/org/ahocorasick/trie/TrieTest.java +++ b/src/test/java/org/ahocorasick/trie/TrieTest.java @@ -1,9 +1,7 @@ package org.ahocorasick.trie; -import org.ahocorasick.trie.handler.AbstractStatefulEmitHandler; -import org.ahocorasick.trie.handler.EmitHandler; -import org.ahocorasick.trie.handler.StatefulEmitHandler; -import org.junit.Test; +import static junit.framework.Assert.assertEquals; +import static org.junit.Assert.assertTrue; import java.util.ArrayList; import java.util.Collection; @@ -11,8 +9,10 @@ import java.util.Iterator; import java.util.List; import java.util.concurrent.ThreadLocalRandom; -import static junit.framework.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import org.ahocorasick.trie.handler.AbstractStatefulEmitHandler; +import org.ahocorasick.trie.handler.EmitHandler; +import org.ahocorasick.trie.handler.StatefulEmitHandler; +import org.junit.Test; public class TrieTest { private final static String[] ALPHABET = new String[]{