Compare commits

...

28 Commits

Author SHA1 Message Date
Dave Jarvis
83dfcbb82e
Merge pull request #60 from wbreeze/heartbeat
Implement word transitions
2017-11-24 08:50:46 -08:00
Douglas Lovell
360d76193a Express paramaters as final 2017-10-13 21:17:04 -03:00
Douglas Lovell
82d36386fd Add comments, rename variables, remove unused lookahead 2017-10-13 20:47:21 -03:00
Douglas Lovell
b11c655ccf Merge pull request #1 from stillleben/heartbeat
Heartbeat improvements from stillleben
2016-08-03 14:38:57 -06:00
Benni
165f18e581 Remove unused varables, remove duplicate tests (both "nonOverlappingWordTransitions" and "nonOverlappingWholeWords" contain exactly the same code) 2016-02-17 18:05:42 +01:00
Benni
06675a6074 Add support for punctuation in text: Those characters will form a separate token now. 2016-02-17 17:56:11 +01:00
Douglas Lovell
3839e406ce we want the keywords back in all cases after all, but the start and end indexes refer to the text 2015-11-02 17:50:27 -07:00
Douglas Lovell
d764017abe use word transitions for whole word only mode 2015-11-02 17:14:28 -07:00
Douglas Lovell
f89b000894 progress on leading and trailing white space problem 2015-11-02 11:49:44 -07:00
Douglas Lovell
9aa9695d38 version change for heartbeat changes 2015-11-02 11:49:14 -07:00
Douglas Lovell
c0d89cec2d add a few word transition tests 2015-10-30 16:25:04 -06:00
Douglas Lovell
f9c2d9d4aa all the tests pass 2015-10-30 15:03:06 -06:00
Douglas Lovell
b7bb0cbf5b put text positions on the transitions and track in the token stream 2015-10-30 10:58:15 -06:00
Douglas Lovell
dd5f9b25fa fix the off by ones 2015-10-29 15:29:44 -06:00
Douglas Lovell
7514478a65 Make the transition token the hash key for a transition 2015-10-29 15:28:23 -06:00
Douglas Lovell
8560af8cce test trie in word transition mode 2015-10-28 17:03:29 -06:00
Douglas Lovell
51940af6e7 test state with word transitions 2015-10-28 16:57:37 -06:00
Douglas Lovell
1f63ae71d4 StringBuffer, StringIterator are old school 2015-10-28 16:56:27 -06:00
Douglas Lovell
4be3e115b6 add builder method for setting word transitions 2015-10-28 14:10:53 -06:00
Douglas Lovell
a646f233a5 improve the option name 2015-10-28 11:17:46 -06:00
Douglas Lovell
f05026cf90 added word transitions. all tests pass 2015-10-28 10:59:45 -06:00
Douglas Lovell
6283bf039d first pass refactoring for word transitions 2015-10-27 16:55:35 -06:00
robert-bor
3393e4f51f Issue #26 tokens report if they are 100% whitespace 2015-09-27 20:56:04 +02:00
robert-bor
438e546245 Issue #25 match tokens report back whether they are whole words or not 2015-09-27 18:22:38 +02:00
robert-bor
877a56c956 Issue #24 removed if condition that checked for empty emit strings, whereas emit() always returns a collection 2015-09-27 17:58:55 +02:00
robert-bor
b42c664796 Issue #24 big cleanup, removed all post-processing methods for whole words and non-overlapping sequences and integrated the same functionality closer to the AC algorithm. 2015-09-27 17:56:42 +02:00
robert-bor
b274844b75 Issue #24 stopOnHit removed; the functionality has been replaced by the superior firstMatch 2015-09-27 14:40:04 +02:00
robert-bor
bfaa32b20e Issue #24 tokenize() method implementation extracted to separate class 2015-09-27 14:37:36 +02:00
26 changed files with 939 additions and 250 deletions

View File

@ -3,7 +3,7 @@
<groupId>org.ahocorasick</groupId>
<artifactId>ahocorasick</artifactId>
<version>0.3.1-SNAPSHOT</version>
<version>0.3.1-heartbeat</version>
<packaging>jar</packaging>
<name>Aho-CoraSick algorithm for efficient string matching</name>
<description>Java library for efficient string matching against a large set of keywords</description>
@ -104,4 +104,4 @@
</plugins>
</reporting>
</project>
</project>

View File

@ -0,0 +1,41 @@
/*
* Copyright 2015 Rogue Wave Software.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.ahocorasick.trie;
/**
* Models a trie transition on a single character.
* Every character transition is created with a length of 1.
* @author doug.lovell
*/
class CharacterTransition extends Transition<Character> {
/**
* Create a character transition from a position in the source string.
* The length handed to the superclass is always 1.
* @param c character to match
* @param start position of character in source string
*/
public CharacterTransition(Character c, int start) {
super(c, start, 1);
}
/**
* Create a character transition without regard for position.
* The start position is recorded as 0.
* @param c character to match
*/
public CharacterTransition(Character c) {
this(c, 0);
}
}

View File

@ -11,11 +11,11 @@ public class Emit extends Interval implements Intervalable {
super(start, end);
this.keyword = keyword;
}
public String getKeyword() {
return this.keyword;
}
@Override
public String toString() {
return super.toString() + "=" + this.keyword;

View File

@ -2,7 +2,7 @@ package org.ahocorasick.trie;
public class FragmentToken extends Token {
public FragmentToken(String fragment) {
public FragmentToken(final String fragment) {
super(fragment);
}
@ -15,4 +15,5 @@ public class FragmentToken extends Token {
public Emit getEmit() {
return null;
}
}

View File

@ -0,0 +1,60 @@
/*
* Copyright 2015 Rogue Wave Software.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.ahocorasick.trie;
/**
* Keyword encapsulates part of a potential match along with the count
* of prior source tokens consumed to create the potential match.
*
* @author doug.lovell
*/
public class Keyword implements Comparable {
private final String text;
private final int depth;
/**
* Create portion of potential match
* @param text content that matches
* @param depth count of prior source tokens that comprise the match
*/
public Keyword(final String text, final int depth) {
this.text = text;
this.depth = depth;
}
public int getDepth() {
return depth;
}
public String getText() {
return text;
}
public String toString() {
final String t = getText();
final int d = getDepth();
return "Keyword '" + t + "' at depth " + d;
}
@Override
public int compareTo(final Object o) {
if (o instanceof Keyword) {
return text.compareTo(((Keyword) o).text);
}
throw new IllegalArgumentException("Only supports comparison with other keywords");
}
}

View File

@ -2,9 +2,9 @@ package org.ahocorasick.trie;
public class MatchToken extends Token {
private Emit emit;
private final Emit emit;
public MatchToken(String fragment, Emit emit) {
public MatchToken(final String fragment, final Emit emit) {
super(fragment);
this.emit = emit;
}

View File

@ -1,5 +1,7 @@
package org.ahocorasick.trie;
import org.ahocorasick.trie.candidate.EmitCandidateFlushHandler;
import java.util.*;
/**
@ -8,8 +10,8 @@ import java.util.*;
* </p>
*
* <ul>
* <li>success; when a character points to another state, it must return that state</li>
* <li>failure; when a character has no matching state, the algorithm must be able to fall back on a
* <li>success; when a transition points to another state, it must return that state</li>
* <li>failure; when a transition has no matching state, the algorithm must be able to fall back on a
* state with less depth</li>
* <li>emits; when this state is passed and keywords have been matched, the matches must be
* 'emitted' so that they can be used later on.</li>
@ -17,7 +19,7 @@ import java.util.*;
*
* <p>
* The root state is special in the sense that it has no failure state; it cannot fail. If it 'fails'
* it will still parse the next character and start from the root node. This ensures that the algorithm
* it will still parse the next transition and start from the root node. This ensures that the algorithm
* always runs. All other states always have a fail state.
* </p>
*
@ -25,7 +27,7 @@ import java.util.*;
*/
public class State {
/** effective the size of the keyword */
/** effectively the size of the keyword */
private final int depth;
/** only used for the root state to refer to itself in case no matches have been found */
@ -33,15 +35,15 @@ public class State {
/**
* referred to in the white paper as the 'goto' structure. From a state it is possible to go
* to other states, depending on the character passed.
* to other states, depending on the transition passed.
*/
private Map<Character,State> success = new HashMap<Character, State>();
private final Map<Transition,State> success = new HashMap<>();
/** if no matching states are found, the failure state will be returned */
private State failure = null;
/** whenever this state is reached, it will emit the matches keywords for future reference */
private Set<String> emits = null;
private Set<Keyword> emits = null;
public State() {
this(0);
@ -52,27 +54,27 @@ public class State {
this.rootState = depth == 0 ? this : null;
}
private State nextState(Character character, boolean ignoreRootState) {
State nextState = this.success.get(character);
private State nextState(final Transition transition, boolean ignoreRootState) {
State nextState = this.success.get(transition);
if (!ignoreRootState && nextState == null && this.rootState != null) {
nextState = this.rootState;
}
return nextState;
}
public State nextState(Character character) {
return nextState(character, false);
public State nextState(final Transition transition) {
return nextState(transition, false);
}
public State nextStateIgnoreRootState(Character character) {
return nextState(character, true);
public State nextStateIgnoreRootState(final Transition transition) {
return nextState(transition, true);
}
public State addState(Character character) {
State nextState = nextStateIgnoreRootState(character);
public State addState(final Transition transition) {
State nextState = nextStateIgnoreRootState(transition);
if (nextState == null) {
nextState = new State(this.depth+1);
this.success.put(character, nextState);
this.success.put(transition, nextState);
}
return nextState;
}
@ -81,28 +83,39 @@ public class State {
return this.depth;
}
public void addEmit(String keyword) {
public void addEmit(final Keyword keyword) {
if (this.emits == null) {
this.emits = new TreeSet<>();
}
this.emits.add(keyword);
}
public void addEmit(Collection<String> emits) {
for (String emit : emits) {
public void addEmit(final Collection<Keyword> emits) {
for (Keyword emit : emits) {
addEmit(emit);
}
}
public Collection<String> emit() {
return this.emits == null ? Collections.<String> emptyList() : this.emits;
public void addEmitString(final String key) {
addEmit(new Keyword(key, getDepth()));
}
public State failure() {
public Collection<Keyword> emit() {
return this.emits == null ? Collections.<Keyword> emptyList() : this.emits;
}
public State failure(final EmitCandidateFlushHandler emitCandidateFlushHandler) {
if (emitCandidateFlushHandler != null && this.failure.isRootState()) {
emitCandidateFlushHandler.flush();
}
return this.failure;
}
public void setFailure(State failState) {
public State failure() {
return failure(null);
}
public void setFailure(final State failState) {
this.failure = failState;
}
@ -110,8 +123,12 @@ public class State {
return this.success.values();
}
public Collection<Character> getTransitions() {
public Collection<Transition> getTransitions() {
return this.success.keySet();
}
public boolean isRootState() {
return this.depth == 0;
}
}

View File

@ -2,9 +2,9 @@ package org.ahocorasick.trie;
public abstract class Token {
private String fragment;
private final String fragment;
public Token(String fragment) {
public Token(final String fragment) {
this.fragment = fragment;
}

View File

@ -0,0 +1,48 @@
package org.ahocorasick.trie;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
/**
 * Splits a text into match tokens (one per emit) and fragment tokens
 * (the stretches of text between, before and after the emits).
 * Emits are processed in the iteration order of the supplied collection.
 */
public class Tokenizer {

    private final Collection<Emit> emits;

    private final String text;

    /**
     * @param emits matches found in the text; start and end are character
     *              positions in {@code text}, end inclusive
     * @param text the text the emits refer to
     */
    public Tokenizer(final Collection<Emit> emits, final String text) {
        this.emits = emits;
        this.text = text;
    }

    /**
     * Build the token sequence for the text.
     * @return fragment and match tokens in the order they were produced
     */
    public Collection<Token> tokenize() {
        final List<Token> result = new ArrayList<>();
        // Index of the last character already covered by a token; -1 means none yet.
        int coveredUpTo = -1;

        for (final Emit match : emits) {
            final int matchStart = match.getStart();
            final int gap = matchStart - coveredUpTo;
            if (gap > 1) {
                // Uncovered text sits between the previous token and this match.
                result.add(new FragmentToken(text.substring(coveredUpTo + 1, matchStart)));
            }
            result.add(new MatchToken(text.substring(matchStart, match.getEnd() + 1), match));
            coveredUpTo = match.getEnd();
        }

        final int remaining = text.length() - coveredUpTo;
        if (remaining > 1) {
            // Trailing text after the last match (or the whole text when there were no emits).
            result.add(new FragmentToken(text.substring(coveredUpTo + 1, text.length())));
        }
        return result;
    }
}

View File

@ -0,0 +1,69 @@
/*
* Copyright 2015 Rogue Wave Software.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.ahocorasick.trie;
import java.util.Objects;
/**
* Enables the trie to model transitions on whole words or characters
* ... or whatever!
* @author doug.lovell
* @param <T>
*/
public class Transition<T> {
protected final T token;
protected final int start;
protected final int length;
public Transition(final T token, int start, int length) {
this.token = token;
this.start = start;
this.length = length;
}
public int getStart() {
return start;
}
public int getLength() {
return length;
}
@Override
public String toString() {
final int s = getStart();
final int len = getLength();
return "Transition on '" + token + "' start: " + s + ", length: " + len;
}
@Override
public int hashCode() {
return token.hashCode();
}
@Override
public boolean equals(final Object obj) {
if (obj == null) {
return false;
}
if (getClass() != obj.getClass()) {
return false;
}
final Transition<?> other = (Transition<?>) obj;
return Objects.equals(this.token, other.token);
}
}

View File

@ -1,14 +1,16 @@
package org.ahocorasick.trie;
import org.ahocorasick.interval.IntervalTree;
import org.ahocorasick.interval.Intervalable;
import org.ahocorasick.trie.candidate.EmitCandidateFlushHandler;
import org.ahocorasick.trie.candidate.EmitCandidateHolder;
import org.ahocorasick.trie.candidate.NonOverlappingEmitCandidateHolder;
import org.ahocorasick.trie.candidate.OverlappingEmitCandidateHolder;
import org.ahocorasick.trie.handler.DefaultEmitHandler;
import org.ahocorasick.trie.handler.EmitHandler;
import org.ahocorasick.trie.handler.FirstMatchHandler;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Queue;
import java.util.LinkedList;
import java.util.concurrent.LinkedBlockingDeque;
/**
@ -18,173 +20,175 @@ import java.util.concurrent.LinkedBlockingDeque;
*/
public class Trie {
private TrieConfig trieConfig;
private final TrieConfig trieConfig;
private State rootState;
private final State rootState;
private Trie(TrieConfig trieConfig) {
private Trie(final TrieConfig trieConfig) {
this.trieConfig = trieConfig;
this.rootState = new State();
}
/**
 * Base for tokenizers that walk a character sequence from left to right
 * and hand out one transition at a time.
 */
private abstract class KeywordTokenizer {
    /** text being tokenized */
    protected final CharSequence input;
    /** length of the input, captured at construction */
    protected int length;
    /** index of the next character to examine */
    protected int position = 0;

    protected KeywordTokenizer(final CharSequence input) {
        this.input = input;
        this.length = input.length();
    }

    /**
     * @return the character at the current position, or NUL when the
     *         position is at or past the end of the input
     */
    protected char currentChar() {
        if (position >= length) {
            return '\0';
        }
        return input.charAt(position);
    }

    /**
     * @return the next transition; the subclasses visible here return null
     *         once the input is exhausted
     */
    public abstract Transition nextTransition();
}
/**
 * Tokenizer that produces one transition per word.
 */
private class WordTokenizer extends KeywordTokenizer {
    public WordTokenizer(final CharSequence input) {
        super(input);
    }

    /**
     * Skips leading whitespace, then builds a word transition.
     * The first non-whitespace character is always consumed, whatever it is;
     * any letters or digits that directly follow are appended to it.
     * @return the next word transition, or null when only whitespace
     *         (or nothing) remains
     */
    @Override
    public Transition<String> nextTransition() {
        while (position < length && Character.isWhitespace(currentChar())) {
            position++;
        }
        if (position >= length) {
            return null;
        }

        final int wordStart = position;
        // Unconditionally take the first character, then extend over letters/digits.
        position++;
        while (position < length && Character.isLetterOrDigit(currentChar())) {
            position++;
        }
        final String word = input.subSequence(wordStart, position).toString();
        return new WordTransition(word, wordStart);
    }
}
/**
 * Tokenizer that produces one transition per character of the input.
 */
private class CharacterTokenizer extends KeywordTokenizer {
    public CharacterTokenizer(final CharSequence input) {
        super(input);
    }

    /**
     * @return a transition for the character at the current position,
     *         advancing by one; null once the input is exhausted
     */
    @Override
    public Transition<Character> nextTransition() {
        if (position >= length) {
            return null;
        }
        final CharacterTransition next = new CharacterTransition(currentChar(), position);
        position += 1;
        return next;
    }
}
private class TokenStream {
private final KeywordTokenizer tokenizer;
private final StringBuilder input;
public TokenStream(final CharSequence text) {
input = new StringBuilder(text.length());
for (int p = 0; p < text.length(); ++p) {
char ch = text.charAt(p);
input.append(trieConfig.isCaseInsensitive() ?
Character.toLowerCase(ch) : ch);
}
if (trieConfig.isOnlyWholeWords()) {
tokenizer = new WordTokenizer(input);
}
else {
tokenizer = new CharacterTokenizer(input);
}
}
public Transition nextTransition() {
return tokenizer.nextTransition();
}
public String input() {
return input.toString();
}
private void addKeyword(String keyword) {
}
private void addKeyword(final CharSequence keyword) {
if (keyword == null || keyword.length() == 0) {
return;
}
State currentState = this.rootState;
for (Character character : keyword.toCharArray()) {
if (trieConfig.isCaseInsensitive()) {
character = Character.toLowerCase(character);
}
currentState = currentState.addState(character);
TokenStream tokenStream = new TokenStream(keyword);
Transition transition = tokenStream.nextTransition();
while (transition != null) {
currentState = currentState.addState(transition);
transition = tokenStream.nextTransition();
}
currentState.addEmit(trieConfig.isCaseInsensitive() ? keyword.toLowerCase() : keyword);
currentState.addEmitString(tokenStream.input());
}
public Collection<Token> tokenize(String text) {
Collection<Token> tokens = new ArrayList<>();
Collection<Emit> collectedEmits = parseText(text);
int lastCollectedPosition = -1;
for (Emit emit : collectedEmits) {
if (emit.getStart() - lastCollectedPosition > 1) {
tokens.add(createFragment(emit, text, lastCollectedPosition));
}
tokens.add(createMatch(emit, text));
lastCollectedPosition = emit.getEnd();
}
if (text.length() - lastCollectedPosition > 1) {
tokens.add(createFragment(null, text, lastCollectedPosition));
}
return tokens;
}
private Token createFragment(Emit emit, String text, int lastCollectedPosition) {
return new FragmentToken(text.substring(lastCollectedPosition+1, emit == null ? text.length() : emit.getStart()));
}
private Token createMatch(Emit emit, String text) {
return new MatchToken(text.substring(emit.getStart(), emit.getEnd()+1), emit);
return new Tokenizer(parseText(text), text).tokenize();
}
@SuppressWarnings("unchecked")
public Collection<Emit> parseText(CharSequence text) {
public Collection<Emit> parseText(final CharSequence text) {
DefaultEmitHandler emitHandler = new DefaultEmitHandler();
parseText(text, emitHandler);
List<Emit> collectedEmits = emitHandler.getEmits();
if (trieConfig.isOnlyWholeWords()) {
removePartialMatches(text, collectedEmits);
}
if (trieConfig.isOnlyWholeWordsWhiteSpaceSeparated()) {
removePartialMatchesWhiteSpaceSeparated(text, collectedEmits);
}
if (!trieConfig.isAllowOverlaps()) {
IntervalTree intervalTree = new IntervalTree((List<Intervalable>)(List<?>)collectedEmits);
intervalTree.removeOverlaps((List<Intervalable>) (List<?>) collectedEmits);
}
return collectedEmits;
return emitHandler.getEmits();
}
public boolean containsMatch(CharSequence text) {
Emit firstMatch = firstMatch(text);
return firstMatch != null;
}
public boolean containsMatch(final CharSequence text) {
Emit firstMatch = firstMatch(text);
return firstMatch != null;
}
public void parseText(CharSequence text, EmitHandler emitHandler) {
public Emit firstMatch(final CharSequence text) {
FirstMatchHandler emitHandler = new FirstMatchHandler();
parseText(text, emitHandler);
return emitHandler.getFirstMatch();
}
public void parseText(final CharSequence text, final EmitHandler emitHandler) {
final EmitCandidateHolder emitCandidateHolder = this.trieConfig.isAllowOverlaps() ?
new OverlappingEmitCandidateHolder() :
new NonOverlappingEmitCandidateHolder();
final EmitCandidateFlushHandler flushHandler =
new EmitCandidateFlushHandler(emitHandler, emitCandidateHolder);
TokenStream tknz = new TokenStream(text);
LinkedList<Transition> tknHistory = new LinkedList<>();
State currentState = this.rootState;
for (int position = 0; position < text.length(); position++) {
Character character = text.charAt(position);
if (trieConfig.isCaseInsensitive()) {
character = Character.toLowerCase(character);
}
currentState = getState(currentState, character);
if (storeEmits(position, currentState, emitHandler) && trieConfig.isStopOnHit()) {
Transition nextTransition = tknz.nextTransition();
while (nextTransition != null) {
if (flushHandler.stop()) {
return;
}
}
}
public Emit firstMatch(CharSequence text) {
if (!trieConfig.isAllowOverlaps()) {
// Slow path. Needs to find all the matches to detect overlaps.
Collection<Emit> parseText = parseText(text);
if (parseText != null && !parseText.isEmpty()) {
return parseText.iterator().next();
}
} else {
// Fast path. Returns first match found.
State currentState = this.rootState;
for (int position = 0; position < text.length(); position++) {
Character character = text.charAt(position);
if (trieConfig.isCaseInsensitive()) {
character = Character.toLowerCase(character);
}
currentState = getState(currentState, character);
Collection<String> emitStrs = currentState.emit();
if (emitStrs != null && !emitStrs.isEmpty()) {
for (String emitStr : emitStrs) {
final Emit emit = new Emit(position - emitStr.length() + 1, position, emitStr);
if (trieConfig.isOnlyWholeWords()) {
if (!isPartialMatch(text, emit)) {
return emit;
}
} else {
return emit;
}
}
}
}
}
return null;
}
private boolean isPartialMatch(CharSequence searchText, Emit emit) {
return (emit.getStart() != 0 &&
Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) ||
(emit.getEnd() + 1 != searchText.length() &&
Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1)));
}
private void removePartialMatches(CharSequence searchText, List<Emit> collectedEmits) {
List<Emit> removeEmits = new ArrayList<>();
for (Emit emit : collectedEmits) {
if (isPartialMatch(searchText, emit)) {
removeEmits.add(emit);
}
}
for (Emit removeEmit : removeEmits) {
collectedEmits.remove(removeEmit);
}
}
private void removePartialMatchesWhiteSpaceSeparated(CharSequence searchText, List<Emit> collectedEmits) {
long size = searchText.length();
List<Emit> removeEmits = new ArrayList<>();
for (Emit emit : collectedEmits) {
if ((emit.getStart() == 0 || Character.isWhitespace(searchText.charAt(emit.getStart() - 1))) &&
(emit.getEnd() + 1 == size || Character.isWhitespace(searchText.charAt(emit.getEnd() + 1)))) {
continue;
tknHistory.add(nextTransition);
currentState = getState(currentState, nextTransition, flushHandler);
Collection<Keyword> emits = currentState.emit();
int depth = currentState.getDepth();
while (depth < tknHistory.size()) {
tknHistory.remove();
}
removeEmits.add(emit);
}
for (Emit removeEmit : removeEmits) {
collectedEmits.remove(removeEmit);
int position = nextTransition.getStart() + nextTransition.getLength();
for (Keyword emit : emits) {
int start = tknHistory.get(depth - emit.getDepth()).getStart();
emitCandidateHolder.addCandidate(
new Emit(start, position - 1, emit.getText()));
}
nextTransition = tknz.nextTransition();
}
flushHandler.flush();
}
private State getState(State currentState, Character character) {
State newCurrentState = currentState.nextState(character);
private State getState(final State currentState,
final Transition transition,
final EmitCandidateFlushHandler flushHandler) {
State failState = currentState;
State newCurrentState = currentState.nextState(transition);
while (newCurrentState == null) {
currentState = currentState.failure();
newCurrentState = currentState.nextState(character);
failState = failState.failure(flushHandler);
newCurrentState = failState.nextState(transition);
}
return newCurrentState;
}
@ -202,7 +206,7 @@ public class Trie {
while (!queue.isEmpty()) {
State currentState = queue.remove();
for (Character transition : currentState.getTransitions()) {
for (Transition transition : currentState.getTransitions()) {
State targetState = currentState.nextState(transition);
queue.add(targetState);
@ -217,27 +221,17 @@ public class Trie {
}
}
private boolean storeEmits(int position, State currentState, EmitHandler emitHandler) {
boolean emitted = false;
Collection<String> emits = currentState.emit();
if (emits != null && !emits.isEmpty()) {
for (String emit : emits) {
emitHandler.emit(new Emit(position - emit.length() + 1, position, emit));
emitted = true;
}
}
return emitted;
}
public static TrieBuilder builder() {
return new TrieBuilder();
}
public static class TrieBuilder {
private TrieConfig trieConfig = new TrieConfig();
private final TrieConfig trieConfig = new TrieConfig();
private Trie trie = new Trie(trieConfig);
private final Trie trie = new Trie(trieConfig);
private boolean hasAddedKeyword = false;
private TrieBuilder() {}
@ -252,25 +246,20 @@ public class Trie {
}
public TrieBuilder onlyWholeWords() {
if (hasAddedKeyword) {
throw new IllegalStateException(
"Unable to switch to only whole words after keywords added");
}
this.trieConfig.setOnlyWholeWords(true);
return this;
}
public TrieBuilder onlyWholeWordsWhiteSpaceSeparated() {
this.trieConfig.setOnlyWholeWordsWhiteSpaceSeparated(true);
return this;
}
public TrieBuilder addKeyword(String keyword) {
trie.addKeyword(keyword);
hasAddedKeyword = true;
return this;
}
public TrieBuilder stopOnHit() {
trie.trieConfig.setStopOnHit(true);
return this;
}
public Trie build() {
trie.constructFailureStates();
return trie;

View File

@ -6,16 +6,8 @@ public class TrieConfig {
private boolean onlyWholeWords = false;
private boolean onlyWholeWordsWhiteSpaceSeparated = false;
private boolean caseInsensitive = false;
private boolean stopOnHit = false;
public boolean isStopOnHit() { return stopOnHit; }
public void setStopOnHit(boolean stopOnHit) { this.stopOnHit = stopOnHit; }
public boolean isAllowOverlaps() {
return allowOverlaps;
}
@ -32,12 +24,6 @@ public class TrieConfig {
this.onlyWholeWords = onlyWholeWords;
}
public boolean isOnlyWholeWordsWhiteSpaceSeparated() { return onlyWholeWordsWhiteSpaceSeparated; }
public void setOnlyWholeWordsWhiteSpaceSeparated(boolean onlyWholeWordsWhiteSpaceSeparated) {
this.onlyWholeWordsWhiteSpaceSeparated = onlyWholeWordsWhiteSpaceSeparated;
}
public boolean isCaseInsensitive() {
return caseInsensitive;
}

View File

@ -0,0 +1,40 @@
/*
* Copyright 2015 Rogue Wave Software.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.ahocorasick.trie;
/**
* Models a trie transition on a whole word.
* The transition length is the length of the word.
* @author doug.lovell
*/
public class WordTransition extends Transition<String> {
/**
* Create a transition from a position in the source string.
* The length handed to the superclass is {@code word.length()}, so
* {@code word} must not be null.
* @param word to match
* @param start position of first character within the source string
*/
public WordTransition(final String word, int start) {
super(word, start, word.length());
}
/**
* Create a transition without regard for position.
* The start position is recorded as 0.
* @param word to match
*/
public WordTransition(final String word) {
this(word, 0);
}
}

View File

@ -0,0 +1,27 @@
package org.ahocorasick.trie.candidate;
import org.ahocorasick.trie.Emit;
import org.ahocorasick.trie.handler.EmitHandler;
/**
 * Connects a holder of emit candidates to the emit handler that
 * finally receives them.
 */
public class EmitCandidateFlushHandler {

    private final EmitHandler emitHandler;

    private final EmitCandidateHolder emitCandidateHolder;

    /**
     * @param emitHandler receiver of the flushed emits
     * @param emitCandidateHolder source of the candidates to flush
     */
    public EmitCandidateFlushHandler(EmitHandler emitHandler, EmitCandidateHolder emitCandidateHolder) {
        this.emitCandidateHolder = emitCandidateHolder;
        this.emitHandler = emitHandler;
    }

    /**
     * Flushes the candidate holder and passes every returned emit,
     * in order, to the emit handler.
     */
    public void flush() {
        for (final Emit released : emitCandidateHolder.flush()) {
            emitHandler.emit(released);
        }
    }

    /**
     * @return whatever the emit handler answers to {@code stop()}
     */
    public boolean stop() {
        return emitHandler.stop();
    }
}

View File

@ -0,0 +1,12 @@
package org.ahocorasick.trie.candidate;
import org.ahocorasick.trie.Emit;
import java.util.List;
/**
* Collects emit candidates until they are flushed.
*/
public interface EmitCandidateHolder {
/**
* Hand a candidate emit to the holder.
* @param emitCandidate the candidate to hold
*/
void addCandidate(Emit emitCandidate);
/**
* Release the held candidates.
* NOTE(review): OverlappingEmitCandidateHolder also empties itself on flush;
* presumably every implementation is expected to do so - confirm.
* @return the emits released by this flush
*/
List<Emit> flush();
}

View File

@ -0,0 +1,18 @@
package org.ahocorasick.trie.candidate;
import org.ahocorasick.interval.IntervalTree;
import org.ahocorasick.interval.Intervalable;
import org.ahocorasick.trie.Emit;
import java.util.*;
/**
 * Candidate holder that runs the held candidates through an interval tree's
 * overlap removal before flushing them.
 */
public class NonOverlappingEmitCandidateHolder extends OverlappingEmitCandidateHolder {

    /**
     * Removes overlaps among the held candidates, then flushes as the
     * superclass does.
     * @return the list returned by the superclass flush
     */
    @Override
    public List<Emit> flush() {
        // Same list object viewed as intervals; the double cast only changes the static type.
        @SuppressWarnings("unchecked")
        final List<Intervalable> intervals = (List<Intervalable>) (List<?>) emitCandidates;
        // The return value is not used; presumably the list is pruned in place - confirm.
        new IntervalTree(intervals).removeOverlaps(intervals);
        return super.flush();
    }
}

View File

@ -0,0 +1,28 @@
package org.ahocorasick.trie.candidate;
import org.ahocorasick.trie.Emit;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
 * Candidate holder that keeps every candidate it is given, in insertion order.
 */
public class OverlappingEmitCandidateHolder implements EmitCandidateHolder {

    /** candidates collected since the last flush */
    protected List<Emit> emitCandidates = new ArrayList<>();

    /**
     * Appends the candidate to the held list.
     */
    @Override
    public void addCandidate(Emit emitCandidate) {
        emitCandidates.add(emitCandidate);
    }

    /**
     * Hands back the held list and starts over with a fresh, empty one.
     * @return the list of candidates held up to this call
     */
    @Override
    public List<Emit> flush() {
        final List<Emit> collected = this.emitCandidates;
        this.emitCandidates = new ArrayList<>();
        return collected;
    }
}

View File

@ -5,7 +5,7 @@ import org.ahocorasick.trie.Emit;
import java.util.ArrayList;
import java.util.List;
public class DefaultEmitHandler implements EmitHandler {
public class DefaultEmitHandler extends SimpleEmitHandler {
private List<Emit> emits = new ArrayList<>();

View File

@ -3,5 +3,16 @@ package org.ahocorasick.trie.handler;
import org.ahocorasick.trie.Emit;
public interface EmitHandler {
/**
* Callback handler that deals with an emit it gets from the parser
* @param emit the current emit that must be dealt with
*/
void emit(Emit emit);
/**
* Force the parse process to stop
* @return true if the process must stop
*/
boolean stop();
}

View File

@ -0,0 +1,28 @@
package org.ahocorasick.trie.handler;
import org.ahocorasick.trie.Emit;
/**
 * Emit handler that remembers only the first emit it receives and then
 * asks for the parse to stop.
 */
public class FirstMatchHandler extends SimpleEmitHandler {

    private Emit firstMatch;

    private boolean stop = false;

    /**
     * Records the emit if none has been recorded yet; later emits are ignored.
     */
    @Override
    public void emit(Emit emit) {
        if (stop) {
            return;
        }
        firstMatch = emit;
        stop = true;
    }

    /**
     * @return the first emit received, or null when none was received
     */
    public Emit getFirstMatch() {
        return firstMatch;
    }

    /**
     * @return true once an emit has been recorded
     */
    @Override
    public boolean stop() {
        return stop;
    }
}

View File

@ -0,0 +1,14 @@
package org.ahocorasick.trie.handler;
import org.ahocorasick.trie.Emit;
/**
* Base emit handler that never asks for the parse to stop.
* Subclasses supply the handling of each emit.
*/
public abstract class SimpleEmitHandler implements EmitHandler {
/**
* Deal with an emit; left to the subclass.
* @param emit the emit to handle
*/
@Override
public abstract void emit(Emit emit);
/**
* @return always false
*/
@Override
public boolean stop() {
return false;
}
}

View File

@ -1,6 +1,5 @@
package org.ahocorasick.trie;
import org.ahocorasick.trie.State;
import org.junit.Test;
import static junit.framework.Assert.assertEquals;
@ -10,15 +9,36 @@ public class StateTest {
@Test
public void constructSequenceOfCharacters() {
State rootState = new State();
Transition a = new CharacterTransition('a');
Transition b = new CharacterTransition('b');
Transition c = new CharacterTransition('c');
rootState
.addState('a')
.addState('b')
.addState('c');
State currentState = rootState.nextState('a');
.addState(a)
.addState(b)
.addState(c);
State currentState = rootState.nextState(a);
assertEquals(1, currentState.getDepth());
currentState = currentState.nextState('b');
currentState = currentState.nextState(b);
assertEquals(2, currentState.getDepth());
currentState = currentState.nextState('c');
currentState = currentState.nextState(c);
assertEquals(3, currentState.getDepth());
}
/**
 * Chains three word transitions below the root and checks that following
 * each transition in turn lands one level deeper (depths 1, 2 and 3).
 */
@Test
public void constructSequenceOfWords() {
    final State root = new State();
    final Transition[] path = {
            new WordTransition("Alpha"),
            new WordTransition("Bravo"),
            new WordTransition("Charlie")
    };
    root.addState(path[0])
            .addState(path[1])
            .addState(path[2]);

    State cursor = root;
    for (int depth = 1; depth <= path.length; depth++) {
        cursor = cursor.nextState(path[depth - 1]);
        assertEquals(depth, cursor.getDepth());
    }
}

View File

@ -1,6 +1,8 @@
package org.ahocorasick.trie;
import org.ahocorasick.trie.handler.EmitHandler;
import org.ahocorasick.trie.handler.SimpleEmitHandler;
import org.junit.Assert;
import org.junit.Test;
import java.util.ArrayList;
@ -10,6 +12,8 @@ import java.util.List;
import static junit.framework.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import org.junit.Rule;
import org.junit.rules.ExpectedException;
public class TrieTest {
@ -51,6 +55,18 @@ public class TrieTest {
checkEmit(firstMatch, 1, 3, "abc");
}
/**
 * Adding the same keyword twice is expected to yield a single emit for a
 * single occurrence in the text.
 */
@Test
public void sameKeywordTwice() {
    final Trie duplicateKeywordTrie = Trie.builder()
            .addKeyword("abc")
            .addKeyword("abc")
            .build();

    final Collection<Emit> matches = duplicateKeywordTrie.parseText("abc");

    assertEquals(1, matches.size());
    final Iterator<Emit> matchIt = matches.iterator();
    checkEmit(matchIt.next(), 0, 2, "abc");
}
@Test
public void variousKeywordsOneMatch() {
Trie trie = Trie.builder()
@ -64,8 +80,9 @@ public class TrieTest {
}
@Test
public void variousKeywordsFirstMatch() {
public void variousKeywordsFirstMatchWordTransitions() {
Trie trie = Trie.builder()
.onlyWholeWords()
.addKeyword("abc")
.addKeyword("bcd")
.addKeyword("cde")
@ -74,22 +91,6 @@ public class TrieTest {
checkEmit(firstMatch, 0, 2, "bcd");
}
/**
 * With stopOnHit enabled, parsing "ushers" is expected to report only the
 * two emits ending at index 3 ("he", then "she"); "hers", which would end
 * at index 5, is not reported.
 */
@Test
public void ushersTestAndStopOnHit() {
    final Trie stopOnHitTrie = Trie.builder()
            .addKeyword("hers")
            .addKeyword("his")
            .addKeyword("she")
            .addKeyword("he")
            .stopOnHit()
            .build();

    final Collection<Emit> matches = stopOnHitTrie.parseText("ushers");

    // she @ 3, he @ 3, hers @ 5
    assertEquals(2, matches.size());
    final Iterator<Emit> matchIt = matches.iterator();
    checkEmit(matchIt.next(), 2, 3, "he");
    checkEmit(matchIt.next(), 1, 3, "she");
}
@Test
public void ushersTest() {
Trie trie = Trie.builder()
@ -145,7 +146,7 @@ public class TrieTest {
.build();
final List<Emit> emits = new ArrayList<>();
EmitHandler emitHandler = new EmitHandler() {
EmitHandler emitHandler = new SimpleEmitHandler() {
@Override
public void emit(Emit emit) {
@ -195,6 +196,23 @@ public class TrieTest {
checkEmit(iterator.next(), 51, 58, "broccoli");
}
/**
 * Whole-word matching over a recipe line: each ingredient keyword is
 * expected at its position in the text, in order of appearance.
 */
@Test
public void recipesWordTransitions() {
    final Trie ingredientTrie = Trie.builder()
            .onlyWholeWords()
            .addKeyword("veal")
            .addKeyword("cauliflower")
            .addKeyword("broccoli")
            .addKeyword("tomatoes")
            .build();

    final Collection<Emit> matches =
            ingredientTrie.parseText("2 cauliflower 3 tomatoes 4 slices of veal 100g broccoli");

    final Iterator<Emit> matchIt = matches.iterator();
    checkEmit(matchIt.next(), 2, 12, "cauliflower");
    checkEmit(matchIt.next(), 16, 23, "tomatoes");
    checkEmit(matchIt.next(), 37, 40, "veal");
    checkEmit(matchIt.next(), 47, 54, "broccoli");
}
@Test
public void recipesFirstMatch() {
Trie trie = Trie.builder()
@ -225,6 +243,51 @@ public class TrieTest {
checkEmit(iterator.next(), 2, 9, "hehehehe");
}
/**
 * Combines removeOverlaps with onlyWholeWords: overlapping multi-word
 * keywords are expected to collapse to four non-overlapping emits, and
 * the partial words "xwiel" and "dopx" are expected not to match.
 */
@Test
public void nonOverlappingWholeWords() {
    final Trie wholeWordTrie = Trie.builder()
            .removeOverlaps()
            .onlyWholeWords()
            .addKeyword("peper molen")
            .addKeyword("molen wiel")
            .addKeyword("wiel dop")
            .addKeyword("dop")
            .build();

    final Collection<Emit> matches =
            wholeWordTrie.parseText("peper molen wiel dop xwiel dop wiel dopx wiel dop");

    assertEquals(4, matches.size());
    final Iterator<Emit> matchIt = matches.iterator();
    checkEmit(matchIt.next(), 0, 10, "peper molen");
    checkEmit(matchIt.next(), 12, 19, "wiel dop");
    checkEmit(matchIt.next(), 27, 29, "dop");
    checkEmit(matchIt.next(), 41, 48, "wiel dop");
}
/**
 * Same scenario as the non-overlapping whole-word parse, but the emits
 * are delivered through a caller-supplied handler that collects them
 * into a list.
 */
@Test
public void nonOverlappingWholeWordsWithCustomEmitHandler() {
    final Trie wholeWordTrie = Trie.builder()
            .removeOverlaps()
            .onlyWholeWords()
            .addKeyword("peper molen")
            .addKeyword("molen wiel")
            .addKeyword("wiel dop")
            .addKeyword("dop")
            .build();

    final List<Emit> collected = new ArrayList<>();
    EmitHandler collectingHandler = new SimpleEmitHandler() {
        @Override
        public void emit(Emit emit) {
            collected.add(emit);
        }
    };

    wholeWordTrie.parseText("peper molen wiel dop xwiel dop wiel dopx wiel dop", collectingHandler);

    assertEquals(4, collected.size());
    final Iterator<Emit> matchIt = collected.iterator();
    checkEmit(matchIt.next(), 0, 10, "peper molen");
    checkEmit(matchIt.next(), 12, 19, "wiel dop");
    checkEmit(matchIt.next(), 27, 29, "dop");
    checkEmit(matchIt.next(), 41, 48, "wiel dop");
}
@Test
public void nonOverlapping() {
Trie trie = Trie.builder().removeOverlaps()
@ -302,6 +365,37 @@ public class TrieTest {
checkEmit(firstMatch, 20, 24, "sugar");
}
/**
 * Tokenizes text where the keyword appears alone and doubled up
 * ("AlphaAlpha"); six tokens are expected, alternating keyword matches
 * with the single-space separators.
 */
@Test
public void tokenizeAndReportOnWholeWords() {
    final Trie alphaTrie = Trie.builder()
            .addKeyword("Alpha")
            .build();

    final Collection<Token> fragments = alphaTrie.tokenize("Alpha AlphaAlpha Alpha");

    assertEquals(6, fragments.size());
    final Iterator<Token> fragmentIt = fragments.iterator();
    assertToken(fragmentIt.next(), "Alpha", true, true, false);
    assertToken(fragmentIt.next(), " ", false, false, true);
    assertToken(fragmentIt.next(), "Alpha", true, false, false);
    assertToken(fragmentIt.next(), "Alpha", true, false, false);
    assertToken(fragmentIt.next(), " ", false, false, true);
    assertToken(fragmentIt.next(), "Alpha", true, true, false);
}
/**
 * Tokenizes text whose gaps contain spaces, tabs and a newline; five
 * tokens are expected, with the text between matches kept as one token.
 */
@Test
public void whiteSpaceTokens() {
    final Trie alphaTrie = Trie.builder()
            .addKeyword("Alpha")
            .build();

    final Collection<Token> fragments = alphaTrie.tokenize("Alpha \tthe\t Alpha\n Alpha");

    assertEquals(5, fragments.size());
    final Iterator<Token> fragmentIt = fragments.iterator();
    assertToken(fragmentIt.next(), "Alpha", true, true, false);
    assertToken(fragmentIt.next(), " \tthe\t ", false, false, false);
    assertToken(fragmentIt.next(), "Alpha", true, true, false);
    assertToken(fragmentIt.next(), "\n ", false, false, true);
    assertToken(fragmentIt.next(), "Alpha", true, true, false);
}
@Test
public void tokenizeFullSentence() {
Trie trie = Trie.builder()
@ -312,13 +406,48 @@ public class TrieTest {
Collection<Token> tokens = trie.tokenize("Hear: Alpha team first, Beta from the rear, Gamma in reserve");
assertEquals(7, tokens.size());
Iterator<Token> tokensIt = tokens.iterator();
assertEquals("Hear: ", tokensIt.next().getFragment());
assertEquals("Alpha", tokensIt.next().getFragment());
assertEquals(" team first, ", tokensIt.next().getFragment());
assertEquals("Beta", tokensIt.next().getFragment());
assertEquals(" from the rear, ", tokensIt.next().getFragment());
assertEquals("Gamma", tokensIt.next().getFragment());
assertEquals(" in reserve", tokensIt.next().getFragment());
assertToken(tokensIt.next(), "Hear: ", false, false, false);
assertToken(tokensIt.next(), "Alpha", true, true, false);
assertToken(tokensIt.next(), " team first, ", false, false, false);
assertToken(tokensIt.next(), "Beta", true, true, false);
assertToken(tokensIt.next(), " from the rear, ", false, false, false);
assertToken(tokensIt.next(), "Gamma", true, true, false);
assertToken(tokensIt.next(), " in reserve", false, false, false);
}
/**
 * Tokenizes a full sentence in whole-word mode; seven tokens are
 * expected, alternating unmatched text with the three keyword matches.
 */
@Test
public void tokenizeFullSentenceByWords() {
    final Trie greekTrie = Trie.builder()
            .onlyWholeWords()
            .addKeyword("Alpha")
            .addKeyword("Beta")
            .addKeyword("Gamma")
            .build();

    final Collection<Token> fragments =
            greekTrie.tokenize("Hear: Alpha team first, Beta from the rear, Gamma in reserve");

    assertEquals(7, fragments.size());
    final Iterator<Token> fragmentIt = fragments.iterator();
    assertToken(fragmentIt.next(), "Hear: ", false, false, false);
    assertToken(fragmentIt.next(), "Alpha", true, true, false);
    assertToken(fragmentIt.next(), " team first, ", false, false, false);
    assertToken(fragmentIt.next(), "Beta", true, true, false);
    assertToken(fragmentIt.next(), " from the rear, ", false, false, false);
    assertToken(fragmentIt.next(), "Gamma", true, true, false);
    assertToken(fragmentIt.next(), " in reserve", false, false, false);
}
/** JUnit rule used to declare the exception a test expects to be thrown. */
@Rule
public ExpectedException thrown = ExpectedException.none();

/**
 * Switching the builder to whole-word mode after a keyword has already
 * been added is expected to fail with an IllegalStateException carrying
 * the message asserted below.
 */
@Test
public void onlyWholeWordsThrowsExceptionAfterKeywordsAdded()
        throws IllegalStateException {
    // Expectations must be registered before the offending call runs.
    thrown.expect(IllegalStateException.class);
    thrown.expectMessage("Unable to switch to only whole words after keywords added");

    Trie.builder()
            .addKeyword("Happy for now")
            .onlyWholeWords()
            .addKeyword("Not so happy")
            .build();
}
@Test
@ -415,10 +544,26 @@ public class TrieTest {
checkEmit(firstMatch, 5, 8, "this");
}
/**
 * Whole-word matching with a non-ASCII character in both keyword and
 * text; the emit positions are expected to line up with the text's
 * character indexes.
 */
@Test
public void unicodeInKeyword() {
    // The upper case character ('İ') is Unicode,
    // which was read by AC as a 2-byte char
    final String sentence = "it is so much LİKE Unicode to mess with Java";
    final Trie unicodeTrie = Trie.builder()
            .onlyWholeWords()
            .addKeyword("so much LİKE Unicode")
            .addKeyword("it is")
            .build();

    final Collection<Emit> matches = unicodeTrie.parseText(sentence);

    final Iterator<Emit> matchIt = matches.iterator();
    checkEmit(matchIt.next(), 0, 4, "it is");
    checkEmit(matchIt.next(), 6, 25, "so much LİKE Unicode");
}
@Test
public void partialMatchWhiteSpaces() {
Trie trie = Trie.builder()
.onlyWholeWordsWhiteSpaceSeparated()
.onlyWholeWords()
.addKeyword("#sugar-123")
.build();
Collection < Emit > emits = trie.parseText("#sugar-123 #sugar-1234"); // left, middle, right test
@ -426,6 +571,63 @@ public class TrieTest {
checkEmit(emits.iterator().next(), 0, 9, "#sugar-123");
}
/**
 * For onlyWholeWords, leading and trailing white space included on a
 * keyword is ignored when matching: a keyword padded with spaces is
 * expected to match the unpadded phrase at the start of the text, and
 * the emit is expected to report the keyword as it was added.
 */
@Test
public void spacesAroundKeywordByWords() {
    final String phrase = "lorem ipso facto genera linden pharma six 1";
    final String paddedKeyword = " " + phrase + " ";
    final Trie paddedTrie = Trie.builder()
            .onlyWholeWords()
            .caseInsensitive()
            .addKeyword(paddedKeyword)
            .build();

    final Collection<Emit> matches = paddedTrie.parseText(
            phrase + " under addressed object ");

    assertEquals(1, matches.size());
    checkEmit(matches.iterator().next(), 0, phrase.length() - 1, paddedKeyword);
}
/**
 * Whole-word matching when the keyword is directly followed by
 * punctuation in the text ('?', '!', ',', '.', and an apostrophe); all
 * five occurrences are expected to be reported.
 */
@Test
public void punctuationInText() {
    final Trie pieTrie = Trie.builder()
            .onlyWholeWords()
            .addKeyword("pie")
            .build();

    final Collection<Emit> matches =
            pieTrie.parseText("Want some pie? Gimme pie! pie, pie. The pie's revenge.");

    Assert.assertEquals(5, matches.size());
    final Iterator<Emit> matchIt = matches.iterator();
    checkEmit(matchIt.next(), 10, 12, "pie");
    checkEmit(matchIt.next(), 21, 23, "pie");
    checkEmit(matchIt.next(), 26, 28, "pie");
    checkEmit(matchIt.next(), 31, 33, "pie");
    checkEmit(matchIt.next(), 40, 42, "pie");
}
/**
 * Whole-word matching when the keywords themselves contain punctuation;
 * only the two punctuated occurrences are expected to match, not the
 * bare "Oi" or "Dr Feelgood" later in the text.
 */
@Test
public void punctuationInSearchTerm() {
    final Trie punctuatedTrie = Trie.builder()
            .onlyWholeWords()
            .addKeyword("Dr. Feelgood")
            .addKeyword("Oi!")
            .build();

    final Collection<Emit> matches = punctuatedTrie.parseText(
            "The Oi! music genre is inspired by Dr. Feelgood and other bands. Oi or Dr Feelgood should not match.");

    Assert.assertEquals(2, matches.size());
}
/**
 * Asserts that a token carries the expected fragment text and match flag.
 *
 * NOTE(review): wholeWord and whiteSpace are accepted but never checked in
 * this body, so the values callers pass for them are currently unverified.
 *
 * @param token the token under test
 * @param fragment expected value of token.getFragment()
 * @param match expected value of token.isMatch()
 * @param wholeWord expected whole-word flag (not asserted here)
 * @param whiteSpace expected white-space flag (not asserted here)
 */
private void assertToken(Token token, String fragment, boolean match, boolean wholeWord, boolean whiteSpace) {
assertEquals(fragment, token.getFragment());
assertEquals(match, token.isMatch());
}
private void checkEmit(Emit next, int expectedStart, int expectedEnd, String expectedKeyword) {
assertEquals("Start of emit should have been " + expectedStart, expectedStart, next.getStart());
assertEquals("End of emit should have been " + expectedEnd, expectedEnd, next.getEnd());

View File

@ -0,0 +1,21 @@
package org.ahocorasick.trie.candidate;
import org.ahocorasick.trie.Emit;
import org.ahocorasick.trie.handler.EmitHandler;
import org.ahocorasick.trie.handler.FirstMatchHandler;
import org.junit.Test;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
/**
 * Tests for {@link EmitCandidateFlushHandler}.
 */
public class EmitCandidateFlushHandlerTest {

    /**
     * The flush handler's stop() is expected to be false while the wrapped
     * handler has seen no emit, and true once the wrapped first-match
     * handler has received one.
     */
    @Test
    public void stop() {
        EmitHandler wrapped = new FirstMatchHandler();
        EmitCandidateFlushHandler underTest = new EmitCandidateFlushHandler(wrapped, null);

        assertFalse(underTest.stop());

        wrapped.emit(new Emit(0, 2, "bla"));

        assertTrue(underTest.stop());
    }
}

View File

@ -0,0 +1,35 @@
package org.ahocorasick.trie.candidate;
import org.ahocorasick.trie.Emit;
import org.junit.Test;
import java.util.Collection;
import java.util.List;
import static org.junit.Assert.assertEquals;
/**
 * Tests for {@link NonOverlappingEmitCandidateHolder}.
 */
public class NonOverlappingEmitCandidateHolderTest {

    /**
     * Of two overlapping candidates, only the longer one ("she") is
     * expected to survive the flush.
     */
    @Test
    public void retainLongestEmit() {
        EmitCandidateHolder underTest = new NonOverlappingEmitCandidateHolder();
        underTest.addCandidate(new Emit(0, 2, "she"));
        underTest.addCandidate(new Emit(1, 2, "he"));

        List<Emit> survivors = underTest.flush();

        assertEquals(1, survivors.size());
        assertEquals("she", survivors.get(0).getKeyword());
    }

    /**
     * With a chain of three candidates where the middle one overlaps both
     * neighbours, the first and last are expected to be kept.
     */
    @Test
    public void multipleOverlaps() {
        EmitCandidateHolder underTest = new NonOverlappingEmitCandidateHolder();
        underTest.addCandidate(new Emit(0, 4, "ababc"));
        underTest.addCandidate(new Emit(4, 6, "cba"));
        underTest.addCandidate(new Emit(6, 7, "ab"));

        List<Emit> survivors = underTest.flush();

        assertEquals(2, survivors.size());
        assertEquals("ababc", survivors.get(0).getKeyword());
        assertEquals("ab", survivors.get(1).getKeyword());
    }
}

View File

@ -0,0 +1,22 @@
package org.ahocorasick.trie.candidate;
import org.ahocorasick.trie.Emit;
import org.junit.Test;
import java.util.List;
import static org.junit.Assert.assertEquals;
/**
 * Tests for {@link OverlappingEmitCandidateHolder}.
 */
public class OverlappingEmitCandidateHolderTest {

    /**
     * Both candidates, which share index 2, are expected to come back from
     * flush in the order they were added.
     */
    @Test
    public void addAndFlush() {
        EmitCandidateHolder underTest = new OverlappingEmitCandidateHolder();
        underTest.addCandidate(new Emit(0, 2, "ABC"));
        underTest.addCandidate(new Emit(2, 4, "CDE"));

        List<Emit> flushed = underTest.flush();

        assertEquals(2, flushed.size());
        assertEquals("ABC", flushed.get(0).getKeyword());
        assertEquals("CDE", flushed.get(1).getKeyword());
    }
}