Merge pull request #60 from wbreeze/heartbeat
Implement word transitions
This commit is contained in:
commit
83dfcbb82e
4
pom.xml
4
pom.xml
@ -3,7 +3,7 @@
|
||||
|
||||
<groupId>org.ahocorasick</groupId>
|
||||
<artifactId>ahocorasick</artifactId>
|
||||
<version>0.3.1-SNAPSHOT</version>
|
||||
<version>0.3.1-heartbeat</version>
|
||||
<packaging>jar</packaging>
|
||||
<name>Aho-CoraSick algorithm for efficient string matching</name>
|
||||
<description>Java library for efficient string matching against a large set of keywords</description>
|
||||
@ -104,4 +104,4 @@
|
||||
</plugins>
|
||||
</reporting>
|
||||
|
||||
</project>
|
||||
</project>
|
||||
|
||||
41
src/main/java/org/ahocorasick/trie/CharacterTransition.java
Normal file
41
src/main/java/org/ahocorasick/trie/CharacterTransition.java
Normal file
@ -0,0 +1,41 @@
|
||||
/*
|
||||
* Copyright 2015 Rogue Wave Software.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.ahocorasick.trie;
|
||||
|
||||
/**
|
||||
* Model transitions on characters
|
||||
* @author doug.lovell
|
||||
*/
|
||||
class CharacterTransition extends Transition<Character> {
|
||||
|
||||
/**
|
||||
* Create a character transition from a position in the source string
|
||||
* @param c character to match
|
||||
* @param start positon of character in source string
|
||||
*/
|
||||
public CharacterTransition(Character c, int start) {
|
||||
super(c, start, 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a character transition without regard for position
|
||||
* @param c character to match
|
||||
*/
|
||||
public CharacterTransition(Character c) {
|
||||
this(c, 0);
|
||||
}
|
||||
|
||||
}
|
||||
@ -11,11 +11,11 @@ public class Emit extends Interval implements Intervalable {
|
||||
super(start, end);
|
||||
this.keyword = keyword;
|
||||
}
|
||||
|
||||
|
||||
public String getKeyword() {
|
||||
return this.keyword;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return super.toString() + "=" + this.keyword;
|
||||
|
||||
@ -2,16 +2,8 @@ package org.ahocorasick.trie;
|
||||
|
||||
public class FragmentToken extends Token {
|
||||
|
||||
private boolean whiteSpace;
|
||||
|
||||
public FragmentToken(String fragment) {
|
||||
public FragmentToken(final String fragment) {
|
||||
super(fragment);
|
||||
this.whiteSpace = true;
|
||||
for (int position = 0; position < fragment.length(); position++) {
|
||||
if (!Character.isWhitespace(fragment.charAt(position))) {
|
||||
whiteSpace = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -24,9 +16,4 @@ public class FragmentToken extends Token {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isWhiteSpace() {
|
||||
return whiteSpace;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
60
src/main/java/org/ahocorasick/trie/Keyword.java
Normal file
60
src/main/java/org/ahocorasick/trie/Keyword.java
Normal file
@ -0,0 +1,60 @@
|
||||
/*
|
||||
* Copyright 2015 Rogue Wave Software.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.ahocorasick.trie;
|
||||
|
||||
/**
|
||||
* Keyword encapsulates part of a potential match along with the count
|
||||
* of prior source tokens consumed to create the potential match.
|
||||
*
|
||||
* @author doug.lovell
|
||||
*/
|
||||
public class Keyword implements Comparable {
|
||||
private final String text;
|
||||
private final int depth;
|
||||
|
||||
/**
|
||||
* Create portion of potential match
|
||||
* @param text content that matches
|
||||
* @param depth count of prior source tokens that comprise the match
|
||||
*/
|
||||
public Keyword(final String text, final int depth) {
|
||||
this.text = text;
|
||||
this.depth = depth;
|
||||
}
|
||||
|
||||
public int getDepth() {
|
||||
return depth;
|
||||
}
|
||||
|
||||
public String getText() {
|
||||
return text;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
final String t = getText();
|
||||
final int d = getDepth();
|
||||
|
||||
return "Keyword '" + t + "' at depth " + d;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(final Object o) {
|
||||
if (o instanceof Keyword) {
|
||||
return text.compareTo(((Keyword) o).text);
|
||||
}
|
||||
throw new IllegalArgumentException("Only supports comparison with other keywords");
|
||||
}
|
||||
}
|
||||
@ -2,19 +2,11 @@ package org.ahocorasick.trie;
|
||||
|
||||
public class MatchToken extends Token {
|
||||
|
||||
private final boolean wholeWord;
|
||||
|
||||
private final Emit emit;
|
||||
|
||||
public MatchToken(String fragment, Emit emit, boolean wholeWord) {
|
||||
public MatchToken(final String fragment, final Emit emit) {
|
||||
super(fragment);
|
||||
this.emit = emit;
|
||||
this.wholeWord = wholeWord;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isWholeWord() {
|
||||
return wholeWord;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
||||
@ -10,8 +10,8 @@ import java.util.*;
|
||||
* </p>
|
||||
*
|
||||
* <ul>
|
||||
* <li>success; when a character points to another state, it must return that state</li>
|
||||
* <li>failure; when a character has no matching state, the algorithm must be able to fall back on a
|
||||
* <li>success; when a transition points to another state, it must return that state</li>
|
||||
* <li>failure; when a transition has no matching state, the algorithm must be able to fall back on a
|
||||
* state with less depth</li>
|
||||
* <li>emits; when this state is passed and keywords have been matched, the matches must be
|
||||
* 'emitted' so that they can be used later on.</li>
|
||||
@ -19,7 +19,7 @@ import java.util.*;
|
||||
*
|
||||
* <p>
|
||||
* The root state is special in the sense that it has no failure state; it cannot fail. If it 'fails'
|
||||
* it will still parse the next character and start from the root node. This ensures that the algorithm
|
||||
* it will still parse the next transition and start from the root node. This ensures that the algorithm
|
||||
* always runs. All other states always have a fail state.
|
||||
* </p>
|
||||
*
|
||||
@ -27,7 +27,7 @@ import java.util.*;
|
||||
*/
|
||||
public class State {
|
||||
|
||||
/** effective the size of the keyword */
|
||||
/** effectively the size of the keyword */
|
||||
private final int depth;
|
||||
|
||||
/** only used for the root state to refer to itself in case no matches have been found */
|
||||
@ -35,15 +35,15 @@ public class State {
|
||||
|
||||
/**
|
||||
* referred to in the white paper as the 'goto' structure. From a state it is possible to go
|
||||
* to other states, depending on the character passed.
|
||||
* to other states, depending on the transition passed.
|
||||
*/
|
||||
private Map<Character,State> success = new HashMap<Character, State>();
|
||||
private final Map<Transition,State> success = new HashMap<>();
|
||||
|
||||
/** if no matching states are found, the failure state will be returned */
|
||||
private State failure = null;
|
||||
|
||||
/** whenever this state is reached, it will emit the matched keywords for future reference */
|
||||
private Set<String> emits = null;
|
||||
private Set<Keyword> emits = null;
|
||||
|
||||
public State() {
|
||||
this(0);
|
||||
@ -54,27 +54,27 @@ public class State {
|
||||
this.rootState = depth == 0 ? this : null;
|
||||
}
|
||||
|
||||
private State nextState(Character character, boolean ignoreRootState) {
|
||||
State nextState = this.success.get(character);
|
||||
private State nextState(final Transition transition, boolean ignoreRootState) {
|
||||
State nextState = this.success.get(transition);
|
||||
if (!ignoreRootState && nextState == null && this.rootState != null) {
|
||||
nextState = this.rootState;
|
||||
}
|
||||
return nextState;
|
||||
}
|
||||
|
||||
public State nextState(Character character) {
|
||||
return nextState(character, false);
|
||||
public State nextState(final Transition transition) {
|
||||
return nextState(transition, false);
|
||||
}
|
||||
|
||||
public State nextStateIgnoreRootState(Character character) {
|
||||
return nextState(character, true);
|
||||
public State nextStateIgnoreRootState(final Transition transition) {
|
||||
return nextState(transition, true);
|
||||
}
|
||||
|
||||
public State addState(Character character) {
|
||||
State nextState = nextStateIgnoreRootState(character);
|
||||
public State addState(final Transition transition) {
|
||||
State nextState = nextStateIgnoreRootState(transition);
|
||||
if (nextState == null) {
|
||||
nextState = new State(this.depth+1);
|
||||
this.success.put(character, nextState);
|
||||
this.success.put(transition, nextState);
|
||||
}
|
||||
return nextState;
|
||||
}
|
||||
@ -83,24 +83,28 @@ public class State {
|
||||
return this.depth;
|
||||
}
|
||||
|
||||
public void addEmit(String keyword) {
|
||||
public void addEmit(final Keyword keyword) {
|
||||
if (this.emits == null) {
|
||||
this.emits = new TreeSet<>();
|
||||
}
|
||||
this.emits.add(keyword);
|
||||
}
|
||||
|
||||
public void addEmit(Collection<String> emits) {
|
||||
for (String emit : emits) {
|
||||
public void addEmit(final Collection<Keyword> emits) {
|
||||
for (Keyword emit : emits) {
|
||||
addEmit(emit);
|
||||
}
|
||||
}
|
||||
|
||||
public Collection<String> emit() {
|
||||
return this.emits == null ? Collections.<String> emptyList() : this.emits;
|
||||
|
||||
public void addEmitString(final String key) {
|
||||
addEmit(new Keyword(key, getDepth()));
|
||||
}
|
||||
|
||||
public State failure(EmitCandidateFlushHandler emitCandidateFlushHandler) {
|
||||
public Collection<Keyword> emit() {
|
||||
return this.emits == null ? Collections.<Keyword> emptyList() : this.emits;
|
||||
}
|
||||
|
||||
public State failure(final EmitCandidateFlushHandler emitCandidateFlushHandler) {
|
||||
if (emitCandidateFlushHandler != null && this.failure.isRootState()) {
|
||||
emitCandidateFlushHandler.flush();
|
||||
}
|
||||
@ -111,7 +115,7 @@ public class State {
|
||||
return failure(null);
|
||||
}
|
||||
|
||||
public void setFailure(State failState) {
|
||||
public void setFailure(final State failState) {
|
||||
this.failure = failState;
|
||||
}
|
||||
|
||||
@ -119,7 +123,7 @@ public class State {
|
||||
return this.success.values();
|
||||
}
|
||||
|
||||
public Collection<Character> getTransitions() {
|
||||
public Collection<Transition> getTransitions() {
|
||||
return this.success.keySet();
|
||||
}
|
||||
|
||||
|
||||
@ -2,9 +2,9 @@ package org.ahocorasick.trie;
|
||||
|
||||
public abstract class Token {
|
||||
|
||||
private String fragment;
|
||||
private final String fragment;
|
||||
|
||||
public Token(String fragment) {
|
||||
public Token(final String fragment) {
|
||||
this.fragment = fragment;
|
||||
}
|
||||
|
||||
@ -14,14 +14,6 @@ public abstract class Token {
|
||||
|
||||
public abstract boolean isMatch();
|
||||
|
||||
public boolean isWholeWord() {
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean isWhiteSpace() {
|
||||
return false;
|
||||
}
|
||||
|
||||
public abstract Emit getEmit();
|
||||
|
||||
}
|
||||
|
||||
@ -9,8 +9,8 @@ public class Tokenizer {
|
||||
private final Collection<Emit> emits;
|
||||
|
||||
private final String text;
|
||||
|
||||
public Tokenizer(Collection<Emit> emits, String text) {
|
||||
|
||||
public Tokenizer(final Collection<Emit> emits, final String text) {
|
||||
this.emits = emits;
|
||||
this.text = text;
|
||||
}
|
||||
@ -34,14 +34,15 @@ public class Tokenizer {
|
||||
}
|
||||
|
||||
private Token createFragment(Emit emit, String text, int lastCollectedPosition) {
|
||||
return new FragmentToken(text.substring(lastCollectedPosition+1, emit == null ? text.length() : emit.getStart()));
|
||||
return new FragmentToken(text.substring(
|
||||
lastCollectedPosition+1,
|
||||
emit == null ? text.length() : emit.getStart()));
|
||||
}
|
||||
|
||||
private Token createMatch(Emit emit, String text) {
|
||||
return new MatchToken(
|
||||
text.substring(emit.getStart(), emit.getEnd()+1),
|
||||
emit,
|
||||
Trie.isWholeWord(this.text, emit.getStart(), emit.getEnd()));
|
||||
emit);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
69
src/main/java/org/ahocorasick/trie/Transition.java
Normal file
69
src/main/java/org/ahocorasick/trie/Transition.java
Normal file
@ -0,0 +1,69 @@
|
||||
/*
|
||||
* Copyright 2015 Rogue Wave Software.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.ahocorasick.trie;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
/**
|
||||
* Enables the trie to model transitions on whole words or characters
|
||||
* ... or whatever!
|
||||
* @author doug.lovell
|
||||
* @param <T>
|
||||
*/
|
||||
public class Transition<T> {
|
||||
protected final T token;
|
||||
protected final int start;
|
||||
protected final int length;
|
||||
|
||||
public Transition(final T token, int start, int length) {
|
||||
this.token = token;
|
||||
this.start = start;
|
||||
this.length = length;
|
||||
}
|
||||
|
||||
public int getStart() {
|
||||
return start;
|
||||
}
|
||||
|
||||
public int getLength() {
|
||||
return length;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
final int s = getStart();
|
||||
final int len = getLength();
|
||||
|
||||
return "Transition on '" + token + "' start: " + s + ", length: " + len;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return token.hashCode();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(final Object obj) {
|
||||
if (obj == null) {
|
||||
return false;
|
||||
}
|
||||
if (getClass() != obj.getClass()) {
|
||||
return false;
|
||||
}
|
||||
final Transition<?> other = (Transition<?>) obj;
|
||||
return Objects.equals(this.token, other.token);
|
||||
}
|
||||
}
|
||||
@ -10,6 +10,7 @@ import org.ahocorasick.trie.handler.FirstMatchHandler;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.Queue;
|
||||
import java.util.LinkedList;
|
||||
import java.util.concurrent.LinkedBlockingDeque;
|
||||
|
||||
/**
|
||||
@ -19,27 +20,107 @@ import java.util.concurrent.LinkedBlockingDeque;
|
||||
*/
|
||||
public class Trie {
|
||||
|
||||
private TrieConfig trieConfig;
|
||||
private final TrieConfig trieConfig;
|
||||
|
||||
private State rootState;
|
||||
private final State rootState;
|
||||
|
||||
private Trie(TrieConfig trieConfig) {
|
||||
private Trie(final TrieConfig trieConfig) {
|
||||
this.trieConfig = trieConfig;
|
||||
this.rootState = new State();
|
||||
}
|
||||
|
||||
private abstract class KeywordTokenizer {
|
||||
protected int position = 0;
|
||||
protected final CharSequence input;
|
||||
protected int length;
|
||||
protected KeywordTokenizer(final CharSequence input) {
|
||||
this.input = input;
|
||||
this.length = input.length();
|
||||
}
|
||||
protected char currentChar() {
|
||||
return (position < length) ? input.charAt(position) : '\0';
|
||||
}
|
||||
public abstract Transition nextTransition();
|
||||
}
|
||||
|
||||
private class WordTokenizer extends KeywordTokenizer {
|
||||
public WordTokenizer(final CharSequence input) {
|
||||
super(input);
|
||||
}
|
||||
@Override
|
||||
public Transition<String> nextTransition() {
|
||||
WordTransition t = null;
|
||||
while (position < length && Character.isWhitespace(currentChar())) {
|
||||
++position;
|
||||
}
|
||||
int start = position;
|
||||
if (start < length) {
|
||||
do {
|
||||
++position;
|
||||
} while (position < length && Character.isLetterOrDigit(currentChar()));
|
||||
String word = input.subSequence(start, position).toString();
|
||||
t = new WordTransition(word, start);
|
||||
}
|
||||
return t;
|
||||
}
|
||||
}
|
||||
|
||||
private class CharacterTokenizer extends KeywordTokenizer {
|
||||
public CharacterTokenizer(final CharSequence input) {
|
||||
super(input);
|
||||
}
|
||||
@Override
|
||||
public Transition<Character> nextTransition() {
|
||||
CharacterTransition t = null;
|
||||
if (position < length) {
|
||||
t = new CharacterTransition(currentChar(), position);
|
||||
position += 1;
|
||||
}
|
||||
return t;
|
||||
}
|
||||
}
|
||||
|
||||
private class TokenStream {
|
||||
private final KeywordTokenizer tokenizer;
|
||||
private final StringBuilder input;
|
||||
|
||||
public TokenStream(final CharSequence text) {
|
||||
input = new StringBuilder(text.length());
|
||||
for (int p = 0; p < text.length(); ++p) {
|
||||
char ch = text.charAt(p);
|
||||
input.append(trieConfig.isCaseInsensitive() ?
|
||||
Character.toLowerCase(ch) : ch);
|
||||
}
|
||||
if (trieConfig.isOnlyWholeWords()) {
|
||||
tokenizer = new WordTokenizer(input);
|
||||
}
|
||||
else {
|
||||
tokenizer = new CharacterTokenizer(input);
|
||||
}
|
||||
}
|
||||
|
||||
public Transition nextTransition() {
|
||||
return tokenizer.nextTransition();
|
||||
}
|
||||
|
||||
public String input() {
|
||||
return input.toString();
|
||||
}
|
||||
|
||||
private void addKeyword(String keyword) {
|
||||
}
|
||||
|
||||
private void addKeyword(final CharSequence keyword) {
|
||||
if (keyword == null || keyword.length() == 0) {
|
||||
return;
|
||||
}
|
||||
State currentState = this.rootState;
|
||||
for (Character character : keyword.toCharArray()) {
|
||||
if (trieConfig.isCaseInsensitive()) {
|
||||
character = Character.toLowerCase(character);
|
||||
}
|
||||
currentState = currentState.addState(character);
|
||||
TokenStream tokenStream = new TokenStream(keyword);
|
||||
Transition transition = tokenStream.nextTransition();
|
||||
while (transition != null) {
|
||||
currentState = currentState.addState(transition);
|
||||
transition = tokenStream.nextTransition();
|
||||
}
|
||||
currentState.addEmit(trieConfig.isCaseInsensitive() ? keyword.toLowerCase() : keyword);
|
||||
currentState.addEmitString(tokenStream.input());
|
||||
}
|
||||
|
||||
public Collection<Token> tokenize(String text) {
|
||||
@ -47,66 +128,67 @@ public class Trie {
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public Collection<Emit> parseText(CharSequence text) {
|
||||
public Collection<Emit> parseText(final CharSequence text) {
|
||||
DefaultEmitHandler emitHandler = new DefaultEmitHandler();
|
||||
parseText(text, emitHandler);
|
||||
return emitHandler.getEmits();
|
||||
}
|
||||
|
||||
public boolean containsMatch(CharSequence text) {
|
||||
Emit firstMatch = firstMatch(text);
|
||||
return firstMatch != null;
|
||||
}
|
||||
public boolean containsMatch(final CharSequence text) {
|
||||
Emit firstMatch = firstMatch(text);
|
||||
return firstMatch != null;
|
||||
}
|
||||
|
||||
public Emit firstMatch(CharSequence text) {
|
||||
public Emit firstMatch(final CharSequence text) {
|
||||
FirstMatchHandler emitHandler = new FirstMatchHandler();
|
||||
parseText(text, emitHandler);
|
||||
return emitHandler.getFirstMatch();
|
||||
}
|
||||
|
||||
public void parseText(CharSequence text, EmitHandler emitHandler) {
|
||||
public void parseText(final CharSequence text, final EmitHandler emitHandler) {
|
||||
|
||||
final EmitCandidateHolder emitCandidateHolder = this.trieConfig.isAllowOverlaps() ?
|
||||
new OverlappingEmitCandidateHolder() :
|
||||
new NonOverlappingEmitCandidateHolder();
|
||||
|
||||
final EmitCandidateFlushHandler flushHandler = new EmitCandidateFlushHandler(emitHandler, emitCandidateHolder);
|
||||
final EmitCandidateFlushHandler flushHandler =
|
||||
new EmitCandidateFlushHandler(emitHandler, emitCandidateHolder);
|
||||
|
||||
TokenStream tknz = new TokenStream(text);
|
||||
|
||||
LinkedList<Transition> tknHistory = new LinkedList<>();
|
||||
State currentState = this.rootState;
|
||||
for (int position = 0; position < text.length(); position++) {
|
||||
|
||||
Transition nextTransition = tknz.nextTransition();
|
||||
while (nextTransition != null) {
|
||||
if (flushHandler.stop()) {
|
||||
return;
|
||||
}
|
||||
|
||||
Character character = text.charAt(position);
|
||||
if (trieConfig.isCaseInsensitive()) {
|
||||
character = Character.toLowerCase(character);
|
||||
tknHistory.add(nextTransition);
|
||||
currentState = getState(currentState, nextTransition, flushHandler);
|
||||
Collection<Keyword> emits = currentState.emit();
|
||||
int depth = currentState.getDepth();
|
||||
while (depth < tknHistory.size()) {
|
||||
tknHistory.remove();
|
||||
}
|
||||
currentState = getState(currentState, character, flushHandler);
|
||||
|
||||
Collection<String> emits = currentState.emit();
|
||||
for (String emit : emits) {
|
||||
int start = position - emit.length() + 1;
|
||||
if (!trieConfig.isOnlyWholeWords() || isWholeWord(text, start, position)) {
|
||||
emitCandidateHolder.addCandidate(new Emit(start, position, emit));
|
||||
}
|
||||
int position = nextTransition.getStart() + nextTransition.getLength();
|
||||
for (Keyword emit : emits) {
|
||||
int start = tknHistory.get(depth - emit.getDepth()).getStart();
|
||||
emitCandidateHolder.addCandidate(
|
||||
new Emit(start, position - 1, emit.getText()));
|
||||
}
|
||||
|
||||
nextTransition = tknz.nextTransition();
|
||||
}
|
||||
flushHandler.flush();
|
||||
}
|
||||
|
||||
public static boolean isWholeWord(CharSequence text, int start, int end) {
|
||||
return (start == 0 || Character.isWhitespace(text.charAt(start - 1))) &&
|
||||
(end == text.length() - 1 || Character.isWhitespace(text.charAt(end + 1)));
|
||||
}
|
||||
|
||||
private State getState(State currentState, Character character, EmitCandidateFlushHandler flushHandler) {
|
||||
State newCurrentState = currentState.nextState(character);
|
||||
private State getState(final State currentState,
|
||||
final Transition transition,
|
||||
final EmitCandidateFlushHandler flushHandler) {
|
||||
State failState = currentState;
|
||||
State newCurrentState = currentState.nextState(transition);
|
||||
while (newCurrentState == null) {
|
||||
currentState = currentState.failure(flushHandler);
|
||||
newCurrentState = currentState.nextState(character);
|
||||
failState = failState.failure(flushHandler);
|
||||
newCurrentState = failState.nextState(transition);
|
||||
}
|
||||
return newCurrentState;
|
||||
}
|
||||
@ -124,7 +206,7 @@ public class Trie {
|
||||
while (!queue.isEmpty()) {
|
||||
State currentState = queue.remove();
|
||||
|
||||
for (Character transition : currentState.getTransitions()) {
|
||||
for (Transition transition : currentState.getTransitions()) {
|
||||
State targetState = currentState.nextState(transition);
|
||||
queue.add(targetState);
|
||||
|
||||
@ -145,9 +227,11 @@ public class Trie {
|
||||
|
||||
public static class TrieBuilder {
|
||||
|
||||
private TrieConfig trieConfig = new TrieConfig();
|
||||
private final TrieConfig trieConfig = new TrieConfig();
|
||||
|
||||
private Trie trie = new Trie(trieConfig);
|
||||
private final Trie trie = new Trie(trieConfig);
|
||||
|
||||
private boolean hasAddedKeyword = false;
|
||||
|
||||
private TrieBuilder() {}
|
||||
|
||||
@ -162,15 +246,20 @@ public class Trie {
|
||||
}
|
||||
|
||||
public TrieBuilder onlyWholeWords() {
|
||||
if (hasAddedKeyword) {
|
||||
throw new IllegalStateException(
|
||||
"Unable to switch to only whole words after keywords added");
|
||||
}
|
||||
this.trieConfig.setOnlyWholeWords(true);
|
||||
return this;
|
||||
}
|
||||
|
||||
public TrieBuilder addKeyword(String keyword) {
|
||||
trie.addKeyword(keyword);
|
||||
hasAddedKeyword = true;
|
||||
return this;
|
||||
}
|
||||
|
||||
|
||||
public Trie build() {
|
||||
trie.constructFailureStates();
|
||||
return trie;
|
||||
|
||||
@ -7,7 +7,7 @@ public class TrieConfig {
|
||||
private boolean onlyWholeWords = false;
|
||||
|
||||
private boolean caseInsensitive = false;
|
||||
|
||||
|
||||
public boolean isAllowOverlaps() {
|
||||
return allowOverlaps;
|
||||
}
|
||||
|
||||
40
src/main/java/org/ahocorasick/trie/WordTransition.java
Normal file
40
src/main/java/org/ahocorasick/trie/WordTransition.java
Normal file
@ -0,0 +1,40 @@
|
||||
/*
|
||||
* Copyright 2015 Rogue Wave Software.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.ahocorasick.trie;
|
||||
|
||||
/**
|
||||
* Model transitions on words
|
||||
* @author doug.lovell
|
||||
*/
|
||||
public class WordTransition extends Transition<String> {
|
||||
|
||||
/**
|
||||
* Create a transition from a position in the source string
|
||||
* @param word to match
|
||||
* @param start position of first character within the source string
|
||||
*/
|
||||
public WordTransition(final String word, int start) {
|
||||
super(word, start, word.length());
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a transition without regard for position
|
||||
* @param word to match
|
||||
*/
|
||||
public WordTransition(final String word) {
|
||||
this(word, 0);
|
||||
}
|
||||
}
|
||||
@ -1,6 +1,5 @@
|
||||
package org.ahocorasick.trie;
|
||||
|
||||
import org.ahocorasick.trie.State;
|
||||
import org.junit.Test;
|
||||
|
||||
import static junit.framework.Assert.assertEquals;
|
||||
@ -10,15 +9,36 @@ public class StateTest {
|
||||
@Test
|
||||
public void constructSequenceOfCharacters() {
|
||||
State rootState = new State();
|
||||
Transition a = new CharacterTransition('a');
|
||||
Transition b = new CharacterTransition('b');
|
||||
Transition c = new CharacterTransition('c');
|
||||
rootState
|
||||
.addState('a')
|
||||
.addState('b')
|
||||
.addState('c');
|
||||
State currentState = rootState.nextState('a');
|
||||
.addState(a)
|
||||
.addState(b)
|
||||
.addState(c);
|
||||
State currentState = rootState.nextState(a);
|
||||
assertEquals(1, currentState.getDepth());
|
||||
currentState = currentState.nextState('b');
|
||||
currentState = currentState.nextState(b);
|
||||
assertEquals(2, currentState.getDepth());
|
||||
currentState = currentState.nextState('c');
|
||||
currentState = currentState.nextState(c);
|
||||
assertEquals(3, currentState.getDepth());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void constructSequenceOfWords() {
|
||||
State rootState = new State();
|
||||
Transition a = new WordTransition("Alpha");
|
||||
Transition b = new WordTransition("Bravo");
|
||||
Transition c = new WordTransition("Charlie");
|
||||
rootState
|
||||
.addState(a)
|
||||
.addState(b)
|
||||
.addState(c);
|
||||
State currentState = rootState.nextState(a);
|
||||
assertEquals(1, currentState.getDepth());
|
||||
currentState = currentState.nextState(b);
|
||||
assertEquals(2, currentState.getDepth());
|
||||
currentState = currentState.nextState(c);
|
||||
assertEquals(3, currentState.getDepth());
|
||||
}
|
||||
|
||||
|
||||
@ -2,6 +2,7 @@ package org.ahocorasick.trie;
|
||||
|
||||
import org.ahocorasick.trie.handler.EmitHandler;
|
||||
import org.ahocorasick.trie.handler.SimpleEmitHandler;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
@ -10,8 +11,9 @@ import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import static junit.framework.Assert.assertEquals;
|
||||
import static junit.framework.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import org.junit.Rule;
|
||||
import org.junit.rules.ExpectedException;
|
||||
|
||||
public class TrieTest {
|
||||
|
||||
@ -78,8 +80,9 @@ public class TrieTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
public void variousKeywordsFirstMatch() {
|
||||
public void variousKeywordsFirstMatchWordTransitions() {
|
||||
Trie trie = Trie.builder()
|
||||
.onlyWholeWords()
|
||||
.addKeyword("abc")
|
||||
.addKeyword("bcd")
|
||||
.addKeyword("cde")
|
||||
@ -193,6 +196,23 @@ public class TrieTest {
|
||||
checkEmit(iterator.next(), 51, 58, "broccoli");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void recipesWordTransitions() {
|
||||
Trie trie = Trie.builder()
|
||||
.onlyWholeWords()
|
||||
.addKeyword("veal")
|
||||
.addKeyword("cauliflower")
|
||||
.addKeyword("broccoli")
|
||||
.addKeyword("tomatoes")
|
||||
.build();
|
||||
Collection<Emit> emits = trie.parseText("2 cauliflower 3 tomatoes 4 slices of veal 100g broccoli");
|
||||
Iterator<Emit> iterator = emits.iterator();
|
||||
checkEmit(iterator.next(), 2, 12, "cauliflower");
|
||||
checkEmit(iterator.next(), 16, 23, "tomatoes");
|
||||
checkEmit(iterator.next(), 37, 40, "veal");
|
||||
checkEmit(iterator.next(), 47, 54, "broccoli");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void recipesFirstMatch() {
|
||||
Trie trie = Trie.builder()
|
||||
@ -395,6 +415,41 @@ public class TrieTest {
|
||||
assertToken(tokensIt.next(), " in reserve", false, false, false);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void tokenizeFullSentenceByWords() {
|
||||
Trie trie = Trie.builder()
|
||||
.onlyWholeWords()
|
||||
.addKeyword("Alpha")
|
||||
.addKeyword("Beta")
|
||||
.addKeyword("Gamma")
|
||||
.build();
|
||||
Collection<Token> tokens = trie.tokenize("Hear: Alpha team first, Beta from the rear, Gamma in reserve");
|
||||
assertEquals(7, tokens.size());
|
||||
Iterator<Token> tokensIt = tokens.iterator();
|
||||
assertToken(tokensIt.next(), "Hear: ", false, false, false);
|
||||
assertToken(tokensIt.next(), "Alpha", true, true, false);
|
||||
assertToken(tokensIt.next(), " team first, ", false, false, false);
|
||||
assertToken(tokensIt.next(), "Beta", true, true, false);
|
||||
assertToken(tokensIt.next(), " from the rear, ", false, false, false);
|
||||
assertToken(tokensIt.next(), "Gamma", true, true, false);
|
||||
assertToken(tokensIt.next(), " in reserve", false, false, false);
|
||||
}
|
||||
|
||||
@Rule
|
||||
public ExpectedException thrown = ExpectedException.none();
|
||||
|
||||
@Test
|
||||
public void onlyWholeWordsThrowsExceptionAfterKeywordsAdded()
|
||||
throws IllegalStateException {
|
||||
thrown.expect(IllegalStateException.class);
|
||||
thrown.expectMessage("Unable to switch to only whole words after keywords added");
|
||||
Trie trie = Trie.builder()
|
||||
.addKeyword("Happy for now")
|
||||
.onlyWholeWords()
|
||||
.addKeyword("Not so happy")
|
||||
.build();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void bug5InGithubReportedByXCurry() {
|
||||
Trie trie = Trie.builder().caseInsensitive().onlyWholeWords()
|
||||
@ -489,6 +544,22 @@ public class TrieTest {
|
||||
checkEmit(firstMatch, 5, 8, "this");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void unicodeInKeyword() {
|
||||
// The upper case character ('İ') is Unicode,
|
||||
// which was read by AC as a 2-byte char
|
||||
String target = "it is so much LİKE Unicode to mess with Java";
|
||||
Trie trie = Trie.builder()
|
||||
.onlyWholeWords()
|
||||
.addKeyword("so much LİKE Unicode")
|
||||
.addKeyword("it is")
|
||||
.build();
|
||||
Collection<Emit> emits = trie.parseText(target);
|
||||
Iterator<Emit> it = emits.iterator();
|
||||
checkEmit(it.next(), 0, 4, "it is");
|
||||
checkEmit(it.next(), 6, 25, "so much LİKE Unicode");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void partialMatchWhiteSpaces() {
|
||||
Trie trie = Trie.builder()
|
||||
@ -500,11 +571,61 @@ public class TrieTest {
|
||||
checkEmit(emits.iterator().next(), 0, 9, "#sugar-123");
|
||||
}
|
||||
|
||||
/*
|
||||
For onlyWholeWords, we'll ignore leading and trailing white space
|
||||
included on keywords
|
||||
*/
|
||||
@Test
|
||||
public void spacesAroundKeywordByWords() {
|
||||
String text = "lorem ipso facto genera linden pharma six 1";
|
||||
String keyword = " " + text + " ";
|
||||
Trie trie = Trie.builder()
|
||||
.onlyWholeWords()
|
||||
.caseInsensitive()
|
||||
.addKeyword(keyword)
|
||||
.build();
|
||||
Collection < Emit > emits = trie.parseText(
|
||||
text + " under addressed object ");
|
||||
assertEquals(1, emits.size());
|
||||
checkEmit(emits.iterator().next(), 0, text.length() - 1, keyword);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void punctuationInText() {
|
||||
Trie trie = Trie.builder()
|
||||
.onlyWholeWords()
|
||||
.addKeyword("pie")
|
||||
.build();
|
||||
|
||||
Collection<Emit> emits = trie.parseText("Want some pie? Gimme pie! pie, pie. The pie's revenge.");
|
||||
Assert.assertEquals(5, emits.size());
|
||||
Iterator<Emit> it = emits.iterator();
|
||||
checkEmit(it.next(), 10, 12, "pie");
|
||||
checkEmit(it.next(), 21, 23, "pie");
|
||||
checkEmit(it.next(), 26, 28, "pie");
|
||||
checkEmit(it.next(), 31, 33, "pie");
|
||||
checkEmit(it.next(), 40, 42, "pie");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void punctuationInSearchTerm() {
|
||||
Trie trie = Trie.builder()
|
||||
.onlyWholeWords()
|
||||
.addKeyword("Dr. Feelgood")
|
||||
.addKeyword("Oi!")
|
||||
.build();
|
||||
|
||||
Collection<Emit> emits = trie
|
||||
.parseText("The Oi! music genre is inspired by Dr. Feelgood and other bands. Oi or Dr Feelgood should not match.");
|
||||
|
||||
Assert.assertEquals(2, emits.size());
|
||||
|
||||
|
||||
}
|
||||
|
||||
private void assertToken(Token token, String fragment, boolean match, boolean wholeWord, boolean whiteSpace) {
|
||||
assertEquals(fragment, token.getFragment());
|
||||
assertEquals(match, token.isMatch());
|
||||
assertEquals(wholeWord, token.isWholeWord());
|
||||
assertEquals(whiteSpace, token.isWhiteSpace());
|
||||
}
|
||||
|
||||
private void checkEmit(Emit next, int expectedStart, int expectedEnd, String expectedKeyword) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user