first pass refactoring for word transitions

This commit is contained in:
Douglas Lovell 2015-10-27 16:55:35 -06:00
parent 3393e4f51f
commit 6283bf039d
4 changed files with 180 additions and 55 deletions

View File

@ -10,8 +10,8 @@ import java.util.*;
* </p>
*
* <ul>
* <li>success; when a character points to another state, it must return that state</li>
* <li>failure; when a character has no matching state, the algorithm must be able to fall back on a
* <li>success; when a transition points to another state, it must return that state</li>
* <li>failure; when a transition has no matching state, the algorithm must be able to fall back on a
* state with less depth</li>
* <li>emits; when this state is passed and keywords have been matched, the matches must be
* 'emitted' so that they can be used later on.</li>
@ -19,7 +19,7 @@ import java.util.*;
*
* <p>
* The root state is special in the sense that it has no failure state; it cannot fail. If it 'fails'
* it will still parse the next character and start from the root node. This ensures that the algorithm
* it will still parse the next transition and start from the root node. This ensures that the algorithm
* always runs. All other states always have a fail state.
* </p>
*
@ -35,9 +35,9 @@ public class State {
/**
* referred to in the white paper as the 'goto' structure. From a state it is possible to go
* to other states, depending on the character passed.
* to other states, depending on the transition passed.
*/
private Map<Character,State> success = new HashMap<Character, State>();
private final Map<Transition,State> success = new HashMap<>();
/** if no matching states are found, the failure state will be returned */
private State failure = null;
@ -54,27 +54,27 @@ public class State {
this.rootState = depth == 0 ? this : null;
}
private State nextState(Character character, boolean ignoreRootState) {
State nextState = this.success.get(character);
private State nextState(Transition t, boolean ignoreRootState) {
State nextState = this.success.get(t);
if (!ignoreRootState && nextState == null && this.rootState != null) {
nextState = this.rootState;
}
return nextState;
}
public State nextState(Character character) {
return nextState(character, false);
public State nextState(Transition t) {
return nextState(t, false);
}
public State nextStateIgnoreRootState(Character character) {
return nextState(character, true);
public State nextStateIgnoreRootState(Transition t) {
return nextState(t, true);
}
public State addState(Character character) {
State nextState = nextStateIgnoreRootState(character);
public State addState(Transition t) {
State nextState = nextStateIgnoreRootState(t);
if (nextState == null) {
nextState = new State(this.depth+1);
this.success.put(character, nextState);
this.success.put(t, nextState);
}
return nextState;
}
@ -119,7 +119,7 @@ public class State {
return this.success.values();
}
public Collection<Character> getTransitions() {
public Collection<Transition> getTransitions() {
return this.success.keySet();
}

View File

@ -0,0 +1,36 @@
/*
* Copyright 2015 Rogue Wave Software.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.ahocorasick.trie;
/**
* Enables the trie to model transitions on whole words or characters
* ... or whatever!
* @author doug.lovell
* @param <T>
*/
public class Transition<T> {
private final T token;
public Transition(T token) {
this.token = token;
}
public T transitionToken() {
return token;
}
public boolean isWordSeparator() {
return (!(token instanceof Character) ||
Character.isSpaceChar((Character)token));
}
}

View File

@ -19,27 +19,112 @@ import java.util.concurrent.LinkedBlockingDeque;
*/
public class Trie {
private TrieConfig trieConfig;
private final TrieConfig trieConfig;
private State rootState;
private final State rootState;
private Trie(TrieConfig trieConfig) {
this.trieConfig = trieConfig;
this.rootState = new State();
}
/**
 * Source of transitions for a keyword or for the text being searched.
 * The loops that drain a tokenizer keep calling {@link #nextTransition()}
 * until it yields {@code null}.
 */
private interface KeywordTokenizer {

    /**
     * @return the next transition, or {@code null} to signal the end of input
     */
    Transition nextTransition();
}
/**
 * Splits the input into whole-word transitions, with a single space
 * transition between consecutive words. Words are delimited using the
 * default delimiters of {@link java.util.StringTokenizer}.
 */
private class WordTokenizer implements KeywordTokenizer {

    private final java.util.StringTokenizer st;

    /** true when the next transition to hand out is a word rather than a separator */
    private boolean lastWasSpace = true;

    public WordTokenizer(String keyword) {
        st = new java.util.StringTokenizer(keyword);
    }

    /**
     * @return the next word or separator transition, or {@code null} once all
     *         words have been handed out; no separator follows the last word
     */
    @Override
    public Transition<String> nextTransition() {
        // Callers loop until null. Calling nextToken() on an exhausted
        // StringTokenizer would throw NoSuchElementException instead, so
        // the end of input has to be detected up front. Checking before the
        // separator branch as well avoids a dangling trailing separator.
        if (!st.hasMoreTokens()) {
            return null;
        }
        // NOTE(review): the separator is a Character token although the declared
        // type argument is String; kept as-is so existing callers are unaffected.
        Transition t;
        if (lastWasSpace) {
            t = new Transition<>(st.nextToken());
        } else {
            t = new Transition<>(' ');
        }
        lastWasSpace = !lastWasSpace;
        return t;
    }
}
/**
 * Hands out one transition per character of the input, in order.
 */
private class CharacterTokenizer implements KeywordTokenizer {

    private final java.text.StringCharacterIterator ct;

    public CharacterTokenizer(String keyword) {
        ct = new java.text.StringCharacterIterator(keyword);
    }

    /**
     * @return a transition for the character at the iterator's current index,
     *         or {@code null} once every character has been handed out
     */
    @Override
    public Transition<Character> nextTransition() {
        // Test the index rather than comparing against CharacterIterator.DONE:
        // DONE ('\uFFFF') could legitimately occur inside the input.
        if (ct.getIndex() >= ct.getEndIndex()) {
            return null;
        }
        // Read first, advance afterwards. next() moves the index before it
        // reads, so calling it directly would skip the first character.
        char current = ct.current();
        ct.next();
        return new Transition<>(current);
    }
}
/**
 * Picks the tokenizer that matches the trie configuration.
 *
 * @param keyword the text to tokenize
 * @return a word tokenizer when the configuration reports word-only nodes,
 *         a character tokenizer otherwise
 */
private KeywordTokenizer keywordTokenizer(String keyword) {
    if (!trieConfig.hasOnlyWordNodes()) {
        return new CharacterTokenizer(keyword);
    }
    return new WordTokenizer(keyword);
}
/**
 * Wraps a {@link KeywordTokenizer} with one transition of lookahead and keeps
 * a copy of everything consumed so far, so that matches can be positioned and
 * checked against word boundaries.
 */
private class TokenStream {

    private final KeywordTokenizer kwt;

    /** transition fetched ahead of time by {@link #isWholeWord(int)}, if any */
    private Transition lookahead;

    /** text of all transitions consumed so far; only touched by this instance */
    private final StringBuilder input = new StringBuilder();

    public TokenStream(KeywordTokenizer kwt) {
        this.kwt = kwt;
    }

    /**
     * Consumes the next transition, preferring a pending lookahead, and
     * appends its token text to the consumed input.
     *
     * @return the consumed transition, or {@code null} when the tokenizer
     *         yields {@code null}
     */
    public Transition nextTransition() {
        Transition next = lookahead;
        if (next == null) {
            next = kwt.nextTransition();
        }
        else {
            lookahead = null;
        }
        if (next != null) {
            input.append(next.transitionToken().toString());
        }
        return next;
    }

    /**
     * @return index of the last character consumed so far, or -1 when nothing
     *         has been consumed yet. This is an inclusive end index, so that
     *         {@code position() - emit.length() + 1} is the index of the first
     *         character of a match ending at the current position.
     */
    public int position() {
        return input.length() - 1;
    }

    /**
     * Tests whether a match that starts at {@code start} and ends at the
     * current position is delimited on both sides.
     *
     * @param start index in the consumed input of the first character of the match
     * @return {@code true} when the match starts at index 0 or right after a
     *         space character, and is followed by the end of input or by a
     *         word-separator transition
     */
    public boolean isWholeWord(int start) {
        // Peek at what follows without consuming it; nextTransition() will
        // hand this same transition out on its next call.
        if (lookahead == null) {
            lookahead = kwt.nextTransition();
        }
        // The leading boundary is the character *before* the match, not the
        // first character of the match itself.
        return ((start == 0 ||
                Character.isSpaceChar(input.codePointAt(start - 1))) &&
                (lookahead == null || lookahead.isWordSeparator()));
    }
}
private void addKeyword(String keyword) {
if (keyword == null || keyword.length() == 0) {
return;
}
State currentState = this.rootState;
for (Character character : keyword.toCharArray()) {
if (trieConfig.isCaseInsensitive()) {
character = Character.toLowerCase(character);
}
currentState = currentState.addState(character);
if (trieConfig.isCaseInsensitive()) {
keyword = keyword.toLowerCase();
}
currentState.addEmit(trieConfig.isCaseInsensitive() ? keyword.toLowerCase() : keyword);
State currentState = this.rootState;
KeywordTokenizer tknz = keywordTokenizer(keyword);
Transition tn = tknz.nextTransition();
while (tn != null) {
currentState = currentState.addState(tn);
tn = tknz.nextTransition();
}
currentState.addEmit(keyword);
}
public Collection<Token> tokenize(String text) {
@ -53,10 +138,10 @@ public class Trie {
return emitHandler.getEmits();
}
public boolean containsMatch(CharSequence text) {
Emit firstMatch = firstMatch(text);
return firstMatch != null;
}
public boolean containsMatch(CharSequence text) {
Emit firstMatch = firstMatch(text);
return firstMatch != null;
}
public Emit firstMatch(CharSequence text) {
FirstMatchHandler emitHandler = new FirstMatchHandler();
@ -72,41 +157,35 @@ public class Trie {
final EmitCandidateFlushHandler flushHandler = new EmitCandidateFlushHandler(emitHandler, emitCandidateHolder);
String input = text.toString();
if (trieConfig.isCaseInsensitive()) {
input = input.toLowerCase();
}
TokenStream tknz = new TokenStream(keywordTokenizer(input));
State currentState = this.rootState;
for (int position = 0; position < text.length(); position++) {
if (flushHandler.stop()) {
return;
}
Character character = text.charAt(position);
if (trieConfig.isCaseInsensitive()) {
character = Character.toLowerCase(character);
}
currentState = getState(currentState, character, flushHandler);
Transition tn = tknz.nextTransition();
while (tn != null) {
currentState = getState(currentState, tn, flushHandler);
Collection<String> emits = currentState.emit();
for (String emit : emits) {
int start = position - emit.length() + 1;
if (!trieConfig.isOnlyWholeWords() || isWholeWord(text, start, position)) {
int position = tknz.position();
int start = tknz.position() - emit.length() + 1;
if (!trieConfig.isOnlyWholeWords() || tknz.isWholeWord(start)) {
emitCandidateHolder.addCandidate(new Emit(start, position, emit));
}
}
tn = tknz.nextTransition();
}
flushHandler.flush();
}
public static boolean isWholeWord(CharSequence text, int start, int end) {
return (start == 0 || Character.isWhitespace(text.charAt(start - 1))) &&
(end == text.length() - 1 || Character.isWhitespace(text.charAt(end + 1)));
}
private State getState(State currentState, Character character, EmitCandidateFlushHandler flushHandler) {
State newCurrentState = currentState.nextState(character);
private State getState(State currentState, Transition transition, EmitCandidateFlushHandler flushHandler) {
State newCurrentState = currentState.nextState(transition);
while (newCurrentState == null) {
currentState = currentState.failure(flushHandler);
newCurrentState = currentState.nextState(character);
newCurrentState = currentState.nextState(transition);
}
return newCurrentState;
}
@ -124,7 +203,7 @@ public class Trie {
while (!queue.isEmpty()) {
State currentState = queue.remove();
for (Character transition : currentState.getTransitions()) {
for (Transition transition : currentState.getTransitions()) {
State targetState = currentState.nextState(transition);
queue.add(targetState);
@ -145,9 +224,9 @@ public class Trie {
public static class TrieBuilder {
private TrieConfig trieConfig = new TrieConfig();
private final TrieConfig trieConfig = new TrieConfig();
private Trie trie = new Trie(trieConfig);
private final Trie trie = new Trie(trieConfig);
private TrieBuilder() {}

View File

@ -7,6 +7,8 @@ public class TrieConfig {
private boolean onlyWholeWords = false;
private boolean caseInsensitive = false;
private boolean wordNodes = false;
public boolean isAllowOverlaps() {
return allowOverlaps;
@ -17,7 +19,7 @@ public class TrieConfig {
}
public boolean isOnlyWholeWords() {
return onlyWholeWords;
return wordNodes || onlyWholeWords;
}
public void setOnlyWholeWords(boolean onlyWholeWords) {
@ -31,4 +33,12 @@ public class TrieConfig {
public void setCaseInsensitive(boolean caseInsensitive) {
this.caseInsensitive = caseInsensitive;
}
/**
 * @return the current value of the word-nodes flag
 */
public boolean hasOnlyWordNodes() {
    return this.wordNodes;
}
/**
 * Sets the word-nodes flag.
 *
 * @param wordNodes the new value of the flag
 */
public void setOnlyWordNodes(boolean wordNodes) {
this.wordNodes = wordNodes;
}
}