From 6283bf039d63be5b41a7b37d3e6b028ae67a1065 Mon Sep 17 00:00:00 2001
From: Douglas Lovell
Date: Tue, 27 Oct 2015 16:55:35 -0600
Subject: [PATCH] first pass refactoring for word transitions
---
src/main/java/org/ahocorasick/trie/State.java | 30 ++--
.../java/org/ahocorasick/trie/Transition.java | 36 ++++
src/main/java/org/ahocorasick/trie/Trie.java | 157 +++++++++++++-----
.../java/org/ahocorasick/trie/TrieConfig.java | 12 +-
4 files changed, 180 insertions(+), 55 deletions(-)
create mode 100644 src/main/java/org/ahocorasick/trie/Transition.java
diff --git a/src/main/java/org/ahocorasick/trie/State.java b/src/main/java/org/ahocorasick/trie/State.java
index 82d167b..e104732 100644
--- a/src/main/java/org/ahocorasick/trie/State.java
+++ b/src/main/java/org/ahocorasick/trie/State.java
@@ -10,8 +10,8 @@ import java.util.*;
*
*
*
- * - success; when a character points to another state, it must return that state
- * - failure; when a character has no matching state, the algorithm must be able to fall back on a
+ * - success; when a transition points to another state, it must return that state
+ * - failure; when a transition has no matching state, the algorithm must be able to fall back on a
* state with less depth
* - emits; when this state is passed and keywords have been matched, the matches must be
* 'emitted' so that they can be used later on.
@@ -19,7 +19,7 @@ import java.util.*;
*
*
* The root state is special in the sense that it has no failure state; it cannot fail. If it 'fails'
- * it will still parse the next character and start from the root node. This ensures that the algorithm
+ * it will still parse the next transition and start from the root node. This ensures that the algorithm
* always runs. All other states always have a fail state.
*
*
@@ -35,9 +35,9 @@ public class State {
/**
* referred to in the white paper as the 'goto' structure. From a state it is possible to go
- * to other states, depending on the character passed.
+ * to other states, depending on the transition passed.
*/
- private Map success = new HashMap();
+ private final Map success = new HashMap<>();
/** if no matching states are found, the failure state will be returned */
private State failure = null;
@@ -54,27 +54,27 @@ public class State {
this.rootState = depth == 0 ? this : null;
}
- private State nextState(Character character, boolean ignoreRootState) {
- State nextState = this.success.get(character);
+ private State nextState(Transition t, boolean ignoreRootState) {
+ State nextState = this.success.get(t);
if (!ignoreRootState && nextState == null && this.rootState != null) {
nextState = this.rootState;
}
return nextState;
}
- public State nextState(Character character) {
- return nextState(character, false);
+ public State nextState(Transition t) {
+ return nextState(t, false);
}
- public State nextStateIgnoreRootState(Character character) {
- return nextState(character, true);
+ public State nextStateIgnoreRootState(Transition t) {
+ return nextState(t, true);
}
- public State addState(Character character) {
- State nextState = nextStateIgnoreRootState(character);
+ public State addState(Transition t) {
+ State nextState = nextStateIgnoreRootState(t);
if (nextState == null) {
nextState = new State(this.depth+1);
- this.success.put(character, nextState);
+ this.success.put(t, nextState);
}
return nextState;
}
@@ -119,7 +119,7 @@ public class State {
return this.success.values();
}
- public Collection getTransitions() {
+ public Collection getTransitions() {
return this.success.keySet();
}
diff --git a/src/main/java/org/ahocorasick/trie/Transition.java b/src/main/java/org/ahocorasick/trie/Transition.java
new file mode 100644
index 0000000..99776b0
--- /dev/null
+++ b/src/main/java/org/ahocorasick/trie/Transition.java
@@ -0,0 +1,78 @@
+/*
+ * Copyright 2015 Rogue Wave Software.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.ahocorasick.trie;
+
+import java.util.Objects;
+
+/**
+ * Label on an edge of the trie. Wrapping the token lets the trie model
+ * transitions on whole words or characters ... or whatever!
+ *
+ * <p>Instances compare by token value. {@code State} stores transitions as
+ * hash map keys and looks them up with newly created instances, so identity
+ * based equality would never find an existing edge.
+ *
+ * @author doug.lovell
+ * @param <T> type of the wrapped token, for example Character or String
+ */
+public class Transition<T> {
+    private final T token;
+
+    /**
+     * @param token the token that selects this transition
+     */
+    public Transition(T token) {
+        this.token = token;
+    }
+
+    /**
+     * @return the wrapped token
+     */
+    public T transitionToken() {
+        return token;
+    }
+
+    /**
+     * Tells whether this transition ends a word.
+     *
+     * @return {@code true} for a whitespace character and for every token
+     *         that is not a character; {@code false} for other characters
+     */
+    public boolean isWordSeparator() {
+        if (!(token instanceof Character)) {
+            return true;
+        }
+        // isWhitespace rather than isSpaceChar: tabs and line breaks are
+        // control characters, which isSpaceChar does not report as spaces.
+        return Character.isWhitespace((Character) token);
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+        if (this == obj) {
+            return true;
+        }
+        if (!(obj instanceof Transition)) {
+            return false;
+        }
+        return Objects.equals(token, ((Transition<?>) obj).token);
+    }
+
+    @Override
+    public int hashCode() {
+        return Objects.hashCode(token);
+    }
+}
diff --git a/src/main/java/org/ahocorasick/trie/Trie.java b/src/main/java/org/ahocorasick/trie/Trie.java
index 3c5403d..d2ada77 100644
--- a/src/main/java/org/ahocorasick/trie/Trie.java
+++ b/src/main/java/org/ahocorasick/trie/Trie.java
@@ -19,27 +19,162 @@ import java.util.concurrent.LinkedBlockingDeque;
*/
public class Trie {
- private TrieConfig trieConfig;
+ private final TrieConfig trieConfig;
- private State rootState;
+ private final State rootState;
private Trie(TrieConfig trieConfig) {
this.trieConfig = trieConfig;
this.rootState = new State();
}
+
+    /**
+     * Splits a string into the sequence of transitions that drives the trie.
+     */
+    private interface KeywordTokenizer {
+        /**
+         * @return the next transition, or {@code null} once the input is exhausted
+         */
+        Transition<?> nextTransition();
+    }
+
+    /**
+     * Emits the whitespace-separated words of a string with a single space
+     * transition between consecutive words; leading, trailing and repeated
+     * whitespace is not reproduced.
+     */
+    private static final class WordTokenizer implements KeywordTokenizer {
+        private final java.util.StringTokenizer st;
+        private boolean lastWasSpace = true;
+
+        public WordTokenizer(String keyword) {
+            st = new java.util.StringTokenizer(keyword);
+        }
+
+        @Override
+        public Transition<?> nextTransition() {
+            // nextToken() throws once the words run out, but callers loop until
+            // null; testing first also avoids emitting a trailing separator.
+            if (!st.hasMoreTokens()) {
+                return null;
+            }
+            Transition<?> t;
+            if (lastWasSpace) {
+                t = new Transition<>(st.nextToken());
+            } else {
+                t = new Transition<>(' ');
+            }
+            lastWasSpace = !lastWasSpace;
+            return t;
+        }
+    }
+
+    /** Emits one transition per {@code char} of a string, in order. */
+    private static final class CharacterTokenizer implements KeywordTokenizer {
+        private final String keyword;
+        private int index = 0;
+
+        public CharacterTokenizer(String keyword) {
+            this.keyword = keyword;
+        }
+
+        @Override
+        public Transition<?> nextTransition() {
+            // Indexed on purpose: StringCharacterIterator.next() advances before
+            // reading, which drops the first character, and signals the end with
+            // CharacterIterator.DONE instead of null, so callers never stop.
+            if (index >= keyword.length()) {
+                return null;
+            }
+            return new Transition<>(keyword.charAt(index++));
+        }
+    }
+
+    /**
+     * Chooses the tokenizer for the configuration: word based when the trie
+     * has only word nodes, character based otherwise.
+     */
+    private KeywordTokenizer keywordTokenizer(String keyword) {
+        if (trieConfig.hasOnlyWordNodes()) {
+            return new WordTokenizer(keyword);
+        }
+        return new CharacterTokenizer(keyword);
+    }
+
+    /**
+     * Wraps a tokenizer for scanning text: records the characters consumed so
+     * far and keeps one transition of lookahead for whole-word checks.
+     */
+    private static final class TokenStream {
+        private final KeywordTokenizer kwt;
+        private Transition<?> lookahead;
+        private final StringBuilder input = new StringBuilder();
+
+        public TokenStream(KeywordTokenizer kwt) {
+            this.kwt = kwt;
+        }
+
+        /**
+         * @return the next transition, or {@code null} at the end of the input
+         */
+        public Transition<?> nextTransition() {
+            Transition<?> next = lookahead;
+            if (next == null) {
+                next = kwt.nextTransition();
+            } else {
+                lookahead = null;
+            }
+            if (next != null) {
+                input.append(next.transitionToken());
+            }
+            return next;
+        }
+
+        /**
+         * @return index in the consumed input of the last character read so
+         *         far (an inclusive end index), or -1 before anything is read
+         */
+        public int position() {
+            return input.length() - 1;
+        }
+
+        /**
+         * Checks that a match ending at the current position stands alone.
+         *
+         * @param start index in the consumed input of the match's first character
+         * @return {@code true} when the match is preceded by the start of the
+         *         input or whitespace, and followed by the end of the input or
+         *         a word separator
+         */
+        public boolean isWholeWord(int start) {
+            if (lookahead == null) {
+                lookahead = kwt.nextTransition();
+            }
+            // The character before the match decides, not the match's own first
+            // character; start <= 0 also keeps charAt() within bounds.
+            boolean startsWord = start <= 0
+                    || Character.isWhitespace(input.charAt(start - 1));
+            boolean endsWord = lookahead == null || lookahead.isWordSeparator();
+            return startsWord && endsWord;
+        }
+    }
+
private void addKeyword(String keyword) {
if (keyword == null || keyword.length() == 0) {
return;
}
- State currentState = this.rootState;
- for (Character character : keyword.toCharArray()) {
- if (trieConfig.isCaseInsensitive()) {
- character = Character.toLowerCase(character);
- }
- currentState = currentState.addState(character);
+ if (trieConfig.isCaseInsensitive()) {
+ keyword = keyword.toLowerCase();
}
- currentState.addEmit(trieConfig.isCaseInsensitive() ? keyword.toLowerCase() : keyword);
+ State currentState = this.rootState;
+ KeywordTokenizer tknz = keywordTokenizer(keyword);
+ Transition tn = tknz.nextTransition();
+ while (tn != null) {
+ currentState = currentState.addState(tn);
+ tn = tknz.nextTransition();
+ }
+ currentState.addEmit(keyword);
}
public Collection tokenize(String text) {
@@ -53,10 +138,10 @@ public class Trie {
return emitHandler.getEmits();
}
- public boolean containsMatch(CharSequence text) {
- Emit firstMatch = firstMatch(text);
- return firstMatch != null;
- }
+ public boolean containsMatch(CharSequence text) {
+ Emit firstMatch = firstMatch(text);
+ return firstMatch != null;
+ }
public Emit firstMatch(CharSequence text) {
FirstMatchHandler emitHandler = new FirstMatchHandler();
@@ -72,41 +157,35 @@ public class Trie {
final EmitCandidateFlushHandler flushHandler = new EmitCandidateFlushHandler(emitHandler, emitCandidateHolder);
+ String input = text.toString();
+ if (trieConfig.isCaseInsensitive()) {
+ input = input.toLowerCase();
+ }
+ TokenStream tknz = new TokenStream(keywordTokenizer(input));
+
State currentState = this.rootState;
- for (int position = 0; position < text.length(); position++) {
-
- if (flushHandler.stop()) {
- return;
- }
-
- Character character = text.charAt(position);
- if (trieConfig.isCaseInsensitive()) {
- character = Character.toLowerCase(character);
- }
- currentState = getState(currentState, character, flushHandler);
-
+ Transition tn = tknz.nextTransition();
+ while (tn != null) {
+ currentState = getState(currentState, tn, flushHandler);
+
Collection emits = currentState.emit();
for (String emit : emits) {
- int start = position - emit.length() + 1;
- if (!trieConfig.isOnlyWholeWords() || isWholeWord(text, start, position)) {
+ int position = tknz.position();
+ int start = tknz.position() - emit.length() + 1;
+ if (!trieConfig.isOnlyWholeWords() || tknz.isWholeWord(start)) {
emitCandidateHolder.addCandidate(new Emit(start, position, emit));
}
}
-
+ tn = tknz.nextTransition();
}
flushHandler.flush();
}
- public static boolean isWholeWord(CharSequence text, int start, int end) {
- return (start == 0 || Character.isWhitespace(text.charAt(start - 1))) &&
- (end == text.length() - 1 || Character.isWhitespace(text.charAt(end + 1)));
- }
-
- private State getState(State currentState, Character character, EmitCandidateFlushHandler flushHandler) {
- State newCurrentState = currentState.nextState(character);
+ private State getState(State currentState, Transition transition, EmitCandidateFlushHandler flushHandler) {
+ State newCurrentState = currentState.nextState(transition);
while (newCurrentState == null) {
currentState = currentState.failure(flushHandler);
- newCurrentState = currentState.nextState(character);
+ newCurrentState = currentState.nextState(transition);
}
return newCurrentState;
}
@@ -124,7 +203,7 @@ public class Trie {
while (!queue.isEmpty()) {
State currentState = queue.remove();
- for (Character transition : currentState.getTransitions()) {
+ for (Transition transition : currentState.getTransitions()) {
State targetState = currentState.nextState(transition);
queue.add(targetState);
@@ -145,9 +224,9 @@ public class Trie {
public static class TrieBuilder {
- private TrieConfig trieConfig = new TrieConfig();
+ private final TrieConfig trieConfig = new TrieConfig();
- private Trie trie = new Trie(trieConfig);
+ private final Trie trie = new Trie(trieConfig);
private TrieBuilder() {}
diff --git a/src/main/java/org/ahocorasick/trie/TrieConfig.java b/src/main/java/org/ahocorasick/trie/TrieConfig.java
index 6fa05c7..60a0b02 100644
--- a/src/main/java/org/ahocorasick/trie/TrieConfig.java
+++ b/src/main/java/org/ahocorasick/trie/TrieConfig.java
@@ -7,6 +7,8 @@ public class TrieConfig {
private boolean onlyWholeWords = false;
private boolean caseInsensitive = false;
+
+ private boolean wordNodes = false;
public boolean isAllowOverlaps() {
return allowOverlaps;
@@ -17,7 +19,7 @@ public class TrieConfig {
}
public boolean isOnlyWholeWords() {
- return onlyWholeWords;
+ return wordNodes || onlyWholeWords;
}
public void setOnlyWholeWords(boolean onlyWholeWords) {
@@ -31,4 +33,12 @@ public class TrieConfig {
public void setCaseInsensitive(boolean caseInsensitive) {
this.caseInsensitive = caseInsensitive;
}
+
+ public boolean hasOnlyWordNodes() {
+ return wordNodes;
+ }
+
+ public void setOnlyWordNodes(boolean wordNodes) {
+ this.wordNodes = wordNodes;
+ }
}