first pass refactoring for word transitions

2015-10-27 16:55:35 -06:00 · 2015-10-27 16:55:35 -06:00 · 6283bf039d
commit 6283bf039d
parent 3393e4f51f
4 changed files with 180 additions and 55 deletions
--- a/src/main/java/org/ahocorasick/trie/State.java
+++ b/src/main/java/org/ahocorasick/trie/State.java
@ -10,8 +10,8 @@ import java.util.*;
 * </p>
 *
 * <ul>
- *     <li>success; when a character points to another state, it must return that state</li>
- *     <li>failure; when a character has no matching state, the algorithm must be able to fall back on a
+ *     <li>success; when a transition points to another state, it must return that state</li>
+ *     <li>failure; when a transition has no matching state, the algorithm must be able to fall back on a
 *         state with less depth</li>
 *     <li>emits; when this state is passed and keywords have been matched, the matches must be
 *         'emitted' so that they can be used later on.</li>
@ -19,7 +19,7 @@ import java.util.*;
 *
 * <p>
 *     The root state is special in the sense that it has no failure state; it cannot fail. If it 'fails'
- *     it will still parse the next character and start from the root node. This ensures that the algorithm
+ *     it will still parse the next transition and start from the root node. This ensures that the algorithm
 *     always runs. All other states always have a fail state.
 * </p>
 *
@ -35,9 +35,9 @@ public class State {

    /**
     * referred to in the white paper as the 'goto' structure. From a state it is possible to go
-     * to other states, depending on the character passed.
+     * to other states, depending on the transition passed.
     */
-    private Map<Character,State> success = new HashMap<Character, State>();
+    private final Map<Transition,State> success = new HashMap<>();

    /** if no matching states are found, the failure state will be returned */
    private State failure = null;
@ -54,27 +54,27 @@ public class State {
        this.rootState = depth == 0 ? this : null;
    }

-    private State nextState(Character character, boolean ignoreRootState) {
-        State nextState = this.success.get(character);
+    private State nextState(Transition t, boolean ignoreRootState) {
+        State nextState = this.success.get(t);
        if (!ignoreRootState && nextState == null && this.rootState != null) {
            nextState = this.rootState;
        }
        return nextState;
    }

-    public State nextState(Character character) {
-        return nextState(character, false);
+    public State nextState(Transition t) {
+        return nextState(t, false);
    }

-    public State nextStateIgnoreRootState(Character character) {
-        return nextState(character, true);
+    public State nextStateIgnoreRootState(Transition t) {
+        return nextState(t, true);
    }

-    public State addState(Character character) {
-        State nextState = nextStateIgnoreRootState(character);
+    public State addState(Transition t) {
+        State nextState = nextStateIgnoreRootState(t);
        if (nextState == null) {
            nextState = new State(this.depth+1);
-            this.success.put(character, nextState);
+            this.success.put(t, nextState);
        }
        return nextState;
    }
@ -119,7 +119,7 @@ public class State {
        return this.success.values();
    }

-    public Collection<Character> getTransitions() {
+    public Collection<Transition> getTransitions() {
        return this.success.keySet();
    }

--- a/src/main/java/org/ahocorasick/trie/Transition.java
+++ b/src/main/java/org/ahocorasick/trie/Transition.java
@ -0,0 +1,36 @@
+/*
+ * Copyright 2015 Rogue Wave Software.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.ahocorasick.trie;
+
+/**
+ * A single edge label in the trie: the token that moves the automaton from
+ * one {@link State} to the next. The token may be a {@link Character} (for
+ * character-level matching) or any other object, such as a whole-word
+ * {@link String}.
+ *
+ * <p>Instances are immutable and compare by token value. Value equality is
+ * required because transitions are used as hash-map keys: a freshly created
+ * transition must locate the state stored under an equal transition.</p>
+ *
+ * @author doug.lovell
+ * @param <T> the type of the token carried by this transition
+ */
+public class Transition<T> {
+
+    /** The token labelling this transition; the constructor does not reject {@code null}. */
+    private final T token;
+
+    /**
+     * Creates a transition for the given token.
+     *
+     * @param token the token that labels this transition
+     */
+    public Transition(T token) {
+        this.token = token;
+    }
+
+    /**
+     * @return the token that labels this transition
+     */
+    public T transitionToken() {
+        return token;
+    }
+
+    /**
+     * Tells whether this transition marks a word boundary.
+     *
+     * @return {@code true} when the token is not a {@link Character}, or when
+     *         it is a whitespace character (space, tab, line break, ...)
+     */
+    public boolean isWordSeparator() {
+        // isWhitespace (not isSpaceChar) so that tabs and line breaks count as
+        // boundaries, as they did in the character-based whole-word check.
+        return !(token instanceof Character)
+                || Character.isWhitespace((Character) token);
+    }
+
+    /**
+     * Two transitions are equal when they carry equal tokens.
+     *
+     * @param other the object to compare with
+     * @return {@code true} if {@code other} is a transition with an equal token
+     */
+    @Override
+    public boolean equals(Object other) {
+        if (this == other) {
+            return true;
+        }
+        if (!(other instanceof Transition)) {
+            return false;
+        }
+        Object otherToken = ((Transition<?>) other).token;
+        return token == null ? otherToken == null : token.equals(otherToken);
+    }
+
+    /**
+     * @return the hash code of the token, or 0 for a {@code null} token
+     */
+    @Override
+    public int hashCode() {
+        return token == null ? 0 : token.hashCode();
+    }
+
+    /**
+     * @return the string form of the token
+     */
+    @Override
+    public String toString() {
+        return String.valueOf(token);
+    }
+}
--- a/src/main/java/org/ahocorasick/trie/Trie.java
+++ b/src/main/java/org/ahocorasick/trie/Trie.java
@ -19,27 +19,112 @@ import java.util.concurrent.LinkedBlockingDeque;
 */
 public class Trie {

-    private TrieConfig trieConfig;
+    private final TrieConfig trieConfig;

-    private State rootState;
+    private final State rootState;

    private Trie(TrieConfig trieConfig) {
        this.trieConfig = trieConfig;
        this.rootState = new State();
    }
+    
+    /**
+     * Source of the transitions that drive the trie, handed out one per call.
+     * The loops in this class keep calling until {@code null} comes back.
+     */
+    private interface KeywordTokenizer {
+        Transition nextTransition();
+    }
+    
+    /**
+     * Tokenizer that yields whole words, with a single space transition
+     * between consecutive words. Words are split by
+     * {@link java.util.StringTokenizer} using its default delimiters, so a run
+     * of delimiters is reported as one space.
+     */
+    private class WordTokenizer implements KeywordTokenizer {
+        private final java.util.StringTokenizer st;
+
+        /** True when the next transition to hand out is a word, false when it is a separator. */
+        private boolean lastWasSpace = true;
+
+        public WordTokenizer(String keyword) {
+            st = new java.util.StringTokenizer(keyword);
+        }
+
+        /**
+         * Returns the next word or separator transition.
+         *
+         * @return a {@code String} transition for a word, a {@code Character}
+         *         space transition between two words, or {@code null} once
+         *         every word has been returned
+         */
+        @Override
+        public Transition<?> nextTransition() {
+            // Checked before either branch: this ends the stream with null
+            // instead of a NoSuchElementException, and avoids emitting a
+            // separator after the final word.
+            if (!st.hasMoreTokens()) {
+                return null;
+            }
+            Transition<?> t;
+            if (lastWasSpace) {
+                t = new Transition<String>(st.nextToken());
+            } else {
+                t = new Transition<Character>(' ');
+            }
+            lastWasSpace = !lastWasSpace;
+            return t;
+        }
+    }
+    
+    /**
+     * Tokenizer that yields one transition per character of the input, in
+     * order, starting with the first character.
+     */
+    private class CharacterTokenizer implements KeywordTokenizer {
+        private final java.text.StringCharacterIterator ct;
+
+        public CharacterTokenizer(String keyword) {
+            ct = new java.text.StringCharacterIterator(keyword);
+        }
+
+        /**
+         * Returns the character at the current position and advances past it.
+         *
+         * @return the next character transition, or {@code null} once the
+         *         whole input has been consumed
+         */
+        @Override
+        public Transition<Character> nextTransition() {
+            // End is detected by index rather than by comparing with
+            // CharacterIterator.DONE, because DONE ('\uFFFF') is also a char
+            // value that the text itself may contain.
+            if (ct.getIndex() >= ct.getEndIndex()) {
+                return null;
+            }
+            // Read before advancing: next() moves first and then reads, which
+            // would drop the first character of the input.
+            char current = ct.current();
+            ct.next();
+            return new Transition<Character>(current);
+        }
+    }
+    
+    /**
+     * Picks the tokenizer that matches the trie configuration: whole words
+     * when the trie has only word nodes, single characters otherwise.
+     *
+     * @param keyword the keyword or text to split into transitions
+     * @return a new tokenizer positioned at the start of {@code keyword}
+     */
+    private KeywordTokenizer keywordTokenizer(String keyword) {
+        return trieConfig.hasOnlyWordNodes()
+                ? new WordTokenizer(keyword)
+                : new CharacterTokenizer(keyword);
+    }

+    /**
+     * Wraps a {@link KeywordTokenizer}, recording the text of every consumed
+     * transition and allowing one transition of lookahead for whole-word
+     * checks.
+     */
+    private class TokenStream {
+        private final KeywordTokenizer kwt;
+
+        /** Transition fetched ahead by {@link #isWholeWord(int)} but not yet consumed; null when none is buffered. */
+        private Transition lookahead;
+
+        /** Text of all transitions consumed so far, in order. */
+        private final StringBuilder input = new StringBuilder();
+
+        public TokenStream(KeywordTokenizer kwt) {
+            this.kwt = kwt;
+        }
+
+        /**
+         * Consumes and returns the next transition, taking the buffered
+         * lookahead first when there is one.
+         *
+         * @return the next transition, or {@code null} when the tokenizer is
+         *         exhausted
+         */
+        public Transition nextTransition() {
+            Transition next = lookahead;
+            if (next == null) {
+                next = kwt.nextTransition();
+            } else {
+                lookahead = null;
+            }
+            if (next != null) {
+                input.append(next.transitionToken());
+            }
+            return next;
+        }
+
+        /**
+         * @return the index, within the consumed text, of the last consumed
+         *         character; -1 when nothing has been consumed yet
+         */
+        public int position() {
+            // Index of the last character, not the consumed length: the caller
+            // derives a match start as position - length + 1 and reports
+            // position as the inclusive match end.
+            return input.length() - 1;
+        }
+
+        /**
+         * Tells whether a match starting at {@code start} and ending at the
+         * current position is delimited by word boundaries on both sides.
+         *
+         * @param start index of the first character of the match within the
+         *              consumed text
+         * @return {@code true} when the match begins at the start of the text
+         *         or right after whitespace, and is followed by the end of
+         *         input or by a word separator
+         */
+        public boolean isWholeWord(int start) {
+            if (lookahead == null) {
+                // Peek at what follows the match without consuming it.
+                lookahead = kwt.nextTransition();
+            }
+            boolean startsOnBoundary = start <= 0
+                    || Character.isWhitespace(input.codePointBefore(start));
+            return startsOnBoundary
+                    && (lookahead == null || lookahead.isWordSeparator());
+        }
+    }
+        
    private void addKeyword(String keyword) {
        if (keyword == null || keyword.length() == 0) {
            return;
        }
-        State currentState = this.rootState;
-        for (Character character : keyword.toCharArray()) {
-            if (trieConfig.isCaseInsensitive()) {
-                character = Character.toLowerCase(character);
-            }
-            currentState = currentState.addState(character);
+        if (trieConfig.isCaseInsensitive()) {
+            keyword = keyword.toLowerCase();
        }
-        currentState.addEmit(trieConfig.isCaseInsensitive() ? keyword.toLowerCase() : keyword);
+        State currentState = this.rootState;
+        KeywordTokenizer tknz = keywordTokenizer(keyword);
+        Transition tn = tknz.nextTransition();
+        while (tn != null) {
+            currentState = currentState.addState(tn);
+            tn = tknz.nextTransition();
+        }
+        currentState.addEmit(keyword);
    }

    public Collection<Token> tokenize(String text) {
@ -53,10 +138,10 @@ public class Trie {
        return emitHandler.getEmits();
    }

-	public boolean containsMatch(CharSequence text) {
-		Emit firstMatch = firstMatch(text);
-		return firstMatch != null;
-	}
+    public boolean containsMatch(CharSequence text) {
+            Emit firstMatch = firstMatch(text);
+            return firstMatch != null;
+    }

    public Emit firstMatch(CharSequence text) {
        FirstMatchHandler emitHandler = new FirstMatchHandler();
@ -72,41 +157,35 @@ public class Trie {

        final EmitCandidateFlushHandler flushHandler = new EmitCandidateFlushHandler(emitHandler, emitCandidateHolder);

+        String input = text.toString();
+        if (trieConfig.isCaseInsensitive()) {
+            input = input.toLowerCase();
+        }
+        TokenStream tknz = new TokenStream(keywordTokenizer(input));
+        
        State currentState = this.rootState;
-        for (int position = 0; position < text.length(); position++) {
-
-            if (flushHandler.stop()) {
-                return;
-            }
-
-            Character character = text.charAt(position);
-            if (trieConfig.isCaseInsensitive()) {
-                character = Character.toLowerCase(character);
-            }
-            currentState = getState(currentState, character, flushHandler);
-
+        Transition tn = tknz.nextTransition();
+        while (tn != null) {
+            currentState = getState(currentState, tn, flushHandler);
+            
            Collection<String> emits = currentState.emit();
            for (String emit : emits) {
-                int start = position - emit.length() + 1;
-                if (!trieConfig.isOnlyWholeWords() || isWholeWord(text, start, position)) {
+                int position = tknz.position();
+                int start = tknz.position() - emit.length() + 1;
+                if (!trieConfig.isOnlyWholeWords() || tknz.isWholeWord(start)) {
                    emitCandidateHolder.addCandidate(new Emit(start, position, emit));
                }
            }
-
+            tn = tknz.nextTransition();
        }
        flushHandler.flush();
    }

-    public static boolean isWholeWord(CharSequence text, int start, int end) {
-        return (start == 0 || Character.isWhitespace(text.charAt(start - 1))) &&
-               (end == text.length() - 1 || Character.isWhitespace(text.charAt(end + 1)));
-    }
-
-    private State getState(State currentState, Character character, EmitCandidateFlushHandler flushHandler) {
-        State newCurrentState = currentState.nextState(character);
+    private State getState(State currentState, Transition transition, EmitCandidateFlushHandler flushHandler) {
+        State newCurrentState = currentState.nextState(transition);
        while (newCurrentState == null) {
            currentState = currentState.failure(flushHandler);
-            newCurrentState = currentState.nextState(character);
+            newCurrentState = currentState.nextState(transition);
        }
        return newCurrentState;
    }
@ -124,7 +203,7 @@ public class Trie {
        while (!queue.isEmpty()) {
            State currentState = queue.remove();

-            for (Character transition : currentState.getTransitions()) {
+            for (Transition transition : currentState.getTransitions()) {
                State targetState = currentState.nextState(transition);
                queue.add(targetState);

@ -145,9 +224,9 @@ public class Trie {

    public static class TrieBuilder {

-        private TrieConfig trieConfig = new TrieConfig();
+        private final TrieConfig trieConfig = new TrieConfig();

-        private Trie trie = new Trie(trieConfig);
+        private final Trie trie = new Trie(trieConfig);

        private TrieBuilder() {}

--- a/src/main/java/org/ahocorasick/trie/TrieConfig.java
+++ b/src/main/java/org/ahocorasick/trie/TrieConfig.java
@ -7,6 +7,8 @@ public class TrieConfig {
    private boolean onlyWholeWords = false;

    private boolean caseInsensitive = false;
+    
+    private boolean wordNodes = false;

    public boolean isAllowOverlaps() {
        return allowOverlaps;
@ -17,7 +19,7 @@ public class TrieConfig {
    }

    public boolean isOnlyWholeWords() {
-        return onlyWholeWords;
+        return wordNodes || onlyWholeWords;
    }

    public void setOnlyWholeWords(boolean onlyWholeWords) {
@ -31,4 +33,12 @@ public class TrieConfig {
    public void setCaseInsensitive(boolean caseInsensitive) {
        this.caseInsensitive = caseInsensitive;
    }
+    
+    public boolean hasOnlyWordNodes() {
+        return wordNodes;
+    }
+    
+    public void setOnlyWordNodes(boolean wordNodes) {
+        this.wordNodes = wordNodes;
+    }
 }