Issue #4: Trie.tokenize() is now available. It returns a collection of tokens. Each token is either a fragment (unmatched text) or a match. If it is a match, the original emit can be queried through the token.

2014-02-01 22:01:15 +01:00 · 2014-02-01 22:01:15 +01:00 · bcde097070
commit bcde097070
parent ae20429936
6 changed files with 142 additions and 3 deletions
--- a/README.md
+++ b/README.md
@ -103,9 +103,33 @@ Normally, this match would not be found. With the caseInsensitive settings the e
 before the matching begins. Therefore it will find exactly one match. Since you still have control of the original
 search text and you will know exactly where the match was, you can still utilize the original casing.

-Now, let's tie it all together. Say, you have this
-
+In many cases you may want to process both the non-matching and the matching text. In that case, you
+might be better served by Trie.tokenize(). It splits the entire text into tokens, in order, so you can loop over
+them and deal with matches as soon as you encounter them. Let's look at an example where we want to highlight
+words from HGttG (The Hitchhiker's Guide to the Galaxy) in HTML:

+```java
+    String speech = "The Answer to the Great Question... Of Life, " +
+            "the Universe and Everything... Is... Forty-two,' said " +
+            "Deep Thought, with infinite majesty and calm.";
+    Trie trie = new Trie().removeOverlaps().onlyWholeWords().caseInsensitive();
+    trie.addKeyword("great question");
+    trie.addKeyword("forty-two");
+    trie.addKeyword("deep thought");
+    Collection<Token> tokens = trie.tokenize(speech);
+    // The buffer is local to this snippet, so the unsynchronized StringBuilder is sufficient.
+    StringBuilder html = new StringBuilder();
+    html.append("<html><body><p>");
+    for (Token token : tokens) {
+        if (token.isMatch()) {
+            // Matches are wrapped in italics; the fragment is cut from the text passed to tokenize().
+            html.append("<i>").append(token.getFragment()).append("</i>");
+        } else {
+            // Unmatched text is copied through unchanged.
+            // NOTE: production code should HTML-escape fragments before appending them.
+            html.append(token.getFragment());
+        }
+    }
+    html.append("</p></body></html>");
+    System.out.println(html);
+```

 License
 -------
--- a/src/main/java/org/ahocorasick/trie/FragmentToken.java
+++ b/src/main/java/org/ahocorasick/trie/FragmentToken.java
@ -0,0 +1,18 @@
+package org.ahocorasick.trie;
+
+/**
+ * A {@link Token} that carries a piece of text which is not a keyword match.
+ */
+public class FragmentToken extends Token {
+
+    /**
+     * @param fragment the unmatched piece of text this token represents
+     */
+    public FragmentToken(String fragment) {
+        super(fragment);
+    }
+
+    /**
+     * @return always {@code null}; a fragment has no associated emit
+     */
+    @Override
+    public Emit getEmit() {
+        return null;
+    }
+
+    /**
+     * @return always {@code false}; a fragment is never a match
+     */
+    @Override
+    public boolean isMatch() {
+        return false;
+    }
+}
--- a/src/main/java/org/ahocorasick/trie/MatchToken.java
+++ b/src/main/java/org/ahocorasick/trie/MatchToken.java
@ -0,0 +1,22 @@
+package org.ahocorasick.trie;
+
+/**
+ * A {@link Token} that carries a piece of text matched by a keyword, together with
+ * the {@link Emit} that describes the match.
+ */
+public class MatchToken extends Token {
+
+    /** The emit this token was created from; assigned once in the constructor. */
+    private final Emit emit;
+
+    /**
+     * @param fragment the matched piece of text
+     * @param emit     the emit describing the match
+     */
+    public MatchToken(String fragment, Emit emit) {
+        super(fragment);
+        this.emit = emit;
+    }
+
+    /**
+     * @return always {@code true}; this token represents a match
+     */
+    @Override
+    public boolean isMatch() {
+        return true;
+    }
+
+    /**
+     * @return the emit that was passed to the constructor
+     */
+    @Override
+    public Emit getEmit() {
+        return this.emit;
+    }
+
+}
--- a/src/main/java/org/ahocorasick/trie/Token.java
+++ b/src/main/java/org/ahocorasick/trie/Token.java
@ -0,0 +1,19 @@
+package org.ahocorasick.trie;
+
+/**
+ * A piece of text produced by tokenizing: either matched text or unmatched text.
+ * Subclasses decide whether the token is a match and which emit, if any, belongs to it.
+ */
+public abstract class Token {
+
+    /** The piece of text this token covers; assigned once in the constructor. */
+    private final String fragment;
+
+    /**
+     * @param fragment the piece of text this token covers
+     */
+    public Token(String fragment) {
+        this.fragment = fragment;
+    }
+
+    /**
+     * @return the piece of text this token covers
+     */
+    public String getFragment() {
+        return this.fragment;
+    }
+
+    /**
+     * @return {@code true} if this token represents a match, {@code false} otherwise
+     */
+    public abstract boolean isMatch();
+
+    /**
+     * @return the emit belonging to this token; implementations may return {@code null}
+     *         when the token is not a match
+     */
+    public abstract Emit getEmit();
+
+}
--- a/src/main/java/org/ahocorasick/trie/Trie.java
+++ b/src/main/java/org/ahocorasick/trie/Trie.java
@ -54,6 +54,34 @@ public class Trie {
        currentState.addEmit(keyword);
    }

+    /**
+     * Splits the text into tokens: a match token for every emit returned by
+     * {@link #parseText(String)}, and a fragment token for each stretch of text that
+     * lies before, between or after the matches.
+     *
+     * NOTE(review): a match token is added for every emit, even one that starts at or
+     * before the end of the previous emit. If parseText can return overlapping emits,
+     * the tokens will repeat text -- confirm against the trie configuration in use.
+     *
+     * @param text the text to tokenize
+     * @return the tokens, in the order the emits were returned
+     */
+    public Collection<Token> tokenize(String text) {
+        Collection<Emit> emits = parseText(text);
+        Collection<Token> result = new ArrayList<Token>();
+
+        // Index of the last character already covered by a token; -1 means nothing yet.
+        int consumedUpTo = -1;
+        for (Emit emit : emits) {
+            boolean gapBeforeMatch = emit.getStart() > consumedUpTo + 1;
+            if (gapBeforeMatch) {
+                result.add(createFragment(emit, text, consumedUpTo));
+            }
+            result.add(createMatch(emit, text));
+            consumedUpTo = emit.getEnd();
+        }
+
+        // Anything left after the last match becomes a trailing fragment.
+        boolean trailingText = consumedUpTo + 1 < text.length();
+        if (trailingText) {
+            result.add(createFragment(null, text, consumedUpTo));
+        }
+        return result;
+    }
+
+    /**
+     * Builds a fragment token for the text that follows {@code lastCollectedPosition}.
+     *
+     * @param emit the emit whose start ends the fragment, or {@code null} to run to the end of the text
+     * @param text the full text being tokenized
+     * @param lastCollectedPosition index of the last character already covered by a token
+     * @return a fragment token for the uncovered text
+     */
+    private Token createFragment(Emit emit, String text, int lastCollectedPosition) {
+        int from = lastCollectedPosition + 1;
+        int to;
+        if (emit == null) {
+            to = text.length();
+        } else {
+            to = emit.getStart();
+        }
+        return new FragmentToken(text.substring(from, to));
+    }
+
+    /**
+     * Builds a match token for the text covered by the given emit.
+     *
+     * @param emit the emit describing the match; its end index is treated as inclusive
+     * @param text the full text being tokenized
+     * @return a match token holding the covered text and the emit
+     */
+    private Token createMatch(Emit emit, String text) {
+        int from = emit.getStart();
+        int endExclusive = emit.getEnd() + 1;
+        String matched = text.substring(from, endExclusive);
+        return new MatchToken(matched, emit);
+    }
+
    @SuppressWarnings("unchecked")
    public Collection<Emit> parseText(String text) {
        checkForConstructedFailureStates();
--- a/src/test/java/org/ahocorasick/trie/TrieTest.java
+++ b/src/test/java/org/ahocorasick/trie/TrieTest.java
@ -142,7 +142,7 @@ public class TrieTest {
        trie.addKeyword("once");
        trie.addKeyword("again");
        trie.addKeyword("börkü");
-        Collection<Emit> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ"); // left, middle, right test
+        Collection<Emit> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ");
        assertEquals(4, emits.size()); // Match must not be made
        Iterator<Emit> it = emits.iterator();
        checkEmit(it.next(), 0, 6, "turning");
@ -151,6 +151,34 @@ public class TrieTest {
        checkEmit(it.next(), 19, 23, "börkü");
    }

+    @Test
+    public void tokenizeFullSentence() {
+        Trie trie = new Trie();
+        trie.addKeyword("Alpha");
+        trie.addKeyword("Beta");
+        trie.addKeyword("Gamma");
+        Collection<Token> tokens = trie.tokenize("Hear: Alpha team first, Beta from the rear, Gamma in reserve");
+        assertEquals(7, tokens.size());
+        Iterator<Token> tokensIt = tokens.iterator();
+        assertEquals("Hear: ", tokensIt.next().getFragment());
+        assertEquals("Alpha", tokensIt.next().getFragment());
+        assertEquals(" team first, ", tokensIt.next().getFragment());
+        assertEquals("Beta", tokensIt.next().getFragment());
+        assertEquals(" from the rear, ", tokensIt.next().getFragment());
+        assertEquals("Gamma", tokensIt.next().getFragment());
+        assertEquals(" in reserve", tokensIt.next().getFragment());
+    }
+
+    @Test
+    public void tokenizeTokensInSequence() {
+        Trie trie = new Trie();
+        trie.addKeyword("Alpha");
+        trie.addKeyword("Beta");
+        trie.addKeyword("Gamma");
+        Collection<Token> tokens = trie.tokenize("Alpha Beta Gamma");
+        assertEquals(5, tokens.size());
+    }
+
    private void checkEmit(Emit next, int expectedStart, int expectedEnd, String expectedKeyword) {
        assertEquals(expectedStart, next.getStart());
        assertEquals(expectedEnd, next.getEnd());