From bcde09707001ae85e201847e2dda1fc0a6385aaa Mon Sep 17 00:00:00 2001
From: robert-bor <bor.robert@gmail.com>
Date: Sat, 1 Feb 2014 22:01:15 +0100
Subject: [PATCH] Issue #4 Trie.tokenize() available. It returns a list of
 tokens. A token can be either a fragment (unmatched text) or a match. If it
 is a match, the original emit can be queried.

---
 README.md                                     | 28 +++++++++++++++--
 .../org/ahocorasick/trie/FragmentToken.java   | 18 +++++++++++
 .../java/org/ahocorasick/trie/MatchToken.java | 22 ++++++++++++++
 src/main/java/org/ahocorasick/trie/Token.java | 19 ++++++++++++
 src/main/java/org/ahocorasick/trie/Trie.java  | 28 +++++++++++++++++
 .../java/org/ahocorasick/trie/TrieTest.java   | 30 ++++++++++++++++++-
 6 files changed, 142 insertions(+), 3 deletions(-)
 create mode 100644 src/main/java/org/ahocorasick/trie/FragmentToken.java
 create mode 100644 src/main/java/org/ahocorasick/trie/MatchToken.java
 create mode 100644 src/main/java/org/ahocorasick/trie/Token.java
diff --git a/README.md b/README.md
index 3b51877..d82839a 100644
--- a/README.md
+++ b/README.md
@@ -103,9 +103,33 @@ Normally, this match would not be found. With the caseInsensitive settings the e
 before the matching begins. Therefore it will find exactly one match. Since you still have control of the original
 search text and you will know exactly where the match was, you can still utilize the original casing.
 
-Now, let's tie it all together. Say, you have this
-
+In many cases you will want to process both the matching and the non-matching text. For that, `Trie.tokenize()` is
+a better fit: it lets you loop over the entire text and handle each match as soon as you encounter it.
+Let's look at an example that highlights phrases from The Hitchhiker's Guide to the Galaxy (HGttG) in HTML:
 
+```java
+    String speech = "The Answer to the Great Question... Of Life, " +
+            "the Universe and Everything... Is... Forty-two,' said " +
+            "Deep Thought, with infinite majesty and calm.";
+    Trie trie = new Trie().removeOverlaps().onlyWholeWords().caseInsensitive();
+    trie.addKeyword("great question");
+    trie.addKeyword("forty-two");
+    trie.addKeyword("deep thought");
+    Collection<Token> tokens = trie.tokenize(speech);
+    StringBuilder html = new StringBuilder(); // local, single-threaded buffer: StringBuffer's locking is not needed
+    html.append("<html><body><p>");
+    for (Token token : tokens) {
+        if (token.isMatch()) {
+            html.append("<i>");
+        }
+        html.append(token.getFragment()); // the token's slice of the original text, match or not
+        if (token.isMatch()) {
+            html.append("</i>");
+        }
+    }
+    html.append("</p></body></html>");
+    System.out.println(html);
+```
 
 License
 -------
diff --git a/src/main/java/org/ahocorasick/trie/FragmentToken.java b/src/main/java/org/ahocorasick/trie/FragmentToken.java
new file mode 100644
index 0000000..85c498a
--- /dev/null
+++ b/src/main/java/org/ahocorasick/trie/FragmentToken.java
@@ -0,0 +1,18 @@
+package org.ahocorasick.trie;
+
+public class FragmentToken extends Token { // token for a stretch of text in which no keyword matched
+
+    public FragmentToken(String fragment) {
+        super(fragment); // the text itself is stored by Token and read back through getFragment()
+    }
+
+    @Override
+    public Emit getEmit() {
+        return null; // unmatched text has no emit behind it; callers should check isMatch() first
+    }
+
+    @Override
+    public boolean isMatch() {
+        return false; // a fragment is never a match
+    }
+}
diff --git a/src/main/java/org/ahocorasick/trie/MatchToken.java b/src/main/java/org/ahocorasick/trie/MatchToken.java
new file mode 100644
index 0000000..c2615dc
--- /dev/null
+++ b/src/main/java/org/ahocorasick/trie/MatchToken.java
@@ -0,0 +1,22 @@
+package org.ahocorasick.trie;
+
+public class MatchToken extends Token { // token for a stretch of text that matched a keyword
+
+    private final Emit emit; // the emit this token was built from; assigned once in the constructor
+
+    public MatchToken(String fragment, Emit emit) {
+        super(fragment);
+        this.emit = emit;
+    }
+
+    @Override
+    public boolean isMatch() {
+        return true; // a match token always reports itself as a match
+    }
+
+    @Override
+    public Emit getEmit() {
+        return this.emit; // exactly the emit handed to the constructor
+    }
+
+}
diff --git a/src/main/java/org/ahocorasick/trie/Token.java b/src/main/java/org/ahocorasick/trie/Token.java
new file mode 100644
index 0000000..65c1fac
--- /dev/null
+++ b/src/main/java/org/ahocorasick/trie/Token.java
@@ -0,0 +1,19 @@
+package org.ahocorasick.trie;
+
+public abstract class Token { // one piece of tokenized text: either a keyword match or unmatched text
+
+    private final String fragment; // the text this token covers; assigned once in the constructor
+
+    public Token(String fragment) {
+        this.fragment = fragment;
+    }
+
+    public String getFragment() {
+        return this.fragment;
+    }
+
+    public abstract boolean isMatch(); // true for MatchToken, false for FragmentToken
+
+    public abstract Emit getEmit(); // the emit behind a match; FragmentToken returns null here
+
+}
diff --git a/src/main/java/org/ahocorasick/trie/Trie.java b/src/main/java/org/ahocorasick/trie/Trie.java
index 1afab18..0d28c9b 100644
--- a/src/main/java/org/ahocorasick/trie/Trie.java
+++ b/src/main/java/org/ahocorasick/trie/Trie.java
@@ -54,6 +54,34 @@ public class Trie {
         currentState.addEmit(keyword);
     }
 
+    public Collection<Token> tokenize(String text) {
+        // Splits text into unmatched fragments and keyword matches, following the order of the emits from parseText.
+        Collection<Token> result = new ArrayList<Token>();
+        int previousEnd = -1; // inclusive end index of the last match consumed; -1 while there is none
+        for (Emit match : parseText(text)) {
+            boolean gapBeforeMatch = match.getStart() - previousEnd > 1;
+            if (gapBeforeMatch) {
+                result.add(createFragment(match, text, previousEnd));
+            }
+            result.add(createMatch(match, text));
+            previousEnd = match.getEnd();
+        }
+
+        // Whatever follows the last match (or the whole text when nothing matched) becomes one final fragment.
+        if (text.length() - previousEnd > 1) {
+            result.add(createFragment(null, text, previousEnd));
+        }
+        return result;
+    }
+
+    private Token createFragment(Emit emit, String text, int lastCollectedPosition) { // null emit: run to end of text
+        return new FragmentToken(text.substring(lastCollectedPosition + 1, emit != null ? emit.getStart() : text.length()));
+    }
+
+    private Token createMatch(Emit emit, String text) { // emit end is inclusive, substring's end is exclusive
+        return new MatchToken(text.substring(emit.getStart(), 1 + emit.getEnd()), emit);
+    }
+
     @SuppressWarnings("unchecked")
     public Collection<Emit> parseText(String text) {
         checkForConstructedFailureStates();
diff --git a/src/test/java/org/ahocorasick/trie/TrieTest.java b/src/test/java/org/ahocorasick/trie/TrieTest.java
index 16731b6..8000d63 100644
--- a/src/test/java/org/ahocorasick/trie/TrieTest.java
+++ b/src/test/java/org/ahocorasick/trie/TrieTest.java
@@ -142,7 +142,7 @@ public class TrieTest {
         trie.addKeyword("once");
         trie.addKeyword("again");
         trie.addKeyword("börkü");
-        Collection<Emit> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ"); // left, middle, right test
+        Collection<Emit> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ");
         assertEquals(4, emits.size()); // Match must not be made
         Iterator<Emit> it = emits.iterator();
         checkEmit(it.next(), 0, 6, "turning");
@@ -151,6 +151,34 @@ public class TrieTest {
         checkEmit(it.next(), 19, 23, "börkü");
     }
 
+    @Test
+    public void tokenizeFullSentence() {
+        Trie keywords = new Trie();
+        for (String keyword : new String[] {"Alpha", "Beta", "Gamma"}) {
+            keywords.addKeyword(keyword);
+        }
+        String[] expected = {
+            "Hear: ", "Alpha", " team first, ", "Beta", " from the rear, ", "Gamma", " in reserve"
+        };
+        // Expect four unmatched fragments and the three keyword matches, in the order they occur in the text.
+        Collection<Token> parts = keywords.tokenize("Hear: Alpha team first, Beta from the rear, Gamma in reserve");
+        assertEquals(7, parts.size());
+        Iterator<Token> it = parts.iterator();
+        for (String expectedText : expected) {
+            assertEquals(expectedText, it.next().getFragment());
+        }
+    }
+
+    @Test
+    public void tokenizeTokensInSequence() {
+        Trie keywords = new Trie();
+        for (String keyword : new String[] {"Alpha", "Beta", "Gamma"}) {
+            keywords.addKeyword(keyword);
+        }
+        Collection<Token> parts = keywords.tokenize("Alpha Beta Gamma");
+        assertEquals(5, parts.size()); // three matches plus the two single-space fragments between them
+    }
+
     private void checkEmit(Emit next, int expectedStart, int expectedEnd, String expectedKeyword) {
         assertEquals(expectedStart, next.getStart());
         assertEquals(expectedEnd, next.getEnd());