From bcde09707001ae85e201847e2dda1fc0a6385aaa Mon Sep 17 00:00:00 2001 From: robert-bor Date: Sat, 1 Feb 2014 22:01:15 +0100 Subject: [PATCH] Issue #4 Trie.tokenize() available. It returns a list of tokens. A token can be either a fragment (unmatched text) or a match. If it is a match, the original emit can be queried. --- README.md | 28 +++++++++++++++-- .../org/ahocorasick/trie/FragmentToken.java | 18 +++++++++++ .../java/org/ahocorasick/trie/MatchToken.java | 22 ++++++++++++++ src/main/java/org/ahocorasick/trie/Token.java | 19 ++++++++++++ src/main/java/org/ahocorasick/trie/Trie.java | 28 +++++++++++++++++ .../java/org/ahocorasick/trie/TrieTest.java | 30 ++++++++++++++++++- 6 files changed, 142 insertions(+), 3 deletions(-) create mode 100644 src/main/java/org/ahocorasick/trie/FragmentToken.java create mode 100644 src/main/java/org/ahocorasick/trie/MatchToken.java create mode 100644 src/main/java/org/ahocorasick/trie/Token.java diff --git a/README.md b/README.md index 3b51877..d82839a 100644 --- a/README.md +++ b/README.md @@ -103,9 +103,33 @@ Normally, this match would not be found. With the caseInsensitive settings the e before the matching begins. Therefore it will find exactly one match. Since you still have control of the original search text and you will know exactly where the match was, you can still utilize the original casing. -Now, let's tie it all together. Say, you have this - +In many cases you may want to do useful stuff with both the non-matching and the matching text. In this case, you +might be better served by using the Trie.tokenize(). It allows you to loop over the entire text and deal with +matches as soon as you encounter them. Let's look at an example where we want to highlight words from HGttG in HTML: +```java + String speech = "The Answer to the Great Question... Of Life, " + + "the Universe and Everything... Is... 
Forty-two,' said " + + "Deep Thought, with infinite majesty and calm."; + Trie trie = new Trie().removeOverlaps().onlyWholeWords().caseInsensitive(); + trie.addKeyword("great question"); + trie.addKeyword("forty-two"); + trie.addKeyword("deep thought"); + Collection<Token> tokens = trie.tokenize(speech); + StringBuffer html = new StringBuffer(); + html.append("<html><body><p>"); + for (Token token : tokens) { + if (token.isMatch()) { + html.append("<i>"); + } + html.append(token.getFragment()); + if (token.isMatch()) { + html.append("</i>"); + } + } + html.append("</p></body></html>
"); + System.out.println(html); +``` License ------- diff --git a/src/main/java/org/ahocorasick/trie/FragmentToken.java b/src/main/java/org/ahocorasick/trie/FragmentToken.java new file mode 100644 index 0000000..85c498a --- /dev/null +++ b/src/main/java/org/ahocorasick/trie/FragmentToken.java @@ -0,0 +1,18 @@ +package org.ahocorasick.trie; + +public class FragmentToken extends Token { + + public FragmentToken(String fragment) { + super(fragment); + } + + @Override + public boolean isMatch() { + return false; + } + + @Override + public Emit getEmit() { + return null; + } +} diff --git a/src/main/java/org/ahocorasick/trie/MatchToken.java b/src/main/java/org/ahocorasick/trie/MatchToken.java new file mode 100644 index 0000000..c2615dc --- /dev/null +++ b/src/main/java/org/ahocorasick/trie/MatchToken.java @@ -0,0 +1,22 @@ +package org.ahocorasick.trie; + +public class MatchToken extends Token { + + private Emit emit; + + public MatchToken(String fragment, Emit emit) { + super(fragment); + this.emit = emit; + } + + @Override + public boolean isMatch() { + return true; + } + + @Override + public Emit getEmit() { + return this.emit; + } + +} diff --git a/src/main/java/org/ahocorasick/trie/Token.java b/src/main/java/org/ahocorasick/trie/Token.java new file mode 100644 index 0000000..65c1fac --- /dev/null +++ b/src/main/java/org/ahocorasick/trie/Token.java @@ -0,0 +1,19 @@ +package org.ahocorasick.trie; + +public abstract class Token { + + private String fragment; + + public Token(String fragment) { + this.fragment = fragment; + } + + public String getFragment() { + return this.fragment; + } + + public abstract boolean isMatch(); + + public abstract Emit getEmit(); + +} diff --git a/src/main/java/org/ahocorasick/trie/Trie.java b/src/main/java/org/ahocorasick/trie/Trie.java index 1afab18..0d28c9b 100644 --- a/src/main/java/org/ahocorasick/trie/Trie.java +++ b/src/main/java/org/ahocorasick/trie/Trie.java @@ -54,6 +54,34 @@ public class Trie { currentState.addEmit(keyword); } + 
public Collection tokenize(String text) { + + Collection tokens = new ArrayList(); + + Collection collectedEmits = parseText(text); + int lastCollectedPosition = -1; + for (Emit emit : collectedEmits) { + if (emit.getStart() - lastCollectedPosition > 1) { + tokens.add(createFragment(emit, text, lastCollectedPosition)); + } + tokens.add(createMatch(emit, text)); + lastCollectedPosition = emit.getEnd(); + } + if (text.length() - lastCollectedPosition > 1) { + tokens.add(createFragment(null, text, lastCollectedPosition)); + } + + return tokens; + } + + private Token createFragment(Emit emit, String text, int lastCollectedPosition) { + return new FragmentToken(text.substring(lastCollectedPosition+1, emit == null ? text.length() : emit.getStart())); + } + + private Token createMatch(Emit emit, String text) { + return new MatchToken(text.substring(emit.getStart(), emit.getEnd()+1), emit); + } + @SuppressWarnings("unchecked") public Collection parseText(String text) { checkForConstructedFailureStates(); diff --git a/src/test/java/org/ahocorasick/trie/TrieTest.java b/src/test/java/org/ahocorasick/trie/TrieTest.java index 16731b6..8000d63 100644 --- a/src/test/java/org/ahocorasick/trie/TrieTest.java +++ b/src/test/java/org/ahocorasick/trie/TrieTest.java @@ -142,7 +142,7 @@ public class TrieTest { trie.addKeyword("once"); trie.addKeyword("again"); trie.addKeyword("börkü"); - Collection emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ"); // left, middle, right test + Collection emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ"); assertEquals(4, emits.size()); // Match must not be made Iterator it = emits.iterator(); checkEmit(it.next(), 0, 6, "turning"); @@ -151,6 +151,34 @@ public class TrieTest { checkEmit(it.next(), 19, 23, "börkü"); } + @Test + public void tokenizeFullSentence() { + Trie trie = new Trie(); + trie.addKeyword("Alpha"); + trie.addKeyword("Beta"); + trie.addKeyword("Gamma"); + Collection tokens = trie.tokenize("Hear: Alpha team first, Beta from the rear, 
Gamma in reserve"); + assertEquals(7, tokens.size()); + Iterator tokensIt = tokens.iterator(); + assertEquals("Hear: ", tokensIt.next().getFragment()); + assertEquals("Alpha", tokensIt.next().getFragment()); + assertEquals(" team first, ", tokensIt.next().getFragment()); + assertEquals("Beta", tokensIt.next().getFragment()); + assertEquals(" from the rear, ", tokensIt.next().getFragment()); + assertEquals("Gamma", tokensIt.next().getFragment()); + assertEquals(" in reserve", tokensIt.next().getFragment()); + } + + @Test + public void tokenizeTokensInSequence() { + Trie trie = new Trie(); + trie.addKeyword("Alpha"); + trie.addKeyword("Beta"); + trie.addKeyword("Gamma"); + Collection tokens = trie.tokenize("Alpha Beta Gamma"); + assertEquals(5, tokens.size()); + } + private void checkEmit(Emit next, int expectedStart, int expectedEnd, String expectedKeyword) { assertEquals(expectedStart, next.getStart()); assertEquals(expectedEnd, next.getEnd());