Issue #4 Trie.tokenize() available. It returns a list of tokens. A token can be either a fragment (unmatched text) or a match. If it is a match, the original emit can be queried.

This commit is contained in:
robert-bor 2014-02-01 22:01:15 +01:00
parent ae20429936
commit bcde097070
6 changed files with 142 additions and 3 deletions

View File

@ -103,9 +103,33 @@ Normally, this match would not be found. With the caseInsensitive settings the e
before the matching begins. Therefore it will find exactly one match. Since you still have control of the original
search text and you will know exactly where the match was, you can still utilize the original casing.
Now, let's tie it all together. Say, you have this
In many cases you will want to process both the matching and the non-matching text. In that case you are
better served by Trie.tokenize(), which lets you loop over the entire text and handle each match as soon as
you encounter it. Here is an example that highlights keywords from The Hitchhiker's Guide to the Galaxy (HGttG) in HTML:
```java
String speech = "The Answer to the Great Question... Of Life, " +
        "the Universe and Everything... Is... Forty-two,' said " +
        "Deep Thought, with infinite majesty and calm.";

Trie trie = new Trie().removeOverlaps().onlyWholeWords().caseInsensitive();
trie.addKeyword("great question");
trie.addKeyword("forty-two");
trie.addKeyword("deep thought");

// tokenize() yields the text as a sequence of tokens: matched keywords and
// the unmatched fragments in between.
Collection<Token> tokens = trie.tokenize(speech);

// A local, single-threaded accumulator: StringBuilder is sufficient here.
StringBuilder html = new StringBuilder();
html.append("<html><body><p>");
for (Token token : tokens) {
    if (token.isMatch()) {
        // Matched keywords are emphasised.
        html.append("<i>").append(token.getFragment()).append("</i>");
    } else {
        // Unmatched text is copied through unchanged.
        html.append(token.getFragment());
    }
}
html.append("</p></body></html>");
System.out.println(html);
```
License
-------

View File

@ -0,0 +1,18 @@
package org.ahocorasick.trie;
/**
 * A {@link Token} that carries a stretch of text which is not a keyword match.
 */
public class FragmentToken extends Token {

    /**
     * Creates a token for a piece of unmatched text.
     *
     * @param fragment the text carried by this token
     */
    public FragmentToken(String fragment) {
        super(fragment);
    }

    /**
     * @return always {@code false}; a fragment is never a match
     */
    @Override
    public boolean isMatch() {
        return false;
    }

    /**
     * @return always {@code null}; a fragment has no associated emit
     */
    @Override
    public Emit getEmit() {
        return null;
    }
}

View File

@ -0,0 +1,22 @@
package org.ahocorasick.trie;
/**
 * A {@link Token} that represents a keyword match and keeps a reference to
 * the {@link Emit} it was created from.
 */
public class MatchToken extends Token {

    /** The emit this token was built from; assigned once at construction. */
    private final Emit emit;

    /**
     * Creates a token for a matched piece of text.
     *
     * @param fragment the matched text carried by this token
     * @param emit     the emit that produced this match
     */
    public MatchToken(String fragment, Emit emit) {
        super(fragment);
        this.emit = emit;
    }

    /**
     * @return always {@code true}; this token type represents a match
     */
    @Override
    public boolean isMatch() {
        return true;
    }

    /**
     * @return the emit passed to the constructor
     */
    @Override
    public Emit getEmit() {
        return this.emit;
    }
}

View File

@ -0,0 +1,19 @@
package org.ahocorasick.trie;
/**
 * Base type for the pieces of text produced when a text is tokenized.
 * Subclasses decide whether the piece is a keyword match and which emit,
 * if any, belongs to it.
 */
public abstract class Token {

    /** The text carried by this token; assigned once at construction. */
    private final String fragment;

    /**
     * @param fragment the text carried by this token
     */
    public Token(String fragment) {
        this.fragment = fragment;
    }

    /**
     * @return the text passed to the constructor
     */
    public String getFragment() {
        return this.fragment;
    }

    /**
     * @return whether this token represents a keyword match
     */
    public abstract boolean isMatch();

    /**
     * @return the emit associated with this token; implementations may
     *         return {@code null} when there is none
     */
    public abstract Emit getEmit();
}

View File

@ -54,6 +54,34 @@ public class Trie {
currentState.addEmit(keyword);
}
/**
 * Splits {@code text} into tokens: one match token per emit returned by
 * {@link #parseText(String)}, with fragment tokens for the text before,
 * between and after the matches.
 *
 * NOTE(review): the gap arithmetic presumes the emits arrive in ascending,
 * non-overlapping order - confirm against parseText.
 *
 * @param text the text to tokenize
 * @return the tokens, in the order the emits were iterated
 */
public Collection<Token> tokenize(String text) {
    Collection<Token> result = new ArrayList<Token>();
    // Index of the last character already covered by a token (-1: none yet).
    int previousEnd = -1;
    for (Emit match : parseText(text)) {
        int gap = match.getStart() - previousEnd;
        if (gap > 1) {
            // At least one character lies between the previous token and this match.
            result.add(createFragment(match, text, previousEnd));
        }
        result.add(createMatch(match, text));
        previousEnd = match.getEnd();
    }
    int remainder = text.length() - previousEnd;
    if (remainder > 1) {
        // Text left over after the final match (or the whole text if nothing matched).
        result.add(createFragment(null, text, previousEnd));
    }
    return result;
}
/**
 * Builds a fragment token for the text that starts right after
 * {@code lastCollectedPosition} and runs up to the start of {@code emit},
 * or to the end of {@code text} when {@code emit} is {@code null}.
 *
 * @param emit                  the emit that follows the fragment, or {@code null} for a trailing fragment
 * @param text                  the full text being tokenized
 * @param lastCollectedPosition index of the last character already covered by a token
 * @return the fragment token
 */
private Token createFragment(Emit emit, String text, int lastCollectedPosition) {
    int from = lastCollectedPosition + 1;
    int to;
    if (emit == null) {
        to = text.length();
    } else {
        to = emit.getStart();
    }
    return new FragmentToken(text.substring(from, to));
}
/**
 * Builds a match token holding the slice of {@code text} that {@code emit}
 * spans, from its start index through its end index inclusive.
 *
 * @param emit the emit describing the match
 * @param text the full text being tokenized
 * @return the match token, carrying both the matched text and the emit
 */
private Token createMatch(Emit emit, String text) {
    int first = emit.getStart();
    // The emit's end index is inclusive, substring's upper bound is exclusive.
    int pastLast = emit.getEnd() + 1;
    String matched = text.substring(first, pastLast);
    return new MatchToken(matched, emit);
}
@SuppressWarnings("unchecked")
public Collection<Emit> parseText(String text) {
checkForConstructedFailureStates();

View File

@ -142,7 +142,7 @@ public class TrieTest {
trie.addKeyword("once");
trie.addKeyword("again");
trie.addKeyword("börkü");
Collection<Emit> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ"); // left, middle, right test
Collection<Emit> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ");
assertEquals(4, emits.size()); // Match must not be made
Iterator<Emit> it = emits.iterator();
checkEmit(it.next(), 0, 6, "turning");
@ -151,6 +151,34 @@ public class TrieTest {
checkEmit(it.next(), 19, 23, "börkü");
}
/**
 * Tokenizes a sentence containing three keywords separated by other text and
 * checks that fragments and matches come back in order with the right content.
 */
@Test
public void tokenizeFullSentence() {
    Trie trie = new Trie();
    trie.addKeyword("Alpha");
    trie.addKeyword("Beta");
    trie.addKeyword("Gamma");

    String sentence = "Hear: Alpha team first, Beta from the rear, Gamma in reserve";
    String[] expectedFragments = {
        "Hear: ", "Alpha", " team first, ", "Beta", " from the rear, ", "Gamma", " in reserve"
    };

    Collection<Token> tokens = trie.tokenize(sentence);
    assertEquals(7, tokens.size());

    Iterator<Token> cursor = tokens.iterator();
    for (String expected : expectedFragments) {
        assertEquals(expected, cursor.next().getFragment());
    }
}
/**
 * Tokenizes a text that starts and ends with a keyword, with keywords
 * separated only by single spaces. Verifies that no empty fragment is
 * produced at either end, and that each token has the expected text and
 * match/fragment classification.
 */
@Test
public void tokenizeTokensInSequence() {
    Trie trie = new Trie();
    trie.addKeyword("Alpha");
    trie.addKeyword("Beta");
    trie.addKeyword("Gamma");

    Collection<Token> tokens = trie.tokenize("Alpha Beta Gamma");
    assertEquals(5, tokens.size());

    // The count alone would not catch wrong content or a misclassified token.
    String[] expectedFragments = {"Alpha", " ", "Beta", " ", "Gamma"};
    boolean[] expectedMatch = {true, false, true, false, true};
    Iterator<Token> cursor = tokens.iterator();
    for (int i = 0; i < expectedFragments.length; i++) {
        Token token = cursor.next();
        assertEquals(expectedFragments[i], token.getFragment());
        assertEquals(expectedMatch[i], token.isMatch());
    }
}
private void checkEmit(Emit next, int expectedStart, int expectedEnd, String expectedKeyword) {
assertEquals(expectedStart, next.getStart());
assertEquals(expectedEnd, next.getEnd());