Issue #4: Trie.tokenize() is now available. It returns a list of tokens. A token is either a fragment (unmatched text) or a match. If it is a match, the original emit can be queried.
This commit is contained in:
parent
ae20429936
commit
bcde097070
28
README.md
28
README.md
@ -103,9 +103,33 @@ Normally, this match would not be found. With the caseInsensitive settings the e
|
||||
before the matching begins. Therefore it will find exactly one match. Since you still have control of the original
|
||||
search text and you will know exactly where the match was, you can still utilize the original casing.
|
||||
|
||||
Now, let's tie it all together. Say, you have this
|
||||
|
||||
In many cases you may want to do useful stuff with both the non-matching and the matching text. In this case, you
|
||||
might be better served by using the Trie.tokenize(). It allows you to loop over the entire text and deal with
|
||||
matches as soon as you encounter them. Let's look at an example where we want to highlight words from HGttG in HTML:
|
||||
|
||||
```java
|
||||
String speech = "The Answer to the Great Question... Of Life, " +
|
||||
"the Universe and Everything... Is... 'Forty-two,' said " +
|
||||
"Deep Thought, with infinite majesty and calm.";
|
||||
Trie trie = new Trie().removeOverlaps().onlyWholeWords().caseInsensitive();
|
||||
trie.addKeyword("great question");
|
||||
trie.addKeyword("forty-two");
|
||||
trie.addKeyword("deep thought");
|
||||
Collection<Token> tokens = trie.tokenize(speech);
|
||||
StringBuffer html = new StringBuffer();
|
||||
html.append("<html><body><p>");
|
||||
for (Token token : tokens) {
|
||||
if (token.isMatch()) {
|
||||
html.append("<i>");
|
||||
}
|
||||
html.append(token.getFragment());
|
||||
if (token.isMatch()) {
|
||||
html.append("</i>");
|
||||
}
|
||||
}
|
||||
html.append("</p></body></html>");
|
||||
System.out.println(html);
|
||||
```
|
||||
|
||||
License
|
||||
-------
|
||||
|
||||
18
src/main/java/org/ahocorasick/trie/FragmentToken.java
Normal file
18
src/main/java/org/ahocorasick/trie/FragmentToken.java
Normal file
@ -0,0 +1,18 @@
|
||||
package org.ahocorasick.trie;
|
||||
|
||||
public class FragmentToken extends Token {
|
||||
|
||||
public FragmentToken(String fragment) {
|
||||
super(fragment);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isMatch() {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Emit getEmit() {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
22
src/main/java/org/ahocorasick/trie/MatchToken.java
Normal file
22
src/main/java/org/ahocorasick/trie/MatchToken.java
Normal file
@ -0,0 +1,22 @@
|
||||
package org.ahocorasick.trie;
|
||||
|
||||
public class MatchToken extends Token {
|
||||
|
||||
private Emit emit;
|
||||
|
||||
public MatchToken(String fragment, Emit emit) {
|
||||
super(fragment);
|
||||
this.emit = emit;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isMatch() {
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Emit getEmit() {
|
||||
return this.emit;
|
||||
}
|
||||
|
||||
}
|
||||
19
src/main/java/org/ahocorasick/trie/Token.java
Normal file
19
src/main/java/org/ahocorasick/trie/Token.java
Normal file
@ -0,0 +1,19 @@
|
||||
package org.ahocorasick.trie;
|
||||
|
||||
public abstract class Token {
|
||||
|
||||
private String fragment;
|
||||
|
||||
public Token(String fragment) {
|
||||
this.fragment = fragment;
|
||||
}
|
||||
|
||||
public String getFragment() {
|
||||
return this.fragment;
|
||||
}
|
||||
|
||||
public abstract boolean isMatch();
|
||||
|
||||
public abstract Emit getEmit();
|
||||
|
||||
}
|
||||
@ -54,6 +54,34 @@ public class Trie {
|
||||
currentState.addEmit(keyword);
|
||||
}
|
||||
|
||||
public Collection<Token> tokenize(String text) {
|
||||
|
||||
Collection<Token> tokens = new ArrayList<Token>();
|
||||
|
||||
Collection<Emit> collectedEmits = parseText(text);
|
||||
int lastCollectedPosition = -1;
|
||||
for (Emit emit : collectedEmits) {
|
||||
if (emit.getStart() - lastCollectedPosition > 1) {
|
||||
tokens.add(createFragment(emit, text, lastCollectedPosition));
|
||||
}
|
||||
tokens.add(createMatch(emit, text));
|
||||
lastCollectedPosition = emit.getEnd();
|
||||
}
|
||||
if (text.length() - lastCollectedPosition > 1) {
|
||||
tokens.add(createFragment(null, text, lastCollectedPosition));
|
||||
}
|
||||
|
||||
return tokens;
|
||||
}
|
||||
|
||||
private Token createFragment(Emit emit, String text, int lastCollectedPosition) {
|
||||
return new FragmentToken(text.substring(lastCollectedPosition+1, emit == null ? text.length() : emit.getStart()));
|
||||
}
|
||||
|
||||
private Token createMatch(Emit emit, String text) {
|
||||
return new MatchToken(text.substring(emit.getStart(), emit.getEnd()+1), emit);
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public Collection<Emit> parseText(String text) {
|
||||
checkForConstructedFailureStates();
|
||||
|
||||
@ -142,7 +142,7 @@ public class TrieTest {
|
||||
trie.addKeyword("once");
|
||||
trie.addKeyword("again");
|
||||
trie.addKeyword("börkü");
|
||||
Collection<Emit> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ"); // left, middle, right test
|
||||
Collection<Emit> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ");
|
||||
assertEquals(4, emits.size()); // Match must not be made
|
||||
Iterator<Emit> it = emits.iterator();
|
||||
checkEmit(it.next(), 0, 6, "turning");
|
||||
@ -151,6 +151,34 @@ public class TrieTest {
|
||||
checkEmit(it.next(), 19, 23, "börkü");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void tokenizeFullSentence() {
|
||||
Trie trie = new Trie();
|
||||
trie.addKeyword("Alpha");
|
||||
trie.addKeyword("Beta");
|
||||
trie.addKeyword("Gamma");
|
||||
Collection<Token> tokens = trie.tokenize("Hear: Alpha team first, Beta from the rear, Gamma in reserve");
|
||||
assertEquals(7, tokens.size());
|
||||
Iterator<Token> tokensIt = tokens.iterator();
|
||||
assertEquals("Hear: ", tokensIt.next().getFragment());
|
||||
assertEquals("Alpha", tokensIt.next().getFragment());
|
||||
assertEquals(" team first, ", tokensIt.next().getFragment());
|
||||
assertEquals("Beta", tokensIt.next().getFragment());
|
||||
assertEquals(" from the rear, ", tokensIt.next().getFragment());
|
||||
assertEquals("Gamma", tokensIt.next().getFragment());
|
||||
assertEquals(" in reserve", tokensIt.next().getFragment());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void tokenizeTokensInSequence() {
|
||||
Trie trie = new Trie();
|
||||
trie.addKeyword("Alpha");
|
||||
trie.addKeyword("Beta");
|
||||
trie.addKeyword("Gamma");
|
||||
Collection<Token> tokens = trie.tokenize("Alpha Beta Gamma");
|
||||
assertEquals(5, tokens.size());
|
||||
}
|
||||
|
||||
private void checkEmit(Emit next, int expectedStart, int expectedEnd, String expectedKeyword) {
|
||||
assertEquals(expectedStart, next.getStart());
|
||||
assertEquals(expectedEnd, next.getEnd());
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user