Issue #25: match tokens report back whether they are whole words or not
This commit is contained in:
parent
877a56c956
commit
438e546245
@ -2,11 +2,19 @@ package org.ahocorasick.trie;
|
||||
|
||||
public class MatchToken extends Token {
|
||||
|
||||
private Emit emit;
|
||||
private final boolean wholeWord;
|
||||
|
||||
public MatchToken(String fragment, Emit emit) {
|
||||
private final Emit emit;
|
||||
|
||||
/**
 * Creates a match token.
 *
 * @param fragment  the text fragment, passed on to the superclass constructor
 * @param emit      the emit stored on this token
 * @param wholeWord the flag later returned by {@link #isWholeWord()}
 */
public MatchToken(String fragment, Emit emit, boolean wholeWord) {
    super(fragment);
    this.wholeWord = wholeWord;
    this.emit = emit;
}
|
||||
|
||||
@Override
|
||||
public boolean isWholeWord() {
|
||||
return wholeWord;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
||||
@ -14,6 +14,10 @@ public abstract class Token {
|
||||
|
||||
public abstract boolean isMatch();
|
||||
|
||||
public boolean isWholeWord() {
|
||||
return false;
|
||||
}
|
||||
|
||||
public abstract Emit getEmit();
|
||||
|
||||
}
|
||||
|
||||
@ -38,7 +38,10 @@ public class Tokenizer {
|
||||
}
|
||||
|
||||
// Builds the MatchToken for one emit. The fragment is the slice of `text` from
// emit.getStart() through emit.getEnd() inclusive (substring's end index is
// exclusive, hence the +1).
// NOTE(review): this span shows two return statements; they look like the
// removed and the added side of the diff hunk rendered together - confirm.
private Token createMatch(Emit emit, String text) {
|
||||
// Two-argument form: fragment and emit only, no whole-word flag.
return new MatchToken(text.substring(emit.getStart(), emit.getEnd()+1), emit);
|
||||
// Three-argument form: additionally passes the result of Trie.isWholeWord.
return new MatchToken(
|
||||
text.substring(emit.getStart(), emit.getEnd()+1),
|
||||
emit,
|
||||
// NOTE(review): the fragment above is cut from the `text` parameter, while the
// whole-word check reads `this.text`; presumably both are the same string -
// verify against the caller.
Trie.isWholeWord(this.text, emit.getStart(), emit.getEnd()));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -97,7 +97,7 @@ public class Trie {
|
||||
flushHandler.flush();
|
||||
}
|
||||
|
||||
private boolean isWholeWord(CharSequence text, int start, int end) {
|
||||
public static boolean isWholeWord(CharSequence text, int start, int end) {
|
||||
return (start == 0 || Character.isWhitespace(text.charAt(start - 1))) &&
|
||||
(end == text.length() - 1 || Character.isWhitespace(text.charAt(end + 1)));
|
||||
}
|
||||
|
||||
@ -10,6 +10,7 @@ import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import static junit.framework.Assert.assertEquals;
|
||||
import static junit.framework.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
public class TrieTest {
|
||||
@ -332,6 +333,22 @@ public class TrieTest {
|
||||
checkEmit(firstMatch, 20, 24, "sugar");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void tokenizeAndReportOnWholeWords() {
|
||||
Trie trie = Trie.builder()
|
||||
.addKeyword("Alpha")
|
||||
.build();
|
||||
Collection<Token> tokens = trie.tokenize("Alpha AlphaAlpha Alpha");
|
||||
assertEquals(6, tokens.size());
|
||||
Iterator<Token> tokensIt = tokens.iterator();
|
||||
assertToken(tokensIt.next(), "Alpha", true, true);
|
||||
assertToken(tokensIt.next(), " ", false, false);
|
||||
assertToken(tokensIt.next(), "Alpha", true, false);
|
||||
assertToken(tokensIt.next(), "Alpha", true, false);
|
||||
assertToken(tokensIt.next(), " ", false, false);
|
||||
assertToken(tokensIt.next(), "Alpha", true, true);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void tokenizeFullSentence() {
|
||||
Trie trie = Trie.builder()
|
||||
@ -342,13 +359,13 @@ public class TrieTest {
|
||||
Collection<Token> tokens = trie.tokenize("Hear: Alpha team first, Beta from the rear, Gamma in reserve");
|
||||
assertEquals(7, tokens.size());
|
||||
Iterator<Token> tokensIt = tokens.iterator();
|
||||
assertEquals("Hear: ", tokensIt.next().getFragment());
|
||||
assertEquals("Alpha", tokensIt.next().getFragment());
|
||||
assertEquals(" team first, ", tokensIt.next().getFragment());
|
||||
assertEquals("Beta", tokensIt.next().getFragment());
|
||||
assertEquals(" from the rear, ", tokensIt.next().getFragment());
|
||||
assertEquals("Gamma", tokensIt.next().getFragment());
|
||||
assertEquals(" in reserve", tokensIt.next().getFragment());
|
||||
assertToken(tokensIt.next(), "Hear: ", false, false);
|
||||
assertToken(tokensIt.next(), "Alpha", true, true);
|
||||
assertToken(tokensIt.next(), " team first, ", false, false);
|
||||
assertToken(tokensIt.next(), "Beta", true, true);
|
||||
assertToken(tokensIt.next(), " from the rear, ", false, false);
|
||||
assertToken(tokensIt.next(), "Gamma", true, true);
|
||||
assertToken(tokensIt.next(), " in reserve", false, false);
|
||||
}
|
||||
|
||||
@Test
|
||||
@ -456,6 +473,12 @@ public class TrieTest {
|
||||
checkEmit(emits.iterator().next(), 0, 9, "#sugar-123");
|
||||
}
|
||||
|
||||
private void assertToken(Token token, String fragment, boolean match, boolean wholeWord) {
|
||||
assertEquals(fragment, token.getFragment());
|
||||
assertEquals(match, token.isMatch());
|
||||
assertEquals(wholeWord, token.isWholeWord());
|
||||
}
|
||||
|
||||
private void checkEmit(Emit next, int expectedStart, int expectedEnd, String expectedKeyword) {
|
||||
assertEquals("Start of emit should have been " + expectedStart, expectedStart, next.getStart());
|
||||
assertEquals("End of emit should have been " + expectedEnd, expectedEnd, next.getEnd());
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user