Issue #25: match tokens report whether they are whole words

This commit is contained in:
robert-bor 2015-09-27 18:22:38 +02:00
parent 877a56c956
commit 438e546245
5 changed files with 49 additions and 11 deletions

View File

@ -2,11 +2,19 @@ package org.ahocorasick.trie;
public class MatchToken extends Token {
private Emit emit;
private final boolean wholeWord;
public MatchToken(String fragment, Emit emit) {
private final Emit emit;
public MatchToken(String fragment, Emit emit, boolean wholeWord) {
super(fragment);
this.emit = emit;
this.wholeWord = wholeWord;
}
/**
 * Reports the whole-word flag that was handed to this token's constructor.
 *
 * @return the value stored in the {@code wholeWord} field
 */
@Override
public boolean isWholeWord() {
    return this.wholeWord;
}
@Override

View File

@ -14,6 +14,10 @@ public abstract class Token {
public abstract boolean isMatch();
/**
 * Default whole-word answer for a token: always {@code false}.
 * Subclasses may override this; {@code MatchToken} does, returning the flag it was
 * constructed with.
 *
 * @return {@code false} unless a subclass overrides this method
 */
public boolean isWholeWord() {
return false;
}
public abstract Emit getEmit();
}

View File

@ -38,7 +38,10 @@ public class Tokenizer {
}
/**
 * Builds the match token for a single emit. The token's fragment is the slice of
 * {@code text} running from {@code emit.getStart()} through {@code emit.getEnd()}
 * inclusive (hence the {@code + 1} on the exclusive end index of {@code substring}).
 *
 * @param emit the emit supplying the inclusive start and end indices of the match
 * @param text the text the fragment is cut from
 * @return a new {@code MatchToken} for the matched fragment
 */
private Token createMatch(Emit emit, String text) {
// NOTE(review): this span is a diff hunk. The one-line return directly below looks like
// the removed two-argument call and the multi-line return after it like its
// replacement; confirm against the actual file before treating both as live code.
return new MatchToken(text.substring(emit.getStart(), emit.getEnd()+1), emit);
return new MatchToken(
text.substring(emit.getStart(), emit.getEnd()+1),
emit,
// NOTE(review): the whole-word check reads the this.text field while the fragment above
// is cut from the text parameter; confirm both always refer to the same string.
Trie.isWholeWord(this.text, emit.getStart(), emit.getEnd()));
}
}

View File

@ -97,7 +97,7 @@ public class Trie {
flushHandler.flush();
}
/**
 * Tells whether the span {@code [start, end]} of {@code text} is delimited on both sides
 * by whitespace or by the edge of the text.
 * <p>
 * The left side qualifies when {@code start} is 0 or the character before it is
 * whitespace; the right side qualifies when {@code end} is the last index of the text or
 * the character after it is whitespace. Only {@link Character#isWhitespace(char)} is
 * consulted, so a span that touches punctuation (for example a trailing comma) is not
 * reported as a whole word.
 *
 * @param text  the text containing the span
 * @param start index of the first character of the span
 * @param end   index of the last character of the span (inclusive)
 * @return {@code true} when both sides of the span qualify, {@code false} otherwise
 */
// NOTE(review): two signature lines follow because this span is a diff hunk; the private
// instance signature looks like the removed line and the public static one like its
// replacement. Confirm against the actual file.
private boolean isWholeWord(CharSequence text, int start, int end) {
public static boolean isWholeWord(CharSequence text, int start, int end) {
return (start == 0 || Character.isWhitespace(text.charAt(start - 1))) &&
(end == text.length() - 1 || Character.isWhitespace(text.charAt(end + 1)));
}

View File

@ -10,6 +10,7 @@ import java.util.Iterator;
import java.util.List;
import static junit.framework.Assert.assertEquals;
import static junit.framework.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
public class TrieTest {
@ -332,6 +333,22 @@ public class TrieTest {
checkEmit(firstMatch, 20, 24, "sugar");
}
/**
 * Tokenizes a text in which the keyword occurs both standing alone and glued to a second
 * copy of itself, then checks fragment, match flag and whole-word flag of every token.
 */
@Test
public void tokenizeAndReportOnWholeWords() {
    final Trie alphaTrie = Trie.builder().addKeyword("Alpha").build();

    final Collection<Token> result = alphaTrie.tokenize("Alpha AlphaAlpha Alpha");
    assertEquals(6, result.size());

    final Iterator<Token> cursor = result.iterator();
    // Keyword at the very start of the text, followed by a space.
    assertToken(cursor.next(), "Alpha", true, true);
    assertToken(cursor.next(), " ", false, false);
    // Two occurrences glued together: each one touches a letter on one side.
    assertToken(cursor.next(), "Alpha", true, false);
    assertToken(cursor.next(), "Alpha", true, false);
    assertToken(cursor.next(), " ", false, false);
    // Keyword at the very end of the text, preceded by a space.
    assertToken(cursor.next(), "Alpha", true, true);
}
@Test
public void tokenizeFullSentence() {
Trie trie = Trie.builder()
@ -342,13 +359,13 @@ public class TrieTest {
Collection<Token> tokens = trie.tokenize("Hear: Alpha team first, Beta from the rear, Gamma in reserve");
assertEquals(7, tokens.size());
Iterator<Token> tokensIt = tokens.iterator();
assertEquals("Hear: ", tokensIt.next().getFragment());
assertEquals("Alpha", tokensIt.next().getFragment());
assertEquals(" team first, ", tokensIt.next().getFragment());
assertEquals("Beta", tokensIt.next().getFragment());
assertEquals(" from the rear, ", tokensIt.next().getFragment());
assertEquals("Gamma", tokensIt.next().getFragment());
assertEquals(" in reserve", tokensIt.next().getFragment());
assertToken(tokensIt.next(), "Hear: ", false, false);
assertToken(tokensIt.next(), "Alpha", true, true);
assertToken(tokensIt.next(), " team first, ", false, false);
assertToken(tokensIt.next(), "Beta", true, true);
assertToken(tokensIt.next(), " from the rear, ", false, false);
assertToken(tokensIt.next(), "Gamma", true, true);
assertToken(tokensIt.next(), " in reserve", false, false);
}
@Test
@ -456,6 +473,12 @@ public class TrieTest {
checkEmit(emits.iterator().next(), 0, 9, "#sugar-123");
}
/**
 * Asserts, in this order, that a token has the expected fragment, match flag and
 * whole-word flag.
 *
 * @param token     the token under inspection
 * @param fragment  the text the token is expected to carry
 * @param match     the expected result of {@code token.isMatch()}
 * @param wholeWord the expected result of {@code token.isWholeWord()}
 */
private void assertToken(Token token, String fragment, boolean match, boolean wholeWord) {
    final String actualFragment = token.getFragment();
    assertEquals(fragment, actualFragment);

    final boolean actualMatch = token.isMatch();
    assertEquals(match, actualMatch);

    final boolean actualWholeWord = token.isWholeWord();
    assertEquals(wholeWord, actualWholeWord);
}
private void checkEmit(Emit next, int expectedStart, int expectedEnd, String expectedKeyword) {
assertEquals("Start of emit should have been " + expectedStart, expectedStart, next.getStart());
assertEquals("End of emit should have been " + expectedEnd, expectedEnd, next.getEnd());