Merge branch 'rripken-master' into feature/footprint-reduction

Conflicts:
	src/main/java/org/ahocorasick/trie/Trie.java
	src/test/java/org/ahocorasick/trie/TrieTest.java
This commit is contained in:
robert-bor 2015-09-22 20:38:29 +02:00
commit 4633b1ba2a
3 changed files with 165 additions and 9 deletions

View File

@ -35,7 +35,7 @@ public class State {
* referred to in the white paper as the 'goto' structure. From a state it is possible to go
* to other states, depending on the character passed.
*/
private Map<Character,State> success = new TreeMap<Character, State>();
private Map<Character,State> success = new HashMap<Character, State>();
/** if no matching states are found, the failure state will be returned */
private State failure = null;

View File

@ -85,6 +85,11 @@ public class Trie {
return collectedEmits;
}
/**
 * Tells whether the given text contains at least one match.
 *
 * @param text the text to search
 * @return {@code true} when {@link #firstMatch(String)} returns an emit for the text,
 *         {@code false} when it returns {@code null}
 */
public boolean matches(String text) {
    return firstMatch(text) != null;
}
public void parseText(CharSequence text, EmitHandler emitHandler) {
State currentState = this.rootState;
for (int position = 0; position < text.length(); position++) {
@ -113,10 +118,63 @@ public class Trie {
removeEmits.add(emit);
}
for (Emit removeEmit : removeEmits) {
collectedEmits.remove(removeEmit);
}
}
/**
 * Returns the first match found in the given text.
 *
 * <p>When overlaps are not allowed, the complete result of {@code parseText(text)} is
 * computed and its first element is returned. When overlaps are allowed, the text is
 * walked character by character and the first emit encountered is returned right away;
 * if only whole words are requested, emits flagged by {@code isPartialMatch} are skipped.
 *
 * @param text the text to search
 * @return the first matching emit, or {@code null} if the text contains no match
 */
public Emit firstMatch(String text)
{
    if (!trieConfig.isAllowOverlaps()) {
        // Slow path. Needs to find all the matches to detect overlaps.
        Collection<Emit> allEmits = parseText(text);
        if (allEmits != null && !allEmits.isEmpty()) {
            return allEmits.iterator().next();
        }
        return null;
    }

    // Fast path. Returns the first match found.
    checkForConstructedFailureStates();

    // The configuration is read once up front instead of once per character.
    final boolean caseInsensitive = trieConfig.isCaseInsensitive();
    final boolean onlyWholeWords = trieConfig.isOnlyWholeWords();

    State currentState = this.rootState;
    for (int position = 0; position < text.length(); position++) {
        char character = text.charAt(position);
        if (caseInsensitive) {
            character = Character.toLowerCase(character);
        }
        currentState = getState(currentState, character);

        Collection<String> emitStrs = currentState.emit();
        if (emitStrs == null || emitStrs.isEmpty()) {
            continue;
        }
        for (String emitStr : emitStrs) {
            // position is the index of the keyword's last character, so the
            // start index is derived from the keyword length.
            final Emit emit = new Emit(position - emitStr.length() + 1, position, emitStr);
            if (!onlyWholeWords || !isPartialMatch(text, emit)) {
                return emit;
            }
        }
    }
    return null;
}
/**
 * Tells whether an emit is directly adjacent to an alphabetic character in the
 * searched text, either just before its start or just after its end.
 *
 * @param searchText the text the emit was found in
 * @param emit the emit to inspect
 * @return {@code true} if the character preceding the emit's start or the character
 *         following the emit's end exists and is alphabetic
 */
private boolean isPartialMatch(String searchText, Emit emit) {
    final int start = emit.getStart();
    // An emit starting at index 0 has no preceding character to inspect.
    if (start != 0 && Character.isAlphabetic(searchText.charAt(start - 1))) {
        return true;
    }

    final int indexAfterEnd = emit.getEnd() + 1;
    // An emit ending on the last character has no following character to inspect.
    return indexAfterEnd != searchText.length()
            && Character.isAlphabetic(searchText.charAt(indexAfterEnd));
}
/**
 * Removes, in place, every emit that {@code isPartialMatch} reports as a partial
 * match for the given text.
 *
 * @param searchText the text the emits were found in
 * @param collectedEmits the emits to filter; modified by this method
 */
private void removePartialMatches(String searchText, List<Emit> collectedEmits) {
    // Walk backwards so that removing an element never shifts the indexes that
    // still have to be visited. Removing by index avoids both a temporary list
    // and an equals-based rescan of the list for every removal.
    for (int index = collectedEmits.size() - 1; index >= 0; index--) {
        if (isPartialMatch(searchText, collectedEmits.get(index))) {
            collectedEmits.remove(index);
        }
    }
}
private State getState(State currentState, Character character) {
State newCurrentState = currentState.nextState(character);

View File

@ -22,6 +22,14 @@ public class TrieTest {
checkEmit(iterator.next(), 0, 2, "abc");
}
@Test
public void keywordAndTextAreTheSameFirstMatch() {
    // The keyword is identical to the text, so the first match spans indexes 0..2.
    final Trie singleKeywordTrie = new Trie();
    singleKeywordTrie.addKeyword("abc");
    checkEmit(singleKeywordTrie.firstMatch("abc"), 0, 2, "abc");
}
@Test
public void textIsLongerThanKeyword() {
Trie trie = Trie.builder()
@ -32,6 +40,14 @@ public class TrieTest {
checkEmit(iterator.next(), 1, 3, "abc");
}
@Test
public void textIsLongerThanKeywordFirstMatch() {
    // A leading space shifts the expected match to indexes 1..3.
    final Trie singleKeywordTrie = new Trie();
    singleKeywordTrie.addKeyword("abc");
    checkEmit(singleKeywordTrie.firstMatch(" abc"), 1, 3, "abc");
}
@Test
public void variousKeywordsOneMatch() {
Trie trie = Trie.builder()
@ -44,6 +60,16 @@ public class TrieTest {
checkEmit(iterator.next(), 0, 2, "bcd");
}
@Test
public void variousKeywordsFirstMatch() {
    // Three keywords are registered; the text is exactly the second one.
    final Trie multiKeywordTrie = new Trie();
    for (String keyword : new String[] {"abc", "bcd", "cde"}) {
        multiKeywordTrie.addKeyword(keyword);
    }
    checkEmit(multiKeywordTrie.firstMatch("bcd"), 0, 2, "bcd");
}
@Test
public void ushersTestAndStopOnHit() {
Trie trie = Trie.builder()
@ -76,6 +102,17 @@ public class TrieTest {
checkEmit(iterator.next(), 2, 5, "hers");
}
@Test
public void ushersTestFirstMatch() {
    // Keyword set "hers", "his", "she", "he" searched in "ushers";
    // the first match is expected to be "he" at indexes 2..3.
    final Trie ushersTrie = new Trie();
    for (String keyword : new String[] {"hers", "his", "she", "he"}) {
        ushersTrie.addKeyword(keyword);
    }
    checkEmit(ushersTrie.firstMatch("ushers"), 2, 3, "he");
}
@Test
public void ushersTestByCallback() {
Trie trie = Trie.builder()
@ -111,6 +148,14 @@ public class TrieTest {
checkEmit(iterator.next(), 9, 12, "hers");
}
@Test
public void misleadingTestFirstMatch() {
    // The text contains the prefixes "h", "he" and "her" before the full keyword;
    // only the complete "hers" at indexes 9..12 is expected as the first match.
    final Trie hersTrie = new Trie();
    hersTrie.addKeyword("hers");
    checkEmit(hersTrie.firstMatch("h he her hers"), 9, 12, "hers");
}
@Test
public void recipes() {
Trie trie = Trie.builder()
@ -127,12 +172,23 @@ public class TrieTest {
checkEmit(iterator.next(), 51, 58, "broccoli");
}
@Test
public void recipesFirstMatch() {
    // Of the four ingredients, "cauliflower" is expected as the first match (indexes 2..12).
    final Trie ingredientTrie = new Trie();
    for (String ingredient : new String[] {"veal", "cauliflower", "broccoli", "tomatoes"}) {
        ingredientTrie.addKeyword(ingredient);
    }
    final String recipe = "2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli";
    checkEmit(ingredientTrie.firstMatch(recipe), 2, 12, "cauliflower");
}
@Test
public void longAndShortOverlappingMatch() {
Trie trie = Trie.builder()
.addKeyword("he")
.addKeyword("hehehehe")
.build();
Trie trie = new Trie();
trie.addKeyword("he");
trie.addKeyword("hehehehe");
Collection<Emit> emits = trie.parseText("hehehehehe");
Iterator<Emit> iterator = emits.iterator();
checkEmit(iterator.next(), 0, 1, "he");
@ -159,6 +215,17 @@ public class TrieTest {
checkEmit(iterator.next(), 6, 7, "ab");
}
@Test
public void nonOverlappingFirstMatch() {
    // With overlaps removed, the first match is expected to be "ababc" at 0..4.
    final Trie nonOverlappingTrie = new Trie().removeOverlaps();
    for (String keyword : new String[] {"ab", "cba", "ababc"}) {
        nonOverlappingTrie.addKeyword(keyword);
    }
    checkEmit(nonOverlappingTrie.firstMatch("ababcbab"), 0, 4, "ababc");
}
@Test
public void startOfChurchillSpeech() {
Trie trie = Trie.builder().removeOverlaps()
@ -187,6 +254,15 @@ public class TrieTest {
checkEmit(emits.iterator().next(), 20, 24, "sugar");
}
@Test
public void partialMatchFirstMatch() {
    final Trie wholeWordTrie = new Trie().onlyWholeWords();
    wholeWordTrie.addKeyword("sugar");
    // Left, middle and right test: the keyword is glued to other letters everywhere
    // except for the stand-alone "sugar" at indexes 20..24.
    final String text = "sugarcane sugarcane sugar canesugar";
    checkEmit(wholeWordTrie.firstMatch(text), 20, 24, "sugar");
}
@Test
public void tokenizeFullSentence() {
Trie trie = Trie.builder()
@ -240,6 +316,18 @@ public class TrieTest {
checkEmit(it.next(), 19, 23, "börkü");
}
@Test
public void caseInsensitiveFirstMatch() {
    // Keywords are registered in lower case while the text uses mixed case.
    final Trie caseInsensitiveTrie = new Trie().caseInsensitive();
    for (String keyword : new String[] {"turning", "once", "again", "börkü"}) {
        caseInsensitiveTrie.addKeyword(keyword);
    }
    checkEmit(caseInsensitiveTrie.firstMatch("TurninG OnCe AgAiN BÖRKÜ"), 0, 6, "turning");
}
@Test
public void tokenizeTokensInSequence() {
Trie trie = Trie.builder()
@ -274,6 +362,16 @@ public class TrieTest {
checkEmit(it.next(), 5, 8, "this");
}
@Test
public void unicodeIssueBug8ReportedByDwyerkFirstMatch() {
    // The second character ('İ') is Unicode, which was read by AC as a 2-byte char.
    final String target = "LİKE THIS";
    // Java does it the right way: plain substring indexes line up with the keyword.
    assertEquals("THIS", target.substring(5,9));

    final Trie wholeWordIgnoreCaseTrie = new Trie().caseInsensitive().onlyWholeWords();
    wholeWordIgnoreCaseTrie.addKeyword("this");
    checkEmit(wholeWordIgnoreCaseTrie.firstMatch(target), 5, 8, "this");
}
private void checkEmit(Emit next, int expectedStart, int expectedEnd, String expectedKeyword) {
assertEquals("Start of emit should have been "+expectedStart, expectedStart, next.getStart());
assertEquals("End of emit should have been "+expectedEnd, expectedEnd, next.getEnd());