Compare commits

...

1 Commits

Author SHA1 Message Date
Ahmed Sobhi
b1c1ad4d71 Fix unicode issue which caused wrong emit start/end
https://github.com/robert-bor/aho-corasick/issues/39

- Some Unicode characters, when lowercased as part of a string, expand into more than one character
- This leads to incorrect offsets (start, end) in the resulting Emit
- The fix builds upon the fix for https://github.com/robert-bor/aho-corasick/issues/8
2016-08-22 02:21:21 +01:00
2 changed files with 76 additions and 55 deletions

View File

@ -32,13 +32,10 @@ public class Trie {
return; return;
} }
State currentState = this.rootState; State currentState = this.rootState;
for (Character character : keyword.toCharArray()) { String caseAdjustedKeyword = adjustCase(keyword).toString();
if (trieConfig.isCaseInsensitive()) { for (Character character : caseAdjustedKeyword.toCharArray())
character = Character.toLowerCase(character);
}
currentState = currentState.addState(character); currentState = currentState.addState(character);
} currentState.addEmit(caseAdjustedKeyword);
currentState.addEmit(trieConfig.isCaseInsensitive() ? keyword.toLowerCase() : keyword);
} }
public Collection<Token> tokenize(String text) { public Collection<Token> tokenize(String text) {
@ -85,25 +82,23 @@ public class Trie {
} }
if (!trieConfig.isAllowOverlaps()) { if (!trieConfig.isAllowOverlaps()) {
IntervalTree intervalTree = new IntervalTree((List<Intervalable>)(List<?>)collectedEmits); IntervalTree intervalTree = new IntervalTree((List<Intervalable>) (List<?>) collectedEmits);
intervalTree.removeOverlaps((List<Intervalable>) (List<?>) collectedEmits); intervalTree.removeOverlaps((List<Intervalable>) (List<?>) collectedEmits);
} }
return collectedEmits; return collectedEmits;
} }
public boolean containsMatch(CharSequence text) { public boolean containsMatch(CharSequence text) {
Emit firstMatch = firstMatch(text); Emit firstMatch = firstMatch(text);
return firstMatch != null; return firstMatch != null;
} }
public void parseText(CharSequence text, EmitHandler emitHandler) { public void parseText(CharSequence text, EmitHandler emitHandler) {
State currentState = this.rootState; State currentState = this.rootState;
for (int position = 0; position < text.length(); position++) { CharSequence caseAdjustedText = adjustCase(text);
Character character = text.charAt(position); for (int position = 0; position < caseAdjustedText.length(); position++) {
if (trieConfig.isCaseInsensitive()) { Character character = caseAdjustedText.charAt(position);
character = Character.toLowerCase(character);
}
currentState = getState(currentState, character); currentState = getState(currentState, character);
if (storeEmits(position, currentState, emitHandler) && trieConfig.isStopOnHit()) { if (storeEmits(position, currentState, emitHandler) && trieConfig.isStopOnHit()) {
return; return;
@ -112,46 +107,44 @@ public class Trie {
} }
public Emit firstMatch(CharSequence text) { public Emit firstMatch(CharSequence text) {
if (!trieConfig.isAllowOverlaps()) { if (!trieConfig.isAllowOverlaps()) {
// Slow path. Needs to find all the matches to detect overlaps. // Slow path. Needs to find all the matches to detect overlaps.
Collection<Emit> parseText = parseText(text); Collection<Emit> parseText = parseText(text);
if (parseText != null && !parseText.isEmpty()) { if (parseText != null && !parseText.isEmpty()) {
return parseText.iterator().next(); return parseText.iterator().next();
} }
} else { } else {
// Fast path. Returns first match found. // Fast path. Returns first match found.
State currentState = this.rootState; State currentState = this.rootState;
for (int position = 0; position < text.length(); position++) { CharSequence caseAdjustedText = adjustCase(text);
Character character = text.charAt(position); for (int position = 0; position < caseAdjustedText.length(); position++) {
if (trieConfig.isCaseInsensitive()) { Character character = caseAdjustedText.charAt(position);
character = Character.toLowerCase(character); currentState = getState(currentState, character);
} Collection<String> emitStrs = currentState.emit();
currentState = getState(currentState, character); if (emitStrs != null && !emitStrs.isEmpty()) {
Collection<String> emitStrs = currentState.emit(); for (String emitStr : emitStrs) {
if (emitStrs != null && !emitStrs.isEmpty()) { final Emit emit = new Emit(position - emitStr.length() + 1, position, emitStr);
for (String emitStr : emitStrs) { if (trieConfig.isOnlyWholeWords()) {
final Emit emit = new Emit(position - emitStr.length() + 1, position, emitStr); if (!isPartialMatch(text, emit)) {
if (trieConfig.isOnlyWholeWords()) { return emit;
if (!isPartialMatch(text, emit)) { }
return emit; } else {
} return emit;
} else { }
return emit; }
} }
} }
} }
} return null;
} }
return null;
}
private boolean isPartialMatch(CharSequence searchText, Emit emit) { private boolean isPartialMatch(CharSequence searchText, Emit emit) {
return (emit.getStart() != 0 && return (emit.getStart() != 0 &&
Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) || Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) ||
(emit.getEnd() + 1 != searchText.length() && (emit.getEnd() + 1 != searchText.length() &&
Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1))); Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1)));
} }
private void removePartialMatches(CharSequence searchText, List<Emit> collectedEmits) { private void removePartialMatches(CharSequence searchText, List<Emit> collectedEmits) {
List<Emit> removeEmits = new ArrayList<>(); List<Emit> removeEmits = new ArrayList<>();
@ -229,6 +222,17 @@ public class Trie {
return emitted; return emitted;
} }
/**
 * Produces the character sequence that should be fed to the trie for the given input.
 *
 * <p>When the trie is not configured as case-insensitive the input is returned untouched.
 * Otherwise a new {@link String} of the same length is returned in which every {@code char}
 * has been lowercased individually.
 *
 * @param text the keyword or search text to normalise
 * @return {@code text} itself, or a lowercased copy with the same number of chars
 */
private CharSequence adjustCase(CharSequence text) {
    if (!trieConfig.isCaseInsensitive()) {
        return text;
    }
    // Lowercase one char at a time rather than via String.toLowerCase(): the string-level
    // conversion can change the length, which would shift the start/end offsets of emits.
    char[] lowered = text.toString().toCharArray();
    for (int index = 0; index < lowered.length; index++) {
        lowered[index] = Character.toLowerCase(lowered[index]);
    }
    return new String(lowered);
}
public static TrieBuilder builder() { public static TrieBuilder builder() {
return new TrieBuilder(); return new TrieBuilder();
} }

View File

@ -415,6 +415,23 @@ public class TrieTest {
checkEmit(firstMatch, 5, 8, "this"); checkEmit(firstMatch, 5, 8, "this");
} }
@Test
public void unicodeIssueBug39ReportedByHumanzz() {
    // Problem: "İ".length() is 1 but "İ".toLowerCase().length() is 2, so an Emit built from the
    // string-lowercased keyword has a size different from the original text.
    // Solution: as in issue #8, lowercase per character (Character.toLowerCase('İ') => 'i') and
    // make sure the emit carries the keyword cased the same way.
    final String keyword = "İnt";
    final Trie trie = Trie.builder()
            .caseInsensitive()
            .onlyWholeWords()
            .addKeyword(keyword)
            .build();

    final Collection<Emit> matches = trie.parseText("İnt is good");

    assertEquals(1, matches.size());
    final Emit onlyMatch = matches.iterator().next();
    checkEmit(onlyMatch, 0, 2, "int");
}
@Test @Test
public void partialMatchWhiteSpaces() { public void partialMatchWhiteSpaces() {
Trie trie = Trie.builder() Trie trie = Trie.builder()