Compare commits

...

1 Commits

Author SHA1 Message Date
Ahmed Sobhi
b1c1ad4d71 Fix unicode issue which caused wrong emit start/end
https://github.com/robert-bor/aho-corasick/issues/39

- Some Unicode characters expand to more than one character when the string containing them is lowercased
- This shifts the offsets, so the (start, end) positions reported in Emit no longer match the original text
- Fix builds upon fix of https://github.com/robert-bor/aho-corasick/issues/8
2016-08-22 02:21:21 +01:00
2 changed files with 76 additions and 55 deletions

View File

@ -32,13 +32,10 @@ public class Trie {
return;
}
State currentState = this.rootState;
for (Character character : keyword.toCharArray()) {
if (trieConfig.isCaseInsensitive()) {
character = Character.toLowerCase(character);
}
String caseAdjustedKeyword = adjustCase(keyword).toString();
for (Character character : caseAdjustedKeyword.toCharArray())
currentState = currentState.addState(character);
}
currentState.addEmit(trieConfig.isCaseInsensitive() ? keyword.toLowerCase() : keyword);
currentState.addEmit(caseAdjustedKeyword);
}
public Collection<Token> tokenize(String text) {
@ -85,25 +82,23 @@ public class Trie {
}
if (!trieConfig.isAllowOverlaps()) {
IntervalTree intervalTree = new IntervalTree((List<Intervalable>)(List<?>)collectedEmits);
IntervalTree intervalTree = new IntervalTree((List<Intervalable>) (List<?>) collectedEmits);
intervalTree.removeOverlaps((List<Intervalable>) (List<?>) collectedEmits);
}
return collectedEmits;
}
/**
 * Reports whether the given text contains at least one match.
 *
 * @param text the text to search
 * @return {@code true} when {@link #firstMatch(CharSequence)} yields a non-null emit
 */
public boolean containsMatch(CharSequence text) {
    return firstMatch(text) != null;
}
/**
 * Reports whether the given text contains at least one match.
 *
 * @param text the text to search
 * @return {@code true} when {@link #firstMatch(CharSequence)} yields a non-null emit
 */
public boolean containsMatch(CharSequence text) {
    return firstMatch(text) != null;
}
public void parseText(CharSequence text, EmitHandler emitHandler) {
State currentState = this.rootState;
for (int position = 0; position < text.length(); position++) {
Character character = text.charAt(position);
if (trieConfig.isCaseInsensitive()) {
character = Character.toLowerCase(character);
}
CharSequence caseAdjustedText = adjustCase(text);
for (int position = 0; position < caseAdjustedText.length(); position++) {
Character character = caseAdjustedText.charAt(position);
currentState = getState(currentState, character);
if (storeEmits(position, currentState, emitHandler) && trieConfig.isStopOnHit()) {
return;
@ -112,46 +107,44 @@ public class Trie {
}
/**
 * Finds the first match in the given text.
 *
 * @param text the text to search
 * @return the first emit found, or {@code null} when nothing matches
 */
public Emit firstMatch(CharSequence text) {
    if (!trieConfig.isAllowOverlaps()) {
        // Slow path: every match has to be collected so overlaps can be detected.
        Collection<Emit> allMatches = parseText(text);
        boolean hasMatch = allMatches != null && !allMatches.isEmpty();
        return hasMatch ? allMatches.iterator().next() : null;
    }

    // Fast path: walk the text and return as soon as an acceptable emit shows up.
    State state = this.rootState;
    for (int index = 0; index < text.length(); index++) {
        Character current = text.charAt(index);
        if (trieConfig.isCaseInsensitive()) {
            current = Character.toLowerCase(current);
        }
        state = getState(state, current);

        Collection<String> keywords = state.emit();
        if (keywords == null || keywords.isEmpty()) {
            continue;
        }
        for (String keyword : keywords) {
            // The keyword ends at the current index, so its start is derived from its length.
            final Emit candidate = new Emit(index - keyword.length() + 1, index, keyword);
            if (!trieConfig.isOnlyWholeWords() || !isPartialMatch(text, candidate)) {
                return candidate;
            }
        }
    }
    return null;
}
/**
 * Finds the first match in the given text.
 *
 * @param text the text to search
 * @return the first emit found, or {@code null} when nothing matches
 */
public Emit firstMatch(CharSequence text) {
    if (!trieConfig.isAllowOverlaps()) {
        // Slow path: every match has to be collected so overlaps can be detected.
        Collection<Emit> allMatches = parseText(text);
        boolean hasMatch = allMatches != null && !allMatches.isEmpty();
        return hasMatch ? allMatches.iterator().next() : null;
    }

    // Fast path: walk the case-adjusted text and return as soon as an acceptable emit shows up.
    State state = this.rootState;
    CharSequence adjusted = adjustCase(text);
    for (int index = 0; index < adjusted.length(); index++) {
        Character current = adjusted.charAt(index);
        state = getState(state, current);

        Collection<String> keywords = state.emit();
        if (keywords == null || keywords.isEmpty()) {
            continue;
        }
        for (String keyword : keywords) {
            // The keyword ends at the current index, so its start is derived from its length.
            final Emit candidate = new Emit(index - keyword.length() + 1, index, keyword);
            // The whole-word check inspects the caller's original text, not the adjusted copy.
            if (!trieConfig.isOnlyWholeWords() || !isPartialMatch(text, candidate)) {
                return candidate;
            }
        }
    }
    return null;
}
/**
 * Tells whether an emit is directly adjacent to an alphabetic character in the search text.
 *
 * @param searchText the text the emit was found in
 * @param emit the emit whose neighbouring characters are inspected
 * @return {@code true} if the character right before the emit's start or right after its end
 *         is alphabetic
 */
private boolean isPartialMatch(CharSequence searchText, Emit emit) {
    // Look at the character preceding the emit, unless the emit starts the text.
    if (emit.getStart() != 0 && Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) {
        return true;
    }
    // Otherwise look at the character following the emit, unless the emit ends the text.
    return emit.getEnd() + 1 != searchText.length()
            && Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1));
}
/**
 * Tells whether an emit is directly adjacent to an alphabetic character in the search text.
 *
 * @param searchText the text the emit was found in
 * @param emit the emit whose neighbouring characters are inspected
 * @return {@code true} if the character right before the emit's start or right after its end
 *         is alphabetic
 */
private boolean isPartialMatch(CharSequence searchText, Emit emit) {
    // Look at the character preceding the emit, unless the emit starts the text.
    if (emit.getStart() != 0 && Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) {
        return true;
    }
    // Otherwise look at the character following the emit, unless the emit ends the text.
    return emit.getEnd() + 1 != searchText.length()
            && Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1));
}
private void removePartialMatches(CharSequence searchText, List<Emit> collectedEmits) {
List<Emit> removeEmits = new ArrayList<>();
@ -229,6 +222,17 @@ public class Trie {
return emitted;
}
/**
 * Returns the text in the form used for matching.
 *
 * <p>When the trie is case sensitive the text is returned untouched. Otherwise every code point
 * is lowercased individually rather than lowercasing the string as a whole, so the result always
 * has exactly the same length as the input and character offsets remain valid against the
 * original text.
 *
 * @param text the text to adjust
 * @return {@code text} itself when matching is case sensitive, otherwise a lowercased copy of
 *         identical length
 */
private CharSequence adjustCase(CharSequence text) {
    if (!trieConfig.isCaseInsensitive()) {
        return text;
    }
    char[] chars = text.toString().toCharArray();
    int index = 0;
    while (index < chars.length) {
        // Decode a full code point so letters stored as surrogate pairs are folded too;
        // a lone surrogate decodes to itself and is left as is.
        int codePoint = Character.codePointAt(chars, index);
        int width = Character.charCount(codePoint);
        int lowered = Character.toLowerCase(codePoint);
        // Substitute only when the UTF-16 width is unchanged, which keeps offsets aligned.
        if (lowered != codePoint && Character.charCount(lowered) == width) {
            Character.toChars(lowered, chars, index);
        }
        index += width;
    }
    return new String(chars);
}
/**
 * Creates a new, independent {@link TrieBuilder}.
 *
 * @return a freshly constructed builder
 */
public static TrieBuilder builder() {
return new TrieBuilder();
}

View File

@ -415,6 +415,23 @@ public class TrieTest {
checkEmit(firstMatch, 5, 8, "this");
}
@Test
public void unicodeIssueBug39ReportedByHumanzz() {
    // Issue #39: "İ".length() is 1 but "İ".toLowerCase().length() is 2, so lowercasing a whole
    // string makes the Emit offsets disagree with the original text.
    // As in issue #8, the conversion is done per character (Character.toLowerCase('İ') => 'i'),
    // and the emit must carry the keyword in that same case-adjusted form.
    final String keyword = "İnt";
    Trie trie = Trie.builder().caseInsensitive().onlyWholeWords().addKeyword(keyword).build();

    Collection<Emit> matches = trie.parseText("İnt is good");

    assertEquals(1, matches.size());
    Emit onlyMatch = matches.iterator().next();
    checkEmit(onlyMatch, 0, 2, "int");
}
@Test
public void partialMatchWhiteSpaces() {
Trie trie = Trie.builder()