Fix Unicode issue that caused wrong Emit start/end positions

https://github.com/robert-bor/aho-corasick/issues/39 - Some Unicode characters become more than one character when the string containing them is lowercased. - This leads to incorrect positions (start, end) in Emit. - The fix builds upon the fix for https://github.com/robert-bor/aho-corasick/issues/8
2016-08-22 02:21:21 +01:00
2 changed files with 76 additions and 55 deletions
--- a/src/main/java/org/ahocorasick/trie/Trie.java
+++ b/src/main/java/org/ahocorasick/trie/Trie.java
@ -32,13 +32,10 @@ public class Trie {
            return;
        }
        State currentState = this.rootState;
-        for (Character character : keyword.toCharArray()) {
+        String caseAdjustedKeyword = adjustCase(keyword).toString();
-            if (trieConfig.isCaseInsensitive()) {
+        for (Character character : caseAdjustedKeyword.toCharArray())
                character = Character.toLowerCase(character);
            }
            currentState = currentState.addState(character);
-        }
+        currentState.addEmit(caseAdjustedKeyword);
        currentState.addEmit(trieConfig.isCaseInsensitive() ? keyword.toLowerCase() : keyword);
    }
    public Collection<Token> tokenize(String text) {
@ -85,25 +82,23 @@ public class Trie {
        }
        if (!trieConfig.isAllowOverlaps()) {
-            IntervalTree intervalTree = new IntervalTree((List<Intervalable>)(List<?>)collectedEmits);
+            IntervalTree intervalTree = new IntervalTree((List<Intervalable>) (List<?>) collectedEmits);
            intervalTree.removeOverlaps((List<Intervalable>) (List<?>) collectedEmits);
        }
        return collectedEmits;
    }
-	public boolean containsMatch(CharSequence text) {
+    public boolean containsMatch(CharSequence text) {
-		Emit firstMatch = firstMatch(text);
+        Emit firstMatch = firstMatch(text);
-		return firstMatch != null;
+        return firstMatch != null;
-	}
+    }
    public void parseText(CharSequence text, EmitHandler emitHandler) {
        State currentState = this.rootState;
-        for (int position = 0; position < text.length(); position++) {
+        CharSequence caseAdjustedText = adjustCase(text);
-            Character character = text.charAt(position);
+        for (int position = 0; position < caseAdjustedText.length(); position++) {
-            if (trieConfig.isCaseInsensitive()) {
+            Character character = caseAdjustedText.charAt(position);
                character = Character.toLowerCase(character);
            }
            currentState = getState(currentState, character);
            if (storeEmits(position, currentState, emitHandler) && trieConfig.isStopOnHit()) {
                return;
@ -112,46 +107,44 @@ public class Trie {
    }
-	public Emit firstMatch(CharSequence text) {
+    public Emit firstMatch(CharSequence text) {
-		if (!trieConfig.isAllowOverlaps()) {
+        if (!trieConfig.isAllowOverlaps()) {
-			// Slow path. Needs to find all the matches to detect overlaps.
+            // Slow path. Needs to find all the matches to detect overlaps.
-			Collection<Emit> parseText = parseText(text);
+            Collection<Emit> parseText = parseText(text);
-			if (parseText != null && !parseText.isEmpty()) {
+            if (parseText != null && !parseText.isEmpty()) {
-				return parseText.iterator().next();
+                return parseText.iterator().next();
-			}
+            }
-		} else {
+        } else {
-			// Fast path. Returns first match found.
+            // Fast path. Returns first match found.
-			State currentState = this.rootState;
+            State currentState = this.rootState;
-            for (int position = 0; position < text.length(); position++) {
+            CharSequence caseAdjustedText = adjustCase(text);
-                Character character = text.charAt(position);
+            for (int position = 0; position < caseAdjustedText.length(); position++) {
-				if (trieConfig.isCaseInsensitive()) {
+                Character character = caseAdjustedText.charAt(position);
-					character = Character.toLowerCase(character);
+                currentState = getState(currentState, character);
-				}
+                Collection<String> emitStrs = currentState.emit();
-				currentState = getState(currentState, character);
+                if (emitStrs != null && !emitStrs.isEmpty()) {
-				Collection<String> emitStrs = currentState.emit();
+                    for (String emitStr : emitStrs) {
-				if (emitStrs != null && !emitStrs.isEmpty()) {
+                        final Emit emit = new Emit(position - emitStr.length() + 1, position, emitStr);
-					for (String emitStr : emitStrs) {
+                        if (trieConfig.isOnlyWholeWords()) {
-						final Emit emit = new Emit(position - emitStr.length() + 1, position, emitStr);
+                            if (!isPartialMatch(text, emit)) {
-						if (trieConfig.isOnlyWholeWords()) {
+                                return emit;
-							if (!isPartialMatch(text, emit)) {
+                            }
-								return emit;
+                        } else {
-							}
+                            return emit;
-						} else {
+                        }
-							return emit;
+                    }
-						}
+                }
-					}
+            }
-				}
+        }
-			}
+        return null;
-		}
+    }
 		return null;
 	}
-	private boolean isPartialMatch(CharSequence searchText, Emit emit) {
+    private boolean isPartialMatch(CharSequence searchText, Emit emit) {
-		return (emit.getStart() != 0 &&
+            return (emit.getStart() != 0 &&
-			Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) ||
+                    Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) ||
-			(emit.getEnd() + 1 != searchText.length() &&
+                    (emit.getEnd() + 1 != searchText.length() &&
-			Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1)));
+                            Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1)));
-	}
+    }
 	private void removePartialMatches(CharSequence searchText, List<Emit> collectedEmits) {
 		List<Emit> removeEmits = new ArrayList<>();
@ -229,6 +222,17 @@ public class Trie {
        return emitted;
    }
    /**
     * Lower-cases {@code text} when the trie is configured as case-insensitive;
     * otherwise returns {@code text} unchanged.
     *
     * <p>The conversion is done one code point at a time (never via
     * {@code String.toLowerCase()}), so the result always has exactly the same
     * number of {@code char}s as the input. Positions computed on the adjusted
     * text are therefore valid indices into the original text as well.
     *
     * <p>Working on code points rather than on individual {@code char}s also
     * lower-cases supplementary-plane letters (stored as surrogate pairs), which
     * {@link Character#toLowerCase(char)} leaves untouched.
     *
     * @param text the keyword or search text to adjust
     * @return a lower-cased copy of {@code text} with identical length when the
     *         trie is case-insensitive; {@code text} itself otherwise
     */
    private CharSequence adjustCase(CharSequence text) {
        if (!trieConfig.isCaseInsensitive()) {
            return text;
        }
        final String original = text.toString();
        final StringBuilder adjusted = new StringBuilder(original.length());
        int index = 0;
        while (index < original.length()) {
            final int codePoint = original.codePointAt(index);
            final int width = Character.charCount(codePoint);
            final int lowered = Character.toLowerCase(codePoint);
            // Defensive: only accept a mapping that occupies the same number of chars,
            // so the adjusted text can never drift out of step with the original.
            adjusted.appendCodePoint(Character.charCount(lowered) == width ? lowered : codePoint);
            index += width;
        }
        return adjusted.toString();
    }
    /**
     * Creates a new {@link TrieBuilder}.
     *
     * @return a freshly constructed {@code TrieBuilder} instance
     */
    public static TrieBuilder builder() {
        return new TrieBuilder();
    }
--- a/src/test/java/org/ahocorasick/trie/TrieTest.java
+++ b/src/test/java/org/ahocorasick/trie/TrieTest.java
@ -415,6 +415,23 @@ public class TrieTest {
        checkEmit(firstMatch, 5, 8, "this");
    }
    /**
     * Regression test for issue #39: a case-insensitive trie must report emit
     * positions that line up with the original (non-lower-cased) text.
     */
    @Test
    public void unicodeIssueBug39ReportedByHumanzz() {
        // Problem: "İ".length() is 1 but "İ".toLowerCase().length() is 2, so lower-casing the
        // whole string yields emits whose size differs from the original text.
        // Solution (as for issue #8): lower-case per character, where
        // Character.toLowerCase('İ') is 'i', and hand the emit the case-adjusted keyword.
        final String keyword = "İnt";
        final Trie caseInsensitiveTrie = Trie.builder()
                .caseInsensitive()
                .onlyWholeWords()
                .addKeyword(keyword)
                .build();

        final Collection<Emit> matches = caseInsensitiveTrie.parseText("İnt is good");

        assertEquals(1, matches.size());
        final Emit onlyMatch = matches.iterator().next();
        checkEmit(onlyMatch, 0, 2, "int");
    }
    @Test
    public void partialMatchWhiteSpaces() {
        Trie trie = Trie.builder()