Issue #8: fixed Unicode issue by lowercasing characters individually instead of lowercasing the entire search text

2014-08-26 09:50:15 +02:00 · 2014-08-26 09:50:15 +02:00 · e8b5be0497
commit e8b5be0497
parent 7431c74a7f
3 changed files with 20 additions and 7 deletions
--- a/pom.xml
+++ b/pom.xml
@@ -3,7 +3,7 @@

    <groupId>org.ahocorasick</groupId>
    <artifactId>ahocorasick</artifactId>
-    <version>0.2.2</version>
+    <version>0.3.0-SNAPSHOT</version>
    <packaging>jar</packaging>
    <name>Aho-CoraSick algorithm for efficient string matching</name>
    <description>Java library for efficient string matching against a large set of keywords</description>
--- a/src/main/java/org/ahocorasick/trie/Trie.java
+++ b/src/main/java/org/ahocorasick/trie/Trie.java
@@ -89,14 +89,13 @@ public class Trie {
    public Collection<Emit> parseText(String text) {
        checkForConstructedFailureStates();

-        if (trieConfig.isCaseInsensitive()) {
-            text = text.toLowerCase();
-        }
-
        int position = 0;
        State currentState = this.rootState;
        List<Emit> collectedEmits = new ArrayList<Emit>();
        for (Character character : text.toCharArray()) {
+            if (trieConfig.isCaseInsensitive()) {
+                character = Character.toLowerCase(character);
+            }
            currentState = getState(currentState, character);
            storeEmits(position, currentState, collectedEmits);
            position++;
--- a/src/test/java/org/ahocorasick/trie/TrieTest.java
+++ b/src/test/java/org/ahocorasick/trie/TrieTest.java
@@ -193,6 +193,7 @@ public class TrieTest {
        assertEquals(5, tokens.size());
    }

+    // Test offered by XCurry, https://github.com/robert-bor/aho-corasick/issues/7
    @Test
    public void zeroLengthTestBug7InGithubReportedByXCurry() {
        Trie trie = new Trie().removeOverlaps().onlyWholeWords().caseInsensitive();
@@ -200,9 +201,22 @@ public class TrieTest {
        trie.tokenize("Try a natural lip and subtle bronzer to keep all the focus on those big bright eyes with NARS Eyeshadow Duo in Rated R And the winner is... Boots No7 Advanced Renewal Anti-ageing Glycolic Peel Kit ($25 amazon.com) won most-appealing peel.");
    }

+    // Test offered by dwyerk, https://github.com/robert-bor/aho-corasick/issues/8
+    @Test
+    public void unicodeIssueBug8ReportedByDwyerk() {
+        String target = "LİKE THIS"; // 'İ' (U+0130) is a single char, but String.toLowerCase() can expand it to two chars (locale-dependent), so lowercasing the whole text shifted emit offsets
+        Trie trie = new Trie().caseInsensitive().onlyWholeWords();
+        assertEquals("THIS", target.substring(5,9)); // Sanity check: in the original text "THIS" occupies indices 5..8
+        trie.addKeyword("this");
+        Collection<Emit> emits = trie.parseText(target);
+        assertEquals(1, emits.size());
+        Iterator<Emit> it = emits.iterator();
+        checkEmit(it.next(), 5, 8, "this");
+    }
+
    private void checkEmit(Emit next, int expectedStart, int expectedEnd, String expectedKeyword) {
-        assertEquals(expectedStart, next.getStart());
-        assertEquals(expectedEnd, next.getEnd());
+        assertEquals("Start of emit should have been "+expectedStart, expectedStart, next.getStart());
+        assertEquals("End of emit should have been "+expectedEnd, expectedEnd, next.getEnd());
        assertEquals(expectedKeyword, next.getKeyword());
    }