Issue #8: fixed the Unicode issue by lower-casing each character individually rather than the entire search text

This commit is contained in:
robert-bor 2014-08-26 09:50:15 +02:00
parent 7431c74a7f
commit e8b5be0497
3 changed files with 20 additions and 7 deletions

View File

@ -3,7 +3,7 @@
<groupId>org.ahocorasick</groupId>
<artifactId>ahocorasick</artifactId>
<version>0.2.2</version>
<version>0.3.0-SNAPSHOT</version>
<packaging>jar</packaging>
<name>Aho-CoraSick algorithm for efficient string matching</name>
<description>Java library for efficient string matching against a large set of keywords</description>

View File

@ -89,14 +89,13 @@ public class Trie {
public Collection<Emit> parseText(String text) {
checkForConstructedFailureStates();
if (trieConfig.isCaseInsensitive()) {
text = text.toLowerCase();
}
int position = 0;
State currentState = this.rootState;
List<Emit> collectedEmits = new ArrayList<Emit>();
for (Character character : text.toCharArray()) {
if (trieConfig.isCaseInsensitive()) {
character = Character.toLowerCase(character);
}
currentState = getState(currentState, character);
storeEmits(position, currentState, collectedEmits);
position++;

View File

@ -193,6 +193,7 @@ public class TrieTest {
assertEquals(5, tokens.size());
}
// Test offered by XCurry, https://github.com/robert-bor/aho-corasick/issues/7
@Test
public void zeroLengthTestBug7InGithubReportedByXCurry() {
Trie trie = new Trie().removeOverlaps().onlyWholeWords().caseInsensitive();
@ -200,9 +201,22 @@ public class TrieTest {
trie.tokenize("Try a natural lip and subtle bronzer to keep all the focus on those big bright eyes with NARS Eyeshadow Duo in Rated R And the winner is... Boots No7 Advanced Renewal Anti-ageing Glycolic Peel Kit ($25 amazon.com) won most-appealing peel.");
}
// Regression test contributed by dwyerk, https://github.com/robert-bor/aho-corasick/issues/8
// Lower-casing the whole string turns 'İ' (U+0130) into two chars ('i' + combining dot above),
// which changes the text length; the reported match positions must still refer to the
// original text.
@Test
public void unicodeIssueBug8ReportedByDwyerk() {
    String target = "LİKE THIS"; // second character is U+0130, LATIN CAPITAL LETTER I WITH DOT ABOVE
    Trie trie = new Trie().caseInsensitive().onlyWholeWords();
    // Sanity check on the fixture itself: "THIS" occupies indices 5..8 of the original text.
    assertEquals("THIS", target.substring(5,9));
    trie.addKeyword("this");
    Collection<Emit> found = trie.parseText(target);
    assertEquals(1, found.size());
    // The single emit must span indices 5 through 8 and carry the keyword as registered.
    checkEmit(found.iterator().next(), 5, 8, "this");
}
/**
 * Asserts that an emit has the expected start, end and keyword.
 *
 * @param next            the emit under inspection
 * @param expectedStart   value {@code next.getStart()} must return
 * @param expectedEnd     value {@code next.getEnd()} must return
 * @param expectedKeyword value {@code next.getKeyword()} must return
 */
private void checkEmit(Emit next, int expectedStart, int expectedEnd, String expectedKeyword) {
    // Only the message-bearing assertions are kept: the message-less duplicates used to run
    // first, so a mismatch always failed there and these descriptive messages were never shown.
    assertEquals("Start of emit should have been "+expectedStart, expectedStart, next.getStart());
    assertEquals("End of emit should have been "+expectedEnd, expectedEnd, next.getEnd());
    assertEquals(expectedKeyword, next.getKeyword());
}