From ae204299364f1744d2fdf00f560fa948850fcb3a Mon Sep 17 00:00:00 2001 From: robert-bor Date: Sat, 1 Feb 2014 21:04:53 +0100 Subject: [PATCH] Issue #3 added case insensitivity when matching keywords --- README.md | 15 +++++++++++++++ pom.xml | 4 ++-- src/main/java/org/ahocorasick/trie/Trie.java | 14 +++++++++++--- .../java/org/ahocorasick/trie/TrieConfig.java | 9 +++++++++ src/test/java/org/ahocorasick/trie/TrieTest.java | 16 ++++++++++++++++ 5 files changed, 53 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index ae69df3..3b51877 100644 --- a/README.md +++ b/README.md @@ -90,6 +90,21 @@ If you want the algorithm to only check for whole words, you can tell the Trie t In this case, it will only find one match, whereas it would normally find four. The sugarcane/canesugar words are discarded because they are partial matches. +Some text is WrItTeN in a combination of lowercase and uppercase and is therefore hard to identify. You can instruct +the Trie to lowercase the entire search text to ease the matching process. + +```java + Trie trie = new Trie().caseInsensitive(); + trie.addKeyword("casing"); + Collection emits = trie.parseText("CaSiNg"); +``` + +Normally, this match would not be found. With the caseInsensitive setting, the entire search text is lowercased +before the matching begins. Therefore it will find exactly one match. Since you still have control of the original +search text and you will know exactly where the match was, you can still utilize the original casing. + +Now, let's tie it all together. 
Say, you have this + License diff --git a/pom.xml b/pom.xml index 0b0557f..f8ab0bf 100644 --- a/pom.xml +++ b/pom.xml @@ -78,8 +78,8 @@ org.apache.maven.plugins maven-compiler-plugin - 1.6 - 1.6 + 1.7 + 1.7 diff --git a/src/main/java/org/ahocorasick/trie/Trie.java b/src/main/java/org/ahocorasick/trie/Trie.java index 7133a4e..1afab18 100644 --- a/src/main/java/org/ahocorasick/trie/Trie.java +++ b/src/main/java/org/ahocorasick/trie/Trie.java @@ -31,6 +31,11 @@ public class Trie { this(new TrieConfig()); } + public Trie caseInsensitive() { + this.trieConfig.setCaseInsensitive(true); + return this; + } + public Trie removeOverlaps() { this.trieConfig.setAllowOverlaps(false); return this; @@ -42,7 +47,6 @@ public class Trie { } public void addKeyword(String keyword) { - State currentState = this.rootState; for (Character character : keyword.toCharArray()) { currentState = currentState.addState(character); @@ -54,6 +58,10 @@ public class Trie { public Collection parseText(String text) { checkForConstructedFailureStates(); + if (trieConfig.isCaseInsensitive()) { + text = text.toLowerCase(); + } + int position = 0; State currentState = this.rootState; List collectedEmits = new ArrayList(); @@ -80,9 +88,9 @@ public class Trie { List removeEmits = new ArrayList(); for (Emit emit : collectedEmits) { if ((emit.getStart() == 0 || - searchText.charAt(emit.getStart() - 1) == ' ') && + !Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) && (emit.getEnd() == size || - searchText.charAt(emit.getEnd() + 1) == ' ')) { + !Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1)))) { continue; } removeEmits.add(emit); diff --git a/src/main/java/org/ahocorasick/trie/TrieConfig.java b/src/main/java/org/ahocorasick/trie/TrieConfig.java index 7dcfd0a..6fa05c7 100644 --- a/src/main/java/org/ahocorasick/trie/TrieConfig.java +++ b/src/main/java/org/ahocorasick/trie/TrieConfig.java @@ -6,6 +6,8 @@ public class TrieConfig { private boolean onlyWholeWords = false; + private 
boolean caseInsensitive = false; + public boolean isAllowOverlaps() { return allowOverlaps; } @@ -22,4 +24,11 @@ public class TrieConfig { this.onlyWholeWords = onlyWholeWords; } + public boolean isCaseInsensitive() { + return caseInsensitive; + } + + public void setCaseInsensitive(boolean caseInsensitive) { + this.caseInsensitive = caseInsensitive; + } } diff --git a/src/test/java/org/ahocorasick/trie/TrieTest.java b/src/test/java/org/ahocorasick/trie/TrieTest.java index f569a5b..16731b6 100644 --- a/src/test/java/org/ahocorasick/trie/TrieTest.java +++ b/src/test/java/org/ahocorasick/trie/TrieTest.java @@ -135,6 +135,22 @@ public class TrieTest { checkEmit(emits.iterator().next(), 20, 24, "sugar"); } + @Test + public void caseInsensitive() { + Trie trie = new Trie().caseInsensitive(); + trie.addKeyword("turning"); + trie.addKeyword("once"); + trie.addKeyword("again"); + trie.addKeyword("börkü"); + Collection emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ"); // left, middle, right test + assertEquals(4, emits.size()); // All four keywords must be matched + Iterator it = emits.iterator(); + checkEmit(it.next(), 0, 6, "turning"); + checkEmit(it.next(), 8, 11, "once"); + checkEmit(it.next(), 13, 17, "again"); + checkEmit(it.next(), 19, 23, "börkü"); + } + private void checkEmit(Emit next, int expectedStart, int expectedEnd, String expectedKeyword) { assertEquals(expectedStart, next.getStart()); assertEquals(expectedEnd, next.getEnd());