Issue #3 added case insensitivity when matching keywords

This commit is contained in:
robert-bor 2014-02-01 21:04:53 +01:00
parent cb44a6bff2
commit ae20429936
5 changed files with 53 additions and 5 deletions

View File

@ -90,6 +90,21 @@ If you want the algorithm to only check for whole words, you can tell the Trie t
In this case, it will only find one match, whereas it would normally find four. The sugarcane/canesugar words
are discarded because they are partial matches.
Some text is WrItTeN in a combination of lowercase and uppercase and is therefore hard to identify. You can instruct
the Trie to lowercase the entire search text to ease the matching process.
```java
Trie trie = new Trie().caseInsensitive();
trie.addKeyword("casing");
Collection<Emit> emits = trie.parseText("CaSiNg");
```
Normally, this match would not be found. With the caseInsensitive setting, the entire search text is lowercased
before the matching begins. Therefore, it will find exactly one match. Since you still have the original
search text and know exactly where the match was, you can still make use of the original casing.
Now, let's tie it all together. Say, you have this
License

View File

@ -78,8 +78,8 @@
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>1.6</source>
<target>1.6</target>
<source>1.7</source>
<target>1.7</target>
</configuration>
</plugin>

View File

@ -31,6 +31,11 @@ public class Trie {
this(new TrieConfig());
}
/**
 * Switches this trie to case-insensitive matching by setting the corresponding
 * flag on its configuration. When the flag is set, parseText lowercases the
 * search text before matching.
 *
 * NOTE(review): the visible part of addKeyword does not lowercase keywords, so
 * keywords presumably need to be added in lowercase to match - confirm.
 *
 * @return this trie, so that configuration calls can be chained
 */
public Trie caseInsensitive() {
this.trieConfig.setCaseInsensitive(true);
return this;
}
public Trie removeOverlaps() {
this.trieConfig.setAllowOverlaps(false);
return this;
@ -42,7 +47,6 @@ public class Trie {
}
public void addKeyword(String keyword) {
State currentState = this.rootState;
for (Character character : keyword.toCharArray()) {
currentState = currentState.addState(character);
@ -54,6 +58,10 @@ public class Trie {
public Collection<Emit> parseText(String text) {
checkForConstructedFailureStates();
if (trieConfig.isCaseInsensitive()) {
text = text.toLowerCase();
}
int position = 0;
State currentState = this.rootState;
List<Emit> collectedEmits = new ArrayList<Emit>();
@ -80,9 +88,9 @@ public class Trie {
List<Emit> removeEmits = new ArrayList<Emit>();
for (Emit emit : collectedEmits) {
if ((emit.getStart() == 0 ||
searchText.charAt(emit.getStart() - 1) == ' ') &&
!Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) &&
(emit.getEnd() == size ||
searchText.charAt(emit.getEnd() + 1) == ' ')) {
!Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1)))) {
continue;
}
removeEmits.add(emit);

View File

@ -6,6 +6,8 @@ public class TrieConfig {
private boolean onlyWholeWords = false;
private boolean caseInsensitive = false;
/**
 * Returns the current value of the allowOverlaps setting.
 *
 * @return the allowOverlaps flag
 */
public boolean isAllowOverlaps() {
return allowOverlaps;
}
@ -22,4 +24,11 @@ public class TrieConfig {
this.onlyWholeWords = onlyWholeWords;
}
/**
 * Returns whether case-insensitive matching has been requested.
 *
 * @return the caseInsensitive flag; {@code false} unless it has been set
 */
public boolean isCaseInsensitive() {
return caseInsensitive;
}
/**
 * Sets the caseInsensitive flag.
 *
 * @param caseInsensitive {@code true} to request case-insensitive matching,
 *                        {@code false} to match case-sensitively
 */
public void setCaseInsensitive(boolean caseInsensitive) {
this.caseInsensitive = caseInsensitive;
}
}

View File

@ -135,6 +135,22 @@ public class TrieTest {
checkEmit(emits.iterator().next(), 20, 24, "sugar");
}
/**
 * Verifies that a case-insensitive trie finds keywords that were added in
 * lowercase inside a mixed-case search text, including a keyword with
 * non-ASCII letters, and checks the start index, end index and keyword of
 * each emit in iteration order.
 */
@Test
public void caseInsensitive() {
Trie trie = new Trie().caseInsensitive();
trie.addKeyword("turning");
trie.addKeyword("once");
trie.addKeyword("again");
trie.addKeyword("börkü");
Collection<Emit> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ"); // mixed-case text; keywords were added in lowercase
assertEquals(4, emits.size()); // every keyword must be matched despite the differing case
Iterator<Emit> it = emits.iterator();
// Expected positions are indexes into the search text; the end index is that of the keyword's last character.
checkEmit(it.next(), 0, 6, "turning");
checkEmit(it.next(), 8, 11, "once");
checkEmit(it.next(), 13, 17, "again");
checkEmit(it.next(), 19, 23, "börkü");
}
private void checkEmit(Emit next, int expectedStart, int expectedEnd, String expectedKeyword) {
assertEquals(expectedStart, next.getStart());
assertEquals(expectedEnd, next.getEnd());