Issue #3 added case insensitivity when matching keywords
This commit is contained in:
parent
cb44a6bff2
commit
ae20429936
15
README.md
15
README.md
@ -90,6 +90,21 @@ If you want the algorithm to only check for whole words, you can tell the Trie t
|
||||
In this case, it will only find one match, whereas it would normally find four. The sugarcane/canesugar words
|
||||
are discarded because they are partial matches.
|
||||
|
||||
Some text is WrItTeN in a combination of lowercase and uppercase and is therefore hard to identify. You can instruct
|
||||
the Trie to lowercase the entire search text to ease the matching process.
|
||||
|
||||
```java
|
||||
Trie trie = new Trie().caseInsensitive();
|
||||
trie.addKeyword("casing");
|
||||
Collection<Emit> emits = trie.parseText("CaSiNg");
|
||||
```
|
||||
|
||||
Normally, this match would not be found. With the caseInsensitive setting, the entire search text is lowercased
|
||||
before the matching begins. Therefore it will find exactly one match. Since you still have control of the original
|
||||
search text and you will know exactly where the match was, you can still utilize the original casing.
|
||||
|
||||
Now, let's tie it all together. Say, you have this
|
||||
|
||||
|
||||
|
||||
License
|
||||
|
||||
4
pom.xml
4
pom.xml
@ -78,8 +78,8 @@
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-compiler-plugin</artifactId>
|
||||
<configuration>
|
||||
<source>1.6</source>
|
||||
<target>1.6</target>
|
||||
<source>1.7</source>
|
||||
<target>1.7</target>
|
||||
</configuration>
|
||||
</plugin>
|
||||
|
||||
|
||||
@ -31,6 +31,11 @@ public class Trie {
|
||||
this(new TrieConfig());
|
||||
}
|
||||
|
||||
/**
 * Turns on the case-insensitive flag of this trie's configuration.
 * When the flag is set, parseText lowercases the search text before matching.
 * NOTE(review): the visible part of addKeyword does not lowercase keywords, so
 * keywords presumably have to be added in lowercase to match - confirm.
 *
 * @return this trie, so that configuration calls can be chained
 */
public Trie caseInsensitive() {
|
||||
this.trieConfig.setCaseInsensitive(true);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Trie removeOverlaps() {
|
||||
this.trieConfig.setAllowOverlaps(false);
|
||||
return this;
|
||||
@ -42,7 +47,6 @@ public class Trie {
|
||||
}
|
||||
|
||||
public void addKeyword(String keyword) {
|
||||
|
||||
State currentState = this.rootState;
|
||||
for (Character character : keyword.toCharArray()) {
|
||||
currentState = currentState.addState(character);
|
||||
@ -54,6 +58,10 @@ public class Trie {
|
||||
public Collection<Emit> parseText(String text) {
|
||||
checkForConstructedFailureStates();
|
||||
|
||||
if (trieConfig.isCaseInsensitive()) {
|
||||
text = text.toLowerCase();
|
||||
}
|
||||
|
||||
int position = 0;
|
||||
State currentState = this.rootState;
|
||||
List<Emit> collectedEmits = new ArrayList<Emit>();
|
||||
@ -80,9 +88,9 @@ public class Trie {
|
||||
List<Emit> removeEmits = new ArrayList<Emit>();
|
||||
for (Emit emit : collectedEmits) {
|
||||
if ((emit.getStart() == 0 ||
|
||||
searchText.charAt(emit.getStart() - 1) == ' ') &&
|
||||
!Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) &&
|
||||
(emit.getEnd() == size ||
|
||||
searchText.charAt(emit.getEnd() + 1) == ' ')) {
|
||||
!Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1)))) {
|
||||
continue;
|
||||
}
|
||||
removeEmits.add(emit);
|
||||
|
||||
@ -6,6 +6,8 @@ public class TrieConfig {
|
||||
|
||||
private boolean onlyWholeWords = false;
|
||||
|
||||
private boolean caseInsensitive = false;
|
||||
|
||||
/**
 * Reports the current value of the allowOverlaps setting.
 *
 * @return the value of the allowOverlaps field
 */
public boolean isAllowOverlaps() {
|
||||
return allowOverlaps;
|
||||
}
|
||||
@ -22,4 +24,11 @@ public class TrieConfig {
|
||||
this.onlyWholeWords = onlyWholeWords;
|
||||
}
|
||||
|
||||
/**
 * Reports whether case-insensitive matching has been requested.
 * The field defaults to false.
 *
 * @return the value of the caseInsensitive field
 */
public boolean isCaseInsensitive() {
|
||||
return caseInsensitive;
|
||||
}
|
||||
|
||||
/**
 * Stores the case-insensitive setting.
 *
 * @param caseInsensitive new value for the caseInsensitive field
 */
public void setCaseInsensitive(boolean caseInsensitive) {
|
||||
this.caseInsensitive = caseInsensitive;
|
||||
}
|
||||
}
|
||||
|
||||
@ -135,6 +135,22 @@ public class TrieTest {
|
||||
checkEmit(emits.iterator().next(), 20, 24, "sugar");
|
||||
}
|
||||
|
||||
/**
 * Verifies that a case-insensitive trie finds lowercase keywords in mixed-case text.
 * Four lowercase keywords (one containing non-ASCII letters) are added, and all four
 * must be emitted, in text order, with start/end indexes into the search text.
 */
@Test
|
||||
public void caseInsensitive() {
|
||||
Trie trie = new Trie().caseInsensitive();
|
||||
trie.addKeyword("turning");
|
||||
trie.addKeyword("once");
|
||||
trie.addKeyword("again");
|
||||
trie.addKeyword("börkü");
|
||||
Collection<Emit> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ"); // mixed-case text containing every keyword
|
||||
assertEquals(4, emits.size()); // every keyword must be matched despite the differing case
|
||||
Iterator<Emit> it = emits.iterator();
|
||||
// Expected start and end values are inclusive character indexes into the search text.
|
||||
checkEmit(it.next(), 0, 6, "turning");
|
||||
checkEmit(it.next(), 8, 11, "once");
|
||||
checkEmit(it.next(), 13, 17, "again");
|
||||
checkEmit(it.next(), 19, 23, "börkü");
|
||||
}
|
||||
|
||||
private void checkEmit(Emit next, int expectedStart, int expectedEnd, String expectedKeyword) {
|
||||
assertEquals(expectedStart, next.getStart());
|
||||
assertEquals(expectedEnd, next.getEnd());
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user