Compare commits
1 Commits
master
...
issue40_ma
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b1c1ad4d71 |
@ -32,13 +32,10 @@ public class Trie {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
State currentState = this.rootState;
|
State currentState = this.rootState;
|
||||||
for (Character character : keyword.toCharArray()) {
|
String caseAdjustedKeyword = adjustCase(keyword).toString();
|
||||||
if (trieConfig.isCaseInsensitive()) {
|
for (Character character : caseAdjustedKeyword.toCharArray())
|
||||||
character = Character.toLowerCase(character);
|
|
||||||
}
|
|
||||||
currentState = currentState.addState(character);
|
currentState = currentState.addState(character);
|
||||||
}
|
currentState.addEmit(caseAdjustedKeyword);
|
||||||
currentState.addEmit(trieConfig.isCaseInsensitive() ? keyword.toLowerCase() : keyword);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public Collection<Token> tokenize(String text) {
|
public Collection<Token> tokenize(String text) {
|
||||||
@ -85,25 +82,23 @@ public class Trie {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (!trieConfig.isAllowOverlaps()) {
|
if (!trieConfig.isAllowOverlaps()) {
|
||||||
IntervalTree intervalTree = new IntervalTree((List<Intervalable>)(List<?>)collectedEmits);
|
IntervalTree intervalTree = new IntervalTree((List<Intervalable>) (List<?>) collectedEmits);
|
||||||
intervalTree.removeOverlaps((List<Intervalable>) (List<?>) collectedEmits);
|
intervalTree.removeOverlaps((List<Intervalable>) (List<?>) collectedEmits);
|
||||||
}
|
}
|
||||||
|
|
||||||
return collectedEmits;
|
return collectedEmits;
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean containsMatch(CharSequence text) {
|
public boolean containsMatch(CharSequence text) {
|
||||||
Emit firstMatch = firstMatch(text);
|
Emit firstMatch = firstMatch(text);
|
||||||
return firstMatch != null;
|
return firstMatch != null;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void parseText(CharSequence text, EmitHandler emitHandler) {
|
public void parseText(CharSequence text, EmitHandler emitHandler) {
|
||||||
State currentState = this.rootState;
|
State currentState = this.rootState;
|
||||||
for (int position = 0; position < text.length(); position++) {
|
CharSequence caseAdjustedText = adjustCase(text);
|
||||||
Character character = text.charAt(position);
|
for (int position = 0; position < caseAdjustedText.length(); position++) {
|
||||||
if (trieConfig.isCaseInsensitive()) {
|
Character character = caseAdjustedText.charAt(position);
|
||||||
character = Character.toLowerCase(character);
|
|
||||||
}
|
|
||||||
currentState = getState(currentState, character);
|
currentState = getState(currentState, character);
|
||||||
if (storeEmits(position, currentState, emitHandler) && trieConfig.isStopOnHit()) {
|
if (storeEmits(position, currentState, emitHandler) && trieConfig.isStopOnHit()) {
|
||||||
return;
|
return;
|
||||||
@ -112,46 +107,44 @@ public class Trie {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public Emit firstMatch(CharSequence text) {
|
public Emit firstMatch(CharSequence text) {
|
||||||
if (!trieConfig.isAllowOverlaps()) {
|
if (!trieConfig.isAllowOverlaps()) {
|
||||||
// Slow path. Needs to find all the matches to detect overlaps.
|
// Slow path. Needs to find all the matches to detect overlaps.
|
||||||
Collection<Emit> parseText = parseText(text);
|
Collection<Emit> parseText = parseText(text);
|
||||||
if (parseText != null && !parseText.isEmpty()) {
|
if (parseText != null && !parseText.isEmpty()) {
|
||||||
return parseText.iterator().next();
|
return parseText.iterator().next();
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// Fast path. Returns first match found.
|
// Fast path. Returns first match found.
|
||||||
State currentState = this.rootState;
|
State currentState = this.rootState;
|
||||||
for (int position = 0; position < text.length(); position++) {
|
CharSequence caseAdjustedText = adjustCase(text);
|
||||||
Character character = text.charAt(position);
|
for (int position = 0; position < caseAdjustedText.length(); position++) {
|
||||||
if (trieConfig.isCaseInsensitive()) {
|
Character character = caseAdjustedText.charAt(position);
|
||||||
character = Character.toLowerCase(character);
|
currentState = getState(currentState, character);
|
||||||
}
|
Collection<String> emitStrs = currentState.emit();
|
||||||
currentState = getState(currentState, character);
|
if (emitStrs != null && !emitStrs.isEmpty()) {
|
||||||
Collection<String> emitStrs = currentState.emit();
|
for (String emitStr : emitStrs) {
|
||||||
if (emitStrs != null && !emitStrs.isEmpty()) {
|
final Emit emit = new Emit(position - emitStr.length() + 1, position, emitStr);
|
||||||
for (String emitStr : emitStrs) {
|
if (trieConfig.isOnlyWholeWords()) {
|
||||||
final Emit emit = new Emit(position - emitStr.length() + 1, position, emitStr);
|
if (!isPartialMatch(text, emit)) {
|
||||||
if (trieConfig.isOnlyWholeWords()) {
|
return emit;
|
||||||
if (!isPartialMatch(text, emit)) {
|
}
|
||||||
return emit;
|
} else {
|
||||||
}
|
return emit;
|
||||||
} else {
|
}
|
||||||
return emit;
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
return null;
|
||||||
}
|
}
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean isPartialMatch(CharSequence searchText, Emit emit) {
|
private boolean isPartialMatch(CharSequence searchText, Emit emit) {
|
||||||
return (emit.getStart() != 0 &&
|
return (emit.getStart() != 0 &&
|
||||||
Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) ||
|
Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) ||
|
||||||
(emit.getEnd() + 1 != searchText.length() &&
|
(emit.getEnd() + 1 != searchText.length() &&
|
||||||
Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1)));
|
Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1)));
|
||||||
}
|
}
|
||||||
|
|
||||||
private void removePartialMatches(CharSequence searchText, List<Emit> collectedEmits) {
|
private void removePartialMatches(CharSequence searchText, List<Emit> collectedEmits) {
|
||||||
List<Emit> removeEmits = new ArrayList<>();
|
List<Emit> removeEmits = new ArrayList<>();
|
||||||
@ -229,6 +222,17 @@ public class Trie {
|
|||||||
return emitted;
|
return emitted;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private CharSequence adjustCase(CharSequence text) {
|
||||||
|
if(trieConfig.isCaseInsensitive()){
|
||||||
|
char[] textChars = text.toString().toCharArray();
|
||||||
|
char[] adjustedTextChars = new char[textChars.length];
|
||||||
|
for(int i = 0; i < textChars.length; i++)
|
||||||
|
adjustedTextChars[i] = Character.toLowerCase(textChars[i]);
|
||||||
|
return new String(adjustedTextChars);
|
||||||
|
}
|
||||||
|
return text;
|
||||||
|
}
|
||||||
|
|
||||||
public static TrieBuilder builder() {
|
public static TrieBuilder builder() {
|
||||||
return new TrieBuilder();
|
return new TrieBuilder();
|
||||||
}
|
}
|
||||||
|
|||||||
@ -415,6 +415,23 @@ public class TrieTest {
|
|||||||
checkEmit(firstMatch, 5, 8, "this");
|
checkEmit(firstMatch, 5, 8, "this");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void unicodeIssueBug39ReportedByHumanzz(){
|
||||||
|
// Problem: "İ".length() => 1, but "İ".toLowerCase().length() => 2. This causes all sorts of unexpected behaviors
|
||||||
|
// and bugs where the Emit will have a size different from the original string.
|
||||||
|
// Solution: as in issue #8, convert at the character level (Character.toLowerCase('İ') => 'i') and make sure
|
||||||
|
// that emit gets the properly cased keyword.
|
||||||
|
String upperLengthOne = "İnt";
|
||||||
|
Trie trie = Trie.builder()
|
||||||
|
.caseInsensitive()
|
||||||
|
.onlyWholeWords()
|
||||||
|
.addKeyword(upperLengthOne)
|
||||||
|
.build();
|
||||||
|
Collection<Emit> emits = trie.parseText("İnt is good");
|
||||||
|
assertEquals(1, emits.size());
|
||||||
|
checkEmit(emits.iterator().next(), 0, 2, "int");
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void partialMatchWhiteSpaces() {
|
public void partialMatchWhiteSpaces() {
|
||||||
Trie trie = Trie.builder()
|
Trie trie = Trie.builder()
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user