Issue #24: big cleanup. Removed all post-processing methods for whole words and non-overlapping sequences, and integrated the same functionality closer to the Aho-Corasick (AC) algorithm.
This commit is contained in:
parent
b274844b75
commit
b42c664796
@ -1,5 +1,7 @@
|
||||
package org.ahocorasick.trie;
|
||||
|
||||
import org.ahocorasick.trie.candidate.EmitCandidateFlushHandler;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
@ -98,10 +100,17 @@ public class State {
|
||||
return this.emits == null ? Collections.<String> emptyList() : this.emits;
|
||||
}
|
||||
|
||||
public State failure() {
|
||||
public State failure(EmitCandidateFlushHandler emitCandidateFlushHandler) {
|
||||
if (emitCandidateFlushHandler != null && this.failure.isRootState()) {
|
||||
emitCandidateFlushHandler.flush();
|
||||
}
|
||||
return this.failure;
|
||||
}
|
||||
|
||||
public State failure() {
|
||||
return failure(null);
|
||||
}
|
||||
|
||||
public void setFailure(State failState) {
|
||||
this.failure = failState;
|
||||
}
|
||||
@ -114,4 +123,8 @@ public class State {
|
||||
return this.success.keySet();
|
||||
}
|
||||
|
||||
public boolean isRootState() {
|
||||
return this.depth == 0;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,13 +1,14 @@
|
||||
package org.ahocorasick.trie;
|
||||
|
||||
import org.ahocorasick.interval.IntervalTree;
|
||||
import org.ahocorasick.interval.Intervalable;
|
||||
import org.ahocorasick.trie.candidate.EmitCandidateFlushHandler;
|
||||
import org.ahocorasick.trie.candidate.EmitCandidateHolder;
|
||||
import org.ahocorasick.trie.candidate.NonOverlappingEmitCandidateHolder;
|
||||
import org.ahocorasick.trie.candidate.OverlappingEmitCandidateHolder;
|
||||
import org.ahocorasick.trie.handler.DefaultEmitHandler;
|
||||
import org.ahocorasick.trie.handler.EmitHandler;
|
||||
import org.ahocorasick.trie.handler.FirstMatchHandler;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Queue;
|
||||
import java.util.concurrent.LinkedBlockingDeque;
|
||||
|
||||
@ -49,23 +50,7 @@ public class Trie {
|
||||
public Collection<Emit> parseText(CharSequence text) {
|
||||
DefaultEmitHandler emitHandler = new DefaultEmitHandler();
|
||||
parseText(text, emitHandler);
|
||||
|
||||
List<Emit> collectedEmits = emitHandler.getEmits();
|
||||
|
||||
if (trieConfig.isOnlyWholeWords()) {
|
||||
removePartialMatches(text, collectedEmits);
|
||||
}
|
||||
|
||||
if (trieConfig.isOnlyWholeWordsWhiteSpaceSeparated()) {
|
||||
removePartialMatchesWhiteSpaceSeparated(text, collectedEmits);
|
||||
}
|
||||
|
||||
if (!trieConfig.isAllowOverlaps()) {
|
||||
IntervalTree intervalTree = new IntervalTree((List<Intervalable>)(List<?>)collectedEmits);
|
||||
intervalTree.removeOverlaps((List<Intervalable>) (List<?>) collectedEmits);
|
||||
}
|
||||
|
||||
return collectedEmits;
|
||||
return emitHandler.getEmits();
|
||||
}
|
||||
|
||||
public boolean containsMatch(CharSequence text) {
|
||||
@ -73,91 +58,56 @@ public class Trie {
|
||||
return firstMatch != null;
|
||||
}
|
||||
|
||||
public Emit firstMatch(CharSequence text) {
|
||||
FirstMatchHandler emitHandler = new FirstMatchHandler();
|
||||
parseText(text, emitHandler);
|
||||
return emitHandler.getFirstMatch();
|
||||
}
|
||||
|
||||
public void parseText(CharSequence text, EmitHandler emitHandler) {
|
||||
|
||||
final EmitCandidateHolder emitCandidateHolder = this.trieConfig.isAllowOverlaps() ?
|
||||
new OverlappingEmitCandidateHolder() :
|
||||
new NonOverlappingEmitCandidateHolder();
|
||||
|
||||
final EmitCandidateFlushHandler flushHandler = new EmitCandidateFlushHandler(emitHandler, emitCandidateHolder);
|
||||
|
||||
State currentState = this.rootState;
|
||||
for (int position = 0; position < text.length(); position++) {
|
||||
|
||||
if (flushHandler.stop()) {
|
||||
return;
|
||||
}
|
||||
|
||||
Character character = text.charAt(position);
|
||||
if (trieConfig.isCaseInsensitive()) {
|
||||
character = Character.toLowerCase(character);
|
||||
}
|
||||
currentState = getState(currentState, character);
|
||||
storeEmits(position, currentState, emitHandler);
|
||||
}
|
||||
currentState = getState(currentState, character, flushHandler);
|
||||
|
||||
}
|
||||
|
||||
public Emit firstMatch(CharSequence text) {
|
||||
if (!trieConfig.isAllowOverlaps()) {
|
||||
// Slow path. Needs to find all the matches to detect overlaps.
|
||||
Collection<Emit> parseText = parseText(text);
|
||||
if (parseText != null && !parseText.isEmpty()) {
|
||||
return parseText.iterator().next();
|
||||
}
|
||||
} else {
|
||||
// Fast path. Returns first match found.
|
||||
State currentState = this.rootState;
|
||||
for (int position = 0; position < text.length(); position++) {
|
||||
Character character = text.charAt(position);
|
||||
if (trieConfig.isCaseInsensitive()) {
|
||||
character = Character.toLowerCase(character);
|
||||
}
|
||||
currentState = getState(currentState, character);
|
||||
Collection<String> emitStrs = currentState.emit();
|
||||
if (emitStrs != null && !emitStrs.isEmpty()) {
|
||||
for (String emitStr : emitStrs) {
|
||||
final Emit emit = new Emit(position - emitStr.length() + 1, position, emitStr);
|
||||
if (trieConfig.isOnlyWholeWords()) {
|
||||
if (!isPartialMatch(text, emit)) {
|
||||
return emit;
|
||||
}
|
||||
} else {
|
||||
return emit;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private boolean isPartialMatch(CharSequence searchText, Emit emit) {
|
||||
return (emit.getStart() != 0 &&
|
||||
Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) ||
|
||||
(emit.getEnd() + 1 != searchText.length() &&
|
||||
Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1)));
|
||||
}
|
||||
|
||||
private void removePartialMatches(CharSequence searchText, List<Emit> collectedEmits) {
|
||||
List<Emit> removeEmits = new ArrayList<>();
|
||||
for (Emit emit : collectedEmits) {
|
||||
if (isPartialMatch(searchText, emit)) {
|
||||
removeEmits.add(emit);
|
||||
}
|
||||
}
|
||||
for (Emit removeEmit : removeEmits) {
|
||||
collectedEmits.remove(removeEmit);
|
||||
}
|
||||
}
|
||||
|
||||
private void removePartialMatchesWhiteSpaceSeparated(CharSequence searchText, List<Emit> collectedEmits) {
|
||||
long size = searchText.length();
|
||||
List<Emit> removeEmits = new ArrayList<>();
|
||||
for (Emit emit : collectedEmits) {
|
||||
if ((emit.getStart() == 0 || Character.isWhitespace(searchText.charAt(emit.getStart() - 1))) &&
|
||||
(emit.getEnd() + 1 == size || Character.isWhitespace(searchText.charAt(emit.getEnd() + 1)))) {
|
||||
continue;
|
||||
Collection<String> emits = currentState.emit();
|
||||
if (emits != null && !emits.isEmpty()) {
|
||||
for (String emit : emits) {
|
||||
int start = position - emit.length() + 1;
|
||||
if (!trieConfig.isOnlyWholeWords() || isWholeWord(text, start, position)) {
|
||||
emitCandidateHolder.addCandidate(new Emit(start, position, emit));
|
||||
}
|
||||
}
|
||||
}
|
||||
removeEmits.add(emit);
|
||||
}
|
||||
for (Emit removeEmit : removeEmits) {
|
||||
collectedEmits.remove(removeEmit);
|
||||
|
||||
}
|
||||
flushHandler.flush();
|
||||
}
|
||||
|
||||
private State getState(State currentState, Character character) {
|
||||
private boolean isWholeWord(CharSequence text, int start, int end) {
|
||||
return (start == 0 || Character.isWhitespace(text.charAt(start - 1))) &&
|
||||
(end == text.length() - 1 || Character.isWhitespace(text.charAt(end + 1)));
|
||||
}
|
||||
|
||||
private State getState(State currentState, Character character, EmitCandidateFlushHandler flushHandler) {
|
||||
State newCurrentState = currentState.nextState(character);
|
||||
while (newCurrentState == null) {
|
||||
currentState = currentState.failure();
|
||||
currentState = currentState.failure(flushHandler);
|
||||
newCurrentState = currentState.nextState(character);
|
||||
}
|
||||
return newCurrentState;
|
||||
@ -191,15 +141,6 @@ public class Trie {
|
||||
}
|
||||
}
|
||||
|
||||
private void storeEmits(int position, State currentState, EmitHandler emitHandler) {
|
||||
Collection<String> emits = currentState.emit();
|
||||
if (emits != null && !emits.isEmpty()) {
|
||||
for (String emit : emits) {
|
||||
emitHandler.emit(new Emit(position - emit.length() + 1, position, emit));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static TrieBuilder builder() {
|
||||
return new TrieBuilder();
|
||||
}
|
||||
@ -227,11 +168,6 @@ public class Trie {
|
||||
return this;
|
||||
}
|
||||
|
||||
public TrieBuilder onlyWholeWordsWhiteSpaceSeparated() {
|
||||
this.trieConfig.setOnlyWholeWordsWhiteSpaceSeparated(true);
|
||||
return this;
|
||||
}
|
||||
|
||||
public TrieBuilder addKeyword(String keyword) {
|
||||
trie.addKeyword(keyword);
|
||||
return this;
|
||||
|
||||
@ -6,8 +6,6 @@ public class TrieConfig {
|
||||
|
||||
private boolean onlyWholeWords = false;
|
||||
|
||||
private boolean onlyWholeWordsWhiteSpaceSeparated = false;
|
||||
|
||||
private boolean caseInsensitive = false;
|
||||
|
||||
public boolean isAllowOverlaps() {
|
||||
@ -26,12 +24,6 @@ public class TrieConfig {
|
||||
this.onlyWholeWords = onlyWholeWords;
|
||||
}
|
||||
|
||||
public boolean isOnlyWholeWordsWhiteSpaceSeparated() { return onlyWholeWordsWhiteSpaceSeparated; }
|
||||
|
||||
public void setOnlyWholeWordsWhiteSpaceSeparated(boolean onlyWholeWordsWhiteSpaceSeparated) {
|
||||
this.onlyWholeWordsWhiteSpaceSeparated = onlyWholeWordsWhiteSpaceSeparated;
|
||||
}
|
||||
|
||||
public boolean isCaseInsensitive() {
|
||||
return caseInsensitive;
|
||||
}
|
||||
|
||||
@ -0,0 +1,27 @@
|
||||
package org.ahocorasick.trie.candidate;
|
||||
|
||||
import org.ahocorasick.trie.Emit;
|
||||
import org.ahocorasick.trie.handler.EmitHandler;
|
||||
|
||||
public class EmitCandidateFlushHandler {
|
||||
|
||||
private final EmitHandler emitHandler;
|
||||
|
||||
private final EmitCandidateHolder emitCandidateHolder;
|
||||
|
||||
public EmitCandidateFlushHandler(EmitHandler emitHandler, EmitCandidateHolder emitCandidateHolder) {
|
||||
this.emitHandler = emitHandler;
|
||||
this.emitCandidateHolder = emitCandidateHolder;
|
||||
}
|
||||
|
||||
public void flush() {
|
||||
for (Emit emit : emitCandidateHolder.flush()) {
|
||||
emitHandler.emit(emit);
|
||||
}
|
||||
}
|
||||
|
||||
public boolean stop() {
|
||||
return emitHandler.stop();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,12 @@
|
||||
package org.ahocorasick.trie.candidate;
|
||||
|
||||
import org.ahocorasick.trie.Emit;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public interface EmitCandidateHolder {
|
||||
|
||||
void addCandidate(Emit emitCandidate);
|
||||
List<Emit> flush();
|
||||
|
||||
}
|
||||
@ -0,0 +1,18 @@
|
||||
package org.ahocorasick.trie.candidate;
|
||||
|
||||
import org.ahocorasick.interval.IntervalTree;
|
||||
import org.ahocorasick.interval.Intervalable;
|
||||
import org.ahocorasick.trie.Emit;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
public class NonOverlappingEmitCandidateHolder extends OverlappingEmitCandidateHolder {
|
||||
|
||||
@Override
|
||||
public List<Emit> flush() {
|
||||
IntervalTree intervalTree = new IntervalTree((List<Intervalable>)(List<?>)emitCandidates);
|
||||
intervalTree.removeOverlaps((List<Intervalable>) (List<?>) emitCandidates);
|
||||
return super.flush();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,28 @@
|
||||
package org.ahocorasick.trie.candidate;
|
||||
|
||||
import org.ahocorasick.trie.Emit;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
public class OverlappingEmitCandidateHolder implements EmitCandidateHolder {
|
||||
|
||||
protected List<Emit> emitCandidates = new ArrayList<>();
|
||||
|
||||
@Override
|
||||
public void addCandidate(Emit emitCandidate) {
|
||||
this.emitCandidates.add(emitCandidate);
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<Emit> flush() {
|
||||
return reset(emitCandidates);
|
||||
}
|
||||
|
||||
private List<Emit> reset(List<Emit> emitCandidates) {
|
||||
this.emitCandidates = new ArrayList<>();
|
||||
return emitCandidates;
|
||||
}
|
||||
|
||||
}
|
||||
@ -5,7 +5,7 @@ import org.ahocorasick.trie.Emit;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
public class DefaultEmitHandler implements EmitHandler {
|
||||
public class DefaultEmitHandler extends SimpleEmitHandler {
|
||||
|
||||
private List<Emit> emits = new ArrayList<>();
|
||||
|
||||
|
||||
@ -3,5 +3,16 @@ package org.ahocorasick.trie.handler;
|
||||
import org.ahocorasick.trie.Emit;
|
||||
|
||||
public interface EmitHandler {
|
||||
|
||||
/**
|
||||
* Callback handler that deals with an emit it gets from the parser
|
||||
* @param emit the current emit that must be dealt with
|
||||
*/
|
||||
void emit(Emit emit);
|
||||
|
||||
/**
|
||||
* Force the parse process to stop
|
||||
* @return true if the process must stop
|
||||
*/
|
||||
boolean stop();
|
||||
}
|
||||
|
||||
@ -0,0 +1,28 @@
|
||||
package org.ahocorasick.trie.handler;
|
||||
|
||||
import org.ahocorasick.trie.Emit;
|
||||
|
||||
public class FirstMatchHandler extends SimpleEmitHandler {
|
||||
|
||||
private Emit firstMatch;
|
||||
|
||||
private boolean stop = false;
|
||||
|
||||
@Override
|
||||
public void emit(Emit emit) {
|
||||
if (!stop) {
|
||||
firstMatch = emit;
|
||||
stop = true;
|
||||
}
|
||||
}
|
||||
|
||||
public Emit getFirstMatch() {
|
||||
return firstMatch;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean stop() {
|
||||
return this.stop;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,14 @@
|
||||
package org.ahocorasick.trie.handler;
|
||||
|
||||
import org.ahocorasick.trie.Emit;
|
||||
|
||||
public abstract class SimpleEmitHandler implements EmitHandler {
|
||||
|
||||
@Override
|
||||
public abstract void emit(Emit emit);
|
||||
|
||||
@Override
|
||||
public boolean stop() {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@ -1,6 +1,7 @@
|
||||
package org.ahocorasick.trie;
|
||||
|
||||
import org.ahocorasick.trie.handler.EmitHandler;
|
||||
import org.ahocorasick.trie.handler.SimpleEmitHandler;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
@ -129,7 +130,7 @@ public class TrieTest {
|
||||
.build();
|
||||
|
||||
final List<Emit> emits = new ArrayList<>();
|
||||
EmitHandler emitHandler = new EmitHandler() {
|
||||
EmitHandler emitHandler = new SimpleEmitHandler() {
|
||||
|
||||
@Override
|
||||
public void emit(Emit emit) {
|
||||
@ -209,6 +210,51 @@ public class TrieTest {
|
||||
checkEmit(iterator.next(), 2, 9, "hehehehe");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void nonOverlappingWholeWords() {
|
||||
Trie trie = Trie.builder()
|
||||
.removeOverlaps()
|
||||
.onlyWholeWords()
|
||||
.addKeyword("peper molen")
|
||||
.addKeyword("molen wiel")
|
||||
.addKeyword("wiel dop")
|
||||
.addKeyword("dop")
|
||||
.build();
|
||||
Collection<Emit> emits = trie.parseText("peper molen wiel dop xwiel dop wiel dopx wiel dop");
|
||||
assertEquals(4, emits.size());
|
||||
Iterator<Emit> iterator = emits.iterator();
|
||||
checkEmit(iterator.next(), 0, 10, "peper molen");
|
||||
checkEmit(iterator.next(), 12, 19, "wiel dop");
|
||||
checkEmit(iterator.next(), 27, 29, "dop");
|
||||
checkEmit(iterator.next(), 41, 48, "wiel dop");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void nonOverlappingWholeWordsWithCustomEmitHandler() {
|
||||
Trie trie = Trie.builder()
|
||||
.removeOverlaps()
|
||||
.onlyWholeWords()
|
||||
.addKeyword("peper molen")
|
||||
.addKeyword("molen wiel")
|
||||
.addKeyword("wiel dop")
|
||||
.addKeyword("dop")
|
||||
.build();
|
||||
final List<Emit> emits = new ArrayList<>();
|
||||
EmitHandler emitHandler = new SimpleEmitHandler() {
|
||||
@Override
|
||||
public void emit(Emit emit) {
|
||||
emits.add(emit);
|
||||
}
|
||||
};
|
||||
trie.parseText("peper molen wiel dop xwiel dop wiel dopx wiel dop", emitHandler);
|
||||
assertEquals(4, emits.size());
|
||||
Iterator<Emit> iterator = emits.iterator();
|
||||
checkEmit(iterator.next(), 0, 10, "peper molen");
|
||||
checkEmit(iterator.next(), 12, 19, "wiel dop");
|
||||
checkEmit(iterator.next(), 27, 29, "dop");
|
||||
checkEmit(iterator.next(), 41, 48, "wiel dop");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void nonOverlapping() {
|
||||
Trie trie = Trie.builder().removeOverlaps()
|
||||
@ -402,7 +448,7 @@ public class TrieTest {
|
||||
@Test
|
||||
public void partialMatchWhiteSpaces() {
|
||||
Trie trie = Trie.builder()
|
||||
.onlyWholeWordsWhiteSpaceSeparated()
|
||||
.onlyWholeWords()
|
||||
.addKeyword("#sugar-123")
|
||||
.build();
|
||||
Collection < Emit > emits = trie.parseText("#sugar-123 #sugar-1234"); // left, middle, right test
|
||||
|
||||
@ -0,0 +1,21 @@
|
||||
package org.ahocorasick.trie.candidate;
|
||||
|
||||
import org.ahocorasick.trie.Emit;
|
||||
import org.ahocorasick.trie.handler.EmitHandler;
|
||||
import org.ahocorasick.trie.handler.FirstMatchHandler;
|
||||
import org.junit.Test;
|
||||
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
public class EmitCandidateFlushHandlerTest {
|
||||
|
||||
@Test
|
||||
public void stop() {
|
||||
EmitHandler emitHandler = new FirstMatchHandler();
|
||||
EmitCandidateFlushHandler flushHandler = new EmitCandidateFlushHandler(emitHandler, null);
|
||||
assertFalse(flushHandler.stop());
|
||||
emitHandler.emit(new Emit(0, 2, "bla"));
|
||||
assertTrue(flushHandler.stop());
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,35 @@
|
||||
package org.ahocorasick.trie.candidate;
|
||||
|
||||
import org.ahocorasick.trie.Emit;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
public class NonOverlappingEmitCandidateHolderTest {
|
||||
|
||||
@Test
|
||||
public void retainLongestEmit() {
|
||||
EmitCandidateHolder holder = new NonOverlappingEmitCandidateHolder();
|
||||
holder.addCandidate(new Emit(0, 2, "she"));
|
||||
holder.addCandidate(new Emit(1, 2, "he"));
|
||||
List<Emit> emits = holder.flush();
|
||||
assertEquals(1, emits.size());
|
||||
assertEquals("she", emits.get(0).getKeyword());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void multipleOverlaps() {
|
||||
EmitCandidateHolder holder = new NonOverlappingEmitCandidateHolder();
|
||||
holder.addCandidate(new Emit(0, 4, "ababc"));
|
||||
holder.addCandidate(new Emit(4, 6, "cba"));
|
||||
holder.addCandidate(new Emit(6, 7, "ab"));
|
||||
List<Emit> emits = holder.flush();
|
||||
assertEquals(2, emits.size());
|
||||
assertEquals("ababc", emits.get(0).getKeyword());
|
||||
assertEquals("ab", emits.get(1).getKeyword());
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,22 @@
|
||||
package org.ahocorasick.trie.candidate;
|
||||
|
||||
import org.ahocorasick.trie.Emit;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
public class OverlappingEmitCandidateHolderTest {
|
||||
|
||||
@Test
|
||||
public void addAndFlush() {
|
||||
EmitCandidateHolder holder = new OverlappingEmitCandidateHolder();
|
||||
holder.addCandidate(new Emit(0, 2, "ABC"));
|
||||
holder.addCandidate(new Emit(2, 4, "CDE"));
|
||||
List<Emit> emits = holder.flush();
|
||||
assertEquals(2, emits.size());
|
||||
assertEquals("ABC", emits.get(0).getKeyword());
|
||||
assertEquals("CDE", emits.get(1).getKeyword());
|
||||
}
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user