Fixed formatting changes.

This commit is contained in:
ryan 2014-10-06 11:02:01 -07:00
parent df503bae43
commit a46e7dfe1d

View File

@ -12,120 +12,107 @@ import java.util.concurrent.LinkedBlockingDeque;
/**
*
* Based on the Aho-Corasick white paper, Bell technologies:
* ftp://163.13.200.222/assistant/bearhero/prog/%A8%E4%A5%A6/ac_bm.pdf
*
* ftp://163.13.200.222/assistant/bearhero/prog/%A8%E4%A5%A6/ac_bm.pdf
* @author Robert Bor
*/
public class Trie
{
public class Trie {
/** Matching options consulted while searching (case folding, overlap handling, whole-word filtering). */
private TrieConfig trieConfig;

/** Entry state of the automaton; every keyword path and every search starts here. */
private State rootState;

/** Becomes {@code true} once the failure links have been built; starts out {@code false}. */
private boolean failureStatesConstructed;
/**
 * Creates an empty trie that searches according to the given configuration.
 *
 * @param trieConfig configuration to use; it is stored by reference, not copied
 */
public Trie(TrieConfig trieConfig) {
    this.rootState = new State();
    this.trieConfig = trieConfig;
}
/**
 * Creates an empty trie backed by a freshly constructed {@link TrieConfig}.
 */
public Trie() {
    this(new TrieConfig());
}
/**
 * Switches this trie's configuration to case-insensitive matching.
 *
 * @return this trie, so that configuration calls can be chained
 */
public Trie caseInsensitive() {
    trieConfig.setCaseInsensitive(true);
    return this;
}
/**
 * Configures this trie to disallow overlapping matches; {@link #parseText(String)}
 * then filters overlapping emits out of its result.
 *
 * @return this trie, so that configuration calls can be chained
 */
public Trie removeOverlaps() {
    trieConfig.setAllowOverlaps(false);
    return this;
}
/**
 * Configures this trie to report whole-word matches only; partial matches are
 * dropped from search results.
 *
 * @return this trie, so that configuration calls can be chained
 */
public Trie onlyWholeWords() {
    trieConfig.setOnlyWholeWords(true);
    return this;
}
/**
 * Registers a keyword so that later searches can find it.
 *
 * <p>A {@code null} or empty keyword is silently ignored. When the trie is configured as
 * case-insensitive at the time of this call, the keyword's characters are lower-cased before
 * being stored, because searches lower-case the text they scan. Without that folding a keyword
 * containing upper-case letters could never match. Keywords added before
 * {@link #caseInsensitive()} is called are stored unchanged.
 *
 * <p>NOTE(review): this method does not reset the failure-state flag, so keywords added after
 * the first search are presumably not wired into the failure links; confirm with callers.
 *
 * @param keyword the keyword to add; reported in emits exactly as supplied here
 */
public void addKeyword(String keyword) {
    if (keyword == null || keyword.isEmpty()) {
        return;
    }
    // Hoisted: the configuration does not change while a single keyword is inserted.
    boolean foldCase = this.trieConfig.isCaseInsensitive();
    State currentState = this.rootState;
    for (int i = 0; i < keyword.length(); i++) {
        char character = keyword.charAt(i);
        if (foldCase) {
            // Mirror the folding applied to the searched text so the paths line up.
            character = Character.toLowerCase(character);
        }
        currentState = currentState.addState(character);
    }
    // Keep the caller's spelling in the emit; folding never changes the length, so
    // positions derived from the keyword length stay correct.
    currentState.addEmit(keyword);
}
public Collection<Token> tokenize(String text)
{
public Collection<Token> tokenize(String text) {
Collection<Token> tokens = new ArrayList<Token>();
Collection<Token> tokens = new ArrayList<Token>();
Collection<Emit> collectedEmits = parseText(text);
int lastCollectedPosition = -1;
for (Emit emit : collectedEmits) {
if (emit.getStart() - lastCollectedPosition > 1) {
tokens.add(createFragment(emit, text, lastCollectedPosition));
}
tokens.add(createMatch(emit, text));
lastCollectedPosition = emit.getEnd();
}
if (text.length() - lastCollectedPosition > 1) {
tokens.add(createFragment(null, text, lastCollectedPosition));
}
Collection<Emit> collectedEmits = parseText(text);
int lastCollectedPosition = -1;
for (Emit emit : collectedEmits) {
if (emit.getStart() - lastCollectedPosition > 1) {
tokens.add(createFragment(emit, text, lastCollectedPosition));
}
tokens.add(createMatch(emit, text));
lastCollectedPosition = emit.getEnd();
}
if (text.length() - lastCollectedPosition > 1) {
tokens.add(createFragment(null, text, lastCollectedPosition));
}
return tokens;
}
return tokens;
}
/**
 * Builds a fragment token for the unmatched text that follows the last collected position.
 *
 * @param emit the next match, whose start bounds the fragment, or {@code null} to run to the end of the text
 * @param text the full text being tokenized
 * @param lastCollectedPosition index of the last character already covered by a token
 * @return a fragment token holding the uncovered substring
 */
private Token createFragment(Emit emit, String text, int lastCollectedPosition) {
    int from = lastCollectedPosition + 1;
    int to;
    if (emit == null) {
        to = text.length();
    } else {
        to = emit.getStart();
    }
    return new FragmentToken(text.substring(from, to));
}
/**
 * Builds a match token for the given emit.
 *
 * @param emit the match; its end index is inclusive
 * @param text the full text being tokenized
 * @return a match token holding the matched substring together with the emit
 */
private Token createMatch(Emit emit, String text) {
    String matched = text.substring(emit.getStart(), emit.getEnd() + 1);
    return new MatchToken(matched, emit);
}
@SuppressWarnings("unchecked")
public Collection<Emit> parseText(String text)
{
checkForConstructedFailureStates();
@SuppressWarnings("unchecked")
public Collection<Emit> parseText(String text) {
checkForConstructedFailureStates();
int position = 0;
State currentState = this.rootState;
List<Emit> collectedEmits = new ArrayList<Emit>();
for (Character character : text.toCharArray()) {
if (trieConfig.isCaseInsensitive()) {
character = Character.toLowerCase(character);
}
currentState = getState(currentState, character);
storeEmits(position, currentState, collectedEmits);
position++;
}
int position = 0;
State currentState = this.rootState;
List<Emit> collectedEmits = new ArrayList<Emit>();
for (Character character : text.toCharArray()) {
if (trieConfig.isCaseInsensitive()) {
character = Character.toLowerCase(character);
}
currentState = getState(currentState, character);
storeEmits(position, currentState, collectedEmits);
position++;
}
if (trieConfig.isOnlyWholeWords()) {
removePartialMatches(text, collectedEmits);
}
if (trieConfig.isOnlyWholeWords()) {
removePartialMatches(text, collectedEmits);
}
if (!trieConfig.isAllowOverlaps()) {
IntervalTree intervalTree = new IntervalTree((List<Intervalable>) (List<?>) collectedEmits);
intervalTree.removeOverlaps((List<Intervalable>) (List<?>) collectedEmits);
}
if (!trieConfig.isAllowOverlaps()) {
IntervalTree intervalTree = new IntervalTree((List<Intervalable>)(List<?>)collectedEmits);
intervalTree.removeOverlaps((List<Intervalable>) (List<?>) collectedEmits);
}
return collectedEmits;
}
return collectedEmits;
}
public boolean matches(String text)
{
@ -135,7 +122,6 @@ public class Trie
public Emit firstMatch(String text)
{
if (!trieConfig.isAllowOverlaps()) {
// Slow path. Needs to find all the matches to detect overlaps.
Collection<Emit> parseText = parseText(text);
@ -143,10 +129,8 @@ public class Trie
return parseText.iterator().next();
}
} else {
// Fast path. Returns first match found.
// Fast path. Returns first match found.
checkForConstructedFailureStates();
int position = 0;
State currentState = this.rootState;
for (Character character : text.toCharArray()) {
@ -154,12 +138,10 @@ public class Trie
character = Character.toLowerCase(character);
}
currentState = getState(currentState, character);
Collection<String> emitStrs = currentState.emit();
if (emitStrs != null && !emitStrs.isEmpty()) {
for (String emitStr : emitStrs) {
final Emit emit = new Emit(position - emitStr.length() + 1, position, emitStr);
if (trieConfig.isOnlyWholeWords()) {
if (!isPartialMatch(text, emit)) {
return emit;
@ -169,12 +151,9 @@ public class Trie
}
}
}
position++;
}
}
return null;
}
@ -188,74 +167,68 @@ public class Trie
/**
 * Removes, in place, every emit that {@link #isPartialMatch} reports as a partial match.
 *
 * <p>The list is filtered in a single pass through its iterator. This avoids a temporary
 * list of removals and a linear {@code remove(Object)} scan per removed element, and it
 * removes exactly the inspected element rather than the first one equal to it.
 *
 * @param searchText the text the emits were collected from
 * @param collectedEmits the emits to filter; modified by this call
 */
private void removePartialMatches(String searchText, List<Emit> collectedEmits) {
    // Fully qualified so the method does not depend on an import this file may lack.
    java.util.Iterator<Emit> emits = collectedEmits.iterator();
    while (emits.hasNext()) {
        if (isPartialMatch(searchText, emits.next())) {
            emits.remove();
        }
    }
}
/**
 * Finds the state reached from {@code currentState} on the given character, following
 * failure links until some state offers a transition for it.
 *
 * <p>NOTE(review): the loop relies on the failure chain eventually reaching a state that
 * has a transition for every character (presumably the root); confirm in {@code State}.
 *
 * @param currentState the state to start from
 * @param character the input character
 * @return the first non-null transition target found along the failure chain
 */
private State getState(State currentState, Character character) {
    State origin = currentState;
    while (true) {
        State target = origin.nextState(character);
        if (target != null) {
            return target;
        }
        origin = origin.failure();
    }
}
/**
 * Builds the failure states if that has not been done yet; does nothing otherwise.
 */
private void checkForConstructedFailureStates() {
    if (failureStatesConstructed) {
        return;
    }
    constructFailureStates();
}
private void constructFailureStates()
{
Queue<State> queue = new LinkedBlockingDeque<State>();
private void constructFailureStates() {
Queue<State> queue = new LinkedBlockingDeque<State>();
// First, set the fail state of all depth 1 states to the root state
for (State depthOneState : this.rootState.getStates()) {
depthOneState.setFailure(this.rootState);
queue.add(depthOneState);
}
this.failureStatesConstructed = true;
// First, set the fail state of all depth 1 states to the root state
for (State depthOneState : this.rootState.getStates()) {
depthOneState.setFailure(this.rootState);
queue.add(depthOneState);
}
this.failureStatesConstructed = true;
// Second, determine the fail state for all depth > 1 state
while (!queue.isEmpty()) {
State currentState = queue.remove();
// Second, determine the fail state for all depth > 1 state
while (!queue.isEmpty()) {
State currentState = queue.remove();
for (Character transition : currentState.getTransitions()) {
State targetState = currentState.nextState(transition);
queue.add(targetState);
for (Character transition : currentState.getTransitions()) {
State targetState = currentState.nextState(transition);
queue.add(targetState);
State traceFailureState = currentState.failure();
while (traceFailureState.nextState(transition) == null) {
traceFailureState = traceFailureState.failure();
}
State newFailureState = traceFailureState.nextState(transition);
targetState.setFailure(newFailureState);
targetState.addEmit(newFailureState.emit());
}
}
}
State traceFailureState = currentState.failure();
while (traceFailureState.nextState(transition) == null) {
traceFailureState = traceFailureState.failure();
}
State newFailureState = traceFailureState.nextState(transition);
targetState.setFailure(newFailureState);
targetState.addEmit(newFailureState.emit());
}
}
}
/**
 * Appends one {@link Emit} per keyword that ends at the given position.
 *
 * @param position index of the last character of the match (inclusive)
 * @param currentState the state reached after consuming the character at {@code position}
 * @param collectedEmits the list the new emits are appended to
 */
private void storeEmits(int position, State currentState, List<Emit> collectedEmits) {
    Collection<String> keywords = currentState.emit();
    if (keywords == null || keywords.isEmpty()) {
        return;
    }
    for (String keyword : keywords) {
        // The match ends at position, so it starts keyword.length() - 1 characters earlier.
        int start = position - keyword.length() + 1;
        collectedEmits.add(new Emit(start, position, keyword));
    }
}
}