Issue #23: added a callback handler concept that avoids building up a custom list of emits and instead makes direct calls to the handler. Handlers are only supported at the lowest level of Aho-Corasick, i.e. without overlap, whole-word, or token support.
Also added the ability to pass a Reader at the same level as above.
This commit is contained in:
parent
25eeef5168
commit
88799fb3da
@ -2,6 +2,9 @@ package org.ahocorasick.trie;
|
||||
|
||||
import org.ahocorasick.interval.IntervalTree;
|
||||
import org.ahocorasick.interval.Intervalable;
|
||||
import org.ahocorasick.trie.configuration.ParseConfiguration;
|
||||
import org.ahocorasick.trie.handler.DefaultEmitHandler;
|
||||
import org.ahocorasick.trie.handler.EmitHandler;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
@ -87,19 +90,12 @@ public class Trie {
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public Collection<Emit> parseText(String text) {
|
||||
checkForConstructedFailureStates();
|
||||
DefaultEmitHandler emitHandler = new DefaultEmitHandler();
|
||||
parseText(new ParseConfiguration()
|
||||
.setEmitHandler(emitHandler)
|
||||
.setText(text));
|
||||
|
||||
int position = 0;
|
||||
State currentState = this.rootState;
|
||||
List<Emit> collectedEmits = new ArrayList<Emit>();
|
||||
for (Character character : text.toCharArray()) {
|
||||
if (trieConfig.isCaseInsensitive()) {
|
||||
character = Character.toLowerCase(character);
|
||||
}
|
||||
currentState = getState(currentState, character);
|
||||
storeEmits(position, currentState, collectedEmits);
|
||||
position++;
|
||||
}
|
||||
List<Emit> collectedEmits = emitHandler.getEmits();
|
||||
|
||||
if (trieConfig.isOnlyWholeWords()) {
|
||||
removePartialMatches(text, collectedEmits);
|
||||
@ -113,6 +109,22 @@ public class Trie {
|
||||
return collectedEmits;
|
||||
}
|
||||
|
||||
public void parseText(ParseConfiguration parseConfiguration) {
|
||||
checkForConstructedFailureStates();
|
||||
|
||||
int position = 0;
|
||||
State currentState = this.rootState;
|
||||
for (Character character : parseConfiguration) {
|
||||
if (trieConfig.isCaseInsensitive()) {
|
||||
character = Character.toLowerCase(character);
|
||||
}
|
||||
currentState = getState(currentState, character);
|
||||
storeEmits(position, currentState, parseConfiguration.getEmitHandler());
|
||||
position++;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private void removePartialMatches(String searchText, List<Emit> collectedEmits) {
|
||||
long size = searchText.length();
|
||||
List<Emit> removeEmits = new ArrayList<Emit>();
|
||||
@ -175,11 +187,11 @@ public class Trie {
|
||||
}
|
||||
}
|
||||
|
||||
private void storeEmits(int position, State currentState, List<Emit> collectedEmits) {
|
||||
private void storeEmits(int position, State currentState, EmitHandler emitHandler) {
|
||||
Collection<String> emits = currentState.emit();
|
||||
if (emits != null && !emits.isEmpty()) {
|
||||
for (String emit : emits) {
|
||||
collectedEmits.add(new Emit(position-emit.length()+1, position, emit));
|
||||
emitHandler.emit(new Emit(position - emit.length() + 1, position, emit));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -0,0 +1,43 @@
|
||||
package org.ahocorasick.trie.configuration;
|
||||
|
||||
import org.ahocorasick.trie.handler.EmitHandler;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.util.Iterator;
|
||||
|
||||
public class ParseConfiguration implements Iterable<Character> {
|
||||
|
||||
private String text;
|
||||
|
||||
private Reader reader;
|
||||
|
||||
private EmitHandler emitHandler;
|
||||
|
||||
public ParseConfiguration setText(String text) {
|
||||
this.text = text;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ParseConfiguration setText(Reader reader) {
|
||||
this.reader = reader;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ParseConfiguration setEmitHandler(EmitHandler emitHandler) {
|
||||
this.emitHandler = emitHandler;
|
||||
return this;
|
||||
}
|
||||
|
||||
public EmitHandler getEmitHandler() {
|
||||
return emitHandler;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<Character> iterator() {
|
||||
if (reader != null) {
|
||||
return new ReaderIterator(reader);
|
||||
}
|
||||
return new StringIterator(text);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,44 @@
|
||||
package org.ahocorasick.trie.configuration;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.Reader;
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
 * Adapts a {@link Reader} to an {@link Iterator} of characters.
 * <p>
 * The iterator keeps a one-character look-ahead: the first character is read in the
 * constructor and each call to {@link #next()} reads the following one. The reader is
 * never closed by this class.
 */
public class ReaderIterator implements Iterator<Character> {

    /** Value returned by {@link Reader#read()} once the stream is exhausted. */
    private static final int END_OF_STREAM = -1;

    private final Reader reader;

    /** Look-ahead character, or {@link #END_OF_STREAM} when nothing is left. */
    private int readCharacter;

    /**
     * Wraps the given reader and immediately reads its first character.
     *
     * @param reader the source of characters
     * @throws RuntimeException if the initial read fails with an {@link IOException}
     */
    public ReaderIterator(Reader reader) {
        this.reader = reader;
        readIntoBuffer();
    }

    /** Reads the next character into the look-ahead slot, wrapping I/O failures unchecked. */
    private void readIntoBuffer() {
        try {
            readCharacter = reader.read();
        } catch (IOException e) {
            // Iterator methods cannot declare checked exceptions; keep the cause attached.
            throw new RuntimeException(e);
        }
    }

    @Override
    public boolean hasNext() {
        return readCharacter != END_OF_STREAM;
    }

    /**
     * Returns the buffered character and reads the one after it.
     *
     * @return the next character of the underlying reader
     * @throws java.util.NoSuchElementException if the reader is exhausted
     * @throws RuntimeException if reading the following character fails with an {@link IOException}
     */
    @Override
    public Character next() {
        if (readCharacter == END_OF_STREAM) {
            // Casting -1 to char would otherwise hand out a bogus '\uffff'.
            throw new java.util.NoSuchElementException("reader is exhausted");
        }
        char returnChar = (char) readCharacter;
        readIntoBuffer();
        return returnChar;
    }

    /**
     * Not supported.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }

}
|
||||
@ -0,0 +1,30 @@
|
||||
package org.ahocorasick.trie.configuration;
|
||||
|
||||
import java.util.Iterator;
|
||||
|
||||
/**
 * Iterates over the characters of a {@link String} from the first index to the last.
 */
public class StringIterator implements Iterator<Character> {

    /** Index of the character that the next call to {@link #next()} returns. */
    private int counter = 0;

    private final String text;

    /**
     * @param text the string whose characters are iterated
     */
    public StringIterator(String text) {
        this.text = text;
    }

    @Override
    public boolean hasNext() {
        return counter < text.length();
    }

    /**
     * Returns the character at the current index and advances by one.
     *
     * @return the next character of the text
     * @throws java.util.NoSuchElementException if every character has already been returned
     */
    @Override
    public Character next() {
        if (counter >= text.length()) {
            // The Iterator contract asks for NoSuchElementException, not an index error.
            throw new java.util.NoSuchElementException("end of text reached at index " + counter);
        }
        return text.charAt(counter++);
    }

    /**
     * Not supported.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }

}
|
||||
@ -0,0 +1,21 @@
|
||||
package org.ahocorasick.trie.handler;
|
||||
|
||||
import org.ahocorasick.trie.Emit;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
public class DefaultEmitHandler implements EmitHandler {
|
||||
|
||||
private List<Emit> emits = new ArrayList<>();
|
||||
|
||||
@Override
|
||||
public void emit(Emit emit) {
|
||||
this.emits.add(emit);
|
||||
}
|
||||
|
||||
public List<Emit> getEmits() {
|
||||
return this.emits;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,7 @@
|
||||
package org.ahocorasick.trie.handler;
|
||||
|
||||
import org.ahocorasick.trie.Emit;
|
||||
|
||||
public interface EmitHandler {
|
||||
void emit(Emit emit);
|
||||
}
|
||||
@ -1,9 +1,13 @@
|
||||
package org.ahocorasick.trie;
|
||||
|
||||
import org.ahocorasick.trie.configuration.ParseConfiguration;
|
||||
import org.ahocorasick.trie.handler.EmitHandler;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import static junit.framework.Assert.assertEquals;
|
||||
|
||||
@ -53,6 +57,30 @@ public class TrieTest {
|
||||
checkEmit(iterator.next(), 2, 5, "hers");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void ushersTestByCallback() {
|
||||
Trie trie = new Trie();
|
||||
trie.addKeyword("hers");
|
||||
trie.addKeyword("his");
|
||||
trie.addKeyword("she");
|
||||
trie.addKeyword("he");
|
||||
|
||||
final List<Emit> emits = new ArrayList<>();
|
||||
EmitHandler emitHandler = new EmitHandler() {
|
||||
|
||||
@Override
|
||||
public void emit(Emit emit) {
|
||||
emits.add(emit);
|
||||
}
|
||||
};
|
||||
trie.parseText(new ParseConfiguration().setText("ushers").setEmitHandler(emitHandler));
|
||||
assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5
|
||||
Iterator<Emit> iterator = emits.iterator();
|
||||
checkEmit(iterator.next(), 2, 3, "he");
|
||||
checkEmit(iterator.next(), 1, 3, "she");
|
||||
checkEmit(iterator.next(), 2, 5, "hers");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void misleadingTest() {
|
||||
Trie trie = new Trie();
|
||||
|
||||
@ -0,0 +1,33 @@
|
||||
package org.ahocorasick.trie.configuration;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
public class ParseConfigurationTest {
|
||||
|
||||
@Test
|
||||
public void reader() throws IOException {
|
||||
StringReader reader = new StringReader("hällö");
|
||||
ParseConfiguration parseConfiguration = new ParseConfiguration().setText(reader);
|
||||
assertIterator(parseConfiguration);
|
||||
reader.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void string() throws IOException {
|
||||
ParseConfiguration parseConfiguration = new ParseConfiguration().setText("hällö");
|
||||
assertIterator(parseConfiguration);
|
||||
}
|
||||
|
||||
private void assertIterator(ParseConfiguration parseConfiguration) {
|
||||
StringBuffer text = new StringBuffer();
|
||||
for (Character character : parseConfiguration) {
|
||||
text.append(character);
|
||||
}
|
||||
assertEquals("hällö", text.toString());
|
||||
}
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user