Issue #23: added a callback handler concept, which avoids having to set up a custom list and instead places direct calls to the handler. Handlers are only supported on the lowest level of Aho-Corasick, i.e. no overlap, whole-word, or token support.

Also added the possibility to pass a Reader at the same level as above.
This commit is contained in:
robert-bor 2015-09-21 21:09:26 +02:00
parent 25eeef5168
commit 88799fb3da
8 changed files with 232 additions and 14 deletions

View File

@ -2,6 +2,9 @@ package org.ahocorasick.trie;
import org.ahocorasick.interval.IntervalTree;
import org.ahocorasick.interval.Intervalable;
import org.ahocorasick.trie.configuration.ParseConfiguration;
import org.ahocorasick.trie.handler.DefaultEmitHandler;
import org.ahocorasick.trie.handler.EmitHandler;
import java.util.ArrayList;
import java.util.Collection;
@ -87,19 +90,12 @@ public class Trie {
@SuppressWarnings("unchecked")
public Collection<Emit> parseText(String text) {
checkForConstructedFailureStates();
DefaultEmitHandler emitHandler = new DefaultEmitHandler();
parseText(new ParseConfiguration()
.setEmitHandler(emitHandler)
.setText(text));
int position = 0;
State currentState = this.rootState;
List<Emit> collectedEmits = new ArrayList<Emit>();
for (Character character : text.toCharArray()) {
if (trieConfig.isCaseInsensitive()) {
character = Character.toLowerCase(character);
}
currentState = getState(currentState, character);
storeEmits(position, currentState, collectedEmits);
position++;
}
List<Emit> collectedEmits = emitHandler.getEmits();
if (trieConfig.isOnlyWholeWords()) {
removePartialMatches(text, collectedEmits);
@ -113,6 +109,22 @@ public class Trie {
return collectedEmits;
}
/**
 * Walks the trie over every character supplied by the configuration and
 * reports each match to the configuration's emit handler as it is found.
 * <p>
 * Characters are lower-cased first when the trie is configured as case
 * insensitive. No post-processing of the matches is done here; they are
 * handed to the handler directly.
 *
 * @param parseConfiguration supplies the characters to scan and the handler
 *                           that receives the matches
 */
public void parseText(ParseConfiguration parseConfiguration) {
    checkForConstructedFailureStates();

    State state = this.rootState;
    int index = 0;
    for (Character rawCharacter : parseConfiguration) {
        // Fold case only when the trie itself was configured that way.
        Character lookup = trieConfig.isCaseInsensitive()
                ? Character.valueOf(Character.toLowerCase(rawCharacter))
                : rawCharacter;

        state = getState(state, lookup);
        storeEmits(index, state, parseConfiguration.getEmitHandler());
        index++;
    }
}
private void removePartialMatches(String searchText, List<Emit> collectedEmits) {
long size = searchText.length();
List<Emit> removeEmits = new ArrayList<Emit>();
@ -175,11 +187,11 @@ public class Trie {
}
}
private void storeEmits(int position, State currentState, List<Emit> collectedEmits) {
private void storeEmits(int position, State currentState, EmitHandler emitHandler) {
Collection<String> emits = currentState.emit();
if (emits != null && !emits.isEmpty()) {
for (String emit : emits) {
collectedEmits.add(new Emit(position-emit.length()+1, position, emit));
emitHandler.emit(new Emit(position - emit.length() + 1, position, emit));
}
}
}

View File

@ -0,0 +1,43 @@
package org.ahocorasick.trie.configuration;
import org.ahocorasick.trie.handler.EmitHandler;
import java.io.Reader;
import java.util.Iterator;
/**
 * Bundles the input to be scanned (either a {@link String} or a
 * {@link Reader}) together with the {@link EmitHandler} that receives the
 * matches. Iterating over the configuration yields the input one character
 * at a time.
 * <p>
 * When both a reader and a string have been set, the reader takes precedence.
 */
public class ParseConfiguration implements Iterable<Character> {

    private String text;

    private Reader reader;

    private EmitHandler emitHandler;

    /**
     * Sets a string as the input to scan.
     *
     * @param text the text to iterate over
     * @return this configuration, for chaining
     */
    public ParseConfiguration setText(String text) {
        this.text = text;
        return this;
    }

    /**
     * Sets a reader as the input to scan. The reader is neither reset nor
     * closed by this class.
     *
     * @param reader the character source to iterate over
     * @return this configuration, for chaining
     */
    public ParseConfiguration setText(Reader reader) {
        this.reader = reader;
        return this;
    }

    /**
     * Sets the handler that receives the matches.
     *
     * @param emitHandler the callback to invoke for each match
     * @return this configuration, for chaining
     */
    public ParseConfiguration setEmitHandler(EmitHandler emitHandler) {
        this.emitHandler = emitHandler;
        return this;
    }

    /**
     * @return the configured handler, or {@code null} when none has been set
     */
    public EmitHandler getEmitHandler() {
        return emitHandler;
    }

    /**
     * Creates an iterator over the configured input.
     *
     * @return an iterator over the reader when one is set, otherwise over the string
     * @throws IllegalStateException when neither a reader nor a string has been set
     */
    @Override
    public Iterator<Character> iterator() {
        if (reader != null) {
            return new ReaderIterator(reader);
        }
        if (text == null) {
            // Fail here with a clear message instead of a NullPointerException
            // on the first hasNext() call of the string iterator.
            throw new IllegalStateException(
                    "ParseConfiguration has no input: call setText(String) or setText(Reader) first");
        }
        return new StringIterator(text);
    }

}

View File

@ -0,0 +1,44 @@
package org.ahocorasick.trie.configuration;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.util.Iterator;
/**
 * Iterates over the characters supplied by a {@link Reader}.
 * <p>
 * The iterator keeps one character of look-ahead: the constructor and every
 * successful {@link #next()} call read the following character from the
 * reader, so {@link #hasNext()} can answer without touching the reader.
 * The reader is never closed by this class.
 */
public class ReaderIterator implements Iterator<Character> {

    /** Value of the look-ahead once the reader reports the end of its input. */
    private static final int END_OF_INPUT = -1;

    private final Reader reader;

    /** Look-ahead: the next character to hand out, or {@link #END_OF_INPUT}. */
    private int readCharacter;

    /**
     * @param reader the character source; its first character is read immediately
     * @throws RuntimeException wrapping the {@link IOException} when the reader fails
     */
    public ReaderIterator(Reader reader) {
        this.reader = reader;
        readIntoBuffer();
    }

    /** Pulls the next character (or the end marker) from the reader into the look-ahead. */
    private void readIntoBuffer() {
        try {
            readCharacter = reader.read();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    @Override
    public boolean hasNext() {
        return readCharacter != END_OF_INPUT;
    }

    /**
     * @return the next character from the reader
     * @throws java.util.NoSuchElementException when the reader is exhausted
     * @throws RuntimeException wrapping the {@link IOException} when reading ahead fails
     */
    @Override
    public Character next() {
        if (!hasNext()) {
            // Honour the Iterator contract instead of handing out (char) -1
            // and reading from an already exhausted reader.
            throw new java.util.NoSuchElementException();
        }
        Character returnChar = (char) readCharacter;
        readIntoBuffer();
        return returnChar;
    }

    /** Removal is not supported. */
    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }

}

View File

@ -0,0 +1,30 @@
package org.ahocorasick.trie.configuration;
import java.util.Iterator;
/**
 * Iterates over the characters of a {@link String}, from the first to the last.
 */
public class StringIterator implements Iterator<Character> {

    /** Index of the next character to hand out. */
    private int counter = 0;

    private final String text;

    /**
     * @param text the string whose characters are iterated
     */
    public StringIterator(String text) {
        this.text = text;
    }

    @Override
    public boolean hasNext() {
        return counter < text.length();
    }

    /**
     * @return the next character of the string
     * @throws java.util.NoSuchElementException when all characters have been returned
     */
    @Override
    public Character next() {
        if (!hasNext()) {
            // Honour the Iterator contract rather than leaking the
            // StringIndexOutOfBoundsException raised by charAt.
            throw new java.util.NoSuchElementException();
        }
        return text.charAt(counter++);
    }

    /** Removal is not supported. */
    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }

}

View File

@ -0,0 +1,21 @@
package org.ahocorasick.trie.handler;
import org.ahocorasick.trie.Emit;
import java.util.ArrayList;
import java.util.List;
/**
 * {@link EmitHandler} that simply remembers every emit it receives, in the
 * order of arrival.
 */
public class DefaultEmitHandler implements EmitHandler {

    /** Emits received so far, oldest first. */
    private final List<Emit> collected = new ArrayList<Emit>();

    /**
     * Appends the emit to the collected list.
     *
     * @param emit the emit to remember
     */
    @Override
    public void emit(Emit emit) {
        collected.add(emit);
    }

    /**
     * @return the internal list of collected emits itself (not a copy), so
     *         changes made by the caller are visible to this handler
     */
    public List<Emit> getEmits() {
        return collected;
    }

}

View File

@ -0,0 +1,7 @@
package org.ahocorasick.trie.handler;
import org.ahocorasick.trie.Emit;
/**
 * Callback through which matches are reported one at a time, instead of
 * being collected into a list by the caller.
 */
public interface EmitHandler {
/**
 * Receives a single match.
 *
 * @param emit the match being reported
 */
void emit(Emit emit);
}

View File

@ -1,9 +1,13 @@
package org.ahocorasick.trie;
import org.ahocorasick.trie.configuration.ParseConfiguration;
import org.ahocorasick.trie.handler.EmitHandler;
import org.junit.Test;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import static junit.framework.Assert.assertEquals;
@ -53,6 +57,30 @@ public class TrieTest {
checkEmit(iterator.next(), 2, 5, "hers");
}
/**
 * Parses "ushers" through the callback API and verifies that the handler
 * receives the expected emits in the expected order.
 */
@Test
public void ushersTestByCallback() {
    Trie trie = new Trie();
    trie.addKeyword("hers");
    trie.addKeyword("his");
    trie.addKeyword("she");
    trie.addKeyword("he");

    // The handler just records what it is given so it can be inspected below.
    final List<Emit> received = new ArrayList<Emit>();
    ParseConfiguration configuration = new ParseConfiguration()
            .setText("ushers")
            .setEmitHandler(new EmitHandler() {
                @Override
                public void emit(Emit emit) {
                    received.add(emit);
                }
            });

    trie.parseText(configuration);

    // "he" and "she" both end at index 3, "hers" ends at index 5.
    assertEquals(3, received.size());
    checkEmit(received.get(0), 2, 3, "he");
    checkEmit(received.get(1), 1, 3, "she");
    checkEmit(received.get(2), 2, 5, "hers");
}
@Test
public void misleadingTest() {
Trie trie = new Trie();

View File

@ -0,0 +1,33 @@
package org.ahocorasick.trie.configuration;
import org.junit.Test;
import java.io.IOException;
import java.io.StringReader;
import static org.junit.Assert.assertEquals;
/**
 * Verifies that a {@link ParseConfiguration} yields the same characters
 * whether its input is supplied as a string or as a reader.
 */
public class ParseConfigurationTest {

    @Test
    public void reader() throws IOException {
        // try-with-resources closes the reader even when the assertion fails.
        try (StringReader reader = new StringReader("hällö")) {
            ParseConfiguration parseConfiguration = new ParseConfiguration().setText(reader);
            assertIterator(parseConfiguration);
        }
    }

    @Test
    public void string() throws IOException {
        ParseConfiguration parseConfiguration = new ParseConfiguration().setText("hällö");
        assertIterator(parseConfiguration);
    }

    /**
     * Concatenates every character produced by the configuration and checks
     * that the result equals the expected sample text.
     */
    private void assertIterator(ParseConfiguration parseConfiguration) {
        // Local, single-threaded accumulation: no need for a synchronized StringBuffer.
        StringBuilder text = new StringBuilder();
        for (Character character : parseConfiguration) {
            text.append(character);
        }
        assertEquals("hällö", text.toString());
    }

}