Issue #24 tokenize() method implementation extracted to separate class

This commit is contained in:
robert-bor 2015-09-27 14:37:36 +02:00
parent 5203efbbcb
commit bfaa32b20e
2 changed files with 45 additions and 25 deletions

View File

@ -0,0 +1,44 @@
package org.ahocorasick.trie;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
/**
 * Turns a text and a collection of emits located in that text into a flat
 * sequence of tokens.
 *
 * <p>Each emit yields one {@link MatchToken}; any stretch of text lying between
 * the end of the previously handled emit and the start of the next one (or the
 * end of the text) yields one {@link FragmentToken}.
 *
 * <p>NOTE(review): the emits are consumed in the collection's iteration order
 * and are presumably expected to be ordered by position and non-overlapping --
 * confirm against the caller.
 */
public class Tokenizer {

    /** Emits to convert into match tokens, consumed in iteration order. */
    private final Collection<Emit> emits;

    /** Text that the emit start/end positions index into. */
    private final String text;

    /**
     * Creates a tokenizer over the given emits and text.
     *
     * @param emits the emits whose start/end positions refer to {@code text}
     * @param text  the text to cut into tokens
     */
    public Tokenizer(Collection<Emit> emits, String text) {
        this.emits = emits;
        this.text = text;
    }

    /**
     * Builds the token sequence for the text.
     *
     * @return the tokens in the order they were produced: for every emit an
     *         optional preceding fragment followed by the match itself, and
     *         finally an optional trailing fragment
     */
    public Collection<Token> tokenize() {
        List<Token> result = new ArrayList<>();
        // Index of the last character already handed out as part of a match;
        // -1 means nothing has been consumed yet.
        int lastCovered = -1;
        for (Emit emit : emits) {
            int start = emit.getStart();
            int end = emit.getEnd();
            boolean gapBeforeEmit = start - lastCovered > 1;
            if (gapBeforeEmit) {
                result.add(fragment(lastCovered + 1, start));
            }
            // The emit's end index is inclusive, substring's end index is exclusive.
            result.add(new MatchToken(text.substring(start, end + 1), emit));
            lastCovered = end;
        }
        boolean trailingText = text.length() - lastCovered > 1;
        if (trailingText) {
            result.add(fragment(lastCovered + 1, text.length()));
        }
        return result;
    }

    /**
     * Wraps the text between two positions in a fragment token.
     *
     * @param from index of the first character of the fragment (inclusive)
     * @param to   index just past the last character of the fragment (exclusive)
     * @return a fragment token holding {@code text.substring(from, to)}
     */
    private Token fragment(int from, int to) {
        return new FragmentToken(text.substring(from, to));
    }
}

View File

@ -42,31 +42,7 @@ public class Trie {
}
/**
 * Splits {@code text} into tokens, based on the emits returned by
 * {@code parseText(text)}.
 *
 * NOTE(review): this span is a diff hunk rendered without its +/- markers.
 * The loop and the two private helpers below appear to be the removed inline
 * implementation, and the single
 * {@code return new Tokenizer(parseText(text), text).tokenize();} statement
 * appears to be what replaces them -- confirm against the actual commit
 * before relying on any individual line here.
 */
public Collection<Token> tokenize(String text) {
Collection<Token> tokens = new ArrayList<>();
Collection<Emit> collectedEmits = parseText(text);
int lastCollectedPosition = -1;
for (Emit emit : collectedEmits) {
if (emit.getStart() - lastCollectedPosition > 1) {
tokens.add(createFragment(emit, text, lastCollectedPosition));
}
tokens.add(createMatch(emit, text));
lastCollectedPosition = emit.getEnd();
}
if (text.length() - lastCollectedPosition > 1) {
tokens.add(createFragment(null, text, lastCollectedPosition));
}
return tokens;
}
private Token createFragment(Emit emit, String text, int lastCollectedPosition) {
return new FragmentToken(text.substring(lastCollectedPosition+1, emit == null ? text.length() : emit.getStart()));
}
private Token createMatch(Emit emit, String text) {
return new MatchToken(text.substring(emit.getStart(), emit.getEnd()+1), emit);
return new Tokenizer(parseText(text), text).tokenize();
}
@SuppressWarnings("unchecked")