Issue #16 Use builder pattern to create Trie

Previously, there was a race condition in Trie#parseText since
it called constructFailureStates on first run without synchronization.
See https://github.com/robert-bor/aho-corasick/issues/16

This commit fixes this by using the builder pattern in order to
create a fully initialized Trie.

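N.B. This changes the public API: a Trie is now created via Trie.builder()...build() instead of new Trie() followed by addKeyword() calls.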
This commit is contained in:
Petter Remen 2015-07-03 12:29:31 +02:00
parent 25eeef5168
commit 9bce51e001
3 changed files with 141 additions and 110 deletions

View File

@ -51,11 +51,12 @@ Usage
-----
Setting up the Trie is a piece of cake:
```java
Trie trie = new Trie();
trie.addKeyword("hers");
trie.addKeyword("his");
trie.addKeyword("she");
trie.addKeyword("he");
Trie trie = Trie.builder()
.addKeyword("hers")
.addKeyword("his")
.addKeyword("she")
.addKeyword("he")
.build();
Collection<Emit> emits = trie.parseText("ushers");
```
@ -68,9 +69,11 @@ In normal situations you probably want to remove overlapping instances, retainin
matches.
```java
Trie trie = new Trie().removeOverlaps();
trie.addKeyword("hot");
trie.addKeyword("hot chocolate");
Trie trie = Trie.builder()
.removeOverlaps()
.addKeyword("hot")
.addKeyword("hot chocolate")
.build();
Collection<Emit> emits = trie.parseText("hot chocolate");
```
@ -82,8 +85,10 @@ There is only one result now:
If you want the algorithm to only check for whole words, you can tell the Trie to do so:
```java
Trie trie = new Trie().onlyWholeWords();
trie.addKeyword("sugar");
Trie trie = Trie.builder()
.onlyWholeWords()
.addKeyword("sugar")
.build();
Collection<Emit> emits = trie.parseText("sugarcane sugarcane sugar canesugar");
```
@ -94,8 +99,10 @@ Some text are WrItTeN in combinations of lowercase and uppercase and therefore h
the Trie to lowercase the entire searchtext to ease the matching process.
```java
Trie trie = new Trie().caseInsensitive();
trie.addKeyword("casing");
Trie trie = Trie.builder()
.caseInsensitive()
.addKeyword("casing")
.build();
Collection<Emit> emits = trie.parseText("CaSiNg");
```
@ -111,10 +118,11 @@ matches as soon as you encounter them. Let's look at an example where we want to
String speech = "The Answer to the Great Question... Of Life, " +
"the Universe and Everything... Is... Forty-two,' said " +
"Deep Thought, with infinite majesty and calm.";
Trie trie = new Trie().removeOverlaps().onlyWholeWords().caseInsensitive();
trie.addKeyword("great question");
trie.addKeyword("forty-two");
trie.addKeyword("deep thought");
Trie trie = Trie.builder().removeOverlaps().onlyWholeWords().caseInsensitive()
.addKeyword("great question")
.addKeyword("forty-two")
.addKeyword("deep thought")
.build();
Collection<Token> tokens = trie.tokenize(speech);
StringBuffer html = new StringBuffer();
html.append("<html><body><p>");

View File

@ -20,33 +20,12 @@ public class Trie {
private State rootState;
private boolean failureStatesConstructed = false;
public Trie(TrieConfig trieConfig) {
private Trie(TrieConfig trieConfig) {
this.trieConfig = trieConfig;
this.rootState = new State();
}
public Trie() {
this(new TrieConfig());
}
public Trie caseInsensitive() {
this.trieConfig.setCaseInsensitive(true);
return this;
}
public Trie removeOverlaps() {
this.trieConfig.setAllowOverlaps(false);
return this;
}
public Trie onlyWholeWords() {
this.trieConfig.setOnlyWholeWords(true);
return this;
}
public void addKeyword(String keyword) {
private void addKeyword(String keyword) {
if (keyword == null || keyword.length() == 0) {
return;
}
@ -87,8 +66,6 @@ public class Trie {
@SuppressWarnings("unchecked")
public Collection<Emit> parseText(String text) {
checkForConstructedFailureStates();
int position = 0;
State currentState = this.rootState;
List<Emit> collectedEmits = new ArrayList<Emit>();
@ -140,12 +117,6 @@ public class Trie {
return newCurrentState;
}
private void checkForConstructedFailureStates() {
if (!this.failureStatesConstructed) {
constructFailureStates();
}
}
private void constructFailureStates() {
Queue<State> queue = new LinkedBlockingDeque<State>();
@ -154,7 +125,6 @@ public class Trie {
depthOneState.setFailure(this.rootState);
queue.add(depthOneState);
}
this.failureStatesConstructed = true;
// Second, determine the fail state for all depth > 1 state
while (!queue.isEmpty()) {
@ -184,4 +154,41 @@ public class Trie {
}
}
/**
 * Creates a new {@link TrieBuilder}.
 *
 * <p>Typical use: {@code Trie.builder().addKeyword("he").build()}.
 *
 * @return a new builder instance
 */
public static TrieBuilder builder() {
return new TrieBuilder();
}
/**
 * Fluent builder that assembles a fully initialized {@link Trie}.
 *
 * <p>The builder only records the requested options and keywords. The actual
 * {@code Trie} (and its {@code TrieConfig}) is created inside {@link #build()},
 * so a {@code Trie} that has already been returned is never touched again by
 * this builder. Continuing to use the builder after {@code build()} — adding
 * keywords, toggling options, or calling {@code build()} again — therefore
 * only affects tries built afterwards.
 *
 * <p>The builder itself is not synchronized; configure it from a single thread.
 */
public static class TrieBuilder {

    // Recorded options; applied to a fresh TrieConfig in build().
    private boolean caseInsensitive = false;
    private boolean removeOverlaps = false;
    private boolean onlyWholeWords = false;

    // Keywords in insertion order; replayed into a fresh Trie in build().
    private final List<String> keywords = new ArrayList<String>();

    private TrieBuilder() {}

    /**
     * Requests case-insensitive matching.
     *
     * @return this builder, for chaining
     */
    public TrieBuilder caseInsensitive() {
        this.caseInsensitive = true;
        return this;
    }

    /**
     * Requests that overlapping matches are not allowed.
     *
     * @return this builder, for chaining
     */
    public TrieBuilder removeOverlaps() {
        this.removeOverlaps = true;
        return this;
    }

    /**
     * Requests that only whole words are matched.
     *
     * @return this builder, for chaining
     */
    public TrieBuilder onlyWholeWords() {
        this.onlyWholeWords = true;
        return this;
    }

    /**
     * Records a keyword to be added to the trie on {@link #build()}.
     *
     * <p>The keyword is passed unchanged to {@code Trie#addKeyword} at build
     * time, so that method's own handling of {@code null} and empty strings
     * still applies.
     *
     * @param keyword the keyword to search for
     * @return this builder, for chaining
     */
    public TrieBuilder addKeyword(String keyword) {
        this.keywords.add(keyword);
        return this;
    }

    /**
     * Creates a new {@link Trie} from the options and keywords recorded so
     * far and constructs its failure states before returning it.
     *
     * <p>Each call returns a distinct instance backed by its own
     * {@code TrieConfig}.
     *
     * @return a newly created trie with failure states constructed
     */
    public Trie build() {
        // A fresh config per build keeps later builder calls from leaking
        // into a Trie that was already returned. Setters are only invoked
        // for requested options so TrieConfig's own defaults stay in effect.
        TrieConfig trieConfig = new TrieConfig();
        if (this.caseInsensitive) {
            trieConfig.setCaseInsensitive(true);
        }
        if (this.removeOverlaps) {
            trieConfig.setAllowOverlaps(false);
        }
        if (this.onlyWholeWords) {
            trieConfig.setOnlyWholeWords(true);
        }

        Trie trie = new Trie(trieConfig);
        for (String keyword : this.keywords) {
            trie.addKeyword(keyword);
        }
        trie.constructFailureStates();
        return trie;
    }
}
}

View File

@ -11,8 +11,9 @@ public class TrieTest {
@Test
public void keywordAndTextAreTheSame() {
Trie trie = new Trie();
trie.addKeyword("abc");
Trie trie = Trie.builder()
.addKeyword("abc")
.build();
Collection<Emit> emits = trie.parseText("abc");
Iterator<Emit> iterator = emits.iterator();
checkEmit(iterator.next(), 0, 2, "abc");
@ -20,8 +21,9 @@ public class TrieTest {
@Test
public void textIsLongerThanKeyword() {
Trie trie = new Trie();
trie.addKeyword("abc");
Trie trie = Trie.builder()
.addKeyword("abc")
.build();
Collection<Emit> emits = trie.parseText(" abc");
Iterator<Emit> iterator = emits.iterator();
checkEmit(iterator.next(), 1, 3, "abc");
@ -29,10 +31,11 @@ public class TrieTest {
@Test
public void variousKeywordsOneMatch() {
Trie trie = new Trie();
trie.addKeyword("abc");
trie.addKeyword("bcd");
trie.addKeyword("cde");
Trie trie = Trie.builder()
.addKeyword("abc")
.addKeyword("bcd")
.addKeyword("cde")
.build();
Collection<Emit> emits = trie.parseText("bcd");
Iterator<Emit> iterator = emits.iterator();
checkEmit(iterator.next(), 0, 2, "bcd");
@ -40,11 +43,12 @@ public class TrieTest {
@Test
public void ushersTest() {
Trie trie = new Trie();
trie.addKeyword("hers");
trie.addKeyword("his");
trie.addKeyword("she");
trie.addKeyword("he");
Trie trie = Trie.builder()
.addKeyword("hers")
.addKeyword("his")
.addKeyword("she")
.addKeyword("he")
.build();
Collection<Emit> emits = trie.parseText("ushers");
assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5
Iterator<Emit> iterator = emits.iterator();
@ -55,8 +59,9 @@ public class TrieTest {
@Test
public void misleadingTest() {
Trie trie = new Trie();
trie.addKeyword("hers");
Trie trie = Trie.builder()
.addKeyword("hers")
.build();
Collection<Emit> emits = trie.parseText("h he her hers");
Iterator<Emit> iterator = emits.iterator();
checkEmit(iterator.next(), 9, 12, "hers");
@ -64,11 +69,12 @@ public class TrieTest {
@Test
public void recipes() {
Trie trie = new Trie();
trie.addKeyword("veal");
trie.addKeyword("cauliflower");
trie.addKeyword("broccoli");
trie.addKeyword("tomatoes");
Trie trie = Trie.builder()
.addKeyword("veal")
.addKeyword("cauliflower")
.addKeyword("broccoli")
.addKeyword("tomatoes")
.build();
Collection<Emit> emits = trie.parseText("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli");
Iterator<Emit> iterator = emits.iterator();
checkEmit(iterator.next(), 2, 12, "cauliflower");
@ -79,9 +85,10 @@ public class TrieTest {
@Test
public void longAndShortOverlappingMatch() {
Trie trie = new Trie();
trie.addKeyword("he");
trie.addKeyword("hehehehe");
Trie trie = Trie.builder()
.addKeyword("he")
.addKeyword("hehehehe")
.build();
Collection<Emit> emits = trie.parseText("hehehehehe");
Iterator<Emit> iterator = emits.iterator();
checkEmit(iterator.next(), 0, 1, "he");
@ -95,10 +102,11 @@ public class TrieTest {
@Test
public void nonOverlapping() {
Trie trie = new Trie().removeOverlaps();
trie.addKeyword("ab");
trie.addKeyword("cba");
trie.addKeyword("ababc");
Trie trie = Trie.builder().removeOverlaps()
.addKeyword("ab")
.addKeyword("cba")
.addKeyword("ababc")
.build();
Collection<Emit> emits = trie.parseText("ababcbab");
assertEquals(2, emits.size());
Iterator<Emit> iterator = emits.iterator();
@ -109,25 +117,27 @@ public class TrieTest {
@Test
public void startOfChurchillSpeech() {
Trie trie = new Trie().removeOverlaps();
trie.addKeyword("T");
trie.addKeyword("u");
trie.addKeyword("ur");
trie.addKeyword("r");
trie.addKeyword("urn");
trie.addKeyword("ni");
trie.addKeyword("i");
trie.addKeyword("in");
trie.addKeyword("n");
trie.addKeyword("urning");
Trie trie = Trie.builder().removeOverlaps()
.addKeyword("T")
.addKeyword("u")
.addKeyword("ur")
.addKeyword("r")
.addKeyword("urn")
.addKeyword("ni")
.addKeyword("i")
.addKeyword("in")
.addKeyword("n")
.addKeyword("urning")
.build();
Collection<Emit> emits = trie.parseText("Turning");
assertEquals(2, emits.size());
}
@Test
public void partialMatch() {
Trie trie = new Trie().onlyWholeWords();
trie.addKeyword("sugar");
Trie trie = Trie.builder().onlyWholeWords()
.addKeyword("sugar")
.build();
Collection<Emit> emits = trie.parseText("sugarcane sugarcane sugar canesugar"); // left, middle, right test
assertEquals(1, emits.size()); // Match must not be made
checkEmit(emits.iterator().next(), 20, 24, "sugar");
@ -135,10 +145,11 @@ public class TrieTest {
@Test
public void tokenizeFullSentence() {
Trie trie = new Trie();
trie.addKeyword("Alpha");
trie.addKeyword("Beta");
trie.addKeyword("Gamma");
Trie trie = Trie.builder()
.addKeyword("Alpha")
.addKeyword("Beta")
.addKeyword("Gamma")
.build();
Collection<Token> tokens = trie.tokenize("Hear: Alpha team first, Beta from the rear, Gamma in reserve");
assertEquals(7, tokens.size());
Iterator<Token> tokensIt = tokens.iterator();
@ -153,11 +164,12 @@ public class TrieTest {
@Test
public void bug5InGithubReportedByXCurry() {
Trie trie = new Trie().caseInsensitive().onlyWholeWords();
trie.addKeyword("turning");
trie.addKeyword("once");
trie.addKeyword("again");
trie.addKeyword("börkü");
Trie trie = Trie.builder().caseInsensitive().onlyWholeWords()
.addKeyword("turning")
.addKeyword("once")
.addKeyword("again")
.addKeyword("börkü")
.build();
Collection<Emit> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ");
assertEquals(4, emits.size()); // Match must not be made
Iterator<Emit> it = emits.iterator();
@ -169,11 +181,12 @@ public class TrieTest {
@Test
public void caseInsensitive() {
Trie trie = new Trie().caseInsensitive();
trie.addKeyword("turning");
trie.addKeyword("once");
trie.addKeyword("again");
trie.addKeyword("börkü");
Trie trie = Trie.builder().caseInsensitive()
.addKeyword("turning")
.addKeyword("once")
.addKeyword("again")
.addKeyword("börkü")
.build();
Collection<Emit> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ");
assertEquals(4, emits.size()); // Match must not be made
Iterator<Emit> it = emits.iterator();
@ -185,10 +198,11 @@ public class TrieTest {
@Test
public void tokenizeTokensInSequence() {
Trie trie = new Trie();
trie.addKeyword("Alpha");
trie.addKeyword("Beta");
trie.addKeyword("Gamma");
Trie trie = Trie.builder()
.addKeyword("Alpha")
.addKeyword("Beta")
.addKeyword("Gamma")
.build();
Collection<Token> tokens = trie.tokenize("Alpha Beta Gamma");
assertEquals(5, tokens.size());
}
@ -196,8 +210,9 @@ public class TrieTest {
// Test offered by XCurry, https://github.com/robert-bor/aho-corasick/issues/7
@Test
public void zeroLengthTestBug7InGithubReportedByXCurry() {
Trie trie = new Trie().removeOverlaps().onlyWholeWords().caseInsensitive();
trie.addKeyword("");
Trie trie = Trie.builder().removeOverlaps().onlyWholeWords().caseInsensitive()
.addKeyword("")
.build();
trie.tokenize("Try a natural lip and subtle bronzer to keep all the focus on those big bright eyes with NARS Eyeshadow Duo in Rated R And the winner is... Boots No7 Advanced Renewal Anti-ageing Glycolic Peel Kit ($25 amazon.com) won most-appealing peel.");
}
@ -205,9 +220,10 @@ public class TrieTest {
@Test
public void unicodeIssueBug8ReportedByDwyerk() {
String target = "LİKE THIS"; // The second character ('İ') is Unicode, which was read by AC as a 2-byte char
Trie trie = new Trie().caseInsensitive().onlyWholeWords();
assertEquals("THIS", target.substring(5,9)); // Java does it the right way
trie.addKeyword("this");
Trie trie = Trie.builder().caseInsensitive().onlyWholeWords()
.addKeyword("this")
.build();
Collection<Emit> emits = trie.parseText(target);
assertEquals(1, emits.size());
Iterator<Emit> it = emits.iterator();