Issue #16: Use builder pattern to create Trie
Previously, there was a race condition in Trie#parseText because it called constructFailureStates on the first run without synchronization (see https://github.com/robert-bor/aho-corasick/issues/16). This commit fixes the race by using the builder pattern to create a fully initialized Trie. N.B.: This changes the API.
This commit is contained in:
parent
25eeef5168
commit
9bce51e001
40
README.md
40
README.md
@ -51,11 +51,12 @@ Usage
|
||||
-----
|
||||
Setting up the Trie is a piece of cake:
|
||||
```java
|
||||
Trie trie = new Trie();
|
||||
trie.addKeyword("hers");
|
||||
trie.addKeyword("his");
|
||||
trie.addKeyword("she");
|
||||
trie.addKeyword("he");
|
||||
Trie trie = Trie.builder()
|
||||
.addKeyword("hers")
|
||||
.addKeyword("his")
|
||||
.addKeyword("she")
|
||||
.addKeyword("he")
|
||||
.build();
|
||||
Collection<Emit> emits = trie.parseText("ushers");
|
||||
```
|
||||
|
||||
@ -68,9 +69,11 @@ In normal situations you probably want to remove overlapping instances, retainin
|
||||
matches.
|
||||
|
||||
```java
|
||||
Trie trie = new Trie().removeOverlaps();
|
||||
trie.addKeyword("hot");
|
||||
trie.addKeyword("hot chocolate");
|
||||
Trie trie = Trie.builder()
|
||||
.removeOverlaps()
|
||||
.addKeyword("hot")
|
||||
.addKeyword("hot chocolate")
|
||||
.build();
|
||||
Collection<Emit> emits = trie.parseText("hot chocolate");
|
||||
```
|
||||
|
||||
@ -82,8 +85,10 @@ There is only one result now:
|
||||
If you want the algorithm to only check for whole words, you can tell the Trie to do so:
|
||||
|
||||
```java
|
||||
Trie trie = new Trie().onlyWholeWords();
|
||||
trie.addKeyword("sugar");
|
||||
Trie trie = Trie.builder()
|
||||
.onlyWholeWords()
|
||||
.addKeyword("sugar")
|
||||
.build();
|
||||
Collection<Emit> emits = trie.parseText("sugarcane sugarcane sugar canesugar");
|
||||
```
|
||||
|
||||
@ -94,8 +99,10 @@ Some text are WrItTeN in combinations of lowercase and uppercase and therefore h
|
||||
the Trie to lowercase the entire searchtext to ease the matching process.
|
||||
|
||||
```java
|
||||
Trie trie = new Trie().caseInsensitive();
|
||||
trie.addKeyword("casing");
|
||||
Trie trie = Trie.builder()
|
||||
.caseInsensitive()
|
||||
.addKeyword("casing")
|
||||
.build();
|
||||
Collection<Emit> emits = trie.parseText("CaSiNg");
|
||||
```
|
||||
|
||||
@ -111,10 +118,11 @@ matches as soon as you encounter them. Let's look at an example where we want to
|
||||
String speech = "The Answer to the Great Question... Of Life, " +
|
||||
"the Universe and Everything... Is... Forty-two,' said " +
|
||||
"Deep Thought, with infinite majesty and calm.";
|
||||
Trie trie = new Trie().removeOverlaps().onlyWholeWords().caseInsensitive();
|
||||
trie.addKeyword("great question");
|
||||
trie.addKeyword("forty-two");
|
||||
trie.addKeyword("deep thought");
|
||||
Trie trie = Trie.builder().removeOverlaps().onlyWholeWords().caseInsensitive()
|
||||
.addKeyword("great question")
|
||||
.addKeyword("forty-two")
|
||||
.addKeyword("deep thought")
|
||||
.build();
|
||||
Collection<Token> tokens = trie.tokenize(speech);
|
||||
StringBuffer html = new StringBuffer();
|
||||
html.append("<html><body><p>");
|
||||
|
||||
@ -20,33 +20,12 @@ public class Trie {
|
||||
|
||||
private State rootState;
|
||||
|
||||
private boolean failureStatesConstructed = false;
|
||||
|
||||
public Trie(TrieConfig trieConfig) {
|
||||
private Trie(TrieConfig trieConfig) {
|
||||
this.trieConfig = trieConfig;
|
||||
this.rootState = new State();
|
||||
}
|
||||
|
||||
public Trie() {
|
||||
this(new TrieConfig());
|
||||
}
|
||||
|
||||
public Trie caseInsensitive() {
|
||||
this.trieConfig.setCaseInsensitive(true);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Trie removeOverlaps() {
|
||||
this.trieConfig.setAllowOverlaps(false);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Trie onlyWholeWords() {
|
||||
this.trieConfig.setOnlyWholeWords(true);
|
||||
return this;
|
||||
}
|
||||
|
||||
public void addKeyword(String keyword) {
|
||||
private void addKeyword(String keyword) {
|
||||
if (keyword == null || keyword.length() == 0) {
|
||||
return;
|
||||
}
|
||||
@ -87,8 +66,6 @@ public class Trie {
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public Collection<Emit> parseText(String text) {
|
||||
checkForConstructedFailureStates();
|
||||
|
||||
int position = 0;
|
||||
State currentState = this.rootState;
|
||||
List<Emit> collectedEmits = new ArrayList<Emit>();
|
||||
@ -140,12 +117,6 @@ public class Trie {
|
||||
return newCurrentState;
|
||||
}
|
||||
|
||||
private void checkForConstructedFailureStates() {
|
||||
if (!this.failureStatesConstructed) {
|
||||
constructFailureStates();
|
||||
}
|
||||
}
|
||||
|
||||
private void constructFailureStates() {
|
||||
Queue<State> queue = new LinkedBlockingDeque<State>();
|
||||
|
||||
@ -154,7 +125,6 @@ public class Trie {
|
||||
depthOneState.setFailure(this.rootState);
|
||||
queue.add(depthOneState);
|
||||
}
|
||||
this.failureStatesConstructed = true;
|
||||
|
||||
// Second, determine the fail state for all depth > 1 state
|
||||
while (!queue.isEmpty()) {
|
||||
@ -184,4 +154,41 @@ public class Trie {
|
||||
}
|
||||
}
|
||||
|
||||
public static TrieBuilder builder() {
|
||||
return new TrieBuilder();
|
||||
}
|
||||
|
||||
public static class TrieBuilder {
|
||||
|
||||
private TrieConfig trieConfig = new TrieConfig();
|
||||
|
||||
private Trie trie = new Trie(trieConfig);
|
||||
|
||||
private TrieBuilder() {}
|
||||
|
||||
public TrieBuilder caseInsensitive() {
|
||||
this.trieConfig.setCaseInsensitive(true);
|
||||
return this;
|
||||
}
|
||||
|
||||
public TrieBuilder removeOverlaps() {
|
||||
this.trieConfig.setAllowOverlaps(false);
|
||||
return this;
|
||||
}
|
||||
|
||||
public TrieBuilder onlyWholeWords() {
|
||||
this.trieConfig.setOnlyWholeWords(true);
|
||||
return this;
|
||||
}
|
||||
|
||||
public TrieBuilder addKeyword(String keyword) {
|
||||
trie.addKeyword(keyword);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Trie build() {
|
||||
trie.constructFailureStates();
|
||||
return trie;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -11,8 +11,9 @@ public class TrieTest {
|
||||
|
||||
@Test
|
||||
public void keywordAndTextAreTheSame() {
|
||||
Trie trie = new Trie();
|
||||
trie.addKeyword("abc");
|
||||
Trie trie = Trie.builder()
|
||||
.addKeyword("abc")
|
||||
.build();
|
||||
Collection<Emit> emits = trie.parseText("abc");
|
||||
Iterator<Emit> iterator = emits.iterator();
|
||||
checkEmit(iterator.next(), 0, 2, "abc");
|
||||
@ -20,8 +21,9 @@ public class TrieTest {
|
||||
|
||||
@Test
|
||||
public void textIsLongerThanKeyword() {
|
||||
Trie trie = new Trie();
|
||||
trie.addKeyword("abc");
|
||||
Trie trie = Trie.builder()
|
||||
.addKeyword("abc")
|
||||
.build();
|
||||
Collection<Emit> emits = trie.parseText(" abc");
|
||||
Iterator<Emit> iterator = emits.iterator();
|
||||
checkEmit(iterator.next(), 1, 3, "abc");
|
||||
@ -29,10 +31,11 @@ public class TrieTest {
|
||||
|
||||
@Test
|
||||
public void variousKeywordsOneMatch() {
|
||||
Trie trie = new Trie();
|
||||
trie.addKeyword("abc");
|
||||
trie.addKeyword("bcd");
|
||||
trie.addKeyword("cde");
|
||||
Trie trie = Trie.builder()
|
||||
.addKeyword("abc")
|
||||
.addKeyword("bcd")
|
||||
.addKeyword("cde")
|
||||
.build();
|
||||
Collection<Emit> emits = trie.parseText("bcd");
|
||||
Iterator<Emit> iterator = emits.iterator();
|
||||
checkEmit(iterator.next(), 0, 2, "bcd");
|
||||
@ -40,11 +43,12 @@ public class TrieTest {
|
||||
|
||||
@Test
|
||||
public void ushersTest() {
|
||||
Trie trie = new Trie();
|
||||
trie.addKeyword("hers");
|
||||
trie.addKeyword("his");
|
||||
trie.addKeyword("she");
|
||||
trie.addKeyword("he");
|
||||
Trie trie = Trie.builder()
|
||||
.addKeyword("hers")
|
||||
.addKeyword("his")
|
||||
.addKeyword("she")
|
||||
.addKeyword("he")
|
||||
.build();
|
||||
Collection<Emit> emits = trie.parseText("ushers");
|
||||
assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5
|
||||
Iterator<Emit> iterator = emits.iterator();
|
||||
@ -55,8 +59,9 @@ public class TrieTest {
|
||||
|
||||
@Test
|
||||
public void misleadingTest() {
|
||||
Trie trie = new Trie();
|
||||
trie.addKeyword("hers");
|
||||
Trie trie = Trie.builder()
|
||||
.addKeyword("hers")
|
||||
.build();
|
||||
Collection<Emit> emits = trie.parseText("h he her hers");
|
||||
Iterator<Emit> iterator = emits.iterator();
|
||||
checkEmit(iterator.next(), 9, 12, "hers");
|
||||
@ -64,11 +69,12 @@ public class TrieTest {
|
||||
|
||||
@Test
|
||||
public void recipes() {
|
||||
Trie trie = new Trie();
|
||||
trie.addKeyword("veal");
|
||||
trie.addKeyword("cauliflower");
|
||||
trie.addKeyword("broccoli");
|
||||
trie.addKeyword("tomatoes");
|
||||
Trie trie = Trie.builder()
|
||||
.addKeyword("veal")
|
||||
.addKeyword("cauliflower")
|
||||
.addKeyword("broccoli")
|
||||
.addKeyword("tomatoes")
|
||||
.build();
|
||||
Collection<Emit> emits = trie.parseText("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli");
|
||||
Iterator<Emit> iterator = emits.iterator();
|
||||
checkEmit(iterator.next(), 2, 12, "cauliflower");
|
||||
@ -79,9 +85,10 @@ public class TrieTest {
|
||||
|
||||
@Test
|
||||
public void longAndShortOverlappingMatch() {
|
||||
Trie trie = new Trie();
|
||||
trie.addKeyword("he");
|
||||
trie.addKeyword("hehehehe");
|
||||
Trie trie = Trie.builder()
|
||||
.addKeyword("he")
|
||||
.addKeyword("hehehehe")
|
||||
.build();
|
||||
Collection<Emit> emits = trie.parseText("hehehehehe");
|
||||
Iterator<Emit> iterator = emits.iterator();
|
||||
checkEmit(iterator.next(), 0, 1, "he");
|
||||
@ -95,10 +102,11 @@ public class TrieTest {
|
||||
|
||||
@Test
|
||||
public void nonOverlapping() {
|
||||
Trie trie = new Trie().removeOverlaps();
|
||||
trie.addKeyword("ab");
|
||||
trie.addKeyword("cba");
|
||||
trie.addKeyword("ababc");
|
||||
Trie trie = Trie.builder().removeOverlaps()
|
||||
.addKeyword("ab")
|
||||
.addKeyword("cba")
|
||||
.addKeyword("ababc")
|
||||
.build();
|
||||
Collection<Emit> emits = trie.parseText("ababcbab");
|
||||
assertEquals(2, emits.size());
|
||||
Iterator<Emit> iterator = emits.iterator();
|
||||
@ -109,25 +117,27 @@ public class TrieTest {
|
||||
|
||||
@Test
|
||||
public void startOfChurchillSpeech() {
|
||||
Trie trie = new Trie().removeOverlaps();
|
||||
trie.addKeyword("T");
|
||||
trie.addKeyword("u");
|
||||
trie.addKeyword("ur");
|
||||
trie.addKeyword("r");
|
||||
trie.addKeyword("urn");
|
||||
trie.addKeyword("ni");
|
||||
trie.addKeyword("i");
|
||||
trie.addKeyword("in");
|
||||
trie.addKeyword("n");
|
||||
trie.addKeyword("urning");
|
||||
Trie trie = Trie.builder().removeOverlaps()
|
||||
.addKeyword("T")
|
||||
.addKeyword("u")
|
||||
.addKeyword("ur")
|
||||
.addKeyword("r")
|
||||
.addKeyword("urn")
|
||||
.addKeyword("ni")
|
||||
.addKeyword("i")
|
||||
.addKeyword("in")
|
||||
.addKeyword("n")
|
||||
.addKeyword("urning")
|
||||
.build();
|
||||
Collection<Emit> emits = trie.parseText("Turning");
|
||||
assertEquals(2, emits.size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void partialMatch() {
|
||||
Trie trie = new Trie().onlyWholeWords();
|
||||
trie.addKeyword("sugar");
|
||||
Trie trie = Trie.builder().onlyWholeWords()
|
||||
.addKeyword("sugar")
|
||||
.build();
|
||||
Collection<Emit> emits = trie.parseText("sugarcane sugarcane sugar canesugar"); // left, middle, right test
|
||||
assertEquals(1, emits.size()); // Match must not be made
|
||||
checkEmit(emits.iterator().next(), 20, 24, "sugar");
|
||||
@ -135,10 +145,11 @@ public class TrieTest {
|
||||
|
||||
@Test
|
||||
public void tokenizeFullSentence() {
|
||||
Trie trie = new Trie();
|
||||
trie.addKeyword("Alpha");
|
||||
trie.addKeyword("Beta");
|
||||
trie.addKeyword("Gamma");
|
||||
Trie trie = Trie.builder()
|
||||
.addKeyword("Alpha")
|
||||
.addKeyword("Beta")
|
||||
.addKeyword("Gamma")
|
||||
.build();
|
||||
Collection<Token> tokens = trie.tokenize("Hear: Alpha team first, Beta from the rear, Gamma in reserve");
|
||||
assertEquals(7, tokens.size());
|
||||
Iterator<Token> tokensIt = tokens.iterator();
|
||||
@ -153,11 +164,12 @@ public class TrieTest {
|
||||
|
||||
@Test
|
||||
public void bug5InGithubReportedByXCurry() {
|
||||
Trie trie = new Trie().caseInsensitive().onlyWholeWords();
|
||||
trie.addKeyword("turning");
|
||||
trie.addKeyword("once");
|
||||
trie.addKeyword("again");
|
||||
trie.addKeyword("börkü");
|
||||
Trie trie = Trie.builder().caseInsensitive().onlyWholeWords()
|
||||
.addKeyword("turning")
|
||||
.addKeyword("once")
|
||||
.addKeyword("again")
|
||||
.addKeyword("börkü")
|
||||
.build();
|
||||
Collection<Emit> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ");
|
||||
assertEquals(4, emits.size()); // Match must not be made
|
||||
Iterator<Emit> it = emits.iterator();
|
||||
@ -169,11 +181,12 @@ public class TrieTest {
|
||||
|
||||
@Test
|
||||
public void caseInsensitive() {
|
||||
Trie trie = new Trie().caseInsensitive();
|
||||
trie.addKeyword("turning");
|
||||
trie.addKeyword("once");
|
||||
trie.addKeyword("again");
|
||||
trie.addKeyword("börkü");
|
||||
Trie trie = Trie.builder().caseInsensitive()
|
||||
.addKeyword("turning")
|
||||
.addKeyword("once")
|
||||
.addKeyword("again")
|
||||
.addKeyword("börkü")
|
||||
.build();
|
||||
Collection<Emit> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ");
|
||||
assertEquals(4, emits.size()); // Match must not be made
|
||||
Iterator<Emit> it = emits.iterator();
|
||||
@ -185,10 +198,11 @@ public class TrieTest {
|
||||
|
||||
@Test
|
||||
public void tokenizeTokensInSequence() {
|
||||
Trie trie = new Trie();
|
||||
trie.addKeyword("Alpha");
|
||||
trie.addKeyword("Beta");
|
||||
trie.addKeyword("Gamma");
|
||||
Trie trie = Trie.builder()
|
||||
.addKeyword("Alpha")
|
||||
.addKeyword("Beta")
|
||||
.addKeyword("Gamma")
|
||||
.build();
|
||||
Collection<Token> tokens = trie.tokenize("Alpha Beta Gamma");
|
||||
assertEquals(5, tokens.size());
|
||||
}
|
||||
@ -196,8 +210,9 @@ public class TrieTest {
|
||||
// Test offered by XCurry, https://github.com/robert-bor/aho-corasick/issues/7
|
||||
@Test
|
||||
public void zeroLengthTestBug7InGithubReportedByXCurry() {
|
||||
Trie trie = new Trie().removeOverlaps().onlyWholeWords().caseInsensitive();
|
||||
trie.addKeyword("");
|
||||
Trie trie = Trie.builder().removeOverlaps().onlyWholeWords().caseInsensitive()
|
||||
.addKeyword("")
|
||||
.build();
|
||||
trie.tokenize("Try a natural lip and subtle bronzer to keep all the focus on those big bright eyes with NARS Eyeshadow Duo in Rated R And the winner is... Boots No7 Advanced Renewal Anti-ageing Glycolic Peel Kit ($25 amazon.com) won most-appealing peel.");
|
||||
}
|
||||
|
||||
@ -205,9 +220,10 @@ public class TrieTest {
|
||||
@Test
|
||||
public void unicodeIssueBug8ReportedByDwyerk() {
|
||||
String target = "LİKE THIS"; // The second character ('İ') is Unicode, which was read by AC as a 2-byte char
|
||||
Trie trie = new Trie().caseInsensitive().onlyWholeWords();
|
||||
assertEquals("THIS", target.substring(5,9)); // Java does it the right way
|
||||
trie.addKeyword("this");
|
||||
Trie trie = Trie.builder().caseInsensitive().onlyWholeWords()
|
||||
.addKeyword("this")
|
||||
.build();
|
||||
Collection<Emit> emits = trie.parseText(target);
|
||||
assertEquals(1, emits.size());
|
||||
Iterator<Emit> it = emits.iterator();
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user