From 9bce51e00121ec8281e6ed8ebc3fe05b2d43407c Mon Sep 17 00:00:00 2001 From: Petter Remen Date: Fri, 3 Jul 2015 12:29:31 +0200 Subject: [PATCH] Issue #16 Use builder pattern to create Trie Previously, there was a race condition in Trie#parseText since it called constructFailureStates on first run without synchronization. See https://github.com/robert-bor/aho-corasick/issues/16 This commit fixes this by using the builder pattern in order to create a fully initialized Trie. N.B. This changes the API --- README.md | 40 +++-- src/main/java/org/ahocorasick/trie/Trie.java | 71 +++++---- .../java/org/ahocorasick/trie/TrieTest.java | 140 ++++++++++-------- 3 files changed, 141 insertions(+), 110 deletions(-) diff --git a/README.md b/README.md index ca79ef4..26a1226 100644 --- a/README.md +++ b/README.md @@ -51,11 +51,12 @@ Usage ----- Setting up the Trie is a piece of cake: ```java - Trie trie = new Trie(); - trie.addKeyword("hers"); - trie.addKeyword("his"); - trie.addKeyword("she"); - trie.addKeyword("he"); + Trie trie = Trie.builder() + .addKeyword("hers") + .addKeyword("his") + .addKeyword("she") + .addKeyword("he") + .build(); Collection emits = trie.parseText("ushers"); ``` @@ -68,9 +69,11 @@ In normal situations you probably want to remove overlapping instances, retainin matches. ```java - Trie trie = new Trie().removeOverlaps(); - trie.addKeyword("hot"); - trie.addKeyword("hot chocolate"); + Trie trie = Trie.builder() + .removeOverlaps() + .addKeyword("hot") + .addKeyword("hot chocolate") + .build(); Collection emits = trie.parseText("hot chocolate"); ``` @@ -82,8 +85,10 @@ There is only one result now: If you want the algorithm to only check for whole words, you can tell the Trie to do so: ```java - Trie trie = new Trie().onlyWholeWords(); - trie.addKeyword("sugar"); + Trie trie = Trie.builder() + .onlyWholeWords() + .addKeyword("sugar") + .build(); Collection emits = trie.parseText("sugarcane sugarcane sugar canesugar"); ``` @@ -94,8 +99,10 @@ Some text are WrItTeN in combinations of lowercase and uppercase and therefore h the Trie to lowercase the entire searchtext to ease the matching process. ```java - Trie trie = new Trie().caseInsensitive(); - trie.addKeyword("casing"); + Trie trie = Trie.builder() + .caseInsensitive() + .addKeyword("casing") + .build(); Collection emits = trie.parseText("CaSiNg"); ``` @@ -111,10 +118,11 @@ matches as soon as you encounter them. Let's look at an example where we want to String speech = "The Answer to the Great Question... Of Life, " + "the Universe and Everything... Is... Forty-two,' said " + "Deep Thought, with infinite majesty and calm."; - Trie trie = new Trie().removeOverlaps().onlyWholeWords().caseInsensitive(); - trie.addKeyword("great question"); - trie.addKeyword("forty-two"); - trie.addKeyword("deep thought"); + Trie trie = Trie.builder().removeOverlaps().onlyWholeWords().caseInsensitive() + .addKeyword("great question") + .addKeyword("forty-two") + .addKeyword("deep thought") + .build(); Collection tokens = trie.tokenize(speech); StringBuffer html = new StringBuffer(); html.append("

"); diff --git a/src/main/java/org/ahocorasick/trie/Trie.java b/src/main/java/org/ahocorasick/trie/Trie.java index 155698b..fa9b23d 100644 --- a/src/main/java/org/ahocorasick/trie/Trie.java +++ b/src/main/java/org/ahocorasick/trie/Trie.java @@ -20,33 +20,12 @@ public class Trie { private State rootState; - private boolean failureStatesConstructed = false; - - public Trie(TrieConfig trieConfig) { + private Trie(TrieConfig trieConfig) { this.trieConfig = trieConfig; this.rootState = new State(); } - public Trie() { - this(new TrieConfig()); - } - - public Trie caseInsensitive() { - this.trieConfig.setCaseInsensitive(true); - return this; - } - - public Trie removeOverlaps() { - this.trieConfig.setAllowOverlaps(false); - return this; - } - - public Trie onlyWholeWords() { - this.trieConfig.setOnlyWholeWords(true); - return this; - } - - public void addKeyword(String keyword) { + private void addKeyword(String keyword) { if (keyword == null || keyword.length() == 0) { return; } @@ -87,8 +66,6 @@ public class Trie { @SuppressWarnings("unchecked") public Collection parseText(String text) { - checkForConstructedFailureStates(); - int position = 0; State currentState = this.rootState; List collectedEmits = new ArrayList(); @@ -140,12 +117,6 @@ public class Trie { return newCurrentState; } - private void checkForConstructedFailureStates() { - if (!this.failureStatesConstructed) { - constructFailureStates(); - } - } - private void constructFailureStates() { Queue queue = new LinkedBlockingDeque(); @@ -154,7 +125,6 @@ public class Trie { depthOneState.setFailure(this.rootState); queue.add(depthOneState); } - this.failureStatesConstructed = true; // Second, determine the fail state for all depth > 1 state while (!queue.isEmpty()) { @@ -184,4 +154,41 @@ public class Trie { } } + public static TrieBuilder builder() { + return new TrieBuilder(); + } + + public static class TrieBuilder { + + private TrieConfig trieConfig = new TrieConfig(); + + private Trie trie = new Trie(trieConfig); + + private TrieBuilder() {} + + public TrieBuilder caseInsensitive() { + this.trieConfig.setCaseInsensitive(true); + return this; + } + + public TrieBuilder removeOverlaps() { + this.trieConfig.setAllowOverlaps(false); + return this; + } + + public TrieBuilder onlyWholeWords() { + this.trieConfig.setOnlyWholeWords(true); + return this; + } + + public TrieBuilder addKeyword(String keyword) { + trie.addKeyword(keyword); + return this; + } + + public Trie build() { + trie.constructFailureStates(); + return trie; + } + } } diff --git a/src/test/java/org/ahocorasick/trie/TrieTest.java b/src/test/java/org/ahocorasick/trie/TrieTest.java index 6cc7ff7..548d6bf 100644 --- a/src/test/java/org/ahocorasick/trie/TrieTest.java +++ b/src/test/java/org/ahocorasick/trie/TrieTest.java @@ -11,8 +11,9 @@ public class TrieTest { @Test public void keywordAndTextAreTheSame() { - Trie trie = new Trie(); - trie.addKeyword("abc"); + Trie trie = Trie.builder() + .addKeyword("abc") + .build(); Collection emits = trie.parseText("abc"); Iterator iterator = emits.iterator(); checkEmit(iterator.next(), 0, 2, "abc"); @@ -20,8 +21,9 @@ public class TrieTest { @Test public void textIsLongerThanKeyword() { - Trie trie = new Trie(); - trie.addKeyword("abc"); + Trie trie = Trie.builder() + .addKeyword("abc") + .build(); Collection emits = trie.parseText(" abc"); Iterator iterator = emits.iterator(); checkEmit(iterator.next(), 1, 3, "abc"); @@ -29,10 +31,11 @@ public class TrieTest { @Test public void variousKeywordsOneMatch() { - Trie trie = new Trie(); - trie.addKeyword("abc"); - trie.addKeyword("bcd"); - trie.addKeyword("cde"); + Trie trie = Trie.builder() + .addKeyword("abc") + .addKeyword("bcd") + .addKeyword("cde") + .build(); Collection emits = trie.parseText("bcd"); Iterator iterator = emits.iterator(); checkEmit(iterator.next(), 0, 2, "bcd"); @@ -40,11 +43,12 @@ public class TrieTest { @Test public void ushersTest() { - Trie trie = new Trie(); - trie.addKeyword("hers"); - trie.addKeyword("his"); - trie.addKeyword("she"); - trie.addKeyword("he"); + Trie trie = Trie.builder() + .addKeyword("hers") + .addKeyword("his") + .addKeyword("she") + .addKeyword("he") + .build(); Collection emits = trie.parseText("ushers"); assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5 Iterator iterator = emits.iterator(); @@ -55,8 +59,9 @@ public class TrieTest { @Test public void misleadingTest() { - Trie trie = new Trie(); - trie.addKeyword("hers"); + Trie trie = Trie.builder() + .addKeyword("hers") + .build(); Collection emits = trie.parseText("h he her hers"); Iterator iterator = emits.iterator(); checkEmit(iterator.next(), 9, 12, "hers"); @@ -64,11 +69,12 @@ public class TrieTest { @Test public void recipes() { - Trie trie = new Trie(); - trie.addKeyword("veal"); - trie.addKeyword("cauliflower"); - trie.addKeyword("broccoli"); - trie.addKeyword("tomatoes"); + Trie trie = Trie.builder() + .addKeyword("veal") + .addKeyword("cauliflower") + .addKeyword("broccoli") + .addKeyword("tomatoes") + .build(); Collection emits = trie.parseText("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli"); Iterator iterator = emits.iterator(); checkEmit(iterator.next(), 2, 12, "cauliflower"); @@ -79,9 +85,10 @@ public class TrieTest { @Test public void longAndShortOverlappingMatch() { - Trie trie = new Trie(); - trie.addKeyword("he"); - trie.addKeyword("hehehehe"); + Trie trie = Trie.builder() + .addKeyword("he") + .addKeyword("hehehehe") + .build(); Collection emits = trie.parseText("hehehehehe"); Iterator iterator = emits.iterator(); checkEmit(iterator.next(), 0, 1, "he"); @@ -95,10 +102,11 @@ public class TrieTest { @Test public void nonOverlapping() { - Trie trie = new Trie().removeOverlaps(); - trie.addKeyword("ab"); - trie.addKeyword("cba"); - trie.addKeyword("ababc"); + Trie trie = Trie.builder().removeOverlaps() + .addKeyword("ab") + .addKeyword("cba") + .addKeyword("ababc") + .build(); Collection emits = trie.parseText("ababcbab"); assertEquals(2, emits.size()); Iterator iterator = emits.iterator(); @@ -109,25 +117,27 @@ public class TrieTest { @Test public void startOfChurchillSpeech() { - Trie trie = new Trie().removeOverlaps(); - trie.addKeyword("T"); - trie.addKeyword("u"); - trie.addKeyword("ur"); - trie.addKeyword("r"); - trie.addKeyword("urn"); - trie.addKeyword("ni"); - trie.addKeyword("i"); - trie.addKeyword("in"); - trie.addKeyword("n"); - trie.addKeyword("urning"); + Trie trie = Trie.builder().removeOverlaps() + .addKeyword("T") + .addKeyword("u") + .addKeyword("ur") + .addKeyword("r") + .addKeyword("urn") + .addKeyword("ni") + .addKeyword("i") + .addKeyword("in") + .addKeyword("n") + .addKeyword("urning") + .build(); Collection emits = trie.parseText("Turning"); assertEquals(2, emits.size()); } @Test public void partialMatch() { - Trie trie = new Trie().onlyWholeWords(); - trie.addKeyword("sugar"); + Trie trie = Trie.builder().onlyWholeWords() + .addKeyword("sugar") + .build(); Collection emits = trie.parseText("sugarcane sugarcane sugar canesugar"); // left, middle, right test assertEquals(1, emits.size()); // Match must not be made checkEmit(emits.iterator().next(), 20, 24, "sugar"); @@ -135,10 +145,11 @@ public class TrieTest { @Test public void tokenizeFullSentence() { - Trie trie = new Trie(); - trie.addKeyword("Alpha"); - trie.addKeyword("Beta"); - trie.addKeyword("Gamma"); + Trie trie = Trie.builder() + .addKeyword("Alpha") + .addKeyword("Beta") + .addKeyword("Gamma") + .build(); Collection tokens = trie.tokenize("Hear: Alpha team first, Beta from the rear, Gamma in reserve"); assertEquals(7, tokens.size()); Iterator tokensIt = tokens.iterator(); @@ -153,11 +164,12 @@ public class TrieTest { @Test public void bug5InGithubReportedByXCurry() { - Trie trie = new Trie().caseInsensitive().onlyWholeWords(); - trie.addKeyword("turning"); - trie.addKeyword("once"); - trie.addKeyword("again"); - trie.addKeyword("börkü"); + Trie trie = Trie.builder().caseInsensitive().onlyWholeWords() + .addKeyword("turning") + .addKeyword("once") + .addKeyword("again") + .addKeyword("börkü") + .build(); Collection emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ"); assertEquals(4, emits.size()); // Match must not be made Iterator it = emits.iterator(); @@ -169,11 +181,12 @@ public class TrieTest { @Test public void caseInsensitive() { - Trie trie = new Trie().caseInsensitive(); - trie.addKeyword("turning"); - trie.addKeyword("once"); - trie.addKeyword("again"); - trie.addKeyword("börkü"); + Trie trie = Trie.builder().caseInsensitive() + .addKeyword("turning") + .addKeyword("once") + .addKeyword("again") + .addKeyword("börkü") + .build(); Collection emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ"); assertEquals(4, emits.size()); // Match must not be made Iterator it = emits.iterator(); @@ -185,10 +198,11 @@ public class TrieTest { @Test public void tokenizeTokensInSequence() { - Trie trie = new Trie(); - trie.addKeyword("Alpha"); - trie.addKeyword("Beta"); - trie.addKeyword("Gamma"); + Trie trie = Trie.builder() + .addKeyword("Alpha") + .addKeyword("Beta") + .addKeyword("Gamma") + .build(); Collection tokens = trie.tokenize("Alpha Beta Gamma"); assertEquals(5, tokens.size()); } @@ -196,8 +210,9 @@ public class TrieTest { // Test offered by XCurry, https://github.com/robert-bor/aho-corasick/issues/7 @Test public void zeroLengthTestBug7InGithubReportedByXCurry() { - Trie trie = new Trie().removeOverlaps().onlyWholeWords().caseInsensitive(); - trie.addKeyword(""); + Trie trie = Trie.builder().removeOverlaps().onlyWholeWords().caseInsensitive() + .addKeyword("") + .build(); trie.tokenize("Try a natural lip and subtle bronzer to keep all the focus on those big bright eyes with NARS Eyeshadow Duo in Rated R And the winner is... Boots No7 Advanced Renewal Anti-ageing Glycolic Peel Kit ($25 amazon.com) won most-appealing peel."); } @@ -205,9 +220,10 @@ public class TrieTest { @Test public void unicodeIssueBug8ReportedByDwyerk() { String target = "LİKE THIS"; // The second character ('İ') is Unicode, which was read by AC as a 2-byte char - Trie trie = new Trie().caseInsensitive().onlyWholeWords(); assertEquals("THIS", target.substring(5,9)); // Java does it the right way - trie.addKeyword("this"); + Trie trie = Trie.builder().caseInsensitive().onlyWholeWords() + .addKeyword("this") + .build(); Collection emits = trie.parseText(target); assertEquals(1, emits.size()); Iterator it = emits.iterator();