Merge branch 'remen-master' into feature/footprint-reduction

Conflicts:
	src/main/java/org/ahocorasick/trie/Trie.java
This commit is contained in:
robert-bor 2015-09-22 20:06:04 +02:00
commit fcefdfdaf9
3 changed files with 141 additions and 113 deletions

View File

@ -51,11 +51,12 @@ Usage
-----
Setting up the Trie is a piece of cake:
```java
Trie trie = new Trie();
trie.addKeyword("hers");
trie.addKeyword("his");
trie.addKeyword("she");
trie.addKeyword("he");
Trie trie = Trie.builder()
.addKeyword("hers")
.addKeyword("his")
.addKeyword("she")
.addKeyword("he")
.build();
Collection<Emit> emits = trie.parseText("ushers");
```
@ -68,9 +69,11 @@ In normal situations you probably want to remove overlapping instances, retainin
matches.
```java
Trie trie = new Trie().removeOverlaps();
trie.addKeyword("hot");
trie.addKeyword("hot chocolate");
Trie trie = Trie.builder()
.removeOverlaps()
.addKeyword("hot")
.addKeyword("hot chocolate")
.build();
Collection<Emit> emits = trie.parseText("hot chocolate");
```
@ -82,8 +85,10 @@ There is only one result now:
If you want the algorithm to only check for whole words, you can tell the Trie to do so:
```java
Trie trie = new Trie().onlyWholeWords();
trie.addKeyword("sugar");
Trie trie = Trie.builder()
.onlyWholeWords()
.addKeyword("sugar")
.build();
Collection<Emit> emits = trie.parseText("sugarcane sugarcane sugar canesugar");
```
@ -94,8 +99,10 @@ Some text are WrItTeN in combinations of lowercase and uppercase and therefore h
the Trie to lowercase the entire search text to ease the matching process.
```java
Trie trie = new Trie().caseInsensitive();
trie.addKeyword("casing");
Trie trie = Trie.builder()
.caseInsensitive()
.addKeyword("casing")
.build();
Collection<Emit> emits = trie.parseText("CaSiNg");
```
@ -111,10 +118,11 @@ matches as soon as you encounter them. Let's look at an example where we want to
String speech = "The Answer to the Great Question... Of Life, " +
"the Universe and Everything... Is... Forty-two,' said " +
"Deep Thought, with infinite majesty and calm.";
Trie trie = new Trie().removeOverlaps().onlyWholeWords().caseInsensitive();
trie.addKeyword("great question");
trie.addKeyword("forty-two");
trie.addKeyword("deep thought");
Trie trie = Trie.builder().removeOverlaps().onlyWholeWords().caseInsensitive()
.addKeyword("great question")
.addKeyword("forty-two")
.addKeyword("deep thought")
.build();
Collection<Token> tokens = trie.tokenize(speech);
StringBuffer html = new StringBuffer();
html.append("<html><body><p>");

View File

@ -22,38 +22,12 @@ public class Trie {
private State rootState;
private boolean failureStatesConstructed = false;
public Trie(TrieConfig trieConfig) {
private Trie(TrieConfig trieConfig) {
this.trieConfig = trieConfig;
this.rootState = new State();
}
public Trie() {
this(new TrieConfig());
}
public Trie caseInsensitive() {
this.trieConfig.setCaseInsensitive(true);
return this;
}
public Trie removeOverlaps() {
this.trieConfig.setAllowOverlaps(false);
return this;
}
public Trie onlyWholeWords() {
this.trieConfig.setOnlyWholeWords(true);
return this;
}
public Trie stopOnHit() {
this.trieConfig.setStopOnHit(true);
return this;
}
public void addKeyword(String keyword) {
private void addKeyword(String keyword) {
if (keyword == null || keyword.length() == 0) {
return;
}
@ -155,12 +129,6 @@ public class Trie {
return newCurrentState;
}
private void checkForConstructedFailureStates() {
if (!this.failureStatesConstructed) {
constructFailureStates();
}
}
private void constructFailureStates() {
Queue<State> queue = new LinkedBlockingDeque<State>();
@ -169,7 +137,6 @@ public class Trie {
depthOneState.setFailure(this.rootState);
queue.add(depthOneState);
}
this.failureStatesConstructed = true;
// Second, determine the fail state for all depth > 1 state
while (!queue.isEmpty()) {
@ -202,4 +169,41 @@ public class Trie {
return emitted;
}
/**
 * Creates a new {@link TrieBuilder} for configuring and constructing a Trie.
 *
 * @return a fresh builder instance
 */
public static TrieBuilder builder() {
return new TrieBuilder();
}
/**
 * Fluent builder that configures a {@link Trie}, feeds it keywords and finally
 * constructs its failure states.
 *
 * <p>The builder owns a single Trie instance that is created together with the
 * builder; every call to {@link #build()} returns that same instance.
 */
public static class TrieBuilder {

    /** Configuration shared with the Trie under construction. */
    private final TrieConfig trieConfig = new TrieConfig();

    /** The Trie being assembled; created eagerly so keywords can be added right away. */
    private final Trie trie = new Trie(trieConfig);

    /** Instances are obtained through the enclosing class's static factory. */
    private TrieBuilder() {}

    /**
     * Sets the case-insensitive flag on the Trie configuration.
     *
     * @return this builder, for chaining
     */
    public TrieBuilder caseInsensitive() {
        this.trieConfig.setCaseInsensitive(true);
        return this;
    }

    /**
     * Disables overlapping matches in the Trie configuration.
     *
     * @return this builder, for chaining
     */
    public TrieBuilder removeOverlaps() {
        this.trieConfig.setAllowOverlaps(false);
        return this;
    }

    /**
     * Sets the whole-words-only flag on the Trie configuration.
     *
     * @return this builder, for chaining
     */
    public TrieBuilder onlyWholeWords() {
        this.trieConfig.setOnlyWholeWords(true);
        return this;
    }

    /**
     * Sets the stop-on-hit flag on the Trie configuration.
     *
     * <p>This option used to be reachable through the Trie itself; with
     * construction moved to the builder it has to be exposed here, otherwise
     * callers can no longer enable it.
     * NOTE(review): presumably makes parsing stop at the first match - confirm
     * against TrieConfig and the parsing code.
     *
     * @return this builder, for chaining
     */
    public TrieBuilder stopOnHit() {
        this.trieConfig.setStopOnHit(true);
        return this;
    }

    /**
     * Adds a keyword to the Trie under construction. A {@code null} or empty
     * keyword is silently ignored by the underlying Trie.
     *
     * @param keyword the keyword to register
     * @return this builder, for chaining
     */
    public TrieBuilder addKeyword(String keyword) {
        trie.addKeyword(keyword);
        return this;
    }

    /**
     * Constructs the failure states and hands out the Trie.
     *
     * @return the Trie owned by this builder (the same instance on every call)
     */
    public Trie build() {
        trie.constructFailureStates();
        return trie;
    }
}
}

View File

@ -14,8 +14,9 @@ public class TrieTest {
@Test
public void keywordAndTextAreTheSame() {
Trie trie = new Trie();
trie.addKeyword("abc");
Trie trie = Trie.builder()
.addKeyword("abc")
.build();
Collection<Emit> emits = trie.parseText("abc");
Iterator<Emit> iterator = emits.iterator();
checkEmit(iterator.next(), 0, 2, "abc");
@ -23,8 +24,9 @@ public class TrieTest {
@Test
public void textIsLongerThanKeyword() {
Trie trie = new Trie();
trie.addKeyword("abc");
Trie trie = Trie.builder()
.addKeyword("abc")
.build();
Collection<Emit> emits = trie.parseText(" abc");
Iterator<Emit> iterator = emits.iterator();
checkEmit(iterator.next(), 1, 3, "abc");
@ -32,10 +34,11 @@ public class TrieTest {
@Test
public void variousKeywordsOneMatch() {
Trie trie = new Trie();
trie.addKeyword("abc");
trie.addKeyword("bcd");
trie.addKeyword("cde");
Trie trie = Trie.builder()
.addKeyword("abc")
.addKeyword("bcd")
.addKeyword("cde")
.build();
Collection<Emit> emits = trie.parseText("bcd");
Iterator<Emit> iterator = emits.iterator();
checkEmit(iterator.next(), 0, 2, "bcd");
@ -58,11 +61,12 @@ public class TrieTest {
@Test
public void ushersTest() {
Trie trie = new Trie();
trie.addKeyword("hers");
trie.addKeyword("his");
trie.addKeyword("she");
trie.addKeyword("he");
Trie trie = Trie.builder()
.addKeyword("hers")
.addKeyword("his")
.addKeyword("she")
.addKeyword("he")
.build();
Collection<Emit> emits = trie.parseText("ushers");
assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5
Iterator<Emit> iterator = emits.iterator();
@ -97,8 +101,9 @@ public class TrieTest {
@Test
public void misleadingTest() {
Trie trie = new Trie();
trie.addKeyword("hers");
Trie trie = Trie.builder()
.addKeyword("hers")
.build();
Collection<Emit> emits = trie.parseText("h he her hers");
Iterator<Emit> iterator = emits.iterator();
checkEmit(iterator.next(), 9, 12, "hers");
@ -106,11 +111,12 @@ public class TrieTest {
@Test
public void recipes() {
Trie trie = new Trie();
trie.addKeyword("veal");
trie.addKeyword("cauliflower");
trie.addKeyword("broccoli");
trie.addKeyword("tomatoes");
Trie trie = Trie.builder()
.addKeyword("veal")
.addKeyword("cauliflower")
.addKeyword("broccoli")
.addKeyword("tomatoes")
.build();
Collection<Emit> emits = trie.parseText("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli");
Iterator<Emit> iterator = emits.iterator();
checkEmit(iterator.next(), 2, 12, "cauliflower");
@ -121,9 +127,10 @@ public class TrieTest {
@Test
public void longAndShortOverlappingMatch() {
Trie trie = new Trie();
trie.addKeyword("he");
trie.addKeyword("hehehehe");
Trie trie = Trie.builder()
.addKeyword("he")
.addKeyword("hehehehe")
.build();
Collection<Emit> emits = trie.parseText("hehehehehe");
Iterator<Emit> iterator = emits.iterator();
checkEmit(iterator.next(), 0, 1, "he");
@ -137,10 +144,11 @@ public class TrieTest {
@Test
public void nonOverlapping() {
Trie trie = new Trie().removeOverlaps();
trie.addKeyword("ab");
trie.addKeyword("cba");
trie.addKeyword("ababc");
Trie trie = Trie.builder().removeOverlaps()
.addKeyword("ab")
.addKeyword("cba")
.addKeyword("ababc")
.build();
Collection<Emit> emits = trie.parseText("ababcbab");
assertEquals(2, emits.size());
Iterator<Emit> iterator = emits.iterator();
@ -151,25 +159,27 @@ public class TrieTest {
@Test
public void startOfChurchillSpeech() {
Trie trie = new Trie().removeOverlaps();
trie.addKeyword("T");
trie.addKeyword("u");
trie.addKeyword("ur");
trie.addKeyword("r");
trie.addKeyword("urn");
trie.addKeyword("ni");
trie.addKeyword("i");
trie.addKeyword("in");
trie.addKeyword("n");
trie.addKeyword("urning");
Trie trie = Trie.builder().removeOverlaps()
.addKeyword("T")
.addKeyword("u")
.addKeyword("ur")
.addKeyword("r")
.addKeyword("urn")
.addKeyword("ni")
.addKeyword("i")
.addKeyword("in")
.addKeyword("n")
.addKeyword("urning")
.build();
Collection<Emit> emits = trie.parseText("Turning");
assertEquals(2, emits.size());
}
@Test
public void partialMatch() {
Trie trie = new Trie().onlyWholeWords();
trie.addKeyword("sugar");
Trie trie = Trie.builder().onlyWholeWords()
.addKeyword("sugar")
.build();
Collection<Emit> emits = trie.parseText("sugarcane sugarcane sugar canesugar"); // left, middle, right test
assertEquals(1, emits.size()); // Match must not be made
checkEmit(emits.iterator().next(), 20, 24, "sugar");
@ -177,10 +187,11 @@ public class TrieTest {
@Test
public void tokenizeFullSentence() {
Trie trie = new Trie();
trie.addKeyword("Alpha");
trie.addKeyword("Beta");
trie.addKeyword("Gamma");
Trie trie = Trie.builder()
.addKeyword("Alpha")
.addKeyword("Beta")
.addKeyword("Gamma")
.build();
Collection<Token> tokens = trie.tokenize("Hear: Alpha team first, Beta from the rear, Gamma in reserve");
assertEquals(7, tokens.size());
Iterator<Token> tokensIt = tokens.iterator();
@ -195,11 +206,12 @@ public class TrieTest {
@Test
public void bug5InGithubReportedByXCurry() {
Trie trie = new Trie().caseInsensitive().onlyWholeWords();
trie.addKeyword("turning");
trie.addKeyword("once");
trie.addKeyword("again");
trie.addKeyword("börkü");
Trie trie = Trie.builder().caseInsensitive().onlyWholeWords()
.addKeyword("turning")
.addKeyword("once")
.addKeyword("again")
.addKeyword("börkü")
.build();
Collection<Emit> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ");
assertEquals(4, emits.size()); // Match must not be made
Iterator<Emit> it = emits.iterator();
@ -211,11 +223,12 @@ public class TrieTest {
@Test
public void caseInsensitive() {
Trie trie = new Trie().caseInsensitive();
trie.addKeyword("turning");
trie.addKeyword("once");
trie.addKeyword("again");
trie.addKeyword("börkü");
Trie trie = Trie.builder().caseInsensitive()
.addKeyword("turning")
.addKeyword("once")
.addKeyword("again")
.addKeyword("börkü")
.build();
Collection<Emit> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ");
assertEquals(4, emits.size()); // Match must not be made
Iterator<Emit> it = emits.iterator();
@ -227,10 +240,11 @@ public class TrieTest {
@Test
public void tokenizeTokensInSequence() {
Trie trie = new Trie();
trie.addKeyword("Alpha");
trie.addKeyword("Beta");
trie.addKeyword("Gamma");
Trie trie = Trie.builder()
.addKeyword("Alpha")
.addKeyword("Beta")
.addKeyword("Gamma")
.build();
Collection<Token> tokens = trie.tokenize("Alpha Beta Gamma");
assertEquals(5, tokens.size());
}
@ -238,8 +252,9 @@ public class TrieTest {
// Test offered by XCurry, https://github.com/robert-bor/aho-corasick/issues/7
@Test
public void zeroLengthTestBug7InGithubReportedByXCurry() {
Trie trie = new Trie().removeOverlaps().onlyWholeWords().caseInsensitive();
trie.addKeyword("");
Trie trie = Trie.builder().removeOverlaps().onlyWholeWords().caseInsensitive()
.addKeyword("")
.build();
trie.tokenize("Try a natural lip and subtle bronzer to keep all the focus on those big bright eyes with NARS Eyeshadow Duo in Rated R And the winner is... Boots No7 Advanced Renewal Anti-ageing Glycolic Peel Kit ($25 amazon.com) won most-appealing peel.");
}
@ -247,9 +262,10 @@ public class TrieTest {
@Test
public void unicodeIssueBug8ReportedByDwyerk() {
String target = "LİKE THIS"; // The second character ('İ') is Unicode, which was read by AC as a 2-byte char
Trie trie = new Trie().caseInsensitive().onlyWholeWords();
assertEquals("THIS", target.substring(5,9)); // Java does it the right way
trie.addKeyword("this");
Trie trie = Trie.builder().caseInsensitive().onlyWholeWords()
.addKeyword("this")
.build();
Collection<Emit> emits = trie.parseText(target);
assertEquals(1, emits.size());
Iterator<Emit> it = emits.iterator();