# Reviewer summary of this patch (free text placed ahead of the first "diff --git" header).
# - FragmentToken.java: adds a private boolean field `whiteSpace`. The constructor starts it at true and
#   sets it to false when any character of the fragment fails Character.isWhitespace(); the new
#   isWhiteSpace() override returns that field.
# - Token.java: adds a non-abstract isWhiteSpace() that returns false, so subclasses that do not
#   override it report false.
# - TrieTest.java: adds sameKeywordTwice (keyword "abc" added twice, one emit expected at 0..2) and
#   whiteSpaceTokens, and gives the assertToken helper a fifth `whiteSpace` argument that is compared
#   against token.isWhiteSpace(); existing assertToken calls are updated to pass it.
# NOTE(review): the scan loop in the FragmentToken constructor has no `break`, so it keeps iterating
#   after the first non-whitespace character has already been found.
# NOTE(review): for an empty fragment the loop body never runs, so isWhiteSpace() returns true;
#   confirm that treating "" as whitespace is intended.
diff --git a/src/main/java/org/ahocorasick/trie/FragmentToken.java b/src/main/java/org/ahocorasick/trie/FragmentToken.java index 85c498a..f0c899f 100644 --- a/src/main/java/org/ahocorasick/trie/FragmentToken.java +++ b/src/main/java/org/ahocorasick/trie/FragmentToken.java @@ -2,8 +2,16 @@ package org.ahocorasick.trie; public class FragmentToken extends Token { + private boolean whiteSpace; + public FragmentToken(String fragment) { super(fragment); + this.whiteSpace = true; + for (int position = 0; position < fragment.length(); position++) { + if (!Character.isWhitespace(fragment.charAt(position))) { + whiteSpace = false; + } + } } @Override @@ -15,4 +23,10 @@ public class FragmentToken extends Token { public Emit getEmit() { return null; } + + @Override + public boolean isWhiteSpace() { + return whiteSpace; + } + } diff --git a/src/main/java/org/ahocorasick/trie/Token.java b/src/main/java/org/ahocorasick/trie/Token.java index 36d4a3c..7ec280f 100644 --- a/src/main/java/org/ahocorasick/trie/Token.java +++ b/src/main/java/org/ahocorasick/trie/Token.java @@ -18,6 +18,10 @@ public abstract class Token { return false; } + public boolean isWhiteSpace() { + return false; + } + public abstract Emit getEmit(); } diff --git a/src/test/java/org/ahocorasick/trie/TrieTest.java b/src/test/java/org/ahocorasick/trie/TrieTest.java index 37b7d8e..03f7924 100644 --- a/src/test/java/org/ahocorasick/trie/TrieTest.java +++ b/src/test/java/org/ahocorasick/trie/TrieTest.java @@ -53,6 +53,18 @@ public class TrieTest { checkEmit(firstMatch, 1, 3, "abc"); } + @Test + public void sameKeywordTwice() { + Trie trie = Trie.builder() + .addKeyword("abc") + .addKeyword("abc") + .build(); + Collection emits = trie.parseText("abc"); + assertEquals(1, emits.size()); + Iterator iterator = emits.iterator(); + checkEmit(iterator.next(), 0, 2, "abc"); + } + @Test public void variousKeywordsOneMatch() { Trie trie = Trie.builder() @@ -341,12 +353,27 @@ public class TrieTest { Collection tokens = 
trie.tokenize("Alpha AlphaAlpha Alpha"); assertEquals(6, tokens.size()); Iterator tokensIt = tokens.iterator(); - assertToken(tokensIt.next(), "Alpha", true, true); - assertToken(tokensIt.next(), " ", false, false); - assertToken(tokensIt.next(), "Alpha", true, false); - assertToken(tokensIt.next(), "Alpha", true, false); - assertToken(tokensIt.next(), " ", false, false); - assertToken(tokensIt.next(), "Alpha", true, true); + assertToken(tokensIt.next(), "Alpha", true, true, false); + assertToken(tokensIt.next(), " ", false, false, true); + assertToken(tokensIt.next(), "Alpha", true, false, false); + assertToken(tokensIt.next(), "Alpha", true, false, false); + assertToken(tokensIt.next(), " ", false, false, true); + assertToken(tokensIt.next(), "Alpha", true, true, false); + } + + @Test + public void whiteSpaceTokens() { + Trie trie = Trie.builder() + .addKeyword("Alpha") + .build(); + Collection tokens = trie.tokenize("Alpha \tthe\t Alpha\n Alpha"); + assertEquals(5, tokens.size()); + Iterator tokensIt = tokens.iterator(); + assertToken(tokensIt.next(), "Alpha", true, true, false); + assertToken(tokensIt.next(), " \tthe\t ", false, false, false); + assertToken(tokensIt.next(), "Alpha", true, true, false); + assertToken(tokensIt.next(), "\n ", false, false, true); + assertToken(tokensIt.next(), "Alpha", true, true, false); } @Test @@ -359,13 +386,13 @@ public class TrieTest { Collection tokens = trie.tokenize("Hear: Alpha team first, Beta from the rear, Gamma in reserve"); assertEquals(7, tokens.size()); Iterator tokensIt = tokens.iterator(); - assertToken(tokensIt.next(), "Hear: ", false, false); - assertToken(tokensIt.next(), "Alpha", true, true); - assertToken(tokensIt.next(), " team first, ", false, false); - assertToken(tokensIt.next(), "Beta", true, true); - assertToken(tokensIt.next(), " from the rear, ", false, false); - assertToken(tokensIt.next(), "Gamma", true, true); - assertToken(tokensIt.next(), " in reserve", false, false); + 
assertToken(tokensIt.next(), "Hear: ", false, false, false); + assertToken(tokensIt.next(), "Alpha", true, true, false); + assertToken(tokensIt.next(), " team first, ", false, false, false); + assertToken(tokensIt.next(), "Beta", true, true, false); + assertToken(tokensIt.next(), " from the rear, ", false, false, false); + assertToken(tokensIt.next(), "Gamma", true, true, false); + assertToken(tokensIt.next(), " in reserve", false, false, false); } @Test @@ -473,10 +500,11 @@ public class TrieTest { checkEmit(emits.iterator().next(), 0, 9, "#sugar-123"); } - private void assertToken(Token token, String fragment, boolean match, boolean wholeWord) { + private void assertToken(Token token, String fragment, boolean match, boolean wholeWord, boolean whiteSpace) { assertEquals(fragment, token.getFragment()); assertEquals(match, token.isMatch()); assertEquals(wholeWord, token.isWholeWord()); + assertEquals(whiteSpace, token.isWhiteSpace()); } private void checkEmit(Emit next, int expectedStart, int expectedEnd, String expectedKeyword) {