Merge branch 'rripken-master' into feature/footprint-reduction

Conflicts: src/main/java/org/ahocorasick/trie/Trie.java src/test/java/org/ahocorasick/trie/TrieTest.java
2015-09-22 20:38:29 +02:00 · 2015-09-22 20:38:29 +02:00 · 4633b1ba2a
commit 4633b1ba2a
parent 30f003c5ae d1478c7480
3 changed files with 165 additions and 9 deletions
--- a/src/main/java/org/ahocorasick/trie/State.java
+++ b/src/main/java/org/ahocorasick/trie/State.java
@ -35,7 +35,7 @@ public class State {
     * referred to in the white paper as the 'goto' structure. From a state it is possible to go
     * to other states, depending on the character passed.
     */
-    private Map<Character,State> success = new TreeMap<Character, State>();
+    private Map<Character,State> success = new HashMap<Character, State>();

    /** if no matching states are found, the failure state will be returned */
    private State failure = null;
--- a/src/main/java/org/ahocorasick/trie/Trie.java
+++ b/src/main/java/org/ahocorasick/trie/Trie.java
@ -85,6 +85,11 @@ public class Trie {
        return collectedEmits;
    }

+	public boolean matches(String text) // true when firstMatch(text) yields an emit, false when it yields null
+	{
+		Emit firstMatch = firstMatch(text);
+		return firstMatch != null;
+	}
    public void parseText(CharSequence text, EmitHandler emitHandler) {
        State currentState = this.rootState;
        for (int position = 0; position < text.length(); position++) {
@ -113,10 +118,63 @@ public class Trie {
            removeEmits.add(emit);
        }

-        for (Emit removeEmit : removeEmits) {
-            collectedEmits.remove(removeEmit);
-        }
-    }
+	public Emit firstMatch(String text) // returns a single Emit for text, or null when neither branch below finds one
+	{
+		if (!trieConfig.isAllowOverlaps()) {
+			// Slow path. Needs to find all the matches to detect overlaps.
+			Collection<Emit> parseText = parseText(text);
+			if (parseText != null && !parseText.isEmpty()) {
+				return parseText.iterator().next(); // first element in the iteration order of parseText's result
+			}
+		} else {
+			// Fast path. Returns the first match found.
+			checkForConstructedFailureStates();
+			int position = 0; // index in text of the character currently being consumed
+			State currentState = this.rootState;
+			for (Character character : text.toCharArray()) {
+				if (trieConfig.isCaseInsensitive()) {
+					character = Character.toLowerCase(character);
+				}
+				currentState = getState(currentState, character);
+				Collection<String> emitStrs = currentState.emit();
+				if (emitStrs != null && !emitStrs.isEmpty()) {
+					for (String emitStr : emitStrs) {
+						final Emit emit = new Emit(position - emitStr.length() + 1, position, emitStr); // keyword ends at position; start and end are both inclusive
+						if (trieConfig.isOnlyWholeWords()) {
+							if (!isPartialMatch(text, emit)) { // skip emits that touch an alphabetic character on either side
+								return emit;
+							}
+						} else {
+							return emit;
+						}
+					}
+				}
+				position++;
+			}
+		}
+		return null; // no match was returned by either path
+	}
+
+	private boolean isPartialMatch(String searchText, Emit emit) // true when the emit is directly adjacent to an alphabetic character in searchText
+	{
+		return (emit.getStart() != 0 && // left side: only checked when the emit does not start at index 0
+			Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) ||
+			(emit.getEnd() + 1 != searchText.length() && // right side: only checked when the emit does not end on the last character
+			Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1)));
+	}
+
+	private void removePartialMatches(String searchText, List<Emit> collectedEmits) // removes from collectedEmits, in place, every emit for which isPartialMatch is true
+	{
+		List<Emit> removeEmits = new ArrayList<Emit>(); // removals are gathered during the scan and applied afterwards
+		for (Emit emit : collectedEmits) {
+			if (isPartialMatch(searchText, emit)) {
+				removeEmits.add(emit);
+			}
+		}
+		for (Emit removeEmit : removeEmits) {
+			collectedEmits.remove(removeEmit);
+		}
+	}

    private State getState(State currentState, Character character) {
        State newCurrentState = currentState.nextState(character);
--- a/src/test/java/org/ahocorasick/trie/TrieTest.java
+++ b/src/test/java/org/ahocorasick/trie/TrieTest.java
@ -22,6 +22,14 @@ public class TrieTest {
        checkEmit(iterator.next(), 0, 2, "abc");
    }

+	 @Test
+    public void keywordAndTextAreTheSameFirstMatch() { // text equal to the only keyword: expects "abc" at 0..2
+        Trie trie = new Trie();
+        trie.addKeyword("abc");
+		Emit firstMatch = trie.firstMatch("abc");
+        checkEmit(firstMatch, 0, 2, "abc");
+    }
+
    @Test
    public void textIsLongerThanKeyword() {
        Trie trie = Trie.builder()
@ -32,6 +40,14 @@ public class TrieTest {
        checkEmit(iterator.next(), 1, 3, "abc");
    }

+	@Test
+    public void textIsLongerThanKeywordFirstMatch() { // keyword preceded by one space in the text: expects "abc" at 1..3
+        Trie trie = new Trie();
+        trie.addKeyword("abc");
+		Emit firstMatch = trie.firstMatch(" abc");
+        checkEmit(firstMatch, 1, 3, "abc");
+    }
+
    @Test
    public void variousKeywordsOneMatch() {
        Trie trie = Trie.builder()
@ -44,6 +60,16 @@ public class TrieTest {
        checkEmit(iterator.next(), 0, 2, "bcd");
    }

+	@Test
+    public void variousKeywordsFirstMatch() { // three keywords, text equal to one of them: expects "bcd" at 0..2
+        Trie trie = new Trie();
+        trie.addKeyword("abc");
+        trie.addKeyword("bcd");
+        trie.addKeyword("cde");
+		Emit firstMatch = trie.firstMatch("bcd");
+        checkEmit(firstMatch, 0, 2, "bcd");
+    }
+
    @Test
    public void ushersTestAndStopOnHit() {
        Trie trie = Trie.builder()
@ -76,6 +102,17 @@ public class TrieTest {
        checkEmit(iterator.next(), 2, 5, "hers");
    }

+	 @Test
+    public void ushersTestFirstMatch() { // keywords hers/his/she/he against "ushers": expects "he" at 2..3
+        Trie trie = new Trie();
+        trie.addKeyword("hers");
+        trie.addKeyword("his");
+        trie.addKeyword("she");
+        trie.addKeyword("he");
+		Emit firstMatch = trie.firstMatch("ushers");
+		checkEmit(firstMatch, 2, 3, "he");
+	}
+
    @Test
    public void ushersTestByCallback() {
        Trie trie = Trie.builder()
@ -111,6 +148,14 @@ public class TrieTest {
        checkEmit(iterator.next(), 9, 12, "hers");
    }

+	 @Test
+    public void misleadingTestFirstMatch() { // the prefixes "h", "he", "her" in the text are not matches: expects "hers" at 9..12
+        Trie trie = new Trie();
+        trie.addKeyword("hers");
+		Emit firstMatch = trie.firstMatch("h he her hers");
+        checkEmit(firstMatch, 9, 12, "hers");
+    }
+
    @Test
    public void recipes() {
        Trie trie = Trie.builder()
@ -127,12 +172,23 @@ public class TrieTest {
        checkEmit(iterator.next(), 51, 58, "broccoli");
    }

+	 @Test
+    public void recipesFirstMatch() { // four keywords against a recipe sentence: expects "cauliflower" at 2..12
+        Trie trie = new Trie();
+        trie.addKeyword("veal");
+        trie.addKeyword("cauliflower");
+        trie.addKeyword("broccoli");
+        trie.addKeyword("tomatoes");
+		Emit firstMatch = trie.firstMatch("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli");
+
+        checkEmit(firstMatch, 2, 12, "cauliflower");
+    }
+
    @Test
    public void longAndShortOverlappingMatch() {
-        Trie trie = Trie.builder()
-            .addKeyword("he")
-            .addKeyword("hehehehe")
-            .build();
+        Trie trie = new Trie();
+        trie.addKeyword("he");
+        trie.addKeyword("hehehehe");
        Collection<Emit> emits = trie.parseText("hehehehehe");
        Iterator<Emit> iterator = emits.iterator();
        checkEmit(iterator.next(), 0, 1, "he");
@ -159,6 +215,17 @@ public class TrieTest {
        checkEmit(iterator.next(), 6, 7, "ab");
    }

+	 @Test
+    public void nonOverlappingFirstMatch() { // removeOverlaps() configuration: expects "ababc" at 0..4
+        Trie trie = new Trie().removeOverlaps();
+        trie.addKeyword("ab");
+        trie.addKeyword("cba");
+        trie.addKeyword("ababc");
+		Emit firstMatch = trie.firstMatch("ababcbab");
+
+        checkEmit(firstMatch, 0, 4, "ababc");
+    }
+
    @Test
    public void startOfChurchillSpeech() {
        Trie trie = Trie.builder().removeOverlaps()
@ -187,6 +254,15 @@ public class TrieTest {
        checkEmit(emits.iterator().next(), 20, 24, "sugar");
    }

+	   @Test
+    public void partialMatchFirstMatch() { // onlyWholeWords() configuration: expects the standalone "sugar" at 20..24
+        Trie trie = new Trie().onlyWholeWords();
+        trie.addKeyword("sugar");
+		Emit firstMatch = trie.firstMatch("sugarcane sugarcane sugar canesugar"); // left, middle, right test
+        
+        checkEmit(firstMatch, 20, 24, "sugar");
+    }
+
    @Test
    public void tokenizeFullSentence() {
        Trie trie = Trie.builder()
@ -240,6 +316,18 @@ public class TrieTest {
        checkEmit(it.next(), 19, 23, "börkü");
    }

+	@Test
+    public void caseInsensitiveFirstMatch() { // caseInsensitive() configuration with mixed-case text: expects "turning" at 0..6
+        Trie trie = new Trie().caseInsensitive();
+        trie.addKeyword("turning");
+        trie.addKeyword("once");
+        trie.addKeyword("again");
+        trie.addKeyword("börkü");
+		Emit firstMatch = trie.firstMatch("TurninG OnCe AgAiN BÖRKÜ");
+
+        checkEmit(firstMatch, 0, 6, "turning");
+    }
+
    @Test
    public void tokenizeTokensInSequence() {
        Trie trie = Trie.builder()
@ -274,6 +362,16 @@ public class TrieTest {
        checkEmit(it.next(), 5, 8, "this");
    }

+	 @Test
+    public void unicodeIssueBug8ReportedByDwyerkFirstMatch() { // caseInsensitive() + onlyWholeWords() on text containing 'İ': expects "this" at 5..8
+        String target = "LİKE THIS"; // The second character ('İ') is Unicode, which was read by AC as a 2-byte char
+        Trie trie = new Trie().caseInsensitive().onlyWholeWords();
+        assertEquals("THIS", target.substring(5,9)); // Java does it the right way
+        trie.addKeyword("this");
+		Emit firstMatch = trie.firstMatch(target);
+        checkEmit(firstMatch, 5, 8, "this");
+    }
+
    private void checkEmit(Emit next, int expectedStart, int expectedEnd, String expectedKeyword) {
        assertEquals("Start of emit should have been "+expectedStart, expectedStart, next.getStart());
        assertEquals("End of emit should have been "+expectedEnd, expectedEnd, next.getEnd());