Merge pull request #48 from robert-bor/bugfixes

Added missing override annotations. Added final modifier to Interval …
This commit is contained in:
Robert Bor 2016-11-30 12:14:21 +01:00 committed by GitHub
commit 3114fabffd
7 changed files with 157 additions and 148 deletions

View File

@ -2,8 +2,8 @@ package org.ahocorasick.interval;
public class Interval implements Intervalable {
private int start;
private int end;
private final int start;
private final int end;
/**
* Constructs an interval with a start and end position.
@ -21,6 +21,7 @@ public class Interval implements Intervalable {
*
* @return A number between 0 (start of text) and the text length.
*/
@Override
public int getStart() {
return this.start;
}
@ -30,6 +31,7 @@ public class Interval implements Intervalable {
*
* @return A number between getStart() + 1 and the text length.
*/
@Override
public int getEnd() {
return this.end;
}
@ -39,6 +41,7 @@ public class Interval implements Intervalable {
*
* @return The end position less the start position, plus one.
*/
@Override
public int size() {
return end - start + 1;
}
@ -47,6 +50,7 @@ public class Interval implements Intervalable {
* Answers whether the given interval overlaps this interval
* instance.
*
* @param other
* @return true The intervals overlap.
*/
public boolean overlapsWith(final Interval other) {

View File

@ -7,4 +7,5 @@ public interface Intervalable extends Comparable {
public int getEnd();
public int size();
}

View File

@ -74,11 +74,11 @@ public class State {
return nextState(character, false);
}
public State nextStateIgnoreRootState(final Character character) {
public State nextStateIgnoreRootState(Character character) {
return nextState(character, true);
}
public State addState(final String keyword) {
public State addState(String keyword) {
State state = this;
for (final Character character : keyword.toCharArray()) {
@ -88,7 +88,7 @@ public class State {
return state;
}
public State addState(final Character character) {
public State addState(Character character) {
State nextState = nextStateIgnoreRootState(character);
if (nextState == null) {
nextState = new State(this.depth + 1);
@ -101,14 +101,14 @@ public class State {
return this.depth;
}
public void addEmit(final String keyword) {
public void addEmit(String keyword) {
if (this.emits == null) {
this.emits = new TreeSet<>();
}
this.emits.add(keyword);
}
public void addEmit(final Collection<String> emits) {
public void addEmit(Collection<String> emits) {
for (String emit : emits) {
addEmit(emit);
}

View File

@ -11,9 +11,7 @@ import java.util.List;
import java.util.Queue;
import java.util.concurrent.LinkedBlockingDeque;
import static java.lang.Character.*;
import java.lang.Character;
import static java.lang.Character.isWhitespace;
/**
* Based on the Aho-Corasick white paper, Bell technologies:
@ -39,12 +37,36 @@ public class Trie {
* @throws NullPointerException if the keyword is null.
*/
private void addKeyword(String keyword) {
if (keyword.length() > 0) {
if (isCaseInsensitive()) {
keyword = keyword.toLowerCase();
}
if (keyword.isEmpty()) {
return;
}
addState(keyword).addEmit(keyword);
if (isCaseInsensitive()) {
keyword = keyword.toLowerCase();
}
addState(keyword).addEmit(keyword);
}
/**
* Adds every keyword in the given array by delegating to
* {@code addKeyword(String)}, one call per element, in array order.
*
* @param keywords The search terms to add to the list of search terms.
* @throws NullPointerException if the array, or any keyword in it, is null.
*/
private void addKeywords(final String[] keywords) {
for (final String keyword : keywords) {
addKeyword(keyword);
}
}
/**
* Adds every keyword in the given collection by delegating to
* {@code addKeyword(String)}, one call per element, in the collection's
* iteration order.
*
* @param keywords The search terms to add to the list of search terms.
* @throws NullPointerException if the collection, or any keyword in it, is null.
*/
private void addKeywords(final Collection<String> keywords) {
for (final String keyword : keywords) {
addKeyword(keyword);
}
}
@ -73,14 +95,11 @@ public class Trie {
return tokens;
}
private Token createFragment(
final Emit emit,
final String text,
final int lastCollectedPosition) {
private Token createFragment(final Emit emit, final String text, final int lastCollectedPosition) {
return new FragmentToken(text.substring(lastCollectedPosition + 1, emit == null ? text.length() : emit.getStart()));
}
private Token createMatch(final Emit emit, final String text) {
private Token createMatch(Emit emit, String text) {
return new MatchToken(text.substring(emit.getStart(), emit.getEnd() + 1), emit);
}
@ -119,7 +138,7 @@ public class Trie {
// TODO: Maybe lowercase the entire string at once?
if (trieConfig.isCaseInsensitive()) {
character = toLowerCase(character);
character = Character.toLowerCase(character);
}
currentState = getState(currentState, character);
@ -129,10 +148,17 @@ public class Trie {
}
}
/**
* The first matching text sequence.
*
* @param text The text to search for keywords.
* @return null if no matches found.
*/
public Emit firstMatch(final CharSequence text) {
if (!trieConfig.isAllowOverlaps()) {
// Slow path. Needs to find all the matches to detect overlaps.
Collection<Emit> parseText = parseText(text);
final Collection<Emit> parseText = parseText(text);
if (parseText != null && !parseText.isEmpty()) {
return parseText.iterator().next();
}
@ -145,7 +171,7 @@ public class Trie {
// TODO: Lowercase the entire string at once?
if (trieConfig.isCaseInsensitive()) {
character = toLowerCase(character);
character = Character.toLowerCase(character);
}
currentState = getState(currentState, character);
@ -171,9 +197,9 @@ public class Trie {
private boolean isPartialMatch(final CharSequence searchText, final Emit emit) {
return (emit.getStart() != 0 &&
isAlphabetic(searchText.charAt(emit.getStart() - 1))) ||
Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) ||
(emit.getEnd() + 1 != searchText.length() &&
isAlphabetic(searchText.charAt(emit.getEnd() + 1)));
Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1)));
}
private void removePartialMatches(final CharSequence searchText, final List<Emit> collectedEmits) {
@ -207,16 +233,15 @@ public class Trie {
}
}
private State getState(final State initialState, final Character character) {
State currentState = initialState;
State updatedState = currentState.nextState(character);
private State getState(State currentState, final Character character) {
State newCurrentState = currentState.nextState(character);
while (updatedState == null) {
while (newCurrentState == null) {
currentState = currentState.failure();
updatedState = currentState.nextState(character);
newCurrentState = currentState.nextState(character);
}
return updatedState;
return newCurrentState;
}
private void constructFailureStates() {
@ -249,10 +274,7 @@ public class Trie {
}
}
private boolean storeEmits(
final int position,
final State currentState,
final EmitHandler emitHandler) {
private boolean storeEmits(final int position, final State currentState, final EmitHandler emitHandler) {
boolean emitted = false;
final Collection<String> emits = currentState.emit();
@ -276,8 +298,7 @@ public class Trie {
}
/**
* Constructs a TrieBuilder instance for configuring the Trie using a fluent
* interface.
* Provides a fluent interface for constructing Trie instances.
*
* @return The builder used to configure its Trie.
*/
@ -285,9 +306,6 @@ public class Trie {
return new TrieBuilder();
}
/**
* Provides a fluent interface for constructing Trie instances.
*/
public static class TrieBuilder {
private final TrieConfig trieConfig = new TrieConfig();
@ -301,49 +319,15 @@ public class Trie {
}
/**
* Adds a keyword to the Trie's list of text search keywords.
*
* @param keyword The keyword to add to the list.
* @return This builder.
* @throws NullPointerException if the keyword is null.
*/
public TrieBuilder addKeyword(final CharSequence keyword) {
getTrie().addKeyword(keyword.toString());
return this;
}
/**
* Adds a list of keywords to the Trie's list of text search keywords.
*
* @param keywords The keywords to add to the list.
* @return This builder.
*/
public TrieBuilder addKeywords(final CharSequence... keywords) {
for (final CharSequence keyword : keywords) {
addKeyword(keyword);
}
return this;
}
/**
* Adds a list of keywords to the Trie's list of text search keywords.
*
* @param keywords The keywords to add to the list.
* @return This builder.
*/
public TrieBuilder addKeywords(final Collection<CharSequence> keywords) {
return addKeywords(keywords.toArray(new CharSequence[keywords.size()]));
}
/**
* Configure the Trie to ignore case when searching for keywords in the
* text.
* Configure the Trie to ignore case when searching for keywords in
* the text. This must be called before calling addKeyword because
* the algorithm converts keywords to lowercase as they are added,
* depending on this case sensitivity setting.
*
* @return This builder.
*/
public TrieBuilder ignoreCase() {
getTrieConfig().setCaseInsensitive(true);
this.trieConfig.setCaseInsensitive(true);
return this;
}
@ -353,7 +337,41 @@ public class Trie {
* @return This builder.
*/
public TrieBuilder ignoreOverlaps() {
getTrieConfig().setAllowOverlaps(false);
this.trieConfig.setAllowOverlaps(false);
return this;
}
/**
* Adds a keyword to the Trie's list of text search keywords. An empty
* keyword is ignored. If the Trie is configured to ignore case, the
* keyword is converted to lowercase as it is added, so ignoreCase()
* must be called on this builder before any keyword is added.
*
* @param keyword The keyword to add to the list.
* @return This builder.
* @throws NullPointerException if the keyword is null.
*/
public TrieBuilder addKeyword(final String keyword) {
this.trie.addKeyword(keyword);
return this;
}
/**
* Adds each of the given keywords to the Trie's list of text search
* keywords, in the order supplied.
*
* @param keywords The keywords to add to the list.
* @return This builder.
* @throws NullPointerException if the array, or any keyword in it, is null.
*/
public TrieBuilder addKeywords(final String... keywords) {
this.trie.addKeywords(keywords);
return this;
}
/**
* Adds each keyword in the given collection to the Trie's list of text
* search keywords, in the collection's iteration order.
*
* @param keywords The keywords to add to the list.
* @return This builder.
* @throws NullPointerException if the collection, or any keyword in it, is null.
*/
public TrieBuilder addKeywords(final Collection<String> keywords) {
this.trie.addKeywords(keywords);
return this;
}
@ -363,7 +381,7 @@ public class Trie {
* @return This builder.
*/
public TrieBuilder onlyWholeWords() {
getTrieConfig().setOnlyWholeWords(true);
this.trieConfig.setOnlyWholeWords(true);
return this;
}
@ -375,47 +393,33 @@ public class Trie {
* @return This builder.
*/
public TrieBuilder onlyWholeWordsWhiteSpaceSeparated() {
getTrieConfig().setOnlyWholeWordsWhiteSpaceSeparated(true);
this.trieConfig.setOnlyWholeWordsWhiteSpaceSeparated(true);
return this;
}
/**
* Configure the Trie to stop searching for matches after the first
* keyword is found in the text.
* Configure the Trie to stop after the first keyword is found in the
* text.
*
* @return This builder.
*/
public TrieBuilder onlyFirstMatch() {
getTrieConfig().setStopOnHit(true);
public TrieBuilder stopOnHit() {
trie.trieConfig.setStopOnHit(true);
return this;
}
/**
* Construct the Trie using the builder settings.
* Configure the Trie based on the builder settings.
*
* @return The configured Trie.
*/
public Trie build() {
getTrie().constructFailureStates();
return getTrie();
}
private Trie getTrie() {
this.trie.constructFailureStates();
return this.trie;
}
private TrieConfig getTrieConfig() {
return this.trieConfig;
}
/**
* @deprecated Use onlyFirstMatch()
*/
public TrieBuilder stopOnHit() {
return onlyFirstMatch();
}
/**
* @return This builder.
* @deprecated Use ignoreCase()
*/
public TrieBuilder caseInsensitive() {
@ -423,6 +427,7 @@ public class Trie {
}
/**
* @return This builder.
* @deprecated Use ignoreOverlaps()
*/
public TrieBuilder removeOverlaps() {

View File

@ -3,20 +3,20 @@ package org.ahocorasick.interval;
import org.junit.Test;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import static java.util.Collections.sort;
import static junit.framework.Assert.assertEquals;
public class IntervalableComparatorByPositionTest {
@Test
public void sortOnPosition() {
List<Intervalable> intervals = new ArrayList<>();
List<Intervalable> intervals = new ArrayList<Intervalable>();
intervals.add(new Interval(4, 5));
intervals.add(new Interval(1, 4));
intervals.add(new Interval(3, 8));
sort(intervals, new IntervalableComparatorByPosition());
Collections.sort(intervals, new IntervalableComparatorByPosition());
assertEquals(4, intervals.get(0).size());
assertEquals(6, intervals.get(1).size());
assertEquals(2, intervals.get(2).size());

View File

@ -3,20 +3,20 @@ package org.ahocorasick.interval;
import org.junit.Test;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import static java.util.Collections.sort;
import static junit.framework.Assert.assertEquals;
public class IntervalableComparatorBySizeTest {
@Test
public void sortOnSize() {
List<Intervalable> intervals = new ArrayList<>();
List<Intervalable> intervals = new ArrayList<Intervalable>();
intervals.add(new Interval(4, 5));
intervals.add(new Interval(1, 4));
intervals.add(new Interval(3, 8));
sort(intervals, new IntervalableComparatorBySize());
Collections.sort(intervals, new IntervalableComparatorBySize());
assertEquals(6, intervals.get(0).size());
assertEquals(4, intervals.get(1).size());
assertEquals(2, intervals.get(2).size());
@ -24,10 +24,10 @@ public class IntervalableComparatorBySizeTest {
@Test
public void sortOnSizeThenPosition() {
List<Intervalable> intervals = new ArrayList<>();
List<Intervalable> intervals = new ArrayList<Intervalable>();
intervals.add(new Interval(4, 7));
intervals.add(new Interval(2, 5));
sort(intervals, new IntervalableComparatorBySize());
Collections.sort(intervals, new IntervalableComparatorBySize());
assertEquals(2, intervals.get(0).getStart());
assertEquals(4, intervals.get(1).getStart());
}

View File

@ -7,10 +7,9 @@ import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.ThreadLocalRandom;
import static java.util.concurrent.ThreadLocalRandom.current;
import static junit.framework.Assert.assertEquals;
import static org.ahocorasick.trie.Trie.builder;
import static org.junit.Assert.assertTrue;
public class TrieTest {
@ -36,7 +35,7 @@ public class TrieTest {
@Test
public void keywordAndTextAreTheSame() {
Trie trie = builder()
Trie trie = Trie.builder()
.addKeyword(ALPHABET[0])
.build();
Collection<Emit> emits = trie.parseText(ALPHABET[0]);
@ -46,7 +45,7 @@ public class TrieTest {
@Test
public void keywordAndTextAreTheSameFirstMatch() {
Trie trie = builder()
Trie trie = Trie.builder()
.addKeyword(ALPHABET[0])
.build();
Emit firstMatch = trie.firstMatch(ALPHABET[0]);
@ -55,7 +54,7 @@ public class TrieTest {
@Test
public void textIsLongerThanKeyword() {
Trie trie = builder()
Trie trie = Trie.builder()
.addKeyword(ALPHABET[0])
.build();
Collection<Emit> emits = trie.parseText(" " + ALPHABET[0]);
@ -65,7 +64,7 @@ public class TrieTest {
@Test
public void textIsLongerThanKeywordFirstMatch() {
Trie trie = builder()
Trie trie = Trie.builder()
.addKeyword(ALPHABET[0])
.build();
Emit firstMatch = trie.firstMatch(" " + ALPHABET[0]);
@ -74,7 +73,7 @@ public class TrieTest {
@Test
public void variousKeywordsOneMatch() {
Trie trie = builder()
Trie trie = Trie.builder()
.addKeywords(ALPHABET)
.build();
Collection<Emit> emits = trie.parseText("bcd");
@ -84,7 +83,7 @@ public class TrieTest {
@Test
public void variousKeywordsFirstMatch() {
Trie trie = builder()
Trie trie = Trie.builder()
.addKeywords(ALPHABET)
.build();
Emit firstMatch = trie.firstMatch("bcd");
@ -93,7 +92,7 @@ public class TrieTest {
@Test
public void ushersTestAndStopOnHit() {
Trie trie = builder()
Trie trie = Trie.builder()
.addKeywords(PRONOUNS)
.stopOnHit()
.build();
@ -106,7 +105,7 @@ public class TrieTest {
@Test
public void ushersTest() {
Trie trie = builder()
Trie trie = Trie.builder()
.addKeywords(PRONOUNS)
.build();
Collection<Emit> emits = trie.parseText("ushers");
@ -119,7 +118,7 @@ public class TrieTest {
@Test
public void ushersTestWithCapitalKeywords() {
Trie trie = builder()
Trie trie = Trie.builder()
.ignoreCase()
.addKeyword("HERS")
.addKeyword("HIS")
@ -136,7 +135,7 @@ public class TrieTest {
@Test
public void ushersTestFirstMatch() {
Trie trie = builder()
Trie trie = Trie.builder()
.addKeywords(PRONOUNS)
.build();
Emit firstMatch = trie.firstMatch("ushers");
@ -145,7 +144,7 @@ public class TrieTest {
@Test
public void ushersTestByCallback() {
Trie trie = builder()
Trie trie = Trie.builder()
.addKeywords(PRONOUNS)
.build();
@ -167,7 +166,7 @@ public class TrieTest {
@Test
public void misleadingTest() {
Trie trie = builder()
Trie trie = Trie.builder()
.addKeyword("hers")
.build();
Collection<Emit> emits = trie.parseText("h he her hers");
@ -177,7 +176,7 @@ public class TrieTest {
@Test
public void misleadingTestFirstMatch() {
Trie trie = builder()
Trie trie = Trie.builder()
.addKeyword("hers")
.build();
Emit firstMatch = trie.firstMatch("h he her hers");
@ -186,7 +185,7 @@ public class TrieTest {
@Test
public void recipes() {
Trie trie = builder()
Trie trie = Trie.builder()
.addKeywords(FOOD)
.build();
Collection<Emit> emits = trie.parseText("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli");
@ -199,7 +198,7 @@ public class TrieTest {
@Test
public void recipesFirstMatch() {
Trie trie = builder()
Trie trie = Trie.builder()
.addKeywords(FOOD)
.build();
Emit firstMatch = trie.firstMatch("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli");
@ -209,7 +208,7 @@ public class TrieTest {
@Test
public void longAndShortOverlappingMatch() {
Trie trie = builder()
Trie trie = Trie.builder()
.addKeyword("he")
.addKeyword("hehehehe")
.build();
@ -226,7 +225,7 @@ public class TrieTest {
@Test
public void nonOverlapping() {
Trie trie = builder().removeOverlaps()
Trie trie = Trie.builder().removeOverlaps()
.addKeyword("ab")
.addKeyword("cba")
.addKeyword("ababc")
@ -241,7 +240,7 @@ public class TrieTest {
@Test
public void nonOverlappingFirstMatch() {
Trie trie = builder().removeOverlaps()
Trie trie = Trie.builder().removeOverlaps()
.addKeyword("ab")
.addKeyword("cba")
.addKeyword("ababc")
@ -253,7 +252,7 @@ public class TrieTest {
@Test
public void containsMatch() {
Trie trie = builder().removeOverlaps()
Trie trie = Trie.builder().removeOverlaps()
.addKeyword("ab")
.addKeyword("cba")
.addKeyword("ababc")
@ -263,7 +262,7 @@ public class TrieTest {
@Test
public void startOfChurchillSpeech() {
Trie trie = builder().removeOverlaps()
Trie trie = Trie.builder().removeOverlaps()
.addKeyword("T")
.addKeyword("u")
.addKeyword("ur")
@ -281,7 +280,7 @@ public class TrieTest {
@Test
public void partialMatch() {
Trie trie = builder()
Trie trie = Trie.builder()
.onlyWholeWords()
.addKeyword("sugar")
.build();
@ -292,7 +291,7 @@ public class TrieTest {
@Test
public void partialMatchFirstMatch() {
Trie trie = builder()
Trie trie = Trie.builder()
.onlyWholeWords()
.addKeyword("sugar")
.build();
@ -303,7 +302,7 @@ public class TrieTest {
@Test
public void tokenizeFullSentence() {
Trie trie = builder()
Trie trie = Trie.builder()
.addKeywords(GREEK_LETTERS)
.build();
Collection<Token> tokens = trie.tokenize("Hear: Alpha team first, Beta from the rear, Gamma in reserve");
@ -321,7 +320,7 @@ public class TrieTest {
// @see https://github.com/robert-bor/aho-corasick/issues/5
@Test
public void testStringIndexOutOfBoundsException() {
Trie trie = builder().ignoreCase().onlyWholeWords()
Trie trie = Trie.builder().ignoreCase().onlyWholeWords()
.addKeywords(UNICODE)
.build();
Collection<Emit> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ");
@ -335,7 +334,7 @@ public class TrieTest {
@Test
public void testIgnoreCase() {
Trie trie = builder().ignoreCase()
Trie trie = Trie.builder().ignoreCase()
.addKeywords(UNICODE)
.build();
Collection<Emit> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ");
@ -349,7 +348,7 @@ public class TrieTest {
@Test
public void testIgnoreCaseFirstMatch() {
Trie trie = builder().ignoreCase()
Trie trie = Trie.builder().ignoreCase()
.addKeywords(UNICODE)
.build();
Emit firstMatch = trie.firstMatch("TurninG OnCe AgAiN BÖRKÜ");
@ -359,7 +358,7 @@ public class TrieTest {
@Test
public void tokenizeTokensInSequence() {
Trie trie = builder()
Trie trie = Trie.builder()
.addKeywords(GREEK_LETTERS)
.build();
Collection<Token> tokens = trie.tokenize("Alpha Beta Gamma");
@ -369,7 +368,7 @@ public class TrieTest {
// @see https://github.com/robert-bor/aho-corasick/issues/7
@Test
public void testZeroLength() {
Trie trie = builder().ignoreOverlaps().onlyWholeWords().ignoreCase()
Trie trie = Trie.builder().ignoreOverlaps().onlyWholeWords().ignoreCase()
.addKeyword("")
.build();
trie.tokenize("Try a natural lip and subtle bronzer to keep all the focus on those big bright eyes with NARS Eyeshadow Duo in Rated R And the winner is... Boots No7 Advanced Renewal Anti-ageing Glycolic Peel Kit ($25 amazon.com) won most-appealing peel.");
@ -380,7 +379,7 @@ public class TrieTest {
public void testUnicode1() {
String target = "LİKE THIS"; // The second character ('İ') is Unicode, which was read by AC as a 2-byte char
assertEquals("THIS", target.substring(5, 9)); // Java does it the right way
Trie trie = builder().ignoreCase().onlyWholeWords()
Trie trie = Trie.builder().ignoreCase().onlyWholeWords()
.addKeyword("this")
.build();
Collection<Emit> emits = trie.parseText(target);
@ -393,7 +392,7 @@ public class TrieTest {
@Test
public void testUnicode2() {
String target = "LİKE THIS"; // The second character ('İ') is Unicode, which was read by AC as a 2-byte char
Trie trie = builder()
Trie trie = Trie.builder()
.ignoreCase()
.onlyWholeWords()
.addKeyword("this")
@ -405,7 +404,7 @@ public class TrieTest {
@Test
public void testPartialMatchWhiteSpaces() {
Trie trie = builder()
Trie trie = Trie.builder()
.onlyWholeWordsWhiteSpaceSeparated()
.addKeyword("#sugar-123")
.build();
@ -423,7 +422,7 @@ public class TrieTest {
injectKeyword(text, keyword, interval);
Trie trie = builder()
Trie trie = Trie.builder()
.onlyWholeWords()
.addKeyword(keyword)
.build();
@ -439,10 +438,10 @@ public class TrieTest {
* @param count The number of numbers to generate.
* @return A character sequence filled with random digits.
*/
private StringBuilder randomNumbers(final int count) {
private StringBuilder randomNumbers(int count) {
final StringBuilder sb = new StringBuilder(count);
for (int i = count - 1; i >= 0; i--) {
while (--count > 0) {
sb.append(randomInt(0, 10));
}
@ -468,7 +467,7 @@ public class TrieTest {
}
private int randomInt(final int min, final int max) {
return current().nextInt(min, max);
return ThreadLocalRandom.current().nextInt(min, max);
}
private void checkEmit(Emit next, int expectedStart, int expectedEnd, String expectedKeyword) {