4 spaces for code
Badges for Travis, Codacy, Codecov, Maven and Javadoc. Added Travis CI build instructions.
This commit is contained in:
parent
2f1ec8d041
commit
8ae9636201
6
.travis.yml
Normal file
6
.travis.yml
Normal file
@ -0,0 +1,6 @@
|
||||
language: java
|
||||
install: mvn install -DskipTests=true -Dgpg.skip=true
|
||||
jdk:
|
||||
- oraclejdk8
|
||||
after_success:
|
||||
- bash <(curl -s https://codecov.io/bash)
|
||||
@ -1,6 +1,12 @@
|
||||
Aho-Corasick
|
||||
============
|
||||
|
||||
[Build Status](https://travis-ci.org/robert-bor/aho-corasick)
|
||||
[Codacy Badge](https://www.codacy.com/app/bor-robert/aho-corasick)
|
||||
[Maven Central](https://maven-badges.herokuapp.com/maven-central/org.ahocorasick/ahocorasick)
|
||||
[Javadoc](http://www.javadoc.io/doc/org.ahocorasick/ahocorasick)
|
||||
[License: Apache 2.0](http://www.apache.org/licenses/LICENSE-2.0)
|
||||
|
||||
Dependency
|
||||
----------
|
||||
Include this dependency in your POM. Be sure to check for the latest version in Maven Central.
|
||||
|
||||
93
pom.xml
93
pom.xml
@ -10,11 +10,16 @@
|
||||
<inceptionYear>2014</inceptionYear>
|
||||
<url>http://ahocorasick.org</url>
|
||||
|
||||
<parent>
|
||||
<groupId>org.sonatype.oss</groupId>
|
||||
<artifactId>oss-parent</artifactId>
|
||||
<version>7</version>
|
||||
</parent>
|
||||
<distributionManagement>
|
||||
<snapshotRepository>
|
||||
<id>ossrh</id>
|
||||
<url>https://oss.sonatype.org/content/repositories/snapshots</url>
|
||||
</snapshotRepository>
|
||||
<repository>
|
||||
<id>ossrh</id>
|
||||
<url>https://oss.sonatype.org/service/local/staging/deploy/maven2/</url>
|
||||
</repository>
|
||||
</distributionManagement>
|
||||
|
||||
<organization>
|
||||
<name>42 BV</name>
|
||||
@ -39,9 +44,15 @@
|
||||
<name>Robert Bor</name>
|
||||
<organization>42</organization>
|
||||
</developer>
|
||||
<developer>
|
||||
<name></name>
|
||||
</developer>
|
||||
</developers>
|
||||
|
||||
<properties>
|
||||
<java.version>1.7</java.version>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
|
||||
<junit.version>4.10</junit.version>
|
||||
<!-- Reporting -->
|
||||
<maven.cobertura.version>2.5.2</maven.cobertura.version>
|
||||
@ -63,15 +74,19 @@
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
|
||||
<defaultGoal>install</defaultGoal>
|
||||
|
||||
<plugins>
|
||||
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-jar-plugin</artifactId>
|
||||
<version>2.4</version>
|
||||
<groupId>org.sonatype.plugins</groupId>
|
||||
<artifactId>nexus-staging-maven-plugin</artifactId>
|
||||
<version>1.6.7</version>
|
||||
<extensions>true</extensions>
|
||||
<configuration>
|
||||
<serverId>ossrh</serverId>
|
||||
<nexusUrl>https://oss.sonatype.org/</nexusUrl>
|
||||
<autoReleaseAfterClose>false</autoReleaseAfterClose>
|
||||
</configuration>
|
||||
</plugin>
|
||||
|
||||
<plugin>
|
||||
@ -79,30 +94,56 @@
|
||||
<artifactId>maven-compiler-plugin</artifactId>
|
||||
<version>3.6.0</version>
|
||||
<configuration>
|
||||
<source>1.7</source>
|
||||
<target>1.7</target>
|
||||
<source>${java.version}</source>
|
||||
<target>${java.version}</target>
|
||||
<encoding>${project.build.sourceEncoding}</encoding>
|
||||
</configuration>
|
||||
</plugin>
|
||||
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
<reporting>
|
||||
<plugins>
|
||||
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
<artifactId>cobertura-maven-plugin</artifactId>
|
||||
<version>${maven.cobertura.version}</version>
|
||||
</plugin>
|
||||
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-javadoc-plugin</artifactId>
|
||||
<version>${maven.javadoc.version}</version>
|
||||
<version>2.9.1</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>attach-javadocs</id>
|
||||
<goals>
|
||||
<goal>jar</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-source-plugin</artifactId>
|
||||
<version>2.2.1</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>attach-sources</id>
|
||||
<goals>
|
||||
<goal>jar-no-fork</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-gpg-plugin</artifactId>
|
||||
<version>1.5</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>sign-artifacts</id>
|
||||
<phase>verify</phase>
|
||||
<goals>
|
||||
<goal>sign</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
|
||||
</plugins>
|
||||
</reporting>
|
||||
</build>
|
||||
|
||||
</project>
|
||||
@ -9,7 +9,7 @@ public class Interval implements Intervalable {
|
||||
* Constructs an interval with a start and end position.
|
||||
*
|
||||
* @param start The interval's starting text position.
|
||||
* @param end The interval's ending text position.
|
||||
* @param end The interval's ending text position.
|
||||
*/
|
||||
public Interval(final int start, final int end) {
|
||||
this.start = start;
|
||||
@ -51,7 +51,7 @@ public class Interval implements Intervalable {
|
||||
*/
|
||||
public boolean overlapsWith(final Interval other) {
|
||||
return this.start <= other.getEnd() &&
|
||||
this.end >= other.getStart();
|
||||
this.end >= other.getStart();
|
||||
}
|
||||
|
||||
public boolean overlapsWith(int point) {
|
||||
@ -63,9 +63,9 @@ public class Interval implements Intervalable {
|
||||
if (!(o instanceof Intervalable)) {
|
||||
return false;
|
||||
}
|
||||
Intervalable other = (Intervalable)o;
|
||||
Intervalable other = (Intervalable) o;
|
||||
return this.start == other.getStart() &&
|
||||
this.end == other.getEnd();
|
||||
this.end == other.getEnd();
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -78,7 +78,7 @@ public class Interval implements Intervalable {
|
||||
if (!(o instanceof Intervalable)) {
|
||||
return -1;
|
||||
}
|
||||
Intervalable other = (Intervalable)o;
|
||||
Intervalable other = (Intervalable) o;
|
||||
int comparison = this.start - other.getStart();
|
||||
return comparison != 0 ? comparison : this.end - other.getEnd();
|
||||
}
|
||||
|
||||
@ -6,7 +6,7 @@ import java.util.List;
|
||||
|
||||
public class IntervalNode {
|
||||
|
||||
private enum Direction { LEFT, RIGHT }
|
||||
private enum Direction {LEFT, RIGHT}
|
||||
|
||||
private IntervalNode left = null;
|
||||
private IntervalNode right = null;
|
||||
@ -93,12 +93,12 @@ public class IntervalNode {
|
||||
List<Intervalable> overlaps = new ArrayList<Intervalable>();
|
||||
for (Intervalable currentInterval : this.intervals) {
|
||||
switch (direction) {
|
||||
case LEFT :
|
||||
case LEFT:
|
||||
if (currentInterval.getStart() <= interval.getEnd()) {
|
||||
overlaps.add(currentInterval);
|
||||
}
|
||||
break;
|
||||
case RIGHT :
|
||||
case RIGHT:
|
||||
if (currentInterval.getEnd() >= interval.getStart()) {
|
||||
overlaps.add(currentInterval);
|
||||
}
|
||||
|
||||
@ -3,7 +3,9 @@ package org.ahocorasick.interval;
|
||||
public interface Intervalable extends Comparable {
|
||||
|
||||
public int getStart();
|
||||
|
||||
public int getEnd();
|
||||
|
||||
public int size();
|
||||
|
||||
}
|
||||
|
||||
@ -4,43 +4,51 @@ import java.util.*;
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* A state has various important tasks it must attend to:
|
||||
* A state has various important tasks it must attend to:
|
||||
* </p>
|
||||
*
|
||||
* <ul>
|
||||
* <li>success; when a character points to another state, it must return that state</li>
|
||||
* <li>failure; when a character has no matching state, the algorithm must be able to fall back on a
|
||||
* state with less depth</li>
|
||||
* <li>emits; when this state is passed and keywords have been matched, the matches must be
|
||||
* 'emitted' so that they can be used later on.</li>
|
||||
* </ul>
|
||||
*
|
||||
* <p>
|
||||
* The root state is special in the sense that it has no failure state; it cannot fail. If it 'fails'
|
||||
* it will still parse the next character and start from the root node. This ensures that the algorithm
|
||||
* always runs. All other states always have a fail state.
|
||||
* <ul>
|
||||
* <li>success; when a character points to another state, it must return that state</li>
|
||||
* <li>failure; when a character has no matching state, the algorithm must be able to fall back on a
|
||||
* state with less depth</li>
|
||||
* <li>emits; when this state is passed and keywords have been matched, the matches must be
|
||||
* 'emitted' so that they can be used later on.</li>
|
||||
* </ul>
|
||||
* <p>
|
||||
* <p>
|
||||
* The root state is special in the sense that it has no failure state; it cannot fail. If it 'fails'
|
||||
* it will still parse the next character and start from the root node. This ensures that the algorithm
|
||||
* always runs. All other states always have a fail state.
|
||||
* </p>
|
||||
*
|
||||
* @author Robert Bor
|
||||
*/
|
||||
public class State {
|
||||
|
||||
/** effective the size of the keyword */
|
||||
/**
|
||||
* effectively the size of the keyword
|
||||
*/
|
||||
private final int depth;
|
||||
|
||||
/** only used for the root state to refer to itself in case no matches have been found */
|
||||
/**
|
||||
* only used for the root state to refer to itself in case no matches have been found
|
||||
*/
|
||||
private final State rootState;
|
||||
|
||||
/**
|
||||
* referred to in the white paper as the 'goto' structure. From a state it is possible to go
|
||||
* to other states, depending on the character passed.
|
||||
*/
|
||||
private final Map<Character,State> success = new HashMap<>();
|
||||
private final Map<Character, State> success = new HashMap<>();
|
||||
|
||||
/** if no matching states are found, the failure state will be returned */
|
||||
/**
|
||||
* if no matching states are found, the failure state will be returned
|
||||
*/
|
||||
private State failure;
|
||||
|
||||
/** whenever this state is reached, it will emit the matches keywords for future reference */
|
||||
/**
|
||||
* whenever this state is reached, it will emit the matched keywords for future reference
|
||||
*/
|
||||
private Set<String> emits;
|
||||
|
||||
public State() {
|
||||
@ -54,11 +62,11 @@ public class State {
|
||||
|
||||
private State nextState(final Character character, final boolean ignoreRootState) {
|
||||
State nextState = this.success.get(character);
|
||||
|
||||
|
||||
if (!ignoreRootState && nextState == null && this.rootState != null) {
|
||||
nextState = this.rootState;
|
||||
}
|
||||
|
||||
|
||||
return nextState;
|
||||
}
|
||||
|
||||
@ -69,21 +77,21 @@ public class State {
|
||||
public State nextStateIgnoreRootState(Character character) {
|
||||
return nextState(character, true);
|
||||
}
|
||||
|
||||
public State addState( String keyword ) {
|
||||
State state = this;
|
||||
|
||||
for (final Character character : keyword.toCharArray()) {
|
||||
state = state.addState(character);
|
||||
}
|
||||
|
||||
return state;
|
||||
|
||||
public State addState(String keyword) {
|
||||
State state = this;
|
||||
|
||||
for (final Character character : keyword.toCharArray()) {
|
||||
state = state.addState(character);
|
||||
}
|
||||
|
||||
return state;
|
||||
}
|
||||
|
||||
public State addState(Character character) {
|
||||
State nextState = nextStateIgnoreRootState(character);
|
||||
if (nextState == null) {
|
||||
nextState = new State(this.depth+1);
|
||||
nextState = new State(this.depth + 1);
|
||||
this.success.put(character, nextState);
|
||||
}
|
||||
return nextState;
|
||||
@ -107,7 +115,7 @@ public class State {
|
||||
}
|
||||
|
||||
public Collection<String> emit() {
|
||||
return this.emits == null ? Collections.<String> emptyList() : this.emits;
|
||||
return this.emits == null ? Collections.<String>emptyList() : this.emits;
|
||||
}
|
||||
|
||||
public State failure() {
|
||||
|
||||
@ -1,11 +1,13 @@
|
||||
package org.ahocorasick.trie;
|
||||
|
||||
import static java.lang.Character.isWhitespace;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Queue;
|
||||
import java.util.concurrent.LinkedBlockingDeque;
|
||||
|
||||
import org.ahocorasick.interval.IntervalTree;
|
||||
import org.ahocorasick.interval.Intervalable;
|
||||
import org.ahocorasick.trie.handler.DefaultEmitHandler;
|
||||
@ -14,7 +16,7 @@ import org.ahocorasick.trie.handler.EmitHandler;
|
||||
/**
|
||||
* Based on the Aho-Corasick white paper, Bell technologies:
|
||||
* http://cr.yp.to/bib/1975/aho.pdf
|
||||
*
|
||||
*
|
||||
* @author Robert Bor
|
||||
*/
|
||||
public class Trie {
|
||||
@ -27,21 +29,20 @@ public class Trie {
|
||||
this.trieConfig = trieConfig;
|
||||
this.rootState = new State();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Used by the builder to add a text search keyword.
|
||||
*
|
||||
*
|
||||
* @param keyword The search term to add to the list of search terms.
|
||||
*
|
||||
* @throws NullPointerException if the keyword is null.
|
||||
*/
|
||||
private void addKeyword(String keyword) {
|
||||
if( keyword.isEmpty() ) {
|
||||
return;
|
||||
if (keyword.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
if( isCaseInsensitive() ) {
|
||||
keyword = keyword.toLowerCase();
|
||||
|
||||
if (isCaseInsensitive()) {
|
||||
keyword = keyword.toLowerCase();
|
||||
}
|
||||
|
||||
addState(keyword).addEmit(keyword);
|
||||
@ -49,44 +50,44 @@ public class Trie {
|
||||
|
||||
/**
|
||||
* Delegates to addKeyword.
|
||||
*
|
||||
*
|
||||
* @param keywords List of search terms to add to the list of search terms.
|
||||
*/
|
||||
private void addKeywords( final String[] keywords ) {
|
||||
for( final String keyword : keywords ) {
|
||||
addKeyword( keyword );
|
||||
}
|
||||
private void addKeywords(final String[] keywords) {
|
||||
for (final String keyword : keywords) {
|
||||
addKeyword(keyword);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Delegates to addKeyword.
|
||||
*
|
||||
*
|
||||
* @param keywords List of search terms to add to the list of search terms.
|
||||
*/
|
||||
private void addKeywords( final Collection<String> keywords ) {
|
||||
for( final String keyword : keywords ) {
|
||||
addKeyword( keyword );
|
||||
}
|
||||
private void addKeywords(final Collection<String> keywords) {
|
||||
for (final String keyword : keywords) {
|
||||
addKeyword(keyword);
|
||||
}
|
||||
}
|
||||
|
||||
private State addState(final String keyword) {
|
||||
return getRootState().addState(keyword);
|
||||
}
|
||||
|
||||
|
||||
public Collection<Token> tokenize(final String text) {
|
||||
final Collection<Token> tokens = new ArrayList<>();
|
||||
final Collection<Emit> collectedEmits = parseText(text);
|
||||
int lastCollectedPosition = -1;
|
||||
|
||||
|
||||
for (final Emit emit : collectedEmits) {
|
||||
if (emit.getStart() - lastCollectedPosition > 1) {
|
||||
tokens.add(createFragment(emit, text, lastCollectedPosition));
|
||||
}
|
||||
|
||||
|
||||
tokens.add(createMatch(emit, text));
|
||||
lastCollectedPosition = emit.getEnd();
|
||||
}
|
||||
|
||||
|
||||
if (text.length() - lastCollectedPosition > 1) {
|
||||
tokens.add(createFragment(null, text, lastCollectedPosition));
|
||||
}
|
||||
@ -95,11 +96,11 @@ public class Trie {
|
||||
}
|
||||
|
||||
private Token createFragment(final Emit emit, final String text, final int lastCollectedPosition) {
|
||||
return new FragmentToken(text.substring(lastCollectedPosition+1, emit == null ? text.length() : emit.getStart()));
|
||||
return new FragmentToken(text.substring(lastCollectedPosition + 1, emit == null ? text.length() : emit.getStart()));
|
||||
}
|
||||
|
||||
private Token createMatch(Emit emit, String text) {
|
||||
return new MatchToken(text.substring(emit.getStart(), emit.getEnd()+1), emit);
|
||||
return new MatchToken(text.substring(emit.getStart(), emit.getEnd() + 1), emit);
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
@ -118,7 +119,7 @@ public class Trie {
|
||||
}
|
||||
|
||||
if (!trieConfig.isAllowOverlaps()) {
|
||||
IntervalTree intervalTree = new IntervalTree((List<Intervalable>)(List<?>)collectedEmits);
|
||||
IntervalTree intervalTree = new IntervalTree((List<Intervalable>) (List<?>) collectedEmits);
|
||||
intervalTree.removeOverlaps((List<Intervalable>) (List<?>) collectedEmits);
|
||||
}
|
||||
|
||||
@ -131,15 +132,15 @@ public class Trie {
|
||||
|
||||
public void parseText(final CharSequence text, final EmitHandler emitHandler) {
|
||||
State currentState = getRootState();
|
||||
|
||||
|
||||
for (int position = 0; position < text.length(); position++) {
|
||||
Character character = text.charAt(position);
|
||||
|
||||
|
||||
// TODO: Maybe lowercase the entire string at once?
|
||||
if (trieConfig.isCaseInsensitive()) {
|
||||
character = Character.toLowerCase(character);
|
||||
}
|
||||
|
||||
|
||||
currentState = getState(currentState, character);
|
||||
if (storeEmits(position, currentState, emitHandler) && trieConfig.isStopOnHit()) {
|
||||
return;
|
||||
@ -157,18 +158,18 @@ public class Trie {
|
||||
} else {
|
||||
// Fast path. Returns first match found.
|
||||
State currentState = getRootState();
|
||||
|
||||
|
||||
for (int position = 0; position < text.length(); position++) {
|
||||
Character character = text.charAt(position);
|
||||
|
||||
|
||||
// TODO: Lowercase the entire string at once?
|
||||
if (trieConfig.isCaseInsensitive()) {
|
||||
character = Character.toLowerCase(character);
|
||||
}
|
||||
|
||||
|
||||
currentState = getState(currentState, character);
|
||||
Collection<String> emitStrs = currentState.emit();
|
||||
|
||||
|
||||
if (emitStrs != null && !emitStrs.isEmpty()) {
|
||||
for (String emitStr : emitStrs) {
|
||||
final Emit emit = new Emit(position - emitStr.length() + 1, position, emitStr);
|
||||
@ -183,26 +184,26 @@ public class Trie {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
private boolean isPartialMatch(final CharSequence searchText, final Emit emit) {
|
||||
return (emit.getStart() != 0 &&
|
||||
Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) ||
|
||||
(emit.getEnd() + 1 != searchText.length() &&
|
||||
Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1)));
|
||||
Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) ||
|
||||
(emit.getEnd() + 1 != searchText.length() &&
|
||||
Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1)));
|
||||
}
|
||||
|
||||
private void removePartialMatches(final CharSequence searchText, final List<Emit> collectedEmits) {
|
||||
final List<Emit> removeEmits = new ArrayList<>();
|
||||
|
||||
|
||||
for (final Emit emit : collectedEmits) {
|
||||
if (isPartialMatch(searchText, emit)) {
|
||||
removeEmits.add(emit);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
for (final Emit removeEmit : removeEmits) {
|
||||
collectedEmits.remove(removeEmit);
|
||||
}
|
||||
@ -211,15 +212,15 @@ public class Trie {
|
||||
private void removePartialMatchesWhiteSpaceSeparated(final CharSequence searchText, final List<Emit> collectedEmits) {
|
||||
final long size = searchText.length();
|
||||
final List<Emit> removeEmits = new ArrayList<>();
|
||||
|
||||
|
||||
for (final Emit emit : collectedEmits) {
|
||||
if ((emit.getStart() == 0 || isWhitespace(searchText.charAt(emit.getStart() - 1))) &&
|
||||
(emit.getEnd() + 1 == size || isWhitespace(searchText.charAt(emit.getEnd() + 1)))) {
|
||||
(emit.getEnd() + 1 == size || isWhitespace(searchText.charAt(emit.getEnd() + 1)))) {
|
||||
continue;
|
||||
}
|
||||
removeEmits.add(emit);
|
||||
}
|
||||
|
||||
|
||||
for (final Emit removeEmit : removeEmits) {
|
||||
collectedEmits.remove(removeEmit);
|
||||
}
|
||||
@ -227,12 +228,12 @@ public class Trie {
|
||||
|
||||
private State getState(State currentState, final Character character) {
|
||||
State newCurrentState = currentState.nextState(character);
|
||||
|
||||
|
||||
while (newCurrentState == null) {
|
||||
currentState = currentState.failure();
|
||||
newCurrentState = currentState.nextState(character);
|
||||
}
|
||||
|
||||
|
||||
return newCurrentState;
|
||||
}
|
||||
|
||||
@ -269,7 +270,7 @@ public class Trie {
|
||||
private boolean storeEmits(final int position, final State currentState, final EmitHandler emitHandler) {
|
||||
boolean emitted = false;
|
||||
final Collection<String> emits = currentState.emit();
|
||||
|
||||
|
||||
// TODO: The check for empty might be superfluous.
|
||||
if (emits != null && !emits.isEmpty()) {
|
||||
for (final String emit : emits) {
|
||||
@ -277,21 +278,21 @@ public class Trie {
|
||||
emitted = true;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return emitted;
|
||||
}
|
||||
|
||||
private boolean isCaseInsensitive() {
|
||||
return trieConfig.isCaseInsensitive();
|
||||
return trieConfig.isCaseInsensitive();
|
||||
}
|
||||
|
||||
|
||||
private State getRootState() {
|
||||
return this.rootState;
|
||||
return this.rootState;
|
||||
}
|
||||
|
||||
/**
|
||||
* Provides a fluent interface for constructing Trie instances.
|
||||
*
|
||||
*
|
||||
* @return The builder used to configure its Trie.
|
||||
*/
|
||||
public static TrieBuilder builder() {
|
||||
@ -307,13 +308,13 @@ public class Trie {
|
||||
/**
|
||||
* Default (empty) constructor.
|
||||
*/
|
||||
private TrieBuilder() {}
|
||||
private TrieBuilder() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a keyword to the Trie's list of text search keywords.
|
||||
*
|
||||
*
|
||||
* @param keyword The keyword to add to the list.
|
||||
*
|
||||
* @return This builder.
|
||||
* @throws NullPointerException if the keyword is null.
|
||||
*/
|
||||
@ -321,35 +322,33 @@ public class Trie {
|
||||
this.trie.addKeyword(keyword);
|
||||
return this;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Adds a list of keywords to the Trie's list of text search keywords.
|
||||
*
|
||||
*
|
||||
* @param keywords The keywords to add to the list.
|
||||
*
|
||||
* @return This builder.
|
||||
*/
|
||||
public TrieBuilder addKeywords(final String... keywords) {
|
||||
this.trie.addKeywords(keywords);
|
||||
return this;
|
||||
this.trie.addKeywords(keywords);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a list of keywords to the Trie's list of text search keywords.
|
||||
*
|
||||
*
|
||||
* @param keywords The keywords to add to the list.
|
||||
*
|
||||
* @return This builder.
|
||||
*/
|
||||
public TrieBuilder addKeywords(final Collection<String> keywords) {
|
||||
this.trie.addKeywords(keywords);
|
||||
return this;
|
||||
this.trie.addKeywords(keywords);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Configure the Trie to ignore case when searching for keywords in
|
||||
* the text.
|
||||
*
|
||||
*
|
||||
* @return This builder.
|
||||
*/
|
||||
public TrieBuilder ignoreCase() {
|
||||
@ -359,7 +358,7 @@ public class Trie {
|
||||
|
||||
/**
|
||||
* Configure the Trie to ignore overlapping keywords.
|
||||
*
|
||||
*
|
||||
* @return This builder.
|
||||
*/
|
||||
public TrieBuilder ignoreOverlaps() {
|
||||
@ -369,7 +368,7 @@ public class Trie {
|
||||
|
||||
/**
|
||||
* Configure the Trie to match whole keywords in the text.
|
||||
*
|
||||
*
|
||||
* @return This builder.
|
||||
*/
|
||||
public TrieBuilder onlyWholeWords() {
|
||||
@ -381,7 +380,7 @@ public class Trie {
|
||||
* Configure the Trie to match whole keywords that are separated by
|
||||
* whitespace in the text. For example, "this keyword thatkeyword"
|
||||
* would only match the first occurrence of "keyword".
|
||||
*
|
||||
*
|
||||
* @return This builder.
|
||||
*/
|
||||
public TrieBuilder onlyWholeWordsWhiteSpaceSeparated() {
|
||||
@ -392,7 +391,7 @@ public class Trie {
|
||||
/**
|
||||
* Configure the Trie to stop after the first keyword is found in the
|
||||
* text.
|
||||
*
|
||||
*
|
||||
* @return This builder.
|
||||
*/
|
||||
public TrieBuilder stopOnHit() {
|
||||
@ -402,27 +401,25 @@ public class Trie {
|
||||
|
||||
/**
|
||||
* Configure the Trie based on the builder settings.
|
||||
*
|
||||
*
|
||||
* @return The configured Trie.
|
||||
*/
|
||||
public Trie build() {
|
||||
this.trie.constructFailureStates();
|
||||
return this.trie;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @deprecated Use ignoreCase()
|
||||
*
|
||||
* @return This builder.
|
||||
* @deprecated Use ignoreCase()
|
||||
*/
|
||||
public TrieBuilder caseInsensitive() {
|
||||
return ignoreCase();
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated Use ignoreOverlaps()
|
||||
*
|
||||
* @return This builder.
|
||||
* @deprecated Use ignoreOverlaps()
|
||||
*/
|
||||
public TrieBuilder removeOverlaps() {
|
||||
return ignoreOverlaps();
|
||||
|
||||
@ -12,9 +12,13 @@ public class TrieConfig {
|
||||
|
||||
private boolean stopOnHit = false;
|
||||
|
||||
public boolean isStopOnHit() { return stopOnHit; }
|
||||
public boolean isStopOnHit() {
|
||||
return stopOnHit;
|
||||
}
|
||||
|
||||
public void setStopOnHit(boolean stopOnHit) { this.stopOnHit = stopOnHit; }
|
||||
public void setStopOnHit(boolean stopOnHit) {
|
||||
this.stopOnHit = stopOnHit;
|
||||
}
|
||||
|
||||
public boolean isAllowOverlaps() {
|
||||
return allowOverlaps;
|
||||
@ -32,7 +36,9 @@ public class TrieConfig {
|
||||
this.onlyWholeWords = onlyWholeWords;
|
||||
}
|
||||
|
||||
public boolean isOnlyWholeWordsWhiteSpaceSeparated() { return onlyWholeWordsWhiteSpaceSeparated; }
|
||||
public boolean isOnlyWholeWordsWhiteSpaceSeparated() {
|
||||
return onlyWholeWordsWhiteSpaceSeparated;
|
||||
}
|
||||
|
||||
public void setOnlyWholeWordsWhiteSpaceSeparated(boolean onlyWholeWordsWhiteSpaceSeparated) {
|
||||
this.onlyWholeWordsWhiteSpaceSeparated = onlyWholeWordsWhiteSpaceSeparated;
|
||||
|
||||
@ -12,19 +12,19 @@ public class IntervalTest {
|
||||
|
||||
@Test
|
||||
public void construct() {
|
||||
Interval i = new Interval(1,3);
|
||||
Interval i = new Interval(1, 3);
|
||||
assertEquals(1, i.getStart());
|
||||
assertEquals(3, i.getEnd());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void size() {
|
||||
assertEquals(3, new Interval(0,2).size());
|
||||
assertEquals(3, new Interval(0, 2).size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void intervaloverlaps() {
|
||||
assertTrue(new Interval(1,3).overlapsWith(new Interval(2,4)));
|
||||
assertTrue(new Interval(1, 3).overlapsWith(new Interval(2, 4)));
|
||||
}
|
||||
|
||||
@Test
|
||||
@ -34,7 +34,7 @@ public class IntervalTest {
|
||||
|
||||
@Test
|
||||
public void pointOverlaps() {
|
||||
assertTrue(new Interval(1,3).overlapsWith(2));
|
||||
assertTrue(new Interval(1, 3).overlapsWith(2));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
||||
@ -9,7 +9,7 @@ import java.util.List;
|
||||
import static junit.framework.Assert.assertEquals;
|
||||
|
||||
public class IntervalTreeTest {
|
||||
|
||||
|
||||
@Test
|
||||
public void findOverlaps() {
|
||||
List<Intervalable> intervals = new ArrayList<Intervalable>();
|
||||
@ -20,7 +20,7 @@ public class IntervalTreeTest {
|
||||
intervals.add(new Interval(4, 6));
|
||||
intervals.add(new Interval(5, 7));
|
||||
IntervalTree intervalTree = new IntervalTree(intervals);
|
||||
List<Intervalable> overlaps = intervalTree.findOverlaps(new Interval(1,3));
|
||||
List<Intervalable> overlaps = intervalTree.findOverlaps(new Interval(1, 3));
|
||||
assertEquals(3, overlaps.size());
|
||||
Iterator<Intervalable> overlapsIt = overlaps.iterator();
|
||||
assertOverlap(overlapsIt.next(), 2, 4);
|
||||
@ -47,5 +47,5 @@ public class IntervalTreeTest {
|
||||
assertEquals(expectedStart, interval.getStart());
|
||||
assertEquals(expectedEnd, interval.getEnd());
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@ -13,9 +13,9 @@ public class IntervalableComparatorByPositionTest {
|
||||
@Test
|
||||
public void sortOnPosition() {
|
||||
List<Intervalable> intervals = new ArrayList<Intervalable>();
|
||||
intervals.add(new Interval(4,5));
|
||||
intervals.add(new Interval(1,4));
|
||||
intervals.add(new Interval(3,8));
|
||||
intervals.add(new Interval(4, 5));
|
||||
intervals.add(new Interval(1, 4));
|
||||
intervals.add(new Interval(3, 8));
|
||||
Collections.sort(intervals, new IntervalableComparatorByPosition());
|
||||
assertEquals(4, intervals.get(0).size());
|
||||
assertEquals(6, intervals.get(1).size());
|
||||
|
||||
@ -13,9 +13,9 @@ public class IntervalableComparatorBySizeTest {
|
||||
@Test
|
||||
public void sortOnSize() {
|
||||
List<Intervalable> intervals = new ArrayList<Intervalable>();
|
||||
intervals.add(new Interval(4,5));
|
||||
intervals.add(new Interval(1,4));
|
||||
intervals.add(new Interval(3,8));
|
||||
intervals.add(new Interval(4, 5));
|
||||
intervals.add(new Interval(1, 4));
|
||||
intervals.add(new Interval(3, 8));
|
||||
Collections.sort(intervals, new IntervalableComparatorBySize());
|
||||
assertEquals(6, intervals.get(0).size());
|
||||
assertEquals(4, intervals.get(1).size());
|
||||
@ -25,8 +25,8 @@ public class IntervalableComparatorBySizeTest {
|
||||
@Test
|
||||
public void sortOnSizeThenPosition() {
|
||||
List<Intervalable> intervals = new ArrayList<Intervalable>();
|
||||
intervals.add(new Interval(4,7));
|
||||
intervals.add(new Interval(2,5));
|
||||
intervals.add(new Interval(4, 7));
|
||||
intervals.add(new Interval(2, 5));
|
||||
Collections.sort(intervals, new IntervalableComparatorBySize());
|
||||
assertEquals(2, intervals.get(0).getStart());
|
||||
assertEquals(4, intervals.get(1).getStart());
|
||||
|
||||
@ -11,9 +11,9 @@ public class StateTest {
|
||||
public void constructSequenceOfCharacters() {
|
||||
State rootState = new State();
|
||||
rootState
|
||||
.addState('a')
|
||||
.addState('b')
|
||||
.addState('c');
|
||||
.addState('a')
|
||||
.addState('b')
|
||||
.addState('c');
|
||||
State currentState = rootState.nextState('a');
|
||||
assertEquals(1, currentState.getDepth());
|
||||
currentState = currentState.nextState('b');
|
||||
|
||||
@ -5,30 +5,34 @@ import java.util.Collection;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.ThreadLocalRandom;
|
||||
|
||||
import static junit.framework.Assert.assertEquals;
|
||||
|
||||
import org.ahocorasick.trie.handler.EmitHandler;
|
||||
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
public class TrieTest {
|
||||
private final static String[] ALPHABET = new String[]{
|
||||
"abc", "bcd", "cde"
|
||||
"abc", "bcd", "cde"
|
||||
};
|
||||
|
||||
|
||||
private final static String[] PRONOUNS = new String[]{
|
||||
"hers", "his", "she", "he"
|
||||
"hers", "his", "she", "he"
|
||||
};
|
||||
|
||||
private final static String[] FOOD = new String[]{
|
||||
"veal", "cauliflower", "broccoli", "tomatoes"
|
||||
"veal", "cauliflower", "broccoli", "tomatoes"
|
||||
};
|
||||
|
||||
private final static String[] GREEK_LETTERS = new String[]{
|
||||
"Alpha", "Beta", "Gamma"
|
||||
"Alpha", "Beta", "Gamma"
|
||||
};
|
||||
|
||||
|
||||
private final static String[] UNICODE = new String[]{
|
||||
"turning", "once", "again", "börkü"
|
||||
"turning", "once", "again", "börkü"
|
||||
};
|
||||
|
||||
@Test
|
||||
@ -406,7 +410,7 @@ public class TrieTest {
|
||||
.onlyWholeWordsWhiteSpaceSeparated()
|
||||
.addKeyword("#sugar-123")
|
||||
.build();
|
||||
Collection < Emit > emits = trie.parseText("#sugar-123 #sugar-1234"); // left, middle, right test
|
||||
Collection<Emit> emits = trie.parseText("#sugar-123 #sugar-1234"); // left, middle, right test
|
||||
assertEquals(1, emits.size()); // Match must not be made
|
||||
checkEmit(emits.iterator().next(), 0, 9, "#sugar-123");
|
||||
}
|
||||
@ -415,57 +419,57 @@ public class TrieTest {
|
||||
public void testLargeString() {
|
||||
final int interval = 100;
|
||||
final int textSize = 1000000;
|
||||
final String keyword = FOOD[ 1 ];
|
||||
final StringBuilder text = randomNumbers( textSize );
|
||||
final String keyword = FOOD[1];
|
||||
final StringBuilder text = randomNumbers(textSize);
|
||||
|
||||
injectKeyword( text, keyword, interval );
|
||||
injectKeyword(text, keyword, interval);
|
||||
|
||||
Trie trie = Trie.builder()
|
||||
.onlyWholeWords()
|
||||
.addKeyword( keyword )
|
||||
.build();
|
||||
.onlyWholeWords()
|
||||
.addKeyword(keyword)
|
||||
.build();
|
||||
|
||||
final Collection<Emit> emits = trie.parseText( text );
|
||||
final Collection<Emit> emits = trie.parseText(text);
|
||||
|
||||
assertEquals( textSize / interval, emits.size() );
|
||||
assertEquals(textSize / interval, emits.size());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Generates a random sequence of ASCII numbers.
|
||||
*
|
||||
*
|
||||
* @param count The number of numbers to generate.
|
||||
* @return A character sequence filled with random digits.
|
||||
*/
|
||||
private StringBuilder randomNumbers( int count ) {
|
||||
final StringBuilder sb = new StringBuilder( count );
|
||||
private StringBuilder randomNumbers(int count) {
|
||||
final StringBuilder sb = new StringBuilder(count);
|
||||
|
||||
while( --count > 0 ) {
|
||||
sb.append( randomInt( 0, 10 ) );
|
||||
while (--count > 0) {
|
||||
sb.append(randomInt(0, 10));
|
||||
}
|
||||
|
||||
return sb;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Injects keywords into a string builder.
|
||||
*
|
||||
* @param source Should contain a bunch of random data that cannot match
|
||||
* any keyword.
|
||||
* @param keyword A keyword to inject repeatedly in the text.
|
||||
*
|
||||
* @param source Should contain a bunch of random data that cannot match
|
||||
* any keyword.
|
||||
* @param keyword A keyword to inject repeatedly in the text.
|
||||
* @param interval How often to inject the keyword.
|
||||
*/
|
||||
private void injectKeyword(
|
||||
final StringBuilder source,
|
||||
final String keyword,
|
||||
final int interval ) {
|
||||
private void injectKeyword(
|
||||
final StringBuilder source,
|
||||
final String keyword,
|
||||
final int interval) {
|
||||
final int length = source.length();
|
||||
for( int i = 0; i < length; i += interval ) {
|
||||
source.replace( i, i + keyword.length(), keyword );
|
||||
for (int i = 0; i < length; i += interval) {
|
||||
source.replace(i, i + keyword.length(), keyword);
|
||||
}
|
||||
}
|
||||
|
||||
private int randomInt( final int min, final int max ) {
|
||||
return ThreadLocalRandom.current().nextInt( min, max );
|
||||
|
||||
private int randomInt(final int min, final int max) {
|
||||
return ThreadLocalRandom.current().nextInt(min, max);
|
||||
}
|
||||
|
||||
private void checkEmit(Emit next, int expectedStart, int expectedEnd, String expectedKeyword) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user