Compare commits

..

1 Commits

Author SHA1 Message Date
Dave Jarvis
713632dded Address IDE warnings 2020-08-23 16:15:32 -07:00
53 changed files with 1175 additions and 1624 deletions

View File

@ -1,21 +0,0 @@
include:
- project: 'gitlab/gitlab'
ref: 'main'
file: 'ci-templates/gradle_java.yml'
deploy:
stage: deploy
tags:
- dind
script:
- echo "Building with gradle version ${BUILDVERSION}"
- gradle -Pversion=${BUILDVERSION} publish
- echo "BUILDVERSION=$BUILDVERSION" >> version.env
artifacts:
reports:
dotenv: version.env
rules:
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
- if: $CI_COMMIT_BRANCH =~ /^release/
- if: $CI_COMMIT_TAG

6
.travis.yml Normal file
View File

@ -0,0 +1,6 @@
language: java
install: mvn install -DskipTests=true -Dgpg.skip=true
jdk:
- openjdk8
after_success:
- bash <(curl -s https://codecov.io/bash)

View File

@ -1,202 +1,237 @@
Aho-Corasick
============
Apache License [![Build Status](https://travis-ci.org/robert-bor/aho-corasick.svg?branch=master)](https://travis-ci.org/robert-bor/aho-corasick)
Version 2.0, January 2004 [![Codacy Badge](https://api.codacy.com/project/badge/Grade/0f65bfb641f745a4b301b85d028a4a8d)](https://www.codacy.com/app/bor-robert/aho-corasick)
http://www.apache.org/licenses/ [![Codecov](https://codecov.io/gh/robert-bor/aho-corasick/branch/master/graph/badge.svg)](https://codecov.io/gh/robert-bor/aho-corasick)
[![Maven Central](https://maven-badges.herokuapp.com/maven-central/org.ahocorasick/ahocorasick/badge.svg)](https://maven-badges.herokuapp.com/maven-central/org.ahocorasick/ahocorasick)
[![Javadoc](https://javadoc-emblem.rhcloud.com/doc/org.ahocorasick/ahocorasick/badge.svg)](http://www.javadoc.io/doc/org.ahocorasick/ahocorasick)
[![Apache 2](http://img.shields.io/badge/license-Apache%202-blue.svg)](http://www.apache.org/licenses/LICENSE-2.0)
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION Dependency
----------
1. Definitions. Include this dependency in your POM. Be sure to check for the latest version in Maven Central.
"License" shall mean the terms and conditions for use, reproduction, ```xml
and distribution as defined by Sections 1 through 9 of this document. <dependency>
<groupId>org.ahocorasick</groupId>
<artifactId>ahocorasick</artifactId>
<version>0.5.0</version>
</dependency>
```
"Licensor" shall mean the copyright owner or entity authorized by Introduction
the copyright owner that is granting the License. ------------
"Legal Entity" shall mean the union of the acting entity and all Most free-text searching is based on Lucene-like approaches, where the
other entities that control, are controlled by, or are under common search text is parsed into its various components. For every keyword a
control with that entity. For the purposes of this definition, lookup is done to see where it occurs. When looking for a couple of keywords
"control" means (i) the power, direct or indirect, to cause the this approach is great, but when searching for 100,000 words, the approach
direction or management of such entity, whether by contract or is quite slow (for example, checking against a dictionary).
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity The Aho-Corasick algorithm shines when looking for multiple words.
exercising permissions granted by this License. Rather than chop up the search text, it uses all the keywords to build
a [Trie](http://en.wikipedia.org/wiki/Trie) construct. The crucial
Aho-Corasick components include:
"Source" form shall mean the preferred form for making modifications, * goto
including but not limited to software source code, documentation * fail
source, and configuration files. * output
"Object" form shall mean any form resulting from mechanical Every character encountered is presented to a state object within the
transformation or translation of a Source form, including but *goto* structure. If there is a matching state, that will be elevated to
not limited to compiled object code, generated documentation, the new current state.
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or However, if there is no matching state, the algorithm will signal a
Object form, made available under the License, as indicated by a *fail* and fall back to states with less depth (i.e., a match less long)
copyright notice that is included in or attached to the work and proceed from there, until it found a matching state, or it has reached
(an example is provided in the Appendix below). the root state.
"Derivative Works" shall mean any work, whether in Source or Object Whenever a state is reached that matches an entire keyword, it is
form, that is based on (or derived from) the Work and for which the emitted to an *output* set which can be read after the entire scan
editorial revisions, annotations, elaborations, or other modifications has completed.
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including The algorithm is O(n). No matter how many keywords are given, or how large
the original version of the Work and any modifications or additions the search text is, the performance will decline linearly.
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity The Aho-Corasick algorithm can help:
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of * find words in texts to link or emphasize them;
this License, each Contributor hereby grants to You a perpetual, * add semantics to plain text; or
worldwide, non-exclusive, no-charge, royalty-free, irrevocable * check against a dictionary to see if syntactic errors were made.
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of See the [white paper](http://cr.yp.to/bib/1975/aho.pdf) by Aho and
this License, each Contributor hereby grants to You a perpetual, Corasick for algorithmic details.
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the Usage
Work or Derivative Works thereof in any medium, with or without -----
modifications, and in Source or Object form, provided that You Set up the Trie using a builder as follows:
meet the following conditions:
(a) You must give any other recipients of the Work or ```java
Derivative Works a copy of this License; and Trie trie = Trie.builder()
.addKeyword("hers")
.addKeyword("his")
.addKeyword("she")
.addKeyword("he")
.build();
Collection<Emit> emits = trie.parseText("ushers");
```
(b) You must cause any modified files to carry prominent notices The collection will contain `Emit` objects that match:
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works * "she" starting at position 1, ending at position 3
that You distribute, all copyright, patent, trademark, and * "he" starting at position 2, ending at position 3
attribution notices from the Source form of the Work, * "hers" starting at position 2, ending at position 5
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its In situations where overlapping instances are not desired, retain
distribution, then any Derivative Works that You distribute must the longest and left-most matches by calling `ignoreOverlaps()`:
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and ```java
may provide additional or different license terms and conditions Trie trie = Trie.builder()
for use, reproduction, or distribution of Your modifications, or .ignoreOverlaps()
for any such Derivative Works as a whole, provided Your use, .addKeyword("hot")
reproduction, and distribution of the Work otherwise complies with .addKeyword("hot chocolate")
the conditions stated in this License. .build();
Collection<Emit> emits = trie.parseText("hot chocolate");
```
5. Submission of Contributions. Unless You explicitly state otherwise, The `ignoreOverlaps()` method tells the Trie to remove all overlapping
any Contribution intentionally submitted for inclusion in the Work matches. For this it relies on the following conflict resolution rules:
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade 1. longer matches prevail over shorter matches; and
names, trademarks, service marks, or product names of the Licensor, 1. left-most prevails over right-most.
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or Only one result is returned:
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory, * "hot chocolate" starting at position 0, ending at position 12
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing To check for whole words exclusively, call `onlyWholeWords()` as follows:
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS ```java
Trie trie = Trie.builder()
.onlyWholeWords()
.addKeyword("sugar")
.build();
Collection<Emit> emits = trie.parseText("sugarcane sugar canesugar");
```
APPENDIX: How to apply the Apache License to your work. Only one match is found; whereas, without calling `onlyWholeWords()` three
matches are found. The sugarcane/canesugar words are discarded because
they are partial matches.
To apply the Apache License to your work, attach the following Some text is `WrItTeN` in mixed case, which makes it hard to identify.
boilerplate notice, with the fields enclosed by brackets "[]" Instruct the Trie to convert the searchtext to lowercase to ease the
replaced with your own identifying information. (Don't include matching process. The lower-casing applies to keywords as well.
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2018 Robert Bor ```java
Trie trie = Trie.builder()
.ignoreCase()
.addKeyword("casing")
.build();
Collection<Emit> emits = trie.parseText("CaSiNg");
```
Licensed under the Apache License, Version 2.0 (the "License"); Normally, this match would not be found. By calling `ignoreCase()`,
you may not use this file except in compliance with the License. the entire search text is made lowercase before matching begins.
You may obtain a copy of the License at Therefore it will find exactly one match.
http://www.apache.org/licenses/LICENSE-2.0 It is also possible to just ask whether the text matches any of
the keywords, or just to return the first match it finds.
```java
Trie trie = Trie.builder().ignoreOverlaps()
.addKeyword("ab")
.addKeyword("cba")
.addKeyword("ababc")
.build();
Emit firstMatch = trie.firstMatch("ababcbab");
```
The value for `firstMatch` will be "ababc" from position 0. The
`containsMatch()` method checks whether `firstMatch` found a match and
returns `true` if that is the case.
For a barebones Aho-Corasick algorithm with a custom emit handler use:
```java
Trie trie = Trie.builder()
.addKeyword("hers")
.addKeyword("his")
.addKeyword("she")
.addKeyword("he")
.build();
final List<Emit> emits = new ArrayList<>();
EmitHandler emitHandler = new EmitHandler() {
@Override
public void emit(Emit emit) {
emits.add(emit);
}
};
```
In many cases you may want to perform tasks with both the non-matching
and the matching text. Such implementations may be better served by using
`Trie.tokenize()`. The `tokenize()` method allows looping over the
corpus to deal with matches as soon as they are encountered. Here's an
example that outputs key words as italicized HTML elements:
```java
String speech = "The Answer to the Great Question... Of Life, " +
"the Universe and Everything... Is... Forty-two,' said " +
"Deep Thought, with infinite majesty and calm.";
Trie trie = Trie.builder().ignoreOverlaps().onlyWholeWords().ignoreCase()
.addKeyword("great question")
.addKeyword("forty-two")
.addKeyword("deep thought")
.build();
Collection<Token> tokens = trie.tokenize(speech);
StringBuilder html = new StringBuilder();
html.append("<html><body><p>");
for (Token token : tokens) {
if (token.isMatch()) {
html.append("<i>");
}
html.append(token.getFragment());
if (token.isMatch()) {
html.append("</i>");
}
}
html.append("</p></body></html>");
System.out.println(html);
```
You can also emit custom outputs. This might for example be useful to
implement a trivial named entity recognizer. In this case use a
`PayloadTrie` instead of a `Trie` as follows:
```java
class Word {
private final String gender;
public Word(String gender) {
this.gender = gender;
}
}
PayloadTrie<Word> trie = PayloadTrie.<Word>builder()
.addKeyword("hers", new Word("f"))
.addKeyword("his", new Word("m"))
.addKeyword("she", new Word("f"))
.addKeyword("he", new Word("m"))
.addKeyword("nonbinary", new Word("nb"))
.addKeyword("transgender", new Word("tg"))
.build();
Collection<PayloadEmit<Word>> emits = trie.parseText("ushers");
```
Releases
--------
See [releases](https://github.com/robert-bor/aho-corasick/releases) for details.
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@ -2,9 +2,10 @@ Aho-Corasick
============ ============
[![Build Status](https://travis-ci.org/robert-bor/aho-corasick.svg?branch=master)](https://travis-ci.org/robert-bor/aho-corasick) [![Build Status](https://travis-ci.org/robert-bor/aho-corasick.svg?branch=master)](https://travis-ci.org/robert-bor/aho-corasick)
[![Codacy Badge](https://api.codacy.com/project/badge/Grade/0f65bfb641f745a4b301b85d028a4a8d)](https://www.codacy.com/app/bor-robert/aho-corasick)
[![Codecov](https://codecov.io/gh/robert-bor/aho-corasick/branch/master/graph/badge.svg)](https://codecov.io/gh/robert-bor/aho-corasick) [![Codecov](https://codecov.io/gh/robert-bor/aho-corasick/branch/master/graph/badge.svg)](https://codecov.io/gh/robert-bor/aho-corasick)
[![Maven Central](https://maven-badges.herokuapp.com/maven-central/org.ahocorasick/ahocorasick/badge.svg)](https://maven-badges.herokuapp.com/maven-central/org.ahocorasick/ahocorasick) [![Maven Central](https://maven-badges.herokuapp.com/maven-central/org.ahocorasick/ahocorasick/badge.svg)](https://maven-badges.herokuapp.com/maven-central/org.ahocorasick/ahocorasick)
[![Javadoc](https://javadoc.io/badge2/org.ahocorasick/ahocorasick/javadoc.svg)](https://javadoc.io/doc/org.ahocorasick/ahocorasick) [![Javadoc](https://javadoc-emblem.rhcloud.com/doc/org.ahocorasick/ahocorasick/badge.svg)](http://www.javadoc.io/doc/org.ahocorasick/ahocorasick)
[![Apache 2](http://img.shields.io/badge/license-Apache%202-blue.svg)](http://www.apache.org/licenses/LICENSE-2.0) [![Apache 2](http://img.shields.io/badge/license-Apache%202-blue.svg)](http://www.apache.org/licenses/LICENSE-2.0)
Dependency Dependency
@ -16,7 +17,7 @@ Include this dependency in your POM. Be sure to check for the latest version in
<dependency> <dependency>
<groupId>org.ahocorasick</groupId> <groupId>org.ahocorasick</groupId>
<artifactId>ahocorasick</artifactId> <artifactId>ahocorasick</artifactId>
<version>0.6.3</version> <version>0.4.0</version>
</dependency> </dependency>
``` ```
@ -115,7 +116,7 @@ Trie trie = Trie.builder()
Collection<Emit> emits = trie.parseText("sugarcane sugar canesugar"); Collection<Emit> emits = trie.parseText("sugarcane sugar canesugar");
``` ```
Only one match is found; whereas, without calling `onlyWholeWords()` three Only one match is found; whereas, without calling `onlyWholeWords()` four
matches are found. The sugarcane/canesugar words are discarded because matches are found. The sugarcane/canesugar words are discarded because
they are partial matches. they are partial matches.
@ -219,7 +220,7 @@ class Word {
} }
PayloadTrie<Word> trie = PayloadTrie.<Word>builder() PayloadTrie<Word> trie = PayloadTrie.<Word>builder()
.addKeyword("hers", new Word("f")) .addKeyword("hers", new Word("f")
.addKeyword("his", new Word("m")) .addKeyword("his", new Word("m"))
.addKeyword("she", new Word("f")) .addKeyword("she", new Word("f"))
.addKeyword("he", new Word("m")) .addKeyword("he", new Word("m"))

View File

@ -1,72 +0,0 @@
plugins {
`java-library`
`maven-publish`
pmd
checkstyle
id("io.freefair.lombok") version "8.4"
}
repositories {
mavenLocal()
maven {
url = uri("https://nexus.knecon.com/repository/gindev/")
credentials {
username = providers.gradleProperty("mavenUser").getOrNull();
password = providers.gradleProperty("mavenPassword").getOrNull();
}
}
maven {
url = uri("https://repo.maven.apache.org/maven2/")
}
}
dependencies {
testImplementation("junit:junit:4.13.2")
}
group = "org.ahocorasick"
description = "Aho-CoraSick algorithm for efficient string matching"
java.sourceCompatibility = JavaVersion.VERSION_17
java.targetCompatibility = JavaVersion.VERSION_17
java {
withSourcesJar()
withJavadocJar()
}
publishing {
publications.create<MavenPublication>("maven") {
from(components["java"])
}
repositories {
maven {
url = uri("https://nexus.knecon.com/repository/red-platform-releases/")
credentials {
username = providers.gradleProperty("mavenUser").getOrNull();
password = providers.gradleProperty("mavenPassword").getOrNull();
}
}
}
}
tasks.withType<JavaCompile>() {
options.encoding = "UTF-8"
}
tasks.withType<Javadoc>() {
options.encoding = "UTF-8"
}
pmd {
isConsoleOutput = true
}
tasks.pmdMain {
pmd.ruleSetFiles = files("${rootDir}/config/pmd/pmd.xml")
}
tasks.pmdTest {
pmd.ruleSetFiles = files("${rootDir}/config/pmd/test_pmd.xml")
}

View File

@ -1,38 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE module PUBLIC "-//Puppy Crawl//DTD Check Configuration 1.3//EN"
"http://www.puppycrawl.com/dtds/configuration_1_3.dtd">
<module name="Checker">
<property
name="severity"
value="error"/>
<module name="TreeWalker">
<module name="SuppressWarningsHolder"/>
<module name="MissingDeprecated"/>
<module name="MissingOverride"/>
<module name="AnnotationLocation"/>
<module name="NonEmptyAtclauseDescription"/>
<module name="IllegalImport"/>
<module name="RedundantImport"/>
<module name="RedundantModifier"/>
<module name="EmptyBlock"/>
<module name="DefaultComesLast"/>
<module name="EmptyStatement"/>
<module name="EqualsHashCode"/>
<module name="ExplicitInitialization"/>
<module name="IllegalInstantiation"/>
<module name="ModifiedControlVariable"/>
<module name="MultipleVariableDeclarations"/>
<module name="PackageDeclaration"/>
<module name="ParameterAssignment"/>
<module name="SimplifyBooleanExpression"/>
<module name="SimplifyBooleanReturn"/>
<module name="StringLiteralEquality"/>
<module name="OneStatementPerLine"/>
<module name="FinalClass"/>
<module name="ArrayTypeStyle"/>
<module name="UpperEll"/>
<module name="OuterTypeFilename"/>
</module>
<module name="FileTabCharacter"/>
<module name="SuppressWarningsFilter"/>
</module>

View File

@ -1,21 +0,0 @@
<?xml version="1.0"?>
<ruleset name="Custom ruleset"
xmlns="http://pmd.sourceforge.net/ruleset/2.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://pmd.sourceforge.net/ruleset/2.0.0 http://pmd.sourceforge.net/ruleset_2_0_0.xsd">
<description>
Knecon ruleset checks the code for bad stuff
</description>
<rule ref="category/java/errorprone.xml">
<exclude name="MissingSerialVersionUID"/>
<exclude name="AvoidLiteralsInIfCondition"/>
<exclude name="DataflowAnomalyAnalysis"/>
<exclude name="AvoidDuplicateLiterals"/>
<exclude name="NullAssignment"/>
<exclude name="AssignmentInOperand"/>
<exclude name="BeanMembersShouldSerialize"/>
</rule>
</ruleset>

View File

@ -1,11 +0,0 @@
<?xml version="1.0"?>
<ruleset name="Custom ruleset"
xmlns="http://pmd.sourceforge.net/ruleset/2.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://pmd.sourceforge.net/ruleset/2.0.0 http://pmd.sourceforge.net/ruleset_2_0_0.xsd">
<description>
Knecon test ruleset checks the code for bad stuff
</description>
</ruleset>

View File

@ -1 +0,0 @@
version = 0.7-SNAPSHOT

179
pom.xml Normal file
View File

@ -0,0 +1,179 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.ahocorasick</groupId>
<artifactId>ahocorasick</artifactId>
<version>0.6.0</version>
<packaging>jar</packaging>
<name>Aho-CoraSick algorithm for efficient string matching</name>
<description>Java library for efficient string matching against a large set of keywords</description>
<inceptionYear>2014</inceptionYear>
<url>https://github.com/robert-bor/aho-corasick</url>
<distributionManagement>
<snapshotRepository>
<id>ossrh</id>
<url>https://oss.sonatype.org/content/repositories/snapshots</url>
</snapshotRepository>
<repository>
<id>ossrh</id>
<url>https://oss.sonatype.org/service/local/staging/deploy/maven2/</url>
</repository>
</distributionManagement>
<organization>
<name>42 BV</name>
<url>http://blog.42.nl/</url>
</organization>
<licenses>
<license>
<name>The Apache Software License, Version 2.0</name>
<url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
<distribution>repo</distribution>
</license>
</licenses>
<scm>
<url>scm:git://github.com/robert-bor/aho-corasick</url>
<connection>scm:git://github.com/robert-bor/aho-corasick</connection>
</scm>
<developers>
<developer>
<name>Robert Bor</name>
<organization>42</organization>
</developer>
<developer>
<name>Daniel Beck</name>
<organization>neoSearch UG (haftungsbeschränkt)</organization>
</developer>
<developer>
<name>Dave Jarvis</name>
<organization>White Magic Software, Ltd.</organization>
</developer>
</developers>
<properties>
<java.version>1.8</java.version>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<junit.version>4.10</junit.version>
<!-- Reporting -->
<maven.cobertura.version>2.5.2</maven.cobertura.version>
<maven.javadoc.version>2.8</maven.javadoc.version>
<maven.project.version>2.4</maven.project.version>
<maven.site.plugin.version>3.3</maven.site.plugin.version>
</properties>
<dependencies>
<!-- Used for unit testing -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit-dep</artifactId>
<version>${junit.version}</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<defaultGoal>install</defaultGoal>
<plugins>
<plugin>
<groupId>org.sonatype.plugins</groupId>
<artifactId>nexus-staging-maven-plugin</artifactId>
<version>1.6.7</version>
<extensions>true</extensions>
<configuration>
<serverId>ossrh</serverId>
<nexusUrl>https://oss.sonatype.org/</nexusUrl>
<autoReleaseAfterClose>false</autoReleaseAfterClose>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.6.0</version>
<configuration>
<source>${java.version}</source>
<target>${java.version}</target>
<encoding>${project.build.sourceEncoding}</encoding>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.9.1</version>
<executions>
<execution>
<id>attach-javadocs</id>
<goals>
<goal>jar</goal>
</goals>
<configuration>
<source>8</source>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<version>2.2.1</version>
<executions>
<execution>
<id>attach-sources</id>
<goals>
<goal>jar-no-fork</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-gpg-plugin</artifactId>
<version>1.5</version>
<executions>
<execution>
<id>sign-artifacts</id>
<phase>verify</phase>
<goals>
<goal>sign</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.jacoco</groupId>
<artifactId>jacoco-maven-plugin</artifactId>
<version>0.8.5</version>
<executions>
<execution>
<goals>
<goal>prepare-agent</goal>
</goals>
</execution>
<execution>
<id>report</id>
<phase>test</phase>
<goals>
<goal>report</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

View File

@ -1 +0,0 @@
rootProject.name = "ahocorasick"

View File

@ -1,18 +1,10 @@
package org.ahocorasick.interval; package org.ahocorasick.interval;
import org.ahocorasick.trie.Emit;
import org.ahocorasick.trie.PayloadEmit;
/**
* Responsible for tracking the start and end bounds, which are reused by
* both {@link Emit} and {@link PayloadEmit}.
*/
public class Interval implements Intervalable { public class Interval implements Intervalable {
private final int start; private final int start;
private final int end; private final int end;
/** /**
* Constructs an interval with a start and end position. * Constructs an interval with a start and end position.
* *
@ -20,12 +12,10 @@ public class Interval implements Intervalable {
* @param end The interval's ending text position. * @param end The interval's ending text position.
*/ */
public Interval(final int start, final int end) { public Interval(final int start, final int end) {
this.start = start; this.start = start;
this.end = end; this.end = end;
} }
/** /**
* Returns the starting offset into the text for this interval. * Returns the starting offset into the text for this interval.
* *
@ -33,11 +23,9 @@ public class Interval implements Intervalable {
*/ */
@Override @Override
public int getStart() { public int getStart() {
return this.start; return this.start;
} }
/** /**
* Returns the ending offset into the text for this interval. * Returns the ending offset into the text for this interval.
* *
@ -45,11 +33,9 @@ public class Interval implements Intervalable {
*/ */
@Override @Override
public int getEnd() { public int getEnd() {
return this.end; return this.end;
} }
/** /**
* Returns the length of the interval. * Returns the length of the interval.
* *
@ -57,11 +43,9 @@ public class Interval implements Intervalable {
*/ */
@Override @Override
public int size() { public int size() {
return end - start + 1; return end - start + 1;
} }
/** /**
* Answers whether the given interval overlaps this interval * Answers whether the given interval overlaps this interval
* instance. * instance.
@ -70,38 +54,31 @@ public class Interval implements Intervalable {
* @return true The intervals overlap. * @return true The intervals overlap.
*/ */
public boolean overlapsWith(final Interval other) { public boolean overlapsWith(final Interval other) {
return this.start <= other.getEnd() &&
return this.start <= other.getEnd() && this.end >= other.getStart(); this.end >= other.getStart();
} }
public boolean overlapsWith(int point) { public boolean overlapsWith(int point) {
return this.start <= point && point <= this.end; return this.start <= point && point <= this.end;
} }
@Override @Override
public boolean equals(Object o) { public boolean equals(Object o) {
if (!(o instanceof Intervalable)) { if (!(o instanceof Intervalable)) {
return false; return false;
} }
Intervalable other = (Intervalable) o; Intervalable other = (Intervalable) o;
return this.start == other.getStart() && this.end == other.getEnd(); return this.start == other.getStart() &&
this.end == other.getEnd();
} }
@Override @Override
public int hashCode() { public int hashCode() {
return this.start % 100 + this.end % 100; return this.start % 100 + this.end % 100;
} }
@Override @Override
public int compareTo(Object o) { public int compareTo(Object o) {
if (!(o instanceof Intervalable)) { if (!(o instanceof Intervalable)) {
return -1; return -1;
} }
@ -110,7 +87,6 @@ public class Interval implements Intervalable {
return comparison != 0 ? comparison : this.end - other.getEnd(); return comparison != 0 ? comparison : this.end - other.getEnd();
} }
/** /**
* Returns the starting offset and ending offset separated * Returns the starting offset and ending offset separated
* by a full colon (:). * by a full colon (:).
@ -119,8 +95,6 @@ public class Interval implements Intervalable {
*/ */
@Override @Override
public String toString() { public String toString() {
return this.start + ":" + this.end; return this.start + ":" + this.end;
} }
} }

View File

@ -6,19 +6,14 @@ import java.util.List;
public class IntervalNode { public class IntervalNode {
private enum Direction { private enum Direction {LEFT, RIGHT}
LEFT,
RIGHT
}
private IntervalNode left; private IntervalNode left;
private IntervalNode right; private IntervalNode right;
private int point; private int point;
private List<Intervalable> intervals = new ArrayList<>(); private List<Intervalable> intervals = new ArrayList<>();
public IntervalNode(final List<Intervalable> intervals) { public IntervalNode(final List<Intervalable> intervals) {
this.point = determineMedian(intervals); this.point = determineMedian(intervals);
final List<Intervalable> toLeft = new ArrayList<>(); final List<Intervalable> toLeft = new ArrayList<>();
@ -42,9 +37,7 @@ public class IntervalNode {
} }
} }
public int determineMedian(final List<Intervalable> intervals) {
private int determineMedian(final List<Intervalable> intervals) {
int start = -1; int start = -1;
int end = -1; int end = -1;
for (Intervalable interval : intervals) { for (Intervalable interval : intervals) {
@ -60,9 +53,7 @@ public class IntervalNode {
return (start + end) / 2; return (start + end) / 2;
} }
public List<Intervalable> findOverlaps(final Intervalable interval) { public List<Intervalable> findOverlaps(final Intervalable interval) {
final List<Intervalable> overlaps = new ArrayList<>(); final List<Intervalable> overlaps = new ArrayList<>();
if (this.point < interval.getStart()) { if (this.point < interval.getStart()) {
@ -83,9 +74,10 @@ public class IntervalNode {
return overlaps; return overlaps;
} }
protected void addToOverlaps(
protected void addToOverlaps(final Intervalable interval, final List<Intervalable> overlaps, final List<Intervalable> newOverlaps) { final Intervalable interval,
final List<Intervalable> overlaps,
final List<Intervalable> newOverlaps) {
for (final Intervalable currentInterval : newOverlaps) { for (final Intervalable currentInterval : newOverlaps) {
if (!currentInterval.equals(interval)) { if (!currentInterval.equals(interval)) {
overlaps.add(currentInterval); overlaps.add(currentInterval);
@ -93,21 +85,16 @@ public class IntervalNode {
} }
} }
protected List<Intervalable> checkForOverlapsToTheLeft(final Intervalable interval) { protected List<Intervalable> checkForOverlapsToTheLeft(final Intervalable interval) {
return checkForOverlaps(interval, Direction.LEFT); return checkForOverlaps(interval, Direction.LEFT);
} }
protected List<Intervalable> checkForOverlapsToTheRight(final Intervalable interval) { protected List<Intervalable> checkForOverlapsToTheRight(final Intervalable interval) {
return checkForOverlaps(interval, Direction.RIGHT); return checkForOverlaps(interval, Direction.RIGHT);
} }
protected List<Intervalable> checkForOverlaps(
protected List<Intervalable> checkForOverlaps(final Intervalable interval, final Direction direction) { final Intervalable interval, final Direction direction) {
final List<Intervalable> overlaps = new ArrayList<>(); final List<Intervalable> overlaps = new ArrayList<>();
for (final Intervalable currentInterval : this.intervals) { for (final Intervalable currentInterval : this.intervals) {
@ -128,10 +115,9 @@ public class IntervalNode {
return overlaps; return overlaps;
} }
protected List<Intervalable> findOverlappingRanges(IntervalNode node, Intervalable interval) { protected List<Intervalable> findOverlappingRanges(IntervalNode node, Intervalable interval) {
return node == null
return node == null ? Collections.<Intervalable>emptyList() : node.findOverlaps(interval); ? Collections.<Intervalable>emptyList()
: node.findOverlaps(interval);
} }
} }

View File

@ -10,13 +10,10 @@ public class IntervalTree {
private final IntervalNode rootNode; private final IntervalNode rootNode;
public IntervalTree(List<Intervalable> intervals) { public IntervalTree(List<Intervalable> intervals) {
this.rootNode = new IntervalNode(intervals); this.rootNode = new IntervalNode(intervals);
} }
public List<Intervalable> removeOverlaps(final List<Intervalable> intervals) { public List<Intervalable> removeOverlaps(final List<Intervalable> intervals) {
// Sort the intervals on size, then left-most position // Sort the intervals on size, then left-most position
@ -45,9 +42,7 @@ public class IntervalTree {
return intervals; return intervals;
} }
public List<Intervalable> findOverlaps(final Intervalable interval) { public List<Intervalable> findOverlaps(final Intervalable interval) {
return rootNode.findOverlaps(interval); return rootNode.findOverlaps(interval);
} }

View File

@ -2,12 +2,10 @@ package org.ahocorasick.interval;
public interface Intervalable extends Comparable { public interface Intervalable extends Comparable {
int getStart(); int getStart();
int getEnd();
int getEnd(); int size();
int size();
} }

View File

@ -6,7 +6,6 @@ public class IntervalableComparatorByPosition implements Comparator<Intervalable
@Override @Override
public int compare(final Intervalable intervalable, final Intervalable intervalable2) { public int compare(final Intervalable intervalable, final Intervalable intervalable2) {
return intervalable.getStart() - intervalable2.getStart(); return intervalable.getStart() - intervalable2.getStart();
} }

View File

@ -6,7 +6,6 @@ public class IntervalableComparatorBySize implements Comparator<Intervalable> {
@Override @Override
public int compare(final Intervalable intervalable, final Intervalable intervalable2) { public int compare(final Intervalable intervalable, final Intervalable intervalable2) {
int comparison = intervalable2.size() - intervalable.size(); int comparison = intervalable2.size() - intervalable.size();
if (comparison == 0) { if (comparison == 0) {

View File

@ -4,22 +4,16 @@ public class DefaultToken extends Token {
private PayloadToken<String> payloadToken; private PayloadToken<String> payloadToken;
public DefaultToken(PayloadToken<String> payloadToken) { public DefaultToken(PayloadToken<String> payloadToken) {
super(payloadToken.getFragment()); super(payloadToken.getFragment());
this.payloadToken = payloadToken; this.payloadToken = payloadToken;
} }
public boolean isMatch() { public boolean isMatch() {
return payloadToken.isMatch(); return payloadToken.isMatch();
} }
public Emit getEmit() { public Emit getEmit() {
PayloadEmit<String> emit = payloadToken.getEmit(); PayloadEmit<String> emit = payloadToken.getEmit();
return new Emit(emit.getStart(), emit.getEnd(), emit.getKeyword()); return new Emit(emit.getStart(), emit.getEnd(), emit.getKeyword());
} }

View File

@ -3,30 +3,20 @@ package org.ahocorasick.trie;
import org.ahocorasick.interval.Interval; import org.ahocorasick.interval.Interval;
import org.ahocorasick.interval.Intervalable; import org.ahocorasick.interval.Intervalable;
/**
* Responsible for tracking the bounds of matched terms.
*/
public class Emit extends Interval implements Intervalable { public class Emit extends Interval implements Intervalable {
private final String keyword; private final String keyword;
public Emit(final int start, final int end, String keyword) {
public Emit(final int start, final int end, final String keyword) {
super(start, end); super(start, end);
this.keyword = keyword; this.keyword = keyword;
} }
public String getKeyword() { public String getKeyword() {
return this.keyword; return this.keyword;
} }
@Override @Override
public String toString() { public String toString() {
return super.toString() + "=" + this.keyword; return super.toString() + "=" + this.keyword;
} }

View File

@ -3,21 +3,16 @@ package org.ahocorasick.trie;
public class FragmentToken extends Token { public class FragmentToken extends Token {
public FragmentToken(String fragment) { public FragmentToken(String fragment) {
super(fragment); super(fragment);
} }
@Override @Override
public boolean isMatch() { public boolean isMatch() {
return false; return false;
} }
@Override @Override
public Emit getEmit() { public Emit getEmit() {
return null; return null;
} }

View File

@ -4,26 +4,19 @@ public class MatchToken extends Token {
private final Emit emit; private final Emit emit;
public MatchToken(final String fragment, final Emit emit) { public MatchToken(final String fragment, final Emit emit) {
super(fragment); super(fragment);
this.emit = emit; this.emit = emit;
} }
@Override @Override
public boolean isMatch() { public boolean isMatch() {
return true; return true;
} }
@Override @Override
public Emit getEmit() { public Emit getEmit() {
return this.emit; return this.emit;
} }
} }

View File

@ -1,21 +1,32 @@
package org.ahocorasick.trie; package org.ahocorasick.trie;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
/** /**
* Contains the matched keyword and some payload data. * Contains the matched keyword and some payload data.
* *
* @param <T> The type of the wrapped payload data.
* @author Daniel Beck * @author Daniel Beck
* @param <T> The type of the wrapped payload data.
*/ */
@Getter public class Payload<T> implements Comparable<Payload<T>> {
@EqualsAndHashCode
@RequiredArgsConstructor
public class Payload<T> {
private final String keyword; private final String keyword;
private final T data; private final T data;
public Payload(final String keyword, final T data) {
super();
this.keyword = keyword;
this.data = data;
}
public String getKeyword() {
return keyword;
}
public T getData() {
return data;
}
@Override
public int compareTo(Payload<T> other) {
return keyword.compareTo(other.getKeyword());
}
} }

View File

@ -5,7 +5,7 @@ import org.ahocorasick.interval.Intervalable;
/** /**
* Contains a matched term and its associated payload data. * Contains a matched term and its associated payload data.
* *
* @param <T> Type of the wrapped payload-data. * @param <T> Type of the wrapped payload-data.
* @author Daniel Beck * @author Daniel Beck
*/ */
@ -15,44 +15,35 @@ public class PayloadEmit<T> extends Interval implements Intervalable {
private final T payload; private final T payload;
/** /**
* Created a PayloadEmit * Created a PayloadEmit
* *
* @param start Start of the matched search term. * @param start Start of the matched search term.
* @param end End of the matched search term. * @param end End of the matched search term.
* @param keyword Keyword that matched. * @param keyword Keyword that matched.
* @param payload Emitted payload data. * @param payload Emitted payload data.
*/ */
public PayloadEmit(final int start, final int end, String keyword, T payload) { public PayloadEmit(final int start, final int end, String keyword, T payload) {
super(start, end); super(start, end);
this.keyword = keyword; this.keyword = keyword;
this.payload = payload; this.payload = payload;
} }
public String getKeyword() { public String getKeyword() {
return this.keyword; return this.keyword;
} }
/** /**
* Returns the payload associated to this emit. * Returns the payload associated to this emit.
* *
* @return the associated payload * @return the associated payload
*/ */
public T getPayload() { public T getPayload() {
return this.payload; return this.payload;
} }
@Override @Override
public String toString() { public String toString() {
return super.toString() + "=" + this.keyword + (this.payload != null ? "->" + this.payload : ""); return super.toString() + "=" + this.keyword + (this.payload != null ? "->" + this.payload : "");
} }
} }

View File

@ -6,7 +6,7 @@ package org.ahocorasick.trie;
* This token indicates a matching search term was not found, so * This token indicates a matching search term was not found, so
* {@link #isMatch()} always returns {@code false}. * {@link #isMatch()} always returns {@code false}.
* </p> * </p>
* *
* @author Daniel Beck * @author Daniel Beck
* *
* @param <T> The Type of the emitted payloads. * @param <T> The Type of the emitted payloads.
@ -14,25 +14,19 @@ package org.ahocorasick.trie;
public class PayloadFragmentToken<T> extends PayloadToken<T> { public class PayloadFragmentToken<T> extends PayloadToken<T> {
public PayloadFragmentToken(String fragment) { public PayloadFragmentToken(String fragment) {
super(fragment); super(fragment);
} }
@Override @Override
public boolean isMatch() { public boolean isMatch() {
return false; return false;
} }
/** /**
* Returns null. * Returns null.
*/ */
@Override @Override
public PayloadEmit<T> getEmit() { public PayloadEmit<T> getEmit() {
return null; return null;
} }
} }

View File

@ -6,33 +6,27 @@ package org.ahocorasick.trie;
* This token indicates a matching search term was found, so {@link #isMatch()} * This token indicates a matching search term was found, so {@link #isMatch()}
* always returns {@code true}. * always returns {@code true}.
* </p> * </p>
*
* @author Daniel Beck
* *
* @param <T> The Type of the emitted payloads. * @param <T> The Type of the emitted payloads.
* @author Daniel Beck
*/ */
public class PayloadMatchToken<T> extends PayloadToken<T> { public class PayloadMatchToken<T> extends PayloadToken<T> {
private final PayloadEmit<T> emit; private final PayloadEmit<T> emit;
public PayloadMatchToken(final String fragment, final PayloadEmit<T> emit) { public PayloadMatchToken(final String fragment, final PayloadEmit<T> emit) {
super(fragment); super(fragment);
this.emit = emit; this.emit = emit;
} }
@Override @Override
public boolean isMatch() { public boolean isMatch() {
return true; return true;
} }
@Override @Override
public PayloadEmit<T> getEmit() { public PayloadEmit<T> getEmit() {
return this.emit; return this.emit;
} }
} }

View File

@ -1,10 +1,6 @@
package org.ahocorasick.trie; package org.ahocorasick.trie;
import java.util.*; import java.util.*;
import java.util.stream.Collectors;
import lombok.Getter;
import lombok.Setter;
/** /**
* <p> * <p>
@ -31,14 +27,13 @@ import lombok.Setter;
public class PayloadState<T> { public class PayloadState<T> {
/** /**
* effective the size of the keyword. * effective the size of the keyword
*/ */
@Getter
private final int depth; private final int depth;
/** /**
* only used for the root state to refer to itself in case no matches have been * only used for the root state to refer to itself in case no matches have been
* found. * found
*/ */
private final PayloadState<T> rootState; private final PayloadState<T> rootState;
@ -49,34 +44,26 @@ public class PayloadState<T> {
private final Map<Character, PayloadState<T>> success = new HashMap<>(); private final Map<Character, PayloadState<T>> success = new HashMap<>();
/** /**
* if no matching states are found, the failure state will be returned. * if no matching states are found, the failure state will be returned
*/ */
@Getter
@Setter
private PayloadState<T> failure; private PayloadState<T> failure;
/** /**
* whenever this state is reached, it will emit the matches keywords for future * whenever this state is reached, it will emit the matches keywords for future
* reference. * reference
*/ */
private Set<Payload<T>> emits; private Set<Payload<T>> emits;
public PayloadState() { public PayloadState() {
this(0); this(0);
} }
public PayloadState(final int depth) { public PayloadState(final int depth) {
this.depth = depth; this.depth = depth;
this.rootState = depth == 0 ? this : null; this.rootState = depth == 0 ? this : null;
} }
private PayloadState<T> nextState(final Character character, final boolean ignoreRootState) { private PayloadState<T> nextState(final Character character, final boolean ignoreRootState) {
PayloadState<T> nextState = this.success.get(character); PayloadState<T> nextState = this.success.get(character);
if (!ignoreRootState && nextState == null && this.rootState != null) { if (!ignoreRootState && nextState == null && this.rootState != null) {
@ -86,79 +73,82 @@ public class PayloadState<T> {
return nextState; return nextState;
} }
public PayloadState<T> nextState(final Character character) { public PayloadState<T> nextState(final Character character) {
return nextState(character, false); return nextState(character, false);
} }
public PayloadState<T> nextStateIgnoreRootState(Character character) { public PayloadState<T> nextStateIgnoreRootState(Character character) {
return nextState(character, true); return nextState(character, true);
} }
public PayloadState<T> addState(String keyword) {
PayloadState<T> state = this;
for (final Character character : keyword.toCharArray()) {
state = state.addState(character);
}
return state;
}
public PayloadState<T> addState(Character character) { public PayloadState<T> addState(Character character) {
PayloadState<T> nextState = nextStateIgnoreRootState(character); PayloadState<T> nextState = nextStateIgnoreRootState(character);
if (nextState == null) { if (nextState == null) {
nextState = new PayloadState<>(this.depth + 1); nextState = new PayloadState<T>(this.depth + 1);
this.success.put(character, nextState); this.success.put(character, nextState);
} }
return nextState; return nextState;
} }
public int getDepth() {
return this.depth;
}
/** /**
* Adds a payload to be emitted for this state. * Adds a payload to be emitted for this state.
* *
* @param payload to be emitted. * @param payload to be emitted.
*/ */
public void addEmit(Payload<T> payload) { public void addEmit(Payload<T> payload) {
if (this.emits == null) { if (this.emits == null) {
this.emits = new HashSet<>(); this.emits = new TreeSet<>();
} }
this.emits.add(payload); this.emits.add(payload);
} }
/** /**
* Adds a collection of payloads to be emitted for this state. * Adds a collection of payloads to be emitted for this state.
* *
* @param emits Collection of payloads to be emitted. * @param emits Collection of payloads to be emitted.
*/ */
public void addEmit(Collection<Payload<T>> emits) { public void addEmit(Collection<Payload<T>> emits) {
for (Payload<T> emit : emits) { for (Payload<T> emit : emits) {
addEmit(emit); addEmit(emit);
} }
} }
/** /**
* Returns a collection of emitted payloads for this state. * Returns a collection of emitted payloads for this state.
* *
* @return Collection of emitted payloads. * @return Collection of emitted payloads.
*/ */
public Collection<Payload<T>> emit() { public Collection<Payload<T>> emit() {
return this.emits == null ? Collections.<Payload<T>>emptyList() : this.emits;
return this.emits == null ? Collections.<Payload<T>>emptyList() : this.emits.stream()
.sorted(Comparator.comparing(Payload::getKeyword))
.collect(Collectors.toList());
} }
public PayloadState<T> failure() {
return this.failure;
}
public void setFailure(PayloadState<T> failState) {
this.failure = failState;
}
public Collection<PayloadState<T>> getStates() { public Collection<PayloadState<T>> getStates() {
return this.success.values(); return this.success.values();
} }
public Collection<Character> getTransitions() { public Collection<Character> getTransitions() {
return this.success.keySet(); return this.success.keySet();
} }
} }

View File

@ -9,33 +9,24 @@ package org.ahocorasick.trie;
* @param <T> The Type of the emitted payloads. * @param <T> The Type of the emitted payloads.
*/ */
public abstract class PayloadToken<T> { public abstract class PayloadToken<T> {
private String fragment; private String fragment;
public PayloadToken(String fragment) { public PayloadToken(String fragment) {
this.fragment = fragment; this.fragment = fragment;
} }
public String getFragment() { public String getFragment() {
return this.fragment; return this.fragment;
} }
/** /**
* Return {@code true} if a search term matched. * Return {@code true} if a search term matched.
*
* @return {@code true} if this is a match * @return {@code true} if this is a match
*/ */
public abstract boolean isMatch(); public abstract boolean isMatch();
/** /**
* @return the payload * @return the payload
*/ */
public abstract PayloadEmit<T> getEmit(); public abstract PayloadEmit<T> getEmit();
} }

View File

@ -1,10 +1,8 @@
package org.ahocorasick.trie; package org.ahocorasick.trie;
import static java.lang.Character.isWhitespace; import static java.lang.Character.isWhitespace;
import static java.lang.Character.toLowerCase;
import java.util.Deque; import java.util.ArrayList;
import java.util.LinkedList;
import java.util.Collection; import java.util.Collection;
import java.util.List; import java.util.List;
import java.util.Queue; import java.util.Queue;
@ -15,6 +13,8 @@ import org.ahocorasick.interval.Intervalable;
import org.ahocorasick.trie.handler.DefaultPayloadEmitHandler; import org.ahocorasick.trie.handler.DefaultPayloadEmitHandler;
import org.ahocorasick.trie.handler.PayloadEmitHandler; import org.ahocorasick.trie.handler.PayloadEmitHandler;
import org.ahocorasick.trie.handler.StatefulPayloadEmitHandler; import org.ahocorasick.trie.handler.StatefulPayloadEmitHandler;
import org.ahocorasick.util.ListElementRemoval;
import org.ahocorasick.util.ListElementRemoval.RemoveElementPredicate;
/** /**
* A trie implementation that carries a payload. See {@link Trie} for * A trie implementation that carries a payload. See {@link Trie} for
@ -24,9 +24,9 @@ import org.ahocorasick.trie.handler.StatefulPayloadEmitHandler;
* The payload trie adds the possibility to specify emitted payloads for each * The payload trie adds the possibility to specify emitted payloads for each
* added keyword. * added keyword.
* </p> * </p>
* *
* @param <T> The type of the supplied of the payload.
* @author Daniel Beck * @author Daniel Beck
* @param <T> The type of the supplied of the payload.
*/ */
public class PayloadTrie<T> { public class PayloadTrie<T> {
@ -34,30 +34,29 @@ public class PayloadTrie<T> {
private final PayloadState<T> rootState; private final PayloadState<T> rootState;
protected PayloadTrie(final TrieConfig trieConfig) { protected PayloadTrie(final TrieConfig trieConfig) {
this.trieConfig = trieConfig; this.trieConfig = trieConfig;
this.rootState = new PayloadState<>(); this.rootState = new PayloadState<>();
} }
/** /**
* Used by the builder to add a text search keyword with an emit payload. * Used by the builder to add a text search keyword with a emit payload.
* *
* @param keyword The search term to add to the list of search terms. * @param keyword The search term to add to the list of search terms.
* @param emit the payload to emit for this search term. * @param emit the payload to emit for this search term.
* @throws NullPointerException if the keyword is null. * @throws NullPointerException if the keyword is null.
*/ */
private void addKeyword(String keyword, T emit) { private void addKeyword(String keyword, T emit) {
if (keyword.isEmpty()) { if (keyword.isEmpty()) {
return; return;
} }
addState(keyword).addEmit(new Payload<>(keyword, emit)); if (isCaseInsensitive()) {
} keyword = keyword.toLowerCase();
}
addState(keyword).addEmit(new Payload<T>(keyword, emit));
}
/** /**
* Used by the builder to add a text search keyword. * Used by the builder to add a text search keyword.
@ -66,44 +65,35 @@ public class PayloadTrie<T> {
* @throws NullPointerException if the keyword is null. * @throws NullPointerException if the keyword is null.
*/ */
private void addKeyword(String keyword) { private void addKeyword(String keyword) {
if (keyword.isEmpty()) { if (keyword.isEmpty()) {
return; return;
} }
addState(keyword).addEmit(new Payload<>(keyword, null)); if (isCaseInsensitive()) {
} keyword = keyword.toLowerCase();
}
addState(keyword).addEmit(new Payload<T>(keyword, null));
}
private PayloadState<T> addState(final String keyword) { private PayloadState<T> addState(final String keyword) {
return getRootState().addState(keyword);
PayloadState<T> state = getRootState();
for (final Character character : keyword.toCharArray()) {
if (isIgnoreWhiteSpace() && isWhitespace(character)) {
continue;
}
Character adjustedChar = isCaseInsensitive() ? Character.toLowerCase(character) : character;
state = state.addState(adjustedChar);
}
return state;
} }
/** /**
* Tokenizes the specified text and returns the emitted outputs. * Tokenizes the specified text and returns the emitted outputs.
* *
* @param text The text to tokenize. * @param text The text to tokenize.
* @return the emitted outputs * @return the emitted outputs
*/ */
public Collection<PayloadToken<T>> tokenize(final String text) { public Collection<PayloadToken<T>> tokenize(final String text) {
final Collection<PayloadToken<T>> tokens = new ArrayList<>();
final Collection<PayloadToken<T>> tokens = new LinkedList<>();
final Collection<PayloadEmit<T>> collectedEmits = parseText(text); final Collection<PayloadEmit<T>> collectedEmits = parseText(text);
int lastCollectedPosition = -1; int lastCollectedPosition = -1;
for (final PayloadEmit<T> emit : collectedEmits) { for (final PayloadEmit<T> emit : collectedEmits) {
if (emit.getStart() - lastCollectedPosition > 1) { if (emit.getStart() - lastCollectedPosition > 1) {
tokens.add(createFragment(emit, text, lastCollectedPosition)); tokens.add((PayloadToken<T>) createFragment(emit, text, lastCollectedPosition));
} }
tokens.add(createMatch(emit, text)); tokens.add(createMatch(emit, text));
@ -111,52 +101,53 @@ public class PayloadTrie<T> {
} }
if (text.length() - lastCollectedPosition > 1) { if (text.length() - lastCollectedPosition > 1) {
tokens.add(createFragment(null, text, lastCollectedPosition)); tokens.add((PayloadToken<T>) createFragment(null, text, lastCollectedPosition));
} }
return tokens; return tokens;
} }
private PayloadToken<T> createFragment(final PayloadEmit<T> emit, final String text, final int lastCollectedPosition) { private PayloadToken<T> createFragment(final PayloadEmit<T> emit, final String text, final int lastCollectedPosition) {
return new PayloadFragmentToken<T>(
return new PayloadFragmentToken<>(text.substring(lastCollectedPosition + 1, emit == null ? text.length() : emit.getStart())); text.substring(lastCollectedPosition + 1, emit == null ? text.length() : emit.getStart()));
} }
private PayloadToken<T> createMatch(PayloadEmit<T> emit, String text) { private PayloadToken<T> createMatch(PayloadEmit<T> emit, String text) {
return new PayloadMatchToken<T>(text.substring(emit.getStart(), emit.getEnd() + 1), emit);
return new PayloadMatchToken<>(text.substring(emit.getStart(), emit.getEnd() + 1), emit);
} }
/** /**
* Tokenizes a specified text and returns the emitted outputs. * Tokenizes a specified text and returns the emitted outputs.
* *
* @param text The character sequence to tokenize. * @param text The character sequence to tokenize.
* @return A collection of emits. * @return A collection of emits.
*/ */
public Collection<PayloadEmit<T>> parseText(final CharSequence text) { public Collection<PayloadEmit<T>> parseText(final CharSequence text) {
return parseText(text, new DefaultPayloadEmitHandler<T>());
return parseText(text, new DefaultPayloadEmitHandler<>());
} }
/** /**
* Tokenizes the specified text by using a custom EmitHandler and returns the * Tokenizes the specified text by using a custom EmitHandler and returns the
* emitted outputs. * emitted outputs.
* *
* @param text The character sequence to tokenize. * @param text The character sequence to tokenize.
* @param emitHandler The handler that will be used to parse the text. * @param emitHandler The emit handler that will be used to parse the text.
* @return A collection of emits. * @return A collection of emits.
*/ */
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public Collection<PayloadEmit<T>> parseText(final CharSequence text, final StatefulPayloadEmitHandler<T> emitHandler) { public Collection<PayloadEmit<T>> parseText(final CharSequence text, final StatefulPayloadEmitHandler<T> emitHandler) {
parseText(text, (PayloadEmitHandler<T>) emitHandler); parseText(text, (PayloadEmitHandler<T>) emitHandler);
final List<PayloadEmit<T>> collectedEmits = emitHandler.getEmits(); final List<PayloadEmit<T>> collectedEmits = emitHandler.getEmits();
if (trieConfig.isOnlyWholeWords()) {
removePartialMatches(text, collectedEmits);
}
if (trieConfig.isOnlyWholeWordsWhiteSpaceSeparated()) {
removePartialMatchesWhiteSpaceSeparated(text, collectedEmits);
}
if (!trieConfig.isAllowOverlaps()) { if (!trieConfig.isAllowOverlaps()) {
IntervalTree intervalTree = new IntervalTree((List<Intervalable>) (List<?>) collectedEmits); IntervalTree intervalTree = new IntervalTree((List<Intervalable>) (List<?>) collectedEmits);
intervalTree.removeOverlaps((List<Intervalable>) (List<?>) collectedEmits); intervalTree.removeOverlaps((List<Intervalable>) (List<?>) collectedEmits);
@ -165,61 +156,50 @@ public class PayloadTrie<T> {
return collectedEmits; return collectedEmits;
} }
/** /**
* Returns true if the text contains one of the search terms; otherwise, * Returns true if the text contains contains one of the search terms. Else,
* returns false. * returns false.
* *
* @param text Specified text. * @param text Specified text.
* @return true if the text contains one of the search terms. Else, returns * @return true if the text contains one of the search terms. Else, returns
* false. * false.
*/ */
public boolean containsMatch(final CharSequence text) { public boolean containsMatch(final CharSequence text) {
return firstMatch(text) != null; return firstMatch(text) != null;
} }
/** /**
* Tokenizes the specified text by using a custom EmitHandler and returns the * Tokenizes the specified text by using a custom EmitHandler and returns the
* emitted outputs. * emitted outputs.
* *
* @param text The character sequence to tokenize. * @param text The character sequence to tokenize.
* @param emitHandler The handler that will be used to parse the text. * @param emitHandler The emit handler that will be used to parse the text.
*/ */
public void parseText(final CharSequence text, final PayloadEmitHandler<T> emitHandler) { public void parseText(final CharSequence text, final PayloadEmitHandler<T> emitHandler) {
PayloadState<T> currentState = getRootState(); PayloadState<T> currentState = getRootState();
for (int position = 0; position < text.length(); position++) { for (int position = 0; position < text.length(); position++) {
char character = text.charAt(position); Character character = text.charAt(position);
if (trieConfig.isIgnoreWhiteSpace() && isWhitespace(character)) {
continue;
}
// TODO: Maybe lowercase the entire string at once?
if (trieConfig.isCaseInsensitive()) { if (trieConfig.isCaseInsensitive()) {
character = Character.toLowerCase(character); character = Character.toLowerCase(character);
} }
currentState = getState(currentState, character); currentState = getState(currentState, character);
final Collection<Payload<T>> payloads = currentState.emit(); if (storeEmits(position, currentState, emitHandler) && trieConfig.isStopOnHit()) {
if (processEmits(text, position, payloads, emitHandler) && trieConfig.isStopOnHit()) {
return; return;
} }
} }
} }
/** /**
* The first matching text sequence. * The first matching text sequence.
* *
* @param text The text to search for keywords, must not be {@code null}. * @param text The text to search for keywords.
* @return {@code null} if no matches found. * @return null if no matches found.
*/ */
public PayloadEmit<T> firstMatch(final CharSequence text) { public PayloadEmit<T> firstMatch(final CharSequence text) {
assert text != null;
if (!trieConfig.isAllowOverlaps()) { if (!trieConfig.isAllowOverlaps()) {
// Slow path. Needs to find all the matches to detect overlaps. // Slow path. Needs to find all the matches to detect overlaps.
final Collection<PayloadEmit<T>> parseText = parseText(text); final Collection<PayloadEmit<T>> parseText = parseText(text);
@ -232,11 +212,9 @@ public class PayloadTrie<T> {
PayloadState<T> currentState = getRootState(); PayloadState<T> currentState = getRootState();
for (int position = 0; position < text.length(); position++) { for (int position = 0; position < text.length(); position++) {
char character = text.charAt(position); Character character = text.charAt(position);
if (trieConfig.isIgnoreWhiteSpace() && isWhitespace(character)) { // TODO: Lowercase the entire string at once?
continue;
}
if (trieConfig.isCaseInsensitive()) { if (trieConfig.isCaseInsensitive()) {
character = Character.toLowerCase(character); character = Character.toLowerCase(character);
} }
@ -246,13 +224,8 @@ public class PayloadTrie<T> {
if (payloads != null && !payloads.isEmpty()) { if (payloads != null && !payloads.isEmpty()) {
for (final Payload<T> payload : payloads) { for (final Payload<T> payload : payloads) {
int start; final PayloadEmit<T> emit = new PayloadEmit<>(position - payload.getKeyword().length() + 1, position,
if (isIgnoreWhiteSpace()) { payload.getKeyword(), payload.getData());
start = findStart(text, position, payload);
} else {
start = position - payload.getKeyword().length() + 1;
}
final PayloadEmit<T> emit = new PayloadEmit<>(start, position, payload.getKeyword(), payload.getData());
if (trieConfig.isOnlyWholeWords()) { if (trieConfig.isOnlyWholeWords()) {
if (!isPartialMatch(text, emit)) { if (!isPartialMatch(text, emit)) {
return emit; return emit;
@ -268,38 +241,55 @@ public class PayloadTrie<T> {
return null; return null;
} }
private boolean isPartialMatch(final CharSequence searchText, final PayloadEmit<T> emit) { private boolean isPartialMatch(final CharSequence searchText, final PayloadEmit<T> emit) {
return (emit.getStart() != 0 && Character.isAlphabetic(searchText.charAt(emit.getStart() - 1)))
return (emit.getStart() != 0 && Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) || (emit.getEnd() + 1 != searchText.length() && Character.isAlphabetic( || (emit.getEnd() + 1 != searchText.length() && Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1)));
searchText.charAt(emit.getEnd() + 1)));
} }
private void removePartialMatches(final CharSequence searchText, final List<PayloadEmit<T>> collectedEmits) {
private boolean isPartialMatchWhiteSpaceSeparated(final CharSequence searchText, final PayloadEmit<T> emit) { final RemoveElementPredicate<PayloadEmit<T>> predicate = new RemoveElementPredicate<PayloadEmit<T>>() {
@Override
public boolean remove(PayloadEmit<T> emit) {
return isPartialMatch(searchText, emit);
}
};
ListElementRemoval.removeIf(collectedEmits, predicate);
}
private void removePartialMatchesWhiteSpaceSeparated(final CharSequence searchText,
final List<PayloadEmit<T>> collectedEmits) {
final long size = searchText.length(); final long size = searchText.length();
return (emit.getStart() != 0 && !isWhitespace(searchText.charAt(emit.getStart() - 1))) || (emit.getEnd() + 1 != size && !isWhitespace(searchText.charAt(emit.getEnd() final List<PayloadEmit<T>> removeEmits = new ArrayList<>();
+ 1)));
}
for (final PayloadEmit<T> emit : collectedEmits) {
if ((emit.getStart() == 0 || isWhitespace(searchText.charAt(emit.getStart() - 1)))
&& (emit.getEnd() + 1 == size || isWhitespace(searchText.charAt(emit.getEnd() + 1)))) {
continue;
}
removeEmits.add(emit);
}
for (final PayloadEmit<T> removeEmit : removeEmits) {
collectedEmits.remove(removeEmit);
}
}
private PayloadState<T> getState(PayloadState<T> currentState, final Character character) { private PayloadState<T> getState(PayloadState<T> currentState, final Character character) {
PayloadState<T> newCurrentState = currentState.nextState(character); PayloadState<T> newCurrentState = currentState.nextState(character);
var tempState = currentState;
while (newCurrentState == null) { while (newCurrentState == null) {
tempState = tempState.getFailure(); currentState = currentState.failure();
newCurrentState = tempState.nextState(character); newCurrentState = currentState.nextState(character);
} }
return newCurrentState; return newCurrentState;
} }
private void constructFailureStates() { private void constructFailureStates() {
final Queue<PayloadState<T>> queue = new LinkedBlockingDeque<>(); final Queue<PayloadState<T>> queue = new LinkedBlockingDeque<>();
final PayloadState<T> startState = getRootState(); final PayloadState<T> startState = getRootState();
@ -317,9 +307,9 @@ public class PayloadTrie<T> {
PayloadState<T> targetState = currentState.nextState(transition); PayloadState<T> targetState = currentState.nextState(transition);
queue.add(targetState); queue.add(targetState);
PayloadState<T> traceFailureState = currentState.getFailure(); PayloadState<T> traceFailureState = currentState.failure();
while (traceFailureState.nextState(transition) == null) { while (traceFailureState.nextState(transition) == null) {
traceFailureState = traceFailureState.getFailure(); traceFailureState = traceFailureState.failure();
} }
final PayloadState<T> newFailureState = traceFailureState.nextState(transition); final PayloadState<T> newFailureState = traceFailureState.nextState(transition);
@ -329,22 +319,16 @@ public class PayloadTrie<T> {
} }
} }
private boolean storeEmits(final int position, final PayloadState<T> currentState, final PayloadEmitHandler<T> emitHandler) {
private boolean processEmits(final CharSequence text, final int position, final Collection<Payload<T>> payloads, final PayloadEmitHandler<T> emitHandler) {
boolean emitted = false; boolean emitted = false;
for (final Payload<T> payload : payloads) { final Collection<Payload<T>> payloads = currentState.emit();
int start;
if (isIgnoreWhiteSpace()) { // TODO: The check for empty might be superfluous.
start = findStart(text, position, payload); if (payloads != null && !payloads.isEmpty()) {
} else { for (final Payload<T> payload : payloads) {
start = position - payload.getKeyword().length() + 1; emitted = emitHandler.emit(new PayloadEmit<T>(position - payload.getKeyword().length() + 1, position,
} payload.getKeyword(), payload.getData())) || emitted;
final PayloadEmit<T> payloadEmit = new PayloadEmit<>(start, position, payload.getKeyword(), payload.getData());
if (!(trieConfig.isOnlyWholeWords() && isPartialMatch(text, payloadEmit)) && !(trieConfig.isOnlyWholeWordsWhiteSpaceSeparated() && isPartialMatchWhiteSpaceSeparated(
text,
payloadEmit))) {
emitted = emitHandler.emit(payloadEmit) || emitted;
if (emitted && trieConfig.isStopOnHit()) { if (emitted && trieConfig.isStopOnHit()) {
break; break;
} }
@ -354,77 +338,41 @@ public class PayloadTrie<T> {
return emitted; return emitted;
} }
private int findStart(CharSequence text, int position, Payload<T> payload) {
Deque<Character> stack = new LinkedList<>();
int i;
for (i = 0; i < payload.getKeyword().length(); i++) {
if (isWhitespace(payload.getKeyword().charAt(i))) {
continue;
}
stack.push(isCaseInsensitive() ? toLowerCase(payload.getKeyword().charAt(i)) : payload.getKeyword().charAt(i));
}
for (i = position; !stack.isEmpty() && i >= 0; --i) {
char c = isCaseInsensitive() ? toLowerCase(text.charAt(i)) : text.charAt(i);
if (c == stack.peek()) {
stack.pop();
}
}
return i + 1;
}
private boolean isCaseInsensitive() { private boolean isCaseInsensitive() {
return trieConfig.isCaseInsensitive(); return trieConfig.isCaseInsensitive();
} }
private boolean isIgnoreWhiteSpace() {
return trieConfig.isIgnoreWhiteSpace();
}
private PayloadState<T> getRootState() { private PayloadState<T> getRootState() {
return this.rootState; return this.rootState;
} }
/** /**
* Provides a fluent interface for constructing Trie instances with payloads. * Provides a fluent interface for constructing Trie instances with payloads.
*
* @param <T> The type of the emitted payload. * @param <T> The type of the emitted payload.
*
* @return The builder used to configure its Trie. * @return The builder used to configure its Trie.
*/ */
public static <T> PayloadTrieBuilder<T> builder() { public static <T> PayloadTrieBuilder<T> builder() {
return new PayloadTrieBuilder<T>();
return new PayloadTrieBuilder<>();
} }
/** /**
* Builder class to create a PayloadTrie instance. * Builder class to create a PayloadTrie instance.
* *
* @param <T> The type of the emitted payload. * @param <T> The type of the emitted payload.
*/ */
public static final class PayloadTrieBuilder<T> { public static class PayloadTrieBuilder<T> {
private final TrieConfig trieConfig = new TrieConfig(); private final TrieConfig trieConfig = new TrieConfig();
private final PayloadTrie<T> trie = new PayloadTrie<>(trieConfig); private final PayloadTrie<T> trie = new PayloadTrie<>(trieConfig);
/** /**
* Default (empty) constructor. * Default (empty) constructor.
*/ */
private PayloadTrieBuilder() { private PayloadTrieBuilder() {
} }
/** /**
* Configure the Trie to ignore case when searching for keywords in the text. * Configure the Trie to ignore case when searching for keywords in the text.
* This must be called before calling addKeyword because the algorithm converts * This must be called before calling addKeyword because the algorithm converts
@ -434,42 +382,35 @@ public class PayloadTrie<T> {
* @return This builder. * @return This builder.
*/ */
public PayloadTrieBuilder<T> ignoreCase() { public PayloadTrieBuilder<T> ignoreCase() {
this.trieConfig.setCaseInsensitive(true); this.trieConfig.setCaseInsensitive(true);
return this; return this;
} }
/** /**
* Configure the Trie to ignore overlapping keywords. * Configure the Trie to ignore overlapping keywords.
* *
* @return This builder. * @return This builder.
*/ */
public PayloadTrieBuilder<T> ignoreOverlaps() { public PayloadTrieBuilder<T> ignoreOverlaps() {
this.trieConfig.setAllowOverlaps(false); this.trieConfig.setAllowOverlaps(false);
return this; return this;
} }
/** /**
* Adds a keyword to the {@link Trie}'s list of text search keywords. * Adds a keyword to the Trie's list of text search keywords. No Payload is
* No {@link Payload} is supplied. * supplied.
* *
* @param keyword The keyword to add to the list. * @param keyword The keyword to add to the list.
* @return This builder. * @return This builder.
* @throws NullPointerException if the keyword is null. * @throws NullPointerException if the keyword is null.
*/ */
public PayloadTrieBuilder<T> addKeyword(final String keyword) { public PayloadTrieBuilder<T> addKeyword(final String keyword) {
this.trie.addKeyword(keyword); this.trie.addKeyword(keyword);
return this; return this;
} }
/** /**
* Adds a keyword and a payload to the {@link Trie}'s list of text * Adds a keyword and a payload to the Trie's list of text search keywords.
* search keywords.
* *
* @param keyword The keyword to add to the list. * @param keyword The keyword to add to the list.
* @param payload the payload to add * @param payload the payload to add
@ -477,40 +418,34 @@ public class PayloadTrie<T> {
* @throws NullPointerException if the keyword is null. * @throws NullPointerException if the keyword is null.
*/ */
public PayloadTrieBuilder<T> addKeyword(final String keyword, final T payload) { public PayloadTrieBuilder<T> addKeyword(final String keyword, final T payload) {
this.trie.addKeyword(keyword, payload); this.trie.addKeyword(keyword, payload);
return this; return this;
} }
/** /**
* Adds a list of keywords and payloads to the {@link Trie}'s list of * Adds a list of keywords and payloads to the Trie's list of text search
* text search keywords. * keywords.
* *
* @param keywords The keywords to add to the list. * @param keywords The keywords to add to the list.
* @return This builder. * @return This builder.
*/ */
public PayloadTrieBuilder<T> addKeywords(final Collection<Payload<T>> keywords) { public PayloadTrieBuilder<T> addKeywords(final Collection<Payload<T>> keywords) {
for (Payload<T> payload : keywords) { for (Payload<T> payload : keywords) {
this.trie.addKeyword(payload.getKeyword(), payload.getData()); this.trie.addKeyword(payload.getKeyword(), payload.getData());
} }
return this; return this;
} }
/** /**
* Configure the Trie to match whole keywords in the text. * Configure the Trie to match whole keywords in the text.
* *
* @return This builder. * @return This builder.
*/ */
public PayloadTrieBuilder<T> onlyWholeWords() { public PayloadTrieBuilder<T> onlyWholeWords() {
this.trieConfig.setOnlyWholeWords(true); this.trieConfig.setOnlyWholeWords(true);
return this; return this;
} }
/** /**
* Configure the Trie to match whole keywords that are separated by whitespace * Configure the Trie to match whole keywords that are separated by whitespace
* in the text. For example, "this keyword thatkeyword" would only match the * in the text. For example, "this keyword thatkeyword" would only match the
@ -519,69 +454,44 @@ public class PayloadTrie<T> {
* @return This builder. * @return This builder.
*/ */
public PayloadTrieBuilder<T> onlyWholeWordsWhiteSpaceSeparated() { public PayloadTrieBuilder<T> onlyWholeWordsWhiteSpaceSeparated() {
this.trieConfig.setOnlyWholeWordsWhiteSpaceSeparated(true); this.trieConfig.setOnlyWholeWordsWhiteSpaceSeparated(true);
return this; return this;
} }
/** /**
* Configure the Trie to stop after the first keyword is found in the text. * Configure the Trie to stop after the first keyword is found in the text.
* *
* @return This builder. * @return This builder.
*/ */
public PayloadTrieBuilder<T> stopOnHit() { public PayloadTrieBuilder<T> stopOnHit() {
trie.trieConfig.setStopOnHit(true); trie.trieConfig.setStopOnHit(true);
return this; return this;
} }
/** /**
* Configure the PayloadTrie based on the builder settings. * Configure the PayloadTrie based on the builder settings.
* *
* @return The configured PayloadTrie. * @return The configured PayloadTrie.
*/ */
public PayloadTrie<T> build() { public PayloadTrie<T> build() {
this.trie.constructFailureStates(); this.trie.constructFailureStates();
return this.trie; return this.trie;
} }
/** /**
* @return This builder. * @return This builder.
* @deprecated Use ignoreCase() * @deprecated Use ignoreCase()
*/ */
@Deprecated
public PayloadTrieBuilder<T> caseInsensitive() { public PayloadTrieBuilder<T> caseInsensitive() {
return ignoreCase(); return ignoreCase();
} }
/** /**
* @return This builder. * @return This builder.
* @deprecated Use ignoreOverlaps() * @deprecated Use ignoreOverlaps()
*/ */
@Deprecated
public PayloadTrieBuilder<T> removeOverlaps() { public PayloadTrieBuilder<T> removeOverlaps() {
return ignoreOverlaps(); return ignoreOverlaps();
} }
/**
* Configure the Trie to ignore whitespaces.
*
* @return This builder.
*/
public PayloadTrieBuilder<T> ignoreWhiteSpace() {
trieConfig.setIgnoreWhiteSpace(true);
return this;
}
} }
} }

View File

@ -2,9 +2,6 @@ package org.ahocorasick.trie;
import java.util.*; import java.util.*;
import lombok.Getter;
import lombok.Setter;
/** /**
* <p> * <p>
* A state has various important tasks it must attend to: * A state has various important tasks it must attend to:
@ -29,7 +26,6 @@ public class State {
/** /**
* effective the size of the keyword * effective the size of the keyword
*/ */
@Getter
private final int depth; private final int depth;
/** /**
@ -46,8 +42,6 @@ public class State {
/** /**
* if no matching states are found, the failure state will be returned * if no matching states are found, the failure state will be returned
*/ */
@Setter
@Getter
private State failure; private State failure;
/** /**
@ -55,22 +49,16 @@ public class State {
*/ */
private Set<String> emits; private Set<String> emits;
public State() { public State() {
this(0); this(0);
} }
public State(final int depth) { public State(final int depth) {
this.depth = depth; this.depth = depth;
this.rootState = depth == 0 ? this : null; this.rootState = depth == 0 ? this : null;
} }
private State nextState(final Character character, final boolean ignoreRootState) { private State nextState(final Character character, final boolean ignoreRootState) {
State nextState = this.success.get(character); State nextState = this.success.get(character);
if (!ignoreRootState && nextState == null && this.rootState != null) { if (!ignoreRootState && nextState == null && this.rootState != null) {
@ -80,21 +68,15 @@ public class State {
return nextState; return nextState;
} }
public State nextState(final Character character) { public State nextState(final Character character) {
return nextState(character, false); return nextState(character, false);
} }
public State nextStateIgnoreRootState(Character character) { public State nextStateIgnoreRootState(Character character) {
return nextState(character, true); return nextState(character, true);
} }
public State addState(String keyword) { public State addState(String keyword) {
State state = this; State state = this;
for (final Character character : keyword.toCharArray()) { for (final Character character : keyword.toCharArray()) {
@ -104,9 +86,7 @@ public class State {
return state; return state;
} }
public State addState(Character character) { public State addState(Character character) {
State nextState = nextStateIgnoreRootState(character); State nextState = nextStateIgnoreRootState(character);
if (nextState == null) { if (nextState == null) {
nextState = new State(this.depth + 1); nextState = new State(this.depth + 1);
@ -115,39 +95,40 @@ public class State {
return nextState; return nextState;
} }
public int getDepth() {
return this.depth;
}
public void addEmit(String keyword) { public void addEmit(String keyword) {
if (this.emits == null) { if (this.emits == null) {
this.emits = new TreeSet<>(); this.emits = new TreeSet<>();
} }
this.emits.add(keyword); this.emits.add(keyword);
} }
public void addEmit(Collection<String> emits) { public void addEmit(Collection<String> emits) {
for (String emit : emits) { for (String emit : emits) {
addEmit(emit); addEmit(emit);
} }
} }
public Collection<String> emit() { public Collection<String> emit() {
return this.emits == null ? Collections.<String>emptyList() : this.emits; return this.emits == null ? Collections.<String>emptyList() : this.emits;
} }
public State failure() {
return this.failure;
}
public void setFailure(State failState) {
this.failure = failState;
}
public Collection<State> getStates() { public Collection<State> getStates() {
return this.success.values(); return this.success.values();
} }
public Collection<Character> getTransitions() { public Collection<Character> getTransitions() {
return this.success.keySet(); return this.success.keySet();
} }
} }

View File

@ -1,25 +1,17 @@
package org.ahocorasick.trie; package org.ahocorasick.trie;
public abstract class Token { public abstract class Token {
private String fragment; private String fragment;
public Token(String fragment) { public Token(String fragment) {
this.fragment = fragment; this.fragment = fragment;
} }
public String getFragment() { public String getFragment() {
return this.fragment; return this.fragment;
} }
public abstract boolean isMatch(); public abstract boolean isMatch();
public abstract Emit getEmit(); public abstract Emit getEmit();
} }

View File

@ -15,26 +15,20 @@ import org.ahocorasick.trie.handler.StatefulEmitHandler;
* *
* @author Robert Bor * @author Robert Bor
*/ */
public final class Trie { public class Trie {
private final PayloadTrie<String> payloadTrie; private final PayloadTrie<String> payloadTrie;
private Trie(final PayloadTrie<String> payloadTrie) { private Trie(final PayloadTrie<String> payloadTrie) {
this.payloadTrie = payloadTrie; this.payloadTrie = payloadTrie;
} }
public Collection<Token> tokenize(final String text) { public Collection<Token> tokenize(final String text) {
Collection<PayloadToken<String>> tokens = this.payloadTrie.tokenize(text); Collection<PayloadToken<String>> tokens = this.payloadTrie.tokenize(text);
return asTokens(tokens); return asTokens(tokens);
} }
private static Collection<Token> asTokens(Collection<PayloadToken<String>> tokens) { private static Collection<Token> asTokens(Collection<PayloadToken<String>> tokens) {
Collection<Token> result = new ArrayList<>(); Collection<Token> result = new ArrayList<>();
for (PayloadToken<String> payloadToken : tokens) { for (PayloadToken<String> payloadToken : tokens) {
result.add(new DefaultToken(payloadToken)); result.add(new DefaultToken(payloadToken));
@ -42,9 +36,7 @@ public final class Trie {
return result; return result;
} }
private static Collection<Emit> asEmits(Collection<PayloadEmit<String>> emits) { private static Collection<Emit> asEmits(Collection<PayloadEmit<String>> emits) {
Collection<Emit> result = new ArrayList<>(); Collection<Emit> result = new ArrayList<>();
for (PayloadEmit<String> emit : emits) { for (PayloadEmit<String> emit : emits) {
result.add(asEmit(emit)); result.add(asEmit(emit));
@ -52,79 +44,60 @@ public final class Trie {
return result; return result;
} }
private static Emit asEmit(PayloadEmit<String> payloadEmit) { private static Emit asEmit(PayloadEmit<String> payloadEmit) {
return new Emit(payloadEmit.getStart(), payloadEmit.getEnd(), payloadEmit.getKeyword()); return new Emit(payloadEmit.getStart(), payloadEmit.getEnd(), payloadEmit.getKeyword());
} }
public Collection<Emit> parseText(final CharSequence text) { public Collection<Emit> parseText(final CharSequence text) {
Collection<PayloadEmit<String>> parsedText = this.payloadTrie.parseText(text); Collection<PayloadEmit<String>> parsedText = this.payloadTrie.parseText(text);
return asEmits(parsedText); return asEmits(parsedText);
} }
@SuppressWarnings("UnusedReturnValue") @SuppressWarnings("UnusedReturnValue")
public Collection<Emit> parseText(final CharSequence text, final StatefulEmitHandler emitHandler) { public Collection<Emit> parseText( final CharSequence text, final StatefulEmitHandler emitHandler) {
Collection<PayloadEmit<String>> parsedText = this.payloadTrie.parseText(text,
Collection<PayloadEmit<String>> parsedText = this.payloadTrie.parseText(text, new StatefulPayloadEmitDelegateHandler(emitHandler)); new StatefulPayloadEmitDelegateHandler(emitHandler));
return asEmits(parsedText); return asEmits(parsedText);
} }
public boolean containsMatch(final CharSequence text) { public boolean containsMatch(final CharSequence text) {
return firstMatch(text) != null; return firstMatch(text) != null;
} }
public void parseText(final CharSequence text, final EmitHandler emitHandler) { public void parseText(final CharSequence text, final EmitHandler emitHandler) {
this.payloadTrie.parseText(text, new PayloadEmitDelegateHandler(emitHandler)); this.payloadTrie.parseText(text, new PayloadEmitDelegateHandler(emitHandler));
} }
/** /**
* The first matching text sequence. * The first matching text sequence.
* *
* @param text The text to search for keywords, must not be {@code null}. * @param text The text to search for keywords.
* @return {@code null} if no matches found. * @return null if no matches found.
*/ */
public Emit firstMatch(final CharSequence text) { public Emit firstMatch(final CharSequence text) {
assert text != null;
final PayloadEmit<String> payload = this.payloadTrie.firstMatch(text); final PayloadEmit<String> payload = this.payloadTrie.firstMatch(text);
return payload == null ? null : new Emit(payload.getStart(), payload.getEnd(), payload.getKeyword()); return payload == null ? null : new Emit(payload.getStart(), payload.getEnd(), payload.getKeyword());
} }
/** /**
* Provides a fluent interface for constructing Trie instances. * Provides a fluent interface for constructing Trie instances.
* *
* @return The builder used to configure its Trie. * @return The builder used to configure its Trie.
*/ */
public static TrieBuilder builder() { public static TrieBuilder builder() {
return new TrieBuilder(); return new TrieBuilder();
} }
public static class TrieBuilder {
public static final class TrieBuilder {
private final PayloadTrieBuilder<String> delegate = PayloadTrie.builder(); private final PayloadTrieBuilder<String> delegate = PayloadTrie.builder();
/** /**
* Default (empty) constructor. * Default (empty) constructor.
*/ */
private TrieBuilder() { private TrieBuilder() {
} }
/** /**
* Configure the Trie to ignore case when searching for keywords in the text. * Configure the Trie to ignore case when searching for keywords in the text.
* This must be called before calling addKeyword because the algorithm converts * This must be called before calling addKeyword because the algorithm converts
@ -134,37 +107,21 @@ public final class Trie {
* @return This builder. * @return This builder.
*/ */
public TrieBuilder ignoreCase() { public TrieBuilder ignoreCase() {
delegate.ignoreCase(); delegate.ignoreCase();
// this.trieConfig.setCaseInsensitive(true); // this.trieConfig.setCaseInsensitive(true);
return this; return this;
} }
/** /**
* Configure the Trie to ignore overlapping keywords. * Configure the Trie to ignore overlapping keywords.
* *
* @return This builder. * @return This builder.
*/ */
public TrieBuilder ignoreOverlaps() { public TrieBuilder ignoreOverlaps() {
delegate.ignoreOverlaps(); delegate.ignoreOverlaps();
return this; return this;
} }
/**
* Configure the Trie to ignore whitespaces.
*
* @return This builder.
*/
public TrieBuilder ignoreWhiteSpace() {
delegate.ignoreWhiteSpace();
return this;
}
/** /**
* Adds a keyword to the Trie's list of text search keywords. * Adds a keyword to the Trie's list of text search keywords.
* *
@ -173,12 +130,10 @@ public final class Trie {
* @throws NullPointerException if the keyword is null. * @throws NullPointerException if the keyword is null.
*/ */
public TrieBuilder addKeyword(final String keyword) { public TrieBuilder addKeyword(final String keyword) {
delegate.addKeyword(keyword, null); delegate.addKeyword(keyword, null);
return this; return this;
} }
/** /**
* Adds a list of keywords to the Trie's list of text search keywords. * Adds a list of keywords to the Trie's list of text search keywords.
* *
@ -186,14 +141,12 @@ public final class Trie {
* @return This builder. * @return This builder.
*/ */
public TrieBuilder addKeywords(final String... keywords) { public TrieBuilder addKeywords(final String... keywords) {
for (String keyword : keywords) { for (String keyword : keywords) {
delegate.addKeyword(keyword, null); delegate.addKeyword(keyword, null);
} }
return this; return this;
} }
/** /**
* Adds a list of keywords to the Trie's list of text search keywords. * Adds a list of keywords to the Trie's list of text search keywords.
* *
@ -201,27 +154,23 @@ public final class Trie {
* @return This builder. * @return This builder.
*/ */
@SuppressWarnings("unused") @SuppressWarnings("unused")
public TrieBuilder addKeywords(final Collection<String> keywords) { public TrieBuilder addKeywords( final Collection<String> keywords ) {
for (String keyword : keywords) { for (String keyword : keywords) {
this.delegate.addKeyword(keyword, null); this.delegate.addKeyword(keyword, null);
} }
return this; return this;
} }
/** /**
* Configure the Trie to match whole keywords in the text. * Configure the Trie to match whole keywords in the text.
* *
* @return This builder. * @return This builder.
*/ */
public TrieBuilder onlyWholeWords() { public TrieBuilder onlyWholeWords() {
this.delegate.onlyWholeWords(); this.delegate.onlyWholeWords();
return this; return this;
} }
/** /**
* Configure the Trie to match whole keywords that are separated by whitespace * Configure the Trie to match whole keywords that are separated by whitespace
* in the text. For example, "this keyword thatkeyword" would only match the * in the text. For example, "this keyword thatkeyword" would only match the
@ -230,35 +179,44 @@ public final class Trie {
* @return This builder. * @return This builder.
*/ */
public TrieBuilder onlyWholeWordsWhiteSpaceSeparated() { public TrieBuilder onlyWholeWordsWhiteSpaceSeparated() {
this.delegate.onlyWholeWordsWhiteSpaceSeparated(); this.delegate.onlyWholeWordsWhiteSpaceSeparated();
return this; return this;
} }
/** /**
* Configure the Trie to stop after the first keyword is found in the text. * Configure the Trie to stop after the first keyword is found in the text.
* *
* @return This builder. * @return This builder.
*/ */
public TrieBuilder stopOnHit() { public TrieBuilder stopOnHit() {
this.delegate.stopOnHit(); this.delegate.stopOnHit();
return this; return this;
} }
/** /**
* Configure the Trie based on the builder settings. * Configure the Trie based on the builder settings.
* *
* @return The configured Trie. * @return The configured Trie.
*/ */
public Trie build() { public Trie build() {
PayloadTrie<String> payloadTrie = this.delegate.build(); PayloadTrie<String> payloadTrie = this.delegate.build();
return new Trie(payloadTrie); return new Trie(payloadTrie);
} }
} /**
* @return This builder.
* @deprecated Use ignoreCase()
*/
public TrieBuilder caseInsensitive() {
return ignoreCase();
}
/**
* @return This builder.
* @deprecated Use ignoreOverlaps()
*/
public TrieBuilder removeOverlaps() {
return ignoreOverlaps();
}
}
} }

View File

@ -4,86 +4,51 @@ public class TrieConfig {
private boolean allowOverlaps = true; private boolean allowOverlaps = true;
private boolean onlyWholeWords; private boolean onlyWholeWords = false;
private boolean onlyWholeWordsWhiteSpaceSeparated; private boolean onlyWholeWordsWhiteSpaceSeparated = false;
private boolean caseInsensitive; private boolean caseInsensitive = false;
private boolean ignoreWhiteSpace;
private boolean stopOnHit;
private boolean stopOnHit = false;
public boolean isStopOnHit() { public boolean isStopOnHit() {
return stopOnHit; return stopOnHit;
} }
public void setStopOnHit(boolean stopOnHit) { public void setStopOnHit(boolean stopOnHit) {
this.stopOnHit = stopOnHit; this.stopOnHit = stopOnHit;
} }
public boolean isAllowOverlaps() { public boolean isAllowOverlaps() {
return allowOverlaps; return allowOverlaps;
} }
public void setAllowOverlaps(boolean allowOverlaps) { public void setAllowOverlaps(boolean allowOverlaps) {
this.allowOverlaps = allowOverlaps; this.allowOverlaps = allowOverlaps;
} }
public boolean isOnlyWholeWords() { public boolean isOnlyWholeWords() {
return onlyWholeWords; return onlyWholeWords;
} }
public void setOnlyWholeWords(boolean onlyWholeWords) { public void setOnlyWholeWords(boolean onlyWholeWords) {
this.onlyWholeWords = onlyWholeWords; this.onlyWholeWords = onlyWholeWords;
} }
public boolean isOnlyWholeWordsWhiteSpaceSeparated() { public boolean isOnlyWholeWordsWhiteSpaceSeparated() {
return onlyWholeWordsWhiteSpaceSeparated; return onlyWholeWordsWhiteSpaceSeparated;
} }
public void setOnlyWholeWordsWhiteSpaceSeparated(boolean onlyWholeWordsWhiteSpaceSeparated) { public void setOnlyWholeWordsWhiteSpaceSeparated(boolean onlyWholeWordsWhiteSpaceSeparated) {
this.onlyWholeWordsWhiteSpaceSeparated = onlyWholeWordsWhiteSpaceSeparated; this.onlyWholeWordsWhiteSpaceSeparated = onlyWholeWordsWhiteSpaceSeparated;
} }
public boolean isCaseInsensitive() { public boolean isCaseInsensitive() {
return caseInsensitive; return caseInsensitive;
} }
public boolean isIgnoreWhiteSpace() {
return ignoreWhiteSpace;
}
public void setCaseInsensitive(boolean caseInsensitive) { public void setCaseInsensitive(boolean caseInsensitive) {
this.caseInsensitive = caseInsensitive; this.caseInsensitive = caseInsensitive;
} }
public void setIgnoreWhiteSpace(boolean ignoreWhiteSpace) {
this.ignoreWhiteSpace = ignoreWhiteSpace;
}
} }

View File

@ -9,16 +9,12 @@ public abstract class AbstractStatefulEmitHandler implements StatefulEmitHandler
private final List<Emit> emits = new ArrayList<>(); private final List<Emit> emits = new ArrayList<>();
public void addEmit(final Emit emit) { public void addEmit(final Emit emit) {
this.emits.add(emit); this.emits.add(emit);
} }
@Override @Override
public List<Emit> getEmits() { public List<Emit> getEmits() {
return this.emits; return this.emits;
} }

View File

@ -9,16 +9,12 @@ public abstract class AbstractStatefulPayloadEmitHandler<T> implements StatefulP
private final List<PayloadEmit<T>> emits = new ArrayList<>(); private final List<PayloadEmit<T>> emits = new ArrayList<>();
public void addEmit(final PayloadEmit<T> emit) { public void addEmit(final PayloadEmit<T> emit) {
this.emits.add(emit); this.emits.add(emit);
} }
@Override @Override
public List<PayloadEmit<T>> getEmits() { public List<PayloadEmit<T>> getEmits() {
return this.emits; return this.emits;
} }

View File

@ -9,19 +9,14 @@ public class DefaultEmitHandler implements StatefulEmitHandler {
private final List<Emit> emits = new ArrayList<>(); private final List<Emit> emits = new ArrayList<>();
@Override @Override
public boolean emit(final Emit emit) { public boolean emit(final Emit emit) {
this.emits.add(emit); this.emits.add(emit);
return true; return true;
} }
@Override @Override
public List<Emit> getEmits() { public List<Emit> getEmits() {
return this.emits; return this.emits;
} }
} }

View File

@ -9,19 +9,14 @@ public class DefaultPayloadEmitHandler<T> implements StatefulPayloadEmitHandler<
private final List<PayloadEmit<T>> emits = new ArrayList<>(); private final List<PayloadEmit<T>> emits = new ArrayList<>();
@Override @Override
public boolean emit(final PayloadEmit<T> emit) { public boolean emit(final PayloadEmit<T> emit) {
this.emits.add(emit); this.emits.add(emit);
return true; return true;
} }
@Override @Override
public List<PayloadEmit<T>> getEmits() { public List<PayloadEmit<T>> getEmits() {
return this.emits; return this.emits;
} }
} }

View File

@ -3,7 +3,5 @@ package org.ahocorasick.trie.handler;
import org.ahocorasick.trie.Emit; import org.ahocorasick.trie.Emit;
public interface EmitHandler { public interface EmitHandler {
boolean emit(Emit emit); boolean emit(Emit emit);
} }

View File

@ -11,17 +11,13 @@ public class PayloadEmitDelegateHandler implements PayloadEmitHandler<String> {
private EmitHandler handler; private EmitHandler handler;
public PayloadEmitDelegateHandler(EmitHandler handler) { public PayloadEmitDelegateHandler(EmitHandler handler) {
this.handler = handler; this.handler = handler;
} }
@Override @Override
public boolean emit(PayloadEmit<String> emit) { public boolean emit(PayloadEmit<String> emit) {
Emit newEmit = new Emit(emit.getStart(), emit.getEnd(), emit.getKeyword()); Emit newEmit = new Emit(emit.getStart(), emit.getEnd(), emit.getKeyword());
return handler.emit(newEmit); return handler.emit(newEmit);
} }

View File

@ -3,7 +3,5 @@ package org.ahocorasick.trie.handler;
import org.ahocorasick.trie.PayloadEmit; import org.ahocorasick.trie.PayloadEmit;
public interface PayloadEmitHandler<T> { public interface PayloadEmitHandler<T> {
boolean emit(PayloadEmit<T> emit); boolean emit(PayloadEmit<T> emit);
} }

View File

@ -5,7 +5,5 @@ import java.util.List;
import org.ahocorasick.trie.Emit; import org.ahocorasick.trie.Emit;
public interface StatefulEmitHandler extends EmitHandler { public interface StatefulEmitHandler extends EmitHandler {
List<Emit> getEmits(); List<Emit> getEmits();
} }

View File

@ -15,16 +15,12 @@ public class StatefulPayloadEmitDelegateHandler implements StatefulPayloadEmitHa
private StatefulEmitHandler handler; private StatefulEmitHandler handler;
public StatefulPayloadEmitDelegateHandler(StatefulEmitHandler handler) { public StatefulPayloadEmitDelegateHandler(StatefulEmitHandler handler) {
this.handler = handler; this.handler = handler;
} }
private static List<PayloadEmit<String>> asEmits(Collection<Emit> emits) { private static List<PayloadEmit<String>> asEmits(Collection<Emit> emits) {
List<PayloadEmit<String>> result = new ArrayList<>(); List<PayloadEmit<String>> result = new ArrayList<>();
for (Emit emit : emits) { for (Emit emit : emits) {
result.add(new PayloadEmit<String>(emit.getStart(), emit.getEnd(), emit.getKeyword(), null)); result.add(new PayloadEmit<String>(emit.getStart(), emit.getEnd(), emit.getKeyword(), null));
@ -32,20 +28,15 @@ public class StatefulPayloadEmitDelegateHandler implements StatefulPayloadEmitHa
return result; return result;
} }
@Override @Override
public boolean emit(PayloadEmit<String> emit) { public boolean emit(PayloadEmit<String> emit) {
Emit newEmit = new Emit(emit.getStart(), emit.getEnd(), emit.getKeyword()); Emit newEmit = new Emit(emit.getStart(), emit.getEnd(), emit.getKeyword());
return handler.emit(newEmit); return handler.emit(newEmit);
} }
@Override @Override
public List<PayloadEmit<String>> getEmits() { public List<PayloadEmit<String>> getEmits() {
List<Emit> emits = this.handler.getEmits(); List<Emit> emits = this.handler.getEmits();
return asEmits(emits); return asEmits(emits);
} }
} }

View File

@ -4,8 +4,6 @@ import java.util.List;
import org.ahocorasick.trie.PayloadEmit; import org.ahocorasick.trie.PayloadEmit;
public interface StatefulPayloadEmitHandler<T> extends PayloadEmitHandler<T> { public interface StatefulPayloadEmitHandler<T> extends PayloadEmitHandler<T>{
List<PayloadEmit<T>> getEmits(); List<PayloadEmit<T>> getEmits();
} }

View File

@ -0,0 +1,51 @@
package org.ahocorasick.util;
import java.util.ArrayList;
import java.util.List;
/**
 * Helps remove elements from a list in an efficient way.
 *
 * <p>Removing elements from an {@link ArrayList} in a naive way leads to
 * quadratic running time. If the algorithm first creates a list of all the
 * elements to remove, then for each element in that list (assume n elements)
 * it must look for the element in the original list (up to n comparisons) and,
 * once found, remove it and move the elements to its right one position to
 * the left (up to n moves).</p>
 *
 * <p>This class instead makes a new list and copies over only the elements to
 * keep, then clears the original list and adds all of the kept elements back
 * to it. For an {@link ArrayList} this gives a running time of O(n).</p>
 *
 * <p>The performance of this has not been thoroughly tested for linked lists.</p>
 *
 * <p>This class can be removed entirely once Java 8 is the baseline, because
 * {@code Collection#removeIf} can be used instead and is already optimised
 * for each list implementation.</p>
 */
public class ListElementRemoval {

    /**
     * Decides, element by element, what must be removed from a list.
     *
     * @param <T> type of the elements being tested
     */
    public interface RemoveElementPredicate<T> {

        /**
         * Tests a single element.
         *
         * @param t the element to test
         * @return {@code true} if the element must be removed, {@code false}
         *         if it must be kept
         */
        boolean remove(T t);
    }

    /**
     * Removes all elements from the list matching the given predicate.
     *
     * <p>The relative order of the remaining elements is preserved. The list
     * is only modified when at least one element matched, so a list that
     * rejects modification is tolerated as long as nothing has to be removed.
     * If the predicate throws, the list is left unchanged because the kept
     * elements are gathered before the list is touched.</p>
     *
     * @param list the list from which to remove
     * @param predicate to test for removal; may be declared on any supertype
     *                  of the element type
     * @param <T> type of list
     */
    public static <T> void removeIf(final List<T> list, final RemoveElementPredicate<? super T> predicate) {
        final List<T> kept = new ArrayList<>(list.size());

        for (final T element : list) {
            if (!predicate.remove(element)) {
                kept.add(element);
            }
        }

        // Nothing matched: skip the clear/refill round trip entirely.
        if (kept.size() == list.size()) {
            return;
        }

        list.clear();
        list.addAll(kept);
    }
}

View File

@ -6,79 +6,52 @@ import java.util.Iterator;
import java.util.Set; import java.util.Set;
import java.util.TreeSet; import java.util.TreeSet;
import static org.junit.Assert.*; import static junit.framework.Assert.*;
public class IntervalTest { public class IntervalTest {
@Test @Test
public void test_construct() { public void construct() {
Interval i = new Interval(1, 3);
final Interval i = new Interval(1, 3);
assertEquals(1, i.getStart()); assertEquals(1, i.getStart());
assertEquals(3, i.getEnd()); assertEquals(3, i.getEnd());
} }
@Test @Test
public void test_size() { public void size() {
assertEquals(3, new Interval(0, 2).size()); assertEquals(3, new Interval(0, 2).size());
} }
@Test @Test
public void test_intervaloverlaps() { public void intervaloverlaps() {
assertTrue(new Interval(1, 3).overlapsWith(new Interval(2, 4))); assertTrue(new Interval(1, 3).overlapsWith(new Interval(2, 4)));
} }
@Test @Test
public void test_intervalDoesNotOverlap() { public void intervalDoesNotOverlap() {
assertFalse(new Interval(1, 13).overlapsWith(new Interval(27, 42))); assertFalse(new Interval(1, 13).overlapsWith(new Interval(27, 42)));
} }
@Test @Test
public void test_pointOverlaps() { public void pointOverlaps() {
assertTrue(new Interval(1, 3).overlapsWith(2)); assertTrue(new Interval(1, 3).overlapsWith(2));
} }
@Test @Test
public void test_pointDoesNotOverlap() { public void pointDoesNotOverlap() {
assertFalse(new Interval(1, 13).overlapsWith(42)); assertFalse(new Interval(1, 13).overlapsWith(42));
} }
@Test @Test
public void test_comparable() { public void comparable() {
Set<Interval> intervals = new TreeSet<>();
final Set<Interval> intervals = new TreeSet<>();
intervals.add(new Interval(4, 6)); intervals.add(new Interval(4, 6));
intervals.add(new Interval(2, 7)); intervals.add(new Interval(2, 7));
intervals.add(new Interval(3, 4)); intervals.add(new Interval(3, 4));
final Iterator<Interval> it = intervals.iterator(); Iterator<Interval> it = intervals.iterator();
assertEquals(2, it.next().getStart()); assertEquals(2, it.next().getStart());
assertEquals(3, it.next().getStart()); assertEquals(3, it.next().getStart());
assertEquals(4, it.next().getStart()); assertEquals(4, it.next().getStart());
} }
@Test
public void test_checkToString() {
assertEquals("4:6", new Interval(4, 6).toString());
}
@Test
public void test_compareToNegativeTest() {
assertEquals(-1, new Interval(4, 6).compareTo(new Object()));
}
} }

View File

@ -12,7 +12,6 @@ public class IntervalTreeTest {
@Test @Test
public void findOverlaps() { public void findOverlaps() {
List<Intervalable> intervals = new ArrayList<>(); List<Intervalable> intervals = new ArrayList<>();
intervals.add(new Interval(0, 2)); intervals.add(new Interval(0, 2));
intervals.add(new Interval(1, 3)); intervals.add(new Interval(1, 3));
@ -29,10 +28,8 @@ public class IntervalTreeTest {
assertOverlap(overlapsIt.next(), 0, 2); assertOverlap(overlapsIt.next(), 0, 2);
} }
@Test @Test
public void removeOverlaps() { public void removeOverlaps() {
List<Intervalable> intervals = new ArrayList<>(); List<Intervalable> intervals = new ArrayList<>();
intervals.add(new Interval(0, 2)); intervals.add(new Interval(0, 2));
intervals.add(new Interval(4, 5)); intervals.add(new Interval(4, 5));
@ -46,9 +43,7 @@ public class IntervalTreeTest {
} }
protected void assertOverlap(Intervalable interval, int expectedStart, int expectedEnd) { protected void assertOverlap(Intervalable interval, int expectedStart, int expectedEnd) {
assertEquals(expectedStart, interval.getStart()); assertEquals(expectedStart, interval.getStart());
assertEquals(expectedEnd, interval.getEnd()); assertEquals(expectedEnd, interval.getEnd());
} }

View File

@ -12,7 +12,6 @@ public class IntervalableComparatorByPositionTest {
@Test @Test
public void sortOnPosition() { public void sortOnPosition() {
List<Intervalable> intervals = new ArrayList<Intervalable>(); List<Intervalable> intervals = new ArrayList<Intervalable>();
intervals.add(new Interval(4, 5)); intervals.add(new Interval(4, 5));
intervals.add(new Interval(1, 4)); intervals.add(new Interval(1, 4));

View File

@ -12,7 +12,6 @@ public class IntervalableComparatorBySizeTest {
@Test @Test
public void sortOnSize() { public void sortOnSize() {
List<Intervalable> intervals = new ArrayList<Intervalable>(); List<Intervalable> intervals = new ArrayList<Intervalable>();
intervals.add(new Interval(4, 5)); intervals.add(new Interval(4, 5));
intervals.add(new Interval(1, 4)); intervals.add(new Interval(1, 4));
@ -23,10 +22,8 @@ public class IntervalableComparatorBySizeTest {
assertEquals(2, intervals.get(2).size()); assertEquals(2, intervals.get(2).size());
} }
@Test @Test
public void sortOnSizeThenPosition() { public void sortOnSizeThenPosition() {
List<Intervalable> intervals = new ArrayList<Intervalable>(); List<Intervalable> intervals = new ArrayList<Intervalable>();
intervals.add(new Interval(4, 7)); intervals.add(new Interval(4, 7));
intervals.add(new Interval(2, 5)); intervals.add(new Interval(2, 5));

View File

@ -2,35 +2,23 @@ package org.ahocorasick.trie;
import org.junit.Test; import org.junit.Test;
import static org.junit.Assert.assertEquals; import static junit.framework.Assert.assertEquals;
import static org.junit.Assert.assertNotEquals; import static junit.framework.Assert.assertNotSame;
/**
* Test the {@link Emit} class functionality.
*/
public class EmitTest { public class EmitTest {
/**
* Test that two {@link Emit} instances having the same values are equal.
*/
@Test @Test
public void test_Equality_SameValues_ObjectsAreEqual() { public void equals() {
Emit one = new Emit(13, 42, null);
final Emit one = new Emit(13, 42, null); Emit two = new Emit(13, 42, null);
final Emit two = new Emit(13, 42, null);
assertEquals(one, two); assertEquals(one, two);
} }
/**
* Test that two {@link Emit} instances having different values are not equal.
*/
@Test @Test
public void test_Equality_DifferingValues_ObjectsAreNotEqual() { public void notEquals() {
Emit one = new Emit(13, 42, null);
final Emit one = new Emit(13, 42, null); Emit two = new Emit(13, 43, null);
final Emit two = new Emit(13, 43, null); assertNotSame(one, two);
assertNotEquals(one, two);
} }
} }

View File

@ -1,133 +1,124 @@
package org.ahocorasick.trie; package org.ahocorasick.trie;
import static junit.framework.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.ThreadLocalRandom;
import org.ahocorasick.trie.handler.AbstractStatefulPayloadEmitHandler; import org.ahocorasick.trie.handler.AbstractStatefulPayloadEmitHandler;
import org.ahocorasick.trie.handler.PayloadEmitHandler; import org.ahocorasick.trie.handler.PayloadEmitHandler;
import org.ahocorasick.trie.handler.StatefulPayloadEmitHandler; import org.ahocorasick.trie.handler.StatefulPayloadEmitHandler;
import org.junit.Test; import org.junit.Test;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import static java.util.Arrays.asList;
import static org.ahocorasick.trie.TestHelper.injectKeyword;
import static org.ahocorasick.trie.TestHelper.randomNumbers;
import static org.junit.Assert.*;
public class PayloadTrieTest { public class PayloadTrieTest {
private final static String[] ALPHABET = new String[]{"abc", "bcd", "cde"}; private final static String[] ALPHABET = new String[] { "abc", "bcd", "cde" };
private final static String[] ALPHABET_PAYLOAD = new String[]{"alpha:abc", "alpha:bcd", "alpha:cde"}; private final static String[] ALPHABET_PAYLOAD = new String[] { "alpha:abc", "alpha:bcd", "alpha:cde" };
private final static List<Payload<String>> ALPHABET_WITH_PAYLOADS = asList(new Payload<>(ALPHABET[0], ALPHABET_PAYLOAD[0]), private final static List<Payload<String>> ALPHABET_WITH_PAYLOADS = Arrays.asList(//
new Payload<>(ALPHABET[1], ALPHABET_PAYLOAD[1]), new Payload<String>(ALPHABET[0], ALPHABET_PAYLOAD[0]), //
new Payload<>(ALPHABET[2], ALPHABET_PAYLOAD[2])); new Payload<String>(ALPHABET[1], ALPHABET_PAYLOAD[1]), //
new Payload<String>(ALPHABET[2], ALPHABET_PAYLOAD[2]));
private final static String[] PRONOUNS = new String[]{"hers", "his", "she", "he"}; private final static String[] PRONOUNS = new String[] { "hers", "his", "she", "he" };
private final static int[] PRONOUNS_PAYLOAD_ID = new int[]{9, 12, 4, 20}; private final static int[] PRONOUNS_PAYLOAD_ID = new int[] { 9, 12, 4, 20 };
private final static List<Payload<Integer>> PRONOUNS_WITH_PAYLOADS = asList(new Payload<>(PRONOUNS[0], PRONOUNS_PAYLOAD_ID[0]), private final static List<Payload<Integer>> PRONOUNS_WITH_PAYLOADS = Arrays.asList(//
new Payload<>(PRONOUNS[1], PRONOUNS_PAYLOAD_ID[1]), new Payload<Integer>(PRONOUNS[0], PRONOUNS_PAYLOAD_ID[0]), //
new Payload<>(PRONOUNS[2], PRONOUNS_PAYLOAD_ID[2]), new Payload<Integer>(PRONOUNS[1], PRONOUNS_PAYLOAD_ID[1]), //
new Payload<>(PRONOUNS[3], PRONOUNS_PAYLOAD_ID[3])); new Payload<Integer>(PRONOUNS[2], PRONOUNS_PAYLOAD_ID[2]), //
new Payload<Integer>(PRONOUNS[3], PRONOUNS_PAYLOAD_ID[3]) //
);
private final static String[] FOOD = new String[]{"veal", "cauliflower", "broccoli", "tomatoes"}; private final static String[] FOOD = new String[] { "veal", "cauliflower", "broccoli", "tomatoes" };
private final static Food[] FOOD_PAYLOAD = new Food[]{new Food("veal"), new Food("cauliflower"), new Food("broccoli"), new Food("tomatoes")}; private final static Food[] FOOD_PAYLOAD = new Food[] { new Food("veal"), new Food("cauliflower"), new Food("broccoli"),
new Food("tomatoes") };
private final static List<Payload<Food>> FOOD_WITH_PAYLOADS = asList(new Payload<>(FOOD[0], FOOD_PAYLOAD[0]), private final static List<Payload<Food>> FOOD_WITH_PAYLOADS = Arrays.asList(//
new Payload<>(FOOD[1], FOOD_PAYLOAD[1]), new Payload<Food>(FOOD[0], FOOD_PAYLOAD[0]), //
new Payload<>(FOOD[2], FOOD_PAYLOAD[2]), new Payload<Food>(FOOD[1], FOOD_PAYLOAD[1]), //
new Payload<>(FOOD[3], FOOD_PAYLOAD[3])); new Payload<Food>(FOOD[2], FOOD_PAYLOAD[2]), //
new Payload<Food>(FOOD[3], FOOD_PAYLOAD[3]) //
);
private final static String[] GREEK_LETTERS = new String[]{"Alpha", "Beta", "Gamma"}; private final static String[] GREEK_LETTERS = new String[] { "Alpha", "Beta", "Gamma" };
private final static String[] GREEK_LETTERS_PAYLOAD = new String[]{"greek:Alpha", "greek:Beta", "greek:Gamma"}; private final static String[] GREEK_LETTERS_PAYLOAD = new String[] { "greek:Alpha", "greek:Beta", "greek:Gamma" };
private final static List<Payload<String>> GREEK_LETTERS_WITH_PAYLOADS = asList(new Payload<>(GREEK_LETTERS[0], GREEK_LETTERS_PAYLOAD[0]), private final static List<Payload<String>> GREEK_LETTERS_WITH_PAYLOADS = Arrays.asList(//
new Payload<>(GREEK_LETTERS[1], GREEK_LETTERS_PAYLOAD[1]), new Payload<String>(GREEK_LETTERS[0], GREEK_LETTERS_PAYLOAD[0]), //
new Payload<>(GREEK_LETTERS[2], GREEK_LETTERS_PAYLOAD[2])); new Payload<String>(GREEK_LETTERS[1], GREEK_LETTERS_PAYLOAD[1]), //
new Payload<String>(GREEK_LETTERS[2], GREEK_LETTERS_PAYLOAD[2]));
private final static String[] UNICODE = new String[]{"turning", "once", "again", "börkü"}; private final static String[] UNICODE = new String[] { "turning", "once", "again", "börkü" };
private final static String[] UNICODE_PAYLOAD = new String[]{"uni:turning", "uni:once", "uni:again", "uni:börkü"}; private final static String[] UNICODE_PAYLOAD = new String[] { "uni:turning", "uni:once", "uni:again", "uni:börkü" };
private final static List<Payload<String>> UNICODE_WITH_PAYLOADS = asList(new Payload<>(UNICODE[0], UNICODE_PAYLOAD[0]), private final static List<Payload<String>> UNICODE_WITH_PAYLOADS = Arrays.asList(//
new Payload<>(UNICODE[1], UNICODE_PAYLOAD[1]), new Payload<String>(UNICODE[0], UNICODE_PAYLOAD[0]), //
new Payload<>(UNICODE[2], UNICODE_PAYLOAD[2]), new Payload<String>(UNICODE[1], UNICODE_PAYLOAD[1]), //
new Payload<>(UNICODE[3], UNICODE_PAYLOAD[3])); new Payload<String>(UNICODE[2], UNICODE_PAYLOAD[2]), //
new Payload<String>(UNICODE[3], UNICODE_PAYLOAD[3]));
public static class Food { public static class Food {
private final String name; private final String name;
public Food(String name) { public Food(String name) {
this.name = name; this.name = name;
} }
@Override @Override
public int hashCode() { public int hashCode() {
final int prime = 31; final int prime = 31;
int result = 1; int result = 1;
result = prime * result + ((name == null) ? 0 : name.hashCode()); result = prime * result + ((name == null) ? 0 : name.hashCode());
return result; return result;
} }
@Override @Override
public boolean equals(Object obj) { public boolean equals(Object obj) {
if (this == obj)
if (this == obj) {
return true; return true;
} if (obj == null)
if (obj == null) {
return false; return false;
} if (getClass() != obj.getClass())
if (getClass() != obj.getClass()) {
return false; return false;
}
Food other = (Food) obj; Food other = (Food) obj;
if (name == null) { if (name == null) {
return other.name == null; if (other.name != null)
} else { return false;
return name.equals(other.name); } else if (!name.equals(other.name))
} return false;
return true;
} }
} }
@Test @Test
public void keywordAndTextAreTheSame() { public void keywordAndTextAreTheSame() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword(ALPHABET[0], ALPHABET_PAYLOAD[0]).build(); PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword(ALPHABET[0], ALPHABET_PAYLOAD[0]).build();
Collection<PayloadEmit<String>> emits = trie.parseText(ALPHABET[0]); Collection<PayloadEmit<String>> emits = trie.parseText(ALPHABET[0]);
Iterator<PayloadEmit<String>> iterator = emits.iterator(); Iterator<PayloadEmit<String>> iterator = emits.iterator();
checkEmit(iterator.next(), 0, 2, ALPHABET[0], ALPHABET_PAYLOAD[0]); checkEmit(iterator.next(), 0, 2, ALPHABET[0], ALPHABET_PAYLOAD[0]);
} }
@Test @Test
public void keywordAndTextAreTheSameFirstMatch() { public void keywordAndTextAreTheSameFirstMatch() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword(ALPHABET[0], ALPHABET_PAYLOAD[0]).build(); PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword(ALPHABET[0], ALPHABET_PAYLOAD[0]).build();
PayloadEmit<String> firstMatch = trie.firstMatch(ALPHABET[0]); PayloadEmit<String> firstMatch = trie.firstMatch(ALPHABET[0]);
checkEmit(firstMatch, 0, 2, ALPHABET[0], ALPHABET_PAYLOAD[0]); checkEmit(firstMatch, 0, 2, ALPHABET[0], ALPHABET_PAYLOAD[0]);
} }
@Test @Test
public void textIsLongerThanKeyword() { public void textIsLongerThanKeyword() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword(ALPHABET[0], ALPHABET_PAYLOAD[0]).build(); PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword(ALPHABET[0], ALPHABET_PAYLOAD[0]).build();
Collection<PayloadEmit<String>> emits = trie.parseText(" " + ALPHABET[0]); Collection<PayloadEmit<String>> emits = trie.parseText(" " + ALPHABET[0]);
Iterator<PayloadEmit<String>> iterator = emits.iterator(); Iterator<PayloadEmit<String>> iterator = emits.iterator();
checkEmit(iterator.next(), 1, 3, ALPHABET[0], ALPHABET_PAYLOAD[0]); checkEmit(iterator.next(), 1, 3, ALPHABET[0], ALPHABET_PAYLOAD[0]);
} }
@Test @Test
public void textIsLongerThanKeywordFirstMatch() { public void textIsLongerThanKeywordFirstMatch() {
@ -136,29 +127,23 @@ public class PayloadTrieTest {
checkEmit(firstMatch, 1, 3, ALPHABET[0], ALPHABET_PAYLOAD[0]); checkEmit(firstMatch, 1, 3, ALPHABET[0], ALPHABET_PAYLOAD[0]);
} }
@Test @Test
public void variousKeywordsOneMatch() { public void variousKeywordsOneMatch() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeywords(ALPHABET_WITH_PAYLOADS).build(); PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeywords(ALPHABET_WITH_PAYLOADS).build();
Collection<PayloadEmit<String>> emits = trie.parseText("bcd"); Collection<PayloadEmit<String>> emits = trie.parseText("bcd");
Iterator<PayloadEmit<String>> iterator = emits.iterator(); Iterator<PayloadEmit<String>> iterator = emits.iterator();
checkEmit(iterator.next(), 0, 2, "bcd", "alpha:bcd"); checkEmit(iterator.next(), 0, 2, "bcd", "alpha:bcd");
} }
@Test @Test
public void variousKeywordsFirstMatch() { public void variousKeywordsFirstMatch() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeywords(ALPHABET_WITH_PAYLOADS).build(); PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeywords(ALPHABET_WITH_PAYLOADS).build();
PayloadEmit<String> firstMatch = trie.firstMatch("bcd"); PayloadEmit<String> firstMatch = trie.firstMatch("bcd");
checkEmit(firstMatch, 0, 2, "bcd", "alpha:bcd"); checkEmit(firstMatch, 0, 2, "bcd", "alpha:bcd");
} }
@Test @Test
public void ushersTestAndStopOnHit() { public void ushersTestAndStopOnHit() {
PayloadTrie<Integer> trie = PayloadTrie.<Integer>builder().addKeywords(PRONOUNS_WITH_PAYLOADS).stopOnHit().build(); PayloadTrie<Integer> trie = PayloadTrie.<Integer>builder().addKeywords(PRONOUNS_WITH_PAYLOADS).stopOnHit().build();
Collection<PayloadEmit<Integer>> emits = trie.parseText("ushers"); Collection<PayloadEmit<Integer>> emits = trie.parseText("ushers");
assertEquals(1, emits.size()); // she @ 3, he @ 3, hers @ 5 assertEquals(1, emits.size()); // she @ 3, he @ 3, hers @ 5
@ -166,19 +151,15 @@ public class PayloadTrieTest {
checkEmit(iterator.next(), 2, 3, "he", 20); checkEmit(iterator.next(), 2, 3, "he", 20);
} }
@Test @Test
public void ushersTestStopOnHitSkipOne() { public void ushersTestStopOnHitSkipOne() {
PayloadTrie<Integer> trie = PayloadTrie.<Integer>builder().addKeywords(PRONOUNS_WITH_PAYLOADS).stopOnHit().build(); PayloadTrie<Integer> trie = PayloadTrie.<Integer>builder().addKeywords(PRONOUNS_WITH_PAYLOADS).stopOnHit().build();
StatefulPayloadEmitHandler<Integer> testEmitHandler = new AbstractStatefulPayloadEmitHandler<Integer>() { StatefulPayloadEmitHandler<Integer> testEmitHandler = new AbstractStatefulPayloadEmitHandler<Integer>() {
boolean first = true; boolean first = true;
@Override @Override
public boolean emit(final PayloadEmit<Integer> emit) { public boolean emit(final PayloadEmit<Integer> emit) {
if (first) { if (first) {
// return false for the first element // return false for the first element
first = false; first = false;
@ -197,10 +178,8 @@ public class PayloadTrieTest {
checkEmit(iterator.next(), 1, 3, "she", 4); checkEmit(iterator.next(), 1, 3, "she", 4);
} }
@Test @Test
public void ushersTest() { public void ushersTest() {
PayloadTrie<Integer> trie = PayloadTrie.<Integer>builder().addKeywords(PRONOUNS_WITH_PAYLOADS).build(); PayloadTrie<Integer> trie = PayloadTrie.<Integer>builder().addKeywords(PRONOUNS_WITH_PAYLOADS).build();
Collection<PayloadEmit<Integer>> emits = trie.parseText("ushers"); Collection<PayloadEmit<Integer>> emits = trie.parseText("ushers");
assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5 assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5
@ -211,44 +190,37 @@ public class PayloadTrieTest {
checkEmit(iterator.next(), 2, 5, "hers", 9); checkEmit(iterator.next(), 2, 5, "hers", 9);
} }
@Test @Test
public void ushersTestWithCapitalKeywords() { public void ushersTestWithCapitalKeywords() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreCase().addKeyword("HERS", "hers").addKeyword("HIS", "his")
PayloadTrie<String> trie = PayloadTrie.<String>builder() .addKeyword("SHE", "she").addKeyword("HE", "he").build();
.ignoreCase()
.addKeyword("HERS", "hers")
.addKeyword("HIS", "his")
.addKeyword("SHE", "she")
.addKeyword("HE", "he")
.build();
Collection<PayloadEmit<String>> emits = trie.parseText("ushers"); Collection<PayloadEmit<String>> emits = trie.parseText("ushers");
assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5 assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5
Iterator<PayloadEmit<String>> iterator = emits.iterator(); Iterator<PayloadEmit<String>> iterator = emits.iterator();
checkEmit(iterator.next(), 2, 3, "HE", "he"); checkEmit(iterator.next(), 2, 3, "he", "he");
checkEmit(iterator.next(), 1, 3, "SHE", "she"); checkEmit(iterator.next(), 1, 3, "she", "she");
checkEmit(iterator.next(), 2, 5, "HERS", "hers"); checkEmit(iterator.next(), 2, 5, "hers", "hers");
} }
@Test @Test
public void ushersTestFirstMatch() { public void ushersTestFirstMatch() {
PayloadTrie<Integer> trie = PayloadTrie.<Integer>builder().addKeywords(PRONOUNS_WITH_PAYLOADS).build(); PayloadTrie<Integer> trie = PayloadTrie.<Integer>builder().addKeywords(PRONOUNS_WITH_PAYLOADS).build();
PayloadEmit<Integer> firstMatch = trie.firstMatch("ushers"); PayloadEmit<Integer> firstMatch = trie.firstMatch("ushers");
checkEmit(firstMatch, 2, 3, "he", 20); checkEmit(firstMatch, 2, 3, "he", 20);
} }
@Test @Test
public void ushersTestByCallback() { public void ushersTestByCallback() {
PayloadTrie<Integer> trie = PayloadTrie.<Integer>builder().addKeywords(PRONOUNS_WITH_PAYLOADS).build(); PayloadTrie<Integer> trie = PayloadTrie.<Integer>builder().addKeywords(PRONOUNS_WITH_PAYLOADS).build();
final List<PayloadEmit<Integer>> emits = new LinkedList<>(); final List<PayloadEmit<Integer>> emits = new ArrayList<>();
PayloadEmitHandler<Integer> emitHandler = emit -> { PayloadEmitHandler<Integer> emitHandler = new PayloadEmitHandler<Integer>() {
emits.add(emit);
return true; @Override
public boolean emit(PayloadEmit<Integer> emit) {
emits.add(emit);
return true;
}
}; };
trie.parseText("ushers", emitHandler); trie.parseText("ushers", emitHandler);
assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5 assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5
@ -259,29 +231,23 @@ public class PayloadTrieTest {
checkEmit(iterator.next(), 2, 5, "hers", 9); checkEmit(iterator.next(), 2, 5, "hers", 9);
} }
@Test @Test
public void misleadingTest() { public void misleadingTest() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword("hers", "pronon:hers").build(); PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword("hers", "pronon:hers").build();
Collection<PayloadEmit<String>> emits = trie.parseText("h he her hers"); Collection<PayloadEmit<String>> emits = trie.parseText("h he her hers");
Iterator<PayloadEmit<String>> iterator = emits.iterator(); Iterator<PayloadEmit<String>> iterator = emits.iterator();
checkEmit(iterator.next(), 9, 12, "hers", "pronon:hers"); checkEmit(iterator.next(), 9, 12, "hers", "pronon:hers");
} }
@Test @Test
public void misleadingTestFirstMatch() { public void misleadingTestFirstMatch() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword("hers", "pronon:hers").build(); PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword("hers", "pronon:hers").build();
PayloadEmit<String> firstMatch = trie.firstMatch("h he her hers"); PayloadEmit<String> firstMatch = trie.firstMatch("h he her hers");
checkEmit(firstMatch, 9, 12, "hers", "pronon:hers"); checkEmit(firstMatch, 9, 12, "hers", "pronon:hers");
} }
@Test @Test
public void recipes() { public void recipes() {
PayloadTrie<Food> trie = PayloadTrie.<Food>builder().addKeywords(FOOD_WITH_PAYLOADS).build(); PayloadTrie<Food> trie = PayloadTrie.<Food>builder().addKeywords(FOOD_WITH_PAYLOADS).build();
Collection<PayloadEmit<Food>> emits = trie.parseText("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli"); Collection<PayloadEmit<Food>> emits = trie.parseText("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli");
Iterator<PayloadEmit<Food>> iterator = emits.iterator(); Iterator<PayloadEmit<Food>> iterator = emits.iterator();
@ -291,20 +257,17 @@ public class PayloadTrieTest {
checkEmit(iterator.next(), 51, 58, "broccoli", new Food("broccoli")); checkEmit(iterator.next(), 51, 58, "broccoli", new Food("broccoli"));
} }
@Test @Test
public void recipesFirstMatch() { public void recipesFirstMatch() {
PayloadTrie<Food> trie = PayloadTrie.<Food>builder().addKeywords(FOOD_WITH_PAYLOADS).build(); PayloadTrie<Food> trie = PayloadTrie.<Food>builder().addKeywords(FOOD_WITH_PAYLOADS).build();
PayloadEmit<Food> firstMatch = trie.firstMatch("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli"); PayloadEmit<Food> firstMatch = trie.firstMatch("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli");
checkEmit(firstMatch, 2, 12, "cauliflower", new Food("cauliflower")); checkEmit(firstMatch, 2, 12, "cauliflower", new Food("cauliflower"));
} }
@Test @Test
public void longAndShortOverlappingMatch() { public void longAndShortOverlappingMatch() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword("he", "pronon:he").addKeyword("hehehehe", "garbage")
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword("he", "pronon:he").addKeyword("hehehehe", "garbage").build(); .build();
Collection<PayloadEmit<String>> emits = trie.parseText("hehehehehe"); Collection<PayloadEmit<String>> emits = trie.parseText("hehehehehe");
Iterator<PayloadEmit<String>> iterator = emits.iterator(); Iterator<PayloadEmit<String>> iterator = emits.iterator();
checkEmit(iterator.next(), 0, 1, "he", "pronon:he"); checkEmit(iterator.next(), 0, 1, "he", "pronon:he");
@ -316,16 +279,10 @@ public class PayloadTrieTest {
checkEmit(iterator.next(), 2, 9, "hehehehe", "garbage"); checkEmit(iterator.next(), 2, 9, "hehehehe", "garbage");
} }
@Test @Test
public void nonOverlapping() { public void nonOverlapping() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().removeOverlaps().addKeyword("ab", "alpha:ab")
PayloadTrie<String> trie = PayloadTrie.<String>builder() .addKeyword("cba", "alpha:cba").addKeyword("ababc", "alpha:ababc").build();
.ignoreOverlaps()
.addKeyword("ab", "alpha:ab")
.addKeyword("cba", "alpha:cba")
.addKeyword("ababc", "alpha:ababc")
.build();
Collection<PayloadEmit<String>> emits = trie.parseText("ababcbab"); Collection<PayloadEmit<String>> emits = trie.parseText("ababcbab");
assertEquals(2, emits.size()); assertEquals(2, emits.size());
Iterator<PayloadEmit<String>> iterator = emits.iterator(); Iterator<PayloadEmit<String>> iterator = emits.iterator();
@ -334,79 +291,49 @@ public class PayloadTrieTest {
checkEmit(iterator.next(), 6, 7, "ab", "alpha:ab"); checkEmit(iterator.next(), 6, 7, "ab", "alpha:ab");
} }
@Test @Test
public void nonOverlappingFirstMatch() { public void nonOverlappingFirstMatch() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().removeOverlaps().addKeyword("ab", "alpha:ab")
PayloadTrie<String> trie = PayloadTrie.<String>builder() .addKeyword("cba", "alpha:cba").addKeyword("ababc", "alpha:ababc").build();
.ignoreOverlaps()
.addKeyword("ab", "alpha:ab")
.addKeyword("cba", "alpha:cba")
.addKeyword("ababc", "alpha:ababc")
.build();
PayloadEmit<String> firstMatch = trie.firstMatch("ababcbab"); PayloadEmit<String> firstMatch = trie.firstMatch("ababcbab");
checkEmit(firstMatch, 0, 4, "ababc", "alpha:ababc"); checkEmit(firstMatch, 0, 4, "ababc", "alpha:ababc");
} }
@Test @Test
public void containsMatch() { public void containsMatch() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().removeOverlaps().addKeyword("ab", "alpha:ab")
PayloadTrie<String> trie = PayloadTrie.<String>builder() .addKeyword("cba", "alpha:cba").addKeyword("ababc", "alpha:ababc").build();
.ignoreOverlaps()
.addKeyword("ab", "alpha:ab")
.addKeyword("cba", "alpha:cba")
.addKeyword("ababc", "alpha:ababc")
.build();
assertTrue(trie.containsMatch("ababcbab")); assertTrue(trie.containsMatch("ababcbab"));
} }
@Test @Test
public void startOfChurchillSpeech() { public void startOfChurchillSpeech() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().removeOverlaps().addKeyword("T").addKeyword("u").addKeyword("ur")
PayloadTrie<String> trie = PayloadTrie.<String>builder() .addKeyword("r").addKeyword("urn").addKeyword("ni").addKeyword("i").addKeyword("in").addKeyword("n")
.ignoreOverlaps() .addKeyword("urning").build();
.addKeyword("T")
.addKeyword("u")
.addKeyword("ur")
.addKeyword("r")
.addKeyword("urn")
.addKeyword("ni")
.addKeyword("i")
.addKeyword("in")
.addKeyword("n")
.addKeyword("urning")
.build();
Collection<PayloadEmit<String>> emits = trie.parseText("Turning"); Collection<PayloadEmit<String>> emits = trie.parseText("Turning");
assertEquals(2, emits.size()); assertEquals(2, emits.size());
} }
@Test @Test
public void partialMatch() { public void partialMatch() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().onlyWholeWords().addKeyword("sugar", "food:sugar").build(); PayloadTrie<String> trie = PayloadTrie.<String>builder().onlyWholeWords().addKeyword("sugar", "food:sugar").build();
Collection<PayloadEmit<String>> emits = trie.parseText("sugarcane sugarcane sugar canesugar"); // left, middle, right test Collection<PayloadEmit<String>> emits = trie.parseText("sugarcane sugarcane sugar canesugar"); // left, middle, right test
assertEquals(1, emits.size()); // Match must not be made assertEquals(1, emits.size()); // Match must not be made
checkEmit(emits.iterator().next(), 20, 24, "sugar", "food:sugar"); checkEmit(emits.iterator().next(), 20, 24, "sugar", "food:sugar");
} }
@Test @Test
public void partialMatchFirstMatch() { public void partialMatchFirstMatch() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().onlyWholeWords().addKeyword("sugar", "food:sugar").build(); PayloadTrie<String> trie = PayloadTrie.<String>builder().onlyWholeWords().addKeyword("sugar", "food:sugar").build();
PayloadEmit<String> firstMatch = trie.firstMatch("sugarcane sugarcane sugar canesugar"); // left, middle, right test PayloadEmit<String> firstMatch = trie.firstMatch("sugarcane sugarcane sugar canesugar"); // left, middle, right test
checkEmit(firstMatch, 20, 24, "sugar", "food:sugar"); checkEmit(firstMatch, 20, 24, "sugar", "food:sugar");
} }
@Test @Test
public void tokenizeFullSentence() { public void tokenizeFullSentence() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeywords(GREEK_LETTERS_WITH_PAYLOADS).build(); PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeywords(GREEK_LETTERS_WITH_PAYLOADS).build();
Collection<PayloadToken<String>> tokens = trie.tokenize("Hear: Alpha team first, Beta from the rear, Gamma in reserve"); Collection<PayloadToken<String>> tokens = trie.tokenize("Hear: Alpha team first, Beta from the rear, Gamma in reserve");
assertEquals(7, tokens.size()); assertEquals(7, tokens.size());
@ -420,12 +347,11 @@ public class PayloadTrieTest {
assertEquals(" in reserve", tokensIt.next().getFragment()); assertEquals(" in reserve", tokensIt.next().getFragment());
} }
// @see https://github.com/robert-bor/aho-corasick/issues/5 // @see https://github.com/robert-bor/aho-corasick/issues/5
@Test @Test
public void testStringIndexOutOfBoundsException() { public void testStringIndexOutOfBoundsException() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreCase().onlyWholeWords().addKeywords(UNICODE_WITH_PAYLOADS)
PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreCase().onlyWholeWords().addKeywords(UNICODE_WITH_PAYLOADS).build(); .build();
Collection<PayloadEmit<String>> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ"); Collection<PayloadEmit<String>> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ");
assertEquals(4, emits.size()); // Match must not be made assertEquals(4, emits.size()); // Match must not be made
Iterator<PayloadEmit<String>> it = emits.iterator(); Iterator<PayloadEmit<String>> it = emits.iterator();
@ -436,10 +362,8 @@ public class PayloadTrieTest {
checkEmit(it.next(), 19, 23, "börkü", "uni:börkü"); checkEmit(it.next(), 19, 23, "börkü", "uni:börkü");
} }
@Test @Test
public void testIgnoreCase() { public void testIgnoreCase() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreCase().addKeywords(UNICODE_WITH_PAYLOADS).build(); PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreCase().addKeywords(UNICODE_WITH_PAYLOADS).build();
Collection<PayloadEmit<String>> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ"); Collection<PayloadEmit<String>> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ");
assertEquals(4, emits.size()); // Match must not be made assertEquals(4, emits.size()); // Match must not be made
@ -451,75 +375,65 @@ public class PayloadTrieTest {
checkEmit(it.next(), 19, 23, "börkü", "uni:börkü"); checkEmit(it.next(), 19, 23, "börkü", "uni:börkü");
} }
@Test @Test
public void testIgnoreCaseFirstMatch() { public void testIgnoreCaseFirstMatch() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreCase().addKeywords(UNICODE_WITH_PAYLOADS).build(); PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreCase().addKeywords(UNICODE_WITH_PAYLOADS).build();
PayloadEmit<String> firstMatch = trie.firstMatch("TurninG OnCe AgAiN BÖRKÜ"); PayloadEmit<String> firstMatch = trie.firstMatch("TurninG OnCe AgAiN BÖRKÜ");
checkEmit(firstMatch, 0, 6, "turning", "uni:turning"); checkEmit(firstMatch, 0, 6, "turning", "uni:turning");
} }
@Test @Test
public void tokenizeTokensInSequence() { public void tokenizeTokensInSequence() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeywords(GREEK_LETTERS_WITH_PAYLOADS).build(); PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeywords(GREEK_LETTERS_WITH_PAYLOADS).build();
Collection<PayloadToken<String>> tokens = trie.tokenize("Alpha Beta Gamma"); Collection<PayloadToken<String>> tokens = trie.tokenize("Alpha Beta Gamma");
assertEquals(5, tokens.size()); assertEquals(5, tokens.size());
} }
// @see https://github.com/robert-bor/aho-corasick/issues/7 // @see https://github.com/robert-bor/aho-corasick/issues/7
@Test @Test
public void testZeroLength() { public void testZeroLength() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreOverlaps().onlyWholeWords().ignoreCase().addKeyword("")
PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreOverlaps().onlyWholeWords().ignoreCase().addKeyword("").build(); .build();
trie.tokenize( trie.tokenize(
"Try a natural lip and subtle bronzer to keep all the focus on those big bright eyes with NARS Eyeshadow Duo in Rated R And the winner is... Boots No7 Advanced Renewal Anti-ageing Glycolic Peel Kit ($25 amazon.com) won most-appealing peel."); "Try a natural lip and subtle bronzer to keep all the focus on those big bright eyes with NARS Eyeshadow Duo in Rated R And the winner is... Boots No7 Advanced Renewal Anti-ageing Glycolic Peel Kit ($25 amazon.com) won most-appealing peel.");
} }
// @see https://github.com/robert-bor/aho-corasick/issues/8 // @see https://github.com/robert-bor/aho-corasick/issues/8
@Test @Test
public void testUnicode1() { public void testUnicode1() {
String target = "LİKE THIS"; // The second character ('İ') is Unicode, which was read by AC as a 2-byte char String target = "LİKE THIS"; // The second character ('İ') is Unicode, which was read by AC as a 2-byte char
assertEquals("THIS", target.substring(5, 9)); // Java does it the right way assertEquals("THIS", target.substring(5, 9)); // Java does it the right way
PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreCase().onlyWholeWords().addKeyword("this", "pronon:this").build(); PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreCase().onlyWholeWords().addKeyword("this", "pronon:this")
.build();
Collection<PayloadEmit<String>> emits = trie.parseText(target); Collection<PayloadEmit<String>> emits = trie.parseText(target);
assertEquals(1, emits.size()); assertEquals(1, emits.size());
Iterator<PayloadEmit<String>> it = emits.iterator(); Iterator<PayloadEmit<String>> it = emits.iterator();
checkEmit(it.next(), 5, 8, "this", "pronon:this"); checkEmit(it.next(), 5, 8, "this", "pronon:this");
} }
// @see https://github.com/robert-bor/aho-corasick/issues/8 // @see https://github.com/robert-bor/aho-corasick/issues/8
@Test @Test
public void testUnicode2() { public void testUnicode2() {
String target = "LİKE THIS"; // The second character ('İ') is Unicode, which was read by AC as a 2-byte char String target = "LİKE THIS"; // The second character ('İ') is Unicode, which was read by AC as a 2-byte char
PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreCase().onlyWholeWords().addKeyword("this", "pronon:this").build(); PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreCase().onlyWholeWords().addKeyword("this", "pronon:this")
.build();
assertEquals("THIS", target.substring(5, 9)); // Java does it the right way assertEquals("THIS", target.substring(5, 9)); // Java does it the right way
PayloadEmit<String> firstMatch = trie.firstMatch(target); PayloadEmit<String> firstMatch = trie.firstMatch(target);
checkEmit(firstMatch, 5, 8, "this", "pronon:this"); checkEmit(firstMatch, 5, 8, "this", "pronon:this");
} }
@Test @Test
public void testPartialMatchWhiteSpaces() { public void testPartialMatchWhiteSpaces() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().onlyWholeWordsWhiteSpaceSeparated()
PayloadTrie<String> trie = PayloadTrie.<String>builder().onlyWholeWordsWhiteSpaceSeparated().addKeyword("#sugar-123", "sugar").build(); .addKeyword("#sugar-123", "sugar").build();
Collection<PayloadEmit<String>> emits = trie.parseText("#sugar-123 #sugar-1234"); // left, middle, right test Collection<PayloadEmit<String>> emits = trie.parseText("#sugar-123 #sugar-1234"); // left, middle, right test
assertEquals(1, emits.size()); // Match must not be made assertEquals(1, emits.size()); // Match must not be made
checkEmit(emits.iterator().next(), 0, 9, "#sugar-123", "sugar"); checkEmit(emits.iterator().next(), 0, 9, "#sugar-123", "sugar");
} }
@Test @Test
public void testLargeString() { public void testLargeString() {
final int interval = 100; final int interval = 100;
final int textSize = 1000000; final int textSize = 1000000;
final String keyword = FOOD[1]; final String keyword = FOOD[1];
@ -535,73 +449,62 @@ public class PayloadTrieTest {
assertEquals(textSize / interval, emits.size()); assertEquals(textSize / interval, emits.size());
} }
/**
* Generates a random sequence of ASCII numbers.
*
* @param count The number of numbers to generate.
* @return A character sequence filled with random digits.
*/
private StringBuilder randomNumbers(int count) {
final StringBuilder sb = new StringBuilder(count);
@Test while (--count > 0) {
public void test_containsMatchWithCaseInsensitive() { sb.append(randomInt(0, 10));
}
PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreCase().addKeyword("foo", "bar").build(); return sb;
assertTrue(trie.containsMatch("FOOBAR"));
assertFalse(trie.containsMatch("FO!?AR"));
} }
/**
// @see https://github.com/robert-bor/aho-corasick/issues/85 * Injects keywords into a string builder.
@Test *
public void test_wholeWords() { * @param source Should contain a bunch of random data that cannot match any
* keyword.
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword("foo", "bar").onlyWholeWords().build(); * @param keyword A keyword to inject repeatedly in the text.
// access via PayloadTrie.parseText(CharSequence) * @param interval How often to inject the keyword.
Collection<PayloadEmit<String>> result1 = trie.parseText("foobar"); */
// access via PayloadTrie.parseText(CharSequence, PayloadEmitHandler<String>) private void injectKeyword(final StringBuilder source, final String keyword, final int interval) {
Collection<PayloadEmit<String>> result2 = new LinkedList<>(); final int length = source.length();
trie.parseText("foobar", result2::add); for (int i = 0; i < length; i += interval) {
source.replace(i, i + keyword.length(), keyword);
assertTrue(result1.isEmpty()); }
assertEquals(result1, result2);
} }
private int randomInt(final int min, final int max) {
// @see https://github.com/robert-bor/aho-corasick/issues/85 return ThreadLocalRandom.current().nextInt(min, max);
@Test
public void test_wholeWordsWhiteSpaceSeparated() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword("foo", "bar").onlyWholeWordsWhiteSpaceSeparated().build();
// access via PayloadTrie.parseText(CharSequence)
Collection<PayloadEmit<String>> result1 = trie.parseText("foo#bar");
// access via PayloadTrie.parseText(CharSequence, PayloadEmitHandler<String>)
Collection<PayloadEmit<String>> result2 = new LinkedList<>();
trie.parseText("foo#bar", result2::add);
assertTrue(result1.isEmpty());
assertEquals(result1, result2);
} }
private void checkEmit(PayloadEmit<Food> next, int expectedStart, int expectedEnd, String expectedKeyword,
private void checkEmit(final PayloadEmit<Food> next, final int expectedStart, final int expectedEnd, final String expectedKeyword, final Food expectedPayload) { Food expectedPayload) {
assertEquals("Start of emit should have been " + expectedStart, expectedStart, next.getStart()); assertEquals("Start of emit should have been " + expectedStart, expectedStart, next.getStart());
assertEquals("End of emit should have been " + expectedEnd, expectedEnd, next.getEnd()); assertEquals("End of emit should have been " + expectedEnd, expectedEnd, next.getEnd());
assertEquals("Keyword of emit shoud be " + expectedKeyword, expectedKeyword, next.getKeyword()); assertEquals("Keyword of emit shoud be " + expectedKeyword, expectedKeyword, next.getKeyword());
assertEquals("Payload of emit shoud be " + expectedPayload, expectedPayload, next.getPayload()); assertEquals("Payload of emit shoud be " + expectedPayload, expectedPayload, next.getPayload());
} }
private void checkEmit(PayloadEmit<Integer> next, int expectedStart, int expectedEnd, String expectedKeyword,
private void checkEmit(final PayloadEmit<Integer> next, final int expectedStart, final int expectedEnd, final String expectedKeyword, final Integer expectedPayload) { Integer expectedPayload) {
assertEquals("Start of emit should have been " + expectedStart, expectedStart, next.getStart()); assertEquals("Start of emit should have been " + expectedStart, expectedStart, next.getStart());
assertEquals("End of emit should have been " + expectedEnd, expectedEnd, next.getEnd()); assertEquals("End of emit should have been " + expectedEnd, expectedEnd, next.getEnd());
assertEquals("Keyword of emit shoud be " + expectedKeyword, expectedKeyword, next.getKeyword()); assertEquals("Keyword of emit shoud be " + expectedKeyword, expectedKeyword, next.getKeyword());
assertEquals("Payload of emit shoud be " + expectedPayload, expectedPayload, next.getPayload()); assertEquals("Payload of emit shoud be " + expectedPayload, expectedPayload, next.getPayload());
} }
private void checkEmit(PayloadEmit<String> next, int expectedStart, int expectedEnd, String expectedKeyword,
private void checkEmit(final PayloadEmit<String> next, final int expectedStart, final int expectedEnd, final String expectedKeyword, final String expectedPayload) { String expectedPayload) {
assertEquals("Start of emit should have been " + expectedStart, expectedStart, next.getStart()); assertEquals("Start of emit should have been " + expectedStart, expectedStart, next.getStart());
assertEquals("End of emit should have been " + expectedEnd, expectedEnd, next.getEnd()); assertEquals("End of emit should have been " + expectedEnd, expectedEnd, next.getEnd());
assertEquals("Keyword of emit shoud be " + expectedKeyword, expectedKeyword, next.getKeyword()); assertEquals("Keyword of emit shoud be " + expectedKeyword, expectedKeyword, next.getKeyword());
assertEquals("Payload of emit shoud be " + expectedPayload, expectedPayload, next.getPayload()); assertEquals("Payload of emit shoud be " + expectedPayload, expectedPayload, next.getPayload());
} }
} }

View File

@ -2,75 +2,23 @@ package org.ahocorasick.trie;
import org.junit.Test; import org.junit.Test;
import java.util.Collection; import static junit.framework.Assert.assertEquals;
import java.util.Collections;
import static org.junit.Assert.*;
public class StateTest { public class StateTest {
@Test @Test
public void test_constructSequenceOfCharacters() { public void constructSequenceOfCharacters() {
State rootState = new State();
final State rootState = new State(); rootState
rootState.addState('a').addState('b').addState('c'); .addState('a')
.addState('b')
.addState('c');
State currentState = rootState.nextState('a'); State currentState = rootState.nextState('a');
assertEquals(1, currentState.getDepth()); assertEquals(1, currentState.getDepth());
currentState = currentState.nextState('b'); currentState = currentState.nextState('b');
assertEquals(2, currentState.getDepth()); assertEquals(2, currentState.getDepth());
currentState = currentState.nextState('c'); currentState = currentState.nextState('c');
assertEquals(3, currentState.getDepth()); assertEquals(3, currentState.getDepth());
currentState = currentState.nextState('F');
assertNull(currentState);
}
@Test
public void test_getStates() {
final State rootState = new State();
rootState.addState("foo");
final State currentState = rootState.nextState('f');
final Collection<State> states = rootState.getStates();
assertEquals(1, states.size());
assertEquals(currentState, states.iterator().next());
}
@Test
public void test_getTransitions() {
final State rootState = new State();
rootState.addState("foo");
final State currentState = rootState.nextState('f');
final Collection<Character> transitions = rootState.getTransitions();
assertEquals(1, transitions.size());
assertEquals(Character.valueOf('f'), transitions.iterator().next());
}
@Test
public void test_getFailure() {
final State failureState = new State();
final State rootState = new State();
rootState.setFailure(failureState);
assertEquals(failureState, rootState.getFailure());
}
@Test
public void test_checkEmits() {
final State rootState = new State();
rootState.addState('a').addEmit(Collections.singleton("tag"));
final Collection<String> actual = rootState.nextState('a').emit();
assertEquals(1, actual.size());
assertEquals("tag", actual.iterator().next());
} }
} }

View File

@ -1,47 +0,0 @@
package org.ahocorasick.trie;
import static java.util.concurrent.ThreadLocalRandom.current;
/**
* Contains functionality common to tests.
*/
public class TestHelper {
/**
* Injects keywords into a string builder.
*
* @param source Should contain a bunch of random data that cannot match
* any keyword.
* @param keyword A keyword to inject repeatedly in the text.
* @param interval How often to inject the keyword.
*/
@SuppressWarnings("SameParameterValue")
static void injectKeyword(final StringBuilder source, final String keyword, final int interval) {
final int length = source.length();
for (int i = 0; i < length; i += interval) {
source.replace(i, i + keyword.length(), keyword);
}
}
/**
* Generates a random sequence of ASCII numbers.
*
* @param count The number of numbers to generate.
* @return A character sequence filled with random digits.
*/
@SuppressWarnings("SameParameterValue")
public static StringBuilder randomNumbers(int count) {
int localCount = count;
final StringBuilder sb = new StringBuilder(localCount);
while (--localCount > 0) {
sb.append(current().nextInt(0, 10));
}
return sb;
}
}

View File

@ -1,164 +1,123 @@
package org.ahocorasick.trie; package org.ahocorasick.trie;
import static junit.framework.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.atomic.AtomicInteger;
import org.ahocorasick.trie.handler.AbstractStatefulEmitHandler; import org.ahocorasick.trie.handler.AbstractStatefulEmitHandler;
import org.ahocorasick.trie.handler.EmitHandler; import org.ahocorasick.trie.handler.EmitHandler;
import org.ahocorasick.trie.handler.StatefulEmitHandler; import org.ahocorasick.trie.handler.StatefulEmitHandler;
import org.junit.Test; import org.junit.Test;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import static java.lang.String.format;
import static org.ahocorasick.trie.TestHelper.injectKeyword;
import static org.ahocorasick.trie.TestHelper.randomNumbers;
import static org.ahocorasick.trie.Trie.builder;
import static org.junit.Assert.*;
/**
* Test the {@link Trie} class functionality.
*/
public class TrieTest { public class TrieTest {
private final static String[] ALPHABET = new String[]{
"abc", "bcd", "cde"
};
private final static String[] ALPHABET = new String[]{"abc", "bcd", "cde"}; private final static String[] PRONOUNS = new String[]{
"hers", "his", "she", "he"
};
private final static String[] PRONOUNS = new String[]{"hers", "his", "she", "he"}; private final static String[] FOOD = new String[]{
"veal", "cauliflower", "broccoli", "tomatoes"
};
private final static String[] FOOD = new String[]{"veal", "cauliflower", "broccoli", "tomatoes"}; private final static String[] GREEK_LETTERS = new String[]{
"Alpha", "Beta", "Gamma"
private final static String[] GREEK_LETTERS = new String[]{"Alpha", "Beta", "Gamma"}; };
private final static String[] UNICODE = new String[]{"turning", "once", "again", "börkü"};
private static Trie trie(final String keyword) {
return Trie.builder().addKeyword(keyword).build();
}
private static Trie trieIgnoreWhiteSpace(final String keyword) {
return Trie.builder().addKeyword(keyword).ignoreWhiteSpace().build();
}
private static Trie trie(final String[] keywords) {
return Trie.builder().addKeywords(keywords).ignoreWhiteSpace().build();
}
private static Trie trieIgnoreWhiteSpace(final String[] keywords) {
return Trie.builder().addKeywords(keywords).ignoreWhiteSpace().build();
}
private final static String[] UNICODE = new String[]{
"turning", "once", "again", "börkü"
};
@Test @Test
public void test_KeywordAndTextAreTheSame() { public void keywordAndTextAreTheSame() {
Trie trie = Trie.builder()
final Trie trie = trie(ALPHABET[0]); .addKeyword(ALPHABET[0])
final Collection<Emit> emits = trie.parseText(ALPHABET[0]); .build();
final Iterator<Emit> iterator = emits.iterator(); Collection<Emit> emits = trie.parseText(ALPHABET[0]);
Iterator<Emit> iterator = emits.iterator();
checkEmit(iterator.next(), 0, 2, ALPHABET[0]); checkEmit(iterator.next(), 0, 2, ALPHABET[0]);
} }
@Test @Test
public void test_ignoringWhitespace_KeywordAndTextAreTheSame() { public void keywordAndTextAreTheSameFirstMatch() {
Trie trie = Trie.builder()
final Trie trie = trieIgnoreWhiteSpace(ALPHABET); .addKeyword(ALPHABET[0])
final Collection<Emit> emits = trie.parseText("a b c d e"); .build();
final Iterator<Emit> iterator = emits.iterator(); Emit firstMatch = trie.firstMatch(ALPHABET[0]);
checkEmit(iterator.next(), 0, 4, ALPHABET[0]);
checkEmit(iterator.next(), 2, 6, ALPHABET[1]);
checkEmit(iterator.next(), 4, 8, ALPHABET[2]);
}
@Test
public void test_KeywordAndTextAreTheSameFirstMatch() {
final Trie trie = trie(ALPHABET[0]);
final Emit firstMatch = trie.firstMatch(ALPHABET[0]);
checkEmit(firstMatch, 0, 2, ALPHABET[0]); checkEmit(firstMatch, 0, 2, ALPHABET[0]);
} }
@Test @Test
public void test_TextIsLongerThanKeyword() { public void textIsLongerThanKeyword() {
Trie trie = Trie.builder()
final Trie trie = trie(ALPHABET[0]); .addKeyword(ALPHABET[0])
final Collection<Emit> emits = trie.parseText(" " + ALPHABET[0]); .build();
final Iterator<Emit> iterator = emits.iterator(); Collection<Emit> emits = trie.parseText(" " + ALPHABET[0]);
Iterator<Emit> iterator = emits.iterator();
checkEmit(iterator.next(), 1, 3, ALPHABET[0]); checkEmit(iterator.next(), 1, 3, ALPHABET[0]);
} }
@Test @Test
public void test_TextIsLongerThanKeywordFirstMatch() { public void textIsLongerThanKeywordFirstMatch() {
Trie trie = Trie.builder()
final Trie trie = trie(ALPHABET[0]); .addKeyword(ALPHABET[0])
final Emit firstMatch = trie.firstMatch(" " + ALPHABET[0]); .build();
Emit firstMatch = trie.firstMatch(" " + ALPHABET[0]);
checkEmit(firstMatch, 1, 3, ALPHABET[0]); checkEmit(firstMatch, 1, 3, ALPHABET[0]);
} }
@Test @Test
public void test_VariousKeywordsOneMatch() { public void variousKeywordsOneMatch() {
Trie trie = Trie.builder()
final Trie trie = trie(ALPHABET); .addKeywords(ALPHABET)
final Collection<Emit> emits = trie.parseText("bcd"); .build();
final Iterator<Emit> iterator = emits.iterator(); Collection<Emit> emits = trie.parseText("bcd");
Iterator<Emit> iterator = emits.iterator();
checkEmit(iterator.next(), 0, 2, "bcd"); checkEmit(iterator.next(), 0, 2, "bcd");
} }
@Test @Test
public void test_VariousKeywordsFirstMatch() { public void variousKeywordsFirstMatch() {
Trie trie = Trie.builder()
final Trie trie = trie(ALPHABET); .addKeywords(ALPHABET)
final Emit firstMatch = trie.firstMatch("bc d"); .build();
checkEmit(firstMatch, 0, 3, "bcd"); Emit firstMatch = trie.firstMatch("bcd");
checkEmit(firstMatch, 0, 2, "bcd");
} }
@Test(expected = AssertionError.class)
public void test_NullInputTextFirstMatch() {
final Trie trie = trie(ALPHABET);
final Emit firstMatch = trie.firstMatch(null);
assertNull(firstMatch);
}
@Test @Test
public void test_UshersTestAndStopOnHit() { public void ushersTestAndStopOnHit() {
Trie trie = Trie.builder()
final Trie trie = Trie.builder().addKeywords(PRONOUNS).stopOnHit().build(); .addKeywords(PRONOUNS)
final Collection<Emit> emits = trie.parseText("ushers"); .stopOnHit()
.build();
Collection<Emit> emits = trie.parseText("ushers");
assertEquals(1, emits.size()); // she @ 3, he @ 3, hers @ 5 assertEquals(1, emits.size()); // she @ 3, he @ 3, hers @ 5
final Iterator<Emit> iterator = emits.iterator(); Iterator<Emit> iterator = emits.iterator();
checkEmit(iterator.next(), 2, 3, "he"); checkEmit(iterator.next(), 2, 3, "he");
} }
@Test @Test
public void test_UshersTestStopOnHitSkipOne() { public void ushersTestStopOnHitSkipOne() {
Trie trie = Trie.builder()
final Trie trie = Trie.builder().addKeywords(PRONOUNS).stopOnHit().build(); .addKeywords(PRONOUNS)
.stopOnHit()
final StatefulEmitHandler testEmitHandler = new AbstractStatefulEmitHandler() { .build();
StatefulEmitHandler testEmitHandler = new AbstractStatefulEmitHandler() {
boolean first = true; boolean first = true;
@Override @Override
public boolean emit(final Emit emit) { public boolean emit(final Emit emit) {
if(first) {
if (first) {
// return false for the first element // return false for the first element
first = false; first = false;
return false; return false;
@ -166,117 +125,128 @@ public class TrieTest {
addEmit(emit); addEmit(emit);
return true; return true;
} }
};
};
trie.parseText("ushers", testEmitHandler); trie.parseText("ushers", testEmitHandler);
final Collection<Emit> emits = testEmitHandler.getEmits(); Collection<Emit> emits = testEmitHandler.getEmits();
assertEquals(1, emits.size()); // she @ 3, he @ 3, hers @ 5 assertEquals(1, emits.size()); // she @ 3, he @ 3, hers @ 5
final Iterator<Emit> iterator = emits.iterator(); Iterator<Emit> iterator = emits.iterator();
checkEmit(iterator.next(), 1, 3, "she"); checkEmit(iterator.next(), 1, 3, "she");
} }
@Test @Test
public void test_UshersTest() { public void ushersTest() {
Trie trie = Trie.builder()
final Trie trie = trie(PRONOUNS); .addKeywords(PRONOUNS)
final Collection<Emit> emits = trie.parseText("ushers"); .build();
Collection<Emit> emits = trie.parseText("ushers");
assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5 assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5
final Iterator<Emit> iterator = emits.iterator(); Iterator<Emit> iterator = emits.iterator();
checkEmit(iterator.next(), 2, 3, "he"); checkEmit(iterator.next(), 2, 3, "he");
checkEmit(iterator.next(), 1, 3, "she"); checkEmit(iterator.next(), 1, 3, "she");
checkEmit(iterator.next(), 2, 5, "hers"); checkEmit(iterator.next(), 2, 5, "hers");
} }
@Test @Test
public void test_UshersTestWithCapitalKeywords() { public void ushersTestWithCapitalKeywords() {
Trie trie = Trie.builder()
final Trie trie = Trie.builder().ignoreCase().addKeyword("HERS").addKeyword("HIS").addKeyword("SHE").addKeyword("HE").build(); .ignoreCase()
final Collection<Emit> emits = trie.parseText("ushers"); .addKeyword("HERS")
.addKeyword("HIS")
.addKeyword("SHE")
.addKeyword("HE")
.build();
Collection<Emit> emits = trie.parseText("ushers");
assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5 assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5
final Iterator<Emit> iterator = emits.iterator(); Iterator<Emit> iterator = emits.iterator();
checkEmit(iterator.next(), 2, 3, "HE"); checkEmit(iterator.next(), 2, 3, "he");
checkEmit(iterator.next(), 1, 3, "SHE"); checkEmit(iterator.next(), 1, 3, "she");
checkEmit(iterator.next(), 2, 5, "HERS"); checkEmit(iterator.next(), 2, 5, "hers");
} }
@Test @Test
public void test_UshersTestFirstMatch() { public void ushersTestFirstMatch() {
Trie trie = Trie.builder()
final Trie trie = trie(PRONOUNS); .addKeywords(PRONOUNS)
final Emit firstMatch = trie.firstMatch("ushers"); .build();
Emit firstMatch = trie.firstMatch("ushers");
checkEmit(firstMatch, 2, 3, "he"); checkEmit(firstMatch, 2, 3, "he");
} }
@Test @Test
public void test_UshersTestByCallback() { public void ushersTestByCallback() {
Trie trie = Trie.builder()
.addKeywords(PRONOUNS)
.build();
final Trie trie = trie(PRONOUNS);
final List<Emit> emits = new ArrayList<>(); final List<Emit> emits = new ArrayList<>();
final EmitHandler emitHandler = emit -> { EmitHandler emitHandler = new EmitHandler() {
emits.add(emit);
return true; @Override
public boolean emit(Emit emit) {
emits.add(emit);
return true;
}
}; };
trie.parseText("ushers", emitHandler); trie.parseText("ushers", emitHandler);
assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5 assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5
final Iterator<Emit> iterator = emits.iterator(); Iterator<Emit> iterator = emits.iterator();
checkEmit(iterator.next(), 2, 3, "he"); checkEmit(iterator.next(), 2, 3, "he");
checkEmit(iterator.next(), 1, 3, "she"); checkEmit(iterator.next(), 1, 3, "she");
checkEmit(iterator.next(), 2, 5, "hers"); checkEmit(iterator.next(), 2, 5, "hers");
} }
@Test @Test
public void test_MisleadingTest() { public void misleadingTest() {
Trie trie = Trie.builder()
final Trie trie = trie("hers"); .addKeyword("hers")
final Collection<Emit> emits = trie.parseText("h he her hers"); .build();
final Iterator<Emit> iterator = emits.iterator(); Collection<Emit> emits = trie.parseText("h he her hers");
Iterator<Emit> iterator = emits.iterator();
checkEmit(iterator.next(), 9, 12, "hers"); checkEmit(iterator.next(), 9, 12, "hers");
} }
@Test @Test
public void test_MisleadingTestFirstMatch() { public void misleadingTestFirstMatch() {
Trie trie = Trie.builder()
final Trie trie = trie("hers"); .addKeyword("hers")
final Emit firstMatch = trie.firstMatch("h he her hers"); .build();
Emit firstMatch = trie.firstMatch("h he her hers");
checkEmit(firstMatch, 9, 12, "hers"); checkEmit(firstMatch, 9, 12, "hers");
} }
@Test @Test
public void test_Recipes() { public void recipes() {
Trie trie = Trie.builder()
final Trie trie = trie(FOOD); .addKeywords(FOOD)
final Collection<Emit> emits = trie.parseText("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli"); .build();
final Iterator<Emit> iterator = emits.iterator(); Collection<Emit> emits = trie.parseText("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli");
Iterator<Emit> iterator = emits.iterator();
checkEmit(iterator.next(), 2, 12, "cauliflower"); checkEmit(iterator.next(), 2, 12, "cauliflower");
checkEmit(iterator.next(), 18, 25, "tomatoes"); checkEmit(iterator.next(), 18, 25, "tomatoes");
checkEmit(iterator.next(), 40, 43, "veal"); checkEmit(iterator.next(), 40, 43, "veal");
checkEmit(iterator.next(), 51, 58, "broccoli"); checkEmit(iterator.next(), 51, 58, "broccoli");
} }
@Test @Test
public void test_RecipesFirstMatch() { public void recipesFirstMatch() {
Trie trie = Trie.builder()
final Trie trie = trie(FOOD); .addKeywords(FOOD)
final Emit firstMatch = trie.firstMatch("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli"); .build();
Emit firstMatch = trie.firstMatch("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli");
checkEmit(firstMatch, 2, 12, "cauliflower"); checkEmit(firstMatch, 2, 12, "cauliflower");
} }
@Test @Test
public void test_LongAndShortOverlappingMatch() { public void longAndShortOverlappingMatch() {
Trie trie = Trie.builder()
final Trie trie = Trie.builder().addKeyword("he").addKeyword("hehehehe").build(); .addKeyword("he")
final Collection<Emit> emits = trie.parseText("hehehehehe"); .addKeyword("hehehehe")
final Iterator<Emit> iterator = emits.iterator(); .build();
Collection<Emit> emits = trie.parseText("hehehehehe");
Iterator<Emit> iterator = emits.iterator();
checkEmit(iterator.next(), 0, 1, "he"); checkEmit(iterator.next(), 0, 1, "he");
checkEmit(iterator.next(), 2, 3, "he"); checkEmit(iterator.next(), 2, 3, "he");
checkEmit(iterator.next(), 4, 5, "he"); checkEmit(iterator.next(), 4, 5, "he");
@ -286,43 +256,46 @@ public class TrieTest {
checkEmit(iterator.next(), 2, 9, "hehehehe"); checkEmit(iterator.next(), 2, 9, "hehehehe");
} }
@Test @Test
public void test_NonOverlapping() { public void nonOverlapping() {
Trie trie = Trie.builder().removeOverlaps()
final Trie trie = Trie.builder().ignoreOverlaps().addKeyword("ab").addKeyword("cba").addKeyword("ababc").build(); .addKeyword("ab")
final Collection<Emit> emits = trie.parseText("ababcbab"); .addKeyword("cba")
.addKeyword("ababc")
.build();
Collection<Emit> emits = trie.parseText("ababcbab");
assertEquals(2, emits.size()); assertEquals(2, emits.size());
final Iterator<Emit> iterator = emits.iterator(); Iterator<Emit> iterator = emits.iterator();
// With overlaps: ab@1, ab@3, ababc@4, cba@6, ab@7 // With overlaps: ab@1, ab@3, ababc@4, cba@6, ab@7
checkEmit(iterator.next(), 0, 4, "ababc"); checkEmit(iterator.next(), 0, 4, "ababc");
checkEmit(iterator.next(), 6, 7, "ab"); checkEmit(iterator.next(), 6, 7, "ab");
} }
@Test @Test
public void test_NonOverlappingFirstMatch() { public void nonOverlappingFirstMatch() {
Trie trie = Trie.builder().removeOverlaps()
final Trie trie = Trie.builder().ignoreOverlaps().addKeyword("ab").addKeyword("cba").addKeyword("ababc").build(); .addKeyword("ab")
final Emit firstMatch = trie.firstMatch("ababcbab"); .addKeyword("cba")
.addKeyword("ababc")
.build();
Emit firstMatch = trie.firstMatch("ababcbab");
checkEmit(firstMatch, 0, 4, "ababc"); checkEmit(firstMatch, 0, 4, "ababc");
} }
@Test @Test
public void test_ContainsMatch() { public void containsMatch() {
Trie trie = Trie.builder().removeOverlaps()
final Trie trie = Trie.builder().ignoreOverlaps().addKeyword("ab").addKeyword("cba").addKeyword("ababc").build(); .addKeyword("ab")
.addKeyword("cba")
.addKeyword("ababc")
.build();
assertTrue(trie.containsMatch("ababcbab")); assertTrue(trie.containsMatch("ababcbab"));
} }
@Test @Test
public void test_StartOfChurchillSpeech() { public void startOfChurchillSpeech() {
Trie trie = Trie.builder().removeOverlaps()
final Trie trie = Trie.builder()
.ignoreOverlaps()
.addKeyword("T") .addKeyword("T")
.addKeyword("u") .addKeyword("u")
.addKeyword("ur") .addKeyword("ur")
@ -334,40 +307,40 @@ public class TrieTest {
.addKeyword("n") .addKeyword("n")
.addKeyword("urning") .addKeyword("urning")
.build(); .build();
final Collection<Emit> emits = trie.parseText("Turning"); Collection<Emit> emits = trie.parseText("Turning");
assertEquals(2, emits.size()); assertEquals(2, emits.size());
} }
@Test @Test
public void test_PartialMatch() { public void partialMatch() {
Trie trie = Trie.builder()
final Trie trie = Trie.builder().onlyWholeWords().addKeyword("sugar").build(); .onlyWholeWords()
final Collection<Emit> emits = trie.parseText("sugarcane sugarcane sugar canesugar"); // left, middle, right test .addKeyword("sugar")
.build();
Collection<Emit> emits = trie.parseText("sugarcane sugarcane sugar canesugar"); // left, middle, right test
assertEquals(1, emits.size()); // Match must not be made assertEquals(1, emits.size()); // Match must not be made
checkEmit(emits.iterator().next(), 20, 24, "sugar"); checkEmit(emits.iterator().next(), 20, 24, "sugar");
} }
@Test @Test
public void test_PartialMatchFirstMatch() { public void partialMatchFirstMatch() {
Trie trie = Trie.builder()
final Trie trie = Trie.builder().onlyWholeWords().addKeyword("sugar").build(); .onlyWholeWords()
.addKeyword("sugar")
// left, middle, right test .build();
final Emit firstMatch = trie.firstMatch("sugarcane sugarcane sugar canesugar"); Emit firstMatch = trie.firstMatch("sugarcane sugarcane sugar canesugar"); // left, middle, right test
checkEmit(firstMatch, 20, 24, "sugar"); checkEmit(firstMatch, 20, 24, "sugar");
} }
@Test @Test
public void test_TokenizeFullSentence() { public void tokenizeFullSentence() {
Trie trie = Trie.builder()
final Trie trie = trie(GREEK_LETTERS); .addKeywords(GREEK_LETTERS)
final Collection<Token> tokens = trie.tokenize("Hear: Alpha team first, Beta from the rear, Gamma in reserve"); .build();
Collection<Token> tokens = trie.tokenize("Hear: Alpha team first, Beta from the rear, Gamma in reserve");
assertEquals(7, tokens.size()); assertEquals(7, tokens.size());
final Iterator<Token> tokensIt = tokens.iterator(); Iterator<Token> tokensIt = tokens.iterator();
assertEquals("Hear: ", tokensIt.next().getFragment()); assertEquals("Hear: ", tokensIt.next().getFragment());
assertEquals("Alpha", tokensIt.next().getFragment()); assertEquals("Alpha", tokensIt.next().getFragment());
assertEquals(" team first, ", tokensIt.next().getFragment()); assertEquals(" team first, ", tokensIt.next().getFragment());
@ -377,147 +350,104 @@ public class TrieTest {
assertEquals(" in reserve", tokensIt.next().getFragment()); assertEquals(" in reserve", tokensIt.next().getFragment());
} }
// @see https://github.com/robert-bor/aho-corasick/issues/5
/**
* Test boundary check with case-insensitive matches with whole words.
*/
@Test @Test
public void test_StringIndexOutOfBoundsException() { public void testStringIndexOutOfBoundsException() {
Trie trie = Trie.builder().ignoreCase().onlyWholeWords()
final Trie trie = Trie.builder().ignoreCase().onlyWholeWords().addKeywords(UNICODE).build(); .addKeywords(UNICODE)
final Collection<Emit> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ"); .build();
Collection<Emit> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ");
assertEquals(4, emits.size()); // Match must not be made assertEquals(4, emits.size()); // Match must not be made
final Iterator<Emit> it = emits.iterator(); Iterator<Emit> it = emits.iterator();
checkEmit(it.next(), 0, 6, "turning"); checkEmit(it.next(), 0, 6, "turning");
checkEmit(it.next(), 8, 11, "once"); checkEmit(it.next(), 8, 11, "once");
checkEmit(it.next(), 13, 17, "again"); checkEmit(it.next(), 13, 17, "again");
checkEmit(it.next(), 19, 23, "börkü"); checkEmit(it.next(), 19, 23, "börkü");
} }
@Test @Test
public void test_IgnoreCase() { public void testIgnoreCase() {
Trie trie = Trie.builder().ignoreCase()
final Trie trie = Trie.builder().ignoreCase().addKeywords(UNICODE).build(); .addKeywords(UNICODE)
final Collection<Emit> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ"); .build();
Collection<Emit> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ");
assertEquals(4, emits.size()); // Match must not be made assertEquals(4, emits.size()); // Match must not be made
final Iterator<Emit> it = emits.iterator(); Iterator<Emit> it = emits.iterator();
checkEmit(it.next(), 0, 6, "turning"); checkEmit(it.next(), 0, 6, "turning");
checkEmit(it.next(), 8, 11, "once"); checkEmit(it.next(), 8, 11, "once");
checkEmit(it.next(), 13, 17, "again"); checkEmit(it.next(), 13, 17, "again");
checkEmit(it.next(), 19, 23, "börkü"); checkEmit(it.next(), 19, 23, "börkü");
} }
@Test @Test
public void test_IgnoreCaseFirstMatch() { public void testIgnoreCaseFirstMatch() {
Trie trie = Trie.builder().ignoreCase()
final Trie trie = Trie.builder().ignoreCase().addKeywords(UNICODE).build(); .addKeywords(UNICODE)
final Emit firstMatch = trie.firstMatch("TurninG OnCe AgAiN BÖRKÜ"); .build();
Emit firstMatch = trie.firstMatch("TurninG OnCe AgAiN BÖRKÜ");
checkEmit(firstMatch, 0, 6, "turning"); checkEmit(firstMatch, 0, 6, "turning");
} }
@Test @Test
public void test_TokenizeTokensInSequence() { public void tokenizeTokensInSequence() {
Trie trie = Trie.builder()
final Trie trie = trie(GREEK_LETTERS); .addKeywords(GREEK_LETTERS)
final Collection<Token> tokens = trie.tokenize("Alpha Beta Gamma"); .build();
Collection<Token> tokens = trie.tokenize("Alpha Beta Gamma");
assertEquals(5, tokens.size()); assertEquals(5, tokens.size());
} }
// @see https://github.com/robert-bor/aho-corasick/issues/7
/**
* Fix adding a word of size 0 ("") as a dictionary. A bug in the dictionary
* parsing code (at end of line) caused it to generate words of 0 length,
* which were being added to the trie. Removing the additional commas
* resolved the issue.
*/
@Test @Test
public void test_ZeroLength() { public void testZeroLength() {
Trie trie = Trie.builder().ignoreOverlaps().onlyWholeWords().ignoreCase()
final Trie trie = Trie.builder().ignoreOverlaps().onlyWholeWords().ignoreCase().addKeyword("").build(); .addKeyword("")
trie.tokenize("Try a natural lip and subtle bronzer to keep all the focus on those " .build();
+ "big bright eyes with NARS Eyeshadow Duo in Rated R And the " trie.tokenize("Try a natural lip and subtle bronzer to keep all the focus on those big bright eyes with NARS Eyeshadow Duo in Rated R And the winner is... Boots No7 Advanced Renewal Anti-ageing Glycolic Peel Kit ($25 amazon.com) won most-appealing peel.");
+ "winner is... Boots No7 Advanced Renewal Anti-ageing Glycolic "
+ "Peel Kit ($25 amazon.com) won most-appealing peel.");
} }
// @see https://github.com/robert-bor/aho-corasick/issues/8
@Test @Test
public void test_Emit_PunctuatedKeyword_AllOffsetsFound() { public void testUnicode1() {
String target = "LİKE THIS"; // The second character ('İ') is Unicode, which was read by AC as a 2-byte char
final String keyword = "{{var}}"; assertEquals("THIS", target.substring(5, 9)); // Java does it the right way
final int len = keyword.length() - 1; Trie trie = Trie.builder().ignoreCase().onlyWholeWords()
final Trie trie = builder().ignoreOverlaps().addKeyword(keyword).build(); .addKeyword("this")
.build();
final Collection<Emit> emits = trie.parseText(format("__%s__ **%s** {{%s}} %s%s", keyword, keyword, keyword, keyword, keyword)); Collection<Emit> emits = trie.parseText(target);
assertEquals(5, emits.size());
final Iterator<Emit> it = emits.iterator();
checkEmit(it.next(), 2, 2 + len, keyword);
checkEmit(it.next(), 14, 14 + len, keyword);
checkEmit(it.next(), 26, 26 + len, keyword);
checkEmit(it.next(), 36, 36 + len, keyword);
checkEmit(it.next(), 43, 43 + len, keyword);
}
/**
* Notice the capital I with a dot. The code used to compute the offsets
* at (6, 9), which caused {@link Trie#tokenize(String)} to crash because
* 9 is past the end of the string. That character is two bytes wide, so it
* pushes the offset calculation off.
*/
@Test
public void test_Unicode1() {
// The second character ('İ') is
// Unicode, which was read by AC as a 2-byte char
final String target = "LİKE THIS";
// Java does it the right way
assertEquals("THIS", target.substring(5, 9));
final Trie trie = Trie.builder().ignoreCase().onlyWholeWords().addKeyword("this").build();
final Collection<Emit> emits = trie.parseText(target);
assertEquals(1, emits.size()); assertEquals(1, emits.size());
final Iterator<Emit> it = emits.iterator(); Iterator<Emit> it = emits.iterator();
checkEmit(it.next(), 5, 8, "this"); checkEmit(it.next(), 5, 8, "this");
} }
// @see https://github.com/robert-bor/aho-corasick/issues/8
/**
* Notice the capital I with a dot. The code used to compute the offsets
* at (6, 9), which caused {@link Trie#tokenize(String)} to crash because
* 9 is past the end of the string. That character is two bytes wide, so it
* pushes the offset calculation off.
*/
@Test @Test
public void test_Unicode2() { public void testUnicode2() {
// The second character ('İ') is String target = "LİKE THIS"; // The second character ('İ') is Unicode, which was read by AC as a 2-byte char
// Unicode, which was read by AC as a 2-byte char Trie trie = Trie.builder()
final String target = "LİKE THIS"; .ignoreCase()
final Trie trie = Trie.builder().ignoreCase().onlyWholeWords().addKeyword("this").build(); .onlyWholeWords()
// Java does it the right way .addKeyword("this")
assertEquals("THIS", target.substring(5, 9)); .build();
final Emit firstMatch = trie.firstMatch(target); assertEquals("THIS", target.substring(5, 9)); // Java does it the right way
Emit firstMatch = trie.firstMatch(target);
checkEmit(firstMatch, 5, 8, "this"); checkEmit(firstMatch, 5, 8, "this");
} }
@Test @Test
public void test_PartialMatchWhiteSpaces() { public void testPartialMatchWhiteSpaces() {
Trie trie = Trie.builder()
final Trie trie = Trie.builder().onlyWholeWordsWhiteSpaceSeparated().addKeyword("#sugar-123").build(); .onlyWholeWordsWhiteSpaceSeparated()
final Collection<Emit> emits = trie.parseText("#sugar-123 #sugar-1234"); // left, middle, right test .addKeyword("#sugar-123")
.build();
Collection<Emit> emits = trie.parseText("#sugar-123 #sugar-1234"); // left, middle, right test
assertEquals(1, emits.size()); // Match must not be made assertEquals(1, emits.size()); // Match must not be made
checkEmit(emits.iterator().next(), 0, 9, "#sugar-123"); checkEmit(emits.iterator().next(), 0, 9, "#sugar-123");
} }
@Test @Test
public void test_LargeString() { public void testLargeString() {
final int interval = 100; final int interval = 100;
final int textSize = 1000000; final int textSize = 1000000;
final String keyword = FOOD[1]; final String keyword = FOOD[1];
@ -525,51 +455,48 @@ public class TrieTest {
injectKeyword(text, keyword, interval); injectKeyword(text, keyword, interval);
final Trie trie = Trie.builder().onlyWholeWords().addKeyword(keyword).build(); Trie trie = Trie.builder()
.onlyWholeWords()
.addKeyword(keyword)
.build();
final Collection<Emit> emits = trie.parseText(text); final Collection<Emit> emits = trie.parseText(text);
assertEquals(textSize / interval, emits.size()); assertEquals(textSize / interval, emits.size());
} }
@Test(timeout=30_000)
@Test public void testParallelSearch() throws InterruptedException {
public void test_UnicodeIssueBug39ReportedByHumanzz() {
// Problem: "İ".length => 1, "İ".toLowerCase().length => 2. This causes
// all sorts of unexpected behaviors
// and bugs where the Emit will have a size different from the original
// string.
// Soln: As in issue #8, convert at character level Character.toLowerCase
// ('İ') => 'i' + make sure
// that emit gets the properly cased keyword.
final String upperLengthOne = "İnt";
final Trie trie = Trie.builder().ignoreCase().onlyWholeWords().addKeyword(upperLengthOne).build();
final Collection<Emit> emits = trie.parseText("İnt is good");
assertEquals(1, emits.size());
checkEmit(emits.iterator().next(), 0, 2, upperLengthOne);
}
@Test(timeout = 30_000)
public void test_ParallelSearch() throws InterruptedException {
final int interval = 100; final int interval = 100;
final int textSize = 1000000; final int textSize = 1000000;
final String keyword = FOOD[1]; final String keyword = FOOD[1];
final StringBuilder matchingText = randomNumbers(textSize); final StringBuilder matchingText = randomNumbers(textSize);
injectKeyword(matchingText, keyword, interval); injectKeyword(matchingText, keyword, interval);
final StringBuilder nonMatchingText = randomNumbers(textSize); final StringBuilder nonMatchingText = randomNumbers(textSize);
injectKeyword(nonMatchingText, keyword.substring(0, keyword.length() - 1), interval); injectKeyword(nonMatchingText, keyword.substring(0, keyword.length()-1), interval);
final Trie trie = Trie.builder().onlyWholeWords().addKeyword(keyword).build(); final Trie trie = Trie.builder()
.onlyWholeWords()
.addKeyword(keyword)
.build();
final AtomicInteger matchCount = new AtomicInteger(0); final AtomicInteger matchCount = new AtomicInteger(0);
final Runnable matchingTask = () -> matchCount.set(trie.parseText(matchingText).size()); Runnable matchingTask = new Runnable() {
@Override
public void run() {
matchCount.set(trie.parseText(matchingText).size());
}
};
final AtomicInteger nonMatchCount = new AtomicInteger(0); final AtomicInteger nonMatchCount = new AtomicInteger(0);
final Runnable nonMatchingTask = () -> nonMatchCount.set(trie.parseText(nonMatchingText).size()); Runnable nonMatchingTask = new Runnable() {
final Thread matchingThread = new Thread(matchingTask); @Override
final Thread nonMatchingThread = new Thread(nonMatchingTask); public void run() {
nonMatchCount.set(trie.parseText(nonMatchingText).size());
}
};
Thread matchingThread = new Thread(matchingTask);
Thread nonMatchingThread = new Thread(nonMatchingTask);
matchingThread.start(); matchingThread.start();
nonMatchingThread.start(); nonMatchingThread.start();
matchingThread.join(); matchingThread.join();
@ -579,12 +506,47 @@ public class TrieTest {
assertEquals(0, nonMatchCount.get()); assertEquals(0, nonMatchCount.get());
} }
/**
 * Generates a random sequence of ASCII digits.
 *
 * @param count The number of digits to generate; must not be negative.
 * @return A character sequence containing exactly {@code count} random
 * digits, each in the range '0' through '9'.
 */
private StringBuilder randomNumbers(int count) {
    final StringBuilder sb = new StringBuilder(count);

    // Iterate exactly `count` times. The earlier pre-decrement loop
    // (--count > 0) stopped one iteration early and produced count - 1 digits.
    for (int i = 0; i < count; i++) {
        // randomInt's upper bound is exclusive, so each value is one digit.
        sb.append(randomInt(0, 10));
    }

    return sb;
}
/**
 * Overwrites the given text with a keyword at regular offsets.
 * <p>
 * Starting at offset zero and stepping by {@code interval}, the characters
 * at each offset are replaced by {@code keyword}. Offsets are bounded by the
 * length the text had when this method was called. A non-positive interval
 * is not guarded against.
 *
 * @param source Text to modify in place; should consist of filler data that
 * cannot match any keyword.
 * @param keyword The keyword to write into the text repeatedly.
 * @param interval Distance, in characters, between successive injections.
 */
private void injectKeyword(
    final StringBuilder source,
    final String keyword,
    final int interval) {
    // Snapshot the length up front: the bound must not move if a replacement
    // near the end changes the size of the builder.
    final int initialLength = source.length();
    int offset = 0;

    while (offset < initialLength) {
        final int end = offset + keyword.length();
        source.replace(offset, end, keyword);
        offset += interval;
    }
}
/**
 * Returns a pseudorandom integer drawn from the calling thread's
 * {@link ThreadLocalRandom}.
 *
 * @param min The least value that can be returned (inclusive).
 * @param max The upper bound (exclusive).
 * @return A value {@code v} such that {@code min <= v < max}.
 */
private int randomInt(final int min, final int max) {
    final ThreadLocalRandom generator = ThreadLocalRandom.current();
    return generator.nextInt(min, max);
}
private void checkEmit(Emit next, int expectedStart, int expectedEnd, String expectedKeyword) { private void checkEmit(Emit next, int expectedStart, int expectedEnd, String expectedKeyword) {
assertEquals("Start of emit should have been " + expectedStart, expectedStart, next.getStart()); assertEquals("Start of emit should have been " + expectedStart, expectedStart, next.getStart());
assertEquals("End of emit should have been " + expectedEnd, expectedEnd, next.getEnd()); assertEquals("End of emit should have been " + expectedEnd, expectedEnd, next.getEnd());
assertEquals(expectedKeyword, next.getKeyword()); assertEquals(expectedKeyword, next.getKeyword());
} }
} }

View File

@ -0,0 +1,65 @@
package org.ahocorasick.util;
import java.util.ArrayList;
import java.util.List;
import org.ahocorasick.util.ListElementRemoval.RemoveElementPredicate;
import org.junit.Test;
import junit.framework.Assert;
import static java.util.Arrays.asList;
/**
 * Checks {@code ListElementRemoval.removeIf} against predicates that select
 * none, all, and some of the elements of a three-element list.
 */
public class ListElementRemovalTest {

    /** A predicate that never matches must leave all three elements. */
    @Test
    public void removeNone() {
        final List<String> letters = letters();

        ListElementRemoval.removeIf(letters, new RemoveElementPredicate<String>() {
            @Override
            public boolean remove(String element) {
                return false;
            }
        });

        Assert.assertEquals(3, letters.size());
    }

    /** A predicate that always matches must leave the list empty. */
    @Test
    public void removeAll() {
        final List<String> letters = letters();

        ListElementRemoval.removeIf(letters, new RemoveElementPredicate<String>() {
            @Override
            public boolean remove(String element) {
                return true;
            }
        });

        Assert.assertEquals(0, letters.size());
    }

    /** Removing "a" and "c" must leave only "b". */
    @Test
    public void removeSome() {
        final List<String> letters = letters();

        ListElementRemoval.removeIf(letters, new RemoveElementPredicate<String>() {
            @Override
            public boolean remove(String element) {
                return "a".equals(element) || "c".equals(element);
            }
        });

        Assert.assertEquals(1, letters.size());
        Assert.assertEquals("b", letters.get(0));
    }

    /**
     * Builds the fixture shared by every test.
     *
     * @return A new, modifiable list holding "a", "b" and "c" in that order.
     */
    private static List<String> letters() {
        // Copy into an ArrayList: asList alone returns a fixed-size view.
        return new ArrayList<>(asList("a", "b", "c"));
    }
}