Compare commits

..

1 Commits

Author SHA1 Message Date
Dave Jarvis
ea661d5e64 Proof of maintenance 2020-09-25 10:45:58 -07:00
54 changed files with 1152 additions and 1602 deletions

View File

@ -1,21 +0,0 @@
include:
- project: 'gitlab/gitlab'
ref: 'main'
file: 'ci-templates/gradle_java.yml'
deploy:
stage: deploy
tags:
- dind
script:
- echo "Building with gradle version ${BUILDVERSION}"
- gradle -Pversion=${BUILDVERSION} publish
- echo "BUILDVERSION=$BUILDVERSION" >> version.env
artifacts:
reports:
dotenv: version.env
rules:
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
- if: $CI_COMMIT_BRANCH =~ /^release/
- if: $CI_COMMIT_TAG

6
.travis.yml Normal file
View File

@ -0,0 +1,6 @@
language: java
install: mvn install -DskipTests=true -Dgpg.skip=true
jdk:
- openjdk8
after_success:
- bash <(curl -s https://codecov.io/bash)

View File

@ -1,202 +1,237 @@
Aho-Corasick
============
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
[![Build Status](https://travis-ci.org/robert-bor/aho-corasick.svg?branch=master)](https://travis-ci.org/robert-bor/aho-corasick)
[![Codacy Badge](https://api.codacy.com/project/badge/Grade/0f65bfb641f745a4b301b85d028a4a8d)](https://www.codacy.com/app/bor-robert/aho-corasick)
[![Codecov](https://codecov.io/gh/robert-bor/aho-corasick/branch/master/graph/badge.svg)](https://codecov.io/gh/robert-bor/aho-corasick)
[![Maven Central](https://maven-badges.herokuapp.com/maven-central/org.ahocorasick/ahocorasick/badge.svg)](https://maven-badges.herokuapp.com/maven-central/org.ahocorasick/ahocorasick)
[![Javadoc](https://javadoc-emblem.rhcloud.com/doc/org.ahocorasick/ahocorasick/badge.svg)](http://www.javadoc.io/doc/org.ahocorasick/ahocorasick)
[![Apache 2](http://img.shields.io/badge/license-Apache%202-blue.svg)](http://www.apache.org/licenses/LICENSE-2.0)
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
Dependency
----------
1. Definitions.
Include this dependency in your POM. Be sure to check for the latest version in Maven Central.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
```xml
<dependency>
<groupId>org.ahocorasick</groupId>
<artifactId>ahocorasick</artifactId>
<version>0.5.0</version>
</dependency>
```
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
Introduction
------------
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
Most free-text searching is based on Lucene-like approaches, where the
search text is parsed into its various components. For every keyword a
lookup is done to see where it occurs. When looking for a couple of keywords
this approach is great, but when searching for 100,000 words, the approach
is quite slow (for example, checking against a dictionary).
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
The Aho-Corasick algorithm shines when looking for multiple words.
Rather than chop up the search text, it uses all the keywords to build
a [Trie](http://en.wikipedia.org/wiki/Trie) construct. The crucial
Aho-Corasick components include:
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
* goto
* fail
* output
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
Every character encountered is presented to a state object within the
*goto* structure. If there is a matching state, that will be elevated to
the new current state.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
However, if there is no matching state, the algorithm will signal a
*fail* and fall back to states with less depth (i.e., a match less long)
and proceed from there, until it has found a matching state, or it has reached
the root state.
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
Whenever a state is reached that matches an entire keyword, it is
emitted to an *output* set which can be read after the entire scan
has completed.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
The algorithm is O(n). No matter how many keywords are given, or how large
the search text is, the performance will decline linearly.
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
The Aho-Corasick algorithm can help:
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
* find words in texts to link or emphasize them;
* add semantics to plain text; or
* check against a dictionary to see if syntactic errors were made.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
See the [white paper](http://cr.yp.to/bib/1975/aho.pdf) by Aho and
Corasick for algorithmic details.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
Usage
-----
Set up the Trie using a builder as follows:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
```java
Trie trie = Trie.builder()
.addKeyword("hers")
.addKeyword("his")
.addKeyword("she")
.addKeyword("he")
.build();
Collection<Emit> emits = trie.parseText("ushers");
```
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
The collection will contain `Emit` objects that match:
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
* "she" starting at position 1, ending at position 3
* "he" starting at position 2, ending at position 3
* "hers" starting at position 2, ending at position 5
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
In situations where overlapping instances are not desired, retain
the longest and left-most matches by calling `ignoreOverlaps()`:
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
```java
Trie trie = Trie.builder()
.ignoreOverlaps()
.addKeyword("hot")
.addKeyword("hot chocolate")
.build();
Collection<Emit> emits = trie.parseText("hot chocolate");
```
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
The `ignoreOverlaps()` method tells the Trie to remove all overlapping
matches. For this it relies on the following conflict resolution rules:
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
1. longer matches prevail over shorter matches; and
1. left-most prevails over right-most.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
Only one result is returned:
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
* "hot chocolate" starting at position 0, ending at position 12
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
To check for whole words exclusively, call `onlyWholeWords()` as follows:
END OF TERMS AND CONDITIONS
```java
Trie trie = Trie.builder()
.onlyWholeWords()
.addKeyword("sugar")
.build();
Collection<Emit> emits = trie.parseText("sugarcane sugar canesugar");
```
APPENDIX: How to apply the Apache License to your work.
Only one match is found; whereas, without calling `onlyWholeWords()` three
matches are found. The sugarcane/canesugar words are discarded because
they are partial matches.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Some text is `WrItTeN` in mixed case, which makes it hard to identify.
Instruct the Trie to convert the search text to lowercase to ease the
matching process. The lower-casing applies to keywords as well.
Copyright 2018 Robert Bor
```java
Trie trie = Trie.builder()
.ignoreCase()
.addKeyword("casing")
.build();
Collection<Emit> emits = trie.parseText("CaSiNg");
```
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
Normally, this match would not be found. By calling `ignoreCase()`,
the entire search text is made lowercase before matching begins.
Therefore it will find exactly one match.
http://www.apache.org/licenses/LICENSE-2.0
It is also possible to just ask whether the text matches any of
the keywords, or just to return the first match it finds.
```java
Trie trie = Trie.builder().ignoreOverlaps()
.addKeyword("ab")
.addKeyword("cba")
.addKeyword("ababc")
.build();
Emit firstMatch = trie.firstMatch("ababcbab");
```
The value for `firstMatch` will be "ababc" from position 0. The
`containsMatch()` method checks whether `firstMatch` found a match and
returns `true` if that is the case.
For a barebones Aho-Corasick algorithm with a custom emit handler use:
```java
Trie trie = Trie.builder()
.addKeyword("hers")
.addKeyword("his")
.addKeyword("she")
.addKeyword("he")
.build();
final List<Emit> emits = new ArrayList<>();
EmitHandler emitHandler = new EmitHandler() {
@Override
public void emit(Emit emit) {
emits.add(emit);
}
};
```
In many cases you may want to perform tasks with both the non-matching
and the matching text. Such implementations may be better served by using
`Trie.tokenize()`. The `tokenize()` method allows looping over the
corpus to deal with matches as soon as they are encountered. Here's an
example that outputs key words as italicized HTML elements:
```java
String speech = "The Answer to the Great Question... Of Life, " +
"the Universe and Everything... Is... Forty-two,' said " +
"Deep Thought, with infinite majesty and calm.";
Trie trie = Trie.builder().ignoreOverlaps().onlyWholeWords().ignoreCase()
.addKeyword("great question")
.addKeyword("forty-two")
.addKeyword("deep thought")
.build();
Collection<Token> tokens = trie.tokenize(speech);
StringBuilder html = new StringBuilder();
html.append("<html><body><p>");
for (Token token : tokens) {
if (token.isMatch()) {
html.append("<i>");
}
html.append(token.getFragment());
if (token.isMatch()) {
html.append("</i>");
}
}
html.append("</p></body></html>");
System.out.println(html);
```
You can also emit custom outputs. This might for example be useful to
implement a trivial named entity recognizer. In this case use a
`PayloadTrie` instead of a `Trie` as follows:
```java
class Word {
private final String gender;
public Word(String gender) {
this.gender = gender;
}
}
PayloadTrie<Word> trie = PayloadTrie.<Word>builder()
.addKeyword("hers", new Word("f"))
.addKeyword("his", new Word("m"))
.addKeyword("she", new Word("f"))
.addKeyword("he", new Word("m"))
.addKeyword("nonbinary", new Word("nb"))
.addKeyword("transgender", new Word("tg"))
.build();
Collection<PayloadEmit<Word>> emits = trie.parseText("ushers");
```
Releases
--------
See [releases](https://github.com/robert-bor/aho-corasick/releases) for details.
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

2
OSSRH-60885.txt Normal file
View File

@ -0,0 +1,2 @@
Dave Jarvis is maintaining the code; please grant him the same permissions as
Robert Bor.

View File

@ -2,9 +2,10 @@ Aho-Corasick
============
[![Build Status](https://travis-ci.org/robert-bor/aho-corasick.svg?branch=master)](https://travis-ci.org/robert-bor/aho-corasick)
[![Codacy Badge](https://api.codacy.com/project/badge/Grade/0f65bfb641f745a4b301b85d028a4a8d)](https://www.codacy.com/app/bor-robert/aho-corasick)
[![Codecov](https://codecov.io/gh/robert-bor/aho-corasick/branch/master/graph/badge.svg)](https://codecov.io/gh/robert-bor/aho-corasick)
[![Maven Central](https://maven-badges.herokuapp.com/maven-central/org.ahocorasick/ahocorasick/badge.svg)](https://maven-badges.herokuapp.com/maven-central/org.ahocorasick/ahocorasick)
[![Javadoc](https://javadoc.io/badge2/org.ahocorasick/ahocorasick/javadoc.svg)](https://javadoc.io/doc/org.ahocorasick/ahocorasick)
[![Javadoc](https://javadoc-emblem.rhcloud.com/doc/org.ahocorasick/ahocorasick/badge.svg)](http://www.javadoc.io/doc/org.ahocorasick/ahocorasick)
[![Apache 2](http://img.shields.io/badge/license-Apache%202-blue.svg)](http://www.apache.org/licenses/LICENSE-2.0)
Dependency
@ -16,7 +17,7 @@ Include this dependency in your POM. Be sure to check for the latest version in
<dependency>
<groupId>org.ahocorasick</groupId>
<artifactId>ahocorasick</artifactId>
<version>0.6.3</version>
<version>0.4.0</version>
</dependency>
```
@ -115,7 +116,7 @@ Trie trie = Trie.builder()
Collection<Emit> emits = trie.parseText("sugarcane sugar canesugar");
```
Only one match is found; whereas, without calling `onlyWholeWords()` three
Only one match is found; whereas, without calling `onlyWholeWords()` four
matches are found. The sugarcane/canesugar words are discarded because
they are partial matches.
@ -219,7 +220,7 @@ class Word {
}
PayloadTrie<Word> trie = PayloadTrie.<Word>builder()
.addKeyword("hers", new Word("f"))
.addKeyword("hers", new Word("f")
.addKeyword("his", new Word("m"))
.addKeyword("she", new Word("f"))
.addKeyword("he", new Word("m"))

View File

@ -1,72 +0,0 @@
plugins {
`java-library`
`maven-publish`
pmd
checkstyle
id("io.freefair.lombok") version "8.4"
}
repositories {
mavenLocal()
maven {
url = uri("https://nexus.knecon.com/repository/gindev/")
credentials {
username = providers.gradleProperty("mavenUser").getOrNull();
password = providers.gradleProperty("mavenPassword").getOrNull();
}
}
maven {
url = uri("https://repo.maven.apache.org/maven2/")
}
}
dependencies {
testImplementation("junit:junit:4.13.2")
}
group = "org.ahocorasick"
description = "Aho-CoraSick algorithm for efficient string matching"
java.sourceCompatibility = JavaVersion.VERSION_17
java.targetCompatibility = JavaVersion.VERSION_17
java {
withSourcesJar()
withJavadocJar()
}
publishing {
publications.create<MavenPublication>("maven") {
from(components["java"])
}
repositories {
maven {
url = uri("https://nexus.knecon.com/repository/red-platform-releases/")
credentials {
username = providers.gradleProperty("mavenUser").getOrNull();
password = providers.gradleProperty("mavenPassword").getOrNull();
}
}
}
}
tasks.withType<JavaCompile>() {
options.encoding = "UTF-8"
}
tasks.withType<Javadoc>() {
options.encoding = "UTF-8"
}
pmd {
isConsoleOutput = true
}
tasks.pmdMain {
pmd.ruleSetFiles = files("${rootDir}/config/pmd/pmd.xml")
}
tasks.pmdTest {
pmd.ruleSetFiles = files("${rootDir}/config/pmd/test_pmd.xml")
}

View File

@ -1,38 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE module PUBLIC "-//Puppy Crawl//DTD Check Configuration 1.3//EN"
"http://www.puppycrawl.com/dtds/configuration_1_3.dtd">
<module name="Checker">
<property
name="severity"
value="error"/>
<module name="TreeWalker">
<module name="SuppressWarningsHolder"/>
<module name="MissingDeprecated"/>
<module name="MissingOverride"/>
<module name="AnnotationLocation"/>
<module name="NonEmptyAtclauseDescription"/>
<module name="IllegalImport"/>
<module name="RedundantImport"/>
<module name="RedundantModifier"/>
<module name="EmptyBlock"/>
<module name="DefaultComesLast"/>
<module name="EmptyStatement"/>
<module name="EqualsHashCode"/>
<module name="ExplicitInitialization"/>
<module name="IllegalInstantiation"/>
<module name="ModifiedControlVariable"/>
<module name="MultipleVariableDeclarations"/>
<module name="PackageDeclaration"/>
<module name="ParameterAssignment"/>
<module name="SimplifyBooleanExpression"/>
<module name="SimplifyBooleanReturn"/>
<module name="StringLiteralEquality"/>
<module name="OneStatementPerLine"/>
<module name="FinalClass"/>
<module name="ArrayTypeStyle"/>
<module name="UpperEll"/>
<module name="OuterTypeFilename"/>
</module>
<module name="FileTabCharacter"/>
<module name="SuppressWarningsFilter"/>
</module>

View File

@ -1,21 +0,0 @@
<?xml version="1.0"?>
<ruleset name="Custom ruleset"
xmlns="http://pmd.sourceforge.net/ruleset/2.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://pmd.sourceforge.net/ruleset/2.0.0 http://pmd.sourceforge.net/ruleset_2_0_0.xsd">
<description>
Knecon ruleset checks the code for bad stuff
</description>
<rule ref="category/java/errorprone.xml">
<exclude name="MissingSerialVersionUID"/>
<exclude name="AvoidLiteralsInIfCondition"/>
<exclude name="DataflowAnomalyAnalysis"/>
<exclude name="AvoidDuplicateLiterals"/>
<exclude name="NullAssignment"/>
<exclude name="AssignmentInOperand"/>
<exclude name="BeanMembersShouldSerialize"/>
</rule>
</ruleset>

View File

@ -1,11 +0,0 @@
<?xml version="1.0"?>
<ruleset name="Custom ruleset"
xmlns="http://pmd.sourceforge.net/ruleset/2.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://pmd.sourceforge.net/ruleset/2.0.0 http://pmd.sourceforge.net/ruleset_2_0_0.xsd">
<description>
Knecon test ruleset checks the code for bad stuff
</description>
</ruleset>

View File

@ -1 +0,0 @@
version = 0.7-SNAPSHOT

179
pom.xml Normal file
View File

@ -0,0 +1,179 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.ahocorasick</groupId>
<artifactId>ahocorasick</artifactId>
<version>0.6.1</version>
<packaging>jar</packaging>
<name>Aho-CoraSick algorithm for efficient string matching</name>
<description>Java library for efficient string matching against a large set of keywords</description>
<inceptionYear>2014</inceptionYear>
<url>https://github.com/robert-bor/aho-corasick</url>
<distributionManagement>
<snapshotRepository>
<id>ossrh</id>
<url>https://oss.sonatype.org/content/repositories/snapshots</url>
</snapshotRepository>
<repository>
<id>ossrh</id>
<url>https://oss.sonatype.org/service/local/staging/deploy/maven2/</url>
</repository>
</distributionManagement>
<organization>
<name>42 BV</name>
<url>http://blog.42.nl/</url>
</organization>
<licenses>
<license>
<name>The Apache Software License, Version 2.0</name>
<url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
<distribution>repo</distribution>
</license>
</licenses>
<scm>
<url>scm:git://github.com/robert-bor/aho-corasick</url>
<connection>scm:git://github.com/robert-bor/aho-corasick</connection>
</scm>
<developers>
<developer>
<name>Robert Bor</name>
<organization>42</organization>
</developer>
<developer>
<name>Daniel Beck</name>
<organization>neoSearch UG (haftungsbeschränkt)</organization>
</developer>
<developer>
<name>Dave Jarvis</name>
<organization>White Magic Software, Ltd.</organization>
</developer>
</developers>
<properties>
<java.version>1.8</java.version>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<junit.version>4.10</junit.version>
<!-- Reporting -->
<maven.cobertura.version>2.5.2</maven.cobertura.version>
<maven.javadoc.version>2.8</maven.javadoc.version>
<maven.project.version>2.4</maven.project.version>
<maven.site.plugin.version>3.3</maven.site.plugin.version>
</properties>
<dependencies>
<!-- Used for unit testing -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit-dep</artifactId>
<version>${junit.version}</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<defaultGoal>install</defaultGoal>
<plugins>
<plugin>
<groupId>org.sonatype.plugins</groupId>
<artifactId>nexus-staging-maven-plugin</artifactId>
<version>1.6.7</version>
<extensions>true</extensions>
<configuration>
<serverId>ossrh</serverId>
<nexusUrl>https://oss.sonatype.org/</nexusUrl>
<autoReleaseAfterClose>false</autoReleaseAfterClose>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.6.0</version>
<configuration>
<source>${java.version}</source>
<target>${java.version}</target>
<encoding>${project.build.sourceEncoding}</encoding>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.9.1</version>
<executions>
<execution>
<id>attach-javadocs</id>
<goals>
<goal>jar</goal>
</goals>
<configuration>
<source>8</source>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<version>2.2.1</version>
<executions>
<execution>
<id>attach-sources</id>
<goals>
<goal>jar-no-fork</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-gpg-plugin</artifactId>
<version>1.5</version>
<executions>
<execution>
<id>sign-artifacts</id>
<phase>verify</phase>
<goals>
<goal>sign</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.jacoco</groupId>
<artifactId>jacoco-maven-plugin</artifactId>
<version>0.8.5</version>
<executions>
<execution>
<goals>
<goal>prepare-agent</goal>
</goals>
</execution>
<execution>
<id>report</id>
<phase>test</phase>
<goals>
<goal>report</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

View File

@ -1 +0,0 @@
rootProject.name = "ahocorasick"

View File

@ -1,18 +1,10 @@
package org.ahocorasick.interval;
import org.ahocorasick.trie.Emit;
import org.ahocorasick.trie.PayloadEmit;
/**
* Responsible for tracking the start and end bounds, which are reused by
* both {@link Emit} and {@link PayloadEmit}.
*/
public class Interval implements Intervalable {
private final int start;
private final int end;
/**
* Constructs an interval with a start and end position.
*
@ -20,12 +12,10 @@ public class Interval implements Intervalable {
* @param end The interval's ending text position.
*/
public Interval(final int start, final int end) {
this.start = start;
this.end = end;
}
/**
* Returns the starting offset into the text for this interval.
*
@ -33,11 +23,9 @@ public class Interval implements Intervalable {
*/
@Override
public int getStart() {
return this.start;
}
/**
* Returns the ending offset into the text for this interval.
*
@ -45,11 +33,9 @@ public class Interval implements Intervalable {
*/
@Override
public int getEnd() {
return this.end;
}
/**
* Returns the length of the interval.
*
@ -57,11 +43,9 @@ public class Interval implements Intervalable {
*/
@Override
public int size() {
return end - start + 1;
}
/**
* Answers whether the given interval overlaps this interval
* instance.
@ -70,38 +54,31 @@ public class Interval implements Intervalable {
* @return true The intervals overlap.
*/
public boolean overlapsWith(final Interval other) {
return this.start <= other.getEnd() && this.end >= other.getStart();
return this.start <= other.getEnd() &&
this.end >= other.getStart();
}
/**
* Answers whether the given point lies within this interval, with both
* the start and end bounds treated as inclusive.
*
* @param point The text position to test against this interval.
* @return true The point is on or between the start and end offsets.
*/
public boolean overlapsWith(int point) {
return this.start <= point && point <= this.end;
}
@Override
public boolean equals(Object o) {
if (!(o instanceof Intervalable)) {
return false;
}
Intervalable other = (Intervalable) o;
return this.start == other.getStart() && this.end == other.getEnd();
return this.start == other.getStart() &&
this.end == other.getEnd();
}
/**
* Computes a hash code from the start and end offsets, each reduced
* modulo 100 before being summed.
*
* @return The sum of the two reduced offsets.
*/
@Override
public int hashCode() {
return this.start % 100 + this.end % 100;
}
@Override
public int compareTo(Object o) {
if (!(o instanceof Intervalable)) {
return -1;
}
@ -110,7 +87,6 @@ public class Interval implements Intervalable {
return comparison != 0 ? comparison : this.end - other.getEnd();
}
/**
* Returns the starting offset and ending offset separated
* by a full colon (:).
@ -119,8 +95,6 @@ public class Interval implements Intervalable {
*/
@Override
public String toString() {
return this.start + ":" + this.end;
}
}

View File

@ -6,19 +6,14 @@ import java.util.List;
public class IntervalNode {
private enum Direction {
LEFT,
RIGHT
}
private enum Direction {LEFT, RIGHT}
private IntervalNode left;
private IntervalNode right;
private int point;
private List<Intervalable> intervals = new ArrayList<>();
public IntervalNode(final List<Intervalable> intervals) {
this.point = determineMedian(intervals);
final List<Intervalable> toLeft = new ArrayList<>();
@ -42,9 +37,7 @@ public class IntervalNode {
}
}
private int determineMedian(final List<Intervalable> intervals) {
public int determineMedian(final List<Intervalable> intervals) {
int start = -1;
int end = -1;
for (Intervalable interval : intervals) {
@ -60,9 +53,7 @@ public class IntervalNode {
return (start + end) / 2;
}
public List<Intervalable> findOverlaps(final Intervalable interval) {
final List<Intervalable> overlaps = new ArrayList<>();
if (this.point < interval.getStart()) {
@ -83,9 +74,10 @@ public class IntervalNode {
return overlaps;
}
protected void addToOverlaps(final Intervalable interval, final List<Intervalable> overlaps, final List<Intervalable> newOverlaps) {
protected void addToOverlaps(
final Intervalable interval,
final List<Intervalable> overlaps,
final List<Intervalable> newOverlaps) {
for (final Intervalable currentInterval : newOverlaps) {
if (!currentInterval.equals(interval)) {
overlaps.add(currentInterval);
@ -93,21 +85,16 @@ public class IntervalNode {
}
}
/**
* Convenience wrapper that runs {@code checkForOverlaps} for the given
* interval with {@code Direction.LEFT}.
*
* @param interval The interval to test against.
* @return The list produced by {@code checkForOverlaps}.
*/
protected List<Intervalable> checkForOverlapsToTheLeft(final Intervalable interval) {
return checkForOverlaps(interval, Direction.LEFT);
}
/**
* Convenience wrapper that runs {@code checkForOverlaps} for the given
* interval with {@code Direction.RIGHT}.
*
* @param interval The interval to test against.
* @return The list produced by {@code checkForOverlaps}.
*/
protected List<Intervalable> checkForOverlapsToTheRight(final Intervalable interval) {
return checkForOverlaps(interval, Direction.RIGHT);
}
protected List<Intervalable> checkForOverlaps(final Intervalable interval, final Direction direction) {
protected List<Intervalable> checkForOverlaps(
final Intervalable interval, final Direction direction) {
final List<Intervalable> overlaps = new ArrayList<>();
for (final Intervalable currentInterval : this.intervals) {
@ -128,10 +115,9 @@ public class IntervalNode {
return overlaps;
}
protected List<Intervalable> findOverlappingRanges(IntervalNode node, Intervalable interval) {
return node == null ? Collections.<Intervalable>emptyList() : node.findOverlaps(interval);
return node == null
? Collections.<Intervalable>emptyList()
: node.findOverlaps(interval);
}
}

View File

@ -10,13 +10,10 @@ public class IntervalTree {
private final IntervalNode rootNode;
/**
* Builds the tree by constructing its root {@code IntervalNode} from the
* supplied intervals.
*
* @param intervals The intervals handed to the root node.
*/
public IntervalTree(List<Intervalable> intervals) {
this.rootNode = new IntervalNode(intervals);
}
public List<Intervalable> removeOverlaps(final List<Intervalable> intervals) {
// Sort the intervals on size, then left-most position
@ -45,9 +42,7 @@ public class IntervalTree {
return intervals;
}
/**
* Finds the intervals overlapping the given interval by delegating to the
* root node's {@code findOverlaps}.
*
* @param interval The interval to test against.
* @return The overlaps reported by the root node.
*/
public List<Intervalable> findOverlaps(final Intervalable interval) {
return rootNode.findOverlaps(interval);
}

View File

@ -2,12 +2,10 @@ package org.ahocorasick.interval;
public interface Intervalable extends Comparable {
int getStart();
int getStart();
int getEnd();
int getEnd();
int size();
int size();
}

View File

@ -6,7 +6,6 @@ public class IntervalableComparatorByPosition implements Comparator<Intervalable
/**
 * Orders two intervals by ascending start offset.
 *
 * @param intervalable  The first interval.
 * @param intervalable2 The second interval.
 * @return A negative value, zero, or a positive value when the first
 *         interval starts before, at, or after the second one.
 */
@Override
public int compare(final Intervalable intervalable, final Intervalable intervalable2) {
    // Integer.compare cannot overflow, unlike subtracting the two offsets,
    // which yields the wrong sign for operands that are far apart.
    return Integer.compare(intervalable.getStart(), intervalable2.getStart());
}

View File

@ -6,7 +6,6 @@ public class IntervalableComparatorBySize implements Comparator<Intervalable> {
@Override
public int compare(final Intervalable intervalable, final Intervalable intervalable2) {
int comparison = intervalable2.size() - intervalable.size();
if (comparison == 0) {

View File

@ -4,22 +4,16 @@ public class DefaultToken extends Token {
private PayloadToken<String> payloadToken;
public DefaultToken(PayloadToken<String> payloadToken) {
super(payloadToken.getFragment());
this.payloadToken = payloadToken;
}
public boolean isMatch() {
return payloadToken.isMatch();
}
/**
 * Converts the wrapped payload emit into a plain {@link Emit}.
 *
 * @return The emit carrying the start, end and keyword of the wrapped
 *         payload emit, or {@code null} when the wrapped token has no
 *         emit (as is the case for fragment tokens).
 */
@Override
public Emit getEmit() {
    final PayloadEmit<String> emit = payloadToken.getEmit();
    // Fragment tokens return null from getEmit(); propagate that instead
    // of dereferencing it and failing with a NullPointerException.
    if (emit == null) {
        return null;
    }
    return new Emit(emit.getStart(), emit.getEnd(), emit.getKeyword());
}

View File

@ -3,30 +3,20 @@ package org.ahocorasick.trie;
import org.ahocorasick.interval.Interval;
import org.ahocorasick.interval.Intervalable;
/**
* Responsible for tracking the bounds of matched terms.
*/
public class Emit extends Interval implements Intervalable {
private final String keyword;
public Emit(final int start, final int end, final String keyword) {
public Emit(final int start, final int end, String keyword) {
super(start, end);
this.keyword = keyword;
}
public String getKeyword() {
return this.keyword;
}
@Override
public String toString() {
return super.toString() + "=" + this.keyword;
}

View File

@ -3,21 +3,16 @@ package org.ahocorasick.trie;
public class FragmentToken extends Token {
public FragmentToken(String fragment) {
super(fragment);
}
@Override
public boolean isMatch() {
return false;
}
@Override
public Emit getEmit() {
return null;
}

View File

@ -4,26 +4,19 @@ public class MatchToken extends Token {
private final Emit emit;
public MatchToken(final String fragment, final Emit emit) {
super(fragment);
this.emit = emit;
}
@Override
public boolean isMatch() {
return true;
}
@Override
public Emit getEmit() {
return this.emit;
}
}

View File

@ -1,21 +1,32 @@
package org.ahocorasick.trie;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
/**
* Contains the matched keyword and some payload data.
*
* @param <T> The type of the wrapped payload data.
*
* @author Daniel Beck
* @param <T> The type of the wrapped payload data.
*/
@Getter
@EqualsAndHashCode
@RequiredArgsConstructor
public class Payload<T> {
public class Payload<T> implements Comparable<Payload<T>> {
private final String keyword;
private final T data;
public Payload(final String keyword, final T data) {
super();
this.keyword = keyword;
this.data = data;
}
public String getKeyword() {
return keyword;
}
public T getData() {
return data;
}
@Override
public int compareTo(Payload<T> other) {
return keyword.compareTo(other.getKeyword());
}
}

View File

@ -5,7 +5,7 @@ import org.ahocorasick.interval.Intervalable;
/**
* Contains a matched term and its associated payload data.
*
*
* @param <T> Type of the wrapped payload-data.
* @author Daniel Beck
*/
@ -15,44 +15,35 @@ public class PayloadEmit<T> extends Interval implements Intervalable {
private final T payload;
/**
* Creates a PayloadEmit.
*
*
* @param start Start of the matched search term.
* @param end End of the matched search term.
* @param keyword Keyword that matched.
* @param payload Emitted payload data.
*/
public PayloadEmit(final int start, final int end, String keyword, T payload) {
super(start, end);
this.keyword = keyword;
this.payload = payload;
}
public String getKeyword() {
return this.keyword;
}
/**
* Returns the payload associated to this emit.
*
*
* @return the associated payload
*/
public T getPayload() {
return this.payload;
}
@Override
public String toString() {
    // Layout: "<interval>=<keyword>", followed by "-><payload>" only when
    // a payload is present.
    final StringBuilder rendered = new StringBuilder();
    rendered.append(super.toString()).append("=").append(this.keyword);
    if (this.payload != null) {
        rendered.append("->").append(this.payload);
    }
    return rendered.toString();
}
}

View File

@ -6,7 +6,7 @@ package org.ahocorasick.trie;
* This token indicates a matching search term was not found, so
* {@link #isMatch()} always returns {@code false}.
* </p>
*
*
* @author Daniel Beck
*
* @param <T> The Type of the emitted payloads.
@ -14,25 +14,19 @@ package org.ahocorasick.trie;
public class PayloadFragmentToken<T> extends PayloadToken<T> {
public PayloadFragmentToken(String fragment) {
super(fragment);
}
@Override
public boolean isMatch() {
return false;
}
/**
* Returns null.
*/
@Override
public PayloadEmit<T> getEmit() {
return null;
}
}

View File

@ -6,33 +6,27 @@ package org.ahocorasick.trie;
* This token indicates a matching search term was found, so {@link #isMatch()}
* always returns {@code true}.
* </p>
*
* @author Daniel Beck
*
* @param <T> The Type of the emitted payloads.
* @author Daniel Beck
*/
public class PayloadMatchToken<T> extends PayloadToken<T> {
private final PayloadEmit<T> emit;
public PayloadMatchToken(final String fragment, final PayloadEmit<T> emit) {
super(fragment);
this.emit = emit;
}
@Override
public boolean isMatch() {
return true;
}
@Override
public PayloadEmit<T> getEmit() {
return this.emit;
}
}

View File

@ -1,10 +1,6 @@
package org.ahocorasick.trie;
import java.util.*;
import java.util.stream.Collectors;
import lombok.Getter;
import lombok.Setter;
/**
* <p>
@ -31,14 +27,13 @@ import lombok.Setter;
public class PayloadState<T> {
/**
* effective the size of the keyword.
* effective the size of the keyword
*/
@Getter
private final int depth;
/**
* only used for the root state to refer to itself in case no matches have been
* found.
* found
*/
private final PayloadState<T> rootState;
@ -49,34 +44,26 @@ public class PayloadState<T> {
private final Map<Character, PayloadState<T>> success = new HashMap<>();
/**
* if no matching states are found, the failure state will be returned.
* if no matching states are found, the failure state will be returned
*/
@Getter
@Setter
private PayloadState<T> failure;
/**
* whenever this state is reached, it will emit the matches keywords for future
* reference.
* reference
*/
private Set<Payload<T>> emits;
public PayloadState() {
this(0);
}
public PayloadState(final int depth) {
    this.depth = depth;
    // Only the state at depth zero keeps a reference to itself as the
    // root; every deeper state leaves the reference unset.
    if (depth == 0) {
        this.rootState = this;
    } else {
        this.rootState = null;
    }
}
private PayloadState<T> nextState(final Character character, final boolean ignoreRootState) {
PayloadState<T> nextState = this.success.get(character);
if (!ignoreRootState && nextState == null && this.rootState != null) {
@ -86,21 +73,15 @@ public class PayloadState<T> {
return nextState;
}
/**
* Looks up the state reached from this state on the given character by
* calling the private two-argument overload with {@code ignoreRootState}
* set to {@code false}, which leaves that overload's root-state handling
* enabled.
* NOTE(review): the root-state branch of the overload is not fully visible
* here; presumably it falls back to the root state itself when no
* transition exists - confirm.
*
* @param character The transition character.
* @return The result of the two-argument overload.
*/
public PayloadState<T> nextState(final Character character) {
return nextState(character, false);
}
/**
* Looks up the state reached from this state on the given character by
* calling the private two-argument overload with {@code ignoreRootState}
* set to {@code true}, which skips that overload's root-state handling.
*
* @param character The transition character.
* @return The result of the two-argument overload; {@code null} when this
* state has no transition stored for the character.
*/
public PayloadState<T> nextStateIgnoreRootState(Character character) {
return nextState(character, true);
}
public PayloadState<T> addState(Character character) {
PayloadState<T> nextState = nextStateIgnoreRootState(character);
if (nextState == null) {
nextState = new PayloadState<>(this.depth + 1);
@ -109,56 +90,55 @@ public class PayloadState<T> {
return nextState;
}
public int getDepth() {
return this.depth;
}
/**
* Adds a payload to be emitted for this state.
*
*
* @param payload to be emitted.
*/
public void addEmit(Payload<T> payload) {
if (this.emits == null) {
this.emits = new HashSet<>();
this.emits = new TreeSet<>();
}
this.emits.add(payload);
}
/**
* Adds a collection of payloads to be emitted for this state.
*
*
* @param emits Collection of payloads to be emitted.
*/
public void addEmit(Collection<Payload<T>> emits) {
    // Route each element through the single-payload overload so the
    // backing set is created and filled in one place.
    final Iterator<Payload<T>> cursor = emits.iterator();
    while (cursor.hasNext()) {
        addEmit(cursor.next());
    }
}
/**
* Returns a collection of emitted payloads for this state.
*
*
* @return Collection of emitted payloads.
*/
public Collection<Payload<T>> emit() {
return this.emits == null ? Collections.<Payload<T>>emptyList() : this.emits.stream()
.sorted(Comparator.comparing(Payload::getKeyword))
.collect(Collectors.toList());
return this.emits == null ? Collections.<Payload<T>>emptyList() : this.emits;
}
public PayloadState<T> failure() {
return this.failure;
}
public void setFailure(PayloadState<T> failState) {
this.failure = failState;
}
/**
* Returns the states directly reachable from this state, i.e. the values
* of the {@code success} transition map. The collection is the map's own
* value view, not a copy.
*
* @return The successor states of this state.
*/
public Collection<PayloadState<T>> getStates() {
return this.success.values();
}
/**
* Returns the characters for which this state has an outgoing transition,
* i.e. the keys of the {@code success} transition map. The collection is
* the map's own key view, not a copy.
*
* @return The transition characters of this state.
*/
public Collection<Character> getTransitions() {
return this.success.keySet();
}
}

View File

@ -9,33 +9,24 @@ package org.ahocorasick.trie;
* @param <T> The Type of the emitted payloads.
*/
public abstract class PayloadToken<T> {

    /** The piece of text this token covers; assigned once at construction. */
    private final String fragment;

    /**
     * Creates a token for the given piece of text.
     *
     * @param fragment The text covered by this token.
     */
    public PayloadToken(String fragment) {
        this.fragment = fragment;
    }

    /**
     * Returns the text covered by this token.
     *
     * @return The fragment supplied at construction.
     */
    public String getFragment() {
        return this.fragment;
    }

    /**
     * Return {@code true} if a search term matched.
     *
     * @return {@code true} if this is a match
     */
    public abstract boolean isMatch();

    /**
     * Returns the emit describing the match behind this token.
     *
     * @return the emit, or {@code null} for tokens that do not represent a
     *         match
     */
    public abstract PayloadEmit<T> getEmit();
}

View File

@ -1,10 +1,8 @@
package org.ahocorasick.trie;
import static java.lang.Character.isWhitespace;
import static java.lang.Character.toLowerCase;
import java.util.Deque;
import java.util.LinkedList;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Queue;
@ -15,6 +13,8 @@ import org.ahocorasick.interval.Intervalable;
import org.ahocorasick.trie.handler.DefaultPayloadEmitHandler;
import org.ahocorasick.trie.handler.PayloadEmitHandler;
import org.ahocorasick.trie.handler.StatefulPayloadEmitHandler;
import org.ahocorasick.util.ListElementRemoval;
import org.ahocorasick.util.ListElementRemoval.RemoveElementPredicate;
/**
* A trie implementation that carries a payload. See {@link Trie} for
@ -24,9 +24,9 @@ import org.ahocorasick.trie.handler.StatefulPayloadEmitHandler;
* The payload trie adds the possibility to specify emitted payloads for each
* added keyword.
* </p>
*
* @param <T> The type of the supplied of the payload.
*
* @author Daniel Beck
* @param <T> The type of the supplied of the payload.
*/
public class PayloadTrie<T> {
@ -34,23 +34,19 @@ public class PayloadTrie<T> {
private final PayloadState<T> rootState;
protected PayloadTrie(final TrieConfig trieConfig) {
this.trieConfig = trieConfig;
this.rootState = new PayloadState<>();
}
/**
* Used by the builder to add a text search keyword with an emit payload.
* Used by the builder to add a text search keyword with a emit payload.
*
* @param keyword The search term to add to the list of search terms.
* @param emit the payload to emit for this search term.
* @throws NullPointerException if the keyword is null.
*/
private void addKeyword(String keyword, T emit) {
if (keyword.isEmpty()) {
return;
}
@ -58,7 +54,6 @@ public class PayloadTrie<T> {
addState(keyword).addEmit(new Payload<>(keyword, emit));
}
/**
* Used by the builder to add a text search keyword.
*
@ -66,7 +61,6 @@ public class PayloadTrie<T> {
* @throws NullPointerException if the keyword is null.
*/
private void addKeyword(String keyword) {
if (keyword.isEmpty()) {
return;
}
@ -74,36 +68,29 @@ public class PayloadTrie<T> {
addState(keyword).addEmit(new Payload<>(keyword, null));
}
private PayloadState<T> addState(final String keyword) {
    // Walk the trie from the root, one keyword character at a time, letting
    // each state create or reuse the successor for that character.
    PayloadState<T> cursor = getRootState();
    for (int index = 0; index < keyword.length(); index++) {
        final char raw = keyword.charAt(index);
        final boolean skipped = isIgnoreWhiteSpace() && isWhitespace(raw);
        if (!skipped) {
            // Case-insensitive tries store the lower-cased character.
            final Character adjusted = isCaseInsensitive() ? Character.toLowerCase(raw) : raw;
            cursor = cursor.addState(adjusted);
        }
    }
    return cursor;
}
/**
* Tokenizes the specified text and returns the emitted outputs.
*
*
* @param text The text to tokenize.
* @return the emitted outputs
*/
public Collection<PayloadToken<T>> tokenize(final String text) {
final Collection<PayloadToken<T>> tokens = new LinkedList<>();
final Collection<PayloadToken<T>> tokens = new ArrayList<>();
final Collection<PayloadEmit<T>> collectedEmits = parseText(text);
int lastCollectedPosition = -1;
for (final PayloadEmit<T> emit : collectedEmits) {
if (emit.getStart() - lastCollectedPosition > 1) {
tokens.add(createFragment(emit, text, lastCollectedPosition));
tokens.add( createFragment( emit, text, lastCollectedPosition) );
}
tokens.add(createMatch(emit, text));
@ -111,52 +98,56 @@ public class PayloadTrie<T> {
}
if (text.length() - lastCollectedPosition > 1) {
tokens.add(createFragment(null, text, lastCollectedPosition));
tokens.add( createFragment( null, text, lastCollectedPosition) );
}
return tokens;
}
private PayloadToken<T> createFragment(final PayloadEmit<T> emit, final String text, final int lastCollectedPosition) {
return new PayloadFragmentToken<>(text.substring(lastCollectedPosition + 1, emit == null ? text.length() : emit.getStart()));
return new PayloadFragmentToken<>(
text.substring( lastCollectedPosition + 1,
emit == null ? text.length() : emit.getStart() ) );
}
private PayloadToken<T> createMatch(PayloadEmit<T> emit, String text) {
return new PayloadMatchToken<>(text.substring(emit.getStart(), emit.getEnd() + 1), emit);
return new PayloadMatchToken<>( text.substring( emit.getStart(),
emit.getEnd() + 1 ),
emit );
}
/**
* Tokenizes a specified text and returns the emitted outputs.
*
*
* @param text The character sequence to tokenize.
* @return A collection of emits.
*/
public Collection<PayloadEmit<T>> parseText(final CharSequence text) {
return parseText(text, new DefaultPayloadEmitHandler<>());
}
/**
* Tokenizes the specified text by using a custom EmitHandler and returns the
* emitted outputs.
*
*
* @param text The character sequence to tokenize.
* @param emitHandler The handler that will be used to parse the text.
* @param emitHandler The emit handler that will be used to parse the text.
* @return A collection of emits.
*/
@SuppressWarnings("unchecked")
public Collection<PayloadEmit<T>> parseText(final CharSequence text, final StatefulPayloadEmitHandler<T> emitHandler) {
parseText(text, (PayloadEmitHandler<T>) emitHandler);
final List<PayloadEmit<T>> collectedEmits = emitHandler.getEmits();
if (trieConfig.isOnlyWholeWords()) {
removePartialMatches(text, collectedEmits);
}
if (trieConfig.isOnlyWholeWordsWhiteSpaceSeparated()) {
removePartialMatchesWhiteSpaceSeparated(text, collectedEmits);
}
if (!trieConfig.isAllowOverlaps()) {
IntervalTree intervalTree = new IntervalTree((List<Intervalable>) (List<?>) collectedEmits);
intervalTree.removeOverlaps((List<Intervalable>) (List<?>) collectedEmits);
@ -165,61 +156,50 @@ public class PayloadTrie<T> {
return collectedEmits;
}
/**
* Returns true if the text contains one of the search terms; otherwise,
* Returns true if the text contains contains one of the search terms. Else,
* returns false.
*
*
* @param text Specified text.
* @return true if the text contains one of the search terms. Else, returns
* false.
* false.
*/
public boolean containsMatch(final CharSequence text) {
return firstMatch(text) != null;
}
/**
* Tokenizes the specified text by using a custom EmitHandler and returns the
* emitted outputs.
*
*
* @param text The character sequence to tokenize.
* @param emitHandler The handler that will be used to parse the text.
* @param emitHandler The emit handler that will be used to parse the text.
*/
public void parseText(final CharSequence text, final PayloadEmitHandler<T> emitHandler) {
PayloadState<T> currentState = getRootState();
for (int position = 0; position < text.length(); position++) {
char character = text.charAt(position);
if (trieConfig.isIgnoreWhiteSpace() && isWhitespace(character)) {
continue;
}
char character = text.charAt( position);
// TODO: Maybe lowercase the entire string at once?
if (trieConfig.isCaseInsensitive()) {
character = Character.toLowerCase(character);
}
currentState = getState(currentState, character);
final Collection<Payload<T>> payloads = currentState.emit();
if (processEmits(text, position, payloads, emitHandler) && trieConfig.isStopOnHit()) {
if (storeEmits(position, currentState, emitHandler) && trieConfig.isStopOnHit()) {
return;
}
}
}
/**
* The first matching text sequence.
*
* @param text The text to search for keywords, must not be {@code null}.
* @return {@code null} if no matches found.
* @param text The text to search for keywords.
* @return null if no matches found.
*/
public PayloadEmit<T> firstMatch(final CharSequence text) {
assert text != null;
if (!trieConfig.isAllowOverlaps()) {
// Slow path. Needs to find all the matches to detect overlaps.
final Collection<PayloadEmit<T>> parseText = parseText(text);
@ -232,11 +212,9 @@ public class PayloadTrie<T> {
PayloadState<T> currentState = getRootState();
for (int position = 0; position < text.length(); position++) {
char character = text.charAt(position);
char character = text.charAt( position);
if (trieConfig.isIgnoreWhiteSpace() && isWhitespace(character)) {
continue;
}
// TODO: Lowercase the entire string at once?
if (trieConfig.isCaseInsensitive()) {
character = Character.toLowerCase(character);
}
@ -246,13 +224,8 @@ public class PayloadTrie<T> {
if (payloads != null && !payloads.isEmpty()) {
for (final Payload<T> payload : payloads) {
int start;
if (isIgnoreWhiteSpace()) {
start = findStart(text, position, payload);
} else {
start = position - payload.getKeyword().length() + 1;
}
final PayloadEmit<T> emit = new PayloadEmit<>(start, position, payload.getKeyword(), payload.getData());
final PayloadEmit<T> emit = new PayloadEmit<>(position - payload.getKeyword().length() + 1, position,
payload.getKeyword(), payload.getData());
if (trieConfig.isOnlyWholeWords()) {
if (!isPartialMatch(text, emit)) {
return emit;
@ -268,38 +241,48 @@ public class PayloadTrie<T> {
return null;
}
private boolean isPartialMatch(final CharSequence searchText, final PayloadEmit<T> emit) {
return (emit.getStart() != 0 && Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) || (emit.getEnd() + 1 != searchText.length() && Character.isAlphabetic(
searchText.charAt(emit.getEnd() + 1)));
return (emit.getStart() != 0 && Character.isAlphabetic(searchText.charAt(emit.getStart() - 1)))
|| (emit.getEnd() + 1 != searchText.length() && Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1)));
}
private void removePartialMatches(final CharSequence searchText, final List<PayloadEmit<T>> collectedEmits) {
private boolean isPartialMatchWhiteSpaceSeparated(final CharSequence searchText, final PayloadEmit<T> emit) {
final RemoveElementPredicate<PayloadEmit<T>> predicate = emit -> isPartialMatch( searchText, emit);
ListElementRemoval.removeIf(collectedEmits, predicate);
}
private void removePartialMatchesWhiteSpaceSeparated(final CharSequence searchText,
final List<PayloadEmit<T>> collectedEmits) {
final long size = searchText.length();
return (emit.getStart() != 0 && !isWhitespace(searchText.charAt(emit.getStart() - 1))) || (emit.getEnd() + 1 != size && !isWhitespace(searchText.charAt(emit.getEnd()
+ 1)));
}
final List<PayloadEmit<T>> removeEmits = new ArrayList<>();
for (final PayloadEmit<T> emit : collectedEmits) {
if ((emit.getStart() == 0 || isWhitespace(searchText.charAt(emit.getStart() - 1)))
&& (emit.getEnd() + 1 == size || isWhitespace(searchText.charAt(emit.getEnd() + 1)))) {
continue;
}
removeEmits.add(emit);
}
for (final PayloadEmit<T> removeEmit : removeEmits) {
collectedEmits.remove(removeEmit);
}
}
private PayloadState<T> getState(PayloadState<T> currentState, final Character character) {
PayloadState<T> newCurrentState = currentState.nextState(character);
var tempState = currentState;
while (newCurrentState == null) {
tempState = tempState.getFailure();
newCurrentState = tempState.nextState(character);
currentState = currentState.failure();
newCurrentState = currentState.nextState(character);
}
return newCurrentState;
}
private void constructFailureStates() {
final Queue<PayloadState<T>> queue = new LinkedBlockingDeque<>();
final PayloadState<T> startState = getRootState();
@ -317,9 +300,9 @@ public class PayloadTrie<T> {
PayloadState<T> targetState = currentState.nextState(transition);
queue.add(targetState);
PayloadState<T> traceFailureState = currentState.getFailure();
PayloadState<T> traceFailureState = currentState.failure();
while (traceFailureState.nextState(transition) == null) {
traceFailureState = traceFailureState.getFailure();
traceFailureState = traceFailureState.failure();
}
final PayloadState<T> newFailureState = traceFailureState.nextState(transition);
@ -329,22 +312,16 @@ public class PayloadTrie<T> {
}
}
private boolean processEmits(final CharSequence text, final int position, final Collection<Payload<T>> payloads, final PayloadEmitHandler<T> emitHandler) {
private boolean storeEmits(final int position, final PayloadState<T> currentState, final PayloadEmitHandler<T> emitHandler) {
boolean emitted = false;
for (final Payload<T> payload : payloads) {
int start;
if (isIgnoreWhiteSpace()) {
start = findStart(text, position, payload);
} else {
start = position - payload.getKeyword().length() + 1;
}
final PayloadEmit<T> payloadEmit = new PayloadEmit<>(start, position, payload.getKeyword(), payload.getData());
if (!(trieConfig.isOnlyWholeWords() && isPartialMatch(text, payloadEmit)) && !(trieConfig.isOnlyWholeWordsWhiteSpaceSeparated() && isPartialMatchWhiteSpaceSeparated(
text,
payloadEmit))) {
emitted = emitHandler.emit(payloadEmit) || emitted;
final Collection<Payload<T>> payloads = currentState.emit();
// TODO: The check for empty might be superfluous.
if (payloads != null && !payloads.isEmpty()) {
for (final Payload<T> payload : payloads) {
emitted = emitHandler.emit(new PayloadEmit<>(position - payload.getKeyword().length() + 1, position,
payload.getKeyword(), payload.getData())) || emitted;
if (emitted && trieConfig.isStopOnHit()) {
break;
}
@ -354,77 +331,41 @@ public class PayloadTrie<T> {
return emitted;
}
private int findStart(CharSequence text, int position, Payload<T> payload) {
    final String keyword = payload.getKeyword();

    // Collect the keyword's non-whitespace characters (lower-cased when the
    // trie is case-insensitive). Pushing in reading order leaves the last
    // such character on top of the stack.
    final Deque<Character> pending = new LinkedList<>();
    for (int k = 0; k < keyword.length(); k++) {
        final char keyChar = keyword.charAt(k);
        if (!isWhitespace(keyChar)) {
            pending.push(isCaseInsensitive() ? toLowerCase(keyChar) : keyChar);
        }
    }

    // Scan the text backwards from the match end, consuming one pending
    // character per equal text character, until all are consumed or the
    // beginning of the text has been passed.
    int cursor = position;
    while (!pending.isEmpty() && cursor >= 0) {
        final char candidate = isCaseInsensitive() ? toLowerCase(text.charAt(cursor)) : text.charAt(cursor);
        if (candidate == pending.peek()) {
            pending.pop();
        }
        cursor--;
    }

    // The cursor has stepped one position past the last character examined.
    return cursor + 1;
}
private boolean isCaseInsensitive() {
return trieConfig.isCaseInsensitive();
}
private boolean isIgnoreWhiteSpace() {
return trieConfig.isIgnoreWhiteSpace();
}
private PayloadState<T> getRootState() {
return this.rootState;
}
/**
* Provides a fluent interface for constructing Trie instances with payloads.
*
* @param <T> The type of the emitted payload.
*
* @return The builder used to configure its Trie.
*/
public static <T> PayloadTrieBuilder<T> builder() {
return new PayloadTrieBuilder<>();
}
/**
* Builder class to create a PayloadTrie instance.
*
*
* @param <T> The type of the emitted payload.
*/
public static final class PayloadTrieBuilder<T> {
public static class PayloadTrieBuilder<T> {
private final TrieConfig trieConfig = new TrieConfig();
private final PayloadTrie<T> trie = new PayloadTrie<>(trieConfig);
/**
* Default (empty) constructor.
*/
private PayloadTrieBuilder() {
}
/**
* Configure the Trie to ignore case when searching for keywords in the text.
* This must be called before calling addKeyword because the algorithm converts
@ -434,42 +375,35 @@ public class PayloadTrie<T> {
* @return This builder.
*/
public PayloadTrieBuilder<T> ignoreCase() {
this.trieConfig.setCaseInsensitive(true);
return this;
}
/**
* Configure the Trie to ignore overlapping keywords.
*
* @return This builder.
*/
public PayloadTrieBuilder<T> ignoreOverlaps() {
this.trieConfig.setAllowOverlaps(false);
return this;
}
/**
* Adds a keyword to the {@link Trie}'s list of text search keywords.
* No {@link Payload} is supplied.
* Adds a keyword to the Trie's list of text search keywords. No Payload is
* supplied.
*
* @param keyword The keyword to add to the list.
* @return This builder.
* @throws NullPointerException if the keyword is null.
*/
public PayloadTrieBuilder<T> addKeyword(final String keyword) {
this.trie.addKeyword(keyword);
return this;
}
/**
* Adds a keyword and a payload to the {@link Trie}'s list of text
* search keywords.
* Adds a keyword and a payload to the Trie's list of text search keywords.
*
* @param keyword The keyword to add to the list.
* @param payload the payload to add
@ -477,40 +411,34 @@ public class PayloadTrie<T> {
* @throws NullPointerException if the keyword is null.
*/
public PayloadTrieBuilder<T> addKeyword(final String keyword, final T payload) {
this.trie.addKeyword(keyword, payload);
return this;
}
/**
* Adds a list of keywords and payloads to the {@link Trie}'s list of
* text search keywords.
* Adds a list of keywords and payloads to the Trie's list of text search
* keywords.
*
* @param keywords The keywords to add to the list.
* @return This builder.
*/
public PayloadTrieBuilder<T> addKeywords(final Collection<Payload<T>> keywords) {
    // Register each entry's keyword together with its payload data.
    keywords.forEach(entry -> this.trie.addKeyword(entry.getKeyword(), entry.getData()));
    return this;
}
/**
* Configure the Trie to match whole keywords in the text.
*
* @return This builder.
*/
public PayloadTrieBuilder<T> onlyWholeWords() {
this.trieConfig.setOnlyWholeWords(true);
return this;
}
/**
* Configure the Trie to match whole keywords that are separated by whitespace
* in the text. For example, "this keyword thatkeyword" would only match the
@ -519,69 +447,44 @@ public class PayloadTrie<T> {
* @return This builder.
*/
public PayloadTrieBuilder<T> onlyWholeWordsWhiteSpaceSeparated() {
this.trieConfig.setOnlyWholeWordsWhiteSpaceSeparated(true);
return this;
}
/**
* Configure the Trie to stop after the first keyword is found in the text.
*
* @return This builder.
*/
public PayloadTrieBuilder<T> stopOnHit() {
trie.trieConfig.setStopOnHit(true);
return this;
}
/**
* Configure the PayloadTrie based on the builder settings.
*
* @return The configured PayloadTrie.
*/
public PayloadTrie<T> build() {
this.trie.constructFailureStates();
return this.trie;
}
/**
* @return This builder.
* @deprecated Use ignoreCase()
*/
@Deprecated
public PayloadTrieBuilder<T> caseInsensitive() {
return ignoreCase();
}
/**
* @return This builder.
* @deprecated Use ignoreOverlaps()
*/
@Deprecated
public PayloadTrieBuilder<T> removeOverlaps() {
return ignoreOverlaps();
}
/**
* Configure the Trie to ignore whitespaces.
*
* @return This builder.
*/
public PayloadTrieBuilder<T> ignoreWhiteSpace() {
trieConfig.setIgnoreWhiteSpace(true);
return this;
}
}
}

View File

@ -2,9 +2,6 @@ package org.ahocorasick.trie;
import java.util.*;
import lombok.Getter;
import lombok.Setter;
/**
* <p>
* A state has various important tasks it must attend to:
@ -29,7 +26,6 @@ public class State {
/**
* Effectively the size of the keyword (the number of characters from the root to this state).
*/
@Getter
private final int depth;
/**
@ -46,8 +42,6 @@ public class State {
/**
* if no matching states are found, the failure state will be returned
*/
@Setter
@Getter
private State failure;
/**
@ -55,22 +49,16 @@ public class State {
*/
private Set<String> emits;
public State() {
this(0);
}
public State(final int depth) {
    this.depth = depth;
    // Only the state at depth zero keeps a reference to itself as the
    // root; every deeper state leaves the reference unset.
    if (depth == 0) {
        this.rootState = this;
    } else {
        this.rootState = null;
    }
}
private State nextState(final Character character, final boolean ignoreRootState) {
State nextState = this.success.get(character);
if (!ignoreRootState && nextState == null && this.rootState != null) {
@ -80,21 +68,15 @@ public class State {
return nextState;
}
public State nextState(final Character character) {
return nextState(character, false);
}
public State nextStateIgnoreRootState(Character character) {
return nextState(character, true);
}
public State addState(String keyword) {
State state = this;
for (final Character character : keyword.toCharArray()) {
@ -104,9 +86,7 @@ public class State {
return state;
}
public State addState(Character character) {
State nextState = nextStateIgnoreRootState(character);
if (nextState == null) {
nextState = new State(this.depth + 1);
@ -115,39 +95,40 @@ public class State {
return nextState;
}
public int getDepth() {
return this.depth;
}
public void addEmit(String keyword) {
    // The emit set is created on first use and kept sorted by TreeSet.
    Set<String> target = this.emits;
    if (target == null) {
        target = new TreeSet<>();
        this.emits = target;
    }
    target.add(keyword);
}
public void addEmit(Collection<String> emits) {
    // Route each keyword through the single-keyword overload so the
    // backing set is created and filled in one place.
    final Iterator<String> cursor = emits.iterator();
    while (cursor.hasNext()) {
        addEmit(cursor.next());
    }
}
public Collection<String> emit() {
    // Hand out the stored set when one exists; a state that never received
    // an emit reports an empty list instead of null.
    if (this.emits != null) {
        return this.emits;
    }
    return Collections.<String>emptyList();
}
public State failure() {
return this.failure;
}
public void setFailure(State failState) {
this.failure = failState;
}
public Collection<State> getStates() {
return this.success.values();
}
public Collection<Character> getTransitions() {
return this.success.keySet();
}
}

View File

@ -1,25 +1,17 @@
package org.ahocorasick.trie;
/**
 * Base type for the pieces a tokenized text is split into: each token
 * covers a fragment of the text and reports whether it is a keyword match.
 */
public abstract class Token {

    /** The piece of text this token covers; assigned once at construction. */
    private final String fragment;

    /**
     * Creates a token for the given piece of text.
     *
     * @param fragment The text covered by this token.
     */
    public Token(String fragment) {
        this.fragment = fragment;
    }

    /**
     * Returns the text covered by this token.
     *
     * @return The fragment supplied at construction.
     */
    public String getFragment() {
        return this.fragment;
    }

    /**
     * Tells whether this token represents a matched search term.
     *
     * @return {@code true} if this is a match
     */
    public abstract boolean isMatch();

    /**
     * Returns the emit describing the match behind this token.
     *
     * @return the emit, or {@code null} for tokens that do not represent a
     *         match
     */
    public abstract Emit getEmit();
}

View File

@ -15,26 +15,20 @@ import org.ahocorasick.trie.handler.StatefulEmitHandler;
*
* @author Robert Bor
*/
public final class Trie {
public class Trie {
private final PayloadTrie<String> payloadTrie;
private Trie(final PayloadTrie<String> payloadTrie) {
this.payloadTrie = payloadTrie;
}
public Collection<Token> tokenize(final String text) {
    // Tokenize with the underlying payload trie, then wrap the payload
    // tokens as plain tokens.
    return asTokens(this.payloadTrie.tokenize(text));
}
private static Collection<Token> asTokens(Collection<PayloadToken<String>> tokens) {
Collection<Token> result = new ArrayList<>();
for (PayloadToken<String> payloadToken : tokens) {
result.add(new DefaultToken(payloadToken));
@ -42,9 +36,7 @@ public final class Trie {
return result;
}
private static Collection<Emit> asEmits(Collection<PayloadEmit<String>> emits) {
Collection<Emit> result = new ArrayList<>();
for (PayloadEmit<String> emit : emits) {
result.add(asEmit(emit));
@ -52,79 +44,60 @@ public final class Trie {
return result;
}
private static Emit asEmit(PayloadEmit<String> payloadEmit) {
return new Emit(payloadEmit.getStart(), payloadEmit.getEnd(), payloadEmit.getKeyword());
}
/**
 * Parses the text with the underlying payload trie and converts the
 * resulting payload emits into plain emits.
 *
 * @param text The text to search for keywords.
 * @return The converted emits, in the order produced by the payload trie.
 */
public Collection<Emit> parseText(final CharSequence text) {
    return asEmits(this.payloadTrie.parseText(text));
}
@SuppressWarnings("UnusedReturnValue")
public Collection<Emit> parseText(final CharSequence text, final StatefulEmitHandler emitHandler) {
Collection<PayloadEmit<String>> parsedText = this.payloadTrie.parseText(text, new StatefulPayloadEmitDelegateHandler(emitHandler));
public Collection<Emit> parseText( final CharSequence text, final StatefulEmitHandler emitHandler) {
Collection<PayloadEmit<String>> parsedText = this.payloadTrie.parseText(text,
new StatefulPayloadEmitDelegateHandler(emitHandler));
return asEmits(parsedText);
}
/**
 * Tells whether the text contains at least one keyword.
 *
 * @param text The text to search for keywords.
 * @return {@code true} if {@code firstMatch} finds a match, {@code false} otherwise.
 */
public boolean containsMatch(final CharSequence text) {
    final Emit first = firstMatch(text);
    return first != null;
}
/**
 * Parses the text and reports matches to the given handler instead of
 * returning them.
 *
 * @param text The text to search for keywords.
 * @param emitHandler The callback that receives each emit.
 */
public void parseText(final CharSequence text, final EmitHandler emitHandler) {
    // Adapt the plain handler to the payload-based callback the delegate expects.
    final PayloadEmitDelegateHandler adapter = new PayloadEmitDelegateHandler(emitHandler);
    this.payloadTrie.parseText(text, adapter);
}
/**
 * The first matching text sequence.
 *
 * @param text The text to search for keywords, must not be {@code null}.
 * @return The first match, or {@code null} if no matches found.
 */
public Emit firstMatch(final CharSequence text) {
    assert text != null;
    final PayloadEmit<String> payload = this.payloadTrie.firstMatch(text);
    // Reuse the class's conversion helper so the PayloadEmit-to-Emit mapping
    // lives in a single place.
    return payload == null ? null : asEmit(payload);
}
/**
 * Provides a fluent interface for constructing Trie instances. Each call
 * returns a new, independent builder.
 *
 * @return The builder used to configure its Trie.
 */
public static TrieBuilder builder() {
return new TrieBuilder();
}
public static final class TrieBuilder {
public static class TrieBuilder {
private final PayloadTrieBuilder<String> delegate = PayloadTrie.builder();
/**
 * Default (empty) constructor. Private so that builders are obtained
 * through {@code Trie.builder()}.
 */
private TrieBuilder() {
}
/**
* Configure the Trie to ignore case when searching for keywords in the text.
* This must be called before calling addKeyword because the algorithm converts
@ -134,37 +107,21 @@ public final class Trie {
* @return This builder.
*/
public TrieBuilder ignoreCase() {
    // Case handling is owned by the delegate builder; this class keeps no
    // configuration of its own.
    this.delegate.ignoreCase();
    return this;
}
/**
 * Configures the Trie to ignore overlapping keywords by forwarding the
 * setting to the delegate builder.
 *
 * @return This builder.
 */
public TrieBuilder ignoreOverlaps() {
    this.delegate.ignoreOverlaps();
    return this;
}
/**
 * Configures the Trie to ignore whitespace by forwarding the setting to
 * the delegate builder.
 *
 * @return This builder.
 */
public TrieBuilder ignoreWhiteSpace() {
    this.delegate.ignoreWhiteSpace();
    return this;
}
/**
* Adds a keyword to the Trie's list of text search keywords.
*
@ -173,12 +130,10 @@ public final class Trie {
* @throws NullPointerException if the keyword is null.
*/
public TrieBuilder addKeyword(final String keyword) {
    // Keywords are registered on the delegate with a null payload.
    this.delegate.addKeyword(keyword, null);
    return this;
}
/**
* Adds a list of keywords to the Trie's list of text search keywords.
*
@ -186,14 +141,12 @@ public final class Trie {
* @return This builder.
*/
public TrieBuilder addKeywords(final String... keywords) {
    // Register each keyword in argument order, each with a null payload.
    for (int i = 0; i < keywords.length; i++) {
        this.delegate.addKeyword(keywords[i], null);
    }
    return this;
}
/**
* Adds a list of keywords to the Trie's list of text search keywords.
*
@ -201,27 +154,23 @@ public final class Trie {
* @return This builder.
*/
@SuppressWarnings("unused")
public TrieBuilder addKeywords(final Collection<String> keywords) {
public TrieBuilder addKeywords( final Collection<String> keywords ) {
for (String keyword : keywords) {
this.delegate.addKeyword(keyword, null);
}
return this;
}
/**
 * Configures the Trie to match whole keywords in the text by forwarding
 * the setting to the delegate builder.
 *
 * @return This builder.
 */
public TrieBuilder onlyWholeWords() {
    delegate.onlyWholeWords();
    return this;
}
/**
* Configure the Trie to match whole keywords that are separated by whitespace
* in the text. For example, "this keyword thatkeyword" would only match the
@ -230,35 +179,44 @@ public final class Trie {
* @return This builder.
*/
public TrieBuilder onlyWholeWordsWhiteSpaceSeparated() {
    // Forward the setting to the delegate builder.
    delegate.onlyWholeWordsWhiteSpaceSeparated();
    return this;
}
/**
 * Configures the Trie to stop after the first keyword is found in the
 * text by forwarding the setting to the delegate builder.
 *
 * @return This builder.
 */
public TrieBuilder stopOnHit() {
    delegate.stopOnHit();
    return this;
}
/**
 * Builds the payload trie from the delegate builder's settings and wraps
 * it in a Trie.
 *
 * @return The configured Trie.
 */
public Trie build() {
    return new Trie(this.delegate.build());
}
}
/**
 * Alias of {@link #ignoreCase()}; it simply calls that method.
 *
 * @return This builder.
 * @deprecated Use {@link #ignoreCase()} instead.
 */
@Deprecated
public TrieBuilder caseInsensitive() {
    return ignoreCase();
}
/**
 * Alias of {@link #ignoreOverlaps()}; it simply calls that method.
 *
 * @return This builder.
 * @deprecated Use {@link #ignoreOverlaps()} instead.
 */
@Deprecated
public TrieBuilder removeOverlaps() {
    return ignoreOverlaps();
}
}
}

View File

@ -4,86 +4,51 @@ public class TrieConfig {
private boolean allowOverlaps = true;
private boolean onlyWholeWords;
private boolean onlyWholeWords = false;
private boolean onlyWholeWordsWhiteSpaceSeparated;
private boolean onlyWholeWordsWhiteSpaceSeparated = false;
private boolean caseInsensitive;
private boolean ignoreWhiteSpace;
private boolean stopOnHit;
private boolean caseInsensitive = false;
private boolean stopOnHit = false;
/**
 * @return The current value of the {@code stopOnHit} flag.
 */
public boolean isStopOnHit() {
    return this.stopOnHit;
}
/**
 * @param stopOnHit The new value of the {@code stopOnHit} flag.
 */
public void setStopOnHit(boolean stopOnHit) {
this.stopOnHit = stopOnHit;
}
/**
 * @return The current value of the {@code allowOverlaps} flag.
 */
public boolean isAllowOverlaps() {
    return this.allowOverlaps;
}
/**
 * @param allowOverlaps The new value of the {@code allowOverlaps} flag.
 */
public void setAllowOverlaps(boolean allowOverlaps) {
this.allowOverlaps = allowOverlaps;
}
/**
 * @return The current value of the {@code onlyWholeWords} flag.
 */
public boolean isOnlyWholeWords() {
    return this.onlyWholeWords;
}
/**
 * @param onlyWholeWords The new value of the {@code onlyWholeWords} flag.
 */
public void setOnlyWholeWords(boolean onlyWholeWords) {
this.onlyWholeWords = onlyWholeWords;
}
/**
 * @return The current value of the {@code onlyWholeWordsWhiteSpaceSeparated} flag.
 */
public boolean isOnlyWholeWordsWhiteSpaceSeparated() {
    return this.onlyWholeWordsWhiteSpaceSeparated;
}
/**
 * @param onlyWholeWordsWhiteSpaceSeparated The new value of the
 * {@code onlyWholeWordsWhiteSpaceSeparated} flag.
 */
public void setOnlyWholeWordsWhiteSpaceSeparated(boolean onlyWholeWordsWhiteSpaceSeparated) {
this.onlyWholeWordsWhiteSpaceSeparated = onlyWholeWordsWhiteSpaceSeparated;
}
/**
 * @return The current value of the {@code caseInsensitive} flag.
 */
public boolean isCaseInsensitive() {
    return this.caseInsensitive;
}
/**
 * @return The current value of the {@code ignoreWhiteSpace} flag.
 */
public boolean isIgnoreWhiteSpace() {
    return this.ignoreWhiteSpace;
}
/**
 * @param caseInsensitive The new value of the {@code caseInsensitive} flag.
 */
public void setCaseInsensitive(boolean caseInsensitive) {
this.caseInsensitive = caseInsensitive;
}
/**
 * @param ignoreWhiteSpace The new value of the {@code ignoreWhiteSpace} flag.
 */
public void setIgnoreWhiteSpace(boolean ignoreWhiteSpace) {
this.ignoreWhiteSpace = ignoreWhiteSpace;
}
}

View File

@ -9,16 +9,12 @@ public abstract class AbstractStatefulEmitHandler implements StatefulEmitHandler
private final List<Emit> emits = new ArrayList<>();
/**
 * Appends the given emit to the list returned by {@code getEmits()}.
 *
 * @param emit The emit to record.
 */
public void addEmit(final Emit emit) {
    emits.add(emit);
}
/**
 * Returns the emits recorded so far.
 *
 * @return The handler's own backing list, not a copy.
 */
@Override
public List<Emit> getEmits() {
    return emits;
}

View File

@ -9,16 +9,12 @@ public abstract class AbstractStatefulPayloadEmitHandler<T> implements StatefulP
private final List<PayloadEmit<T>> emits = new ArrayList<>();
/**
 * Appends the given payload emit to the list returned by {@code getEmits()}.
 *
 * @param emit The payload emit to record.
 */
public void addEmit(final PayloadEmit<T> emit) {
    emits.add(emit);
}
/**
 * Returns the payload emits recorded so far.
 *
 * @return The handler's own backing list, not a copy.
 */
@Override
public List<PayloadEmit<T>> getEmits() {
    return emits;
}

View File

@ -9,19 +9,14 @@ public class DefaultEmitHandler implements StatefulEmitHandler {
private final List<Emit> emits = new ArrayList<>();
/**
 * Records the emit and unconditionally reports {@code true}.
 *
 * @param emit The emit to record.
 * @return Always {@code true}.
 */
@Override
public boolean emit(final Emit emit) {
    emits.add(emit);
    return true;
}
/**
 * Returns every emit passed to {@code emit} so far, in arrival order.
 *
 * @return The handler's own backing list, not a copy.
 */
@Override
public List<Emit> getEmits() {
    return emits;
}
}

View File

@ -9,19 +9,14 @@ public class DefaultPayloadEmitHandler<T> implements StatefulPayloadEmitHandler<
private final List<PayloadEmit<T>> emits = new ArrayList<>();
/**
 * Records the payload emit and unconditionally reports {@code true}.
 *
 * @param emit The payload emit to record.
 * @return Always {@code true}.
 */
@Override
public boolean emit(final PayloadEmit<T> emit) {
    emits.add(emit);
    return true;
}
/**
 * Returns every payload emit passed to {@code emit} so far, in arrival order.
 *
 * @return The handler's own backing list, not a copy.
 */
@Override
public List<PayloadEmit<T>> getEmits() {
    return emits;
}
}

View File

@ -3,7 +3,5 @@ package org.ahocorasick.trie.handler;
import org.ahocorasick.trie.Emit;
/**
 * Callback that is handed {@link Emit} instances one at a time.
 */
public interface EmitHandler {
/**
 * Receives a single emit.
 *
 * @param emit The emit being reported.
 * @return A flag for the caller driving this handler.
 * NOTE(review): presumably signals whether the emit was accepted; confirm
 * against the code that invokes the handler.
 */
boolean emit(Emit emit);
}

View File

@ -11,17 +11,13 @@ public class PayloadEmitDelegateHandler implements PayloadEmitHandler<String> {
private EmitHandler handler;
/**
 * Creates an adapter that forwards payload emits to the given plain handler.
 *
 * @param handler The handler that receives the converted emits.
 */
public PayloadEmitDelegateHandler(EmitHandler handler) {
this.handler = handler;
}
/**
 * Converts the payload emit into a plain {@link Emit} (same start, end and
 * keyword) and passes it to the wrapped handler.
 *
 * @param emit The payload emit to forward.
 * @return Whatever the wrapped handler returns.
 */
@Override
public boolean emit(PayloadEmit<String> emit) {
    return this.handler.emit(
        new Emit(emit.getStart(), emit.getEnd(), emit.getKeyword()));
}

View File

@ -3,7 +3,5 @@ package org.ahocorasick.trie.handler;
import org.ahocorasick.trie.PayloadEmit;
/**
 * Callback that is handed {@link PayloadEmit} instances one at a time.
 *
 * @param <T> The payload type of the emits.
 */
public interface PayloadEmitHandler<T> {
/**
 * Receives a single payload emit.
 *
 * @param emit The payload emit being reported.
 * @return A flag for the caller driving this handler.
 * NOTE(review): presumably signals whether the emit was accepted; confirm
 * against the code that invokes the handler.
 */
boolean emit(PayloadEmit<T> emit);
}

View File

@ -5,7 +5,5 @@ import java.util.List;
import org.ahocorasick.trie.Emit;
/**
 * An {@link EmitHandler} that can also report a list of emits.
 */
public interface StatefulEmitHandler extends EmitHandler {
/**
 * @return The emits held by this handler.
 */
List<Emit> getEmits();
}

View File

@ -15,16 +15,12 @@ public class StatefulPayloadEmitDelegateHandler implements StatefulPayloadEmitHa
private StatefulEmitHandler handler;
/**
 * Creates an adapter that forwards payload emits to the given stateful
 * plain handler.
 *
 * @param handler The handler that receives the converted emits.
 */
public StatefulPayloadEmitDelegateHandler(StatefulEmitHandler handler) {
this.handler = handler;
}
private static List<PayloadEmit<String>> asEmits(Collection<Emit> emits) {
List<PayloadEmit<String>> result = new ArrayList<>();
for (Emit emit : emits) {
result.add(new PayloadEmit<String>(emit.getStart(), emit.getEnd(), emit.getKeyword(), null));
@ -32,20 +28,15 @@ public class StatefulPayloadEmitDelegateHandler implements StatefulPayloadEmitHa
return result;
}
/**
 * Converts the payload emit into a plain {@link Emit} (same start, end and
 * keyword) and passes it to the wrapped handler.
 *
 * @param emit The payload emit to forward.
 * @return Whatever the wrapped handler returns.
 */
@Override
public boolean emit(PayloadEmit<String> emit) {
    return this.handler.emit(
        new Emit(emit.getStart(), emit.getEnd(), emit.getKeyword()));
}
/**
 * Returns the wrapped handler's emits, converted to payload emits via
 * {@code asEmits}.
 *
 * @return The converted list, rebuilt on every call.
 */
@Override
public List<PayloadEmit<String>> getEmits() {
    return asEmits(this.handler.getEmits());
}
}

View File

@ -4,8 +4,6 @@ import java.util.List;
import org.ahocorasick.trie.PayloadEmit;
public interface StatefulPayloadEmitHandler<T> extends PayloadEmitHandler<T> {
public interface StatefulPayloadEmitHandler<T> extends PayloadEmitHandler<T>{
List<PayloadEmit<T>> getEmits();
}

View File

@ -0,0 +1,51 @@
package org.ahocorasick.util;
import java.util.ArrayList;
import java.util.List;
/**
 * Helps remove elements from a list in an efficient way.
 *
 * <p>Removing elements from an {@link ArrayList} one at a time is expensive:
 * every single removal has to locate the element and then shift all
 * elements to its right one position to the left, so removing a number of
 * elements proportional to the list size takes quadratic time overall.</p>
 *
 * <p>This class instead makes a new list and copies over only the elements
 * to keep, then clears the original list and adds the kept elements back.
 * For an {@link ArrayList} this gives a running time of O(n).</p>
 *
 * <p>The performance of this has not been thoroughly tested for linked lists.</p>
 *
 * <p>This can be completely removed on Java 8, where {@code List#removeIf()}
 * can be used instead, as that is already optimised for each list
 * implementation.</p>
 */
public class ListElementRemoval {

    /**
     * Decides which elements are removed from a list.
     *
     * @param <T> type of the elements being tested
     */
    public interface RemoveElementPredicate<T> {

        /**
         * Tests a single element.
         *
         * @param t the element to test
         * @return {@code true} if the element must be removed
         */
        boolean remove(T t);
    }

    /**
     * Removes all elements from the list matching the given predicate.
     * The relative order of the remaining elements is preserved. When no
     * element matches, the list is not modified at all, so (like
     * {@code List#removeIf()}) the call also succeeds on lists that do not
     * support modification.
     *
     * @param list the list from which to remove
     * @param predicate to test for removal
     * @param <T> type of list
     */
    public static <T> void removeIf(final List<T> list, final RemoveElementPredicate<T> predicate) {
        final List<T> kept = new ArrayList<>(list.size());

        for (final T element : list) {
            if (!predicate.remove(element)) {
                kept.add(element);
            }
        }

        // Nothing matched: skip the clear/re-add round trip, which would be
        // wasted work and would fail on an unmodifiable list.
        if (kept.size() == list.size()) {
            return;
        }

        list.clear();
        list.addAll(kept);
    }
}

View File

@ -6,79 +6,52 @@ import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;
import static org.junit.Assert.*;
import static junit.framework.Assert.*;
public class IntervalTest {
@Test
public void test_construct() {
final Interval i = new Interval(1, 3);
public void construct() {
Interval i = new Interval(1, 3);
assertEquals(1, i.getStart());
assertEquals(3, i.getEnd());
}
@Test
public void test_size() {
public void size() {
assertEquals(3, new Interval(0, 2).size());
}
@Test
public void test_intervaloverlaps() {
public void intervaloverlaps() {
assertTrue(new Interval(1, 3).overlapsWith(new Interval(2, 4)));
}
@Test
public void test_intervalDoesNotOverlap() {
public void intervalDoesNotOverlap() {
assertFalse(new Interval(1, 13).overlapsWith(new Interval(27, 42)));
}
@Test
public void test_pointOverlaps() {
public void pointOverlaps() {
assertTrue(new Interval(1, 3).overlapsWith(2));
}
@Test
public void test_pointDoesNotOverlap() {
public void pointDoesNotOverlap() {
assertFalse(new Interval(1, 13).overlapsWith(42));
}
@Test
public void test_comparable() {
final Set<Interval> intervals = new TreeSet<>();
public void comparable() {
Set<Interval> intervals = new TreeSet<>();
intervals.add(new Interval(4, 6));
intervals.add(new Interval(2, 7));
intervals.add(new Interval(3, 4));
final Iterator<Interval> it = intervals.iterator();
Iterator<Interval> it = intervals.iterator();
assertEquals(2, it.next().getStart());
assertEquals(3, it.next().getStart());
assertEquals(4, it.next().getStart());
}
@Test
public void test_checkToString() {
assertEquals("4:6", new Interval(4, 6).toString());
}
@Test
public void test_compareToNegativeTest() {
assertEquals(-1, new Interval(4, 6).compareTo(new Object()));
}
}

View File

@ -12,7 +12,6 @@ public class IntervalTreeTest {
@Test
public void findOverlaps() {
List<Intervalable> intervals = new ArrayList<>();
intervals.add(new Interval(0, 2));
intervals.add(new Interval(1, 3));
@ -29,10 +28,8 @@ public class IntervalTreeTest {
assertOverlap(overlapsIt.next(), 0, 2);
}
@Test
public void removeOverlaps() {
List<Intervalable> intervals = new ArrayList<>();
intervals.add(new Interval(0, 2));
intervals.add(new Interval(4, 5));
@ -46,9 +43,7 @@ public class IntervalTreeTest {
}
protected void assertOverlap(Intervalable interval, int expectedStart, int expectedEnd) {
assertEquals(expectedStart, interval.getStart());
assertEquals(expectedEnd, interval.getEnd());
}

View File

@ -12,7 +12,6 @@ public class IntervalableComparatorByPositionTest {
@Test
public void sortOnPosition() {
List<Intervalable> intervals = new ArrayList<Intervalable>();
intervals.add(new Interval(4, 5));
intervals.add(new Interval(1, 4));

View File

@ -12,7 +12,6 @@ public class IntervalableComparatorBySizeTest {
@Test
public void sortOnSize() {
List<Intervalable> intervals = new ArrayList<Intervalable>();
intervals.add(new Interval(4, 5));
intervals.add(new Interval(1, 4));
@ -23,10 +22,8 @@ public class IntervalableComparatorBySizeTest {
assertEquals(2, intervals.get(2).size());
}
@Test
public void sortOnSizeThenPosition() {
List<Intervalable> intervals = new ArrayList<Intervalable>();
intervals.add(new Interval(4, 7));
intervals.add(new Interval(2, 5));

View File

@ -2,35 +2,23 @@ package org.ahocorasick.trie;
import org.junit.Test;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotEquals;
import static junit.framework.Assert.assertEquals;
import static junit.framework.Assert.assertNotSame;
/**
* Test the {@link Emit} class functionality.
*/
public class EmitTest {
/**
* Test that two {@link Emit} instances having the same values are equal.
*/
@Test
public void test_Equality_SameValues_ObjectsAreEqual() {
final Emit one = new Emit(13, 42, null);
final Emit two = new Emit(13, 42, null);
public void equals() {
Emit one = new Emit(13, 42, null);
Emit two = new Emit(13, 42, null);
assertEquals(one, two);
}
/**
* Test that two {@link Emit} instances having different values are not equal.
*/
@Test
public void test_Equality_DifferingValues_ObjectsAreNotEqual() {
final Emit one = new Emit(13, 42, null);
final Emit two = new Emit(13, 43, null);
assertNotEquals(one, two);
public void notEquals() {
Emit one = new Emit(13, 42, null);
Emit two = new Emit(13, 43, null);
assertNotSame(one, two);
}
}

View File

@ -1,133 +1,124 @@
package org.ahocorasick.trie;
import static junit.framework.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.ThreadLocalRandom;
import org.ahocorasick.trie.handler.AbstractStatefulPayloadEmitHandler;
import org.ahocorasick.trie.handler.PayloadEmitHandler;
import org.ahocorasick.trie.handler.StatefulPayloadEmitHandler;
import org.junit.Test;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import static java.util.Arrays.asList;
import static org.ahocorasick.trie.TestHelper.injectKeyword;
import static org.ahocorasick.trie.TestHelper.randomNumbers;
import static org.junit.Assert.*;
public class PayloadTrieTest {
private final static String[] ALPHABET = new String[]{"abc", "bcd", "cde"};
private final static String[] ALPHABET_PAYLOAD = new String[]{"alpha:abc", "alpha:bcd", "alpha:cde"};
private final static String[] ALPHABET = new String[] { "abc", "bcd", "cde" };
private final static String[] ALPHABET_PAYLOAD = new String[] { "alpha:abc", "alpha:bcd", "alpha:cde" };
private final static List<Payload<String>> ALPHABET_WITH_PAYLOADS = asList(new Payload<>(ALPHABET[0], ALPHABET_PAYLOAD[0]),
new Payload<>(ALPHABET[1], ALPHABET_PAYLOAD[1]),
new Payload<>(ALPHABET[2], ALPHABET_PAYLOAD[2]));
private final static List<Payload<String>> ALPHABET_WITH_PAYLOADS = Arrays.asList(//
new Payload<String>(ALPHABET[0], ALPHABET_PAYLOAD[0]), //
new Payload<String>(ALPHABET[1], ALPHABET_PAYLOAD[1]), //
new Payload<String>(ALPHABET[2], ALPHABET_PAYLOAD[2]));
private final static String[] PRONOUNS = new String[]{"hers", "his", "she", "he"};
private final static int[] PRONOUNS_PAYLOAD_ID = new int[]{9, 12, 4, 20};
private final static String[] PRONOUNS = new String[] { "hers", "his", "she", "he" };
private final static int[] PRONOUNS_PAYLOAD_ID = new int[] { 9, 12, 4, 20 };
private final static List<Payload<Integer>> PRONOUNS_WITH_PAYLOADS = asList(new Payload<>(PRONOUNS[0], PRONOUNS_PAYLOAD_ID[0]),
new Payload<>(PRONOUNS[1], PRONOUNS_PAYLOAD_ID[1]),
new Payload<>(PRONOUNS[2], PRONOUNS_PAYLOAD_ID[2]),
new Payload<>(PRONOUNS[3], PRONOUNS_PAYLOAD_ID[3]));
private final static List<Payload<Integer>> PRONOUNS_WITH_PAYLOADS = Arrays.asList(//
new Payload<Integer>(PRONOUNS[0], PRONOUNS_PAYLOAD_ID[0]), //
new Payload<Integer>(PRONOUNS[1], PRONOUNS_PAYLOAD_ID[1]), //
new Payload<Integer>(PRONOUNS[2], PRONOUNS_PAYLOAD_ID[2]), //
new Payload<Integer>(PRONOUNS[3], PRONOUNS_PAYLOAD_ID[3]) //
);
private final static String[] FOOD = new String[]{"veal", "cauliflower", "broccoli", "tomatoes"};
private final static Food[] FOOD_PAYLOAD = new Food[]{new Food("veal"), new Food("cauliflower"), new Food("broccoli"), new Food("tomatoes")};
private final static String[] FOOD = new String[] { "veal", "cauliflower", "broccoli", "tomatoes" };
private final static Food[] FOOD_PAYLOAD = new Food[] { new Food("veal"), new Food("cauliflower"), new Food("broccoli"),
new Food("tomatoes") };
private final static List<Payload<Food>> FOOD_WITH_PAYLOADS = asList(new Payload<>(FOOD[0], FOOD_PAYLOAD[0]),
new Payload<>(FOOD[1], FOOD_PAYLOAD[1]),
new Payload<>(FOOD[2], FOOD_PAYLOAD[2]),
new Payload<>(FOOD[3], FOOD_PAYLOAD[3]));
private final static List<Payload<Food>> FOOD_WITH_PAYLOADS = Arrays.asList(//
new Payload<Food>(FOOD[0], FOOD_PAYLOAD[0]), //
new Payload<Food>(FOOD[1], FOOD_PAYLOAD[1]), //
new Payload<Food>(FOOD[2], FOOD_PAYLOAD[2]), //
new Payload<Food>(FOOD[3], FOOD_PAYLOAD[3]) //
);
private final static String[] GREEK_LETTERS = new String[]{"Alpha", "Beta", "Gamma"};
private final static String[] GREEK_LETTERS_PAYLOAD = new String[]{"greek:Alpha", "greek:Beta", "greek:Gamma"};
private final static String[] GREEK_LETTERS = new String[] { "Alpha", "Beta", "Gamma" };
private final static String[] GREEK_LETTERS_PAYLOAD = new String[] { "greek:Alpha", "greek:Beta", "greek:Gamma" };
private final static List<Payload<String>> GREEK_LETTERS_WITH_PAYLOADS = asList(new Payload<>(GREEK_LETTERS[0], GREEK_LETTERS_PAYLOAD[0]),
new Payload<>(GREEK_LETTERS[1], GREEK_LETTERS_PAYLOAD[1]),
new Payload<>(GREEK_LETTERS[2], GREEK_LETTERS_PAYLOAD[2]));
private final static List<Payload<String>> GREEK_LETTERS_WITH_PAYLOADS = Arrays.asList(//
new Payload<String>(GREEK_LETTERS[0], GREEK_LETTERS_PAYLOAD[0]), //
new Payload<String>(GREEK_LETTERS[1], GREEK_LETTERS_PAYLOAD[1]), //
new Payload<String>(GREEK_LETTERS[2], GREEK_LETTERS_PAYLOAD[2]));
private final static String[] UNICODE = new String[]{"turning", "once", "again", "börkü"};
private final static String[] UNICODE_PAYLOAD = new String[]{"uni:turning", "uni:once", "uni:again", "uni:börkü"};
private final static String[] UNICODE = new String[] { "turning", "once", "again", "börkü" };
private final static String[] UNICODE_PAYLOAD = new String[] { "uni:turning", "uni:once", "uni:again", "uni:börkü" };
private final static List<Payload<String>> UNICODE_WITH_PAYLOADS = asList(new Payload<>(UNICODE[0], UNICODE_PAYLOAD[0]),
new Payload<>(UNICODE[1], UNICODE_PAYLOAD[1]),
new Payload<>(UNICODE[2], UNICODE_PAYLOAD[2]),
new Payload<>(UNICODE[3], UNICODE_PAYLOAD[3]));
private final static List<Payload<String>> UNICODE_WITH_PAYLOADS = Arrays.asList(//
new Payload<String>(UNICODE[0], UNICODE_PAYLOAD[0]), //
new Payload<String>(UNICODE[1], UNICODE_PAYLOAD[1]), //
new Payload<String>(UNICODE[2], UNICODE_PAYLOAD[2]), //
new Payload<String>(UNICODE[3], UNICODE_PAYLOAD[3]));
public static class Food {
private final String name;
public Food(String name) {
this.name = name;
}
@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + ((name == null) ? 0 : name.hashCode());
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj) {
if (this == obj)
return true;
}
if (obj == null) {
if (obj == null)
return false;
}
if (getClass() != obj.getClass()) {
if (getClass() != obj.getClass())
return false;
}
Food other = (Food) obj;
if (name == null) {
return other.name == null;
} else {
return name.equals(other.name);
}
if (other.name != null)
return false;
} else if (!name.equals(other.name))
return false;
return true;
}
}
@Test
public void keywordAndTextAreTheSame() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword(ALPHABET[0], ALPHABET_PAYLOAD[0]).build();
Collection<PayloadEmit<String>> emits = trie.parseText(ALPHABET[0]);
Iterator<PayloadEmit<String>> iterator = emits.iterator();
checkEmit(iterator.next(), 0, 2, ALPHABET[0], ALPHABET_PAYLOAD[0]);
}
@Test
public void keywordAndTextAreTheSameFirstMatch() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword(ALPHABET[0], ALPHABET_PAYLOAD[0]).build();
PayloadEmit<String> firstMatch = trie.firstMatch(ALPHABET[0]);
checkEmit(firstMatch, 0, 2, ALPHABET[0], ALPHABET_PAYLOAD[0]);
}
@Test
public void textIsLongerThanKeyword() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword(ALPHABET[0], ALPHABET_PAYLOAD[0]).build();
Collection<PayloadEmit<String>> emits = trie.parseText(" " + ALPHABET[0]);
Iterator<PayloadEmit<String>> iterator = emits.iterator();
checkEmit(iterator.next(), 1, 3, ALPHABET[0], ALPHABET_PAYLOAD[0]);
}
@Test
public void textIsLongerThanKeywordFirstMatch() {
@ -136,29 +127,23 @@ public class PayloadTrieTest {
checkEmit(firstMatch, 1, 3, ALPHABET[0], ALPHABET_PAYLOAD[0]);
}
@Test
public void variousKeywordsOneMatch() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeywords(ALPHABET_WITH_PAYLOADS).build();
Collection<PayloadEmit<String>> emits = trie.parseText("bcd");
Iterator<PayloadEmit<String>> iterator = emits.iterator();
checkEmit(iterator.next(), 0, 2, "bcd", "alpha:bcd");
}
@Test
public void variousKeywordsFirstMatch() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeywords(ALPHABET_WITH_PAYLOADS).build();
PayloadEmit<String> firstMatch = trie.firstMatch("bcd");
checkEmit(firstMatch, 0, 2, "bcd", "alpha:bcd");
}
@Test
public void ushersTestAndStopOnHit() {
PayloadTrie<Integer> trie = PayloadTrie.<Integer>builder().addKeywords(PRONOUNS_WITH_PAYLOADS).stopOnHit().build();
Collection<PayloadEmit<Integer>> emits = trie.parseText("ushers");
assertEquals(1, emits.size()); // she @ 3, he @ 3, hers @ 5
@ -166,19 +151,15 @@ public class PayloadTrieTest {
checkEmit(iterator.next(), 2, 3, "he", 20);
}
@Test
public void ushersTestStopOnHitSkipOne() {
PayloadTrie<Integer> trie = PayloadTrie.<Integer>builder().addKeywords(PRONOUNS_WITH_PAYLOADS).stopOnHit().build();
StatefulPayloadEmitHandler<Integer> testEmitHandler = new AbstractStatefulPayloadEmitHandler<Integer>() {
boolean first = true;
@Override
public boolean emit(final PayloadEmit<Integer> emit) {
if (first) {
// return false for the first element
first = false;
@ -197,10 +178,8 @@ public class PayloadTrieTest {
checkEmit(iterator.next(), 1, 3, "she", 4);
}
@Test
public void ushersTest() {
PayloadTrie<Integer> trie = PayloadTrie.<Integer>builder().addKeywords(PRONOUNS_WITH_PAYLOADS).build();
Collection<PayloadEmit<Integer>> emits = trie.parseText("ushers");
assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5
@ -211,17 +190,10 @@ public class PayloadTrieTest {
checkEmit(iterator.next(), 2, 5, "hers", 9);
}
@Test
public void ushersTestWithCapitalKeywords() {
PayloadTrie<String> trie = PayloadTrie.<String>builder()
.ignoreCase()
.addKeyword("HERS", "hers")
.addKeyword("HIS", "his")
.addKeyword("SHE", "she")
.addKeyword("HE", "he")
.build();
PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreCase().addKeyword("HERS", "hers").addKeyword("HIS", "his")
.addKeyword("SHE", "she").addKeyword("HE", "he").build();
Collection<PayloadEmit<String>> emits = trie.parseText("ushers");
assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5
Iterator<PayloadEmit<String>> iterator = emits.iterator();
@ -230,25 +202,25 @@ public class PayloadTrieTest {
checkEmit(iterator.next(), 2, 5, "HERS", "hers");
}
@Test
public void ushersTestFirstMatch() {
PayloadTrie<Integer> trie = PayloadTrie.<Integer>builder().addKeywords(PRONOUNS_WITH_PAYLOADS).build();
PayloadEmit<Integer> firstMatch = trie.firstMatch("ushers");
checkEmit(firstMatch, 2, 3, "he", 20);
}
@Test
public void ushersTestByCallback() {
PayloadTrie<Integer> trie = PayloadTrie.<Integer>builder().addKeywords(PRONOUNS_WITH_PAYLOADS).build();
final List<PayloadEmit<Integer>> emits = new LinkedList<>();
PayloadEmitHandler<Integer> emitHandler = emit -> {
emits.add(emit);
return true;
final List<PayloadEmit<Integer>> emits = new ArrayList<>();
PayloadEmitHandler<Integer> emitHandler = new PayloadEmitHandler<Integer>() {
@Override
public boolean emit(PayloadEmit<Integer> emit) {
emits.add(emit);
return true;
}
};
trie.parseText("ushers", emitHandler);
assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5
@ -259,29 +231,23 @@ public class PayloadTrieTest {
checkEmit(iterator.next(), 2, 5, "hers", 9);
}
@Test
public void misleadingTest() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword("hers", "pronon:hers").build();
Collection<PayloadEmit<String>> emits = trie.parseText("h he her hers");
Iterator<PayloadEmit<String>> iterator = emits.iterator();
checkEmit(iterator.next(), 9, 12, "hers", "pronon:hers");
}
@Test
public void misleadingTestFirstMatch() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword("hers", "pronon:hers").build();
PayloadEmit<String> firstMatch = trie.firstMatch("h he her hers");
checkEmit(firstMatch, 9, 12, "hers", "pronon:hers");
}
@Test
public void recipes() {
PayloadTrie<Food> trie = PayloadTrie.<Food>builder().addKeywords(FOOD_WITH_PAYLOADS).build();
Collection<PayloadEmit<Food>> emits = trie.parseText("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli");
Iterator<PayloadEmit<Food>> iterator = emits.iterator();
@ -291,20 +257,17 @@ public class PayloadTrieTest {
checkEmit(iterator.next(), 51, 58, "broccoli", new Food("broccoli"));
}
@Test
public void recipesFirstMatch() {
PayloadTrie<Food> trie = PayloadTrie.<Food>builder().addKeywords(FOOD_WITH_PAYLOADS).build();
PayloadEmit<Food> firstMatch = trie.firstMatch("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli");
checkEmit(firstMatch, 2, 12, "cauliflower", new Food("cauliflower"));
}
@Test
public void longAndShortOverlappingMatch() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword("he", "pronon:he").addKeyword("hehehehe", "garbage").build();
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword("he", "pronon:he").addKeyword("hehehehe", "garbage")
.build();
Collection<PayloadEmit<String>> emits = trie.parseText("hehehehehe");
Iterator<PayloadEmit<String>> iterator = emits.iterator();
checkEmit(iterator.next(), 0, 1, "he", "pronon:he");
@ -316,16 +279,10 @@ public class PayloadTrieTest {
checkEmit(iterator.next(), 2, 9, "hehehehe", "garbage");
}
@Test
public void nonOverlapping() {
PayloadTrie<String> trie = PayloadTrie.<String>builder()
.ignoreOverlaps()
.addKeyword("ab", "alpha:ab")
.addKeyword("cba", "alpha:cba")
.addKeyword("ababc", "alpha:ababc")
.build();
PayloadTrie<String> trie = PayloadTrie.<String>builder().removeOverlaps().addKeyword("ab", "alpha:ab")
.addKeyword("cba", "alpha:cba").addKeyword("ababc", "alpha:ababc").build();
Collection<PayloadEmit<String>> emits = trie.parseText("ababcbab");
assertEquals(2, emits.size());
Iterator<PayloadEmit<String>> iterator = emits.iterator();
@ -334,79 +291,49 @@ public class PayloadTrieTest {
checkEmit(iterator.next(), 6, 7, "ab", "alpha:ab");
}
@Test
public void nonOverlappingFirstMatch() {
PayloadTrie<String> trie = PayloadTrie.<String>builder()
.ignoreOverlaps()
.addKeyword("ab", "alpha:ab")
.addKeyword("cba", "alpha:cba")
.addKeyword("ababc", "alpha:ababc")
.build();
PayloadTrie<String> trie = PayloadTrie.<String>builder().removeOverlaps().addKeyword("ab", "alpha:ab")
.addKeyword("cba", "alpha:cba").addKeyword("ababc", "alpha:ababc").build();
PayloadEmit<String> firstMatch = trie.firstMatch("ababcbab");
checkEmit(firstMatch, 0, 4, "ababc", "alpha:ababc");
}
@Test
public void containsMatch() {
PayloadTrie<String> trie = PayloadTrie.<String>builder()
.ignoreOverlaps()
.addKeyword("ab", "alpha:ab")
.addKeyword("cba", "alpha:cba")
.addKeyword("ababc", "alpha:ababc")
.build();
PayloadTrie<String> trie = PayloadTrie.<String>builder().removeOverlaps().addKeyword("ab", "alpha:ab")
.addKeyword("cba", "alpha:cba").addKeyword("ababc", "alpha:ababc").build();
assertTrue(trie.containsMatch("ababcbab"));
}
@Test
public void startOfChurchillSpeech() {
PayloadTrie<String> trie = PayloadTrie.<String>builder()
.ignoreOverlaps()
.addKeyword("T")
.addKeyword("u")
.addKeyword("ur")
.addKeyword("r")
.addKeyword("urn")
.addKeyword("ni")
.addKeyword("i")
.addKeyword("in")
.addKeyword("n")
.addKeyword("urning")
.build();
PayloadTrie<String> trie = PayloadTrie.<String>builder().removeOverlaps().addKeyword("T").addKeyword("u").addKeyword("ur")
.addKeyword("r").addKeyword("urn").addKeyword("ni").addKeyword("i").addKeyword("in").addKeyword("n")
.addKeyword("urning").build();
Collection<PayloadEmit<String>> emits = trie.parseText("Turning");
assertEquals(2, emits.size());
}
@Test
public void partialMatch() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().onlyWholeWords().addKeyword("sugar", "food:sugar").build();
Collection<PayloadEmit<String>> emits = trie.parseText("sugarcane sugarcane sugar canesugar"); // left, middle, right test
assertEquals(1, emits.size()); // Match must not be made
checkEmit(emits.iterator().next(), 20, 24, "sugar", "food:sugar");
}
@Test
public void partialMatchFirstMatch() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().onlyWholeWords().addKeyword("sugar", "food:sugar").build();
PayloadEmit<String> firstMatch = trie.firstMatch("sugarcane sugarcane sugar canesugar"); // left, middle, right test
checkEmit(firstMatch, 20, 24, "sugar", "food:sugar");
}
@Test
public void tokenizeFullSentence() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeywords(GREEK_LETTERS_WITH_PAYLOADS).build();
Collection<PayloadToken<String>> tokens = trie.tokenize("Hear: Alpha team first, Beta from the rear, Gamma in reserve");
assertEquals(7, tokens.size());
@ -420,12 +347,11 @@ public class PayloadTrieTest {
assertEquals(" in reserve", tokensIt.next().getFragment());
}
// @see https://github.com/robert-bor/aho-corasick/issues/5
@Test
public void testStringIndexOutOfBoundsException() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreCase().onlyWholeWords().addKeywords(UNICODE_WITH_PAYLOADS).build();
PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreCase().onlyWholeWords().addKeywords(UNICODE_WITH_PAYLOADS)
.build();
Collection<PayloadEmit<String>> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ");
assertEquals(4, emits.size()); // Match must not be made
Iterator<PayloadEmit<String>> it = emits.iterator();
@ -436,10 +362,8 @@ public class PayloadTrieTest {
checkEmit(it.next(), 19, 23, "börkü", "uni:börkü");
}
@Test
public void testIgnoreCase() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreCase().addKeywords(UNICODE_WITH_PAYLOADS).build();
Collection<PayloadEmit<String>> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ");
assertEquals(4, emits.size()); // Match must not be made
@ -451,75 +375,65 @@ public class PayloadTrieTest {
checkEmit(it.next(), 19, 23, "börkü", "uni:börkü");
}
/**
 * A case-insensitive trie is expected to report the mixed-case "TurninG" at
 * offsets 0-6 as its first match, emitted under the keyword "turning".
 */
@Test
public void testIgnoreCaseFirstMatch() {
    final PayloadTrie<String> caseInsensitive = PayloadTrie.<String>builder()
            .ignoreCase()
            .addKeywords(UNICODE_WITH_PAYLOADS)
            .build();

    final PayloadEmit<String> first = caseInsensitive.firstMatch("TurninG OnCe AgAiN BÖRKÜ");

    checkEmit(first, 0, 6, "turning", "uni:turning");
}
/**
 * Tokenizing "Alpha Beta Gamma" is expected to yield five tokens.
 */
@Test
public void tokenizeTokensInSequence() {
    final PayloadTrie<String> greek = PayloadTrie.<String>builder()
            .addKeywords(GREEK_LETTERS_WITH_PAYLOADS)
            .build();

    final Collection<PayloadToken<String>> pieces = greek.tokenize("Alpha Beta Gamma");

    assertEquals(5, pieces.size());
}
// @see https://github.com/robert-bor/aho-corasick/issues/7
@Test
public void testZeroLength() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreOverlaps().onlyWholeWords().ignoreCase().addKeyword("").build();
PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreOverlaps().onlyWholeWords().ignoreCase().addKeyword("")
.build();
trie.tokenize(
"Try a natural lip and subtle bronzer to keep all the focus on those big bright eyes with NARS Eyeshadow Duo in Rated R And the winner is... Boots No7 Advanced Renewal Anti-ageing Glycolic Peel Kit ($25 amazon.com) won most-appealing peel.");
}
// @see https://github.com/robert-bor/aho-corasick/issues/8
@Test
public void testUnicode1() {
String target = "LİKE THIS"; // The second character ('İ') is Unicode, which was read by AC as a 2-byte char
assertEquals("THIS", target.substring(5, 9)); // Java does it the right way
PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreCase().onlyWholeWords().addKeyword("this", "pronon:this").build();
PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreCase().onlyWholeWords().addKeyword("this", "pronon:this")
.build();
Collection<PayloadEmit<String>> emits = trie.parseText(target);
assertEquals(1, emits.size());
Iterator<PayloadEmit<String>> it = emits.iterator();
checkEmit(it.next(), 5, 8, "this", "pronon:this");
}
// @see https://github.com/robert-bor/aho-corasick/issues/8
@Test
public void testUnicode2() {
String target = "LİKE THIS"; // The second character ('İ') is Unicode, which was read by AC as a 2-byte char
PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreCase().onlyWholeWords().addKeyword("this", "pronon:this").build();
PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreCase().onlyWholeWords().addKeyword("this", "pronon:this")
.build();
assertEquals("THIS", target.substring(5, 9)); // Java does it the right way
PayloadEmit<String> firstMatch = trie.firstMatch(target);
checkEmit(firstMatch, 5, 8, "this", "pronon:this");
}
@Test
public void testPartialMatchWhiteSpaces() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().onlyWholeWordsWhiteSpaceSeparated().addKeyword("#sugar-123", "sugar").build();
PayloadTrie<String> trie = PayloadTrie.<String>builder().onlyWholeWordsWhiteSpaceSeparated()
.addKeyword("#sugar-123", "sugar").build();
Collection<PayloadEmit<String>> emits = trie.parseText("#sugar-123 #sugar-1234"); // left, middle, right test
assertEquals(1, emits.size()); // Match must not be made
checkEmit(emits.iterator().next(), 0, 9, "#sugar-123", "sugar");
}
@Test
public void testLargeString() {
final int interval = 100;
final int textSize = 1000000;
final String keyword = FOOD[1];
@ -535,73 +449,62 @@ public class PayloadTrieTest {
assertEquals(textSize / interval, emits.size());
}
/**
* Generates a random sequence of ASCII numbers.
*
* @param count The number of numbers to generate.
* @return A character sequence filled with random digits.
*/
private StringBuilder randomNumbers(int count) {
final StringBuilder sb = new StringBuilder(count);
@Test
public void test_containsMatchWithCaseInsensitive() {
while (--count > 0) {
sb.append(randomInt(0, 10));
}
PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreCase().addKeyword("foo", "bar").build();
assertTrue(trie.containsMatch("FOOBAR"));
assertFalse(trie.containsMatch("FO!?AR"));
return sb;
}
// @see https://github.com/robert-bor/aho-corasick/issues/85
@Test
public void test_wholeWords() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword("foo", "bar").onlyWholeWords().build();
// access via PayloadTrie.parseText(CharSequence)
Collection<PayloadEmit<String>> result1 = trie.parseText("foobar");
// access via PayloadTrie.parseText(CharSequence, PayloadEmitHandler<String>)
Collection<PayloadEmit<String>> result2 = new LinkedList<>();
trie.parseText("foobar", result2::add);
assertTrue(result1.isEmpty());
assertEquals(result1, result2);
/**
* Injects keywords into a string builder.
*
* @param source Should contain a bunch of random data that cannot match any
* keyword.
* @param keyword A keyword to inject repeatedly in the text.
* @param interval How often to inject the keyword.
*/
private void injectKeyword(final StringBuilder source, final String keyword, final int interval) {
final int length = source.length();
for (int i = 0; i < length; i += interval) {
source.replace(i, i + keyword.length(), keyword);
}
}
// @see https://github.com/robert-bor/aho-corasick/issues/85
@Test
public void test_wholeWordsWhiteSpaceSeparated() {
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword("foo", "bar").onlyWholeWordsWhiteSpaceSeparated().build();
// access via PayloadTrie.parseText(CharSequence)
Collection<PayloadEmit<String>> result1 = trie.parseText("foo#bar");
// access via PayloadTrie.parseText(CharSequence, PayloadEmitHandler<String>)
Collection<PayloadEmit<String>> result2 = new LinkedList<>();
trie.parseText("foo#bar", result2::add);
assertTrue(result1.isEmpty());
assertEquals(result1, result2);
private int randomInt(final int min, final int max) {
return ThreadLocalRandom.current().nextInt(min, max);
}
private void checkEmit(final PayloadEmit<Food> next, final int expectedStart, final int expectedEnd, final String expectedKeyword, final Food expectedPayload) {
private void checkEmit(PayloadEmit<Food> next, int expectedStart, int expectedEnd, String expectedKeyword,
Food expectedPayload) {
assertEquals("Start of emit should have been " + expectedStart, expectedStart, next.getStart());
assertEquals("End of emit should have been " + expectedEnd, expectedEnd, next.getEnd());
assertEquals("Keyword of emit shoud be " + expectedKeyword, expectedKeyword, next.getKeyword());
assertEquals("Payload of emit shoud be " + expectedPayload, expectedPayload, next.getPayload());
}
private void checkEmit(final PayloadEmit<Integer> next, final int expectedStart, final int expectedEnd, final String expectedKeyword, final Integer expectedPayload) {
private void checkEmit(PayloadEmit<Integer> next, int expectedStart, int expectedEnd, String expectedKeyword,
Integer expectedPayload) {
assertEquals("Start of emit should have been " + expectedStart, expectedStart, next.getStart());
assertEquals("End of emit should have been " + expectedEnd, expectedEnd, next.getEnd());
assertEquals("Keyword of emit shoud be " + expectedKeyword, expectedKeyword, next.getKeyword());
assertEquals("Payload of emit shoud be " + expectedPayload, expectedPayload, next.getPayload());
}
private void checkEmit(final PayloadEmit<String> next, final int expectedStart, final int expectedEnd, final String expectedKeyword, final String expectedPayload) {
private void checkEmit(PayloadEmit<String> next, int expectedStart, int expectedEnd, String expectedKeyword,
String expectedPayload) {
assertEquals("Start of emit should have been " + expectedStart, expectedStart, next.getStart());
assertEquals("End of emit should have been " + expectedEnd, expectedEnd, next.getEnd());
assertEquals("Keyword of emit shoud be " + expectedKeyword, expectedKeyword, next.getKeyword());
assertEquals("Payload of emit shoud be " + expectedPayload, expectedPayload, next.getPayload());
}
}

View File

@ -2,75 +2,23 @@ package org.ahocorasick.trie;
import org.junit.Test;
import java.util.Collection;
import java.util.Collections;
import static org.junit.Assert.*;
import static junit.framework.Assert.assertEquals;
public class StateTest {
@Test
public void test_constructSequenceOfCharacters() {
final State rootState = new State();
rootState.addState('a').addState('b').addState('c');
public void constructSequenceOfCharacters() {
State rootState = new State();
rootState
.addState('a')
.addState('b')
.addState('c');
State currentState = rootState.nextState('a');
assertEquals(1, currentState.getDepth());
currentState = currentState.nextState('b');
assertEquals(2, currentState.getDepth());
currentState = currentState.nextState('c');
assertEquals(3, currentState.getDepth());
currentState = currentState.nextState('F');
assertNull(currentState);
}
/**
 * After adding the keyword "foo" to a fresh root, the root is expected to
 * expose exactly one state, equal to the one reached via 'f'.
 */
@Test
public void test_getStates() {
    final State root = new State();
    root.addState("foo");

    final State reachedViaF = root.nextState('f');
    final Collection<State> children = root.getStates();

    assertEquals(1, children.size());
    final State onlyChild = children.iterator().next();
    assertEquals(reachedViaF, onlyChild);
}
/**
 * After adding the keyword "foo" to a fresh root, the root is expected to
 * expose exactly one outgoing transition, labelled 'f'.
 */
@Test
public void test_getTransitions() {
    final State rootState = new State();
    rootState.addState("foo");

    // The previous version also looked up nextState('f') into a local that
    // was never read; that dead assignment has been removed.
    final Collection<Character> transitions = rootState.getTransitions();

    assertEquals(1, transitions.size());
    assertEquals(Character.valueOf('f'), transitions.iterator().next());
}
/**
 * A failure state assigned with setFailure is expected to be returned
 * unchanged by getFailure.
 */
@Test
public void test_getFailure() {
    final State root = new State();
    final State fallback = new State();

    root.setFailure(fallback);

    final State reported = root.getFailure();
    assertEquals(fallback, reported);
}
/**
 * An emit registered on the state reached via 'a' is expected to be the only
 * value reported by that state's emit().
 */
@Test
public void test_checkEmits() {
    final State root = new State();
    root.addState('a').addEmit(Collections.singleton("tag"));

    final Collection<String> emitted = root.nextState('a').emit();

    assertEquals(1, emitted.size());
    final String first = emitted.iterator().next();
    assertEquals("tag", first);
}
}

View File

@ -1,47 +0,0 @@
package org.ahocorasick.trie;
import static java.util.concurrent.ThreadLocalRandom.current;
/**
 * Contains functionality common to tests.
 */
public class TestHelper {

    /**
     * Injects keywords into a string builder.
     *
     * <p>Starting at index 0 and stepping by {@code interval}, the characters
     * at {@code [i, i + keyword.length())} are replaced with the keyword.
     *
     * @param source   Should contain a bunch of random data that cannot match
     *                 any keyword. Modified in place.
     * @param keyword  A keyword to inject repeatedly in the text.
     * @param interval How often to inject the keyword; must be positive.
     * @throws IllegalArgumentException if {@code interval} is not positive.
     */
    @SuppressWarnings("SameParameterValue")
    static void injectKeyword(final StringBuilder source, final String keyword, final int interval) {
        // A zero interval would never advance the loop below; a negative one
        // would step to a negative index.
        if (interval <= 0) {
            throw new IllegalArgumentException("interval must be positive: " + interval);
        }

        // The length is captured once so the number of injections is fixed by
        // the size of the original text.
        final int length = source.length();

        for (int i = 0; i < length; i += interval) {
            source.replace(i, i + keyword.length(), keyword);
        }
    }

    /**
     * Generates a random sequence of ASCII numbers.
     *
     * @param count The number of numbers to generate.
     * @return A character sequence filled with exactly {@code count} random
     * digits.
     */
    @SuppressWarnings("SameParameterValue")
    public static StringBuilder randomNumbers(int count) {
        final StringBuilder sb = new StringBuilder(count);

        // Iterate exactly count times; the former "--count > 0" loop stopped
        // one digit short.
        for (int i = 0; i < count; i++) {
            sb.append(current().nextInt(0, 10));
        }

        return sb;
    }
}

View File

@ -1,164 +1,120 @@
package org.ahocorasick.trie;
import static junit.framework.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.util.*;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.atomic.AtomicInteger;
import org.ahocorasick.trie.handler.AbstractStatefulEmitHandler;
import org.ahocorasick.trie.handler.EmitHandler;
import org.ahocorasick.trie.handler.StatefulEmitHandler;
import org.junit.Test;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import static java.lang.String.format;
import static org.ahocorasick.trie.TestHelper.injectKeyword;
import static org.ahocorasick.trie.TestHelper.randomNumbers;
import static org.ahocorasick.trie.Trie.builder;
import static org.junit.Assert.*;
/**
* Test the {@link Trie} class functionality.
*/
public class TrieTest {
private final static String[] ALPHABET = new String[]{
"abc", "bcd", "cde"
};
private final static String[] ALPHABET = new String[]{"abc", "bcd", "cde"};
private final static String[] PRONOUNS = new String[]{
"hers", "his", "she", "he"
};
private final static String[] PRONOUNS = new String[]{"hers", "his", "she", "he"};
private final static String[] FOOD = new String[]{
"veal", "cauliflower", "broccoli", "tomatoes"
};
private final static String[] FOOD = new String[]{"veal", "cauliflower", "broccoli", "tomatoes"};
private final static String[] GREEK_LETTERS = new String[]{"Alpha", "Beta", "Gamma"};
private final static String[] UNICODE = new String[]{"turning", "once", "again", "börkü"};
/**
 * Builds a trie containing the single given keyword, with no other builder
 * options set.
 *
 * @param keyword The only keyword to add.
 * @return The built trie.
 */
private static Trie trie(final String keyword) {
return Trie.builder().addKeyword(keyword).build();
}
/**
 * Builds a trie containing the single given keyword with the builder's
 * ignoreWhiteSpace() option enabled.
 *
 * @param keyword The only keyword to add.
 * @return The built trie.
 */
private static Trie trieIgnoreWhiteSpace(final String keyword) {
return Trie.builder().addKeyword(keyword).ignoreWhiteSpace().build();
}
/**
 * Builds a trie containing all of the given keywords.
 *
 * NOTE(review): despite its name, this overload also calls ignoreWhiteSpace(),
 * so its body is the same as trieIgnoreWhiteSpace(String[]). At least one test
 * (matching "bc d" as "bcd") appears to rely on that; confirm whether it is
 * intentional before changing it.
 *
 * @param keywords The keywords to add.
 * @return The built trie.
 */
private static Trie trie(final String[] keywords) {
return Trie.builder().addKeywords(keywords).ignoreWhiteSpace().build();
}
/**
 * Builds a trie containing all of the given keywords with the builder's
 * ignoreWhiteSpace() option enabled.
 *
 * @param keywords The keywords to add.
 * @return The built trie.
 */
private static Trie trieIgnoreWhiteSpace(final String[] keywords) {
return Trie.builder().addKeywords(keywords).ignoreWhiteSpace().build();
}
private final static String[] GREEK_LETTERS = new String[]{
"Alpha", "Beta", "Gamma"
};
private final static String[] UNICODE = new String[]{
"turning", "once", "again", "börkü"
};
@Test
public void test_KeywordAndTextAreTheSame() {
final Trie trie = trie(ALPHABET[0]);
final Collection<Emit> emits = trie.parseText(ALPHABET[0]);
final Iterator<Emit> iterator = emits.iterator();
public void keywordAndTextAreTheSame() {
Trie trie = Trie.builder()
.addKeyword(ALPHABET[0])
.build();
Collection<Emit> emits = trie.parseText(ALPHABET[0]);
Iterator<Emit> iterator = emits.iterator();
checkEmit(iterator.next(), 0, 2, ALPHABET[0]);
}
/**
 * With white space ignored, each three-letter keyword in ALPHABET is expected
 * to be found in "a b c d e", spanning five characters of the spaced text.
 */
@Test
public void test_ignoringWhitespace_KeywordAndTextAreTheSame() {
    final Iterator<Emit> found = trieIgnoreWhiteSpace(ALPHABET).parseText("a b c d e").iterator();

    // Keyword k starts two characters after keyword k-1 (a letter plus a
    // space) and ends four characters after its own start: (0,4), (2,6), (4,8).
    for (int k = 0; k < 3; k++) {
        final int start = 2 * k;
        checkEmit(found.next(), start, start + 4, ALPHABET[k]);
    }
}
@Test
public void test_KeywordAndTextAreTheSameFirstMatch() {
final Trie trie = trie(ALPHABET[0]);
final Emit firstMatch = trie.firstMatch(ALPHABET[0]);
public void keywordAndTextAreTheSameFirstMatch() {
Trie trie = Trie.builder()
.addKeyword(ALPHABET[0])
.build();
Emit firstMatch = trie.firstMatch(ALPHABET[0]);
checkEmit(firstMatch, 0, 2, ALPHABET[0]);
}
@Test
public void test_TextIsLongerThanKeyword() {
final Trie trie = trie(ALPHABET[0]);
final Collection<Emit> emits = trie.parseText(" " + ALPHABET[0]);
final Iterator<Emit> iterator = emits.iterator();
public void textIsLongerThanKeyword() {
Trie trie = Trie.builder()
.addKeyword(ALPHABET[0])
.build();
Collection<Emit> emits = trie.parseText(" " + ALPHABET[0]);
Iterator<Emit> iterator = emits.iterator();
checkEmit(iterator.next(), 1, 3, ALPHABET[0]);
}
@Test
public void test_TextIsLongerThanKeywordFirstMatch() {
final Trie trie = trie(ALPHABET[0]);
final Emit firstMatch = trie.firstMatch(" " + ALPHABET[0]);
public void textIsLongerThanKeywordFirstMatch() {
Trie trie = Trie.builder()
.addKeyword(ALPHABET[0])
.build();
Emit firstMatch = trie.firstMatch(" " + ALPHABET[0]);
checkEmit(firstMatch, 1, 3, ALPHABET[0]);
}
@Test
public void test_VariousKeywordsOneMatch() {
final Trie trie = trie(ALPHABET);
final Collection<Emit> emits = trie.parseText("bcd");
final Iterator<Emit> iterator = emits.iterator();
public void variousKeywordsOneMatch() {
Trie trie = Trie.builder()
.addKeywords(ALPHABET)
.build();
Collection<Emit> emits = trie.parseText("bcd");
Iterator<Emit> iterator = emits.iterator();
checkEmit(iterator.next(), 0, 2, "bcd");
}
@Test
public void test_VariousKeywordsFirstMatch() {
final Trie trie = trie(ALPHABET);
final Emit firstMatch = trie.firstMatch("bc d");
checkEmit(firstMatch, 0, 3, "bcd");
public void variousKeywordsFirstMatch() {
Trie trie = Trie.builder()
.addKeywords(ALPHABET)
.build();
Emit firstMatch = trie.firstMatch("bcd");
checkEmit(firstMatch, 0, 2, "bcd");
}
/**
 * Passing {@code null} text to firstMatch is expected to raise an
 * AssertionError.
 */
@Test(expected = AssertionError.class)
public void test_NullInputTextFirstMatch() {
    final Trie trie = trie(ALPHABET);

    // Deliberately no assertion on the return value: a failing JUnit assertion
    // also throws AssertionError, which would satisfy "expected" and let the
    // test pass even if firstMatch(null) returned a value instead of throwing.
    trie.firstMatch(null);
}
@Test
public void test_UshersTestAndStopOnHit() {
final Trie trie = Trie.builder().addKeywords(PRONOUNS).stopOnHit().build();
final Collection<Emit> emits = trie.parseText("ushers");
public void ushersTestAndStopOnHit() {
Trie trie = Trie.builder()
.addKeywords(PRONOUNS)
.stopOnHit()
.build();
Collection<Emit> emits = trie.parseText("ushers");
assertEquals(1, emits.size()); // she @ 3, he @ 3, hers @ 5
final Iterator<Emit> iterator = emits.iterator();
Iterator<Emit> iterator = emits.iterator();
checkEmit(iterator.next(), 2, 3, "he");
}
@Test
public void test_UshersTestStopOnHitSkipOne() {
final Trie trie = Trie.builder().addKeywords(PRONOUNS).stopOnHit().build();
final StatefulEmitHandler testEmitHandler = new AbstractStatefulEmitHandler() {
public void ushersTestStopOnHitSkipOne() {
Trie trie = Trie.builder()
.addKeywords(PRONOUNS)
.stopOnHit()
.build();
StatefulEmitHandler testEmitHandler = new AbstractStatefulEmitHandler() {
boolean first = true;
@Override
public boolean emit(final Emit emit) {
if (first) {
if(first) {
// return false for the first element
first = false;
return false;
@ -166,117 +122,128 @@ public class TrieTest {
addEmit(emit);
return true;
}
};
};
trie.parseText("ushers", testEmitHandler);
final Collection<Emit> emits = testEmitHandler.getEmits();
Collection<Emit> emits = testEmitHandler.getEmits();
assertEquals(1, emits.size()); // she @ 3, he @ 3, hers @ 5
final Iterator<Emit> iterator = emits.iterator();
Iterator<Emit> iterator = emits.iterator();
checkEmit(iterator.next(), 1, 3, "she");
}
@Test
public void test_UshersTest() {
final Trie trie = trie(PRONOUNS);
final Collection<Emit> emits = trie.parseText("ushers");
public void ushersTest() {
Trie trie = Trie.builder()
.addKeywords(PRONOUNS)
.build();
Collection<Emit> emits = trie.parseText("ushers");
assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5
final Iterator<Emit> iterator = emits.iterator();
Iterator<Emit> iterator = emits.iterator();
checkEmit(iterator.next(), 2, 3, "he");
checkEmit(iterator.next(), 1, 3, "she");
checkEmit(iterator.next(), 2, 5, "hers");
}
@Test
public void test_UshersTestWithCapitalKeywords() {
final Trie trie = Trie.builder().ignoreCase().addKeyword("HERS").addKeyword("HIS").addKeyword("SHE").addKeyword("HE").build();
final Collection<Emit> emits = trie.parseText("ushers");
public void ushersTestWithCapitalKeywords() {
Trie trie = Trie.builder()
.ignoreCase()
.addKeyword("HERS")
.addKeyword("HIS")
.addKeyword("SHE")
.addKeyword("HE")
.build();
Collection<Emit> emits = trie.parseText("ushers");
assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5
final Iterator<Emit> iterator = emits.iterator();
Iterator<Emit> iterator = emits.iterator();
checkEmit(iterator.next(), 2, 3, "HE");
checkEmit(iterator.next(), 1, 3, "SHE");
checkEmit(iterator.next(), 2, 5, "HERS");
}
@Test
public void test_UshersTestFirstMatch() {
final Trie trie = trie(PRONOUNS);
final Emit firstMatch = trie.firstMatch("ushers");
public void ushersTestFirstMatch() {
Trie trie = Trie.builder()
.addKeywords(PRONOUNS)
.build();
Emit firstMatch = trie.firstMatch("ushers");
checkEmit(firstMatch, 2, 3, "he");
}
@Test
public void test_UshersTestByCallback() {
public void ushersTestByCallback() {
Trie trie = Trie.builder()
.addKeywords(PRONOUNS)
.build();
final Trie trie = trie(PRONOUNS);
final List<Emit> emits = new ArrayList<>();
final EmitHandler emitHandler = emit -> {
emits.add(emit);
return true;
EmitHandler emitHandler = new EmitHandler() {
@Override
public boolean emit(Emit emit) {
emits.add(emit);
return true;
}
};
trie.parseText("ushers", emitHandler);
assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5
final Iterator<Emit> iterator = emits.iterator();
Iterator<Emit> iterator = emits.iterator();
checkEmit(iterator.next(), 2, 3, "he");
checkEmit(iterator.next(), 1, 3, "she");
checkEmit(iterator.next(), 2, 5, "hers");
}
@Test
public void test_MisleadingTest() {
final Trie trie = trie("hers");
final Collection<Emit> emits = trie.parseText("h he her hers");
final Iterator<Emit> iterator = emits.iterator();
public void misleadingTest() {
Trie trie = Trie.builder()
.addKeyword("hers")
.build();
Collection<Emit> emits = trie.parseText("h he her hers");
Iterator<Emit> iterator = emits.iterator();
checkEmit(iterator.next(), 9, 12, "hers");
}
@Test
public void test_MisleadingTestFirstMatch() {
final Trie trie = trie("hers");
final Emit firstMatch = trie.firstMatch("h he her hers");
public void misleadingTestFirstMatch() {
Trie trie = Trie.builder()
.addKeyword("hers")
.build();
Emit firstMatch = trie.firstMatch("h he her hers");
checkEmit(firstMatch, 9, 12, "hers");
}
@Test
public void test_Recipes() {
final Trie trie = trie(FOOD);
final Collection<Emit> emits = trie.parseText("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli");
final Iterator<Emit> iterator = emits.iterator();
public void recipes() {
Trie trie = Trie.builder()
.addKeywords(FOOD)
.build();
Collection<Emit> emits = trie.parseText("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli");
Iterator<Emit> iterator = emits.iterator();
checkEmit(iterator.next(), 2, 12, "cauliflower");
checkEmit(iterator.next(), 18, 25, "tomatoes");
checkEmit(iterator.next(), 40, 43, "veal");
checkEmit(iterator.next(), 51, 58, "broccoli");
}
@Test
public void test_RecipesFirstMatch() {
final Trie trie = trie(FOOD);
final Emit firstMatch = trie.firstMatch("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli");
public void recipesFirstMatch() {
Trie trie = Trie.builder()
.addKeywords(FOOD)
.build();
Emit firstMatch = trie.firstMatch("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli");
checkEmit(firstMatch, 2, 12, "cauliflower");
}
@Test
public void test_LongAndShortOverlappingMatch() {
final Trie trie = Trie.builder().addKeyword("he").addKeyword("hehehehe").build();
final Collection<Emit> emits = trie.parseText("hehehehehe");
final Iterator<Emit> iterator = emits.iterator();
public void longAndShortOverlappingMatch() {
Trie trie = Trie.builder()
.addKeyword("he")
.addKeyword("hehehehe")
.build();
Collection<Emit> emits = trie.parseText("hehehehehe");
Iterator<Emit> iterator = emits.iterator();
checkEmit(iterator.next(), 0, 1, "he");
checkEmit(iterator.next(), 2, 3, "he");
checkEmit(iterator.next(), 4, 5, "he");
@ -286,43 +253,46 @@ public class TrieTest {
checkEmit(iterator.next(), 2, 9, "hehehehe");
}
@Test
public void test_NonOverlapping() {
final Trie trie = Trie.builder().ignoreOverlaps().addKeyword("ab").addKeyword("cba").addKeyword("ababc").build();
final Collection<Emit> emits = trie.parseText("ababcbab");
public void nonOverlapping() {
Trie trie = Trie.builder().removeOverlaps()
.addKeyword("ab")
.addKeyword("cba")
.addKeyword("ababc")
.build();
Collection<Emit> emits = trie.parseText("ababcbab");
assertEquals(2, emits.size());
final Iterator<Emit> iterator = emits.iterator();
Iterator<Emit> iterator = emits.iterator();
// With overlaps: ab@1, ab@3, ababc@4, cba@6, ab@7
checkEmit(iterator.next(), 0, 4, "ababc");
checkEmit(iterator.next(), 6, 7, "ab");
}
@Test
public void test_NonOverlappingFirstMatch() {
final Trie trie = Trie.builder().ignoreOverlaps().addKeyword("ab").addKeyword("cba").addKeyword("ababc").build();
final Emit firstMatch = trie.firstMatch("ababcbab");
public void nonOverlappingFirstMatch() {
Trie trie = Trie.builder().removeOverlaps()
.addKeyword("ab")
.addKeyword("cba")
.addKeyword("ababc")
.build();
Emit firstMatch = trie.firstMatch("ababcbab");
checkEmit(firstMatch, 0, 4, "ababc");
}
@Test
public void test_ContainsMatch() {
final Trie trie = Trie.builder().ignoreOverlaps().addKeyword("ab").addKeyword("cba").addKeyword("ababc").build();
public void containsMatch() {
Trie trie = Trie.builder().removeOverlaps()
.addKeyword("ab")
.addKeyword("cba")
.addKeyword("ababc")
.build();
assertTrue(trie.containsMatch("ababcbab"));
}
@Test
public void test_StartOfChurchillSpeech() {
final Trie trie = Trie.builder()
.ignoreOverlaps()
public void startOfChurchillSpeech() {
Trie trie = Trie.builder().removeOverlaps()
.addKeyword("T")
.addKeyword("u")
.addKeyword("ur")
@ -334,40 +304,40 @@ public class TrieTest {
.addKeyword("n")
.addKeyword("urning")
.build();
final Collection<Emit> emits = trie.parseText("Turning");
Collection<Emit> emits = trie.parseText("Turning");
assertEquals(2, emits.size());
}
@Test
public void test_PartialMatch() {
final Trie trie = Trie.builder().onlyWholeWords().addKeyword("sugar").build();
final Collection<Emit> emits = trie.parseText("sugarcane sugarcane sugar canesugar"); // left, middle, right test
public void partialMatch() {
Trie trie = Trie.builder()
.onlyWholeWords()
.addKeyword("sugar")
.build();
Collection<Emit> emits = trie.parseText("sugarcane sugarcane sugar canesugar"); // left, middle, right test
assertEquals(1, emits.size()); // Match must not be made
checkEmit(emits.iterator().next(), 20, 24, "sugar");
}
@Test
public void test_PartialMatchFirstMatch() {
final Trie trie = Trie.builder().onlyWholeWords().addKeyword("sugar").build();
// left, middle, right test
final Emit firstMatch = trie.firstMatch("sugarcane sugarcane sugar canesugar");
public void partialMatchFirstMatch() {
Trie trie = Trie.builder()
.onlyWholeWords()
.addKeyword("sugar")
.build();
Emit firstMatch = trie.firstMatch("sugarcane sugarcane sugar canesugar"); // left, middle, right test
checkEmit(firstMatch, 20, 24, "sugar");
}
@Test
public void test_TokenizeFullSentence() {
final Trie trie = trie(GREEK_LETTERS);
final Collection<Token> tokens = trie.tokenize("Hear: Alpha team first, Beta from the rear, Gamma in reserve");
public void tokenizeFullSentence() {
Trie trie = Trie.builder()
.addKeywords(GREEK_LETTERS)
.build();
Collection<Token> tokens = trie.tokenize("Hear: Alpha team first, Beta from the rear, Gamma in reserve");
assertEquals(7, tokens.size());
final Iterator<Token> tokensIt = tokens.iterator();
Iterator<Token> tokensIt = tokens.iterator();
assertEquals("Hear: ", tokensIt.next().getFragment());
assertEquals("Alpha", tokensIt.next().getFragment());
assertEquals(" team first, ", tokensIt.next().getFragment());
@ -377,147 +347,104 @@ public class TrieTest {
assertEquals(" in reserve", tokensIt.next().getFragment());
}
/**
* Test boundary check with case-insensitive matches with whole words.
*/
// @see https://github.com/robert-bor/aho-corasick/issues/5
@Test
public void test_StringIndexOutOfBoundsException() {
final Trie trie = Trie.builder().ignoreCase().onlyWholeWords().addKeywords(UNICODE).build();
final Collection<Emit> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ");
public void testStringIndexOutOfBoundsException() {
Trie trie = Trie.builder().ignoreCase().onlyWholeWords()
.addKeywords(UNICODE)
.build();
Collection<Emit> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ");
assertEquals(4, emits.size()); // Match must not be made
final Iterator<Emit> it = emits.iterator();
Iterator<Emit> it = emits.iterator();
checkEmit(it.next(), 0, 6, "turning");
checkEmit(it.next(), 8, 11, "once");
checkEmit(it.next(), 13, 17, "again");
checkEmit(it.next(), 19, 23, "börkü");
}
@Test
public void test_IgnoreCase() {
final Trie trie = Trie.builder().ignoreCase().addKeywords(UNICODE).build();
final Collection<Emit> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ");
public void testIgnoreCase() {
Trie trie = Trie.builder().ignoreCase()
.addKeywords(UNICODE)
.build();
Collection<Emit> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ");
assertEquals(4, emits.size()); // Match must not be made
final Iterator<Emit> it = emits.iterator();
Iterator<Emit> it = emits.iterator();
checkEmit(it.next(), 0, 6, "turning");
checkEmit(it.next(), 8, 11, "once");
checkEmit(it.next(), 13, 17, "again");
checkEmit(it.next(), 19, 23, "börkü");
}
@Test
public void test_IgnoreCaseFirstMatch() {
final Trie trie = Trie.builder().ignoreCase().addKeywords(UNICODE).build();
final Emit firstMatch = trie.firstMatch("TurninG OnCe AgAiN BÖRKÜ");
public void testIgnoreCaseFirstMatch() {
Trie trie = Trie.builder().ignoreCase()
.addKeywords(UNICODE)
.build();
Emit firstMatch = trie.firstMatch("TurninG OnCe AgAiN BÖRKÜ");
checkEmit(firstMatch, 0, 6, "turning");
}
@Test
public void test_TokenizeTokensInSequence() {
final Trie trie = trie(GREEK_LETTERS);
final Collection<Token> tokens = trie.tokenize("Alpha Beta Gamma");
public void tokenizeTokensInSequence() {
Trie trie = Trie.builder()
.addKeywords(GREEK_LETTERS)
.build();
Collection<Token> tokens = trie.tokenize("Alpha Beta Gamma");
assertEquals(5, tokens.size());
}
/**
* Fix adding a word of size 0 ("") as a dictionary. A bug in the dictionary
* parsing code (at end of line) caused it to generate words of 0 length,
* which were being added to the trie. Removing the additional commas
* resolved the issue.
*/
// @see https://github.com/robert-bor/aho-corasick/issues/7
@Test
public void test_ZeroLength() {
final Trie trie = Trie.builder().ignoreOverlaps().onlyWholeWords().ignoreCase().addKeyword("").build();
trie.tokenize("Try a natural lip and subtle bronzer to keep all the focus on those "
+ "big bright eyes with NARS Eyeshadow Duo in Rated R And the "
+ "winner is... Boots No7 Advanced Renewal Anti-ageing Glycolic "
+ "Peel Kit ($25 amazon.com) won most-appealing peel.");
public void testZeroLength() {
Trie trie = Trie.builder().ignoreOverlaps().onlyWholeWords().ignoreCase()
.addKeyword("")
.build();
trie.tokenize("Try a natural lip and subtle bronzer to keep all the focus on those big bright eyes with NARS Eyeshadow Duo in Rated R And the winner is... Boots No7 Advanced Renewal Anti-ageing Glycolic Peel Kit ($25 amazon.com) won most-appealing peel.");
}
// @see https://github.com/robert-bor/aho-corasick/issues/8
/**
 * A keyword made largely of punctuation is expected to be found at every one
 * of its five occurrences, including when wrapped in extra braces and when
 * repeated back to back.
 */
@Test
public void test_Emit_PunctuatedKeyword_AllOffsetsFound() {
    final String keyword = "{{var}}";
    // Distance from an emit's start offset to its (inclusive) end offset.
    final int span = keyword.length() - 1;
    final String text = format("__%s__ **%s** {{%s}} %s%s", keyword, keyword, keyword, keyword, keyword);
    // Start offset of each occurrence of the keyword within the text.
    final int[] expectedStarts = {2, 14, 26, 36, 43};

    final Trie trie = builder().ignoreOverlaps().addKeyword(keyword).build();
    final Collection<Emit> emits = trie.parseText(text);

    assertEquals(5, emits.size());
    final Iterator<Emit> it = emits.iterator();
    for (final int start : expectedStarts) {
        checkEmit(it.next(), start, start + span, keyword);
    }
}
/**
* Notice the capital I with a dot. The code used to compute the offsets
* at (6, 9), which caused {@link Trie#tokenize(String)} to crash because
* 9 is past the end of the string. That character is two bytes wide, so it
* pushes the offset calculation off.
*/
@Test
public void test_Unicode1() {
// The second character ('İ') is
// Unicode, which was read by AC as a 2-byte char
final String target = "LİKE THIS";
// Java does it the right way
assertEquals("THIS", target.substring(5, 9));
final Trie trie = Trie.builder().ignoreCase().onlyWholeWords().addKeyword("this").build();
final Collection<Emit> emits = trie.parseText(target);
public void testUnicode1() {
String target = "LİKE THIS"; // The second character ('İ') is Unicode, which was read by AC as a 2-byte char
assertEquals("THIS", target.substring(5, 9)); // Java does it the right way
Trie trie = Trie.builder().ignoreCase().onlyWholeWords()
.addKeyword("this")
.build();
Collection<Emit> emits = trie.parseText(target);
assertEquals(1, emits.size());
final Iterator<Emit> it = emits.iterator();
Iterator<Emit> it = emits.iterator();
checkEmit(it.next(), 5, 8, "this");
}
/**
* Notice the capital I with a dot. The code used to compute the offsets
* at (6, 9), which caused {@link Trie#tokenize(String)} to crash because
* 9 is past the end of the string. That character is two bytes wide, so it
* pushes the offset calculation off.
*/
// @see https://github.com/robert-bor/aho-corasick/issues/8
@Test
public void test_Unicode2() {
// The second character ('İ') is
// Unicode, which was read by AC as a 2-byte char
final String target = "LİKE THIS";
final Trie trie = Trie.builder().ignoreCase().onlyWholeWords().addKeyword("this").build();
// Java does it the right way
assertEquals("THIS", target.substring(5, 9));
final Emit firstMatch = trie.firstMatch(target);
public void testUnicode2() {
String target = "LİKE THIS"; // The second character ('İ') is Unicode, which was read by AC as a 2-byte char
Trie trie = Trie.builder()
.ignoreCase()
.onlyWholeWords()
.addKeyword("this")
.build();
assertEquals("THIS", target.substring(5, 9)); // Java does it the right way
Emit firstMatch = trie.firstMatch(target);
checkEmit(firstMatch, 5, 8, "this");
}
@Test
public void test_PartialMatchWhiteSpaces() {
final Trie trie = Trie.builder().onlyWholeWordsWhiteSpaceSeparated().addKeyword("#sugar-123").build();
final Collection<Emit> emits = trie.parseText("#sugar-123 #sugar-1234"); // left, middle, right test
public void testPartialMatchWhiteSpaces() {
Trie trie = Trie.builder()
.onlyWholeWordsWhiteSpaceSeparated()
.addKeyword("#sugar-123")
.build();
Collection<Emit> emits = trie.parseText("#sugar-123 #sugar-1234"); // left, middle, right test
assertEquals(1, emits.size()); // Match must not be made
checkEmit(emits.iterator().next(), 0, 9, "#sugar-123");
}
@Test
public void test_LargeString() {
public void testLargeString() {
final int interval = 100;
final int textSize = 1000000;
final String keyword = FOOD[1];
@ -525,51 +452,65 @@ public class TrieTest {
injectKeyword(text, keyword, interval);
final Trie trie = Trie.builder().onlyWholeWords().addKeyword(keyword).build();
Trie trie = Trie.builder()
.onlyWholeWords()
.addKeyword(keyword)
.build();
final Collection<Emit> emits = trie.parseText(text);
assertEquals(textSize / interval, emits.size());
}
@Test
public void test_UnicodeIssueBug39ReportedByHumanzz() {
// Problem: "İ".length => 1, "İ".toLowerCase().length => 2. This causes
// all sorts of unexpected behaviors
// and bugs where the Emit will have a size different from the original
// string.
// Soln: As in issue #8, convert at character level Character.toLowerCase
// ('İ') => 'i' + make sure
public void unicodeIssueBug39ReportedByHumanzz(){
// Problem: "İ".length => 1, "İ".toLowerCase().length => 2. This causes all sorts of unexpected behaviors
// and bugs where the Emit will have a size different from the original string.
// Soln: As in issue #8, convert at character level Character.toLowerCase('İ') => 'i' + make sure
// that emit gets the properly cased keyword.
final String upperLengthOne = "İnt";
final Trie trie = Trie.builder().ignoreCase().onlyWholeWords().addKeyword(upperLengthOne).build();
final Collection<Emit> emits = trie.parseText("İnt is good");
String upperLengthOne = "İnt";
Trie trie = Trie.builder()
.ignoreCase()
.onlyWholeWords()
.addKeyword(upperLengthOne)
.build();
Collection<Emit> emits = trie.parseText("İnt is good");
assertEquals(1, emits.size());
checkEmit(emits.iterator().next(), 0, 2, upperLengthOne);
}
@Test(timeout = 30_000)
public void test_ParallelSearch() throws InterruptedException {
@Test(timeout=30_000)
public void testParallelSearch() throws InterruptedException {
final int interval = 100;
final int textSize = 1000000;
final String keyword = FOOD[1];
final StringBuilder matchingText = randomNumbers(textSize);
injectKeyword(matchingText, keyword, interval);
final StringBuilder nonMatchingText = randomNumbers(textSize);
injectKeyword(nonMatchingText, keyword.substring(0, keyword.length() - 1), interval);
injectKeyword(nonMatchingText, keyword.substring(0, keyword.length()-1), interval);
final Trie trie = Trie.builder().onlyWholeWords().addKeyword(keyword).build();
final Trie trie = Trie.builder()
.onlyWholeWords()
.addKeyword(keyword)
.build();
final AtomicInteger matchCount = new AtomicInteger(0);
final Runnable matchingTask = () -> matchCount.set(trie.parseText(matchingText).size());
Runnable matchingTask = new Runnable() {
@Override
public void run() {
matchCount.set(trie.parseText(matchingText).size());
}
};
final AtomicInteger nonMatchCount = new AtomicInteger(0);
final Runnable nonMatchingTask = () -> nonMatchCount.set(trie.parseText(nonMatchingText).size());
final Thread matchingThread = new Thread(matchingTask);
final Thread nonMatchingThread = new Thread(nonMatchingTask);
Runnable nonMatchingTask = new Runnable() {
@Override
public void run() {
nonMatchCount.set(trie.parseText(nonMatchingText).size());
}
};
Thread matchingThread = new Thread(matchingTask);
Thread nonMatchingThread = new Thread(nonMatchingTask);
matchingThread.start();
nonMatchingThread.start();
matchingThread.join();
@ -579,12 +520,47 @@ public class TrieTest {
assertEquals(0, nonMatchCount.get());
}
/**
 * Generates a random sequence of ASCII digits.
 *
 * @param count The number of digits to generate; values below one yield
 *              an empty sequence.
 * @return A character sequence holding exactly {@code count} random
 * digits, each in the range 0 through 9.
 */
private StringBuilder randomNumbers(int count) {
    final StringBuilder sb = new StringBuilder(Math.max(count, 0));
    // Count upwards so that exactly `count` digits are appended; the former
    // `while (--count > 0)` loop stopped one digit short.
    for (int i = 0; i < count; i++) {
        sb.append(randomInt(0, 10));
    }
    return sb;
}
/**
 * Overwrites the given text with a keyword at regular offsets.
 *
 * @param source   Text to modify in place; should consist of filler data
 *                 that cannot match any keyword.
 * @param keyword  The keyword written at each injection point.
 * @param interval Distance, in characters, between the start offsets of
 *                 consecutive injections.
 */
private void injectKeyword(
        final StringBuilder source,
        final String keyword,
        final int interval) {
    // Both lengths are captured up front: the injection points are based on
    // the size of the text before any replacement takes place.
    final int originalLength = source.length();
    final int keywordLength = keyword.length();

    int offset = 0;
    while (offset < originalLength) {
        source.replace(offset, offset + keywordLength, keyword);
        offset += interval;
    }
}
/**
 * Draws a pseudo-random integer from the current thread's generator.
 *
 * @param min The smallest value that may be returned (inclusive).
 * @param max The upper bound on the returned value (exclusive).
 * @return A value {@code v} such that {@code min <= v < max}.
 */
private int randomInt(final int min, final int max) {
    final ThreadLocalRandom generator = ThreadLocalRandom.current();
    return generator.nextInt(min, max);
}
/**
 * Asserts that an emit covers the expected span and carries the expected
 * keyword.
 *
 * @param next            The emit to verify.
 * @param expectedStart   Expected value of {@code next.getStart()}.
 * @param expectedEnd     Expected value of {@code next.getEnd()}.
 * @param expectedKeyword Expected value of {@code next.getKeyword()}.
 */
private void checkEmit(Emit next, int expectedStart, int expectedEnd, String expectedKeyword) {
    // Report a missing emit as a readable test failure rather than letting
    // the getters below raise a NullPointerException.
    if (next == null) {
        throw new AssertionError("Expected an emit for keyword " + expectedKeyword + " but got null");
    }
    assertEquals("Start of emit should have been " + expectedStart, expectedStart, next.getStart());
    assertEquals("End of emit should have been " + expectedEnd, expectedEnd, next.getEnd());
    assertEquals("Keyword of emit should have been " + expectedKeyword, expectedKeyword, next.getKeyword());
}
}

View File

@ -0,0 +1,65 @@
package org.ahocorasick.util;
import java.util.ArrayList;
import java.util.List;
import org.ahocorasick.util.ListElementRemoval.RemoveElementPredicate;
import org.junit.Test;
import junit.framework.Assert;
import static java.util.Arrays.asList;
/**
 * Exercises {@code ListElementRemoval.removeIf} with predicates that remove
 * none, all, and some of a list's elements.
 */
public class ListElementRemovalTest {

    /** A predicate that rejects every element must leave the list as it was. */
    @Test
    public void removeNone() {
        List<String> list = new ArrayList<>(asList("a", "b", "c"));
        RemoveElementPredicate<String> matchNothing = new RemoveElementPredicate<String>() {
            @Override
            public boolean remove(String t) {
                return false;
            }
        };
        ListElementRemoval.removeIf(list, matchNothing);
        Assert.assertEquals(3, list.size());
        // Size alone would not catch reordered or substituted elements.
        Assert.assertEquals(asList("a", "b", "c"), list);
    }

    /** A predicate that accepts every element must empty the list. */
    @Test
    public void removeAll() {
        List<String> list = new ArrayList<>(asList("a", "b", "c"));
        RemoveElementPredicate<String> matchEverything = new RemoveElementPredicate<String>() {
            @Override
            public boolean remove(String t) {
                return true;
            }
        };
        ListElementRemoval.removeIf(list, matchEverything);
        Assert.assertEquals(0, list.size());
    }

    /** Only the elements accepted by the predicate are removed; the rest remain. */
    @Test
    public void removeSome() {
        List<String> list = new ArrayList<>(asList("a", "b", "c"));
        RemoveElementPredicate<String> matchFirstAndLast = new RemoveElementPredicate<String>() {
            @Override
            public boolean remove(String t) {
                return "a".equals(t) || "c".equals(t);
            }
        };
        ListElementRemoval.removeIf(list, matchFirstAndLast);
        Assert.assertEquals(1, list.size());
        Assert.assertEquals("b", list.get(0));
    }
}