Compare commits
1 Commits
master
...
issue40_ma
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b1c1ad4d71 |
3
.gitignore
vendored
3
.gitignore
vendored
@ -2,5 +2,4 @@
|
|||||||
*.iml
|
*.iml
|
||||||
src/main/java/Main.java
|
src/main/java/Main.java
|
||||||
*.txt
|
*.txt
|
||||||
docs
|
docs
|
||||||
/target/
|
|
||||||
@ -1,21 +0,0 @@
|
|||||||
include:
|
|
||||||
- project: 'gitlab/gitlab'
|
|
||||||
ref: 'main'
|
|
||||||
file: 'ci-templates/gradle_java.yml'
|
|
||||||
|
|
||||||
|
|
||||||
deploy:
|
|
||||||
stage: deploy
|
|
||||||
tags:
|
|
||||||
- dind
|
|
||||||
script:
|
|
||||||
- echo "Building with gradle version ${BUILDVERSION}"
|
|
||||||
- gradle -Pversion=${BUILDVERSION} publish
|
|
||||||
- echo "BUILDVERSION=$BUILDVERSION" >> version.env
|
|
||||||
artifacts:
|
|
||||||
reports:
|
|
||||||
dotenv: version.env
|
|
||||||
rules:
|
|
||||||
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
|
|
||||||
- if: $CI_COMMIT_BRANCH =~ /^release/
|
|
||||||
- if: $CI_COMMIT_TAG
|
|
||||||
202
LICENSE.md
202
LICENSE.md
@ -1,202 +0,0 @@
|
|||||||
|
|
||||||
Apache License
|
|
||||||
Version 2.0, January 2004
|
|
||||||
http://www.apache.org/licenses/
|
|
||||||
|
|
||||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
|
||||||
|
|
||||||
1. Definitions.
|
|
||||||
|
|
||||||
"License" shall mean the terms and conditions for use, reproduction,
|
|
||||||
and distribution as defined by Sections 1 through 9 of this document.
|
|
||||||
|
|
||||||
"Licensor" shall mean the copyright owner or entity authorized by
|
|
||||||
the copyright owner that is granting the License.
|
|
||||||
|
|
||||||
"Legal Entity" shall mean the union of the acting entity and all
|
|
||||||
other entities that control, are controlled by, or are under common
|
|
||||||
control with that entity. For the purposes of this definition,
|
|
||||||
"control" means (i) the power, direct or indirect, to cause the
|
|
||||||
direction or management of such entity, whether by contract or
|
|
||||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
|
||||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
|
||||||
|
|
||||||
"You" (or "Your") shall mean an individual or Legal Entity
|
|
||||||
exercising permissions granted by this License.
|
|
||||||
|
|
||||||
"Source" form shall mean the preferred form for making modifications,
|
|
||||||
including but not limited to software source code, documentation
|
|
||||||
source, and configuration files.
|
|
||||||
|
|
||||||
"Object" form shall mean any form resulting from mechanical
|
|
||||||
transformation or translation of a Source form, including but
|
|
||||||
not limited to compiled object code, generated documentation,
|
|
||||||
and conversions to other media types.
|
|
||||||
|
|
||||||
"Work" shall mean the work of authorship, whether in Source or
|
|
||||||
Object form, made available under the License, as indicated by a
|
|
||||||
copyright notice that is included in or attached to the work
|
|
||||||
(an example is provided in the Appendix below).
|
|
||||||
|
|
||||||
"Derivative Works" shall mean any work, whether in Source or Object
|
|
||||||
form, that is based on (or derived from) the Work and for which the
|
|
||||||
editorial revisions, annotations, elaborations, or other modifications
|
|
||||||
represent, as a whole, an original work of authorship. For the purposes
|
|
||||||
of this License, Derivative Works shall not include works that remain
|
|
||||||
separable from, or merely link (or bind by name) to the interfaces of,
|
|
||||||
the Work and Derivative Works thereof.
|
|
||||||
|
|
||||||
"Contribution" shall mean any work of authorship, including
|
|
||||||
the original version of the Work and any modifications or additions
|
|
||||||
to that Work or Derivative Works thereof, that is intentionally
|
|
||||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
|
||||||
or by an individual or Legal Entity authorized to submit on behalf of
|
|
||||||
the copyright owner. For the purposes of this definition, "submitted"
|
|
||||||
means any form of electronic, verbal, or written communication sent
|
|
||||||
to the Licensor or its representatives, including but not limited to
|
|
||||||
communication on electronic mailing lists, source code control systems,
|
|
||||||
and issue tracking systems that are managed by, or on behalf of, the
|
|
||||||
Licensor for the purpose of discussing and improving the Work, but
|
|
||||||
excluding communication that is conspicuously marked or otherwise
|
|
||||||
designated in writing by the copyright owner as "Not a Contribution."
|
|
||||||
|
|
||||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
|
||||||
on behalf of whom a Contribution has been received by Licensor and
|
|
||||||
subsequently incorporated within the Work.
|
|
||||||
|
|
||||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
|
||||||
this License, each Contributor hereby grants to You a perpetual,
|
|
||||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
|
||||||
copyright license to reproduce, prepare Derivative Works of,
|
|
||||||
publicly display, publicly perform, sublicense, and distribute the
|
|
||||||
Work and such Derivative Works in Source or Object form.
|
|
||||||
|
|
||||||
3. Grant of Patent License. Subject to the terms and conditions of
|
|
||||||
this License, each Contributor hereby grants to You a perpetual,
|
|
||||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
|
||||||
(except as stated in this section) patent license to make, have made,
|
|
||||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
|
||||||
where such license applies only to those patent claims licensable
|
|
||||||
by such Contributor that are necessarily infringed by their
|
|
||||||
Contribution(s) alone or by combination of their Contribution(s)
|
|
||||||
with the Work to which such Contribution(s) was submitted. If You
|
|
||||||
institute patent litigation against any entity (including a
|
|
||||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
|
||||||
or a Contribution incorporated within the Work constitutes direct
|
|
||||||
or contributory patent infringement, then any patent licenses
|
|
||||||
granted to You under this License for that Work shall terminate
|
|
||||||
as of the date such litigation is filed.
|
|
||||||
|
|
||||||
4. Redistribution. You may reproduce and distribute copies of the
|
|
||||||
Work or Derivative Works thereof in any medium, with or without
|
|
||||||
modifications, and in Source or Object form, provided that You
|
|
||||||
meet the following conditions:
|
|
||||||
|
|
||||||
(a) You must give any other recipients of the Work or
|
|
||||||
Derivative Works a copy of this License; and
|
|
||||||
|
|
||||||
(b) You must cause any modified files to carry prominent notices
|
|
||||||
stating that You changed the files; and
|
|
||||||
|
|
||||||
(c) You must retain, in the Source form of any Derivative Works
|
|
||||||
that You distribute, all copyright, patent, trademark, and
|
|
||||||
attribution notices from the Source form of the Work,
|
|
||||||
excluding those notices that do not pertain to any part of
|
|
||||||
the Derivative Works; and
|
|
||||||
|
|
||||||
(d) If the Work includes a "NOTICE" text file as part of its
|
|
||||||
distribution, then any Derivative Works that You distribute must
|
|
||||||
include a readable copy of the attribution notices contained
|
|
||||||
within such NOTICE file, excluding those notices that do not
|
|
||||||
pertain to any part of the Derivative Works, in at least one
|
|
||||||
of the following places: within a NOTICE text file distributed
|
|
||||||
as part of the Derivative Works; within the Source form or
|
|
||||||
documentation, if provided along with the Derivative Works; or,
|
|
||||||
within a display generated by the Derivative Works, if and
|
|
||||||
wherever such third-party notices normally appear. The contents
|
|
||||||
of the NOTICE file are for informational purposes only and
|
|
||||||
do not modify the License. You may add Your own attribution
|
|
||||||
notices within Derivative Works that You distribute, alongside
|
|
||||||
or as an addendum to the NOTICE text from the Work, provided
|
|
||||||
that such additional attribution notices cannot be construed
|
|
||||||
as modifying the License.
|
|
||||||
|
|
||||||
You may add Your own copyright statement to Your modifications and
|
|
||||||
may provide additional or different license terms and conditions
|
|
||||||
for use, reproduction, or distribution of Your modifications, or
|
|
||||||
for any such Derivative Works as a whole, provided Your use,
|
|
||||||
reproduction, and distribution of the Work otherwise complies with
|
|
||||||
the conditions stated in this License.
|
|
||||||
|
|
||||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
|
||||||
any Contribution intentionally submitted for inclusion in the Work
|
|
||||||
by You to the Licensor shall be under the terms and conditions of
|
|
||||||
this License, without any additional terms or conditions.
|
|
||||||
Notwithstanding the above, nothing herein shall supersede or modify
|
|
||||||
the terms of any separate license agreement you may have executed
|
|
||||||
with Licensor regarding such Contributions.
|
|
||||||
|
|
||||||
6. Trademarks. This License does not grant permission to use the trade
|
|
||||||
names, trademarks, service marks, or product names of the Licensor,
|
|
||||||
except as required for reasonable and customary use in describing the
|
|
||||||
origin of the Work and reproducing the content of the NOTICE file.
|
|
||||||
|
|
||||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
|
||||||
agreed to in writing, Licensor provides the Work (and each
|
|
||||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
|
||||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
|
||||||
implied, including, without limitation, any warranties or conditions
|
|
||||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
|
||||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
|
||||||
appropriateness of using or redistributing the Work and assume any
|
|
||||||
risks associated with Your exercise of permissions under this License.
|
|
||||||
|
|
||||||
8. Limitation of Liability. In no event and under no legal theory,
|
|
||||||
whether in tort (including negligence), contract, or otherwise,
|
|
||||||
unless required by applicable law (such as deliberate and grossly
|
|
||||||
negligent acts) or agreed to in writing, shall any Contributor be
|
|
||||||
liable to You for damages, including any direct, indirect, special,
|
|
||||||
incidental, or consequential damages of any character arising as a
|
|
||||||
result of this License or out of the use or inability to use the
|
|
||||||
Work (including but not limited to damages for loss of goodwill,
|
|
||||||
work stoppage, computer failure or malfunction, or any and all
|
|
||||||
other commercial damages or losses), even if such Contributor
|
|
||||||
has been advised of the possibility of such damages.
|
|
||||||
|
|
||||||
9. Accepting Warranty or Additional Liability. While redistributing
|
|
||||||
the Work or Derivative Works thereof, You may choose to offer,
|
|
||||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
|
||||||
or other liability obligations and/or rights consistent with this
|
|
||||||
License. However, in accepting such obligations, You may act only
|
|
||||||
on Your own behalf and on Your sole responsibility, not on behalf
|
|
||||||
of any other Contributor, and only if You agree to indemnify,
|
|
||||||
defend, and hold each Contributor harmless for any liability
|
|
||||||
incurred by, or claims asserted against, such Contributor by reason
|
|
||||||
of your accepting any such warranty or additional liability.
|
|
||||||
|
|
||||||
END OF TERMS AND CONDITIONS
|
|
||||||
|
|
||||||
APPENDIX: How to apply the Apache License to your work.
|
|
||||||
|
|
||||||
To apply the Apache License to your work, attach the following
|
|
||||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
|
||||||
replaced with your own identifying information. (Don't include
|
|
||||||
the brackets!) The text should be enclosed in the appropriate
|
|
||||||
comment syntax for the file format. We also recommend that a
|
|
||||||
file or class name and description of purpose be included on the
|
|
||||||
same "printed page" as the copyright notice for easier
|
|
||||||
identification within third-party archives.
|
|
||||||
|
|
||||||
Copyright 2018 Robert Bor
|
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
you may not use this file except in compliance with the License.
|
|
||||||
You may obtain a copy of the License at
|
|
||||||
|
|
||||||
http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
|
|
||||||
Unless required by applicable law or agreed to in writing, software
|
|
||||||
distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
See the License for the specific language governing permissions and
|
|
||||||
limitations under the License.
|
|
||||||
340
README.md
340
README.md
@ -1,236 +1,194 @@
|
|||||||
Aho-Corasick
|
Aho-Corasick
|
||||||
============
|
============
|
||||||
|
|
||||||
[](https://travis-ci.org/robert-bor/aho-corasick)
|
|
||||||
[](https://codecov.io/gh/robert-bor/aho-corasick)
|
|
||||||
[](https://maven-badges.herokuapp.com/maven-central/org.ahocorasick/ahocorasick)
|
|
||||||
[](https://javadoc.io/doc/org.ahocorasick/ahocorasick)
|
|
||||||
[](http://www.apache.org/licenses/LICENSE-2.0)
|
|
||||||
|
|
||||||
Dependency
|
Dependency
|
||||||
----------
|
----------
|
||||||
|
|
||||||
Include this dependency in your POM. Be sure to check for the latest version in Maven Central.
|
Include this dependency in your POM. Be sure to check for the latest version in Maven Central.
|
||||||
|
|
||||||
```xml
|
```xml
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.ahocorasick</groupId>
|
<groupId>org.ahocorasick</groupId>
|
||||||
<artifactId>ahocorasick</artifactId>
|
<artifactId>ahocorasick</artifactId>
|
||||||
<version>0.6.3</version>
|
<version>0.3.0</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
```
|
```
|
||||||
|
|
||||||
Introduction
|
Introduction
|
||||||
------------
|
------------
|
||||||
|
Nowadays most free-text searching is based on Lucene-like approaches, where the search text is parsed into its
|
||||||
|
various components. For every keyword a lookup is done to see where it occurs. When looking for a couple of keywords
|
||||||
|
this approach is great. But what about it if you are not looking for just a couple of keywords, but a 100,000 of
|
||||||
|
them? Like, for example, checking against a dictionary?
|
||||||
|
|
||||||
Most free-text searching is based on Lucene-like approaches, where the
|
This is where the Aho-Corasick algorithm shines. Instead of chopping up the search text, it uses all the keywords
|
||||||
search text is parsed into its various components. For every keyword a
|
to build up a construct called a [Trie](http://en.wikipedia.org/wiki/Trie). There are three crucial components
|
||||||
lookup is done to see where it occurs. When looking for a couple of keywords
|
to Aho-Corasick:
|
||||||
this approach is great, but when searching for 100,000 words, the approach
|
|
||||||
is quite slow (for example, checking against a dictionary).
|
|
||||||
|
|
||||||
The Aho-Corasick algorithm shines when looking for multiple words.
|
|
||||||
Rather than chop up the search text, it uses all the keywords to build
|
|
||||||
a [Trie](http://en.wikipedia.org/wiki/Trie) construct. The crucial
|
|
||||||
Aho-Corasick components include:
|
|
||||||
|
|
||||||
* goto
|
* goto
|
||||||
* fail
|
* fail
|
||||||
* output
|
* output
|
||||||
|
|
||||||
Every character encountered is presented to a state object within the
|
Every character encountered is presented to a state object within the *goto* structure. If there is a matching state,
|
||||||
*goto* structure. If there is a matching state, that will be elevated to
|
that will be elevated to the new current state.
|
||||||
the new current state.
|
|
||||||
|
|
||||||
However, if there is no matching state, the algorithm will signal a
|
However, if there is no matching state, the algorithm will signal a *fail* and fall back to states with less depth
|
||||||
*fail* and fall back to states with less depth (i.e., a match less long)
|
(ie, a match less long) and proceed from there, until it found a matching state, or it has reached the root state.
|
||||||
and proceed from there, until it found a matching state, or it has reached
|
|
||||||
the root state.
|
|
||||||
|
|
||||||
Whenever a state is reached that matches an entire keyword, it is
|
Whenever a state is reached that matches an entire keyword, it is emitted to an *output* set which can be read after
|
||||||
emitted to an *output* set which can be read after the entire scan
|
the entire scan has completed.
|
||||||
has completed.
|
|
||||||
|
|
||||||
The algorithm is O(n). No matter how many keywords are given, or how large
|
The beauty of the algorithm is that it is O(n). No matter how many keywords you have, or how big the search text is,
|
||||||
the search text is, the performance will decline linearly.
|
the performance will decline in a linear way.
|
||||||
|
|
||||||
The Aho-Corasick algorithm can help:
|
Some examples you could use the Aho-Corasick algorithm for:
|
||||||
|
* looking for certain words in texts in order to URL link or emphasize them
|
||||||
|
* adding semantics to plain text
|
||||||
|
* checking against a dictionary to see if syntactic errors were made
|
||||||
|
|
||||||
* find words in texts to link or emphasize them;
|
This library is the Java implementation of the afore-mentioned Aho-Corasick algorithm for efficient string matching.
|
||||||
* add semantics to plain text; or
|
The algorithm is explained in great detail in the white paper written by
|
||||||
* check against a dictionary to see if syntactic errors were made.
|
Aho and Corasick: http://cr.yp.to/bib/1975/aho.pdf
|
||||||
|
|
||||||
See the [white paper](http://cr.yp.to/bib/1975/aho.pdf) by Aho and
|
|
||||||
Corasick for algorithmic details.
|
|
||||||
|
|
||||||
Usage
|
Usage
|
||||||
-----
|
-----
|
||||||
Set up the Trie using a builder as follows:
|
Setting up the Trie is a piece of cake:
|
||||||
|
|
||||||
```java
|
```java
|
||||||
Trie trie = Trie.builder()
|
Trie trie = Trie.builder()
|
||||||
.addKeyword("hers")
|
|
||||||
.addKeyword("his")
|
|
||||||
.addKeyword("she")
|
|
||||||
.addKeyword("he")
|
|
||||||
.build();
|
|
||||||
Collection<Emit> emits = trie.parseText("ushers");
|
|
||||||
```
|
|
||||||
|
|
||||||
The collection will contain `Emit` objects that match:
|
|
||||||
|
|
||||||
* "she" starting at position 1, ending at position 3
|
|
||||||
* "he" starting at position 2, ending at position 3
|
|
||||||
* "hers" starting at position 2, ending at position 5
|
|
||||||
|
|
||||||
In situations where overlapping instances are not desired, retain
|
|
||||||
the longest and left-most matches by calling `ignoreOverlaps()`:
|
|
||||||
|
|
||||||
```java
|
|
||||||
Trie trie = Trie.builder()
|
|
||||||
.ignoreOverlaps()
|
|
||||||
.addKeyword("hot")
|
|
||||||
.addKeyword("hot chocolate")
|
|
||||||
.build();
|
|
||||||
Collection<Emit> emits = trie.parseText("hot chocolate");
|
|
||||||
```
|
|
||||||
|
|
||||||
The `ignoreOverlaps()` method tells the Trie to remove all overlapping
|
|
||||||
matches. For this it relies on the following conflict resolution rules:
|
|
||||||
|
|
||||||
1. longer matches prevail over shorter matches; and
|
|
||||||
1. left-most prevails over right-most.
|
|
||||||
|
|
||||||
Only one result is returned:
|
|
||||||
|
|
||||||
* "hot chocolate" starting at position 0, ending at position 12
|
|
||||||
|
|
||||||
To check for whole words exclusively, call `onlyWholeWords()` as follows:
|
|
||||||
|
|
||||||
```java
|
|
||||||
Trie trie = Trie.builder()
|
|
||||||
.onlyWholeWords()
|
|
||||||
.addKeyword("sugar")
|
|
||||||
.build();
|
|
||||||
Collection<Emit> emits = trie.parseText("sugarcane sugar canesugar");
|
|
||||||
```
|
|
||||||
|
|
||||||
Only one match is found; whereas, without calling `onlyWholeWords()` three
|
|
||||||
matches are found. The sugarcane/canesugar words are discarded because
|
|
||||||
they are partial matches.
|
|
||||||
|
|
||||||
Some text is `WrItTeN` in mixed case, which makes it hard to identify.
|
|
||||||
Instruct the Trie to convert the searchtext to lowercase to ease the
|
|
||||||
matching process. The lower-casing applies to keywords as well.
|
|
||||||
|
|
||||||
```java
|
|
||||||
Trie trie = Trie.builder()
|
|
||||||
.ignoreCase()
|
|
||||||
.addKeyword("casing")
|
|
||||||
.build();
|
|
||||||
Collection<Emit> emits = trie.parseText("CaSiNg");
|
|
||||||
```
|
|
||||||
|
|
||||||
Normally, this match would not be found. By calling `ignoreCase()`,
|
|
||||||
the entire search text is made lowercase before matching begins.
|
|
||||||
Therefore it will find exactly one match.
|
|
||||||
|
|
||||||
It is also possible to just ask whether the text matches any of
|
|
||||||
the keywords, or just to return the first match it finds.
|
|
||||||
|
|
||||||
```java
|
|
||||||
Trie trie = Trie.builder().ignoreOverlaps()
|
|
||||||
.addKeyword("ab")
|
|
||||||
.addKeyword("cba")
|
|
||||||
.addKeyword("ababc")
|
|
||||||
.build();
|
|
||||||
Emit firstMatch = trie.firstMatch("ababcbab");
|
|
||||||
```
|
|
||||||
|
|
||||||
The value for `firstMatch` will be "ababc" from position 0. The
|
|
||||||
`containsMatch()` method checks whether `firstMatch` found a match and
|
|
||||||
returns `true` if that is the case.
|
|
||||||
|
|
||||||
For a barebones Aho-Corasick algorithm with a custom emit handler use:
|
|
||||||
|
|
||||||
```java
|
|
||||||
Trie trie = Trie.builder()
|
|
||||||
.addKeyword("hers")
|
.addKeyword("hers")
|
||||||
.addKeyword("his")
|
.addKeyword("his")
|
||||||
.addKeyword("she")
|
.addKeyword("she")
|
||||||
.addKeyword("he")
|
.addKeyword("he")
|
||||||
.build();
|
.build();
|
||||||
|
Collection<Emit> emits = trie.parseText("ushers");
|
||||||
final List<Emit> emits = new ArrayList<>();
|
|
||||||
EmitHandler emitHandler = new EmitHandler() {
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void emit(Emit emit) {
|
|
||||||
emits.add(emit);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
```
|
```
|
||||||
|
|
||||||
In many cases you may want to do perform tasks with both the non-matching
|
You can now read the set. In this case it will find the following:
|
||||||
and the matching text. Such implementations may be better served by using
|
* "she" starting at position 1, ending at position 3
|
||||||
`Trie.tokenize()`. The `tokenize()` method allows looping over the
|
* "he" starting at position 2, ending at position 3
|
||||||
corpus to deal with matches as soon as they are encountered. Here's an
|
* "hers" starting at position 2, ending at position 5
|
||||||
example that outputs key words as italicized HTML elements:
|
|
||||||
|
In normal situations you probably want to remove overlapping instances, retaining the longest and left-most
|
||||||
|
matches.
|
||||||
|
|
||||||
```java
|
```java
|
||||||
String speech = "The Answer to the Great Question... Of Life, " +
|
Trie trie = Trie.builder()
|
||||||
"the Universe and Everything... Is... Forty-two,' said " +
|
.removeOverlaps()
|
||||||
"Deep Thought, with infinite majesty and calm.";
|
.addKeyword("hot")
|
||||||
|
.addKeyword("hot chocolate")
|
||||||
Trie trie = Trie.builder().ignoreOverlaps().onlyWholeWords().ignoreCase()
|
.build();
|
||||||
.addKeyword("great question")
|
Collection<Emit> emits = trie.parseText("hot chocolate");
|
||||||
.addKeyword("forty-two")
|
|
||||||
.addKeyword("deep thought")
|
|
||||||
.build();
|
|
||||||
|
|
||||||
Collection<Token> tokens = trie.tokenize(speech);
|
|
||||||
StringBuilder html = new StringBuilder();
|
|
||||||
html.append("<html><body><p>");
|
|
||||||
|
|
||||||
for (Token token : tokens) {
|
|
||||||
if (token.isMatch()) {
|
|
||||||
html.append("<i>");
|
|
||||||
}
|
|
||||||
html.append(token.getFragment());
|
|
||||||
if (token.isMatch()) {
|
|
||||||
html.append("</i>");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
html.append("</p></body></html>");
|
|
||||||
System.out.println(html);
|
|
||||||
```
|
```
|
||||||
|
|
||||||
You can also emit custom outputs. This might for example be useful to
|
The removeOverlaps method tells the Trie to remove all overlapping matches. For this it relies on the following
|
||||||
implement a trivial named entity recognizer. In this case use a
|
conflict resolution rules: 1) longer matches prevail over shorter matches, 2) left-most prevails over right-most.
|
||||||
`PayloadTrie` instead of a `Trie` as follows:
|
There is only one result now:
|
||||||
|
* "hot chocolate" starting at position 0, ending at position 12
|
||||||
|
|
||||||
|
If you want the algorithm to only check for whole words, you can tell the Trie to do so:
|
||||||
|
|
||||||
```java
|
```java
|
||||||
class Word {
|
Trie trie = Trie.builder()
|
||||||
private final String gender;
|
.onlyWholeWords()
|
||||||
public Word(String gender) {
|
.addKeyword("sugar")
|
||||||
this.gender = gender;
|
.build();
|
||||||
}
|
Collection<Emit> emits = trie.parseText("sugarcane sugarcane sugar canesugar");
|
||||||
}
|
```
|
||||||
|
|
||||||
PayloadTrie<Word> trie = PayloadTrie.<Word>builder()
|
In this case, it will only find one match, whereas it would normally find four. The sugarcane/canesugar words
|
||||||
.addKeyword("hers", new Word("f"))
|
are discarded because they are partial matches.
|
||||||
.addKeyword("his", new Word("m"))
|
|
||||||
.addKeyword("she", new Word("f"))
|
Some text is WrItTeN in a combination of lowercase and uppercase and therefore hard to identify. You can instruct
|
||||||
.addKeyword("he", new Word("m"))
|
the Trie to lowercase the entire searchtext to ease the matching process. The lower-casing extends to keywords as well.
|
||||||
.addKeyword("nonbinary", new Word("nb"))
|
|
||||||
.addKeyword("transgender", new Word("tg"))
|
```java
|
||||||
.build();
|
Trie trie = Trie.builder()
|
||||||
Collection<PayloadEmit<Word>> emits = trie.parseText("ushers");
|
.caseInsensitive()
|
||||||
|
.addKeyword("casing")
|
||||||
|
.build();
|
||||||
|
Collection<Emit> emits = trie.parseText("CaSiNg");
|
||||||
|
```
|
||||||
|
|
||||||
|
Normally, this match would not be found. With the caseInsensitive settings the entire search text is lowercased
|
||||||
|
before the matching begins. Therefore it will find exactly one match. Since you still have control of the original
|
||||||
|
search text and you will know exactly where the match was, you can still utilize the original casing.
|
||||||
|
|
||||||
|
It is also possible to just ask whether the text matches any of the keywords, or just to return the first match it
|
||||||
|
finds.
|
||||||
|
|
||||||
|
```java
|
||||||
|
Trie trie = Trie.builder().removeOverlaps()
|
||||||
|
.addKeyword("ab")
|
||||||
|
.addKeyword("cba")
|
||||||
|
.addKeyword("ababc")
|
||||||
|
.build();
|
||||||
|
Emit firstMatch = trie.firstMatch("ababcbab");
|
||||||
|
```
|
||||||
|
|
||||||
|
The firstMatch will now be "ababc" found at position 0. containsMatch just checks if there is a firstMatch and
|
||||||
|
returns true if that is the case.
|
||||||
|
|
||||||
|
If you just want the barebones Aho-Corasick algorithm (ie, no dealing with case insensitivity, overlaps and whole
|
||||||
|
words) and you prefer to add your own handler to the mix, that is also possible.
|
||||||
|
|
||||||
|
```java
|
||||||
|
Trie trie = Trie.builder()
|
||||||
|
.addKeyword("hers")
|
||||||
|
.addKeyword("his")
|
||||||
|
.addKeyword("she")
|
||||||
|
.addKeyword("he")
|
||||||
|
.build();
|
||||||
|
|
||||||
|
final List<Emit> emits = new ArrayList<>();
|
||||||
|
EmitHandler emitHandler = new EmitHandler() {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void emit(Emit emit) {
|
||||||
|
emits.add(emit);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
In many cases you may want to do useful stuff with both the non-matching and the matching text. In this case, you
|
||||||
|
might be better served by using the Trie.tokenize(). It allows you to loop over the entire text and deal with
|
||||||
|
matches as soon as you encounter them. Let's look at an example where we want to highlight words from HGttG in HTML:
|
||||||
|
|
||||||
|
```java
|
||||||
|
String speech = "The Answer to the Great Question... Of Life, " +
|
||||||
|
"the Universe and Everything... Is... Forty-two,' said " +
|
||||||
|
"Deep Thought, with infinite majesty and calm.";
|
||||||
|
Trie trie = Trie.builder().removeOverlaps().onlyWholeWords().caseInsensitive()
|
||||||
|
.addKeyword("great question")
|
||||||
|
.addKeyword("forty-two")
|
||||||
|
.addKeyword("deep thought")
|
||||||
|
.build();
|
||||||
|
Collection<Token> tokens = trie.tokenize(speech);
|
||||||
|
StringBuffer html = new StringBuffer();
|
||||||
|
html.append("<html><body><p>");
|
||||||
|
for (Token token : tokens) {
|
||||||
|
if (token.isMatch()) {
|
||||||
|
html.append("<i>");
|
||||||
|
}
|
||||||
|
html.append(token.getFragment());
|
||||||
|
if (token.isMatch()) {
|
||||||
|
html.append("</i>");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
html.append("</p></body></html>");
|
||||||
|
System.out.println(html);
|
||||||
```
|
```
|
||||||
|
|
||||||
Releases
|
Releases
|
||||||
--------
|
--------
|
||||||
|
Information on the aho-corasick [releases](https://github.com/robert-bor/aho-corasick/releases).
|
||||||
|
|
||||||
See [releases](https://github.com/robert-bor/aho-corasick/releases) for details.
|
License
|
||||||
|
-------
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
|||||||
@ -1,72 +0,0 @@
|
|||||||
plugins {
|
|
||||||
`java-library`
|
|
||||||
`maven-publish`
|
|
||||||
pmd
|
|
||||||
checkstyle
|
|
||||||
id("io.freefair.lombok") version "8.4"
|
|
||||||
}
|
|
||||||
|
|
||||||
repositories {
|
|
||||||
mavenLocal()
|
|
||||||
maven {
|
|
||||||
url = uri("https://nexus.knecon.com/repository/gindev/")
|
|
||||||
credentials {
|
|
||||||
username = providers.gradleProperty("mavenUser").getOrNull();
|
|
||||||
password = providers.gradleProperty("mavenPassword").getOrNull();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
maven {
|
|
||||||
url = uri("https://repo.maven.apache.org/maven2/")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
dependencies {
|
|
||||||
testImplementation("junit:junit:4.13.2")
|
|
||||||
}
|
|
||||||
|
|
||||||
group = "org.ahocorasick"
|
|
||||||
description = "Aho-CoraSick algorithm for efficient string matching"
|
|
||||||
java.sourceCompatibility = JavaVersion.VERSION_17
|
|
||||||
java.targetCompatibility = JavaVersion.VERSION_17
|
|
||||||
|
|
||||||
java {
|
|
||||||
withSourcesJar()
|
|
||||||
withJavadocJar()
|
|
||||||
}
|
|
||||||
|
|
||||||
publishing {
|
|
||||||
publications.create<MavenPublication>("maven") {
|
|
||||||
from(components["java"])
|
|
||||||
}
|
|
||||||
repositories {
|
|
||||||
maven {
|
|
||||||
url = uri("https://nexus.knecon.com/repository/red-platform-releases/")
|
|
||||||
credentials {
|
|
||||||
username = providers.gradleProperty("mavenUser").getOrNull();
|
|
||||||
password = providers.gradleProperty("mavenPassword").getOrNull();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
tasks.withType<JavaCompile>() {
|
|
||||||
options.encoding = "UTF-8"
|
|
||||||
}
|
|
||||||
|
|
||||||
tasks.withType<Javadoc>() {
|
|
||||||
options.encoding = "UTF-8"
|
|
||||||
}
|
|
||||||
|
|
||||||
pmd {
|
|
||||||
isConsoleOutput = true
|
|
||||||
}
|
|
||||||
|
|
||||||
tasks.pmdMain {
|
|
||||||
pmd.ruleSetFiles = files("${rootDir}/config/pmd/pmd.xml")
|
|
||||||
}
|
|
||||||
|
|
||||||
tasks.pmdTest {
|
|
||||||
pmd.ruleSetFiles = files("${rootDir}/config/pmd/test_pmd.xml")
|
|
||||||
}
|
|
||||||
|
|
||||||
@ -1,38 +0,0 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<!DOCTYPE module PUBLIC "-//Puppy Crawl//DTD Check Configuration 1.3//EN"
|
|
||||||
"http://www.puppycrawl.com/dtds/configuration_1_3.dtd">
|
|
||||||
<module name="Checker">
|
|
||||||
<property
|
|
||||||
name="severity"
|
|
||||||
value="error"/>
|
|
||||||
<module name="TreeWalker">
|
|
||||||
<module name="SuppressWarningsHolder"/>
|
|
||||||
<module name="MissingDeprecated"/>
|
|
||||||
<module name="MissingOverride"/>
|
|
||||||
<module name="AnnotationLocation"/>
|
|
||||||
<module name="NonEmptyAtclauseDescription"/>
|
|
||||||
<module name="IllegalImport"/>
|
|
||||||
<module name="RedundantImport"/>
|
|
||||||
<module name="RedundantModifier"/>
|
|
||||||
<module name="EmptyBlock"/>
|
|
||||||
<module name="DefaultComesLast"/>
|
|
||||||
<module name="EmptyStatement"/>
|
|
||||||
<module name="EqualsHashCode"/>
|
|
||||||
<module name="ExplicitInitialization"/>
|
|
||||||
<module name="IllegalInstantiation"/>
|
|
||||||
<module name="ModifiedControlVariable"/>
|
|
||||||
<module name="MultipleVariableDeclarations"/>
|
|
||||||
<module name="PackageDeclaration"/>
|
|
||||||
<module name="ParameterAssignment"/>
|
|
||||||
<module name="SimplifyBooleanExpression"/>
|
|
||||||
<module name="SimplifyBooleanReturn"/>
|
|
||||||
<module name="StringLiteralEquality"/>
|
|
||||||
<module name="OneStatementPerLine"/>
|
|
||||||
<module name="FinalClass"/>
|
|
||||||
<module name="ArrayTypeStyle"/>
|
|
||||||
<module name="UpperEll"/>
|
|
||||||
<module name="OuterTypeFilename"/>
|
|
||||||
</module>
|
|
||||||
<module name="FileTabCharacter"/>
|
|
||||||
<module name="SuppressWarningsFilter"/>
|
|
||||||
</module>
|
|
||||||
@ -1,21 +0,0 @@
|
|||||||
<?xml version="1.0"?>
|
|
||||||
<ruleset name="Custom ruleset"
|
|
||||||
xmlns="http://pmd.sourceforge.net/ruleset/2.0.0"
|
|
||||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
|
||||||
xsi:schemaLocation="http://pmd.sourceforge.net/ruleset/2.0.0 http://pmd.sourceforge.net/ruleset_2_0_0.xsd">
|
|
||||||
|
|
||||||
<description>
|
|
||||||
Knecon ruleset checks the code for bad stuff
|
|
||||||
</description>
|
|
||||||
|
|
||||||
<rule ref="category/java/errorprone.xml">
|
|
||||||
<exclude name="MissingSerialVersionUID"/>
|
|
||||||
<exclude name="AvoidLiteralsInIfCondition"/>
|
|
||||||
<exclude name="DataflowAnomalyAnalysis"/>
|
|
||||||
<exclude name="AvoidDuplicateLiterals"/>
|
|
||||||
<exclude name="NullAssignment"/>
|
|
||||||
<exclude name="AssignmentInOperand"/>
|
|
||||||
<exclude name="BeanMembersShouldSerialize"/>
|
|
||||||
</rule>
|
|
||||||
|
|
||||||
</ruleset>
|
|
||||||
@ -1,11 +0,0 @@
|
|||||||
<?xml version="1.0"?>
|
|
||||||
<ruleset name="Custom ruleset"
|
|
||||||
xmlns="http://pmd.sourceforge.net/ruleset/2.0.0"
|
|
||||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
|
||||||
xsi:schemaLocation="http://pmd.sourceforge.net/ruleset/2.0.0 http://pmd.sourceforge.net/ruleset_2_0_0.xsd">
|
|
||||||
|
|
||||||
<description>
|
|
||||||
Knecon test ruleset checks the code for bad stuff
|
|
||||||
</description>
|
|
||||||
|
|
||||||
</ruleset>
|
|
||||||
@ -1 +0,0 @@
|
|||||||
version = 0.7-SNAPSHOT
|
|
||||||
107
pom.xml
Normal file
107
pom.xml
Normal file
@ -0,0 +1,107 @@
|
|||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
<groupId>org.ahocorasick</groupId>
|
||||||
|
<artifactId>ahocorasick</artifactId>
|
||||||
|
<version>0.3.1-SNAPSHOT</version>
|
||||||
|
<packaging>jar</packaging>
|
||||||
|
<name>Aho-CoraSick algorithm for efficient string matching</name>
|
||||||
|
<description>Java library for efficient string matching against a large set of keywords</description>
|
||||||
|
<inceptionYear>2014</inceptionYear>
|
||||||
|
<url>http://ahocorasick.org</url>
|
||||||
|
|
||||||
|
<parent>
|
||||||
|
<groupId>org.sonatype.oss</groupId>
|
||||||
|
<artifactId>oss-parent</artifactId>
|
||||||
|
<version>7</version>
|
||||||
|
</parent>
|
||||||
|
|
||||||
|
<organization>
|
||||||
|
<name>42 BV</name>
|
||||||
|
<url>http://blog.42.nl/</url>
|
||||||
|
</organization>
|
||||||
|
|
||||||
|
<licenses>
|
||||||
|
<license>
|
||||||
|
<name>The Apache Software License, Version 2.0</name>
|
||||||
|
<url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
|
||||||
|
<distribution>repo</distribution>
|
||||||
|
</license>
|
||||||
|
</licenses>
|
||||||
|
|
||||||
|
<scm>
|
||||||
|
<url>scm:git://github.com/robert-bor/aho-corasick</url>
|
||||||
|
<connection>scm:git://github.com/robert-bor/aho-corasick</connection>
|
||||||
|
</scm>
|
||||||
|
|
||||||
|
<developers>
|
||||||
|
<developer>
|
||||||
|
<name>Robert Bor</name>
|
||||||
|
<organization>42</organization>
|
||||||
|
</developer>
|
||||||
|
</developers>
|
||||||
|
|
||||||
|
<properties>
|
||||||
|
<junit.version>4.10</junit.version>
|
||||||
|
<!-- Reporting -->
|
||||||
|
<maven.cobertura.version>2.5.2</maven.cobertura.version>
|
||||||
|
<maven.javadoc.version>2.8</maven.javadoc.version>
|
||||||
|
<maven.project.version>2.4</maven.project.version>
|
||||||
|
<maven.site.plugin.version>3.3</maven.site.plugin.version>
|
||||||
|
</properties>
|
||||||
|
|
||||||
|
<dependencies>
|
||||||
|
|
||||||
|
<!-- Used for unit testing -->
|
||||||
|
<dependency>
|
||||||
|
<groupId>junit</groupId>
|
||||||
|
<artifactId>junit-dep</artifactId>
|
||||||
|
<version>${junit.version}</version>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
</dependencies>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
|
||||||
|
<defaultGoal>install</defaultGoal>
|
||||||
|
|
||||||
|
<plugins>
|
||||||
|
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-jar-plugin</artifactId>
|
||||||
|
<version>2.4</version>
|
||||||
|
</plugin>
|
||||||
|
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-compiler-plugin</artifactId>
|
||||||
|
<configuration>
|
||||||
|
<source>1.7</source>
|
||||||
|
<target>1.7</target>
|
||||||
|
</configuration>
|
||||||
|
</plugin>
|
||||||
|
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
|
||||||
|
<reporting>
|
||||||
|
<plugins>
|
||||||
|
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.codehaus.mojo</groupId>
|
||||||
|
<artifactId>cobertura-maven-plugin</artifactId>
|
||||||
|
<version>${maven.cobertura.version}</version>
|
||||||
|
</plugin>
|
||||||
|
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.maven.plugins</groupId>
|
||||||
|
<artifactId>maven-javadoc-plugin</artifactId>
|
||||||
|
<version>${maven.javadoc.version}</version>
|
||||||
|
</plugin>
|
||||||
|
|
||||||
|
</plugins>
|
||||||
|
</reporting>
|
||||||
|
|
||||||
|
</project>
|
||||||
@ -1 +0,0 @@
|
|||||||
rootProject.name = "ahocorasick"
|
|
||||||
@ -1,125 +1,63 @@
|
|||||||
package org.ahocorasick.interval;
|
package org.ahocorasick.interval;
|
||||||
|
|
||||||
import org.ahocorasick.trie.Emit;
|
|
||||||
import org.ahocorasick.trie.PayloadEmit;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Responsible for tracking the start and end bounds, which are reused by
|
|
||||||
* both {@link Emit} and {@link PayloadEmit}.
|
|
||||||
*/
|
|
||||||
public class Interval implements Intervalable {
|
public class Interval implements Intervalable {
|
||||||
|
|
||||||
private final int start;
|
private int start;
|
||||||
private final int end;
|
private int end;
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Constructs an interval with a start and end position.
|
|
||||||
*
|
|
||||||
* @param start The interval's starting text position.
|
|
||||||
* @param end The interval's ending text position.
|
|
||||||
*/
|
|
||||||
public Interval(final int start, final int end) {
|
public Interval(final int start, final int end) {
|
||||||
|
|
||||||
this.start = start;
|
this.start = start;
|
||||||
this.end = end;
|
this.end = end;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the starting offset into the text for this interval.
|
|
||||||
*
|
|
||||||
* @return A number between 0 (start of text) and the text length.
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public int getStart() {
|
public int getStart() {
|
||||||
|
|
||||||
return this.start;
|
return this.start;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the ending offset into the text for this interval.
|
|
||||||
*
|
|
||||||
* @return A number between getStart() + 1 and the text length.
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public int getEnd() {
|
public int getEnd() {
|
||||||
|
|
||||||
return this.end;
|
return this.end;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the length of the interval.
|
|
||||||
*
|
|
||||||
* @return The end position less the start position, plus one.
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public int size() {
|
public int size() {
|
||||||
|
|
||||||
return end - start + 1;
|
return end - start + 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public boolean overlapsWith(Interval other) {
|
||||||
/**
|
return this.start <= other.getEnd() &&
|
||||||
* Answers whether the given interval overlaps this interval
|
this.end >= other.getStart();
|
||||||
* instance.
|
|
||||||
*
|
|
||||||
* @param other the other interval to check for overlap
|
|
||||||
* @return true The intervals overlap.
|
|
||||||
*/
|
|
||||||
public boolean overlapsWith(final Interval other) {
|
|
||||||
|
|
||||||
return this.start <= other.getEnd() && this.end >= other.getStart();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public boolean overlapsWith(int point) {
|
public boolean overlapsWith(int point) {
|
||||||
|
|
||||||
return this.start <= point && point <= this.end;
|
return this.start <= point && point <= this.end;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean equals(Object o) {
|
public boolean equals(Object o) {
|
||||||
|
|
||||||
if (!(o instanceof Intervalable)) {
|
if (!(o instanceof Intervalable)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
Intervalable other = (Intervalable) o;
|
Intervalable other = (Intervalable)o;
|
||||||
return this.start == other.getStart() && this.end == other.getEnd();
|
return this.start == other.getStart() &&
|
||||||
|
this.end == other.getEnd();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int hashCode() {
|
public int hashCode() {
|
||||||
|
|
||||||
return this.start % 100 + this.end % 100;
|
return this.start % 100 + this.end % 100;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int compareTo(Object o) {
|
public int compareTo(Object o) {
|
||||||
|
|
||||||
if (!(o instanceof Intervalable)) {
|
if (!(o instanceof Intervalable)) {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
Intervalable other = (Intervalable) o;
|
Intervalable other = (Intervalable)o;
|
||||||
int comparison = this.start - other.getStart();
|
int comparison = this.start - other.getStart();
|
||||||
return comparison != 0 ? comparison : this.end - other.getEnd();
|
return comparison != 0 ? comparison : this.end - other.getEnd();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the starting offset and ending offset separated
|
|
||||||
* by a full colon (:).
|
|
||||||
*
|
|
||||||
* @return A non-null String, never empty.
|
|
||||||
*/
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
|
|
||||||
return this.start + ":" + this.end;
|
return this.start + ":" + this.end;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -6,23 +6,18 @@ import java.util.List;
|
|||||||
|
|
||||||
public class IntervalNode {
|
public class IntervalNode {
|
||||||
|
|
||||||
private enum Direction {
|
private enum Direction { LEFT, RIGHT }
|
||||||
LEFT,
|
|
||||||
RIGHT
|
|
||||||
}
|
|
||||||
|
|
||||||
private IntervalNode left;
|
private IntervalNode left = null;
|
||||||
private IntervalNode right;
|
private IntervalNode right = null;
|
||||||
private int point;
|
private int point;
|
||||||
private List<Intervalable> intervals = new ArrayList<>();
|
private List<Intervalable> intervals = new ArrayList<Intervalable>();
|
||||||
|
|
||||||
|
|
||||||
public IntervalNode(final List<Intervalable> intervals) {
|
|
||||||
|
|
||||||
|
public IntervalNode(List<Intervalable> intervals) {
|
||||||
this.point = determineMedian(intervals);
|
this.point = determineMedian(intervals);
|
||||||
|
|
||||||
final List<Intervalable> toLeft = new ArrayList<>();
|
List<Intervalable> toLeft = new ArrayList<Intervalable>();
|
||||||
final List<Intervalable> toRight = new ArrayList<>();
|
List<Intervalable> toRight = new ArrayList<Intervalable>();
|
||||||
|
|
||||||
for (Intervalable interval : intervals) {
|
for (Intervalable interval : intervals) {
|
||||||
if (interval.getEnd() < this.point) {
|
if (interval.getEnd() < this.point) {
|
||||||
@ -42,9 +37,7 @@ public class IntervalNode {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public int determineMedian(List<Intervalable> intervals) {
|
||||||
private int determineMedian(final List<Intervalable> intervals) {
|
|
||||||
|
|
||||||
int start = -1;
|
int start = -1;
|
||||||
int end = -1;
|
int end = -1;
|
||||||
for (Intervalable interval : intervals) {
|
for (Intervalable interval : intervals) {
|
||||||
@ -60,21 +53,17 @@ public class IntervalNode {
|
|||||||
return (start + end) / 2;
|
return (start + end) / 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public List<Intervalable> findOverlaps(Intervalable interval) {
|
||||||
|
|
||||||
public List<Intervalable> findOverlaps(final Intervalable interval) {
|
List<Intervalable> overlaps = new ArrayList<Intervalable>();
|
||||||
|
|
||||||
final List<Intervalable> overlaps = new ArrayList<>();
|
if (this.point < interval.getStart()) { // Tends to the right
|
||||||
|
|
||||||
if (this.point < interval.getStart()) {
|
|
||||||
// Tends to the right
|
|
||||||
addToOverlaps(interval, overlaps, findOverlappingRanges(this.right, interval));
|
addToOverlaps(interval, overlaps, findOverlappingRanges(this.right, interval));
|
||||||
addToOverlaps(interval, overlaps, checkForOverlapsToTheRight(interval));
|
addToOverlaps(interval, overlaps, checkForOverlapsToTheRight(interval));
|
||||||
} else if (this.point > interval.getEnd()) {
|
} else if (this.point > interval.getEnd()) { // Tends to the left
|
||||||
// Tends to the left
|
|
||||||
addToOverlaps(interval, overlaps, findOverlappingRanges(this.left, interval));
|
addToOverlaps(interval, overlaps, findOverlappingRanges(this.left, interval));
|
||||||
addToOverlaps(interval, overlaps, checkForOverlapsToTheLeft(interval));
|
addToOverlaps(interval, overlaps, checkForOverlapsToTheLeft(interval));
|
||||||
} else {
|
} else { // Somewhere in the middle
|
||||||
// Somewhere in the middle
|
|
||||||
addToOverlaps(interval, overlaps, this.intervals);
|
addToOverlaps(interval, overlaps, this.intervals);
|
||||||
addToOverlaps(interval, overlaps, findOverlappingRanges(this.left, interval));
|
addToOverlaps(interval, overlaps, findOverlappingRanges(this.left, interval));
|
||||||
addToOverlaps(interval, overlaps, findOverlappingRanges(this.right, interval));
|
addToOverlaps(interval, overlaps, findOverlappingRanges(this.right, interval));
|
||||||
@ -83,55 +72,48 @@ public class IntervalNode {
|
|||||||
return overlaps;
|
return overlaps;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected void addToOverlaps(Intervalable interval, List<Intervalable> overlaps, List<Intervalable> newOverlaps) {
|
||||||
protected void addToOverlaps(final Intervalable interval, final List<Intervalable> overlaps, final List<Intervalable> newOverlaps) {
|
for (Intervalable currentInterval : newOverlaps) {
|
||||||
|
|
||||||
for (final Intervalable currentInterval : newOverlaps) {
|
|
||||||
if (!currentInterval.equals(interval)) {
|
if (!currentInterval.equals(interval)) {
|
||||||
overlaps.add(currentInterval);
|
overlaps.add(currentInterval);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected List<Intervalable> checkForOverlapsToTheLeft(Intervalable interval) {
|
||||||
protected List<Intervalable> checkForOverlapsToTheLeft(final Intervalable interval) {
|
|
||||||
|
|
||||||
return checkForOverlaps(interval, Direction.LEFT);
|
return checkForOverlaps(interval, Direction.LEFT);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected List<Intervalable> checkForOverlapsToTheRight(Intervalable interval) {
|
||||||
protected List<Intervalable> checkForOverlapsToTheRight(final Intervalable interval) {
|
|
||||||
|
|
||||||
return checkForOverlaps(interval, Direction.RIGHT);
|
return checkForOverlaps(interval, Direction.RIGHT);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected List<Intervalable> checkForOverlaps(Intervalable interval, Direction direction) {
|
||||||
|
|
||||||
protected List<Intervalable> checkForOverlaps(final Intervalable interval, final Direction direction) {
|
List<Intervalable> overlaps = new ArrayList<Intervalable>();
|
||||||
|
for (Intervalable currentInterval : this.intervals) {
|
||||||
final List<Intervalable> overlaps = new ArrayList<>();
|
|
||||||
|
|
||||||
for (final Intervalable currentInterval : this.intervals) {
|
|
||||||
switch (direction) {
|
switch (direction) {
|
||||||
case LEFT:
|
case LEFT :
|
||||||
if (currentInterval.getStart() <= interval.getEnd()) {
|
if (currentInterval.getStart() <= interval.getEnd()) {
|
||||||
overlaps.add(currentInterval);
|
overlaps.add(currentInterval);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case RIGHT:
|
case RIGHT :
|
||||||
if (currentInterval.getEnd() >= interval.getStart()) {
|
if (currentInterval.getEnd() >= interval.getStart()) {
|
||||||
overlaps.add(currentInterval);
|
overlaps.add(currentInterval);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return overlaps;
|
return overlaps;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
protected List<Intervalable> findOverlappingRanges(IntervalNode node, Intervalable interval) {
|
protected List<Intervalable> findOverlappingRanges(IntervalNode node, Intervalable interval) {
|
||||||
|
if (node != null) {
|
||||||
return node == null ? Collections.<Intervalable>emptyList() : node.findOverlaps(interval);
|
return node.findOverlaps(interval);
|
||||||
|
}
|
||||||
|
return Collections.emptyList();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,30 +1,26 @@
|
|||||||
package org.ahocorasick.interval;
|
package org.ahocorasick.interval;
|
||||||
|
|
||||||
|
import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.TreeSet;
|
import java.util.TreeSet;
|
||||||
|
|
||||||
import static java.util.Collections.sort;
|
|
||||||
|
|
||||||
public class IntervalTree {
|
public class IntervalTree {
|
||||||
|
|
||||||
private final IntervalNode rootNode;
|
private IntervalNode rootNode = null;
|
||||||
|
|
||||||
|
|
||||||
public IntervalTree(List<Intervalable> intervals) {
|
public IntervalTree(List<Intervalable> intervals) {
|
||||||
|
|
||||||
this.rootNode = new IntervalNode(intervals);
|
this.rootNode = new IntervalNode(intervals);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public List<Intervalable> removeOverlaps(List<Intervalable> intervals) {
|
||||||
public List<Intervalable> removeOverlaps(final List<Intervalable> intervals) {
|
|
||||||
|
|
||||||
// Sort the intervals on size, then left-most position
|
// Sort the intervals on size, then left-most position
|
||||||
sort(intervals, new IntervalableComparatorBySize());
|
Collections.sort(intervals, new IntervalableComparatorBySize());
|
||||||
|
|
||||||
final Set<Intervalable> removeIntervals = new TreeSet<>();
|
Set<Intervalable> removeIntervals = new TreeSet<Intervalable>();
|
||||||
|
|
||||||
for (final Intervalable interval : intervals) {
|
for (Intervalable interval : intervals) {
|
||||||
// If the interval was already removed, ignore it
|
// If the interval was already removed, ignore it
|
||||||
if (removeIntervals.contains(interval)) {
|
if (removeIntervals.contains(interval)) {
|
||||||
continue;
|
continue;
|
||||||
@ -35,19 +31,17 @@ public class IntervalTree {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Remove all intervals that were overlapping
|
// Remove all intervals that were overlapping
|
||||||
for (final Intervalable removeInterval : removeIntervals) {
|
for (Intervalable removeInterval : removeIntervals) {
|
||||||
intervals.remove(removeInterval);
|
intervals.remove(removeInterval);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sort the intervals, now on left-most position only
|
// Sort the intervals, now on left-most position only
|
||||||
sort(intervals, new IntervalableComparatorByPosition());
|
Collections.sort(intervals, new IntervalableComparatorByPosition());
|
||||||
|
|
||||||
return intervals;
|
return intervals;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public List<Intervalable> findOverlaps(Intervalable interval) {
|
||||||
public List<Intervalable> findOverlaps(final Intervalable interval) {
|
|
||||||
|
|
||||||
return rootNode.findOverlaps(interval);
|
return rootNode.findOverlaps(interval);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -2,12 +2,8 @@ package org.ahocorasick.interval;
|
|||||||
|
|
||||||
public interface Intervalable extends Comparable {
|
public interface Intervalable extends Comparable {
|
||||||
|
|
||||||
int getStart();
|
public int getStart();
|
||||||
|
public int getEnd();
|
||||||
|
public int size();
|
||||||
int getEnd();
|
|
||||||
|
|
||||||
|
|
||||||
int size();
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -5,8 +5,7 @@ import java.util.Comparator;
|
|||||||
public class IntervalableComparatorByPosition implements Comparator<Intervalable> {
|
public class IntervalableComparatorByPosition implements Comparator<Intervalable> {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int compare(final Intervalable intervalable, final Intervalable intervalable2) {
|
public int compare(Intervalable intervalable, Intervalable intervalable2) {
|
||||||
|
|
||||||
return intervalable.getStart() - intervalable2.getStart();
|
return intervalable.getStart() - intervalable2.getStart();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -5,14 +5,11 @@ import java.util.Comparator;
|
|||||||
public class IntervalableComparatorBySize implements Comparator<Intervalable> {
|
public class IntervalableComparatorBySize implements Comparator<Intervalable> {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int compare(final Intervalable intervalable, final Intervalable intervalable2) {
|
public int compare(Intervalable intervalable, Intervalable intervalable2) {
|
||||||
|
|
||||||
int comparison = intervalable2.size() - intervalable.size();
|
int comparison = intervalable2.size() - intervalable.size();
|
||||||
|
|
||||||
if (comparison == 0) {
|
if (comparison == 0) {
|
||||||
comparison = intervalable.getStart() - intervalable2.getStart();
|
comparison = intervalable.getStart() - intervalable2.getStart();
|
||||||
}
|
}
|
||||||
|
|
||||||
return comparison;
|
return comparison;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -1,27 +0,0 @@
|
|||||||
package org.ahocorasick.trie;
|
|
||||||
|
|
||||||
public class DefaultToken extends Token {
|
|
||||||
|
|
||||||
private PayloadToken<String> payloadToken;
|
|
||||||
|
|
||||||
|
|
||||||
public DefaultToken(PayloadToken<String> payloadToken) {
|
|
||||||
|
|
||||||
super(payloadToken.getFragment());
|
|
||||||
this.payloadToken = payloadToken;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public boolean isMatch() {
|
|
||||||
|
|
||||||
return payloadToken.isMatch();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public Emit getEmit() {
|
|
||||||
|
|
||||||
PayloadEmit<String> emit = payloadToken.getEmit();
|
|
||||||
return new Emit(emit.getStart(), emit.getEnd(), emit.getKeyword());
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -3,30 +3,21 @@ package org.ahocorasick.trie;
|
|||||||
import org.ahocorasick.interval.Interval;
|
import org.ahocorasick.interval.Interval;
|
||||||
import org.ahocorasick.interval.Intervalable;
|
import org.ahocorasick.interval.Intervalable;
|
||||||
|
|
||||||
/**
|
|
||||||
* Responsible for tracking the bounds of matched terms.
|
|
||||||
*/
|
|
||||||
public class Emit extends Interval implements Intervalable {
|
public class Emit extends Interval implements Intervalable {
|
||||||
|
|
||||||
private final String keyword;
|
private final String keyword;
|
||||||
|
|
||||||
|
|
||||||
public Emit(final int start, final int end, final String keyword) {
|
public Emit(final int start, final int end, final String keyword) {
|
||||||
|
|
||||||
super(start, end);
|
super(start, end);
|
||||||
this.keyword = keyword;
|
this.keyword = keyword;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public String getKeyword() {
|
public String getKeyword() {
|
||||||
|
|
||||||
return this.keyword;
|
return this.keyword;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
|
|
||||||
return super.toString() + "=" + this.keyword;
|
return super.toString() + "=" + this.keyword;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -3,22 +3,16 @@ package org.ahocorasick.trie;
|
|||||||
public class FragmentToken extends Token {
|
public class FragmentToken extends Token {
|
||||||
|
|
||||||
public FragmentToken(String fragment) {
|
public FragmentToken(String fragment) {
|
||||||
|
|
||||||
super(fragment);
|
super(fragment);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean isMatch() {
|
public boolean isMatch() {
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Emit getEmit() {
|
public Emit getEmit() {
|
||||||
|
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -2,27 +2,20 @@ package org.ahocorasick.trie;
|
|||||||
|
|
||||||
public class MatchToken extends Token {
|
public class MatchToken extends Token {
|
||||||
|
|
||||||
private final Emit emit;
|
private Emit emit;
|
||||||
|
|
||||||
|
|
||||||
public MatchToken(final String fragment, final Emit emit) {
|
|
||||||
|
|
||||||
|
public MatchToken(String fragment, Emit emit) {
|
||||||
super(fragment);
|
super(fragment);
|
||||||
this.emit = emit;
|
this.emit = emit;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean isMatch() {
|
public boolean isMatch() {
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Emit getEmit() {
|
public Emit getEmit() {
|
||||||
|
|
||||||
return this.emit;
|
return this.emit;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -1,21 +0,0 @@
|
|||||||
package org.ahocorasick.trie;
|
|
||||||
|
|
||||||
import lombok.EqualsAndHashCode;
|
|
||||||
import lombok.Getter;
|
|
||||||
import lombok.RequiredArgsConstructor;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Contains the matched keyword and some payload data.
|
|
||||||
*
|
|
||||||
* @param <T> The type of the wrapped payload data.
|
|
||||||
* @author Daniel Beck
|
|
||||||
*/
|
|
||||||
@Getter
|
|
||||||
@EqualsAndHashCode
|
|
||||||
@RequiredArgsConstructor
|
|
||||||
public class Payload<T> {
|
|
||||||
|
|
||||||
private final String keyword;
|
|
||||||
private final T data;
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,58 +0,0 @@
|
|||||||
package org.ahocorasick.trie;
|
|
||||||
|
|
||||||
import org.ahocorasick.interval.Interval;
|
|
||||||
import org.ahocorasick.interval.Intervalable;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Contains a matched term and its associated payload data.
|
|
||||||
*
|
|
||||||
* @param <T> Type of the wrapped payload-data.
|
|
||||||
* @author Daniel Beck
|
|
||||||
*/
|
|
||||||
public class PayloadEmit<T> extends Interval implements Intervalable {
|
|
||||||
|
|
||||||
private final String keyword;
|
|
||||||
|
|
||||||
private final T payload;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Created a PayloadEmit
|
|
||||||
*
|
|
||||||
* @param start Start of the matched search term.
|
|
||||||
* @param end End of the matched search term.
|
|
||||||
* @param keyword Keyword that matched.
|
|
||||||
* @param payload Emitted payload data.
|
|
||||||
*/
|
|
||||||
public PayloadEmit(final int start, final int end, String keyword, T payload) {
|
|
||||||
|
|
||||||
super(start, end);
|
|
||||||
this.keyword = keyword;
|
|
||||||
this.payload = payload;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public String getKeyword() {
|
|
||||||
|
|
||||||
return this.keyword;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the payload associated to this emit.
|
|
||||||
*
|
|
||||||
* @return the associated payload
|
|
||||||
*/
|
|
||||||
public T getPayload() {
|
|
||||||
|
|
||||||
return this.payload;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String toString() {
|
|
||||||
|
|
||||||
return super.toString() + "=" + this.keyword + (this.payload != null ? "->" + this.payload : "");
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,38 +0,0 @@
|
|||||||
package org.ahocorasick.trie;
|
|
||||||
|
|
||||||
/***
|
|
||||||
* Container for a token ("the fragment") that can emit a type of payload.
|
|
||||||
* <p>
|
|
||||||
* This token indicates a matching search term was not found, so
|
|
||||||
* {@link #isMatch()} always returns {@code false}.
|
|
||||||
* </p>
|
|
||||||
*
|
|
||||||
* @author Daniel Beck
|
|
||||||
*
|
|
||||||
* @param <T> The Type of the emitted payloads.
|
|
||||||
*/
|
|
||||||
public class PayloadFragmentToken<T> extends PayloadToken<T> {
|
|
||||||
|
|
||||||
public PayloadFragmentToken(String fragment) {
|
|
||||||
|
|
||||||
super(fragment);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean isMatch() {
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns null.
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public PayloadEmit<T> getEmit() {
|
|
||||||
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,38 +0,0 @@
|
|||||||
package org.ahocorasick.trie;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Container for a token ("the fragment") that can emit a type of payload.
|
|
||||||
* <p>
|
|
||||||
* This token indicates a matching search term was found, so {@link #isMatch()}
|
|
||||||
* always returns {@code true}.
|
|
||||||
* </p>
|
|
||||||
*
|
|
||||||
* @param <T> The Type of the emitted payloads.
|
|
||||||
* @author Daniel Beck
|
|
||||||
*/
|
|
||||||
public class PayloadMatchToken<T> extends PayloadToken<T> {
|
|
||||||
|
|
||||||
private final PayloadEmit<T> emit;
|
|
||||||
|
|
||||||
|
|
||||||
public PayloadMatchToken(final String fragment, final PayloadEmit<T> emit) {
|
|
||||||
|
|
||||||
super(fragment);
|
|
||||||
this.emit = emit;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean isMatch() {
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public PayloadEmit<T> getEmit() {
|
|
||||||
|
|
||||||
return this.emit;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,164 +0,0 @@
|
|||||||
package org.ahocorasick.trie;
|
|
||||||
|
|
||||||
import java.util.*;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
import lombok.Getter;
|
|
||||||
import lombok.Setter;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* <p>
|
|
||||||
* A state has various important tasks it must attend to:
|
|
||||||
* </p>
|
|
||||||
* <ul>
|
|
||||||
* <li>success; when a character points to another state, it must return that
|
|
||||||
* state</li>
|
|
||||||
* <li>failure; when a character has no matching state, the algorithm must be
|
|
||||||
* able to fall back on a state with less depth</li>
|
|
||||||
* <li>emits; when this state is passed and keywords have been matched, the
|
|
||||||
* matches and their payloads must be 'emitted' so that they can be used later
|
|
||||||
* on.</li>
|
|
||||||
* </ul>
|
|
||||||
* <p>
|
|
||||||
* The root state is special in the sense that it has no failure state; it
|
|
||||||
* cannot fail. If it 'fails' it will still parse the next character and start
|
|
||||||
* from the root node. This ensures that the algorithm always runs. All other
|
|
||||||
* states always have a fail state.
|
|
||||||
* </p>
|
|
||||||
*
|
|
||||||
* @author Daniel Beck
|
|
||||||
*/
|
|
||||||
public class PayloadState<T> {
|
|
||||||
|
|
||||||
/**
|
|
||||||
* effective the size of the keyword.
|
|
||||||
*/
|
|
||||||
@Getter
|
|
||||||
private final int depth;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* only used for the root state to refer to itself in case no matches have been
|
|
||||||
* found.
|
|
||||||
*/
|
|
||||||
private final PayloadState<T> rootState;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* referred to in the white paper as the 'goto' structure. From a state it is
|
|
||||||
* possible to go to other states, depending on the character passed.
|
|
||||||
*/
|
|
||||||
private final Map<Character, PayloadState<T>> success = new HashMap<>();
|
|
||||||
|
|
||||||
/**
|
|
||||||
* if no matching states are found, the failure state will be returned.
|
|
||||||
*/
|
|
||||||
@Getter
|
|
||||||
@Setter
|
|
||||||
private PayloadState<T> failure;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* whenever this state is reached, it will emit the matches keywords for future
|
|
||||||
* reference.
|
|
||||||
*/
|
|
||||||
private Set<Payload<T>> emits;
|
|
||||||
|
|
||||||
|
|
||||||
public PayloadState() {
|
|
||||||
|
|
||||||
this(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public PayloadState(final int depth) {
|
|
||||||
|
|
||||||
this.depth = depth;
|
|
||||||
this.rootState = depth == 0 ? this : null;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private PayloadState<T> nextState(final Character character, final boolean ignoreRootState) {
|
|
||||||
|
|
||||||
PayloadState<T> nextState = this.success.get(character);
|
|
||||||
|
|
||||||
if (!ignoreRootState && nextState == null && this.rootState != null) {
|
|
||||||
nextState = this.rootState;
|
|
||||||
}
|
|
||||||
|
|
||||||
return nextState;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public PayloadState<T> nextState(final Character character) {
|
|
||||||
|
|
||||||
return nextState(character, false);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public PayloadState<T> nextStateIgnoreRootState(Character character) {
|
|
||||||
|
|
||||||
return nextState(character, true);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public PayloadState<T> addState(Character character) {
|
|
||||||
|
|
||||||
PayloadState<T> nextState = nextStateIgnoreRootState(character);
|
|
||||||
if (nextState == null) {
|
|
||||||
nextState = new PayloadState<>(this.depth + 1);
|
|
||||||
this.success.put(character, nextState);
|
|
||||||
}
|
|
||||||
return nextState;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Adds a payload to be emitted for this state.
|
|
||||||
*
|
|
||||||
* @param payload to be emitted.
|
|
||||||
*/
|
|
||||||
public void addEmit(Payload<T> payload) {
|
|
||||||
|
|
||||||
if (this.emits == null) {
|
|
||||||
this.emits = new HashSet<>();
|
|
||||||
}
|
|
||||||
this.emits.add(payload);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Adds a collection of payloads to be emitted for this state.
|
|
||||||
*
|
|
||||||
* @param emits Collection of payloads to be emitted.
|
|
||||||
*/
|
|
||||||
public void addEmit(Collection<Payload<T>> emits) {
|
|
||||||
|
|
||||||
for (Payload<T> emit : emits) {
|
|
||||||
addEmit(emit);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns a collection of emitted payloads for this state.
|
|
||||||
*
|
|
||||||
* @return Collection of emitted payloads.
|
|
||||||
*/
|
|
||||||
public Collection<Payload<T>> emit() {
|
|
||||||
|
|
||||||
return this.emits == null ? Collections.<Payload<T>>emptyList() : this.emits.stream()
|
|
||||||
.sorted(Comparator.comparing(Payload::getKeyword))
|
|
||||||
.collect(Collectors.toList());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public Collection<PayloadState<T>> getStates() {
|
|
||||||
|
|
||||||
return this.success.values();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public Collection<Character> getTransitions() {
|
|
||||||
|
|
||||||
return this.success.keySet();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,41 +0,0 @@
|
|||||||
package org.ahocorasick.trie;
|
|
||||||
|
|
||||||
/***
|
|
||||||
* PayloadToken holds a text ("the fragment") an emits some output. If
|
|
||||||
* {@link #isMatch()} returns {@code true}, the token matched a search term.
|
|
||||||
*
|
|
||||||
* @author Daniel Beck
|
|
||||||
*
|
|
||||||
* @param <T> The Type of the emitted payloads.
|
|
||||||
*/
|
|
||||||
public abstract class PayloadToken<T> {
|
|
||||||
|
|
||||||
private String fragment;
|
|
||||||
|
|
||||||
|
|
||||||
public PayloadToken(String fragment) {
|
|
||||||
|
|
||||||
this.fragment = fragment;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public String getFragment() {
|
|
||||||
|
|
||||||
return this.fragment;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Return {@code true} if a search term matched.
|
|
||||||
*
|
|
||||||
* @return {@code true} if this is a match
|
|
||||||
*/
|
|
||||||
public abstract boolean isMatch();
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @return the payload
|
|
||||||
*/
|
|
||||||
public abstract PayloadEmit<T> getEmit();
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,587 +0,0 @@
|
|||||||
package org.ahocorasick.trie;
|
|
||||||
|
|
||||||
import static java.lang.Character.isWhitespace;
|
|
||||||
import static java.lang.Character.toLowerCase;
|
|
||||||
|
|
||||||
import java.util.Deque;
|
|
||||||
import java.util.LinkedList;
|
|
||||||
import java.util.Collection;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Queue;
|
|
||||||
import java.util.concurrent.LinkedBlockingDeque;
|
|
||||||
|
|
||||||
import org.ahocorasick.interval.IntervalTree;
|
|
||||||
import org.ahocorasick.interval.Intervalable;
|
|
||||||
import org.ahocorasick.trie.handler.DefaultPayloadEmitHandler;
|
|
||||||
import org.ahocorasick.trie.handler.PayloadEmitHandler;
|
|
||||||
import org.ahocorasick.trie.handler.StatefulPayloadEmitHandler;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* A trie implementation that carries a payload. See {@link Trie} for
|
|
||||||
* details on usage.
|
|
||||||
*
|
|
||||||
* <p>
|
|
||||||
* The payload trie adds the possibility to specify emitted payloads for each
|
|
||||||
* added keyword.
|
|
||||||
* </p>
|
|
||||||
*
|
|
||||||
* @param <T> The type of the supplied of the payload.
|
|
||||||
* @author Daniel Beck
|
|
||||||
*/
|
|
||||||
public class PayloadTrie<T> {
|
|
||||||
|
|
||||||
private final TrieConfig trieConfig;
|
|
||||||
|
|
||||||
private final PayloadState<T> rootState;
|
|
||||||
|
|
||||||
|
|
||||||
protected PayloadTrie(final TrieConfig trieConfig) {
|
|
||||||
|
|
||||||
this.trieConfig = trieConfig;
|
|
||||||
this.rootState = new PayloadState<>();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Used by the builder to add a text search keyword with an emit payload.
|
|
||||||
*
|
|
||||||
* @param keyword The search term to add to the list of search terms.
|
|
||||||
* @param emit the payload to emit for this search term.
|
|
||||||
* @throws NullPointerException if the keyword is null.
|
|
||||||
*/
|
|
||||||
private void addKeyword(String keyword, T emit) {
|
|
||||||
|
|
||||||
if (keyword.isEmpty()) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
addState(keyword).addEmit(new Payload<>(keyword, emit));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Used by the builder to add a text search keyword.
|
|
||||||
*
|
|
||||||
* @param keyword The search term to add to the list of search terms.
|
|
||||||
* @throws NullPointerException if the keyword is null.
|
|
||||||
*/
|
|
||||||
private void addKeyword(String keyword) {
|
|
||||||
|
|
||||||
if (keyword.isEmpty()) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
addState(keyword).addEmit(new Payload<>(keyword, null));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private PayloadState<T> addState(final String keyword) {
|
|
||||||
|
|
||||||
PayloadState<T> state = getRootState();
|
|
||||||
for (final Character character : keyword.toCharArray()) {
|
|
||||||
if (isIgnoreWhiteSpace() && isWhitespace(character)) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
Character adjustedChar = isCaseInsensitive() ? Character.toLowerCase(character) : character;
|
|
||||||
state = state.addState(adjustedChar);
|
|
||||||
}
|
|
||||||
return state;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Tokenizes the specified text and returns the emitted outputs.
|
|
||||||
*
|
|
||||||
* @param text The text to tokenize.
|
|
||||||
* @return the emitted outputs
|
|
||||||
*/
|
|
||||||
public Collection<PayloadToken<T>> tokenize(final String text) {
|
|
||||||
|
|
||||||
final Collection<PayloadToken<T>> tokens = new LinkedList<>();
|
|
||||||
final Collection<PayloadEmit<T>> collectedEmits = parseText(text);
|
|
||||||
int lastCollectedPosition = -1;
|
|
||||||
|
|
||||||
for (final PayloadEmit<T> emit : collectedEmits) {
|
|
||||||
if (emit.getStart() - lastCollectedPosition > 1) {
|
|
||||||
tokens.add(createFragment(emit, text, lastCollectedPosition));
|
|
||||||
}
|
|
||||||
|
|
||||||
tokens.add(createMatch(emit, text));
|
|
||||||
lastCollectedPosition = emit.getEnd();
|
|
||||||
}
|
|
||||||
|
|
||||||
if (text.length() - lastCollectedPosition > 1) {
|
|
||||||
tokens.add(createFragment(null, text, lastCollectedPosition));
|
|
||||||
}
|
|
||||||
|
|
||||||
return tokens;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private PayloadToken<T> createFragment(final PayloadEmit<T> emit, final String text, final int lastCollectedPosition) {
|
|
||||||
|
|
||||||
return new PayloadFragmentToken<>(text.substring(lastCollectedPosition + 1, emit == null ? text.length() : emit.getStart()));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private PayloadToken<T> createMatch(PayloadEmit<T> emit, String text) {
|
|
||||||
|
|
||||||
return new PayloadMatchToken<>(text.substring(emit.getStart(), emit.getEnd() + 1), emit);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Tokenizes a specified text and returns the emitted outputs.
|
|
||||||
*
|
|
||||||
* @param text The character sequence to tokenize.
|
|
||||||
* @return A collection of emits.
|
|
||||||
*/
|
|
||||||
public Collection<PayloadEmit<T>> parseText(final CharSequence text) {
|
|
||||||
|
|
||||||
return parseText(text, new DefaultPayloadEmitHandler<>());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Tokenizes the specified text by using a custom EmitHandler and returns the
|
|
||||||
* emitted outputs.
|
|
||||||
*
|
|
||||||
* @param text The character sequence to tokenize.
|
|
||||||
* @param emitHandler The handler that will be used to parse the text.
|
|
||||||
* @return A collection of emits.
|
|
||||||
*/
|
|
||||||
@SuppressWarnings("unchecked")
|
|
||||||
public Collection<PayloadEmit<T>> parseText(final CharSequence text, final StatefulPayloadEmitHandler<T> emitHandler) {
|
|
||||||
|
|
||||||
parseText(text, (PayloadEmitHandler<T>) emitHandler);
|
|
||||||
|
|
||||||
final List<PayloadEmit<T>> collectedEmits = emitHandler.getEmits();
|
|
||||||
|
|
||||||
if (!trieConfig.isAllowOverlaps()) {
|
|
||||||
IntervalTree intervalTree = new IntervalTree((List<Intervalable>) (List<?>) collectedEmits);
|
|
||||||
intervalTree.removeOverlaps((List<Intervalable>) (List<?>) collectedEmits);
|
|
||||||
}
|
|
||||||
|
|
||||||
return collectedEmits;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns true if the text contains one of the search terms; otherwise,
|
|
||||||
* returns false.
|
|
||||||
*
|
|
||||||
* @param text Specified text.
|
|
||||||
* @return true if the text contains one of the search terms. Else, returns
|
|
||||||
* false.
|
|
||||||
*/
|
|
||||||
public boolean containsMatch(final CharSequence text) {
|
|
||||||
|
|
||||||
return firstMatch(text) != null;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Tokenizes the specified text by using a custom EmitHandler and returns the
|
|
||||||
* emitted outputs.
|
|
||||||
*
|
|
||||||
* @param text The character sequence to tokenize.
|
|
||||||
* @param emitHandler The handler that will be used to parse the text.
|
|
||||||
*/
|
|
||||||
public void parseText(final CharSequence text, final PayloadEmitHandler<T> emitHandler) {
|
|
||||||
|
|
||||||
PayloadState<T> currentState = getRootState();
|
|
||||||
for (int position = 0; position < text.length(); position++) {
|
|
||||||
char character = text.charAt(position);
|
|
||||||
|
|
||||||
if (trieConfig.isIgnoreWhiteSpace() && isWhitespace(character)) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (trieConfig.isCaseInsensitive()) {
|
|
||||||
character = Character.toLowerCase(character);
|
|
||||||
}
|
|
||||||
|
|
||||||
currentState = getState(currentState, character);
|
|
||||||
final Collection<Payload<T>> payloads = currentState.emit();
|
|
||||||
if (processEmits(text, position, payloads, emitHandler) && trieConfig.isStopOnHit()) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* The first matching text sequence.
|
|
||||||
*
|
|
||||||
* @param text The text to search for keywords, must not be {@code null}.
|
|
||||||
* @return {@code null} if no matches found.
|
|
||||||
*/
|
|
||||||
public PayloadEmit<T> firstMatch(final CharSequence text) {
|
|
||||||
|
|
||||||
assert text != null;
|
|
||||||
|
|
||||||
if (!trieConfig.isAllowOverlaps()) {
|
|
||||||
// Slow path. Needs to find all the matches to detect overlaps.
|
|
||||||
final Collection<PayloadEmit<T>> parseText = parseText(text);
|
|
||||||
|
|
||||||
if (parseText != null && !parseText.isEmpty()) {
|
|
||||||
return parseText.iterator().next();
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// Fast path. Returns first match found.
|
|
||||||
PayloadState<T> currentState = getRootState();
|
|
||||||
|
|
||||||
for (int position = 0; position < text.length(); position++) {
|
|
||||||
char character = text.charAt(position);
|
|
||||||
|
|
||||||
if (trieConfig.isIgnoreWhiteSpace() && isWhitespace(character)) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (trieConfig.isCaseInsensitive()) {
|
|
||||||
character = Character.toLowerCase(character);
|
|
||||||
}
|
|
||||||
|
|
||||||
currentState = getState(currentState, character);
|
|
||||||
Collection<Payload<T>> payloads = currentState.emit();
|
|
||||||
|
|
||||||
if (payloads != null && !payloads.isEmpty()) {
|
|
||||||
for (final Payload<T> payload : payloads) {
|
|
||||||
int start;
|
|
||||||
if (isIgnoreWhiteSpace()) {
|
|
||||||
start = findStart(text, position, payload);
|
|
||||||
} else {
|
|
||||||
start = position - payload.getKeyword().length() + 1;
|
|
||||||
}
|
|
||||||
final PayloadEmit<T> emit = new PayloadEmit<>(start, position, payload.getKeyword(), payload.getData());
|
|
||||||
if (trieConfig.isOnlyWholeWords()) {
|
|
||||||
if (!isPartialMatch(text, emit)) {
|
|
||||||
return emit;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
return emit;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private boolean isPartialMatch(final CharSequence searchText, final PayloadEmit<T> emit) {
|
|
||||||
|
|
||||||
return (emit.getStart() != 0 && Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) || (emit.getEnd() + 1 != searchText.length() && Character.isAlphabetic(
|
|
||||||
searchText.charAt(emit.getEnd() + 1)));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private boolean isPartialMatchWhiteSpaceSeparated(final CharSequence searchText, final PayloadEmit<T> emit) {
|
|
||||||
|
|
||||||
final long size = searchText.length();
|
|
||||||
return (emit.getStart() != 0 && !isWhitespace(searchText.charAt(emit.getStart() - 1))) || (emit.getEnd() + 1 != size && !isWhitespace(searchText.charAt(emit.getEnd()
|
|
||||||
+ 1)));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private PayloadState<T> getState(PayloadState<T> currentState, final Character character) {
|
|
||||||
|
|
||||||
PayloadState<T> newCurrentState = currentState.nextState(character);
|
|
||||||
|
|
||||||
var tempState = currentState;
|
|
||||||
while (newCurrentState == null) {
|
|
||||||
tempState = tempState.getFailure();
|
|
||||||
newCurrentState = tempState.nextState(character);
|
|
||||||
}
|
|
||||||
|
|
||||||
return newCurrentState;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private void constructFailureStates() {
|
|
||||||
|
|
||||||
final Queue<PayloadState<T>> queue = new LinkedBlockingDeque<>();
|
|
||||||
final PayloadState<T> startState = getRootState();
|
|
||||||
|
|
||||||
// First, set the fail state of all depth 1 states to the root state
|
|
||||||
for (PayloadState<T> depthOneState : startState.getStates()) {
|
|
||||||
depthOneState.setFailure(startState);
|
|
||||||
queue.add(depthOneState);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Second, determine the fail state for all depth > 1 state
|
|
||||||
while (!queue.isEmpty()) {
|
|
||||||
final PayloadState<T> currentState = queue.remove();
|
|
||||||
|
|
||||||
for (final Character transition : currentState.getTransitions()) {
|
|
||||||
PayloadState<T> targetState = currentState.nextState(transition);
|
|
||||||
queue.add(targetState);
|
|
||||||
|
|
||||||
PayloadState<T> traceFailureState = currentState.getFailure();
|
|
||||||
while (traceFailureState.nextState(transition) == null) {
|
|
||||||
traceFailureState = traceFailureState.getFailure();
|
|
||||||
}
|
|
||||||
|
|
||||||
final PayloadState<T> newFailureState = traceFailureState.nextState(transition);
|
|
||||||
targetState.setFailure(newFailureState);
|
|
||||||
targetState.addEmit(newFailureState.emit());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private boolean processEmits(final CharSequence text, final int position, final Collection<Payload<T>> payloads, final PayloadEmitHandler<T> emitHandler) {
|
|
||||||
|
|
||||||
boolean emitted = false;
|
|
||||||
for (final Payload<T> payload : payloads) {
|
|
||||||
int start;
|
|
||||||
if (isIgnoreWhiteSpace()) {
|
|
||||||
start = findStart(text, position, payload);
|
|
||||||
} else {
|
|
||||||
start = position - payload.getKeyword().length() + 1;
|
|
||||||
}
|
|
||||||
final PayloadEmit<T> payloadEmit = new PayloadEmit<>(start, position, payload.getKeyword(), payload.getData());
|
|
||||||
if (!(trieConfig.isOnlyWholeWords() && isPartialMatch(text, payloadEmit)) && !(trieConfig.isOnlyWholeWordsWhiteSpaceSeparated() && isPartialMatchWhiteSpaceSeparated(
|
|
||||||
text,
|
|
||||||
payloadEmit))) {
|
|
||||||
emitted = emitHandler.emit(payloadEmit) || emitted;
|
|
||||||
if (emitted && trieConfig.isStopOnHit()) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return emitted;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private int findStart(CharSequence text, int position, Payload<T> payload) {
|
|
||||||
|
|
||||||
Deque<Character> stack = new LinkedList<>();
|
|
||||||
int i;
|
|
||||||
for (i = 0; i < payload.getKeyword().length(); i++) {
|
|
||||||
if (isWhitespace(payload.getKeyword().charAt(i))) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
stack.push(isCaseInsensitive() ? toLowerCase(payload.getKeyword().charAt(i)) : payload.getKeyword().charAt(i));
|
|
||||||
}
|
|
||||||
for (i = position; !stack.isEmpty() && i >= 0; --i) {
|
|
||||||
char c = isCaseInsensitive() ? toLowerCase(text.charAt(i)) : text.charAt(i);
|
|
||||||
if (c == stack.peek()) {
|
|
||||||
stack.pop();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return i + 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private boolean isCaseInsensitive() {
|
|
||||||
|
|
||||||
return trieConfig.isCaseInsensitive();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private boolean isIgnoreWhiteSpace() {
|
|
||||||
|
|
||||||
return trieConfig.isIgnoreWhiteSpace();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private PayloadState<T> getRootState() {
|
|
||||||
|
|
||||||
return this.rootState;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Provides a fluent interface for constructing Trie instances with payloads.
|
|
||||||
*
|
|
||||||
* @param <T> The type of the emitted payload.
|
|
||||||
* @return The builder used to configure its Trie.
|
|
||||||
*/
|
|
||||||
public static <T> PayloadTrieBuilder<T> builder() {
|
|
||||||
|
|
||||||
return new PayloadTrieBuilder<>();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Builder class to create a PayloadTrie instance.
|
|
||||||
*
|
|
||||||
* @param <T> The type of the emitted payload.
|
|
||||||
*/
|
|
||||||
public static final class PayloadTrieBuilder<T> {
|
|
||||||
|
|
||||||
private final TrieConfig trieConfig = new TrieConfig();
|
|
||||||
|
|
||||||
private final PayloadTrie<T> trie = new PayloadTrie<>(trieConfig);
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Default (empty) constructor.
|
|
||||||
*/
|
|
||||||
private PayloadTrieBuilder() {
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Configure the Trie to ignore case when searching for keywords in the text.
|
|
||||||
* This must be called before calling addKeyword because the algorithm converts
|
|
||||||
* keywords to lowercase as they are added, depending on this case sensitivity
|
|
||||||
* setting.
|
|
||||||
*
|
|
||||||
* @return This builder.
|
|
||||||
*/
|
|
||||||
public PayloadTrieBuilder<T> ignoreCase() {
|
|
||||||
|
|
||||||
this.trieConfig.setCaseInsensitive(true);
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Configure the Trie to ignore overlapping keywords.
|
|
||||||
*
|
|
||||||
* @return This builder.
|
|
||||||
*/
|
|
||||||
public PayloadTrieBuilder<T> ignoreOverlaps() {
|
|
||||||
|
|
||||||
this.trieConfig.setAllowOverlaps(false);
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Adds a keyword to the {@link Trie}'s list of text search keywords.
|
|
||||||
* No {@link Payload} is supplied.
|
|
||||||
*
|
|
||||||
* @param keyword The keyword to add to the list.
|
|
||||||
* @return This builder.
|
|
||||||
* @throws NullPointerException if the keyword is null.
|
|
||||||
*/
|
|
||||||
public PayloadTrieBuilder<T> addKeyword(final String keyword) {
|
|
||||||
|
|
||||||
this.trie.addKeyword(keyword);
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Adds a keyword and a payload to the {@link Trie}'s list of text
|
|
||||||
* search keywords.
|
|
||||||
*
|
|
||||||
* @param keyword The keyword to add to the list.
|
|
||||||
* @param payload the payload to add
|
|
||||||
* @return This builder.
|
|
||||||
* @throws NullPointerException if the keyword is null.
|
|
||||||
*/
|
|
||||||
public PayloadTrieBuilder<T> addKeyword(final String keyword, final T payload) {
|
|
||||||
|
|
||||||
this.trie.addKeyword(keyword, payload);
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Adds a list of keywords and payloads to the {@link Trie}'s list of
|
|
||||||
* text search keywords.
|
|
||||||
*
|
|
||||||
* @param keywords The keywords to add to the list.
|
|
||||||
* @return This builder.
|
|
||||||
*/
|
|
||||||
public PayloadTrieBuilder<T> addKeywords(final Collection<Payload<T>> keywords) {
|
|
||||||
|
|
||||||
for (Payload<T> payload : keywords) {
|
|
||||||
this.trie.addKeyword(payload.getKeyword(), payload.getData());
|
|
||||||
}
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Configure the Trie to match whole keywords in the text.
|
|
||||||
*
|
|
||||||
* @return This builder.
|
|
||||||
*/
|
|
||||||
public PayloadTrieBuilder<T> onlyWholeWords() {
|
|
||||||
|
|
||||||
this.trieConfig.setOnlyWholeWords(true);
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Configure the Trie to match whole keywords that are separated by whitespace
|
|
||||||
* in the text. For example, "this keyword thatkeyword" would only match the
|
|
||||||
* first occurrence of "keyword".
|
|
||||||
*
|
|
||||||
* @return This builder.
|
|
||||||
*/
|
|
||||||
public PayloadTrieBuilder<T> onlyWholeWordsWhiteSpaceSeparated() {
|
|
||||||
|
|
||||||
this.trieConfig.setOnlyWholeWordsWhiteSpaceSeparated(true);
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Configure the Trie to stop after the first keyword is found in the text.
|
|
||||||
*
|
|
||||||
* @return This builder.
|
|
||||||
*/
|
|
||||||
public PayloadTrieBuilder<T> stopOnHit() {
|
|
||||||
|
|
||||||
trie.trieConfig.setStopOnHit(true);
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Configure the PayloadTrie based on the builder settings.
|
|
||||||
*
|
|
||||||
* @return The configured PayloadTrie.
|
|
||||||
*/
|
|
||||||
public PayloadTrie<T> build() {
|
|
||||||
|
|
||||||
this.trie.constructFailureStates();
|
|
||||||
return this.trie;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @return This builder.
|
|
||||||
* @deprecated Use ignoreCase()
|
|
||||||
*/
|
|
||||||
@Deprecated
|
|
||||||
public PayloadTrieBuilder<T> caseInsensitive() {
|
|
||||||
|
|
||||||
return ignoreCase();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @return This builder.
|
|
||||||
* @deprecated Use ignoreOverlaps()
|
|
||||||
*/
|
|
||||||
@Deprecated
|
|
||||||
public PayloadTrieBuilder<T> removeOverlaps() {
|
|
||||||
|
|
||||||
return ignoreOverlaps();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Configure the Trie to ignore whitespaces.
|
|
||||||
*
|
|
||||||
* @return This builder.
|
|
||||||
*/
|
|
||||||
public PayloadTrieBuilder<T> ignoreWhiteSpace() {
|
|
||||||
|
|
||||||
trieConfig.setIgnoreWhiteSpace(true);
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -2,152 +2,116 @@ package org.ahocorasick.trie;
|
|||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
import lombok.Getter;
|
|
||||||
import lombok.Setter;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* <p>
|
* <p>
|
||||||
* A state has various important tasks it must attend to:
|
* A state has various important tasks it must attend to:
|
||||||
* </p>
|
* </p>
|
||||||
|
*
|
||||||
* <ul>
|
* <ul>
|
||||||
* <li>success; when a character points to another state, it must return that state</li>
|
* <li>success; when a character points to another state, it must return that state</li>
|
||||||
* <li>failure; when a character has no matching state, the algorithm must be able to fall back on a
|
* <li>failure; when a character has no matching state, the algorithm must be able to fall back on a
|
||||||
* state with less depth</li>
|
* state with less depth</li>
|
||||||
* <li>emits; when this state is passed and keywords have been matched, the matches must be
|
* <li>emits; when this state is passed and keywords have been matched, the matches must be
|
||||||
* 'emitted' so that they can be used later on.</li>
|
* 'emitted' so that they can be used later on.</li>
|
||||||
* </ul>
|
* </ul>
|
||||||
|
*
|
||||||
* <p>
|
* <p>
|
||||||
* The root state is special in the sense that it has no failure state; it cannot fail. If it 'fails'
|
* The root state is special in the sense that it has no failure state; it cannot fail. If it 'fails'
|
||||||
* it will still parse the next character and start from the root node. This ensures that the algorithm
|
* it will still parse the next character and start from the root node. This ensures that the algorithm
|
||||||
* always runs. All other states always have a fail state.
|
* always runs. All other states always have a fail state.
|
||||||
* </p>
|
* </p>
|
||||||
*
|
*
|
||||||
* @author Robert Bor
|
* @author Robert Bor
|
||||||
*/
|
*/
|
||||||
public class State {
|
public class State {
|
||||||
|
|
||||||
/**
|
/** effective the size of the keyword */
|
||||||
* effective the size of the keyword
|
|
||||||
*/
|
|
||||||
@Getter
|
|
||||||
private final int depth;
|
private final int depth;
|
||||||
|
|
||||||
/**
|
/** only used for the root state to refer to itself in case no matches have been found */
|
||||||
* only used for the root state to refer to itself in case no matches have been found
|
|
||||||
*/
|
|
||||||
private final State rootState;
|
private final State rootState;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* referred to in the white paper as the 'goto' structure. From a state it is possible to go
|
* referred to in the white paper as the 'goto' structure. From a state it is possible to go
|
||||||
* to other states, depending on the character passed.
|
* to other states, depending on the character passed.
|
||||||
*/
|
*/
|
||||||
private final Map<Character, State> success = new HashMap<>();
|
private Map<Character,State> success = new HashMap<Character, State>();
|
||||||
|
|
||||||
/**
|
/** if no matching states are found, the failure state will be returned */
|
||||||
* if no matching states are found, the failure state will be returned
|
private State failure = null;
|
||||||
*/
|
|
||||||
@Setter
|
|
||||||
@Getter
|
|
||||||
private State failure;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* whenever this state is reached, it will emit the matches keywords for future reference
|
|
||||||
*/
|
|
||||||
private Set<String> emits;
|
|
||||||
|
|
||||||
|
/** whenever this state is reached, it will emit the matches keywords for future reference */
|
||||||
|
private Set<String> emits = null;
|
||||||
|
|
||||||
public State() {
|
public State() {
|
||||||
|
|
||||||
this(0);
|
this(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public State(int depth) {
|
||||||
public State(final int depth) {
|
|
||||||
|
|
||||||
this.depth = depth;
|
this.depth = depth;
|
||||||
this.rootState = depth == 0 ? this : null;
|
this.rootState = depth == 0 ? this : null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private State nextState(Character character, boolean ignoreRootState) {
|
||||||
private State nextState(final Character character, final boolean ignoreRootState) {
|
|
||||||
|
|
||||||
State nextState = this.success.get(character);
|
State nextState = this.success.get(character);
|
||||||
|
|
||||||
if (!ignoreRootState && nextState == null && this.rootState != null) {
|
if (!ignoreRootState && nextState == null && this.rootState != null) {
|
||||||
nextState = this.rootState;
|
nextState = this.rootState;
|
||||||
}
|
}
|
||||||
|
|
||||||
return nextState;
|
return nextState;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public State nextState(Character character) {
|
||||||
public State nextState(final Character character) {
|
|
||||||
|
|
||||||
return nextState(character, false);
|
return nextState(character, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public State nextStateIgnoreRootState(Character character) {
|
public State nextStateIgnoreRootState(Character character) {
|
||||||
|
|
||||||
return nextState(character, true);
|
return nextState(character, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public State addState(String keyword) {
|
|
||||||
|
|
||||||
State state = this;
|
|
||||||
|
|
||||||
for (final Character character : keyword.toCharArray()) {
|
|
||||||
state = state.addState(character);
|
|
||||||
}
|
|
||||||
|
|
||||||
return state;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public State addState(Character character) {
|
public State addState(Character character) {
|
||||||
|
|
||||||
State nextState = nextStateIgnoreRootState(character);
|
State nextState = nextStateIgnoreRootState(character);
|
||||||
if (nextState == null) {
|
if (nextState == null) {
|
||||||
nextState = new State(this.depth + 1);
|
nextState = new State(this.depth+1);
|
||||||
this.success.put(character, nextState);
|
this.success.put(character, nextState);
|
||||||
}
|
}
|
||||||
return nextState;
|
return nextState;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public int getDepth() {
|
||||||
|
return this.depth;
|
||||||
|
}
|
||||||
|
|
||||||
public void addEmit(String keyword) {
|
public void addEmit(String keyword) {
|
||||||
|
|
||||||
if (this.emits == null) {
|
if (this.emits == null) {
|
||||||
this.emits = new TreeSet<>();
|
this.emits = new TreeSet<>();
|
||||||
}
|
}
|
||||||
this.emits.add(keyword);
|
this.emits.add(keyword);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void addEmit(Collection<String> emits) {
|
public void addEmit(Collection<String> emits) {
|
||||||
|
|
||||||
for (String emit : emits) {
|
for (String emit : emits) {
|
||||||
addEmit(emit);
|
addEmit(emit);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public Collection<String> emit() {
|
public Collection<String> emit() {
|
||||||
|
return this.emits == null ? Collections.<String> emptyList() : this.emits;
|
||||||
return this.emits == null ? Collections.<String>emptyList() : this.emits;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public State failure() {
|
||||||
|
return this.failure;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setFailure(State failState) {
|
||||||
|
this.failure = failState;
|
||||||
|
}
|
||||||
|
|
||||||
public Collection<State> getStates() {
|
public Collection<State> getStates() {
|
||||||
|
|
||||||
return this.success.values();
|
return this.success.values();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public Collection<Character> getTransitions() {
|
public Collection<Character> getTransitions() {
|
||||||
|
|
||||||
return this.success.keySet();
|
return this.success.keySet();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -4,22 +4,16 @@ public abstract class Token {
|
|||||||
|
|
||||||
private String fragment;
|
private String fragment;
|
||||||
|
|
||||||
|
|
||||||
public Token(String fragment) {
|
public Token(String fragment) {
|
||||||
|
|
||||||
this.fragment = fragment;
|
this.fragment = fragment;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public String getFragment() {
|
public String getFragment() {
|
||||||
|
|
||||||
return this.fragment;
|
return this.fragment;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public abstract boolean isMatch();
|
public abstract boolean isMatch();
|
||||||
|
|
||||||
|
|
||||||
public abstract Emit getEmit();
|
public abstract Emit getEmit();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,264 +1,283 @@
|
|||||||
package org.ahocorasick.trie;
|
package org.ahocorasick.trie;
|
||||||
|
|
||||||
|
import org.ahocorasick.interval.IntervalTree;
|
||||||
|
import org.ahocorasick.interval.Intervalable;
|
||||||
|
import org.ahocorasick.trie.handler.DefaultEmitHandler;
|
||||||
|
import org.ahocorasick.trie.handler.EmitHandler;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
|
import java.util.List;
|
||||||
import org.ahocorasick.trie.PayloadTrie.PayloadTrieBuilder;
|
import java.util.Queue;
|
||||||
import org.ahocorasick.trie.handler.EmitHandler;
|
import java.util.concurrent.LinkedBlockingDeque;
|
||||||
import org.ahocorasick.trie.handler.StatefulPayloadEmitDelegateHandler;
|
|
||||||
import org.ahocorasick.trie.handler.PayloadEmitDelegateHandler;
|
|
||||||
import org.ahocorasick.trie.handler.StatefulEmitHandler;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Based on the <a href="http://cr.yp.to/bib/1975/aho.pdf">Aho-Corasick white
|
|
||||||
* paper</a>, from Bell technologies.
|
|
||||||
*
|
*
|
||||||
|
* Based on the Aho-Corasick white paper, Bell technologies: http://cr.yp.to/bib/1975/aho.pdf
|
||||||
* @author Robert Bor
|
* @author Robert Bor
|
||||||
*/
|
*/
|
||||||
public final class Trie {
|
public class Trie {
|
||||||
|
|
||||||
private final PayloadTrie<String> payloadTrie;
|
private TrieConfig trieConfig;
|
||||||
|
|
||||||
|
private State rootState;
|
||||||
|
|
||||||
private Trie(final PayloadTrie<String> payloadTrie) {
|
private Trie(TrieConfig trieConfig) {
|
||||||
|
this.trieConfig = trieConfig;
|
||||||
this.payloadTrie = payloadTrie;
|
this.rootState = new State();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void addKeyword(String keyword) {
|
||||||
public Collection<Token> tokenize(final String text) {
|
if (keyword == null || keyword.length() == 0) {
|
||||||
|
return;
|
||||||
Collection<PayloadToken<String>> tokens = this.payloadTrie.tokenize(text);
|
|
||||||
return asTokens(tokens);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private static Collection<Token> asTokens(Collection<PayloadToken<String>> tokens) {
|
|
||||||
|
|
||||||
Collection<Token> result = new ArrayList<>();
|
|
||||||
for (PayloadToken<String> payloadToken : tokens) {
|
|
||||||
result.add(new DefaultToken(payloadToken));
|
|
||||||
}
|
}
|
||||||
return result;
|
State currentState = this.rootState;
|
||||||
|
String caseAdjustedKeyword = adjustCase(keyword).toString();
|
||||||
|
for (Character character : caseAdjustedKeyword.toCharArray())
|
||||||
|
currentState = currentState.addState(character);
|
||||||
|
currentState.addEmit(caseAdjustedKeyword);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Collection<Token> tokenize(String text) {
|
||||||
|
|
||||||
private static Collection<Emit> asEmits(Collection<PayloadEmit<String>> emits) {
|
Collection<Token> tokens = new ArrayList<>();
|
||||||
|
|
||||||
Collection<Emit> result = new ArrayList<>();
|
Collection<Emit> collectedEmits = parseText(text);
|
||||||
for (PayloadEmit<String> emit : emits) {
|
int lastCollectedPosition = -1;
|
||||||
result.add(asEmit(emit));
|
for (Emit emit : collectedEmits) {
|
||||||
|
if (emit.getStart() - lastCollectedPosition > 1) {
|
||||||
|
tokens.add(createFragment(emit, text, lastCollectedPosition));
|
||||||
|
}
|
||||||
|
tokens.add(createMatch(emit, text));
|
||||||
|
lastCollectedPosition = emit.getEnd();
|
||||||
}
|
}
|
||||||
return result;
|
if (text.length() - lastCollectedPosition > 1) {
|
||||||
|
tokens.add(createFragment(null, text, lastCollectedPosition));
|
||||||
|
}
|
||||||
|
|
||||||
|
return tokens;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private Token createFragment(Emit emit, String text, int lastCollectedPosition) {
|
||||||
private static Emit asEmit(PayloadEmit<String> payloadEmit) {
|
return new FragmentToken(text.substring(lastCollectedPosition+1, emit == null ? text.length() : emit.getStart()));
|
||||||
|
|
||||||
return new Emit(payloadEmit.getStart(), payloadEmit.getEnd(), payloadEmit.getKeyword());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private Token createMatch(Emit emit, String text) {
|
||||||
public Collection<Emit> parseText(final CharSequence text) {
|
return new MatchToken(text.substring(emit.getStart(), emit.getEnd()+1), emit);
|
||||||
|
|
||||||
Collection<PayloadEmit<String>> parsedText = this.payloadTrie.parseText(text);
|
|
||||||
return asEmits(parsedText);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
public Collection<Emit> parseText(CharSequence text) {
|
||||||
|
DefaultEmitHandler emitHandler = new DefaultEmitHandler();
|
||||||
|
parseText(text, emitHandler);
|
||||||
|
|
||||||
@SuppressWarnings("UnusedReturnValue")
|
List<Emit> collectedEmits = emitHandler.getEmits();
|
||||||
public Collection<Emit> parseText(final CharSequence text, final StatefulEmitHandler emitHandler) {
|
|
||||||
|
|
||||||
Collection<PayloadEmit<String>> parsedText = this.payloadTrie.parseText(text, new StatefulPayloadEmitDelegateHandler(emitHandler));
|
if (trieConfig.isOnlyWholeWords()) {
|
||||||
return asEmits(parsedText);
|
removePartialMatches(text, collectedEmits);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (trieConfig.isOnlyWholeWordsWhiteSpaceSeparated()) {
|
||||||
|
removePartialMatchesWhiteSpaceSeparated(text, collectedEmits);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!trieConfig.isAllowOverlaps()) {
|
||||||
|
IntervalTree intervalTree = new IntervalTree((List<Intervalable>) (List<?>) collectedEmits);
|
||||||
|
intervalTree.removeOverlaps((List<Intervalable>) (List<?>) collectedEmits);
|
||||||
|
}
|
||||||
|
|
||||||
|
return collectedEmits;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public boolean containsMatch(CharSequence text) {
|
||||||
public boolean containsMatch(final CharSequence text) {
|
Emit firstMatch = firstMatch(text);
|
||||||
|
return firstMatch != null;
|
||||||
return firstMatch(text) != null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void parseText(CharSequence text, EmitHandler emitHandler) {
|
||||||
|
State currentState = this.rootState;
|
||||||
|
CharSequence caseAdjustedText = adjustCase(text);
|
||||||
|
for (int position = 0; position < caseAdjustedText.length(); position++) {
|
||||||
|
Character character = caseAdjustedText.charAt(position);
|
||||||
|
currentState = getState(currentState, character);
|
||||||
|
if (storeEmits(position, currentState, emitHandler) && trieConfig.isStopOnHit()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public void parseText(final CharSequence text, final EmitHandler emitHandler) {
|
|
||||||
|
|
||||||
this.payloadTrie.parseText(text, new PayloadEmitDelegateHandler(emitHandler));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Emit firstMatch(CharSequence text) {
|
||||||
/**
|
if (!trieConfig.isAllowOverlaps()) {
|
||||||
* The first matching text sequence.
|
// Slow path. Needs to find all the matches to detect overlaps.
|
||||||
*
|
Collection<Emit> parseText = parseText(text);
|
||||||
* @param text The text to search for keywords, must not be {@code null}.
|
if (parseText != null && !parseText.isEmpty()) {
|
||||||
* @return {@code null} if no matches found.
|
return parseText.iterator().next();
|
||||||
*/
|
}
|
||||||
public Emit firstMatch(final CharSequence text) {
|
} else {
|
||||||
|
// Fast path. Returns first match found.
|
||||||
assert text != null;
|
State currentState = this.rootState;
|
||||||
|
CharSequence caseAdjustedText = adjustCase(text);
|
||||||
final PayloadEmit<String> payload = this.payloadTrie.firstMatch(text);
|
for (int position = 0; position < caseAdjustedText.length(); position++) {
|
||||||
return payload == null ? null : new Emit(payload.getStart(), payload.getEnd(), payload.getKeyword());
|
Character character = caseAdjustedText.charAt(position);
|
||||||
|
currentState = getState(currentState, character);
|
||||||
|
Collection<String> emitStrs = currentState.emit();
|
||||||
|
if (emitStrs != null && !emitStrs.isEmpty()) {
|
||||||
|
for (String emitStr : emitStrs) {
|
||||||
|
final Emit emit = new Emit(position - emitStr.length() + 1, position, emitStr);
|
||||||
|
if (trieConfig.isOnlyWholeWords()) {
|
||||||
|
if (!isPartialMatch(text, emit)) {
|
||||||
|
return emit;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return emit;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private boolean isPartialMatch(CharSequence searchText, Emit emit) {
|
||||||
|
return (emit.getStart() != 0 &&
|
||||||
|
Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) ||
|
||||||
|
(emit.getEnd() + 1 != searchText.length() &&
|
||||||
|
Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1)));
|
||||||
|
}
|
||||||
|
|
||||||
|
private void removePartialMatches(CharSequence searchText, List<Emit> collectedEmits) {
|
||||||
|
List<Emit> removeEmits = new ArrayList<>();
|
||||||
|
for (Emit emit : collectedEmits) {
|
||||||
|
if (isPartialMatch(searchText, emit)) {
|
||||||
|
removeEmits.add(emit);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (Emit removeEmit : removeEmits) {
|
||||||
|
collectedEmits.remove(removeEmit);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void removePartialMatchesWhiteSpaceSeparated(CharSequence searchText, List<Emit> collectedEmits) {
|
||||||
|
long size = searchText.length();
|
||||||
|
List<Emit> removeEmits = new ArrayList<>();
|
||||||
|
for (Emit emit : collectedEmits) {
|
||||||
|
if ((emit.getStart() == 0 || Character.isWhitespace(searchText.charAt(emit.getStart() - 1))) &&
|
||||||
|
(emit.getEnd() + 1 == size || Character.isWhitespace(searchText.charAt(emit.getEnd() + 1)))) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
removeEmits.add(emit);
|
||||||
|
}
|
||||||
|
for (Emit removeEmit : removeEmits) {
|
||||||
|
collectedEmits.remove(removeEmit);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private State getState(State currentState, Character character) {
|
||||||
|
State newCurrentState = currentState.nextState(character);
|
||||||
|
while (newCurrentState == null) {
|
||||||
|
currentState = currentState.failure();
|
||||||
|
newCurrentState = currentState.nextState(character);
|
||||||
|
}
|
||||||
|
return newCurrentState;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void constructFailureStates() {
|
||||||
|
Queue<State> queue = new LinkedBlockingDeque<>();
|
||||||
|
|
||||||
|
// First, set the fail state of all depth 1 states to the root state
|
||||||
|
for (State depthOneState : this.rootState.getStates()) {
|
||||||
|
depthOneState.setFailure(this.rootState);
|
||||||
|
queue.add(depthOneState);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Second, determine the fail state for all depth > 1 state
|
||||||
|
while (!queue.isEmpty()) {
|
||||||
|
State currentState = queue.remove();
|
||||||
|
|
||||||
|
for (Character transition : currentState.getTransitions()) {
|
||||||
|
State targetState = currentState.nextState(transition);
|
||||||
|
queue.add(targetState);
|
||||||
|
|
||||||
|
State traceFailureState = currentState.failure();
|
||||||
|
while (traceFailureState.nextState(transition) == null) {
|
||||||
|
traceFailureState = traceFailureState.failure();
|
||||||
|
}
|
||||||
|
State newFailureState = traceFailureState.nextState(transition);
|
||||||
|
targetState.setFailure(newFailureState);
|
||||||
|
targetState.addEmit(newFailureState.emit());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean storeEmits(int position, State currentState, EmitHandler emitHandler) {
|
||||||
|
boolean emitted = false;
|
||||||
|
Collection<String> emits = currentState.emit();
|
||||||
|
if (emits != null && !emits.isEmpty()) {
|
||||||
|
for (String emit : emits) {
|
||||||
|
emitHandler.emit(new Emit(position - emit.length() + 1, position, emit));
|
||||||
|
emitted = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return emitted;
|
||||||
|
}
|
||||||
|
|
||||||
|
private CharSequence adjustCase(CharSequence text) {
|
||||||
|
if(trieConfig.isCaseInsensitive()){
|
||||||
|
char[] textChars = text.toString().toCharArray();
|
||||||
|
char[] adjustedTextChars = new char[textChars.length];
|
||||||
|
for(int i = 0; i < textChars.length; i++)
|
||||||
|
adjustedTextChars[i] = Character.toLowerCase(textChars[i]);
|
||||||
|
return new String(adjustedTextChars);
|
||||||
|
}
|
||||||
|
return text;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Provides a fluent interface for constructing Trie instances.
|
|
||||||
*
|
|
||||||
* @return The builder used to configure its Trie.
|
|
||||||
*/
|
|
||||||
public static TrieBuilder builder() {
|
public static TrieBuilder builder() {
|
||||||
|
|
||||||
return new TrieBuilder();
|
return new TrieBuilder();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static class TrieBuilder {
|
||||||
|
|
||||||
public static final class TrieBuilder {
|
private TrieConfig trieConfig = new TrieConfig();
|
||||||
|
|
||||||
private final PayloadTrieBuilder<String> delegate = PayloadTrie.builder();
|
private Trie trie = new Trie(trieConfig);
|
||||||
|
|
||||||
|
private TrieBuilder() {}
|
||||||
|
|
||||||
/**
|
public TrieBuilder caseInsensitive() {
|
||||||
* Default (empty) constructor.
|
this.trieConfig.setCaseInsensitive(true);
|
||||||
*/
|
|
||||||
private TrieBuilder() {
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Configure the Trie to ignore case when searching for keywords in the text.
|
|
||||||
* This must be called before calling addKeyword because the algorithm converts
|
|
||||||
* keywords to lowercase as they are added, depending on this case sensitivity
|
|
||||||
* setting.
|
|
||||||
*
|
|
||||||
* @return This builder.
|
|
||||||
*/
|
|
||||||
public TrieBuilder ignoreCase() {
|
|
||||||
|
|
||||||
delegate.ignoreCase();
|
|
||||||
// this.trieConfig.setCaseInsensitive(true);
|
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public TrieBuilder removeOverlaps() {
|
||||||
/**
|
this.trieConfig.setAllowOverlaps(false);
|
||||||
* Configure the Trie to ignore overlapping keywords.
|
|
||||||
*
|
|
||||||
* @return This builder.
|
|
||||||
*/
|
|
||||||
public TrieBuilder ignoreOverlaps() {
|
|
||||||
|
|
||||||
delegate.ignoreOverlaps();
|
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Configure the Trie to ignore whitespaces.
|
|
||||||
*
|
|
||||||
* @return This builder.
|
|
||||||
*/
|
|
||||||
public TrieBuilder ignoreWhiteSpace() {
|
|
||||||
|
|
||||||
delegate.ignoreWhiteSpace();
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Adds a keyword to the Trie's list of text search keywords.
|
|
||||||
*
|
|
||||||
* @param keyword The keyword to add to the list.
|
|
||||||
* @return This builder.
|
|
||||||
* @throws NullPointerException if the keyword is null.
|
|
||||||
*/
|
|
||||||
public TrieBuilder addKeyword(final String keyword) {
|
|
||||||
|
|
||||||
delegate.addKeyword(keyword, null);
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Adds a list of keywords to the Trie's list of text search keywords.
|
|
||||||
*
|
|
||||||
* @param keywords The keywords to add to the list.
|
|
||||||
* @return This builder.
|
|
||||||
*/
|
|
||||||
public TrieBuilder addKeywords(final String... keywords) {
|
|
||||||
|
|
||||||
for (String keyword : keywords) {
|
|
||||||
delegate.addKeyword(keyword, null);
|
|
||||||
}
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Adds a list of keywords to the Trie's list of text search keywords.
|
|
||||||
*
|
|
||||||
* @param keywords The keywords to add to the list.
|
|
||||||
* @return This builder.
|
|
||||||
*/
|
|
||||||
@SuppressWarnings("unused")
|
|
||||||
public TrieBuilder addKeywords(final Collection<String> keywords) {
|
|
||||||
|
|
||||||
for (String keyword : keywords) {
|
|
||||||
this.delegate.addKeyword(keyword, null);
|
|
||||||
}
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Configure the Trie to match whole keywords in the text.
|
|
||||||
*
|
|
||||||
* @return This builder.
|
|
||||||
*/
|
|
||||||
public TrieBuilder onlyWholeWords() {
|
public TrieBuilder onlyWholeWords() {
|
||||||
|
this.trieConfig.setOnlyWholeWords(true);
|
||||||
this.delegate.onlyWholeWords();
|
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Configure the Trie to match whole keywords that are separated by whitespace
|
|
||||||
* in the text. For example, "this keyword thatkeyword" would only match the
|
|
||||||
* first occurrence of "keyword".
|
|
||||||
*
|
|
||||||
* @return This builder.
|
|
||||||
*/
|
|
||||||
public TrieBuilder onlyWholeWordsWhiteSpaceSeparated() {
|
public TrieBuilder onlyWholeWordsWhiteSpaceSeparated() {
|
||||||
|
this.trieConfig.setOnlyWholeWordsWhiteSpaceSeparated(true);
|
||||||
this.delegate.onlyWholeWordsWhiteSpaceSeparated();
|
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public TrieBuilder addKeyword(String keyword) {
|
||||||
|
trie.addKeyword(keyword);
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Configure the Trie to stop after the first keyword is found in the text.
|
|
||||||
*
|
|
||||||
* @return This builder.
|
|
||||||
*/
|
|
||||||
public TrieBuilder stopOnHit() {
|
public TrieBuilder stopOnHit() {
|
||||||
|
trie.trieConfig.setStopOnHit(true);
|
||||||
this.delegate.stopOnHit();
|
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Configure the Trie based on the builder settings.
|
|
||||||
*
|
|
||||||
* @return The configured Trie.
|
|
||||||
*/
|
|
||||||
public Trie build() {
|
public Trie build() {
|
||||||
|
trie.constructFailureStates();
|
||||||
PayloadTrie<String> payloadTrie = this.delegate.build();
|
return trie;
|
||||||
return new Trie(payloadTrie);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -4,86 +4,45 @@ public class TrieConfig {
|
|||||||
|
|
||||||
private boolean allowOverlaps = true;
|
private boolean allowOverlaps = true;
|
||||||
|
|
||||||
private boolean onlyWholeWords;
|
private boolean onlyWholeWords = false;
|
||||||
|
|
||||||
private boolean onlyWholeWordsWhiteSpaceSeparated;
|
private boolean onlyWholeWordsWhiteSpaceSeparated = false;
|
||||||
|
|
||||||
private boolean caseInsensitive;
|
private boolean caseInsensitive = false;
|
||||||
|
|
||||||
private boolean ignoreWhiteSpace;
|
private boolean stopOnHit = false;
|
||||||
|
|
||||||
private boolean stopOnHit;
|
public boolean isStopOnHit() { return stopOnHit; }
|
||||||
|
|
||||||
|
|
||||||
public boolean isStopOnHit() {
|
|
||||||
|
|
||||||
return stopOnHit;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public void setStopOnHit(boolean stopOnHit) {
|
|
||||||
|
|
||||||
this.stopOnHit = stopOnHit;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
public void setStopOnHit(boolean stopOnHit) { this.stopOnHit = stopOnHit; }
|
||||||
|
|
||||||
public boolean isAllowOverlaps() {
|
public boolean isAllowOverlaps() {
|
||||||
|
|
||||||
return allowOverlaps;
|
return allowOverlaps;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void setAllowOverlaps(boolean allowOverlaps) {
|
public void setAllowOverlaps(boolean allowOverlaps) {
|
||||||
|
|
||||||
this.allowOverlaps = allowOverlaps;
|
this.allowOverlaps = allowOverlaps;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public boolean isOnlyWholeWords() {
|
public boolean isOnlyWholeWords() {
|
||||||
|
|
||||||
return onlyWholeWords;
|
return onlyWholeWords;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void setOnlyWholeWords(boolean onlyWholeWords) {
|
public void setOnlyWholeWords(boolean onlyWholeWords) {
|
||||||
|
|
||||||
this.onlyWholeWords = onlyWholeWords;
|
this.onlyWholeWords = onlyWholeWords;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public boolean isOnlyWholeWordsWhiteSpaceSeparated() { return onlyWholeWordsWhiteSpaceSeparated; }
|
||||||
public boolean isOnlyWholeWordsWhiteSpaceSeparated() {
|
|
||||||
|
|
||||||
return onlyWholeWordsWhiteSpaceSeparated;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public void setOnlyWholeWordsWhiteSpaceSeparated(boolean onlyWholeWordsWhiteSpaceSeparated) {
|
public void setOnlyWholeWordsWhiteSpaceSeparated(boolean onlyWholeWordsWhiteSpaceSeparated) {
|
||||||
|
|
||||||
this.onlyWholeWordsWhiteSpaceSeparated = onlyWholeWordsWhiteSpaceSeparated;
|
this.onlyWholeWordsWhiteSpaceSeparated = onlyWholeWordsWhiteSpaceSeparated;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public boolean isCaseInsensitive() {
|
public boolean isCaseInsensitive() {
|
||||||
|
|
||||||
return caseInsensitive;
|
return caseInsensitive;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public boolean isIgnoreWhiteSpace() {
|
|
||||||
|
|
||||||
return ignoreWhiteSpace;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public void setCaseInsensitive(boolean caseInsensitive) {
|
public void setCaseInsensitive(boolean caseInsensitive) {
|
||||||
|
|
||||||
this.caseInsensitive = caseInsensitive;
|
this.caseInsensitive = caseInsensitive;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void setIgnoreWhiteSpace(boolean ignoreWhiteSpace) {
|
|
||||||
|
|
||||||
this.ignoreWhiteSpace = ignoreWhiteSpace;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,25 +0,0 @@
|
|||||||
package org.ahocorasick.trie.handler;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import org.ahocorasick.trie.Emit;
|
|
||||||
|
|
||||||
public abstract class AbstractStatefulEmitHandler implements StatefulEmitHandler {
|
|
||||||
|
|
||||||
private final List<Emit> emits = new ArrayList<>();
|
|
||||||
|
|
||||||
|
|
||||||
public void addEmit(final Emit emit) {
|
|
||||||
|
|
||||||
this.emits.add(emit);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public List<Emit> getEmits() {
|
|
||||||
|
|
||||||
return this.emits;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,25 +0,0 @@
|
|||||||
package org.ahocorasick.trie.handler;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import org.ahocorasick.trie.PayloadEmit;
|
|
||||||
|
|
||||||
public abstract class AbstractStatefulPayloadEmitHandler<T> implements StatefulPayloadEmitHandler<T> {
|
|
||||||
|
|
||||||
private final List<PayloadEmit<T>> emits = new ArrayList<>();
|
|
||||||
|
|
||||||
|
|
||||||
public void addEmit(final PayloadEmit<T> emit) {
|
|
||||||
|
|
||||||
this.emits.add(emit);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public List<PayloadEmit<T>> getEmits() {
|
|
||||||
|
|
||||||
return this.emits;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,26 +1,20 @@
|
|||||||
package org.ahocorasick.trie.handler;
|
package org.ahocorasick.trie.handler;
|
||||||
|
|
||||||
|
import org.ahocorasick.trie.Emit;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import org.ahocorasick.trie.Emit;
|
public class DefaultEmitHandler implements EmitHandler {
|
||||||
|
|
||||||
public class DefaultEmitHandler implements StatefulEmitHandler {
|
|
||||||
|
|
||||||
private final List<Emit> emits = new ArrayList<>();
|
|
||||||
|
|
||||||
|
private List<Emit> emits = new ArrayList<>();
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean emit(final Emit emit) {
|
public void emit(Emit emit) {
|
||||||
|
|
||||||
this.emits.add(emit);
|
this.emits.add(emit);
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public List<Emit> getEmits() {
|
public List<Emit> getEmits() {
|
||||||
|
|
||||||
return this.emits;
|
return this.emits;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -1,27 +0,0 @@
|
|||||||
package org.ahocorasick.trie.handler;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import org.ahocorasick.trie.PayloadEmit;
|
|
||||||
|
|
||||||
public class DefaultPayloadEmitHandler<T> implements StatefulPayloadEmitHandler<T> {
|
|
||||||
|
|
||||||
private final List<PayloadEmit<T>> emits = new ArrayList<>();
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean emit(final PayloadEmit<T> emit) {
|
|
||||||
|
|
||||||
this.emits.add(emit);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public List<PayloadEmit<T>> getEmits() {
|
|
||||||
|
|
||||||
return this.emits;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -3,7 +3,5 @@ package org.ahocorasick.trie.handler;
|
|||||||
import org.ahocorasick.trie.Emit;
|
import org.ahocorasick.trie.Emit;
|
||||||
|
|
||||||
public interface EmitHandler {
|
public interface EmitHandler {
|
||||||
|
void emit(Emit emit);
|
||||||
boolean emit(Emit emit);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,29 +0,0 @@
|
|||||||
package org.ahocorasick.trie.handler;
|
|
||||||
|
|
||||||
import org.ahocorasick.trie.Emit;
|
|
||||||
import org.ahocorasick.trie.PayloadEmit;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Convenience wrapper class that delegates every method to an
|
|
||||||
* instance of {@link EmitHandler}.
|
|
||||||
*/
|
|
||||||
public class PayloadEmitDelegateHandler implements PayloadEmitHandler<String> {
|
|
||||||
|
|
||||||
private EmitHandler handler;
|
|
||||||
|
|
||||||
|
|
||||||
public PayloadEmitDelegateHandler(EmitHandler handler) {
|
|
||||||
|
|
||||||
this.handler = handler;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean emit(PayloadEmit<String> emit) {
|
|
||||||
|
|
||||||
Emit newEmit = new Emit(emit.getStart(), emit.getEnd(), emit.getKeyword());
|
|
||||||
return handler.emit(newEmit);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,9 +0,0 @@
|
|||||||
package org.ahocorasick.trie.handler;
|
|
||||||
|
|
||||||
import org.ahocorasick.trie.PayloadEmit;
|
|
||||||
|
|
||||||
public interface PayloadEmitHandler<T> {
|
|
||||||
|
|
||||||
boolean emit(PayloadEmit<T> emit);
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,11 +0,0 @@
|
|||||||
package org.ahocorasick.trie.handler;
|
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import org.ahocorasick.trie.Emit;
|
|
||||||
|
|
||||||
public interface StatefulEmitHandler extends EmitHandler {
|
|
||||||
|
|
||||||
List<Emit> getEmits();
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,51 +0,0 @@
|
|||||||
package org.ahocorasick.trie.handler;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Collection;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import org.ahocorasick.trie.Emit;
|
|
||||||
import org.ahocorasick.trie.PayloadEmit;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Convenience wrapper class that delegates every method to a
|
|
||||||
* {@link StatefulPayloadEmitHandler}.
|
|
||||||
*/
|
|
||||||
public class StatefulPayloadEmitDelegateHandler implements StatefulPayloadEmitHandler<String> {
|
|
||||||
|
|
||||||
private StatefulEmitHandler handler;
|
|
||||||
|
|
||||||
|
|
||||||
public StatefulPayloadEmitDelegateHandler(StatefulEmitHandler handler) {
|
|
||||||
|
|
||||||
this.handler = handler;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private static List<PayloadEmit<String>> asEmits(Collection<Emit> emits) {
|
|
||||||
|
|
||||||
List<PayloadEmit<String>> result = new ArrayList<>();
|
|
||||||
for (Emit emit : emits) {
|
|
||||||
result.add(new PayloadEmit<String>(emit.getStart(), emit.getEnd(), emit.getKeyword(), null));
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean emit(PayloadEmit<String> emit) {
|
|
||||||
|
|
||||||
Emit newEmit = new Emit(emit.getStart(), emit.getEnd(), emit.getKeyword());
|
|
||||||
return handler.emit(newEmit);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public List<PayloadEmit<String>> getEmits() {
|
|
||||||
|
|
||||||
List<Emit> emits = this.handler.getEmits();
|
|
||||||
return asEmits(emits);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,11 +0,0 @@
|
|||||||
package org.ahocorasick.trie.handler;
|
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import org.ahocorasick.trie.PayloadEmit;
|
|
||||||
|
|
||||||
public interface StatefulPayloadEmitHandler<T> extends PayloadEmitHandler<T> {
|
|
||||||
|
|
||||||
List<PayloadEmit<T>> getEmits();
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -2,83 +2,56 @@ package org.ahocorasick.interval;
|
|||||||
|
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
import java.util.Iterator;
|
import java.util.*;
|
||||||
import java.util.Set;
|
|
||||||
import java.util.TreeSet;
|
|
||||||
|
|
||||||
import static org.junit.Assert.*;
|
import static junit.framework.Assert.assertEquals;
|
||||||
|
import static junit.framework.Assert.assertFalse;
|
||||||
|
import static junit.framework.Assert.assertTrue;
|
||||||
|
|
||||||
public class IntervalTest {
|
public class IntervalTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test_construct() {
|
public void construct() {
|
||||||
|
Interval i = new Interval(1,3);
|
||||||
final Interval i = new Interval(1, 3);
|
|
||||||
assertEquals(1, i.getStart());
|
assertEquals(1, i.getStart());
|
||||||
assertEquals(3, i.getEnd());
|
assertEquals(3, i.getEnd());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test_size() {
|
public void size() {
|
||||||
|
assertEquals(3, new Interval(0,2).size());
|
||||||
assertEquals(3, new Interval(0, 2).size());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test_intervaloverlaps() {
|
public void intervaloverlaps() {
|
||||||
|
assertTrue(new Interval(1,3).overlapsWith(new Interval(2,4)));
|
||||||
assertTrue(new Interval(1, 3).overlapsWith(new Interval(2, 4)));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test_intervalDoesNotOverlap() {
|
public void intervalDoesNotOverlap() {
|
||||||
|
|
||||||
assertFalse(new Interval(1, 13).overlapsWith(new Interval(27, 42)));
|
assertFalse(new Interval(1, 13).overlapsWith(new Interval(27, 42)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test_pointOverlaps() {
|
public void pointOverlaps() {
|
||||||
|
assertTrue(new Interval(1,3).overlapsWith(2));
|
||||||
assertTrue(new Interval(1, 3).overlapsWith(2));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test_pointDoesNotOverlap() {
|
public void pointDoesNotOverlap() {
|
||||||
|
|
||||||
assertFalse(new Interval(1, 13).overlapsWith(42));
|
assertFalse(new Interval(1, 13).overlapsWith(42));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test_comparable() {
|
public void comparable() {
|
||||||
|
Set<Interval> intervals = new TreeSet<Interval>();
|
||||||
final Set<Interval> intervals = new TreeSet<>();
|
|
||||||
intervals.add(new Interval(4, 6));
|
intervals.add(new Interval(4, 6));
|
||||||
intervals.add(new Interval(2, 7));
|
intervals.add(new Interval(2, 7));
|
||||||
intervals.add(new Interval(3, 4));
|
intervals.add(new Interval(3, 4));
|
||||||
final Iterator<Interval> it = intervals.iterator();
|
Iterator<Interval> it = intervals.iterator();
|
||||||
assertEquals(2, it.next().getStart());
|
assertEquals(2, it.next().getStart());
|
||||||
assertEquals(3, it.next().getStart());
|
assertEquals(3, it.next().getStart());
|
||||||
assertEquals(4, it.next().getStart());
|
assertEquals(4, it.next().getStart());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void test_checkToString() {
|
|
||||||
|
|
||||||
assertEquals("4:6", new Interval(4, 6).toString());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void test_compareToNegativeTest() {
|
|
||||||
|
|
||||||
assertEquals(-1, new Interval(4, 6).compareTo(new Object()));
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -9,11 +9,10 @@ import java.util.List;
|
|||||||
import static junit.framework.Assert.assertEquals;
|
import static junit.framework.Assert.assertEquals;
|
||||||
|
|
||||||
public class IntervalTreeTest {
|
public class IntervalTreeTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void findOverlaps() {
|
public void findOverlaps() {
|
||||||
|
List<Intervalable> intervals = new ArrayList<Intervalable>();
|
||||||
List<Intervalable> intervals = new ArrayList<>();
|
|
||||||
intervals.add(new Interval(0, 2));
|
intervals.add(new Interval(0, 2));
|
||||||
intervals.add(new Interval(1, 3));
|
intervals.add(new Interval(1, 3));
|
||||||
intervals.add(new Interval(2, 4));
|
intervals.add(new Interval(2, 4));
|
||||||
@ -21,7 +20,7 @@ public class IntervalTreeTest {
|
|||||||
intervals.add(new Interval(4, 6));
|
intervals.add(new Interval(4, 6));
|
||||||
intervals.add(new Interval(5, 7));
|
intervals.add(new Interval(5, 7));
|
||||||
IntervalTree intervalTree = new IntervalTree(intervals);
|
IntervalTree intervalTree = new IntervalTree(intervals);
|
||||||
List<Intervalable> overlaps = intervalTree.findOverlaps(new Interval(1, 3));
|
List<Intervalable> overlaps = intervalTree.findOverlaps(new Interval(1,3));
|
||||||
assertEquals(3, overlaps.size());
|
assertEquals(3, overlaps.size());
|
||||||
Iterator<Intervalable> overlapsIt = overlaps.iterator();
|
Iterator<Intervalable> overlapsIt = overlaps.iterator();
|
||||||
assertOverlap(overlapsIt.next(), 2, 4);
|
assertOverlap(overlapsIt.next(), 2, 4);
|
||||||
@ -29,11 +28,9 @@ public class IntervalTreeTest {
|
|||||||
assertOverlap(overlapsIt.next(), 0, 2);
|
assertOverlap(overlapsIt.next(), 0, 2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void removeOverlaps() {
|
public void removeOverlaps() {
|
||||||
|
List<Intervalable> intervals = new ArrayList<Intervalable>();
|
||||||
List<Intervalable> intervals = new ArrayList<>();
|
|
||||||
intervals.add(new Interval(0, 2));
|
intervals.add(new Interval(0, 2));
|
||||||
intervals.add(new Interval(4, 5));
|
intervals.add(new Interval(4, 5));
|
||||||
intervals.add(new Interval(2, 10));
|
intervals.add(new Interval(2, 10));
|
||||||
@ -46,11 +43,9 @@ public class IntervalTreeTest {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
protected void assertOverlap(Intervalable interval, int expectedStart, int expectedEnd) {
|
protected void assertOverlap(Intervalable interval, int expectedStart, int expectedEnd) {
|
||||||
|
|
||||||
assertEquals(expectedStart, interval.getStart());
|
assertEquals(expectedStart, interval.getStart());
|
||||||
assertEquals(expectedEnd, interval.getEnd());
|
assertEquals(expectedEnd, interval.getEnd());
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -12,11 +12,10 @@ public class IntervalableComparatorByPositionTest {
|
|||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void sortOnPosition() {
|
public void sortOnPosition() {
|
||||||
|
|
||||||
List<Intervalable> intervals = new ArrayList<Intervalable>();
|
List<Intervalable> intervals = new ArrayList<Intervalable>();
|
||||||
intervals.add(new Interval(4, 5));
|
intervals.add(new Interval(4,5));
|
||||||
intervals.add(new Interval(1, 4));
|
intervals.add(new Interval(1,4));
|
||||||
intervals.add(new Interval(3, 8));
|
intervals.add(new Interval(3,8));
|
||||||
Collections.sort(intervals, new IntervalableComparatorByPosition());
|
Collections.sort(intervals, new IntervalableComparatorByPosition());
|
||||||
assertEquals(4, intervals.get(0).size());
|
assertEquals(4, intervals.get(0).size());
|
||||||
assertEquals(6, intervals.get(1).size());
|
assertEquals(6, intervals.get(1).size());
|
||||||
|
|||||||
@ -12,24 +12,21 @@ public class IntervalableComparatorBySizeTest {
|
|||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void sortOnSize() {
|
public void sortOnSize() {
|
||||||
|
|
||||||
List<Intervalable> intervals = new ArrayList<Intervalable>();
|
List<Intervalable> intervals = new ArrayList<Intervalable>();
|
||||||
intervals.add(new Interval(4, 5));
|
intervals.add(new Interval(4,5));
|
||||||
intervals.add(new Interval(1, 4));
|
intervals.add(new Interval(1,4));
|
||||||
intervals.add(new Interval(3, 8));
|
intervals.add(new Interval(3,8));
|
||||||
Collections.sort(intervals, new IntervalableComparatorBySize());
|
Collections.sort(intervals, new IntervalableComparatorBySize());
|
||||||
assertEquals(6, intervals.get(0).size());
|
assertEquals(6, intervals.get(0).size());
|
||||||
assertEquals(4, intervals.get(1).size());
|
assertEquals(4, intervals.get(1).size());
|
||||||
assertEquals(2, intervals.get(2).size());
|
assertEquals(2, intervals.get(2).size());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void sortOnSizeThenPosition() {
|
public void sortOnSizeThenPosition() {
|
||||||
|
|
||||||
List<Intervalable> intervals = new ArrayList<Intervalable>();
|
List<Intervalable> intervals = new ArrayList<Intervalable>();
|
||||||
intervals.add(new Interval(4, 7));
|
intervals.add(new Interval(4,7));
|
||||||
intervals.add(new Interval(2, 5));
|
intervals.add(new Interval(2,5));
|
||||||
Collections.sort(intervals, new IntervalableComparatorBySize());
|
Collections.sort(intervals, new IntervalableComparatorBySize());
|
||||||
assertEquals(2, intervals.get(0).getStart());
|
assertEquals(2, intervals.get(0).getStart());
|
||||||
assertEquals(4, intervals.get(1).getStart());
|
assertEquals(4, intervals.get(1).getStart());
|
||||||
|
|||||||
@ -2,35 +2,23 @@ package org.ahocorasick.trie;
|
|||||||
|
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
import static org.junit.Assert.assertEquals;
|
import static junit.framework.Assert.assertEquals;
|
||||||
import static org.junit.Assert.assertNotEquals;
|
import static junit.framework.Assert.assertNotSame;
|
||||||
|
|
||||||
/**
|
|
||||||
* Test the {@link Emit} class functionality.
|
|
||||||
*/
|
|
||||||
public class EmitTest {
|
public class EmitTest {
|
||||||
|
|
||||||
/**
|
|
||||||
* Test that two {@link Emit} instances having the same values are equal.
|
|
||||||
*/
|
|
||||||
@Test
|
@Test
|
||||||
public void test_Equality_SameValues_ObjectsAreEqual() {
|
public void equals() {
|
||||||
|
Emit one = new Emit(13, 42, null);
|
||||||
final Emit one = new Emit(13, 42, null);
|
Emit two = new Emit(13, 42, null);
|
||||||
final Emit two = new Emit(13, 42, null);
|
|
||||||
assertEquals(one, two);
|
assertEquals(one, two);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Test that two {@link Emit} instances having different values are equal.
|
|
||||||
*/
|
|
||||||
@Test
|
@Test
|
||||||
public void test_Equality_DifferingValues_ObjectsAreNotEqual() {
|
public void notEquals() {
|
||||||
|
Emit one = new Emit(13, 42, null);
|
||||||
final Emit one = new Emit(13, 42, null);
|
Emit two = new Emit(13, 43, null);
|
||||||
final Emit two = new Emit(13, 43, null);
|
assertNotSame(one, two);
|
||||||
assertNotEquals(one, two);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,607 +0,0 @@
|
|||||||
package org.ahocorasick.trie;
|
|
||||||
|
|
||||||
import org.ahocorasick.trie.handler.AbstractStatefulPayloadEmitHandler;
|
|
||||||
import org.ahocorasick.trie.handler.PayloadEmitHandler;
|
|
||||||
import org.ahocorasick.trie.handler.StatefulPayloadEmitHandler;
|
|
||||||
import org.junit.Test;
|
|
||||||
|
|
||||||
import java.util.Collection;
|
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.LinkedList;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import static java.util.Arrays.asList;
|
|
||||||
import static org.ahocorasick.trie.TestHelper.injectKeyword;
|
|
||||||
import static org.ahocorasick.trie.TestHelper.randomNumbers;
|
|
||||||
import static org.junit.Assert.*;
|
|
||||||
|
|
||||||
public class PayloadTrieTest {
|
|
||||||
|
|
||||||
private final static String[] ALPHABET = new String[]{"abc", "bcd", "cde"};
|
|
||||||
private final static String[] ALPHABET_PAYLOAD = new String[]{"alpha:abc", "alpha:bcd", "alpha:cde"};
|
|
||||||
|
|
||||||
private final static List<Payload<String>> ALPHABET_WITH_PAYLOADS = asList(new Payload<>(ALPHABET[0], ALPHABET_PAYLOAD[0]),
|
|
||||||
new Payload<>(ALPHABET[1], ALPHABET_PAYLOAD[1]),
|
|
||||||
new Payload<>(ALPHABET[2], ALPHABET_PAYLOAD[2]));
|
|
||||||
|
|
||||||
private final static String[] PRONOUNS = new String[]{"hers", "his", "she", "he"};
|
|
||||||
private final static int[] PRONOUNS_PAYLOAD_ID = new int[]{9, 12, 4, 20};
|
|
||||||
|
|
||||||
private final static List<Payload<Integer>> PRONOUNS_WITH_PAYLOADS = asList(new Payload<>(PRONOUNS[0], PRONOUNS_PAYLOAD_ID[0]),
|
|
||||||
new Payload<>(PRONOUNS[1], PRONOUNS_PAYLOAD_ID[1]),
|
|
||||||
new Payload<>(PRONOUNS[2], PRONOUNS_PAYLOAD_ID[2]),
|
|
||||||
new Payload<>(PRONOUNS[3], PRONOUNS_PAYLOAD_ID[3]));
|
|
||||||
|
|
||||||
private final static String[] FOOD = new String[]{"veal", "cauliflower", "broccoli", "tomatoes"};
|
|
||||||
private final static Food[] FOOD_PAYLOAD = new Food[]{new Food("veal"), new Food("cauliflower"), new Food("broccoli"), new Food("tomatoes")};
|
|
||||||
|
|
||||||
private final static List<Payload<Food>> FOOD_WITH_PAYLOADS = asList(new Payload<>(FOOD[0], FOOD_PAYLOAD[0]),
|
|
||||||
new Payload<>(FOOD[1], FOOD_PAYLOAD[1]),
|
|
||||||
new Payload<>(FOOD[2], FOOD_PAYLOAD[2]),
|
|
||||||
new Payload<>(FOOD[3], FOOD_PAYLOAD[3]));
|
|
||||||
|
|
||||||
private final static String[] GREEK_LETTERS = new String[]{"Alpha", "Beta", "Gamma"};
|
|
||||||
private final static String[] GREEK_LETTERS_PAYLOAD = new String[]{"greek:Alpha", "greek:Beta", "greek:Gamma"};
|
|
||||||
|
|
||||||
private final static List<Payload<String>> GREEK_LETTERS_WITH_PAYLOADS = asList(new Payload<>(GREEK_LETTERS[0], GREEK_LETTERS_PAYLOAD[0]),
|
|
||||||
new Payload<>(GREEK_LETTERS[1], GREEK_LETTERS_PAYLOAD[1]),
|
|
||||||
new Payload<>(GREEK_LETTERS[2], GREEK_LETTERS_PAYLOAD[2]));
|
|
||||||
|
|
||||||
private final static String[] UNICODE = new String[]{"turning", "once", "again", "börkü"};
|
|
||||||
private final static String[] UNICODE_PAYLOAD = new String[]{"uni:turning", "uni:once", "uni:again", "uni:börkü"};
|
|
||||||
|
|
||||||
private final static List<Payload<String>> UNICODE_WITH_PAYLOADS = asList(new Payload<>(UNICODE[0], UNICODE_PAYLOAD[0]),
|
|
||||||
new Payload<>(UNICODE[1], UNICODE_PAYLOAD[1]),
|
|
||||||
new Payload<>(UNICODE[2], UNICODE_PAYLOAD[2]),
|
|
||||||
new Payload<>(UNICODE[3], UNICODE_PAYLOAD[3]));
|
|
||||||
|
|
||||||
public static class Food {
|
|
||||||
|
|
||||||
private final String name;
|
|
||||||
|
|
||||||
|
|
||||||
public Food(String name) {
|
|
||||||
|
|
||||||
this.name = name;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int hashCode() {
|
|
||||||
|
|
||||||
final int prime = 31;
|
|
||||||
int result = 1;
|
|
||||||
result = prime * result + ((name == null) ? 0 : name.hashCode());
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean equals(Object obj) {
|
|
||||||
|
|
||||||
if (this == obj) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
if (obj == null) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (getClass() != obj.getClass()) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
Food other = (Food) obj;
|
|
||||||
if (name == null) {
|
|
||||||
return other.name == null;
|
|
||||||
} else {
|
|
||||||
return name.equals(other.name);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void keywordAndTextAreTheSame() {
|
|
||||||
|
|
||||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword(ALPHABET[0], ALPHABET_PAYLOAD[0]).build();
|
|
||||||
Collection<PayloadEmit<String>> emits = trie.parseText(ALPHABET[0]);
|
|
||||||
Iterator<PayloadEmit<String>> iterator = emits.iterator();
|
|
||||||
checkEmit(iterator.next(), 0, 2, ALPHABET[0], ALPHABET_PAYLOAD[0]);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void keywordAndTextAreTheSameFirstMatch() {
|
|
||||||
|
|
||||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword(ALPHABET[0], ALPHABET_PAYLOAD[0]).build();
|
|
||||||
PayloadEmit<String> firstMatch = trie.firstMatch(ALPHABET[0]);
|
|
||||||
checkEmit(firstMatch, 0, 2, ALPHABET[0], ALPHABET_PAYLOAD[0]);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void textIsLongerThanKeyword() {
|
|
||||||
|
|
||||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword(ALPHABET[0], ALPHABET_PAYLOAD[0]).build();
|
|
||||||
Collection<PayloadEmit<String>> emits = trie.parseText(" " + ALPHABET[0]);
|
|
||||||
Iterator<PayloadEmit<String>> iterator = emits.iterator();
|
|
||||||
checkEmit(iterator.next(), 1, 3, ALPHABET[0], ALPHABET_PAYLOAD[0]);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void textIsLongerThanKeywordFirstMatch() {
|
|
||||||
|
|
||||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword(ALPHABET[0], ALPHABET_PAYLOAD[0]).build();
|
|
||||||
PayloadEmit<String> firstMatch = trie.firstMatch(" " + ALPHABET[0]);
|
|
||||||
checkEmit(firstMatch, 1, 3, ALPHABET[0], ALPHABET_PAYLOAD[0]);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void variousKeywordsOneMatch() {
|
|
||||||
|
|
||||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeywords(ALPHABET_WITH_PAYLOADS).build();
|
|
||||||
Collection<PayloadEmit<String>> emits = trie.parseText("bcd");
|
|
||||||
Iterator<PayloadEmit<String>> iterator = emits.iterator();
|
|
||||||
checkEmit(iterator.next(), 0, 2, "bcd", "alpha:bcd");
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void variousKeywordsFirstMatch() {
|
|
||||||
|
|
||||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeywords(ALPHABET_WITH_PAYLOADS).build();
|
|
||||||
PayloadEmit<String> firstMatch = trie.firstMatch("bcd");
|
|
||||||
checkEmit(firstMatch, 0, 2, "bcd", "alpha:bcd");
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void ushersTestAndStopOnHit() {
|
|
||||||
|
|
||||||
PayloadTrie<Integer> trie = PayloadTrie.<Integer>builder().addKeywords(PRONOUNS_WITH_PAYLOADS).stopOnHit().build();
|
|
||||||
Collection<PayloadEmit<Integer>> emits = trie.parseText("ushers");
|
|
||||||
assertEquals(1, emits.size()); // she @ 3, he @ 3, hers @ 5
|
|
||||||
Iterator<PayloadEmit<Integer>> iterator = emits.iterator();
|
|
||||||
checkEmit(iterator.next(), 2, 3, "he", 20);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void ushersTestStopOnHitSkipOne() {
|
|
||||||
|
|
||||||
PayloadTrie<Integer> trie = PayloadTrie.<Integer>builder().addKeywords(PRONOUNS_WITH_PAYLOADS).stopOnHit().build();
|
|
||||||
|
|
||||||
StatefulPayloadEmitHandler<Integer> testEmitHandler = new AbstractStatefulPayloadEmitHandler<Integer>() {
|
|
||||||
boolean first = true;
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean emit(final PayloadEmit<Integer> emit) {
|
|
||||||
|
|
||||||
if (first) {
|
|
||||||
// return false for the first element
|
|
||||||
first = false;
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
addEmit(emit);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
};
|
|
||||||
|
|
||||||
trie.parseText("ushers", testEmitHandler);
|
|
||||||
Collection<PayloadEmit<Integer>> emits = testEmitHandler.getEmits();
|
|
||||||
assertEquals(1, emits.size()); // she @ 3, he @ 3, hers @ 5
|
|
||||||
Iterator<PayloadEmit<Integer>> iterator = emits.iterator();
|
|
||||||
checkEmit(iterator.next(), 1, 3, "she", 4);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void ushersTest() {
|
|
||||||
|
|
||||||
PayloadTrie<Integer> trie = PayloadTrie.<Integer>builder().addKeywords(PRONOUNS_WITH_PAYLOADS).build();
|
|
||||||
Collection<PayloadEmit<Integer>> emits = trie.parseText("ushers");
|
|
||||||
assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5
|
|
||||||
Iterator<PayloadEmit<Integer>> iterator = emits.iterator();
|
|
||||||
|
|
||||||
checkEmit(iterator.next(), 2, 3, "he", 20);
|
|
||||||
checkEmit(iterator.next(), 1, 3, "she", 4);
|
|
||||||
checkEmit(iterator.next(), 2, 5, "hers", 9);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void ushersTestWithCapitalKeywords() {
|
|
||||||
|
|
||||||
PayloadTrie<String> trie = PayloadTrie.<String>builder()
|
|
||||||
.ignoreCase()
|
|
||||||
.addKeyword("HERS", "hers")
|
|
||||||
.addKeyword("HIS", "his")
|
|
||||||
.addKeyword("SHE", "she")
|
|
||||||
.addKeyword("HE", "he")
|
|
||||||
.build();
|
|
||||||
Collection<PayloadEmit<String>> emits = trie.parseText("ushers");
|
|
||||||
assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5
|
|
||||||
Iterator<PayloadEmit<String>> iterator = emits.iterator();
|
|
||||||
checkEmit(iterator.next(), 2, 3, "HE", "he");
|
|
||||||
checkEmit(iterator.next(), 1, 3, "SHE", "she");
|
|
||||||
checkEmit(iterator.next(), 2, 5, "HERS", "hers");
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void ushersTestFirstMatch() {
|
|
||||||
|
|
||||||
PayloadTrie<Integer> trie = PayloadTrie.<Integer>builder().addKeywords(PRONOUNS_WITH_PAYLOADS).build();
|
|
||||||
PayloadEmit<Integer> firstMatch = trie.firstMatch("ushers");
|
|
||||||
checkEmit(firstMatch, 2, 3, "he", 20);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void ushersTestByCallback() {
|
|
||||||
|
|
||||||
PayloadTrie<Integer> trie = PayloadTrie.<Integer>builder().addKeywords(PRONOUNS_WITH_PAYLOADS).build();
|
|
||||||
|
|
||||||
final List<PayloadEmit<Integer>> emits = new LinkedList<>();
|
|
||||||
PayloadEmitHandler<Integer> emitHandler = emit -> {
|
|
||||||
emits.add(emit);
|
|
||||||
return true;
|
|
||||||
};
|
|
||||||
trie.parseText("ushers", emitHandler);
|
|
||||||
assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5
|
|
||||||
Iterator<PayloadEmit<Integer>> iterator = emits.iterator();
|
|
||||||
|
|
||||||
checkEmit(iterator.next(), 2, 3, "he", 20);
|
|
||||||
checkEmit(iterator.next(), 1, 3, "she", 4);
|
|
||||||
checkEmit(iterator.next(), 2, 5, "hers", 9);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void misleadingTest() {
|
|
||||||
|
|
||||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword("hers", "pronon:hers").build();
|
|
||||||
Collection<PayloadEmit<String>> emits = trie.parseText("h he her hers");
|
|
||||||
Iterator<PayloadEmit<String>> iterator = emits.iterator();
|
|
||||||
checkEmit(iterator.next(), 9, 12, "hers", "pronon:hers");
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void misleadingTestFirstMatch() {
|
|
||||||
|
|
||||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword("hers", "pronon:hers").build();
|
|
||||||
PayloadEmit<String> firstMatch = trie.firstMatch("h he her hers");
|
|
||||||
checkEmit(firstMatch, 9, 12, "hers", "pronon:hers");
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void recipes() {
|
|
||||||
|
|
||||||
PayloadTrie<Food> trie = PayloadTrie.<Food>builder().addKeywords(FOOD_WITH_PAYLOADS).build();
|
|
||||||
Collection<PayloadEmit<Food>> emits = trie.parseText("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli");
|
|
||||||
Iterator<PayloadEmit<Food>> iterator = emits.iterator();
|
|
||||||
checkEmit(iterator.next(), 2, 12, "cauliflower", new Food("cauliflower"));
|
|
||||||
checkEmit(iterator.next(), 18, 25, "tomatoes", new Food("tomatoes"));
|
|
||||||
checkEmit(iterator.next(), 40, 43, "veal", new Food("veal"));
|
|
||||||
checkEmit(iterator.next(), 51, 58, "broccoli", new Food("broccoli"));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void recipesFirstMatch() {
|
|
||||||
|
|
||||||
PayloadTrie<Food> trie = PayloadTrie.<Food>builder().addKeywords(FOOD_WITH_PAYLOADS).build();
|
|
||||||
PayloadEmit<Food> firstMatch = trie.firstMatch("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli");
|
|
||||||
checkEmit(firstMatch, 2, 12, "cauliflower", new Food("cauliflower"));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void longAndShortOverlappingMatch() {
|
|
||||||
|
|
||||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword("he", "pronon:he").addKeyword("hehehehe", "garbage").build();
|
|
||||||
Collection<PayloadEmit<String>> emits = trie.parseText("hehehehehe");
|
|
||||||
Iterator<PayloadEmit<String>> iterator = emits.iterator();
|
|
||||||
checkEmit(iterator.next(), 0, 1, "he", "pronon:he");
|
|
||||||
checkEmit(iterator.next(), 2, 3, "he", "pronon:he");
|
|
||||||
checkEmit(iterator.next(), 4, 5, "he", "pronon:he");
|
|
||||||
checkEmit(iterator.next(), 6, 7, "he", "pronon:he");
|
|
||||||
checkEmit(iterator.next(), 0, 7, "hehehehe", "garbage");
|
|
||||||
checkEmit(iterator.next(), 8, 9, "he", "pronon:he");
|
|
||||||
checkEmit(iterator.next(), 2, 9, "hehehehe", "garbage");
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void nonOverlapping() {
|
|
||||||
|
|
||||||
PayloadTrie<String> trie = PayloadTrie.<String>builder()
|
|
||||||
.ignoreOverlaps()
|
|
||||||
.addKeyword("ab", "alpha:ab")
|
|
||||||
.addKeyword("cba", "alpha:cba")
|
|
||||||
.addKeyword("ababc", "alpha:ababc")
|
|
||||||
.build();
|
|
||||||
Collection<PayloadEmit<String>> emits = trie.parseText("ababcbab");
|
|
||||||
assertEquals(2, emits.size());
|
|
||||||
Iterator<PayloadEmit<String>> iterator = emits.iterator();
|
|
||||||
// With overlaps: ab@1, ab@3, ababc@4, cba@6, ab@7
|
|
||||||
checkEmit(iterator.next(), 0, 4, "ababc", "alpha:ababc");
|
|
||||||
checkEmit(iterator.next(), 6, 7, "ab", "alpha:ab");
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void nonOverlappingFirstMatch() {
|
|
||||||
|
|
||||||
PayloadTrie<String> trie = PayloadTrie.<String>builder()
|
|
||||||
.ignoreOverlaps()
|
|
||||||
.addKeyword("ab", "alpha:ab")
|
|
||||||
.addKeyword("cba", "alpha:cba")
|
|
||||||
.addKeyword("ababc", "alpha:ababc")
|
|
||||||
.build();
|
|
||||||
PayloadEmit<String> firstMatch = trie.firstMatch("ababcbab");
|
|
||||||
|
|
||||||
checkEmit(firstMatch, 0, 4, "ababc", "alpha:ababc");
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void containsMatch() {
|
|
||||||
|
|
||||||
PayloadTrie<String> trie = PayloadTrie.<String>builder()
|
|
||||||
.ignoreOverlaps()
|
|
||||||
.addKeyword("ab", "alpha:ab")
|
|
||||||
.addKeyword("cba", "alpha:cba")
|
|
||||||
.addKeyword("ababc", "alpha:ababc")
|
|
||||||
.build();
|
|
||||||
assertTrue(trie.containsMatch("ababcbab"));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void startOfChurchillSpeech() {
|
|
||||||
|
|
||||||
PayloadTrie<String> trie = PayloadTrie.<String>builder()
|
|
||||||
.ignoreOverlaps()
|
|
||||||
.addKeyword("T")
|
|
||||||
.addKeyword("u")
|
|
||||||
.addKeyword("ur")
|
|
||||||
.addKeyword("r")
|
|
||||||
.addKeyword("urn")
|
|
||||||
.addKeyword("ni")
|
|
||||||
.addKeyword("i")
|
|
||||||
.addKeyword("in")
|
|
||||||
.addKeyword("n")
|
|
||||||
.addKeyword("urning")
|
|
||||||
.build();
|
|
||||||
Collection<PayloadEmit<String>> emits = trie.parseText("Turning");
|
|
||||||
assertEquals(2, emits.size());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void partialMatch() {
|
|
||||||
|
|
||||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().onlyWholeWords().addKeyword("sugar", "food:sugar").build();
|
|
||||||
Collection<PayloadEmit<String>> emits = trie.parseText("sugarcane sugarcane sugar canesugar"); // left, middle, right test
|
|
||||||
assertEquals(1, emits.size()); // Match must not be made
|
|
||||||
checkEmit(emits.iterator().next(), 20, 24, "sugar", "food:sugar");
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void partialMatchFirstMatch() {
|
|
||||||
|
|
||||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().onlyWholeWords().addKeyword("sugar", "food:sugar").build();
|
|
||||||
PayloadEmit<String> firstMatch = trie.firstMatch("sugarcane sugarcane sugar canesugar"); // left, middle, right test
|
|
||||||
|
|
||||||
checkEmit(firstMatch, 20, 24, "sugar", "food:sugar");
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void tokenizeFullSentence() {
|
|
||||||
|
|
||||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeywords(GREEK_LETTERS_WITH_PAYLOADS).build();
|
|
||||||
Collection<PayloadToken<String>> tokens = trie.tokenize("Hear: Alpha team first, Beta from the rear, Gamma in reserve");
|
|
||||||
assertEquals(7, tokens.size());
|
|
||||||
Iterator<PayloadToken<String>> tokensIt = tokens.iterator();
|
|
||||||
assertEquals("Hear: ", tokensIt.next().getFragment());
|
|
||||||
assertEquals("Alpha", tokensIt.next().getFragment());
|
|
||||||
assertEquals(" team first, ", tokensIt.next().getFragment());
|
|
||||||
assertEquals("Beta", tokensIt.next().getFragment());
|
|
||||||
assertEquals(" from the rear, ", tokensIt.next().getFragment());
|
|
||||||
assertEquals("Gamma", tokensIt.next().getFragment());
|
|
||||||
assertEquals(" in reserve", tokensIt.next().getFragment());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// @see https://github.com/robert-bor/aho-corasick/issues/5
|
|
||||||
@Test
|
|
||||||
public void testStringIndexOutOfBoundsException() {
|
|
||||||
|
|
||||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreCase().onlyWholeWords().addKeywords(UNICODE_WITH_PAYLOADS).build();
|
|
||||||
Collection<PayloadEmit<String>> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ");
|
|
||||||
assertEquals(4, emits.size()); // Match must not be made
|
|
||||||
Iterator<PayloadEmit<String>> it = emits.iterator();
|
|
||||||
|
|
||||||
checkEmit(it.next(), 0, 6, "turning", "uni:turning");
|
|
||||||
checkEmit(it.next(), 8, 11, "once", "uni:once");
|
|
||||||
checkEmit(it.next(), 13, 17, "again", "uni:again");
|
|
||||||
checkEmit(it.next(), 19, 23, "börkü", "uni:börkü");
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testIgnoreCase() {
|
|
||||||
|
|
||||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreCase().addKeywords(UNICODE_WITH_PAYLOADS).build();
|
|
||||||
Collection<PayloadEmit<String>> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ");
|
|
||||||
assertEquals(4, emits.size()); // Match must not be made
|
|
||||||
Iterator<PayloadEmit<String>> it = emits.iterator();
|
|
||||||
|
|
||||||
checkEmit(it.next(), 0, 6, "turning", "uni:turning");
|
|
||||||
checkEmit(it.next(), 8, 11, "once", "uni:once");
|
|
||||||
checkEmit(it.next(), 13, 17, "again", "uni:again");
|
|
||||||
checkEmit(it.next(), 19, 23, "börkü", "uni:börkü");
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testIgnoreCaseFirstMatch() {
|
|
||||||
|
|
||||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreCase().addKeywords(UNICODE_WITH_PAYLOADS).build();
|
|
||||||
PayloadEmit<String> firstMatch = trie.firstMatch("TurninG OnCe AgAiN BÖRKÜ");
|
|
||||||
|
|
||||||
checkEmit(firstMatch, 0, 6, "turning", "uni:turning");
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void tokenizeTokensInSequence() {
|
|
||||||
|
|
||||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeywords(GREEK_LETTERS_WITH_PAYLOADS).build();
|
|
||||||
Collection<PayloadToken<String>> tokens = trie.tokenize("Alpha Beta Gamma");
|
|
||||||
assertEquals(5, tokens.size());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// @see https://github.com/robert-bor/aho-corasick/issues/7
|
|
||||||
@Test
|
|
||||||
public void testZeroLength() {
|
|
||||||
|
|
||||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreOverlaps().onlyWholeWords().ignoreCase().addKeyword("").build();
|
|
||||||
trie.tokenize(
|
|
||||||
"Try a natural lip and subtle bronzer to keep all the focus on those big bright eyes with NARS Eyeshadow Duo in Rated R And the winner is... Boots No7 Advanced Renewal Anti-ageing Glycolic Peel Kit ($25 amazon.com) won most-appealing peel.");
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// @see https://github.com/robert-bor/aho-corasick/issues/8
|
|
||||||
@Test
|
|
||||||
public void testUnicode1() {
|
|
||||||
|
|
||||||
String target = "LİKE THIS"; // The second character ('İ') is Unicode, which was read by AC as a 2-byte char
|
|
||||||
assertEquals("THIS", target.substring(5, 9)); // Java does it the right way
|
|
||||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreCase().onlyWholeWords().addKeyword("this", "pronon:this").build();
|
|
||||||
Collection<PayloadEmit<String>> emits = trie.parseText(target);
|
|
||||||
assertEquals(1, emits.size());
|
|
||||||
Iterator<PayloadEmit<String>> it = emits.iterator();
|
|
||||||
checkEmit(it.next(), 5, 8, "this", "pronon:this");
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// @see https://github.com/robert-bor/aho-corasick/issues/8
|
|
||||||
@Test
|
|
||||||
public void testUnicode2() {
|
|
||||||
|
|
||||||
String target = "LİKE THIS"; // The second character ('İ') is Unicode, which was read by AC as a 2-byte char
|
|
||||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreCase().onlyWholeWords().addKeyword("this", "pronon:this").build();
|
|
||||||
assertEquals("THIS", target.substring(5, 9)); // Java does it the right way
|
|
||||||
PayloadEmit<String> firstMatch = trie.firstMatch(target);
|
|
||||||
checkEmit(firstMatch, 5, 8, "this", "pronon:this");
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testPartialMatchWhiteSpaces() {
|
|
||||||
|
|
||||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().onlyWholeWordsWhiteSpaceSeparated().addKeyword("#sugar-123", "sugar").build();
|
|
||||||
Collection<PayloadEmit<String>> emits = trie.parseText("#sugar-123 #sugar-1234"); // left, middle, right test
|
|
||||||
assertEquals(1, emits.size()); // Match must not be made
|
|
||||||
checkEmit(emits.iterator().next(), 0, 9, "#sugar-123", "sugar");
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testLargeString() {
|
|
||||||
|
|
||||||
final int interval = 100;
|
|
||||||
final int textSize = 1000000;
|
|
||||||
final String keyword = FOOD[1];
|
|
||||||
final Food payload = FOOD_PAYLOAD[1];
|
|
||||||
final StringBuilder text = randomNumbers(textSize);
|
|
||||||
|
|
||||||
injectKeyword(text, keyword, interval);
|
|
||||||
|
|
||||||
PayloadTrie<Food> trie = PayloadTrie.<Food>builder().onlyWholeWords().addKeyword(keyword, payload).build();
|
|
||||||
|
|
||||||
final Collection<PayloadEmit<Food>> emits = trie.parseText(text);
|
|
||||||
|
|
||||||
assertEquals(textSize / interval, emits.size());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void test_containsMatchWithCaseInsensitive() {
|
|
||||||
|
|
||||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().ignoreCase().addKeyword("foo", "bar").build();
|
|
||||||
|
|
||||||
assertTrue(trie.containsMatch("FOOBAR"));
|
|
||||||
assertFalse(trie.containsMatch("FO!?AR"));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// @see https://github.com/robert-bor/aho-corasick/issues/85
|
|
||||||
@Test
|
|
||||||
public void test_wholeWords() {
|
|
||||||
|
|
||||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword("foo", "bar").onlyWholeWords().build();
|
|
||||||
// access via PayloadTrie.parseText(CharSequence)
|
|
||||||
Collection<PayloadEmit<String>> result1 = trie.parseText("foobar");
|
|
||||||
// access via PayloadTrie.parseText(CharSequence, PayloadEmitHandler<String>)
|
|
||||||
Collection<PayloadEmit<String>> result2 = new LinkedList<>();
|
|
||||||
trie.parseText("foobar", result2::add);
|
|
||||||
|
|
||||||
assertTrue(result1.isEmpty());
|
|
||||||
assertEquals(result1, result2);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// @see https://github.com/robert-bor/aho-corasick/issues/85
|
|
||||||
@Test
|
|
||||||
public void test_wholeWordsWhiteSpaceSeparated() {
|
|
||||||
|
|
||||||
PayloadTrie<String> trie = PayloadTrie.<String>builder().addKeyword("foo", "bar").onlyWholeWordsWhiteSpaceSeparated().build();
|
|
||||||
// access via PayloadTrie.parseText(CharSequence)
|
|
||||||
Collection<PayloadEmit<String>> result1 = trie.parseText("foo#bar");
|
|
||||||
// access via PayloadTrie.parseText(CharSequence, PayloadEmitHandler<String>)
|
|
||||||
Collection<PayloadEmit<String>> result2 = new LinkedList<>();
|
|
||||||
trie.parseText("foo#bar", result2::add);
|
|
||||||
|
|
||||||
assertTrue(result1.isEmpty());
|
|
||||||
assertEquals(result1, result2);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private void checkEmit(final PayloadEmit<Food> next, final int expectedStart, final int expectedEnd, final String expectedKeyword, final Food expectedPayload) {
|
|
||||||
|
|
||||||
assertEquals("Start of emit should have been " + expectedStart, expectedStart, next.getStart());
|
|
||||||
assertEquals("End of emit should have been " + expectedEnd, expectedEnd, next.getEnd());
|
|
||||||
assertEquals("Keyword of emit shoud be " + expectedKeyword, expectedKeyword, next.getKeyword());
|
|
||||||
assertEquals("Payload of emit shoud be " + expectedPayload, expectedPayload, next.getPayload());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private void checkEmit(final PayloadEmit<Integer> next, final int expectedStart, final int expectedEnd, final String expectedKeyword, final Integer expectedPayload) {
|
|
||||||
|
|
||||||
assertEquals("Start of emit should have been " + expectedStart, expectedStart, next.getStart());
|
|
||||||
assertEquals("End of emit should have been " + expectedEnd, expectedEnd, next.getEnd());
|
|
||||||
assertEquals("Keyword of emit shoud be " + expectedKeyword, expectedKeyword, next.getKeyword());
|
|
||||||
assertEquals("Payload of emit shoud be " + expectedPayload, expectedPayload, next.getPayload());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private void checkEmit(final PayloadEmit<String> next, final int expectedStart, final int expectedEnd, final String expectedKeyword, final String expectedPayload) {
|
|
||||||
|
|
||||||
assertEquals("Start of emit should have been " + expectedStart, expectedStart, next.getStart());
|
|
||||||
assertEquals("End of emit should have been " + expectedEnd, expectedEnd, next.getEnd());
|
|
||||||
assertEquals("Keyword of emit shoud be " + expectedKeyword, expectedKeyword, next.getKeyword());
|
|
||||||
assertEquals("Payload of emit shoud be " + expectedPayload, expectedPayload, next.getPayload());
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,76 +1,25 @@
|
|||||||
package org.ahocorasick.trie;
|
package org.ahocorasick.trie;
|
||||||
|
|
||||||
|
import org.ahocorasick.trie.State;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
import java.util.Collection;
|
import static junit.framework.Assert.assertEquals;
|
||||||
import java.util.Collections;
|
|
||||||
|
|
||||||
import static org.junit.Assert.*;
|
|
||||||
|
|
||||||
public class StateTest {
|
public class StateTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test_constructSequenceOfCharacters() {
|
public void constructSequenceOfCharacters() {
|
||||||
|
State rootState = new State();
|
||||||
final State rootState = new State();
|
rootState
|
||||||
rootState.addState('a').addState('b').addState('c');
|
.addState('a')
|
||||||
|
.addState('b')
|
||||||
|
.addState('c');
|
||||||
State currentState = rootState.nextState('a');
|
State currentState = rootState.nextState('a');
|
||||||
assertEquals(1, currentState.getDepth());
|
assertEquals(1, currentState.getDepth());
|
||||||
currentState = currentState.nextState('b');
|
currentState = currentState.nextState('b');
|
||||||
assertEquals(2, currentState.getDepth());
|
assertEquals(2, currentState.getDepth());
|
||||||
currentState = currentState.nextState('c');
|
currentState = currentState.nextState('c');
|
||||||
assertEquals(3, currentState.getDepth());
|
assertEquals(3, currentState.getDepth());
|
||||||
currentState = currentState.nextState('F');
|
|
||||||
assertNull(currentState);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void test_getStates() {
|
|
||||||
|
|
||||||
final State rootState = new State();
|
|
||||||
rootState.addState("foo");
|
|
||||||
final State currentState = rootState.nextState('f');
|
|
||||||
final Collection<State> states = rootState.getStates();
|
|
||||||
|
|
||||||
assertEquals(1, states.size());
|
|
||||||
assertEquals(currentState, states.iterator().next());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void test_getTransitions() {
|
|
||||||
|
|
||||||
final State rootState = new State();
|
|
||||||
rootState.addState("foo");
|
|
||||||
final State currentState = rootState.nextState('f');
|
|
||||||
final Collection<Character> transitions = rootState.getTransitions();
|
|
||||||
|
|
||||||
assertEquals(1, transitions.size());
|
|
||||||
assertEquals(Character.valueOf('f'), transitions.iterator().next());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void test_getFailure() {
|
|
||||||
|
|
||||||
final State failureState = new State();
|
|
||||||
final State rootState = new State();
|
|
||||||
rootState.setFailure(failureState);
|
|
||||||
|
|
||||||
assertEquals(failureState, rootState.getFailure());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void test_checkEmits() {
|
|
||||||
|
|
||||||
final State rootState = new State();
|
|
||||||
rootState.addState('a').addEmit(Collections.singleton("tag"));
|
|
||||||
final Collection<String> actual = rootState.nextState('a').emit();
|
|
||||||
|
|
||||||
assertEquals(1, actual.size());
|
|
||||||
assertEquals("tag", actual.iterator().next());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,47 +0,0 @@
|
|||||||
package org.ahocorasick.trie;
|
|
||||||
|
|
||||||
import static java.util.concurrent.ThreadLocalRandom.current;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Contains functionality common to tests.
|
|
||||||
*/
|
|
||||||
public class TestHelper {
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Injects keywords into a string builder.
|
|
||||||
*
|
|
||||||
* @param source Should contain a bunch of random data that cannot match
|
|
||||||
* any keyword.
|
|
||||||
* @param keyword A keyword to inject repeatedly in the text.
|
|
||||||
* @param interval How often to inject the keyword.
|
|
||||||
*/
|
|
||||||
@SuppressWarnings("SameParameterValue")
|
|
||||||
static void injectKeyword(final StringBuilder source, final String keyword, final int interval) {
|
|
||||||
|
|
||||||
final int length = source.length();
|
|
||||||
for (int i = 0; i < length; i += interval) {
|
|
||||||
source.replace(i, i + keyword.length(), keyword);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Generates a random sequence of ASCII numbers.
|
|
||||||
*
|
|
||||||
* @param count The number of numbers to generate.
|
|
||||||
* @return A character sequence filled with random digits.
|
|
||||||
*/
|
|
||||||
@SuppressWarnings("SameParameterValue")
|
|
||||||
public static StringBuilder randomNumbers(int count) {
|
|
||||||
|
|
||||||
int localCount = count;
|
|
||||||
final StringBuilder sb = new StringBuilder(localCount);
|
|
||||||
|
|
||||||
while (--localCount > 0) {
|
|
||||||
sb.append(current().nextInt(0, 10));
|
|
||||||
}
|
|
||||||
|
|
||||||
return sb;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -1,282 +1,221 @@
|
|||||||
package org.ahocorasick.trie;
|
package org.ahocorasick.trie;
|
||||||
|
|
||||||
import org.ahocorasick.trie.handler.AbstractStatefulEmitHandler;
|
|
||||||
import org.ahocorasick.trie.handler.EmitHandler;
|
import org.ahocorasick.trie.handler.EmitHandler;
|
||||||
import org.ahocorasick.trie.handler.StatefulEmitHandler;
|
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
|
||||||
|
|
||||||
import static java.lang.String.format;
|
import static junit.framework.Assert.assertEquals;
|
||||||
import static org.ahocorasick.trie.TestHelper.injectKeyword;
|
import static org.junit.Assert.assertTrue;
|
||||||
import static org.ahocorasick.trie.TestHelper.randomNumbers;
|
|
||||||
import static org.ahocorasick.trie.Trie.builder;
|
|
||||||
import static org.junit.Assert.*;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Test the {@link Trie} class functionality.
|
|
||||||
*/
|
|
||||||
public class TrieTest {
|
public class TrieTest {
|
||||||
|
|
||||||
private final static String[] ALPHABET = new String[]{"abc", "bcd", "cde"};
|
@Test
|
||||||
|
public void keywordAndTextAreTheSame() {
|
||||||
private final static String[] PRONOUNS = new String[]{"hers", "his", "she", "he"};
|
Trie trie = Trie.builder()
|
||||||
|
.addKeyword("abc")
|
||||||
private final static String[] FOOD = new String[]{"veal", "cauliflower", "broccoli", "tomatoes"};
|
.build();
|
||||||
|
Collection<Emit> emits = trie.parseText("abc");
|
||||||
private final static String[] GREEK_LETTERS = new String[]{"Alpha", "Beta", "Gamma"};
|
Iterator<Emit> iterator = emits.iterator();
|
||||||
|
checkEmit(iterator.next(), 0, 2, "abc");
|
||||||
private final static String[] UNICODE = new String[]{"turning", "once", "again", "börkü"};
|
|
||||||
|
|
||||||
|
|
||||||
private static Trie trie(final String keyword) {
|
|
||||||
|
|
||||||
return Trie.builder().addKeyword(keyword).build();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static Trie trieIgnoreWhiteSpace(final String keyword) {
|
|
||||||
|
|
||||||
return Trie.builder().addKeyword(keyword).ignoreWhiteSpace().build();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private static Trie trie(final String[] keywords) {
|
|
||||||
|
|
||||||
return Trie.builder().addKeywords(keywords).ignoreWhiteSpace().build();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private static Trie trieIgnoreWhiteSpace(final String[] keywords) {
|
|
||||||
|
|
||||||
return Trie.builder().addKeywords(keywords).ignoreWhiteSpace().build();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test_KeywordAndTextAreTheSame() {
|
public void keywordAndTextAreTheSameFirstMatch() {
|
||||||
|
Trie trie = Trie.builder()
|
||||||
final Trie trie = trie(ALPHABET[0]);
|
.addKeyword("abc")
|
||||||
final Collection<Emit> emits = trie.parseText(ALPHABET[0]);
|
.build();
|
||||||
final Iterator<Emit> iterator = emits.iterator();
|
Emit firstMatch = trie.firstMatch("abc");
|
||||||
checkEmit(iterator.next(), 0, 2, ALPHABET[0]);
|
checkEmit(firstMatch, 0, 2, "abc");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test_ignoringWhitespace_KeywordAndTextAreTheSame() {
|
public void textIsLongerThanKeyword() {
|
||||||
|
Trie trie = Trie.builder()
|
||||||
final Trie trie = trieIgnoreWhiteSpace(ALPHABET);
|
.addKeyword("abc")
|
||||||
final Collection<Emit> emits = trie.parseText("a b c d e");
|
.build();
|
||||||
final Iterator<Emit> iterator = emits.iterator();
|
Collection<Emit> emits = trie.parseText(" abc");
|
||||||
checkEmit(iterator.next(), 0, 4, ALPHABET[0]);
|
Iterator<Emit> iterator = emits.iterator();
|
||||||
checkEmit(iterator.next(), 2, 6, ALPHABET[1]);
|
checkEmit(iterator.next(), 1, 3, "abc");
|
||||||
checkEmit(iterator.next(), 4, 8, ALPHABET[2]);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test_KeywordAndTextAreTheSameFirstMatch() {
|
public void textIsLongerThanKeywordFirstMatch() {
|
||||||
|
Trie trie = Trie.builder()
|
||||||
final Trie trie = trie(ALPHABET[0]);
|
.addKeyword("abc")
|
||||||
final Emit firstMatch = trie.firstMatch(ALPHABET[0]);
|
.build();
|
||||||
checkEmit(firstMatch, 0, 2, ALPHABET[0]);
|
Emit firstMatch = trie.firstMatch(" abc");
|
||||||
|
checkEmit(firstMatch, 1, 3, "abc");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test_TextIsLongerThanKeyword() {
|
public void variousKeywordsOneMatch() {
|
||||||
|
Trie trie = Trie.builder()
|
||||||
final Trie trie = trie(ALPHABET[0]);
|
.addKeyword("abc")
|
||||||
final Collection<Emit> emits = trie.parseText(" " + ALPHABET[0]);
|
.addKeyword("bcd")
|
||||||
final Iterator<Emit> iterator = emits.iterator();
|
.addKeyword("cde")
|
||||||
checkEmit(iterator.next(), 1, 3, ALPHABET[0]);
|
.build();
|
||||||
}
|
Collection<Emit> emits = trie.parseText("bcd");
|
||||||
|
Iterator<Emit> iterator = emits.iterator();
|
||||||
|
|
||||||
@Test
|
|
||||||
public void test_TextIsLongerThanKeywordFirstMatch() {
|
|
||||||
|
|
||||||
final Trie trie = trie(ALPHABET[0]);
|
|
||||||
final Emit firstMatch = trie.firstMatch(" " + ALPHABET[0]);
|
|
||||||
checkEmit(firstMatch, 1, 3, ALPHABET[0]);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void test_VariousKeywordsOneMatch() {
|
|
||||||
|
|
||||||
final Trie trie = trie(ALPHABET);
|
|
||||||
final Collection<Emit> emits = trie.parseText("bcd");
|
|
||||||
final Iterator<Emit> iterator = emits.iterator();
|
|
||||||
checkEmit(iterator.next(), 0, 2, "bcd");
|
checkEmit(iterator.next(), 0, 2, "bcd");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test_VariousKeywordsFirstMatch() {
|
public void variousKeywordsFirstMatch() {
|
||||||
|
Trie trie = Trie.builder()
|
||||||
final Trie trie = trie(ALPHABET);
|
.addKeyword("abc")
|
||||||
final Emit firstMatch = trie.firstMatch("bc d");
|
.addKeyword("bcd")
|
||||||
checkEmit(firstMatch, 0, 3, "bcd");
|
.addKeyword("cde")
|
||||||
|
.build();
|
||||||
|
Emit firstMatch = trie.firstMatch("bcd");
|
||||||
|
checkEmit(firstMatch, 0, 2, "bcd");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test(expected = AssertionError.class)
|
|
||||||
public void test_NullInputTextFirstMatch() {
|
|
||||||
|
|
||||||
final Trie trie = trie(ALPHABET);
|
|
||||||
final Emit firstMatch = trie.firstMatch(null);
|
|
||||||
assertNull(firstMatch);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test_UshersTestAndStopOnHit() {
|
public void ushersTestAndStopOnHit() {
|
||||||
|
Trie trie = Trie.builder()
|
||||||
final Trie trie = Trie.builder().addKeywords(PRONOUNS).stopOnHit().build();
|
.addKeyword("hers")
|
||||||
final Collection<Emit> emits = trie.parseText("ushers");
|
.addKeyword("his")
|
||||||
assertEquals(1, emits.size()); // she @ 3, he @ 3, hers @ 5
|
.addKeyword("she")
|
||||||
final Iterator<Emit> iterator = emits.iterator();
|
.addKeyword("he")
|
||||||
|
.stopOnHit()
|
||||||
|
.build();
|
||||||
|
Collection<Emit> emits = trie.parseText("ushers");
|
||||||
|
assertEquals(2, emits.size()); // she @ 3, he @ 3, hers @ 5
|
||||||
|
Iterator<Emit> iterator = emits.iterator();
|
||||||
checkEmit(iterator.next(), 2, 3, "he");
|
checkEmit(iterator.next(), 2, 3, "he");
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void test_UshersTestStopOnHitSkipOne() {
|
|
||||||
|
|
||||||
final Trie trie = Trie.builder().addKeywords(PRONOUNS).stopOnHit().build();
|
|
||||||
|
|
||||||
final StatefulEmitHandler testEmitHandler = new AbstractStatefulEmitHandler() {
|
|
||||||
boolean first = true;
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean emit(final Emit emit) {
|
|
||||||
|
|
||||||
if (first) {
|
|
||||||
// return false for the first element
|
|
||||||
first = false;
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
addEmit(emit);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
trie.parseText("ushers", testEmitHandler);
|
|
||||||
final Collection<Emit> emits = testEmitHandler.getEmits();
|
|
||||||
assertEquals(1, emits.size()); // she @ 3, he @ 3, hers @ 5
|
|
||||||
final Iterator<Emit> iterator = emits.iterator();
|
|
||||||
checkEmit(iterator.next(), 1, 3, "she");
|
checkEmit(iterator.next(), 1, 3, "she");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test_UshersTest() {
|
public void ushersTest() {
|
||||||
|
Trie trie = Trie.builder()
|
||||||
final Trie trie = trie(PRONOUNS);
|
.addKeyword("hers")
|
||||||
final Collection<Emit> emits = trie.parseText("ushers");
|
.addKeyword("his")
|
||||||
|
.addKeyword("she")
|
||||||
|
.addKeyword("he")
|
||||||
|
.build();
|
||||||
|
Collection<Emit> emits = trie.parseText("ushers");
|
||||||
assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5
|
assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5
|
||||||
final Iterator<Emit> iterator = emits.iterator();
|
Iterator<Emit> iterator = emits.iterator();
|
||||||
checkEmit(iterator.next(), 2, 3, "he");
|
checkEmit(iterator.next(), 2, 3, "he");
|
||||||
checkEmit(iterator.next(), 1, 3, "she");
|
checkEmit(iterator.next(), 1, 3, "she");
|
||||||
checkEmit(iterator.next(), 2, 5, "hers");
|
checkEmit(iterator.next(), 2, 5, "hers");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test_UshersTestWithCapitalKeywords() {
|
public void ushersTestWithCapitalKeywords() {
|
||||||
|
Trie trie = Trie.builder()
|
||||||
final Trie trie = Trie.builder().ignoreCase().addKeyword("HERS").addKeyword("HIS").addKeyword("SHE").addKeyword("HE").build();
|
.caseInsensitive()
|
||||||
final Collection<Emit> emits = trie.parseText("ushers");
|
.addKeyword("HERS")
|
||||||
|
.addKeyword("HIS")
|
||||||
|
.addKeyword("SHE")
|
||||||
|
.addKeyword("HE")
|
||||||
|
.build();
|
||||||
|
Collection<Emit> emits = trie.parseText("ushers");
|
||||||
assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5
|
assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5
|
||||||
final Iterator<Emit> iterator = emits.iterator();
|
Iterator<Emit> iterator = emits.iterator();
|
||||||
checkEmit(iterator.next(), 2, 3, "HE");
|
checkEmit(iterator.next(), 2, 3, "he");
|
||||||
checkEmit(iterator.next(), 1, 3, "SHE");
|
checkEmit(iterator.next(), 1, 3, "she");
|
||||||
checkEmit(iterator.next(), 2, 5, "HERS");
|
checkEmit(iterator.next(), 2, 5, "hers");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test_UshersTestFirstMatch() {
|
public void ushersTestFirstMatch() {
|
||||||
|
Trie trie = Trie.builder()
|
||||||
final Trie trie = trie(PRONOUNS);
|
.addKeyword("hers")
|
||||||
final Emit firstMatch = trie.firstMatch("ushers");
|
.addKeyword("his")
|
||||||
|
.addKeyword("she")
|
||||||
|
.addKeyword("he")
|
||||||
|
.build();
|
||||||
|
Emit firstMatch = trie.firstMatch("ushers");
|
||||||
checkEmit(firstMatch, 2, 3, "he");
|
checkEmit(firstMatch, 2, 3, "he");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test_UshersTestByCallback() {
|
public void ushersTestByCallback() {
|
||||||
|
Trie trie = Trie.builder()
|
||||||
|
.addKeyword("hers")
|
||||||
|
.addKeyword("his")
|
||||||
|
.addKeyword("she")
|
||||||
|
.addKeyword("he")
|
||||||
|
.build();
|
||||||
|
|
||||||
final Trie trie = trie(PRONOUNS);
|
|
||||||
final List<Emit> emits = new ArrayList<>();
|
final List<Emit> emits = new ArrayList<>();
|
||||||
final EmitHandler emitHandler = emit -> {
|
EmitHandler emitHandler = new EmitHandler() {
|
||||||
emits.add(emit);
|
|
||||||
return true;
|
@Override
|
||||||
|
public void emit(Emit emit) {
|
||||||
|
emits.add(emit);
|
||||||
|
}
|
||||||
};
|
};
|
||||||
trie.parseText("ushers", emitHandler);
|
trie.parseText("ushers", emitHandler);
|
||||||
assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5
|
assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5
|
||||||
final Iterator<Emit> iterator = emits.iterator();
|
Iterator<Emit> iterator = emits.iterator();
|
||||||
checkEmit(iterator.next(), 2, 3, "he");
|
checkEmit(iterator.next(), 2, 3, "he");
|
||||||
checkEmit(iterator.next(), 1, 3, "she");
|
checkEmit(iterator.next(), 1, 3, "she");
|
||||||
checkEmit(iterator.next(), 2, 5, "hers");
|
checkEmit(iterator.next(), 2, 5, "hers");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test_MisleadingTest() {
|
public void misleadingTest() {
|
||||||
|
Trie trie = Trie.builder()
|
||||||
final Trie trie = trie("hers");
|
.addKeyword("hers")
|
||||||
final Collection<Emit> emits = trie.parseText("h he her hers");
|
.build();
|
||||||
final Iterator<Emit> iterator = emits.iterator();
|
Collection<Emit> emits = trie.parseText("h he her hers");
|
||||||
|
Iterator<Emit> iterator = emits.iterator();
|
||||||
checkEmit(iterator.next(), 9, 12, "hers");
|
checkEmit(iterator.next(), 9, 12, "hers");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test_MisleadingTestFirstMatch() {
|
public void misleadingTestFirstMatch() {
|
||||||
|
Trie trie = Trie.builder()
|
||||||
final Trie trie = trie("hers");
|
.addKeyword("hers")
|
||||||
final Emit firstMatch = trie.firstMatch("h he her hers");
|
.build();
|
||||||
|
Emit firstMatch = trie.firstMatch("h he her hers");
|
||||||
checkEmit(firstMatch, 9, 12, "hers");
|
checkEmit(firstMatch, 9, 12, "hers");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test_Recipes() {
|
public void recipes() {
|
||||||
|
Trie trie = Trie.builder()
|
||||||
final Trie trie = trie(FOOD);
|
.addKeyword("veal")
|
||||||
final Collection<Emit> emits = trie.parseText("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli");
|
.addKeyword("cauliflower")
|
||||||
final Iterator<Emit> iterator = emits.iterator();
|
.addKeyword("broccoli")
|
||||||
|
.addKeyword("tomatoes")
|
||||||
|
.build();
|
||||||
|
Collection<Emit> emits = trie.parseText("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli");
|
||||||
|
Iterator<Emit> iterator = emits.iterator();
|
||||||
checkEmit(iterator.next(), 2, 12, "cauliflower");
|
checkEmit(iterator.next(), 2, 12, "cauliflower");
|
||||||
checkEmit(iterator.next(), 18, 25, "tomatoes");
|
checkEmit(iterator.next(), 18, 25, "tomatoes");
|
||||||
checkEmit(iterator.next(), 40, 43, "veal");
|
checkEmit(iterator.next(), 40, 43, "veal");
|
||||||
checkEmit(iterator.next(), 51, 58, "broccoli");
|
checkEmit(iterator.next(), 51, 58, "broccoli");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test_RecipesFirstMatch() {
|
public void recipesFirstMatch() {
|
||||||
|
Trie trie = Trie.builder()
|
||||||
final Trie trie = trie(FOOD);
|
.addKeyword("veal")
|
||||||
final Emit firstMatch = trie.firstMatch("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli");
|
.addKeyword("cauliflower")
|
||||||
|
.addKeyword("broccoli")
|
||||||
|
.addKeyword("tomatoes")
|
||||||
|
.build();
|
||||||
|
Emit firstMatch = trie.firstMatch("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli");
|
||||||
|
|
||||||
checkEmit(firstMatch, 2, 12, "cauliflower");
|
checkEmit(firstMatch, 2, 12, "cauliflower");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test_LongAndShortOverlappingMatch() {
|
public void longAndShortOverlappingMatch() {
|
||||||
|
Trie trie = Trie.builder()
|
||||||
final Trie trie = Trie.builder().addKeyword("he").addKeyword("hehehehe").build();
|
.addKeyword("he")
|
||||||
final Collection<Emit> emits = trie.parseText("hehehehehe");
|
.addKeyword("hehehehe")
|
||||||
final Iterator<Emit> iterator = emits.iterator();
|
.build();
|
||||||
|
Collection<Emit> emits = trie.parseText("hehehehehe");
|
||||||
|
Iterator<Emit> iterator = emits.iterator();
|
||||||
checkEmit(iterator.next(), 0, 1, "he");
|
checkEmit(iterator.next(), 0, 1, "he");
|
||||||
checkEmit(iterator.next(), 2, 3, "he");
|
checkEmit(iterator.next(), 2, 3, "he");
|
||||||
checkEmit(iterator.next(), 4, 5, "he");
|
checkEmit(iterator.next(), 4, 5, "he");
|
||||||
@ -286,43 +225,46 @@ public class TrieTest {
|
|||||||
checkEmit(iterator.next(), 2, 9, "hehehehe");
|
checkEmit(iterator.next(), 2, 9, "hehehehe");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test_NonOverlapping() {
|
public void nonOverlapping() {
|
||||||
|
Trie trie = Trie.builder().removeOverlaps()
|
||||||
final Trie trie = Trie.builder().ignoreOverlaps().addKeyword("ab").addKeyword("cba").addKeyword("ababc").build();
|
.addKeyword("ab")
|
||||||
final Collection<Emit> emits = trie.parseText("ababcbab");
|
.addKeyword("cba")
|
||||||
|
.addKeyword("ababc")
|
||||||
|
.build();
|
||||||
|
Collection<Emit> emits = trie.parseText("ababcbab");
|
||||||
assertEquals(2, emits.size());
|
assertEquals(2, emits.size());
|
||||||
final Iterator<Emit> iterator = emits.iterator();
|
Iterator<Emit> iterator = emits.iterator();
|
||||||
// With overlaps: ab@1, ab@3, ababc@4, cba@6, ab@7
|
// With overlaps: ab@1, ab@3, ababc@4, cba@6, ab@7
|
||||||
checkEmit(iterator.next(), 0, 4, "ababc");
|
checkEmit(iterator.next(), 0, 4, "ababc");
|
||||||
checkEmit(iterator.next(), 6, 7, "ab");
|
checkEmit(iterator.next(), 6, 7, "ab");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test_NonOverlappingFirstMatch() {
|
public void nonOverlappingFirstMatch() {
|
||||||
|
Trie trie = Trie.builder().removeOverlaps()
|
||||||
final Trie trie = Trie.builder().ignoreOverlaps().addKeyword("ab").addKeyword("cba").addKeyword("ababc").build();
|
.addKeyword("ab")
|
||||||
final Emit firstMatch = trie.firstMatch("ababcbab");
|
.addKeyword("cba")
|
||||||
|
.addKeyword("ababc")
|
||||||
|
.build();
|
||||||
|
Emit firstMatch = trie.firstMatch("ababcbab");
|
||||||
|
|
||||||
checkEmit(firstMatch, 0, 4, "ababc");
|
checkEmit(firstMatch, 0, 4, "ababc");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test_ContainsMatch() {
|
public void containsMatch() {
|
||||||
|
Trie trie = Trie.builder().removeOverlaps()
|
||||||
final Trie trie = Trie.builder().ignoreOverlaps().addKeyword("ab").addKeyword("cba").addKeyword("ababc").build();
|
.addKeyword("ab")
|
||||||
|
.addKeyword("cba")
|
||||||
|
.addKeyword("ababc")
|
||||||
|
.build();
|
||||||
assertTrue(trie.containsMatch("ababcbab"));
|
assertTrue(trie.containsMatch("ababcbab"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test_StartOfChurchillSpeech() {
|
public void startOfChurchillSpeech() {
|
||||||
|
Trie trie = Trie.builder().removeOverlaps()
|
||||||
final Trie trie = Trie.builder()
|
|
||||||
.ignoreOverlaps()
|
|
||||||
.addKeyword("T")
|
.addKeyword("T")
|
||||||
.addKeyword("u")
|
.addKeyword("u")
|
||||||
.addKeyword("ur")
|
.addKeyword("ur")
|
||||||
@ -334,40 +276,42 @@ public class TrieTest {
|
|||||||
.addKeyword("n")
|
.addKeyword("n")
|
||||||
.addKeyword("urning")
|
.addKeyword("urning")
|
||||||
.build();
|
.build();
|
||||||
final Collection<Emit> emits = trie.parseText("Turning");
|
Collection<Emit> emits = trie.parseText("Turning");
|
||||||
assertEquals(2, emits.size());
|
assertEquals(2, emits.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test_PartialMatch() {
|
public void partialMatch() {
|
||||||
|
Trie trie = Trie.builder()
|
||||||
final Trie trie = Trie.builder().onlyWholeWords().addKeyword("sugar").build();
|
.onlyWholeWords()
|
||||||
final Collection<Emit> emits = trie.parseText("sugarcane sugarcane sugar canesugar"); // left, middle, right test
|
.addKeyword("sugar")
|
||||||
|
.build();
|
||||||
|
Collection<Emit> emits = trie.parseText("sugarcane sugarcane sugar canesugar"); // left, middle, right test
|
||||||
assertEquals(1, emits.size()); // Match must not be made
|
assertEquals(1, emits.size()); // Match must not be made
|
||||||
checkEmit(emits.iterator().next(), 20, 24, "sugar");
|
checkEmit(emits.iterator().next(), 20, 24, "sugar");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test_PartialMatchFirstMatch() {
|
public void partialMatchFirstMatch() {
|
||||||
|
Trie trie = Trie.builder()
|
||||||
final Trie trie = Trie.builder().onlyWholeWords().addKeyword("sugar").build();
|
.onlyWholeWords()
|
||||||
|
.addKeyword("sugar")
|
||||||
// left, middle, right test
|
.build();
|
||||||
final Emit firstMatch = trie.firstMatch("sugarcane sugarcane sugar canesugar");
|
Emit firstMatch = trie.firstMatch("sugarcane sugarcane sugar canesugar"); // left, middle, right test
|
||||||
|
|
||||||
checkEmit(firstMatch, 20, 24, "sugar");
|
checkEmit(firstMatch, 20, 24, "sugar");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test_TokenizeFullSentence() {
|
public void tokenizeFullSentence() {
|
||||||
|
Trie trie = Trie.builder()
|
||||||
final Trie trie = trie(GREEK_LETTERS);
|
.addKeyword("Alpha")
|
||||||
final Collection<Token> tokens = trie.tokenize("Hear: Alpha team first, Beta from the rear, Gamma in reserve");
|
.addKeyword("Beta")
|
||||||
|
.addKeyword("Gamma")
|
||||||
|
.build();
|
||||||
|
Collection<Token> tokens = trie.tokenize("Hear: Alpha team first, Beta from the rear, Gamma in reserve");
|
||||||
assertEquals(7, tokens.size());
|
assertEquals(7, tokens.size());
|
||||||
final Iterator<Token> tokensIt = tokens.iterator();
|
Iterator<Token> tokensIt = tokens.iterator();
|
||||||
assertEquals("Hear: ", tokensIt.next().getFragment());
|
assertEquals("Hear: ", tokensIt.next().getFragment());
|
||||||
assertEquals("Alpha", tokensIt.next().getFragment());
|
assertEquals("Alpha", tokensIt.next().getFragment());
|
||||||
assertEquals(" team first, ", tokensIt.next().getFragment());
|
assertEquals(" team first, ", tokensIt.next().getFragment());
|
||||||
@ -377,211 +321,129 @@ public class TrieTest {
|
|||||||
assertEquals(" in reserve", tokensIt.next().getFragment());
|
assertEquals(" in reserve", tokensIt.next().getFragment());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Test boundary check with case-insensitive matches with whole words.
|
|
||||||
*/
|
|
||||||
@Test
|
@Test
|
||||||
public void test_StringIndexOutOfBoundsException() {
|
public void bug5InGithubReportedByXCurry() {
|
||||||
|
Trie trie = Trie.builder().caseInsensitive().onlyWholeWords()
|
||||||
final Trie trie = Trie.builder().ignoreCase().onlyWholeWords().addKeywords(UNICODE).build();
|
.addKeyword("turning")
|
||||||
final Collection<Emit> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ");
|
.addKeyword("once")
|
||||||
|
.addKeyword("again")
|
||||||
|
.addKeyword("börkü")
|
||||||
|
.build();
|
||||||
|
Collection<Emit> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ");
|
||||||
assertEquals(4, emits.size()); // Match must not be made
|
assertEquals(4, emits.size()); // Match must not be made
|
||||||
final Iterator<Emit> it = emits.iterator();
|
Iterator<Emit> it = emits.iterator();
|
||||||
checkEmit(it.next(), 0, 6, "turning");
|
checkEmit(it.next(), 0, 6, "turning");
|
||||||
checkEmit(it.next(), 8, 11, "once");
|
checkEmit(it.next(), 8, 11, "once");
|
||||||
checkEmit(it.next(), 13, 17, "again");
|
checkEmit(it.next(), 13, 17, "again");
|
||||||
checkEmit(it.next(), 19, 23, "börkü");
|
checkEmit(it.next(), 19, 23, "börkü");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test_IgnoreCase() {
|
public void caseInsensitive() {
|
||||||
|
Trie trie = Trie.builder().caseInsensitive()
|
||||||
final Trie trie = Trie.builder().ignoreCase().addKeywords(UNICODE).build();
|
.addKeyword("turning")
|
||||||
final Collection<Emit> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ");
|
.addKeyword("once")
|
||||||
|
.addKeyword("again")
|
||||||
|
.addKeyword("börkü")
|
||||||
|
.build();
|
||||||
|
Collection<Emit> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ");
|
||||||
assertEquals(4, emits.size()); // Match must not be made
|
assertEquals(4, emits.size()); // Match must not be made
|
||||||
final Iterator<Emit> it = emits.iterator();
|
Iterator<Emit> it = emits.iterator();
|
||||||
checkEmit(it.next(), 0, 6, "turning");
|
checkEmit(it.next(), 0, 6, "turning");
|
||||||
checkEmit(it.next(), 8, 11, "once");
|
checkEmit(it.next(), 8, 11, "once");
|
||||||
checkEmit(it.next(), 13, 17, "again");
|
checkEmit(it.next(), 13, 17, "again");
|
||||||
checkEmit(it.next(), 19, 23, "börkü");
|
checkEmit(it.next(), 19, 23, "börkü");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test_IgnoreCaseFirstMatch() {
|
public void caseInsensitiveFirstMatch() {
|
||||||
|
Trie trie = Trie.builder().caseInsensitive()
|
||||||
final Trie trie = Trie.builder().ignoreCase().addKeywords(UNICODE).build();
|
.addKeyword("turning")
|
||||||
final Emit firstMatch = trie.firstMatch("TurninG OnCe AgAiN BÖRKÜ");
|
.addKeyword("once")
|
||||||
|
.addKeyword("again")
|
||||||
|
.addKeyword("börkü")
|
||||||
|
.build();
|
||||||
|
Emit firstMatch = trie.firstMatch("TurninG OnCe AgAiN BÖRKÜ");
|
||||||
|
|
||||||
checkEmit(firstMatch, 0, 6, "turning");
|
checkEmit(firstMatch, 0, 6, "turning");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test_TokenizeTokensInSequence() {
|
public void tokenizeTokensInSequence() {
|
||||||
|
Trie trie = Trie.builder()
|
||||||
final Trie trie = trie(GREEK_LETTERS);
|
.addKeyword("Alpha")
|
||||||
final Collection<Token> tokens = trie.tokenize("Alpha Beta Gamma");
|
.addKeyword("Beta")
|
||||||
|
.addKeyword("Gamma")
|
||||||
|
.build();
|
||||||
|
Collection<Token> tokens = trie.tokenize("Alpha Beta Gamma");
|
||||||
assertEquals(5, tokens.size());
|
assertEquals(5, tokens.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Test offered by XCurry, https://github.com/robert-bor/aho-corasick/issues/7
|
||||||
/**
|
|
||||||
* Fix adding a word of size 0 ("") as a dictionary. A bug in the dictionary
|
|
||||||
* parsing code (at end of line) caused it to generate words of 0 length,
|
|
||||||
* which were being added to the trie. Removing the additional commas
|
|
||||||
* resolved the issue.
|
|
||||||
*/
|
|
||||||
@Test
|
@Test
|
||||||
public void test_ZeroLength() {
|
public void zeroLengthTestBug7InGithubReportedByXCurry() {
|
||||||
|
Trie trie = Trie.builder().removeOverlaps().onlyWholeWords().caseInsensitive()
|
||||||
final Trie trie = Trie.builder().ignoreOverlaps().onlyWholeWords().ignoreCase().addKeyword("").build();
|
.addKeyword("")
|
||||||
trie.tokenize("Try a natural lip and subtle bronzer to keep all the focus on those "
|
.build();
|
||||||
+ "big bright eyes with NARS Eyeshadow Duo in Rated R And the "
|
trie.tokenize("Try a natural lip and subtle bronzer to keep all the focus on those big bright eyes with NARS Eyeshadow Duo in Rated R And the winner is... Boots No7 Advanced Renewal Anti-ageing Glycolic Peel Kit ($25 amazon.com) won most-appealing peel.");
|
||||||
+ "winner is... Boots No7 Advanced Renewal Anti-ageing Glycolic "
|
|
||||||
+ "Peel Kit ($25 amazon.com) won most-appealing peel.");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Test offered by dwyerk, https://github.com/robert-bor/aho-corasick/issues/8
|
||||||
@Test
|
@Test
|
||||||
public void test_Emit_PunctuatedKeyword_AllOffsetsFound() {
|
public void unicodeIssueBug8ReportedByDwyerk() {
|
||||||
|
String target = "LİKE THIS"; // The second character ('İ') is Unicode, which was read by AC as a 2-byte char
|
||||||
final String keyword = "{{var}}";
|
assertEquals("THIS", target.substring(5, 9)); // Java does it the right way
|
||||||
final int len = keyword.length() - 1;
|
Trie trie = Trie.builder().caseInsensitive().onlyWholeWords()
|
||||||
final Trie trie = builder().ignoreOverlaps().addKeyword(keyword).build();
|
.addKeyword("this")
|
||||||
|
.build();
|
||||||
final Collection<Emit> emits = trie.parseText(format("__%s__ **%s** {{%s}} %s%s", keyword, keyword, keyword, keyword, keyword));
|
Collection<Emit> emits = trie.parseText(target);
|
||||||
|
|
||||||
assertEquals(5, emits.size());
|
|
||||||
final Iterator<Emit> it = emits.iterator();
|
|
||||||
|
|
||||||
checkEmit(it.next(), 2, 2 + len, keyword);
|
|
||||||
checkEmit(it.next(), 14, 14 + len, keyword);
|
|
||||||
checkEmit(it.next(), 26, 26 + len, keyword);
|
|
||||||
checkEmit(it.next(), 36, 36 + len, keyword);
|
|
||||||
checkEmit(it.next(), 43, 43 + len, keyword);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Notice the capital I with a dot. The code used to compute the offsets
|
|
||||||
* at (6, 9), which caused {@link Trie#tokenize(String)} to crash because
|
|
||||||
* 9 is past the end of the string. That character is two bytes wide, so it
|
|
||||||
* pushes the offset calculation off.
|
|
||||||
*/
|
|
||||||
@Test
|
|
||||||
public void test_Unicode1() {
|
|
||||||
// The second character ('İ') is
|
|
||||||
// Unicode, which was read by AC as a 2-byte char
|
|
||||||
final String target = "LİKE THIS";
|
|
||||||
// Java does it the right way
|
|
||||||
assertEquals("THIS", target.substring(5, 9));
|
|
||||||
final Trie trie = Trie.builder().ignoreCase().onlyWholeWords().addKeyword("this").build();
|
|
||||||
final Collection<Emit> emits = trie.parseText(target);
|
|
||||||
assertEquals(1, emits.size());
|
assertEquals(1, emits.size());
|
||||||
final Iterator<Emit> it = emits.iterator();
|
Iterator<Emit> it = emits.iterator();
|
||||||
checkEmit(it.next(), 5, 8, "this");
|
checkEmit(it.next(), 5, 8, "this");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Notice the capital I with a dot. The code used to compute the offsets
|
|
||||||
* at (6, 9), which caused {@link Trie#tokenize(String)} to crash because
|
|
||||||
* 9 is past the end of the string. That character is two bytes wide, so it
|
|
||||||
* pushes the offset calculation off.
|
|
||||||
*/
|
|
||||||
@Test
|
@Test
|
||||||
public void test_Unicode2() {
|
public void unicodeIssueBug8ReportedByDwyerkFirstMatch() {
|
||||||
// The second character ('İ') is
|
String target = "LİKE THIS"; // The second character ('İ') is Unicode, which was read by AC as a 2-byte char
|
||||||
// Unicode, which was read by AC as a 2-byte char
|
Trie trie = Trie.builder()
|
||||||
final String target = "LİKE THIS";
|
.caseInsensitive()
|
||||||
final Trie trie = Trie.builder().ignoreCase().onlyWholeWords().addKeyword("this").build();
|
.onlyWholeWords()
|
||||||
// Java does it the right way
|
.addKeyword("this")
|
||||||
assertEquals("THIS", target.substring(5, 9));
|
.build();
|
||||||
final Emit firstMatch = trie.firstMatch(target);
|
assertEquals("THIS", target.substring(5, 9)); // Java does it the right way
|
||||||
|
Emit firstMatch = trie.firstMatch(target);
|
||||||
checkEmit(firstMatch, 5, 8, "this");
|
checkEmit(firstMatch, 5, 8, "this");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void unicodeIssueBug39ReportedByHumanzz(){
|
||||||
|
// Problem: "İ".length => 1, "İ".toLowerCase().length => 2. This causes all sorts of unexpected behaviors
|
||||||
|
// and bugs where the Emit will have a size different from the original string.
|
||||||
|
// Soln: As in issue #8, convert at character level Character.toLowerCase('İ') => 'i' + make sure
|
||||||
|
// that emit gets the properly cased keyword.
|
||||||
|
String upperLengthOne = "İnt";
|
||||||
|
Trie trie = Trie.builder()
|
||||||
|
.caseInsensitive()
|
||||||
|
.onlyWholeWords()
|
||||||
|
.addKeyword(upperLengthOne)
|
||||||
|
.build();
|
||||||
|
Collection<Emit> emits = trie.parseText("İnt is good");
|
||||||
|
assertEquals(1, emits.size());
|
||||||
|
checkEmit(emits.iterator().next(), 0, 2, "int");
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test_PartialMatchWhiteSpaces() {
|
public void partialMatchWhiteSpaces() {
|
||||||
|
Trie trie = Trie.builder()
|
||||||
final Trie trie = Trie.builder().onlyWholeWordsWhiteSpaceSeparated().addKeyword("#sugar-123").build();
|
.onlyWholeWordsWhiteSpaceSeparated()
|
||||||
final Collection<Emit> emits = trie.parseText("#sugar-123 #sugar-1234"); // left, middle, right test
|
.addKeyword("#sugar-123")
|
||||||
|
.build();
|
||||||
|
Collection < Emit > emits = trie.parseText("#sugar-123 #sugar-1234"); // left, middle, right test
|
||||||
assertEquals(1, emits.size()); // Match must not be made
|
assertEquals(1, emits.size()); // Match must not be made
|
||||||
checkEmit(emits.iterator().next(), 0, 9, "#sugar-123");
|
checkEmit(emits.iterator().next(), 0, 9, "#sugar-123");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void test_LargeString() {
|
|
||||||
|
|
||||||
final int interval = 100;
|
|
||||||
final int textSize = 1000000;
|
|
||||||
final String keyword = FOOD[1];
|
|
||||||
final StringBuilder text = randomNumbers(textSize);
|
|
||||||
|
|
||||||
injectKeyword(text, keyword, interval);
|
|
||||||
|
|
||||||
final Trie trie = Trie.builder().onlyWholeWords().addKeyword(keyword).build();
|
|
||||||
|
|
||||||
final Collection<Emit> emits = trie.parseText(text);
|
|
||||||
|
|
||||||
assertEquals(textSize / interval, emits.size());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void test_UnicodeIssueBug39ReportedByHumanzz() {
|
|
||||||
// Problem: "İ".length => 1, "İ".toLowerCase().length => 2. This causes
|
|
||||||
// all sorts of unexpected behaviors
|
|
||||||
// and bugs where the Emit will have a size different from the original
|
|
||||||
// string.
|
|
||||||
// Soln: As in issue #8, convert at character level Character.toLowerCase
|
|
||||||
// ('İ') => 'i' + make sure
|
|
||||||
// that emit gets the properly cased keyword.
|
|
||||||
final String upperLengthOne = "İnt";
|
|
||||||
final Trie trie = Trie.builder().ignoreCase().onlyWholeWords().addKeyword(upperLengthOne).build();
|
|
||||||
final Collection<Emit> emits = trie.parseText("İnt is good");
|
|
||||||
assertEquals(1, emits.size());
|
|
||||||
checkEmit(emits.iterator().next(), 0, 2, upperLengthOne);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test(timeout = 30_000)
|
|
||||||
public void test_ParallelSearch() throws InterruptedException {
|
|
||||||
|
|
||||||
final int interval = 100;
|
|
||||||
final int textSize = 1000000;
|
|
||||||
final String keyword = FOOD[1];
|
|
||||||
final StringBuilder matchingText = randomNumbers(textSize);
|
|
||||||
injectKeyword(matchingText, keyword, interval);
|
|
||||||
final StringBuilder nonMatchingText = randomNumbers(textSize);
|
|
||||||
injectKeyword(nonMatchingText, keyword.substring(0, keyword.length() - 1), interval);
|
|
||||||
|
|
||||||
final Trie trie = Trie.builder().onlyWholeWords().addKeyword(keyword).build();
|
|
||||||
|
|
||||||
final AtomicInteger matchCount = new AtomicInteger(0);
|
|
||||||
final Runnable matchingTask = () -> matchCount.set(trie.parseText(matchingText).size());
|
|
||||||
|
|
||||||
final AtomicInteger nonMatchCount = new AtomicInteger(0);
|
|
||||||
final Runnable nonMatchingTask = () -> nonMatchCount.set(trie.parseText(nonMatchingText).size());
|
|
||||||
final Thread matchingThread = new Thread(matchingTask);
|
|
||||||
final Thread nonMatchingThread = new Thread(nonMatchingTask);
|
|
||||||
matchingThread.start();
|
|
||||||
nonMatchingThread.start();
|
|
||||||
matchingThread.join();
|
|
||||||
nonMatchingThread.join();
|
|
||||||
|
|
||||||
assertEquals(textSize / interval, matchCount.get());
|
|
||||||
assertEquals(0, nonMatchCount.get());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private void checkEmit(Emit next, int expectedStart, int expectedEnd, String expectedKeyword) {
|
private void checkEmit(Emit next, int expectedStart, int expectedEnd, String expectedKeyword) {
|
||||||
|
|
||||||
assertEquals("Start of emit should have been " + expectedStart, expectedStart, next.getStart());
|
assertEquals("Start of emit should have been " + expectedStart, expectedStart, next.getStart());
|
||||||
assertEquals("End of emit should have been " + expectedEnd, expectedEnd, next.getEnd());
|
assertEquals("End of emit should have been " + expectedEnd, expectedEnd, next.getEnd());
|
||||||
assertEquals(expectedKeyword, next.getKeyword());
|
assertEquals(expectedKeyword, next.getKeyword());
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user