diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 4bd1bc5..7b17158 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,19 +1,21 @@ -stages: - - test - - versioning - - deploy -variables: - GIT_SUBMODULE_STRATEGY: recursive - GIT_SUBMODULE_FORCE_HTTPS: 'true' include: - project: 'gitlab/gitlab' ref: 'main' - file: 'ci-templates/maven_deps.yml' + file: 'ci-templates/gradle_java.yml' -verify: - stage: test + +deploy: + stage: deploy tags: - dind script: - - echo "Erfolgreich getestet" - + - echo "Building with gradle version ${BUILDVERSION}" + - gradle -Pversion=${BUILDVERSION} publish + - echo "BUILDVERSION=$BUILDVERSION" >> version.env + artifacts: + reports: + dotenv: version.env + rules: + - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH + - if: $CI_COMMIT_BRANCH =~ /^release/ + - if: $CI_COMMIT_TAG \ No newline at end of file diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 751303d..0000000 --- a/.travis.yml +++ /dev/null @@ -1,6 +0,0 @@ -language: java -install: mvn install -DskipTests=true -Dgpg.skip=true -jdk: - - openjdk8 -after_success: - - bash <(curl -s https://codecov.io/bash) diff --git a/build.gradle.kts b/build.gradle.kts new file mode 100644 index 0000000..774d2a9 --- /dev/null +++ b/build.gradle.kts @@ -0,0 +1,63 @@ +plugins { + `java-library` + `maven-publish` + pmd + checkstyle + id("io.freefair.lombok") version "8.4" +} + +repositories { + mavenLocal() + maven { + url = uri("https://nexus.knecon.com/repository/gindev/") + credentials { + username = providers.gradleProperty("mavenUser").getOrNull(); + password = providers.gradleProperty("mavenPassword").getOrNull(); + } + } + + maven { + url = uri("https://repo.maven.apache.org/maven2/") + } +} + +dependencies { + testImplementation("junit:junit:4.13.2") +} + +group = "org.ahocorasick" +description = "Aho-CoraSick algorithm for efficient string matching" +java.sourceCompatibility = JavaVersion.VERSION_17 +java.targetCompatibility = JavaVersion.VERSION_17 + +java { + withSourcesJar() + withJavadocJar() +} + +publishing { + publications.create("maven") { + from(components["java"]) + } +} + +tasks.withType() { + options.encoding = "UTF-8" +} + +tasks.withType() { + options.encoding = "UTF-8" +} + +pmd { + isConsoleOutput = true +} + +tasks.pmdMain { + pmd.ruleSetFiles = files("${rootDir}/config/pmd/pmd.xml") +} + +tasks.pmdTest { + pmd.ruleSetFiles = files("${rootDir}/config/pmd/test_pmd.xml") +} + diff --git a/config/checkstyle/checkstyle.xml b/config/checkstyle/checkstyle.xml new file mode 100644 index 0000000..ba83248 --- /dev/null +++ b/config/checkstyle/checkstyle.xml @@ -0,0 +1,38 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/config/pmd/pmd.xml b/config/pmd/pmd.xml new file mode 100644 index 0000000..7226d60 --- /dev/null +++ b/config/pmd/pmd.xml @@ -0,0 +1,21 @@ + + + + + Knecon ruleset checks the code for bad stuff + + + + + + + + + + + + + diff --git a/config/pmd/test_pmd.xml b/config/pmd/test_pmd.xml new file mode 100644 index 0000000..eb2fc84 --- /dev/null +++ b/config/pmd/test_pmd.xml @@ -0,0 +1,11 @@ + + + + + Knecon test ruleset checks the code for bad stuff + + + diff --git a/gradle.properties.kts b/gradle.properties.kts new file mode 100644 index 0000000..50a62d5 --- /dev/null +++ b/gradle.properties.kts @@ -0,0 +1 @@ +version = 0.7-SNAPSHOT \ No newline at end of file diff --git a/pom.xml b/pom.xml deleted file mode 100644 index e15ff18..0000000 --- a/pom.xml +++ /dev/null @@ -1,183 +0,0 @@ - - - 4.0.0 - - org.ahocorasick - ahocorasick - 0.7-SNAPSHOT - jar - Aho-CoraSick algorithm for efficient string matching - Java library for efficient string matching against a large set of keywords - 2014 - https://github.com/robert-bor/aho-corasick - - - - ossrh - https://oss.sonatype.org/content/repositories/snapshots - - - - - 42 BV - http://blog.42.nl/ - - - - - The Apache Software License, Version 2.0 - http://www.apache.org/licenses/LICENSE-2.0.txt - repo - - - - - scm:git://github.com/robert-bor/aho-corasick - scm:git://github.com/robert-bor/aho-corasick - - - - - Robert Bor - 42 - - - Daniel Beck - neoSearch UG (haftungsbeschränkt) - - - Dave Jarvis - White Magic Software, Ltd. - - - - - 1.8 - UTF-8 - - 4.13.2 - - 2.5.2 - 2.8 - 2.4 - 3.3 - - - - - - - - - junit - junit - ${junit.version} - test - - - - - - install - - - - - - org.apache.maven.plugins - maven-compiler-plugin - 3.8.1 - - ${java.version} - ${java.version} - ${project.build.sourceEncoding} - - - - - org.apache.maven.plugins - maven-javadoc-plugin - 3.4.1 - - - attach-javadocs - - jar - - - 8 - - - - - - - org.apache.maven.plugins - maven-source-plugin - 3.2.1 - - - attach-sources - - jar-no-fork - - - - - - - - - org.jacoco - jacoco-maven-plugin - 0.8.6 - - - - prepare-agent - - - - report - test - - report - - - - - - - - diff --git a/settings.gradle.kts b/settings.gradle.kts new file mode 100644 index 0000000..30281ea --- /dev/null +++ b/settings.gradle.kts @@ -0,0 +1 @@ +rootProject.name = "ahocorasick" diff --git a/src/main/java/org/ahocorasick/interval/Interval.java b/src/main/java/org/ahocorasick/interval/Interval.java index c63fdda..1e8840f 100644 --- a/src/main/java/org/ahocorasick/interval/Interval.java +++ b/src/main/java/org/ahocorasick/interval/Interval.java @@ -12,6 +12,7 @@ public class Interval implements Intervalable { private final int start; private final int end; + /** * Constructs an interval with a start and end position. * @@ -19,10 +20,12 @@ public class Interval implements Intervalable { * @param end The interval's ending text position. */ public Interval(final int start, final int end) { + this.start = start; this.end = end; } + /** * Returns the starting offset into the text for this interval. * @@ -30,9 +33,11 @@ public class Interval implements Intervalable { */ @Override public int getStart() { + return this.start; } + /** * Returns the ending offset into the text for this interval. * @@ -40,9 +45,11 @@ public class Interval implements Intervalable { */ @Override public int getEnd() { + return this.end; } + /** * Returns the length of the interval. * @@ -50,9 +57,11 @@ public class Interval implements Intervalable { */ @Override public int size() { + return end - start + 1; } + /** * Answers whether the given interval overlaps this interval * instance. @@ -61,31 +70,38 @@ public class Interval implements Intervalable { * @return true The intervals overlap. */ public boolean overlapsWith(final Interval other) { - return this.start <= other.getEnd() && - this.end >= other.getStart(); + + return this.start <= other.getEnd() && this.end >= other.getStart(); } + public boolean overlapsWith(int point) { + return this.start <= point && point <= this.end; } + @Override public boolean equals(Object o) { + if (!(o instanceof Intervalable)) { return false; } Intervalable other = (Intervalable) o; - return this.start == other.getStart() && - this.end == other.getEnd(); + return this.start == other.getStart() && this.end == other.getEnd(); } + @Override public int hashCode() { + return this.start % 100 + this.end % 100; } + @Override public int compareTo(Object o) { + if (!(o instanceof Intervalable)) { return -1; } @@ -94,6 +110,7 @@ public class Interval implements Intervalable { return comparison != 0 ? comparison : this.end - other.getEnd(); } + /** * Returns the starting offset and ending offset separated * by a full colon (:). @@ -102,6 +119,8 @@ public class Interval implements Intervalable { */ @Override public String toString() { + return this.start + ":" + this.end; } + } diff --git a/src/main/java/org/ahocorasick/interval/IntervalNode.java b/src/main/java/org/ahocorasick/interval/IntervalNode.java index 92727c9..46bc8c7 100644 --- a/src/main/java/org/ahocorasick/interval/IntervalNode.java +++ b/src/main/java/org/ahocorasick/interval/IntervalNode.java @@ -6,14 +6,19 @@ import java.util.List; public class IntervalNode { - private enum Direction {LEFT, RIGHT} + private enum Direction { + LEFT, + RIGHT + } private IntervalNode left; private IntervalNode right; private int point; private List intervals = new ArrayList<>(); + public IntervalNode(final List intervals) { + this.point = determineMedian(intervals); final List toLeft = new ArrayList<>(); @@ -37,7 +42,9 @@ public class IntervalNode { } } - public int determineMedian(final List intervals) { + + private int determineMedian(final List intervals) { + int start = -1; int end = -1; for (Intervalable interval : intervals) { @@ -53,7 +60,9 @@ public class IntervalNode { return (start + end) / 2; } + public List findOverlaps(final Intervalable interval) { + final List overlaps = new ArrayList<>(); if (this.point < interval.getStart()) { @@ -74,10 +83,9 @@ public class IntervalNode { return overlaps; } - protected void addToOverlaps( - final Intervalable interval, - final List overlaps, - final List newOverlaps) { + + protected void addToOverlaps(final Intervalable interval, final List overlaps, final List newOverlaps) { + for (final Intervalable currentInterval : newOverlaps) { if (!currentInterval.equals(interval)) { overlaps.add(currentInterval); @@ -85,16 +93,21 @@ public class IntervalNode { } } + protected List checkForOverlapsToTheLeft(final Intervalable interval) { + return checkForOverlaps(interval, Direction.LEFT); } + protected List checkForOverlapsToTheRight(final Intervalable interval) { + return checkForOverlaps(interval, Direction.RIGHT); } - protected List checkForOverlaps( - final Intervalable interval, final Direction direction) { + + protected List checkForOverlaps(final Intervalable interval, final Direction direction) { + final List overlaps = new ArrayList<>(); for (final Intervalable currentInterval : this.intervals) { @@ -115,9 +128,10 @@ public class IntervalNode { return overlaps; } + protected List findOverlappingRanges(IntervalNode node, Intervalable interval) { - return node == null - ? Collections.emptyList() - : node.findOverlaps(interval); + + return node == null ? Collections.emptyList() : node.findOverlaps(interval); } + } diff --git a/src/main/java/org/ahocorasick/interval/IntervalTree.java b/src/main/java/org/ahocorasick/interval/IntervalTree.java index 4dc43b9..07b168d 100644 --- a/src/main/java/org/ahocorasick/interval/IntervalTree.java +++ b/src/main/java/org/ahocorasick/interval/IntervalTree.java @@ -10,10 +10,13 @@ public class IntervalTree { private final IntervalNode rootNode; + public IntervalTree(List intervals) { + this.rootNode = new IntervalNode(intervals); } + public List removeOverlaps(final List intervals) { // Sort the intervals on size, then left-most position @@ -42,7 +45,9 @@ public class IntervalTree { return intervals; } + public List findOverlaps(final Intervalable interval) { + return rootNode.findOverlaps(interval); } diff --git a/src/main/java/org/ahocorasick/interval/Intervalable.java b/src/main/java/org/ahocorasick/interval/Intervalable.java index 223309e..edb4de7 100644 --- a/src/main/java/org/ahocorasick/interval/Intervalable.java +++ b/src/main/java/org/ahocorasick/interval/Intervalable.java @@ -2,10 +2,12 @@ package org.ahocorasick.interval; public interface Intervalable extends Comparable { - int getStart(); + int getStart(); - int getEnd(); - int size(); + int getEnd(); + + + int size(); } diff --git a/src/main/java/org/ahocorasick/interval/IntervalableComparatorByPosition.java b/src/main/java/org/ahocorasick/interval/IntervalableComparatorByPosition.java index 2dc0491..bb2c5a0 100644 --- a/src/main/java/org/ahocorasick/interval/IntervalableComparatorByPosition.java +++ b/src/main/java/org/ahocorasick/interval/IntervalableComparatorByPosition.java @@ -6,6 +6,7 @@ public class IntervalableComparatorByPosition implements Comparator { @Override public int compare(final Intervalable intervalable, final Intervalable intervalable2) { + int comparison = intervalable2.size() - intervalable.size(); if (comparison == 0) { diff --git a/src/main/java/org/ahocorasick/trie/DefaultToken.java b/src/main/java/org/ahocorasick/trie/DefaultToken.java index b83942d..70ecae4 100644 --- a/src/main/java/org/ahocorasick/trie/DefaultToken.java +++ b/src/main/java/org/ahocorasick/trie/DefaultToken.java @@ -4,16 +4,22 @@ public class DefaultToken extends Token { private PayloadToken payloadToken; + public DefaultToken(PayloadToken payloadToken) { + super(payloadToken.getFragment()); this.payloadToken = payloadToken; } + public boolean isMatch() { + return payloadToken.isMatch(); } + public Emit getEmit() { + PayloadEmit emit = payloadToken.getEmit(); return new Emit(emit.getStart(), emit.getEnd(), emit.getKeyword()); } diff --git a/src/main/java/org/ahocorasick/trie/Emit.java b/src/main/java/org/ahocorasick/trie/Emit.java index 2e97c60..377e3b1 100644 --- a/src/main/java/org/ahocorasick/trie/Emit.java +++ b/src/main/java/org/ahocorasick/trie/Emit.java @@ -7,19 +7,26 @@ import org.ahocorasick.interval.Intervalable; * Responsible for tracking the bounds of matched terms. */ public class Emit extends Interval implements Intervalable { + private final String keyword; + public Emit(final int start, final int end, final String keyword) { + super(start, end); this.keyword = keyword; } + public String getKeyword() { + return this.keyword; } + @Override public String toString() { + return super.toString() + "=" + this.keyword; } diff --git a/src/main/java/org/ahocorasick/trie/FragmentToken.java b/src/main/java/org/ahocorasick/trie/FragmentToken.java index 37e83d1..2dc9109 100644 --- a/src/main/java/org/ahocorasick/trie/FragmentToken.java +++ b/src/main/java/org/ahocorasick/trie/FragmentToken.java @@ -3,16 +3,21 @@ package org.ahocorasick.trie; public class FragmentToken extends Token { public FragmentToken(String fragment) { + super(fragment); } + @Override public boolean isMatch() { + return false; } + @Override public Emit getEmit() { + return null; } diff --git a/src/main/java/org/ahocorasick/trie/MatchToken.java b/src/main/java/org/ahocorasick/trie/MatchToken.java index ff499b3..d08b4b8 100644 --- a/src/main/java/org/ahocorasick/trie/MatchToken.java +++ b/src/main/java/org/ahocorasick/trie/MatchToken.java @@ -4,19 +4,26 @@ public class MatchToken extends Token { private final Emit emit; + public MatchToken(final String fragment, final Emit emit) { + super(fragment); this.emit = emit; } + @Override public boolean isMatch() { + return true; } + @Override public Emit getEmit() { + return this.emit; } + } diff --git a/src/main/java/org/ahocorasick/trie/Payload.java b/src/main/java/org/ahocorasick/trie/Payload.java index 0fef5df..3c7b1c1 100644 --- a/src/main/java/org/ahocorasick/trie/Payload.java +++ b/src/main/java/org/ahocorasick/trie/Payload.java @@ -1,32 +1,21 @@ package org.ahocorasick.trie; +import lombok.EqualsAndHashCode; +import lombok.Getter; +import lombok.RequiredArgsConstructor; + /** * Contains the matched keyword and some payload data. - * - * @author Daniel Beck + * * @param The type of the wrapped payload data. + * @author Daniel Beck */ -public class Payload implements Comparable> { +@Getter +@EqualsAndHashCode +@RequiredArgsConstructor +public class Payload { private final String keyword; private final T data; - public Payload(final String keyword, final T data) { - super(); - this.keyword = keyword; - this.data = data; - } - - public String getKeyword() { - return keyword; - } - - public T getData() { - return data; - } - - @Override - public int compareTo(Payload other) { - return keyword.compareTo(other.getKeyword()); - } } diff --git a/src/main/java/org/ahocorasick/trie/PayloadEmit.java b/src/main/java/org/ahocorasick/trie/PayloadEmit.java index e0fc036..9eadcc3 100644 --- a/src/main/java/org/ahocorasick/trie/PayloadEmit.java +++ b/src/main/java/org/ahocorasick/trie/PayloadEmit.java @@ -5,7 +5,7 @@ import org.ahocorasick.interval.Intervalable; /** * Contains a matched term and its associated payload data. - * + * * @param Type of the wrapped payload-data. * @author Daniel Beck */ @@ -15,35 +15,44 @@ public class PayloadEmit extends Interval implements Intervalable { private final T payload; + /** * Created a PayloadEmit - * + * * @param start Start of the matched search term. * @param end End of the matched search term. * @param keyword Keyword that matched. * @param payload Emitted payload data. */ public PayloadEmit(final int start, final int end, String keyword, T payload) { + super(start, end); this.keyword = keyword; this.payload = payload; } + public String getKeyword() { + return this.keyword; } + /** * Returns the payload associated to this emit. - * + * * @return the associated payload */ public T getPayload() { + return this.payload; } + @Override public String toString() { + return super.toString() + "=" + this.keyword + (this.payload != null ? "->" + this.payload : ""); } + } diff --git a/src/main/java/org/ahocorasick/trie/PayloadFragmentToken.java b/src/main/java/org/ahocorasick/trie/PayloadFragmentToken.java index d78312c..2fe392f 100644 --- a/src/main/java/org/ahocorasick/trie/PayloadFragmentToken.java +++ b/src/main/java/org/ahocorasick/trie/PayloadFragmentToken.java @@ -6,7 +6,7 @@ package org.ahocorasick.trie; * This token indicates a matching search term was not found, so * {@link #isMatch()} always returns {@code false}. *

- * + * * @author Daniel Beck * * @param The Type of the emitted payloads. @@ -14,19 +14,25 @@ package org.ahocorasick.trie; public class PayloadFragmentToken extends PayloadToken { public PayloadFragmentToken(String fragment) { + super(fragment); } + @Override public boolean isMatch() { + return false; } + /** * Returns null. */ @Override public PayloadEmit getEmit() { + return null; } + } diff --git a/src/main/java/org/ahocorasick/trie/PayloadMatchToken.java b/src/main/java/org/ahocorasick/trie/PayloadMatchToken.java index 0979c98..92225b2 100644 --- a/src/main/java/org/ahocorasick/trie/PayloadMatchToken.java +++ b/src/main/java/org/ahocorasick/trie/PayloadMatchToken.java @@ -6,27 +6,33 @@ package org.ahocorasick.trie; * This token indicates a matching search term was found, so {@link #isMatch()} * always returns {@code true}. *

- * - * @author Daniel Beck * * @param The Type of the emitted payloads. + * @author Daniel Beck */ public class PayloadMatchToken extends PayloadToken { private final PayloadEmit emit; + public PayloadMatchToken(final String fragment, final PayloadEmit emit) { + super(fragment); this.emit = emit; } + @Override public boolean isMatch() { + return true; } + @Override public PayloadEmit getEmit() { + return this.emit; } + } diff --git a/src/main/java/org/ahocorasick/trie/PayloadState.java b/src/main/java/org/ahocorasick/trie/PayloadState.java index cb75e92..6cf70d9 100644 --- a/src/main/java/org/ahocorasick/trie/PayloadState.java +++ b/src/main/java/org/ahocorasick/trie/PayloadState.java @@ -1,6 +1,10 @@ package org.ahocorasick.trie; import java.util.*; +import java.util.stream.Collectors; + +import lombok.Getter; +import lombok.Setter; /** *

@@ -27,13 +31,14 @@ import java.util.*; public class PayloadState { /** - * effective the size of the keyword + * effective the size of the keyword. */ + @Getter private final int depth; /** * only used for the root state to refer to itself in case no matches have been - * found + * found. */ private final PayloadState rootState; @@ -44,26 +49,34 @@ public class PayloadState { private final Map> success = new HashMap<>(); /** - * if no matching states are found, the failure state will be returned + * if no matching states are found, the failure state will be returned. */ + @Getter + @Setter private PayloadState failure; /** * whenever this state is reached, it will emit the matches keywords for future - * reference + * reference. */ private Set> emits; + public PayloadState() { + this(0); } + public PayloadState(final int depth) { + this.depth = depth; this.rootState = depth == 0 ? this : null; } + private PayloadState nextState(final Character character, final boolean ignoreRootState) { + PayloadState nextState = this.success.get(character); if (!ignoreRootState && nextState == null && this.rootState != null) { @@ -73,15 +86,21 @@ public class PayloadState { return nextState; } + public PayloadState nextState(final Character character) { + return nextState(character, false); } + public PayloadState nextStateIgnoreRootState(Character character) { + return nextState(character, true); } + public PayloadState addState(Character character) { + PayloadState nextState = nextStateIgnoreRootState(character); if (nextState == null) { nextState = new PayloadState<>(this.depth + 1); @@ -90,55 +109,56 @@ public class PayloadState { return nextState; } - public int getDepth() { - return this.depth; - } /** * Adds a payload to be emitted for this state. - * + * * @param payload to be emitted. */ public void addEmit(Payload payload) { + if (this.emits == null) { - this.emits = new TreeSet<>(); + this.emits = new HashSet<>(); } this.emits.add(payload); } + /** * Adds a collection of payloads to be emitted for this state. - * + * * @param emits Collection of payloads to be emitted. */ public void addEmit(Collection> emits) { + for (Payload emit : emits) { addEmit(emit); } } + /** * Returns a collection of emitted payloads for this state. - * + * * @return Collection of emitted payloads. */ public Collection> emit() { - return this.emits == null ? Collections.>emptyList() : this.emits; + + return this.emits == null ? Collections.>emptyList() : this.emits.stream() + .sorted(Comparator.comparing(Payload::getKeyword)) + .collect(Collectors.toList()); } - public PayloadState failure() { - return this.failure; - } - - public void setFailure(PayloadState failState) { - this.failure = failState; - } public Collection> getStates() { + return this.success.values(); } + public Collection getTransitions() { + return this.success.keySet(); } + } \ No newline at end of file diff --git a/src/main/java/org/ahocorasick/trie/PayloadToken.java b/src/main/java/org/ahocorasick/trie/PayloadToken.java index 160aa7a..aafcc21 100644 --- a/src/main/java/org/ahocorasick/trie/PayloadToken.java +++ b/src/main/java/org/ahocorasick/trie/PayloadToken.java @@ -9,24 +9,33 @@ package org.ahocorasick.trie; * @param The Type of the emitted payloads. */ public abstract class PayloadToken { + private String fragment; + public PayloadToken(String fragment) { + this.fragment = fragment; } + public String getFragment() { + return this.fragment; } + /** * Return {@code true} if a search term matched. + * * @return {@code true} if this is a match */ public abstract boolean isMatch(); + /** * @return the payload */ public abstract PayloadEmit getEmit(); + } diff --git a/src/main/java/org/ahocorasick/trie/PayloadTrie.java b/src/main/java/org/ahocorasick/trie/PayloadTrie.java index 1bc38cd..237aa9f 100644 --- a/src/main/java/org/ahocorasick/trie/PayloadTrie.java +++ b/src/main/java/org/ahocorasick/trie/PayloadTrie.java @@ -1,7 +1,9 @@ package org.ahocorasick.trie; import static java.lang.Character.isWhitespace; +import static java.lang.Character.toLowerCase; +import java.util.Deque; import java.util.LinkedList; import java.util.Collection; import java.util.List; @@ -23,8 +25,8 @@ import org.ahocorasick.trie.handler.StatefulPayloadEmitHandler; * added keyword. *

* - * @author Daniel Beck * @param The type of the supplied of the payload. + * @author Daniel Beck */ public class PayloadTrie { @@ -32,11 +34,14 @@ public class PayloadTrie { private final PayloadState rootState; + protected PayloadTrie(final TrieConfig trieConfig) { + this.trieConfig = trieConfig; this.rootState = new PayloadState<>(); } + /** * Used by the builder to add a text search keyword with an emit payload. * @@ -45,6 +50,7 @@ public class PayloadTrie { * @throws NullPointerException if the keyword is null. */ private void addKeyword(String keyword, T emit) { + if (keyword.isEmpty()) { return; } @@ -52,6 +58,7 @@ public class PayloadTrie { addState(keyword).addEmit(new Payload<>(keyword, emit)); } + /** * Used by the builder to add a text search keyword. * @@ -59,6 +66,7 @@ public class PayloadTrie { * @throws NullPointerException if the keyword is null. */ private void addKeyword(String keyword) { + if (keyword.isEmpty()) { return; } @@ -66,15 +74,21 @@ public class PayloadTrie { addState(keyword).addEmit(new Payload<>(keyword, null)); } + private PayloadState addState(final String keyword) { + PayloadState state = getRootState(); for (final Character character : keyword.toCharArray()) { + if (isIgnoreWhiteSpace() && isWhitespace(character)) { + continue; + } Character adjustedChar = isCaseInsensitive() ? Character.toLowerCase(character) : character; state = state.addState(adjustedChar); } return state; } + /** * Tokenizes the specified text and returns the emitted outputs. * @@ -82,13 +96,14 @@ public class PayloadTrie { * @return the emitted outputs */ public Collection> tokenize(final String text) { + final Collection> tokens = new LinkedList<>(); final Collection> collectedEmits = parseText(text); int lastCollectedPosition = -1; for (final PayloadEmit emit : collectedEmits) { if (emit.getStart() - lastCollectedPosition > 1) { - tokens.add( createFragment( emit, text, lastCollectedPosition) ); + tokens.add(createFragment(emit, text, lastCollectedPosition)); } tokens.add(createMatch(emit, text)); @@ -96,24 +111,25 @@ public class PayloadTrie { } if (text.length() - lastCollectedPosition > 1) { - tokens.add( createFragment( null, text, lastCollectedPosition) ); + tokens.add(createFragment(null, text, lastCollectedPosition)); } return tokens; } + private PayloadToken createFragment(final PayloadEmit emit, final String text, final int lastCollectedPosition) { - return new PayloadFragmentToken<>( - text.substring( lastCollectedPosition + 1, - emit == null ? text.length() : emit.getStart() ) ); + + return new PayloadFragmentToken<>(text.substring(lastCollectedPosition + 1, emit == null ? text.length() : emit.getStart())); } + private PayloadToken createMatch(PayloadEmit emit, String text) { - return new PayloadMatchToken<>( text.substring( emit.getStart(), - emit.getEnd() + 1 ), - emit ); + + return new PayloadMatchToken<>(text.substring(emit.getStart(), emit.getEnd() + 1), emit); } + /** * Tokenizes a specified text and returns the emitted outputs. * @@ -121,9 +137,11 @@ public class PayloadTrie { * @return A collection of emits. */ public Collection> parseText(final CharSequence text) { + return parseText(text, new DefaultPayloadEmitHandler<>()); } + /** * Tokenizes the specified text by using a custom EmitHandler and returns the * emitted outputs. @@ -134,6 +152,7 @@ public class PayloadTrie { */ @SuppressWarnings("unchecked") public Collection> parseText(final CharSequence text, final StatefulPayloadEmitHandler emitHandler) { + parseText(text, (PayloadEmitHandler) emitHandler); final List> collectedEmits = emitHandler.getEmits(); @@ -146,18 +165,21 @@ public class PayloadTrie { return collectedEmits; } + /** * Returns true if the text contains one of the search terms; otherwise, * returns false. * * @param text Specified text. * @return true if the text contains one of the search terms. Else, returns - * false. + * false. */ public boolean containsMatch(final CharSequence text) { + return firstMatch(text) != null; } + /** * Tokenizes the specified text by using a custom EmitHandler and returns the * emitted outputs. @@ -166,10 +188,14 @@ public class PayloadTrie { * @param emitHandler The handler that will be used to parse the text. */ public void parseText(final CharSequence text, final PayloadEmitHandler emitHandler) { - PayloadState currentState = getRootState(); + PayloadState currentState = getRootState(); for (int position = 0; position < text.length(); position++) { - char character = text.charAt( position); + char character = text.charAt(position); + + if (trieConfig.isIgnoreWhiteSpace() && isWhitespace(character)) { + continue; + } if (trieConfig.isCaseInsensitive()) { character = Character.toLowerCase(character); @@ -183,6 +209,7 @@ public class PayloadTrie { } } + /** * The first matching text sequence. * @@ -190,6 +217,7 @@ public class PayloadTrie { * @return {@code null} if no matches found. */ public PayloadEmit firstMatch(final CharSequence text) { + assert text != null; if (!trieConfig.isAllowOverlaps()) { @@ -204,8 +232,11 @@ public class PayloadTrie { PayloadState currentState = getRootState(); for (int position = 0; position < text.length(); position++) { - char character = text.charAt( position); + char character = text.charAt(position); + if (trieConfig.isIgnoreWhiteSpace() && isWhitespace(character)) { + continue; + } if (trieConfig.isCaseInsensitive()) { character = Character.toLowerCase(character); } @@ -215,8 +246,13 @@ public class PayloadTrie { if (payloads != null && !payloads.isEmpty()) { for (final Payload payload : payloads) { - final PayloadEmit emit = new PayloadEmit<>(position - payload.getKeyword().length() + 1, position, - payload.getKeyword(), payload.getData()); + int start; + if (isIgnoreWhiteSpace()) { + start = findStart(text, position, payload); + } else { + start = position - payload.getKeyword().length() + 1; + } + final PayloadEmit emit = new PayloadEmit<>(start, position, payload.getKeyword(), payload.getData()); if (trieConfig.isOnlyWholeWords()) { if (!isPartialMatch(text, emit)) { return emit; @@ -232,29 +268,38 @@ public class PayloadTrie { return null; } + private boolean isPartialMatch(final CharSequence searchText, final PayloadEmit emit) { - return (emit.getStart() != 0 && Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) - || (emit.getEnd() + 1 != searchText.length() && Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1))); + + return (emit.getStart() != 0 && Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) || (emit.getEnd() + 1 != searchText.length() && Character.isAlphabetic( + searchText.charAt(emit.getEnd() + 1))); } + private boolean isPartialMatchWhiteSpaceSeparated(final CharSequence searchText, final PayloadEmit emit) { + final long size = searchText.length(); - return (emit.getStart() != 0 && !isWhitespace(searchText.charAt(emit.getStart() - 1))) - || (emit.getEnd() + 1 != size && !isWhitespace(searchText.charAt(emit.getEnd() + 1))); + return (emit.getStart() != 0 && !isWhitespace(searchText.charAt(emit.getStart() - 1))) || (emit.getEnd() + 1 != size && !isWhitespace(searchText.charAt(emit.getEnd() + + 1))); } + private PayloadState getState(PayloadState currentState, final Character character) { + PayloadState newCurrentState = currentState.nextState(character); + var tempState = currentState; while (newCurrentState == null) { - currentState = currentState.failure(); - newCurrentState = currentState.nextState(character); + tempState = tempState.getFailure(); + newCurrentState = tempState.nextState(character); } return newCurrentState; } + private void constructFailureStates() { + final Queue> queue = new LinkedBlockingDeque<>(); final PayloadState startState = getRootState(); @@ -272,9 +317,9 @@ public class PayloadTrie { PayloadState targetState = currentState.nextState(transition); queue.add(targetState); - PayloadState traceFailureState = currentState.failure(); + PayloadState traceFailureState = currentState.getFailure(); while (traceFailureState.nextState(transition) == null) { - traceFailureState = traceFailureState.failure(); + traceFailureState = traceFailureState.getFailure(); } final PayloadState newFailureState = traceFailureState.nextState(transition); @@ -284,13 +329,21 @@ public class PayloadTrie { } } + private boolean processEmits(final CharSequence text, final int position, final Collection> payloads, final PayloadEmitHandler emitHandler) { + boolean emitted = false; for (final Payload payload : payloads) { - final PayloadEmit payloadEmit = new PayloadEmit<>(position - payload.getKeyword().length() + 1, - position, payload.getKeyword(), payload.getData()); - if (!(trieConfig.isOnlyWholeWords() && isPartialMatch(text, payloadEmit)) && - !(trieConfig.isOnlyWholeWordsWhiteSpaceSeparated() && isPartialMatchWhiteSpaceSeparated(text, payloadEmit))) { + int start; + if (isIgnoreWhiteSpace()) { + start = findStart(text, position, payload); + } else { + start = position - payload.getKeyword().length() + 1; + } + final PayloadEmit payloadEmit = new PayloadEmit<>(start, position, payload.getKeyword(), payload.getData()); + if (!(trieConfig.isOnlyWholeWords() && isPartialMatch(text, payloadEmit)) && !(trieConfig.isOnlyWholeWordsWhiteSpaceSeparated() && isPartialMatchWhiteSpaceSeparated( + text, + payloadEmit))) { emitted = emitHandler.emit(payloadEmit) || emitted; if (emitted && trieConfig.isStopOnHit()) { break; @@ -301,41 +354,77 @@ public class PayloadTrie { return emitted; } + + private int findStart(CharSequence text, int position, Payload payload) { + + Deque stack = new LinkedList<>(); + int i; + for (i = 0; i < payload.getKeyword().length(); i++) { + if (isWhitespace(payload.getKeyword().charAt(i))) { + continue; + } + stack.push(isCaseInsensitive() ? toLowerCase(payload.getKeyword().charAt(i)) : payload.getKeyword().charAt(i)); + } + for (i = position; !stack.isEmpty() && i >= 0; --i) { + char c = isCaseInsensitive() ? toLowerCase(text.charAt(i)) : text.charAt(i); + if (c == stack.peek()) { + stack.pop(); + } + } + return i + 1; + } + + private boolean isCaseInsensitive() { + return trieConfig.isCaseInsensitive(); } + + private boolean isIgnoreWhiteSpace() { + + return trieConfig.isIgnoreWhiteSpace(); + } + + private PayloadState getRootState() { + return this.rootState; } + /** * Provides a fluent interface for constructing Trie instances with payloads. - * @param The type of the emitted payload. * + * @param The type of the emitted payload. * @return The builder used to configure its Trie. */ public static PayloadTrieBuilder builder() { + return new PayloadTrieBuilder<>(); } + /** * Builder class to create a PayloadTrie instance. * * @param The type of the emitted payload. */ - public static class PayloadTrieBuilder { + public static final class PayloadTrieBuilder { private final TrieConfig trieConfig = new TrieConfig(); private final PayloadTrie trie = new PayloadTrie<>(trieConfig); + /** * Default (empty) constructor. */ private PayloadTrieBuilder() { + } + /** * Configure the Trie to ignore case when searching for keywords in the text. * This must be called before calling addKeyword because the algorithm converts @@ -345,20 +434,24 @@ public class PayloadTrie { * @return This builder. */ public PayloadTrieBuilder ignoreCase() { + this.trieConfig.setCaseInsensitive(true); return this; } + /** * Configure the Trie to ignore overlapping keywords. * * @return This builder. */ public PayloadTrieBuilder ignoreOverlaps() { + this.trieConfig.setAllowOverlaps(false); return this; } + /** * Adds a keyword to the {@link Trie}'s list of text search keywords. * No {@link Payload} is supplied. @@ -368,10 +461,12 @@ public class PayloadTrie { * @throws NullPointerException if the keyword is null. */ public PayloadTrieBuilder addKeyword(final String keyword) { + this.trie.addKeyword(keyword); return this; } + /** * Adds a keyword and a payload to the {@link Trie}'s list of text * search keywords. @@ -382,10 +477,12 @@ public class PayloadTrie { * @throws NullPointerException if the keyword is null. */ public PayloadTrieBuilder addKeyword(final String keyword, final T payload) { + this.trie.addKeyword(keyword, payload); return this; } + /** * Adds a list of keywords and payloads to the {@link Trie}'s list of * text search keywords. @@ -394,22 +491,26 @@ public class PayloadTrie { * @return This builder. */ public PayloadTrieBuilder addKeywords(final Collection> keywords) { + for (Payload payload : keywords) { this.trie.addKeyword(payload.getKeyword(), payload.getData()); } return this; } + /** * Configure the Trie to match whole keywords in the text. * * @return This builder. */ public PayloadTrieBuilder onlyWholeWords() { + this.trieConfig.setOnlyWholeWords(true); return this; } + /** * Configure the Trie to match whole keywords that are separated by whitespace * in the text. For example, "this keyword thatkeyword" would only match the @@ -418,46 +519,69 @@ public class PayloadTrie { * @return This builder. */ public PayloadTrieBuilder onlyWholeWordsWhiteSpaceSeparated() { + this.trieConfig.setOnlyWholeWordsWhiteSpaceSeparated(true); return this; } + /** * Configure the Trie to stop after the first keyword is found in the text. * * @return This builder. */ public PayloadTrieBuilder stopOnHit() { + trie.trieConfig.setStopOnHit(true); return this; } + /** * Configure the PayloadTrie based on the builder settings. * * @return The configured PayloadTrie. */ public PayloadTrie build() { + this.trie.constructFailureStates(); return this.trie; } + /** * @return This builder. * @deprecated Use ignoreCase() */ @Deprecated public PayloadTrieBuilder caseInsensitive() { + return ignoreCase(); } + /** * @return This builder. * @deprecated Use ignoreOverlaps() */ @Deprecated public PayloadTrieBuilder removeOverlaps() { + return ignoreOverlaps(); } + + + /** + * Configure the Trie to ignore whitespaces. + * + * @return This builder. + */ + public PayloadTrieBuilder ignoreWhiteSpace() { + + trieConfig.setIgnoreWhiteSpace(true); + return this; + } + } + } diff --git a/src/main/java/org/ahocorasick/trie/State.java b/src/main/java/org/ahocorasick/trie/State.java index c1e8d7c..bc8efad 100644 --- a/src/main/java/org/ahocorasick/trie/State.java +++ b/src/main/java/org/ahocorasick/trie/State.java @@ -2,6 +2,9 @@ package org.ahocorasick.trie; import java.util.*; +import lombok.Getter; +import lombok.Setter; + /** *

* A state has various important tasks it must attend to: @@ -26,6 +29,7 @@ public class State { /** * effective the size of the keyword */ + @Getter private final int depth; /** @@ -42,6 +46,8 @@ public class State { /** * if no matching states are found, the failure state will be returned */ + @Setter + @Getter private State failure; /** @@ -49,16 +55,22 @@ public class State { */ private Set emits; + public State() { + this(0); } + public State(final int depth) { + this.depth = depth; this.rootState = depth == 0 ? this : null; } + private State nextState(final Character character, final boolean ignoreRootState) { + State nextState = this.success.get(character); if (!ignoreRootState && nextState == null && this.rootState != null) { @@ -68,15 +80,21 @@ public class State { return nextState; } + public State nextState(final Character character) { + return nextState(character, false); } + public State nextStateIgnoreRootState(Character character) { + return nextState(character, true); } + public State addState(String keyword) { + State state = this; for (final Character character : keyword.toCharArray()) { @@ -86,7 +104,9 @@ public class State { return state; } + public State addState(Character character) { + State nextState = nextStateIgnoreRootState(character); if (nextState == null) { nextState = new State(this.depth + 1); @@ -95,40 +115,39 @@ public class State { return nextState; } - public int getDepth() { - return this.depth; - } public void addEmit(String keyword) { + if (this.emits == null) { this.emits = new TreeSet<>(); } this.emits.add(keyword); } + public void addEmit(Collection emits) { + for (String emit : emits) { addEmit(emit); } } + public Collection emit() { + return this.emits == null ? Collections.emptyList() : this.emits; } - public State failure() { - return this.failure; - } - - public void setFailure(State failState) { - this.failure = failState; - } public Collection getStates() { + return this.success.values(); } + public Collection getTransitions() { + return this.success.keySet(); } + } \ No newline at end of file diff --git a/src/main/java/org/ahocorasick/trie/Token.java b/src/main/java/org/ahocorasick/trie/Token.java index 4e79a35..dcd9814 100644 --- a/src/main/java/org/ahocorasick/trie/Token.java +++ b/src/main/java/org/ahocorasick/trie/Token.java @@ -1,17 +1,25 @@ package org.ahocorasick.trie; public abstract class Token { + private String fragment; + public Token(String fragment) { + this.fragment = fragment; } + public String getFragment() { + return this.fragment; } + public abstract boolean isMatch(); + public abstract Emit getEmit(); + } diff --git a/src/main/java/org/ahocorasick/trie/Trie.java b/src/main/java/org/ahocorasick/trie/Trie.java index 5ffee10..a7838fc 100644 --- a/src/main/java/org/ahocorasick/trie/Trie.java +++ b/src/main/java/org/ahocorasick/trie/Trie.java @@ -15,20 +15,26 @@ import org.ahocorasick.trie.handler.StatefulEmitHandler; * * @author Robert Bor */ -public class Trie { +public final class Trie { private final PayloadTrie payloadTrie; + private Trie(final PayloadTrie payloadTrie) { + this.payloadTrie = payloadTrie; } + public Collection tokenize(final String text) { + Collection> tokens = this.payloadTrie.tokenize(text); return asTokens(tokens); } + private static Collection asTokens(Collection> tokens) { + Collection result = new ArrayList<>(); for (PayloadToken payloadToken : tokens) { result.add(new DefaultToken(payloadToken)); @@ -36,7 +42,9 @@ public class Trie { return result; } + private static Collection asEmits(Collection> emits) { + Collection result = new ArrayList<>(); for (PayloadEmit emit : emits) { result.add(asEmit(emit)); @@ -44,30 +52,40 @@ public class Trie { return result; } + private static Emit asEmit(PayloadEmit payloadEmit) { + return new Emit(payloadEmit.getStart(), payloadEmit.getEnd(), payloadEmit.getKeyword()); } + public Collection parseText(final CharSequence text) { + Collection> parsedText = this.payloadTrie.parseText(text); return asEmits(parsedText); } + @SuppressWarnings("UnusedReturnValue") - public Collection parseText( final CharSequence text, final StatefulEmitHandler emitHandler) { - Collection> parsedText = this.payloadTrie.parseText(text, - new StatefulPayloadEmitDelegateHandler(emitHandler)); + public Collection parseText(final CharSequence text, final StatefulEmitHandler emitHandler) { + + Collection> parsedText = this.payloadTrie.parseText(text, new StatefulPayloadEmitDelegateHandler(emitHandler)); return asEmits(parsedText); } + public boolean containsMatch(final CharSequence text) { + return firstMatch(text) != null; } + public void parseText(final CharSequence text, final EmitHandler emitHandler) { + this.payloadTrie.parseText(text, new PayloadEmitDelegateHandler(emitHandler)); } + /** * The first matching text sequence. * @@ -75,35 +93,38 @@ public class Trie { * @return {@code null} if no matches found. */ public Emit firstMatch(final CharSequence text) { + assert text != null; - final PayloadEmit payload = this.payloadTrie.firstMatch( text ); - return payload == null - ? null - : new Emit( payload.getStart(), - payload.getEnd(), - payload.getKeyword() ); + final PayloadEmit payload = this.payloadTrie.firstMatch(text); + return payload == null ? null : new Emit(payload.getStart(), payload.getEnd(), payload.getKeyword()); } + /** * Provides a fluent interface for constructing Trie instances. * * @return The builder used to configure its Trie. */ public static TrieBuilder builder() { + return new TrieBuilder(); } - public static class TrieBuilder { + + public static final class TrieBuilder { private final PayloadTrieBuilder delegate = PayloadTrie.builder(); + /** * Default (empty) constructor. */ private TrieBuilder() { + } + /** * Configure the Trie to ignore case when searching for keywords in the text. * This must be called before calling addKeyword because the algorithm converts @@ -113,21 +134,37 @@ public class Trie { * @return This builder. */ public TrieBuilder ignoreCase() { + delegate.ignoreCase(); // this.trieConfig.setCaseInsensitive(true); return this; } + /** * Configure the Trie to ignore overlapping keywords. * * @return This builder. */ public TrieBuilder ignoreOverlaps() { + delegate.ignoreOverlaps(); return this; } + + /** + * Configure the Trie to ignore whitespaces. + * + * @return This builder. + */ + public TrieBuilder ignoreWhiteSpace() { + + delegate.ignoreWhiteSpace(); + return this; + } + + /** * Adds a keyword to the Trie's list of text search keywords. * @@ -136,10 +173,12 @@ public class Trie { * @throws NullPointerException if the keyword is null. */ public TrieBuilder addKeyword(final String keyword) { + delegate.addKeyword(keyword, null); return this; } + /** * Adds a list of keywords to the Trie's list of text search keywords. * @@ -147,12 +186,14 @@ public class Trie { * @return This builder. */ public TrieBuilder addKeywords(final String... keywords) { + for (String keyword : keywords) { delegate.addKeyword(keyword, null); } return this; } + /** * Adds a list of keywords to the Trie's list of text search keywords. * @@ -160,23 +201,27 @@ public class Trie { * @return This builder. */ @SuppressWarnings("unused") - public TrieBuilder addKeywords( final Collection keywords ) { + public TrieBuilder addKeywords(final Collection keywords) { + for (String keyword : keywords) { this.delegate.addKeyword(keyword, null); } return this; } + /** * Configure the Trie to match whole keywords in the text. * * @return This builder. */ public TrieBuilder onlyWholeWords() { + this.delegate.onlyWholeWords(); return this; } + /** * Configure the Trie to match whole keywords that are separated by whitespace * in the text. For example, "this keyword thatkeyword" would only match the @@ -185,44 +230,35 @@ public class Trie { * @return This builder. */ public TrieBuilder onlyWholeWordsWhiteSpaceSeparated() { + this.delegate.onlyWholeWordsWhiteSpaceSeparated(); return this; } + /** * Configure the Trie to stop after the first keyword is found in the text. * * @return This builder. */ public TrieBuilder stopOnHit() { + this.delegate.stopOnHit(); return this; } + /** * Configure the Trie based on the builder settings. * * @return The configured Trie. */ public Trie build() { + PayloadTrie payloadTrie = this.delegate.build(); return new Trie(payloadTrie); } - /** - * @return This builder. - * @deprecated Use ignoreCase() - */ - public TrieBuilder caseInsensitive() { - return ignoreCase(); - } - - /** - * @return This builder. - * @deprecated Use ignoreOverlaps() - */ - public TrieBuilder removeOverlaps() { - return ignoreOverlaps(); - } } + } diff --git a/src/main/java/org/ahocorasick/trie/TrieConfig.java b/src/main/java/org/ahocorasick/trie/TrieConfig.java index f7487dd..daf38fb 100644 --- a/src/main/java/org/ahocorasick/trie/TrieConfig.java +++ b/src/main/java/org/ahocorasick/trie/TrieConfig.java @@ -4,51 +4,86 @@ public class TrieConfig { private boolean allowOverlaps = true; - private boolean onlyWholeWords = false; + private boolean onlyWholeWords; - private boolean onlyWholeWordsWhiteSpaceSeparated = false; + private boolean onlyWholeWordsWhiteSpaceSeparated; - private boolean caseInsensitive = false; + private boolean caseInsensitive; + + private boolean ignoreWhiteSpace; + + private boolean stopOnHit; - private boolean stopOnHit = false; public boolean isStopOnHit() { + return stopOnHit; } + public void setStopOnHit(boolean stopOnHit) { + this.stopOnHit = stopOnHit; } + public boolean isAllowOverlaps() { + return allowOverlaps; } + public void setAllowOverlaps(boolean allowOverlaps) { + this.allowOverlaps = allowOverlaps; } + public boolean isOnlyWholeWords() { + return onlyWholeWords; } + public void setOnlyWholeWords(boolean onlyWholeWords) { + this.onlyWholeWords = onlyWholeWords; } + public boolean isOnlyWholeWordsWhiteSpaceSeparated() { + return onlyWholeWordsWhiteSpaceSeparated; } + public void setOnlyWholeWordsWhiteSpaceSeparated(boolean onlyWholeWordsWhiteSpaceSeparated) { + this.onlyWholeWordsWhiteSpaceSeparated = onlyWholeWordsWhiteSpaceSeparated; } + public boolean isCaseInsensitive() { + return caseInsensitive; } + + public boolean isIgnoreWhiteSpace() { + + return ignoreWhiteSpace; + } + + public void setCaseInsensitive(boolean caseInsensitive) { + this.caseInsensitive = caseInsensitive; } + + + public void setIgnoreWhiteSpace(boolean ignoreWhiteSpace) { + + this.ignoreWhiteSpace = ignoreWhiteSpace; + } + } diff --git a/src/main/java/org/ahocorasick/trie/handler/AbstractStatefulEmitHandler.java b/src/main/java/org/ahocorasick/trie/handler/AbstractStatefulEmitHandler.java index eaa170c..4cb208a 100644 --- a/src/main/java/org/ahocorasick/trie/handler/AbstractStatefulEmitHandler.java +++ b/src/main/java/org/ahocorasick/trie/handler/AbstractStatefulEmitHandler.java @@ -9,12 +9,16 @@ public abstract class AbstractStatefulEmitHandler implements StatefulEmitHandler private final List emits = new ArrayList<>(); + public void addEmit(final Emit emit) { + this.emits.add(emit); } + @Override public List getEmits() { + return this.emits; } diff --git a/src/main/java/org/ahocorasick/trie/handler/AbstractStatefulPayloadEmitHandler.java b/src/main/java/org/ahocorasick/trie/handler/AbstractStatefulPayloadEmitHandler.java index 6d5d088..4552f0f 100644 --- a/src/main/java/org/ahocorasick/trie/handler/AbstractStatefulPayloadEmitHandler.java +++ b/src/main/java/org/ahocorasick/trie/handler/AbstractStatefulPayloadEmitHandler.java @@ -9,12 +9,16 @@ public abstract class AbstractStatefulPayloadEmitHandler implements StatefulP private final List> emits = new ArrayList<>(); + public void addEmit(final PayloadEmit emit) { + this.emits.add(emit); } + @Override public List> getEmits() { + return this.emits; } diff --git a/src/main/java/org/ahocorasick/trie/handler/DefaultEmitHandler.java b/src/main/java/org/ahocorasick/trie/handler/DefaultEmitHandler.java index 80a18c1..a1e4935 100644 --- a/src/main/java/org/ahocorasick/trie/handler/DefaultEmitHandler.java +++ b/src/main/java/org/ahocorasick/trie/handler/DefaultEmitHandler.java @@ -9,14 +9,19 @@ public class DefaultEmitHandler implements StatefulEmitHandler { private final List emits = new ArrayList<>(); + @Override public boolean emit(final Emit emit) { + this.emits.add(emit); return true; } + @Override public List getEmits() { + return this.emits; } + } diff --git a/src/main/java/org/ahocorasick/trie/handler/DefaultPayloadEmitHandler.java b/src/main/java/org/ahocorasick/trie/handler/DefaultPayloadEmitHandler.java index 8e7b1c3..0414d4d 100644 --- a/src/main/java/org/ahocorasick/trie/handler/DefaultPayloadEmitHandler.java +++ b/src/main/java/org/ahocorasick/trie/handler/DefaultPayloadEmitHandler.java @@ -9,14 +9,19 @@ public class DefaultPayloadEmitHandler implements StatefulPayloadEmitHandler< private final List> emits = new ArrayList<>(); + @Override public boolean emit(final PayloadEmit emit) { + this.emits.add(emit); return true; } + @Override public List> getEmits() { + return this.emits; } + } diff --git a/src/main/java/org/ahocorasick/trie/handler/EmitHandler.java b/src/main/java/org/ahocorasick/trie/handler/EmitHandler.java index 1332ec2..176bb0b 100644 --- a/src/main/java/org/ahocorasick/trie/handler/EmitHandler.java +++ b/src/main/java/org/ahocorasick/trie/handler/EmitHandler.java @@ -3,5 +3,7 @@ package org.ahocorasick.trie.handler; import org.ahocorasick.trie.Emit; public interface EmitHandler { + boolean emit(Emit emit); + } diff --git a/src/main/java/org/ahocorasick/trie/handler/PayloadEmitDelegateHandler.java b/src/main/java/org/ahocorasick/trie/handler/PayloadEmitDelegateHandler.java index 2d42552..b3c78f7 100644 --- a/src/main/java/org/ahocorasick/trie/handler/PayloadEmitDelegateHandler.java +++ b/src/main/java/org/ahocorasick/trie/handler/PayloadEmitDelegateHandler.java @@ -11,13 +11,17 @@ public class PayloadEmitDelegateHandler implements PayloadEmitHandler { private EmitHandler handler; + public PayloadEmitDelegateHandler(EmitHandler handler) { + this.handler = handler; } + @Override public boolean emit(PayloadEmit emit) { + Emit newEmit = new Emit(emit.getStart(), emit.getEnd(), emit.getKeyword()); return handler.emit(newEmit); } diff --git a/src/main/java/org/ahocorasick/trie/handler/PayloadEmitHandler.java b/src/main/java/org/ahocorasick/trie/handler/PayloadEmitHandler.java index 173c712..d0b9c66 100644 --- a/src/main/java/org/ahocorasick/trie/handler/PayloadEmitHandler.java +++ b/src/main/java/org/ahocorasick/trie/handler/PayloadEmitHandler.java @@ -3,5 +3,7 @@ package org.ahocorasick.trie.handler; import org.ahocorasick.trie.PayloadEmit; public interface PayloadEmitHandler { + boolean emit(PayloadEmit emit); + } diff --git a/src/main/java/org/ahocorasick/trie/handler/StatefulEmitHandler.java b/src/main/java/org/ahocorasick/trie/handler/StatefulEmitHandler.java index 13cb20e..b674271 100644 --- a/src/main/java/org/ahocorasick/trie/handler/StatefulEmitHandler.java +++ b/src/main/java/org/ahocorasick/trie/handler/StatefulEmitHandler.java @@ -5,5 +5,7 @@ import java.util.List; import org.ahocorasick.trie.Emit; public interface StatefulEmitHandler extends EmitHandler { + List getEmits(); + } diff --git a/src/main/java/org/ahocorasick/trie/handler/StatefulPayloadEmitDelegateHandler.java b/src/main/java/org/ahocorasick/trie/handler/StatefulPayloadEmitDelegateHandler.java index 1a8e1da..e7ba5e8 100644 --- a/src/main/java/org/ahocorasick/trie/handler/StatefulPayloadEmitDelegateHandler.java +++ b/src/main/java/org/ahocorasick/trie/handler/StatefulPayloadEmitDelegateHandler.java @@ -15,12 +15,16 @@ public class StatefulPayloadEmitDelegateHandler implements StatefulPayloadEmitHa private StatefulEmitHandler handler; + public StatefulPayloadEmitDelegateHandler(StatefulEmitHandler handler) { + this.handler = handler; } + private static List> asEmits(Collection emits) { + List> result = new ArrayList<>(); for (Emit emit : emits) { result.add(new PayloadEmit(emit.getStart(), emit.getEnd(), emit.getKeyword(), null)); @@ -28,15 +32,20 @@ public class StatefulPayloadEmitDelegateHandler implements StatefulPayloadEmitHa return result; } + @Override public boolean emit(PayloadEmit emit) { + Emit newEmit = new Emit(emit.getStart(), emit.getEnd(), emit.getKeyword()); return handler.emit(newEmit); } + @Override public List> getEmits() { + List emits = this.handler.getEmits(); return asEmits(emits); } + } diff --git a/src/main/java/org/ahocorasick/trie/handler/StatefulPayloadEmitHandler.java b/src/main/java/org/ahocorasick/trie/handler/StatefulPayloadEmitHandler.java index bb42049..c24b71f 100644 --- a/src/main/java/org/ahocorasick/trie/handler/StatefulPayloadEmitHandler.java +++ b/src/main/java/org/ahocorasick/trie/handler/StatefulPayloadEmitHandler.java @@ -4,6 +4,8 @@ import java.util.List; import org.ahocorasick.trie.PayloadEmit; -public interface StatefulPayloadEmitHandler extends PayloadEmitHandler{ +public interface StatefulPayloadEmitHandler extends PayloadEmitHandler { + List> getEmits(); + } diff --git a/src/test/java/org/ahocorasick/interval/IntervalTest.java b/src/test/java/org/ahocorasick/interval/IntervalTest.java index 328b902..63698ab 100644 --- a/src/test/java/org/ahocorasick/interval/IntervalTest.java +++ b/src/test/java/org/ahocorasick/interval/IntervalTest.java @@ -12,38 +12,51 @@ public class IntervalTest { @Test public void test_construct() { + final Interval i = new Interval(1, 3); assertEquals(1, i.getStart()); assertEquals(3, i.getEnd()); } + @Test public void test_size() { + assertEquals(3, new Interval(0, 2).size()); } + @Test public void test_intervaloverlaps() { + assertTrue(new Interval(1, 3).overlapsWith(new Interval(2, 4))); } + @Test public void test_intervalDoesNotOverlap() { + assertFalse(new Interval(1, 13).overlapsWith(new Interval(27, 42))); } + @Test public void test_pointOverlaps() { + assertTrue(new Interval(1, 3).overlapsWith(2)); } + @Test public void test_pointDoesNotOverlap() { + assertFalse(new Interval(1, 13).overlapsWith(42)); } + @Test public void test_comparable() { + final Set intervals = new TreeSet<>(); intervals.add(new Interval(4, 6)); intervals.add(new Interval(2, 7)); @@ -54,13 +67,17 @@ public class IntervalTest { assertEquals(4, it.next().getStart()); } + @Test public void test_checkToString() { + assertEquals("4:6", new Interval(4, 6).toString()); } + @Test public void test_compareToNegativeTest() { + assertEquals(-1, new Interval(4, 6).compareTo(new Object())); } diff --git a/src/test/java/org/ahocorasick/interval/IntervalTreeTest.java b/src/test/java/org/ahocorasick/interval/IntervalTreeTest.java index fc41a3e..7995126 100644 --- a/src/test/java/org/ahocorasick/interval/IntervalTreeTest.java +++ b/src/test/java/org/ahocorasick/interval/IntervalTreeTest.java @@ -12,6 +12,7 @@ public class IntervalTreeTest { @Test public void findOverlaps() { + List intervals = new ArrayList<>(); intervals.add(new Interval(0, 2)); intervals.add(new Interval(1, 3)); @@ -28,8 +29,10 @@ public class IntervalTreeTest { assertOverlap(overlapsIt.next(), 0, 2); } + @Test public void removeOverlaps() { + List intervals = new ArrayList<>(); intervals.add(new Interval(0, 2)); intervals.add(new Interval(4, 5)); @@ -43,7 +46,9 @@ public class IntervalTreeTest { } + protected void assertOverlap(Intervalable interval, int expectedStart, int expectedEnd) { + assertEquals(expectedStart, interval.getStart()); assertEquals(expectedEnd, interval.getEnd()); } diff --git a/src/test/java/org/ahocorasick/interval/IntervalableComparatorByPositionTest.java b/src/test/java/org/ahocorasick/interval/IntervalableComparatorByPositionTest.java index a36c831..e67fdd6 100644 --- a/src/test/java/org/ahocorasick/interval/IntervalableComparatorByPositionTest.java +++ b/src/test/java/org/ahocorasick/interval/IntervalableComparatorByPositionTest.java @@ -12,6 +12,7 @@ public class IntervalableComparatorByPositionTest { @Test public void sortOnPosition() { + List intervals = new ArrayList(); intervals.add(new Interval(4, 5)); intervals.add(new Interval(1, 4)); diff --git a/src/test/java/org/ahocorasick/interval/IntervalableComparatorBySizeTest.java b/src/test/java/org/ahocorasick/interval/IntervalableComparatorBySizeTest.java index 8fc7db1..7c7cb6e 100644 --- a/src/test/java/org/ahocorasick/interval/IntervalableComparatorBySizeTest.java +++ b/src/test/java/org/ahocorasick/interval/IntervalableComparatorBySizeTest.java @@ -12,6 +12,7 @@ public class IntervalableComparatorBySizeTest { @Test public void sortOnSize() { + List intervals = new ArrayList(); intervals.add(new Interval(4, 5)); intervals.add(new Interval(1, 4)); @@ -22,8 +23,10 @@ public class IntervalableComparatorBySizeTest { assertEquals(2, intervals.get(2).size()); } + @Test public void sortOnSizeThenPosition() { + List intervals = new ArrayList(); intervals.add(new Interval(4, 7)); intervals.add(new Interval(2, 5)); diff --git a/src/test/java/org/ahocorasick/trie/EmitTest.java b/src/test/java/org/ahocorasick/trie/EmitTest.java index 33f2d2c..1e8a335 100644 --- a/src/test/java/org/ahocorasick/trie/EmitTest.java +++ b/src/test/java/org/ahocorasick/trie/EmitTest.java @@ -15,18 +15,22 @@ public class EmitTest { */ @Test public void test_Equality_SameValues_ObjectsAreEqual() { + final Emit one = new Emit(13, 42, null); final Emit two = new Emit(13, 42, null); - assertEquals( one, two ); + assertEquals(one, two); } + /** * Test that two {@link Emit} instances having different values are equal. */ @Test public void test_Equality_DifferingValues_ObjectsAreNotEqual() { + final Emit one = new Emit(13, 42, null); final Emit two = new Emit(13, 43, null); assertNotEquals(one, two); } + } diff --git a/src/test/java/org/ahocorasick/trie/PayloadTrieTest.java b/src/test/java/org/ahocorasick/trie/PayloadTrieTest.java index a0fbaf3..026a186 100644 --- a/src/test/java/org/ahocorasick/trie/PayloadTrieTest.java +++ b/src/test/java/org/ahocorasick/trie/PayloadTrieTest.java @@ -17,111 +17,117 @@ import static org.junit.Assert.*; public class PayloadTrieTest { - private final static String[] ALPHABET = new String[] { "abc", "bcd", "cde" }; - private final static String[] ALPHABET_PAYLOAD = new String[] { "alpha:abc", "alpha:bcd", "alpha:cde" }; + private final static String[] ALPHABET = new String[]{"abc", "bcd", "cde"}; + private final static String[] ALPHABET_PAYLOAD = new String[]{"alpha:abc", "alpha:bcd", "alpha:cde"}; - private final static List> ALPHABET_WITH_PAYLOADS = asList( - new Payload<>( ALPHABET[ 0 ], ALPHABET_PAYLOAD[ 0 ] ), - new Payload<>( ALPHABET[ 1 ], ALPHABET_PAYLOAD[ 1 ] ), - new Payload<>( ALPHABET[ 2 ], ALPHABET_PAYLOAD[ 2 ] )); + private final static List> ALPHABET_WITH_PAYLOADS = asList(new Payload<>(ALPHABET[0], ALPHABET_PAYLOAD[0]), + new Payload<>(ALPHABET[1], ALPHABET_PAYLOAD[1]), + new Payload<>(ALPHABET[2], ALPHABET_PAYLOAD[2])); - private final static String[] PRONOUNS = new String[] { "hers", "his", "she", "he" }; - private final static int[] PRONOUNS_PAYLOAD_ID = new int[] { 9, 12, 4, 20 }; + private final static String[] PRONOUNS = new String[]{"hers", "his", "she", "he"}; + private final static int[] PRONOUNS_PAYLOAD_ID = new int[]{9, 12, 4, 20}; - private final static List> PRONOUNS_WITH_PAYLOADS = asList( - new Payload<>( PRONOUNS[ 0 ], PRONOUNS_PAYLOAD_ID[ 0 ] ), - new Payload<>( PRONOUNS[ 1 ], PRONOUNS_PAYLOAD_ID[ 1 ] ), - new Payload<>( PRONOUNS[ 2 ], PRONOUNS_PAYLOAD_ID[ 2 ] ), - new Payload<>( PRONOUNS[ 3 ], PRONOUNS_PAYLOAD_ID[ 3 ] ) - ); + private final static List> PRONOUNS_WITH_PAYLOADS = asList(new Payload<>(PRONOUNS[0], PRONOUNS_PAYLOAD_ID[0]), + new Payload<>(PRONOUNS[1], PRONOUNS_PAYLOAD_ID[1]), + new Payload<>(PRONOUNS[2], PRONOUNS_PAYLOAD_ID[2]), + new Payload<>(PRONOUNS[3], PRONOUNS_PAYLOAD_ID[3])); - private final static String[] FOOD = new String[] { "veal", "cauliflower", "broccoli", "tomatoes" }; - private final static Food[] FOOD_PAYLOAD = new Food[] { new Food("veal"), new Food("cauliflower"), new Food("broccoli"), - new Food("tomatoes") }; + private final static String[] FOOD = new String[]{"veal", "cauliflower", "broccoli", "tomatoes"}; + private final static Food[] FOOD_PAYLOAD = new Food[]{new Food("veal"), new Food("cauliflower"), new Food("broccoli"), new Food("tomatoes")}; - private final static List> FOOD_WITH_PAYLOADS = asList( - new Payload<>( FOOD[ 0 ], FOOD_PAYLOAD[ 0 ] ), - new Payload<>( FOOD[ 1 ], FOOD_PAYLOAD[ 1 ] ), - new Payload<>( FOOD[ 2 ], FOOD_PAYLOAD[ 2 ] ), - new Payload<>( FOOD[ 3 ], FOOD_PAYLOAD[ 3 ] ) - ); + private final static List> FOOD_WITH_PAYLOADS = asList(new Payload<>(FOOD[0], FOOD_PAYLOAD[0]), + new Payload<>(FOOD[1], FOOD_PAYLOAD[1]), + new Payload<>(FOOD[2], FOOD_PAYLOAD[2]), + new Payload<>(FOOD[3], FOOD_PAYLOAD[3])); - private final static String[] GREEK_LETTERS = new String[] { "Alpha", "Beta", "Gamma" }; - private final static String[] GREEK_LETTERS_PAYLOAD = new String[] { "greek:Alpha", "greek:Beta", "greek:Gamma" }; + private final static String[] GREEK_LETTERS = new String[]{"Alpha", "Beta", "Gamma"}; + private final static String[] GREEK_LETTERS_PAYLOAD = new String[]{"greek:Alpha", "greek:Beta", "greek:Gamma"}; - private final static List> GREEK_LETTERS_WITH_PAYLOADS = asList( - new Payload<>( GREEK_LETTERS[ 0 ], GREEK_LETTERS_PAYLOAD[ 0 ] ), - new Payload<>( GREEK_LETTERS[ 1 ], GREEK_LETTERS_PAYLOAD[ 1 ] ), - new Payload<>( GREEK_LETTERS[ 2 ], GREEK_LETTERS_PAYLOAD[ 2 ] )); + private final static List> GREEK_LETTERS_WITH_PAYLOADS = asList(new Payload<>(GREEK_LETTERS[0], GREEK_LETTERS_PAYLOAD[0]), + new Payload<>(GREEK_LETTERS[1], GREEK_LETTERS_PAYLOAD[1]), + new Payload<>(GREEK_LETTERS[2], GREEK_LETTERS_PAYLOAD[2])); - private final static String[] UNICODE = new String[] { "turning", "once", "again", "börkü" }; - private final static String[] UNICODE_PAYLOAD = new String[] { "uni:turning", "uni:once", "uni:again", "uni:börkü" }; + private final static String[] UNICODE = new String[]{"turning", "once", "again", "börkü"}; + private final static String[] UNICODE_PAYLOAD = new String[]{"uni:turning", "uni:once", "uni:again", "uni:börkü"}; - private final static List> UNICODE_WITH_PAYLOADS = asList( - new Payload<>( UNICODE[ 0 ], UNICODE_PAYLOAD[ 0 ] ), - new Payload<>( UNICODE[ 1 ], UNICODE_PAYLOAD[ 1 ] ), - new Payload<>( UNICODE[ 2 ], UNICODE_PAYLOAD[ 2 ] ), - new Payload<>( UNICODE[ 3 ], UNICODE_PAYLOAD[ 3 ] )); + private final static List> UNICODE_WITH_PAYLOADS = asList(new Payload<>(UNICODE[0], UNICODE_PAYLOAD[0]), + new Payload<>(UNICODE[1], UNICODE_PAYLOAD[1]), + new Payload<>(UNICODE[2], UNICODE_PAYLOAD[2]), + new Payload<>(UNICODE[3], UNICODE_PAYLOAD[3])); public static class Food { + private final String name; + public Food(String name) { + this.name = name; } + @Override public int hashCode() { + final int prime = 31; int result = 1; result = prime * result + ((name == null) ? 0 : name.hashCode()); return result; } + @Override - public boolean equals( Object obj ) { - if( this == obj ) { + public boolean equals(Object obj) { + + if (this == obj) { return true; } - if( obj == null ) { + if (obj == null) { return false; } - if( getClass() != obj.getClass() ) { + if (getClass() != obj.getClass()) { return false; } Food other = (Food) obj; - if( name == null ) { + if (name == null) { return other.name == null; - } - else { - return name.equals( other.name ); + } else { + return name.equals(other.name); } } + } + @Test public void keywordAndTextAreTheSame() { + PayloadTrie trie = PayloadTrie.builder().addKeyword(ALPHABET[0], ALPHABET_PAYLOAD[0]).build(); Collection> emits = trie.parseText(ALPHABET[0]); Iterator> iterator = emits.iterator(); checkEmit(iterator.next(), 0, 2, ALPHABET[0], ALPHABET_PAYLOAD[0]); } + @Test public void keywordAndTextAreTheSameFirstMatch() { + PayloadTrie trie = PayloadTrie.builder().addKeyword(ALPHABET[0], ALPHABET_PAYLOAD[0]).build(); PayloadEmit firstMatch = trie.firstMatch(ALPHABET[0]); checkEmit(firstMatch, 0, 2, ALPHABET[0], ALPHABET_PAYLOAD[0]); } + @Test public void textIsLongerThanKeyword() { + PayloadTrie trie = PayloadTrie.builder().addKeyword(ALPHABET[0], ALPHABET_PAYLOAD[0]).build(); Collection> emits = trie.parseText(" " + ALPHABET[0]); Iterator> iterator = emits.iterator(); checkEmit(iterator.next(), 1, 3, ALPHABET[0], ALPHABET_PAYLOAD[0]); } + @Test public void textIsLongerThanKeywordFirstMatch() { @@ -130,23 +136,29 @@ public class PayloadTrieTest { checkEmit(firstMatch, 1, 3, ALPHABET[0], ALPHABET_PAYLOAD[0]); } + @Test public void variousKeywordsOneMatch() { + PayloadTrie trie = PayloadTrie.builder().addKeywords(ALPHABET_WITH_PAYLOADS).build(); Collection> emits = trie.parseText("bcd"); Iterator> iterator = emits.iterator(); checkEmit(iterator.next(), 0, 2, "bcd", "alpha:bcd"); } + @Test public void variousKeywordsFirstMatch() { + PayloadTrie trie = PayloadTrie.builder().addKeywords(ALPHABET_WITH_PAYLOADS).build(); PayloadEmit firstMatch = trie.firstMatch("bcd"); checkEmit(firstMatch, 0, 2, "bcd", "alpha:bcd"); } + @Test public void ushersTestAndStopOnHit() { + PayloadTrie trie = PayloadTrie.builder().addKeywords(PRONOUNS_WITH_PAYLOADS).stopOnHit().build(); Collection> emits = trie.parseText("ushers"); assertEquals(1, emits.size()); // she @ 3, he @ 3, hers @ 5 @@ -154,15 +166,19 @@ public class PayloadTrieTest { checkEmit(iterator.next(), 2, 3, "he", 20); } + @Test public void ushersTestStopOnHitSkipOne() { + PayloadTrie trie = PayloadTrie.builder().addKeywords(PRONOUNS_WITH_PAYLOADS).stopOnHit().build(); StatefulPayloadEmitHandler testEmitHandler = new AbstractStatefulPayloadEmitHandler() { boolean first = true; + @Override public boolean emit(final PayloadEmit emit) { + if (first) { // return false for the first element first = false; @@ -181,8 +197,10 @@ public class PayloadTrieTest { checkEmit(iterator.next(), 1, 3, "she", 4); } + @Test public void ushersTest() { + PayloadTrie trie = PayloadTrie.builder().addKeywords(PRONOUNS_WITH_PAYLOADS).build(); Collection> emits = trie.parseText("ushers"); assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5 @@ -193,10 +211,17 @@ public class PayloadTrieTest { checkEmit(iterator.next(), 2, 5, "hers", 9); } + @Test public void ushersTestWithCapitalKeywords() { - PayloadTrie trie = PayloadTrie.builder().ignoreCase().addKeyword("HERS", "hers").addKeyword("HIS", "his") - .addKeyword("SHE", "she").addKeyword("HE", "he").build(); + + PayloadTrie trie = PayloadTrie.builder() + .ignoreCase() + .addKeyword("HERS", "hers") + .addKeyword("HIS", "his") + .addKeyword("SHE", "she") + .addKeyword("HE", "he") + .build(); Collection> emits = trie.parseText("ushers"); assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5 Iterator> iterator = emits.iterator(); @@ -205,15 +230,19 @@ public class PayloadTrieTest { checkEmit(iterator.next(), 2, 5, "HERS", "hers"); } + @Test public void ushersTestFirstMatch() { + PayloadTrie trie = PayloadTrie.builder().addKeywords(PRONOUNS_WITH_PAYLOADS).build(); PayloadEmit firstMatch = trie.firstMatch("ushers"); checkEmit(firstMatch, 2, 3, "he", 20); } + @Test public void ushersTestByCallback() { + PayloadTrie trie = PayloadTrie.builder().addKeywords(PRONOUNS_WITH_PAYLOADS).build(); final List> emits = new LinkedList<>(); @@ -230,23 +259,29 @@ public class PayloadTrieTest { checkEmit(iterator.next(), 2, 5, "hers", 9); } + @Test public void misleadingTest() { + PayloadTrie trie = PayloadTrie.builder().addKeyword("hers", "pronon:hers").build(); Collection> emits = trie.parseText("h he her hers"); Iterator> iterator = emits.iterator(); checkEmit(iterator.next(), 9, 12, "hers", "pronon:hers"); } + @Test public void misleadingTestFirstMatch() { + PayloadTrie trie = PayloadTrie.builder().addKeyword("hers", "pronon:hers").build(); PayloadEmit firstMatch = trie.firstMatch("h he her hers"); checkEmit(firstMatch, 9, 12, "hers", "pronon:hers"); } + @Test public void recipes() { + PayloadTrie trie = PayloadTrie.builder().addKeywords(FOOD_WITH_PAYLOADS).build(); Collection> emits = trie.parseText("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli"); Iterator> iterator = emits.iterator(); @@ -256,17 +291,20 @@ public class PayloadTrieTest { checkEmit(iterator.next(), 51, 58, "broccoli", new Food("broccoli")); } + @Test public void recipesFirstMatch() { + PayloadTrie trie = PayloadTrie.builder().addKeywords(FOOD_WITH_PAYLOADS).build(); PayloadEmit firstMatch = trie.firstMatch("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli"); checkEmit(firstMatch, 2, 12, "cauliflower", new Food("cauliflower")); } + @Test public void longAndShortOverlappingMatch() { - PayloadTrie trie = PayloadTrie.builder().addKeyword("he", "pronon:he").addKeyword("hehehehe", "garbage") - .build(); + + PayloadTrie trie = PayloadTrie.builder().addKeyword("he", "pronon:he").addKeyword("hehehehe", "garbage").build(); Collection> emits = trie.parseText("hehehehehe"); Iterator> iterator = emits.iterator(); checkEmit(iterator.next(), 0, 1, "he", "pronon:he"); @@ -278,10 +316,16 @@ public class PayloadTrieTest { checkEmit(iterator.next(), 2, 9, "hehehehe", "garbage"); } + @Test public void nonOverlapping() { - PayloadTrie trie = PayloadTrie.builder().ignoreOverlaps().addKeyword("ab", "alpha:ab") - .addKeyword("cba", "alpha:cba").addKeyword("ababc", "alpha:ababc").build(); + + PayloadTrie trie = PayloadTrie.builder() + .ignoreOverlaps() + .addKeyword("ab", "alpha:ab") + .addKeyword("cba", "alpha:cba") + .addKeyword("ababc", "alpha:ababc") + .build(); Collection> emits = trie.parseText("ababcbab"); assertEquals(2, emits.size()); Iterator> iterator = emits.iterator(); @@ -290,49 +334,79 @@ public class PayloadTrieTest { checkEmit(iterator.next(), 6, 7, "ab", "alpha:ab"); } + @Test public void nonOverlappingFirstMatch() { - PayloadTrie trie = PayloadTrie.builder().ignoreOverlaps().addKeyword("ab", "alpha:ab") - .addKeyword("cba", "alpha:cba").addKeyword("ababc", "alpha:ababc").build(); + + PayloadTrie trie = PayloadTrie.builder() + .ignoreOverlaps() + .addKeyword("ab", "alpha:ab") + .addKeyword("cba", "alpha:cba") + .addKeyword("ababc", "alpha:ababc") + .build(); PayloadEmit firstMatch = trie.firstMatch("ababcbab"); checkEmit(firstMatch, 0, 4, "ababc", "alpha:ababc"); } + @Test public void containsMatch() { - PayloadTrie trie = PayloadTrie.builder().ignoreOverlaps().addKeyword("ab", "alpha:ab") - .addKeyword("cba", "alpha:cba").addKeyword("ababc", "alpha:ababc").build(); + + PayloadTrie trie = PayloadTrie.builder() + .ignoreOverlaps() + .addKeyword("ab", "alpha:ab") + .addKeyword("cba", "alpha:cba") + .addKeyword("ababc", "alpha:ababc") + .build(); assertTrue(trie.containsMatch("ababcbab")); } + @Test public void startOfChurchillSpeech() { - PayloadTrie trie = PayloadTrie.builder().ignoreOverlaps().addKeyword("T").addKeyword("u").addKeyword("ur") - .addKeyword("r").addKeyword("urn").addKeyword("ni").addKeyword("i").addKeyword("in").addKeyword("n") - .addKeyword("urning").build(); + + PayloadTrie trie = PayloadTrie.builder() + .ignoreOverlaps() + .addKeyword("T") + .addKeyword("u") + .addKeyword("ur") + .addKeyword("r") + .addKeyword("urn") + .addKeyword("ni") + .addKeyword("i") + .addKeyword("in") + .addKeyword("n") + .addKeyword("urning") + .build(); Collection> emits = trie.parseText("Turning"); assertEquals(2, emits.size()); } + @Test public void partialMatch() { + PayloadTrie trie = PayloadTrie.builder().onlyWholeWords().addKeyword("sugar", "food:sugar").build(); Collection> emits = trie.parseText("sugarcane sugarcane sugar canesugar"); // left, middle, right test assertEquals(1, emits.size()); // Match must not be made checkEmit(emits.iterator().next(), 20, 24, "sugar", "food:sugar"); } + @Test public void partialMatchFirstMatch() { + PayloadTrie trie = PayloadTrie.builder().onlyWholeWords().addKeyword("sugar", "food:sugar").build(); PayloadEmit firstMatch = trie.firstMatch("sugarcane sugarcane sugar canesugar"); // left, middle, right test checkEmit(firstMatch, 20, 24, "sugar", "food:sugar"); } + @Test public void tokenizeFullSentence() { + PayloadTrie trie = PayloadTrie.builder().addKeywords(GREEK_LETTERS_WITH_PAYLOADS).build(); Collection> tokens = trie.tokenize("Hear: Alpha team first, Beta from the rear, Gamma in reserve"); assertEquals(7, tokens.size()); @@ -346,11 +420,12 @@ public class PayloadTrieTest { assertEquals(" in reserve", tokensIt.next().getFragment()); } + // @see https://github.com/robert-bor/aho-corasick/issues/5 @Test public void testStringIndexOutOfBoundsException() { - PayloadTrie trie = PayloadTrie.builder().ignoreCase().onlyWholeWords().addKeywords(UNICODE_WITH_PAYLOADS) - .build(); + + PayloadTrie trie = PayloadTrie.builder().ignoreCase().onlyWholeWords().addKeywords(UNICODE_WITH_PAYLOADS).build(); Collection> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ"); assertEquals(4, emits.size()); // Match must not be made Iterator> it = emits.iterator(); @@ -361,8 +436,10 @@ public class PayloadTrieTest { checkEmit(it.next(), 19, 23, "börkü", "uni:börkü"); } + @Test public void testIgnoreCase() { + PayloadTrie trie = PayloadTrie.builder().ignoreCase().addKeywords(UNICODE_WITH_PAYLOADS).build(); Collection> emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ"); assertEquals(4, emits.size()); // Match must not be made @@ -374,65 +451,75 @@ public class PayloadTrieTest { checkEmit(it.next(), 19, 23, "börkü", "uni:börkü"); } + @Test public void testIgnoreCaseFirstMatch() { + PayloadTrie trie = PayloadTrie.builder().ignoreCase().addKeywords(UNICODE_WITH_PAYLOADS).build(); PayloadEmit firstMatch = trie.firstMatch("TurninG OnCe AgAiN BÖRKÜ"); checkEmit(firstMatch, 0, 6, "turning", "uni:turning"); } + @Test public void tokenizeTokensInSequence() { + PayloadTrie trie = PayloadTrie.builder().addKeywords(GREEK_LETTERS_WITH_PAYLOADS).build(); Collection> tokens = trie.tokenize("Alpha Beta Gamma"); assertEquals(5, tokens.size()); } + // @see https://github.com/robert-bor/aho-corasick/issues/7 @Test public void testZeroLength() { - PayloadTrie trie = PayloadTrie.builder().ignoreOverlaps().onlyWholeWords().ignoreCase().addKeyword("") - .build(); + + PayloadTrie trie = PayloadTrie.builder().ignoreOverlaps().onlyWholeWords().ignoreCase().addKeyword("").build(); trie.tokenize( "Try a natural lip and subtle bronzer to keep all the focus on those big bright eyes with NARS Eyeshadow Duo in Rated R And the winner is... Boots No7 Advanced Renewal Anti-ageing Glycolic Peel Kit ($25 amazon.com) won most-appealing peel."); } + // @see https://github.com/robert-bor/aho-corasick/issues/8 @Test public void testUnicode1() { + String target = "LİKE THIS"; // The second character ('İ') is Unicode, which was read by AC as a 2-byte char assertEquals("THIS", target.substring(5, 9)); // Java does it the right way - PayloadTrie trie = PayloadTrie.builder().ignoreCase().onlyWholeWords().addKeyword("this", "pronon:this") - .build(); + PayloadTrie trie = PayloadTrie.builder().ignoreCase().onlyWholeWords().addKeyword("this", "pronon:this").build(); Collection> emits = trie.parseText(target); assertEquals(1, emits.size()); Iterator> it = emits.iterator(); checkEmit(it.next(), 5, 8, "this", "pronon:this"); } + // @see https://github.com/robert-bor/aho-corasick/issues/8 @Test public void testUnicode2() { + String target = "LİKE THIS"; // The second character ('İ') is Unicode, which was read by AC as a 2-byte char - PayloadTrie trie = PayloadTrie.builder().ignoreCase().onlyWholeWords().addKeyword("this", "pronon:this") - .build(); + PayloadTrie trie = PayloadTrie.builder().ignoreCase().onlyWholeWords().addKeyword("this", "pronon:this").build(); assertEquals("THIS", target.substring(5, 9)); // Java does it the right way PayloadEmit firstMatch = trie.firstMatch(target); checkEmit(firstMatch, 5, 8, "this", "pronon:this"); } + @Test public void testPartialMatchWhiteSpaces() { - PayloadTrie trie = PayloadTrie.builder().onlyWholeWordsWhiteSpaceSeparated() - .addKeyword("#sugar-123", "sugar").build(); + + PayloadTrie trie = PayloadTrie.builder().onlyWholeWordsWhiteSpaceSeparated().addKeyword("#sugar-123", "sugar").build(); Collection> emits = trie.parseText("#sugar-123 #sugar-1234"); // left, middle, right test assertEquals(1, emits.size()); // Match must not be made checkEmit(emits.iterator().next(), 0, 9, "#sugar-123", "sugar"); } + @Test public void testLargeString() { + final int interval = 100; final int textSize = 1000000; final String keyword = FOOD[1]; @@ -448,17 +535,21 @@ public class PayloadTrieTest { assertEquals(textSize / interval, emits.size()); } + @Test public void test_containsMatchWithCaseInsensitive() { + PayloadTrie trie = PayloadTrie.builder().ignoreCase().addKeyword("foo", "bar").build(); assertTrue(trie.containsMatch("FOOBAR")); assertFalse(trie.containsMatch("FO!?AR")); } + // @see https://github.com/robert-bor/aho-corasick/issues/85 @Test public void test_wholeWords() { + PayloadTrie trie = PayloadTrie.builder().addKeyword("foo", "bar").onlyWholeWords().build(); // access via PayloadTrie.parseText(CharSequence) Collection> result1 = trie.parseText("foobar"); @@ -470,9 +561,11 @@ public class PayloadTrieTest { assertEquals(result1, result2); } + // @see https://github.com/robert-bor/aho-corasick/issues/85 @Test public void test_wholeWordsWhiteSpaceSeparated() { + PayloadTrie trie = PayloadTrie.builder().addKeyword("foo", "bar").onlyWholeWordsWhiteSpaceSeparated().build(); // access via PayloadTrie.parseText(CharSequence) Collection> result1 = trie.parseText("foo#bar"); @@ -484,39 +577,31 @@ public class PayloadTrieTest { assertEquals(result1, result2); } - private void checkEmit( - final PayloadEmit next, - final int expectedStart, - final int expectedEnd, - final String expectedKeyword, - final Food expectedPayload) { + + private void checkEmit(final PayloadEmit next, final int expectedStart, final int expectedEnd, final String expectedKeyword, final Food expectedPayload) { + assertEquals("Start of emit should have been " + expectedStart, expectedStart, next.getStart()); assertEquals("End of emit should have been " + expectedEnd, expectedEnd, next.getEnd()); assertEquals("Keyword of emit shoud be " + expectedKeyword, expectedKeyword, next.getKeyword()); assertEquals("Payload of emit shoud be " + expectedPayload, expectedPayload, next.getPayload()); } - private void checkEmit( - final PayloadEmit next, - final int expectedStart, - final int expectedEnd, - final String expectedKeyword, - final Integer expectedPayload) { + + private void checkEmit(final PayloadEmit next, final int expectedStart, final int expectedEnd, final String expectedKeyword, final Integer expectedPayload) { + assertEquals("Start of emit should have been " + expectedStart, expectedStart, next.getStart()); assertEquals("End of emit should have been " + expectedEnd, expectedEnd, next.getEnd()); assertEquals("Keyword of emit shoud be " + expectedKeyword, expectedKeyword, next.getKeyword()); assertEquals("Payload of emit shoud be " + expectedPayload, expectedPayload, next.getPayload()); } - private void checkEmit( - final PayloadEmit next, - final int expectedStart, - final int expectedEnd, - final String expectedKeyword, - final String expectedPayload) { + + private void checkEmit(final PayloadEmit next, final int expectedStart, final int expectedEnd, final String expectedKeyword, final String expectedPayload) { + assertEquals("Start of emit should have been " + expectedStart, expectedStart, next.getStart()); assertEquals("End of emit should have been " + expectedEnd, expectedEnd, next.getEnd()); assertEquals("Keyword of emit shoud be " + expectedKeyword, expectedKeyword, next.getKeyword()); assertEquals("Payload of emit shoud be " + expectedPayload, expectedPayload, next.getPayload()); } + } diff --git a/src/test/java/org/ahocorasick/trie/StateTest.java b/src/test/java/org/ahocorasick/trie/StateTest.java index 02840d9..8db6743 100644 --- a/src/test/java/org/ahocorasick/trie/StateTest.java +++ b/src/test/java/org/ahocorasick/trie/StateTest.java @@ -11,11 +11,9 @@ public class StateTest { @Test public void test_constructSequenceOfCharacters() { + final State rootState = new State(); - rootState - .addState('a') - .addState('b') - .addState('c'); + rootState.addState('a').addState('b').addState('c'); State currentState = rootState.nextState('a'); assertEquals(1, currentState.getDepth()); currentState = currentState.nextState('b'); @@ -26,8 +24,10 @@ public class StateTest { assertNull(currentState); } + @Test public void test_getStates() { + final State rootState = new State(); rootState.addState("foo"); final State currentState = rootState.nextState('f'); @@ -37,8 +37,10 @@ public class StateTest { assertEquals(currentState, states.iterator().next()); } + @Test public void test_getTransitions() { + final State rootState = new State(); rootState.addState("foo"); final State currentState = rootState.nextState('f'); @@ -48,20 +50,23 @@ public class StateTest { assertEquals(Character.valueOf('f'), transitions.iterator().next()); } + @Test - public void test_failure() { + public void test_getFailure() { + final State failureState = new State(); final State rootState = new State(); rootState.setFailure(failureState); - assertEquals(failureState, rootState.failure()); + assertEquals(failureState, rootState.getFailure()); } + @Test public void test_checkEmits() { + final State rootState = new State(); - rootState.addState('a') - .addEmit(Collections.singleton("tag")); + rootState.addState('a').addEmit(Collections.singleton("tag")); final Collection actual = rootState.nextState('a').emit(); assertEquals(1, actual.size()); diff --git a/src/test/java/org/ahocorasick/trie/TestHelper.java b/src/test/java/org/ahocorasick/trie/TestHelper.java index 3893be9..8ba9fc7 100644 --- a/src/test/java/org/ahocorasick/trie/TestHelper.java +++ b/src/test/java/org/ahocorasick/trie/TestHelper.java @@ -6,39 +6,42 @@ import static java.util.concurrent.ThreadLocalRandom.current; * Contains functionality common to tests. */ public class TestHelper { - /** - * Injects keywords into a string builder. - * - * @param source Should contain a bunch of random data that cannot match - * any keyword. - * @param keyword A keyword to inject repeatedly in the text. - * @param interval How often to inject the keyword. - */ - @SuppressWarnings( "SameParameterValue" ) - static void injectKeyword( - final StringBuilder source, - final String keyword, - final int interval ) { - final int length = source.length(); - for( int i = 0; i < length; i += interval ) { - source.replace( i, i + keyword.length(), keyword ); - } - } - /** - * Generates a random sequence of ASCII numbers. - * - * @param count The number of numbers to generate. - * @return A character sequence filled with random digits. - */ - @SuppressWarnings( "SameParameterValue" ) - public static StringBuilder randomNumbers( int count ) { - final StringBuilder sb = new StringBuilder( count ); + /** + * Injects keywords into a string builder. + * + * @param source Should contain a bunch of random data that cannot match + * any keyword. + * @param keyword A keyword to inject repeatedly in the text. + * @param interval How often to inject the keyword. + */ + @SuppressWarnings("SameParameterValue") + static void injectKeyword(final StringBuilder source, final String keyword, final int interval) { - while( --count > 0 ) { - sb.append( current().nextInt( 0, 10 ) ); + final int length = source.length(); + for (int i = 0; i < length; i += interval) { + source.replace(i, i + keyword.length(), keyword); + } + } + + + /** + * Generates a random sequence of ASCII numbers. + * + * @param count The number of numbers to generate. + * @return A character sequence filled with random digits. + */ + @SuppressWarnings("SameParameterValue") + public static StringBuilder randomNumbers(int count) { + + int localCount = count; + final StringBuilder sb = new StringBuilder(localCount); + + while (--localCount > 0) { + sb.append(current().nextInt(0, 10)); + } + + return sb; } - return sb; - } } diff --git a/src/test/java/org/ahocorasick/trie/TrieTest.java b/src/test/java/org/ahocorasick/trie/TrieTest.java index a72d987..302dd99 100644 --- a/src/test/java/org/ahocorasick/trie/TrieTest.java +++ b/src/test/java/org/ahocorasick/trie/TrieTest.java @@ -21,574 +21,570 @@ import static org.junit.Assert.*; * Test the {@link Trie} class functionality. */ public class TrieTest { - private final static String[] ALPHABET = new String[]{ - "abc", "bcd", "cde" - }; - private final static String[] PRONOUNS = new String[]{ - "hers", "his", "she", "he" - }; + private final static String[] ALPHABET = new String[]{"abc", "bcd", "cde"}; - private final static String[] FOOD = new String[]{ - "veal", "cauliflower", "broccoli", "tomatoes" - }; + private final static String[] PRONOUNS = new String[]{"hers", "his", "she", "he"}; - private final static String[] GREEK_LETTERS = new String[]{ - "Alpha", "Beta", "Gamma" - }; + private final static String[] FOOD = new String[]{"veal", "cauliflower", "broccoli", "tomatoes"}; - private final static String[] UNICODE = new String[]{ - "turning", "once", "again", "börkü" - }; + private final static String[] GREEK_LETTERS = new String[]{"Alpha", "Beta", "Gamma"}; - private static Trie trie( final String keyword ) { - return Trie.builder() - .addKeyword( keyword ) - .build(); - } + private final static String[] UNICODE = new String[]{"turning", "once", "again", "börkü"}; - private static Trie trie( final String[] keywords ) { - return Trie.builder() - .addKeywords( keywords ) - .build(); - } - @Test - public void test_KeywordAndTextAreTheSame() { - final Trie trie = trie( ALPHABET[ 0 ] ); - final Collection emits = trie.parseText( ALPHABET[ 0 ] ); - final Iterator iterator = emits.iterator(); - checkEmit( iterator.next(), 0, 2, ALPHABET[ 0 ] ); - } + private static Trie trie(final String keyword) { - @Test - public void test_KeywordAndTextAreTheSameFirstMatch() { - final Trie trie = trie( ALPHABET[ 0 ] ); - final Emit firstMatch = trie.firstMatch( ALPHABET[ 0 ] ); - checkEmit( firstMatch, 0, 2, ALPHABET[ 0 ] ); - } + return Trie.builder().addKeyword(keyword).build(); + } - @Test - public void test_TextIsLongerThanKeyword() { - final Trie trie = trie( ALPHABET[ 0 ] ); - final Collection emits = trie.parseText( " " + ALPHABET[ 0 ] ); - final Iterator iterator = emits.iterator(); - checkEmit( iterator.next(), 1, 3, ALPHABET[ 0 ] ); - } - @Test - public void test_TextIsLongerThanKeywordFirstMatch() { - final Trie trie = trie( ALPHABET[ 0 ] ); - final Emit firstMatch = trie.firstMatch( " " + ALPHABET[ 0 ] ); - checkEmit( firstMatch, 1, 3, ALPHABET[ 0 ] ); - } + private static Trie trieIgnoreWhiteSpace(final String keyword) { - @Test - public void test_VariousKeywordsOneMatch() { - final Trie trie = trie( ALPHABET ); - final Collection emits = trie.parseText( "bcd" ); - final Iterator iterator = emits.iterator(); - checkEmit( iterator.next(), 0, 2, "bcd" ); - } + return Trie.builder().addKeyword(keyword).ignoreWhiteSpace().build(); + } - @Test - public void test_VariousKeywordsFirstMatch() { - final Trie trie = trie( ALPHABET ); - final Emit firstMatch = trie.firstMatch( "bcd" ); - checkEmit( firstMatch, 0, 2, "bcd" ); - } - @Test(expected=AssertionError.class) - public void test_NullInputTextFirstMatch() { - final Trie trie = trie( ALPHABET ); - final Emit firstMatch = trie.firstMatch( null ); - assertNull( firstMatch ); - } + private static Trie trie(final String[] keywords) { - @Test - public void test_UshersTestAndStopOnHit() { - final Trie trie = Trie.builder() - .addKeywords( PRONOUNS ) - .stopOnHit() - .build(); - final Collection emits = trie.parseText( "ushers" ); - assertEquals( 1, emits.size() ); // she @ 3, he @ 3, hers @ 5 - final Iterator iterator = emits.iterator(); - checkEmit( iterator.next(), 2, 3, "he" ); - } + return Trie.builder().addKeywords(keywords).ignoreWhiteSpace().build(); + } - @Test - public void test_UshersTestStopOnHitSkipOne() { - final Trie trie = Trie.builder() - .addKeywords( PRONOUNS ) - .stopOnHit() - .build(); - final StatefulEmitHandler testEmitHandler = - new AbstractStatefulEmitHandler() { - boolean first = true; + private static Trie trieIgnoreWhiteSpace(final String[] keywords) { - @Override - public boolean emit( final Emit emit ) { - if( first ) { - // return false for the first element - first = false; - return false; + return Trie.builder().addKeywords(keywords).ignoreWhiteSpace().build(); + } + + + @Test + public void test_KeywordAndTextAreTheSame() { + + final Trie trie = trie(ALPHABET[0]); + final Collection emits = trie.parseText(ALPHABET[0]); + final Iterator iterator = emits.iterator(); + checkEmit(iterator.next(), 0, 2, ALPHABET[0]); + } + + + @Test + public void test_ignoringWhitespace_KeywordAndTextAreTheSame() { + + final Trie trie = trieIgnoreWhiteSpace(ALPHABET); + final Collection emits = trie.parseText("a b c d e"); + final Iterator iterator = emits.iterator(); + checkEmit(iterator.next(), 0, 4, ALPHABET[0]); + checkEmit(iterator.next(), 2, 6, ALPHABET[1]); + checkEmit(iterator.next(), 4, 8, ALPHABET[2]); + } + + + @Test + public void test_KeywordAndTextAreTheSameFirstMatch() { + + final Trie trie = trie(ALPHABET[0]); + final Emit firstMatch = trie.firstMatch(ALPHABET[0]); + checkEmit(firstMatch, 0, 2, ALPHABET[0]); + } + + + @Test + public void test_TextIsLongerThanKeyword() { + + final Trie trie = trie(ALPHABET[0]); + final Collection emits = trie.parseText(" " + ALPHABET[0]); + final Iterator iterator = emits.iterator(); + checkEmit(iterator.next(), 1, 3, ALPHABET[0]); + } + + + @Test + public void test_TextIsLongerThanKeywordFirstMatch() { + + final Trie trie = trie(ALPHABET[0]); + final Emit firstMatch = trie.firstMatch(" " + ALPHABET[0]); + checkEmit(firstMatch, 1, 3, ALPHABET[0]); + } + + + @Test + public void test_VariousKeywordsOneMatch() { + + final Trie trie = trie(ALPHABET); + final Collection emits = trie.parseText("bcd"); + final Iterator iterator = emits.iterator(); + checkEmit(iterator.next(), 0, 2, "bcd"); + } + + + @Test + public void test_VariousKeywordsFirstMatch() { + + final Trie trie = trie(ALPHABET); + final Emit firstMatch = trie.firstMatch("bc d"); + checkEmit(firstMatch, 0, 3, "bcd"); + } + + + @Test(expected = AssertionError.class) + public void test_NullInputTextFirstMatch() { + + final Trie trie = trie(ALPHABET); + final Emit firstMatch = trie.firstMatch(null); + assertNull(firstMatch); + } + + + @Test + public void test_UshersTestAndStopOnHit() { + + final Trie trie = Trie.builder().addKeywords(PRONOUNS).stopOnHit().build(); + final Collection emits = trie.parseText("ushers"); + assertEquals(1, emits.size()); // she @ 3, he @ 3, hers @ 5 + final Iterator iterator = emits.iterator(); + checkEmit(iterator.next(), 2, 3, "he"); + } + + + @Test + public void test_UshersTestStopOnHitSkipOne() { + + final Trie trie = Trie.builder().addKeywords(PRONOUNS).stopOnHit().build(); + + final StatefulEmitHandler testEmitHandler = new AbstractStatefulEmitHandler() { + boolean first = true; + + + @Override + public boolean emit(final Emit emit) { + + if (first) { + // return false for the first element + first = false; + return false; + } + addEmit(emit); + return true; } - addEmit( emit ); - return true; - } }; - trie.parseText( "ushers", testEmitHandler ); - final Collection emits = testEmitHandler.getEmits(); - assertEquals( 1, emits.size() ); // she @ 3, he @ 3, hers @ 5 - final Iterator iterator = emits.iterator(); - checkEmit( iterator.next(), 1, 3, "she" ); - } + trie.parseText("ushers", testEmitHandler); + final Collection emits = testEmitHandler.getEmits(); + assertEquals(1, emits.size()); // she @ 3, he @ 3, hers @ 5 + final Iterator iterator = emits.iterator(); + checkEmit(iterator.next(), 1, 3, "she"); + } - @Test - public void test_UshersTest() { - final Trie trie = trie( PRONOUNS ); - final Collection emits = trie.parseText( "ushers" ); - assertEquals( 3, emits.size() ); // she @ 3, he @ 3, hers @ 5 - final Iterator iterator = emits.iterator(); - checkEmit( iterator.next(), 2, 3, "he" ); - checkEmit( iterator.next(), 1, 3, "she" ); - checkEmit( iterator.next(), 2, 5, "hers" ); - } - @Test - public void test_UshersTestWithCapitalKeywords() { - final Trie trie = Trie.builder() - .ignoreCase() - .addKeyword( "HERS" ) - .addKeyword( "HIS" ) - .addKeyword( "SHE" ) - .addKeyword( "HE" ) - .build(); - final Collection emits = trie.parseText( "ushers" ); - assertEquals( 3, emits.size() ); // she @ 3, he @ 3, hers @ 5 - final Iterator iterator = emits.iterator(); - checkEmit( iterator.next(), 2, 3, "HE" ); - checkEmit( iterator.next(), 1, 3, "SHE" ); - checkEmit( iterator.next(), 2, 5, "HERS" ); - } + @Test + public void test_UshersTest() { - @Test - public void test_UshersTestFirstMatch() { - final Trie trie = trie( PRONOUNS ); - final Emit firstMatch = trie.firstMatch( "ushers" ); - checkEmit( firstMatch, 2, 3, "he" ); - } + final Trie trie = trie(PRONOUNS); + final Collection emits = trie.parseText("ushers"); + assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5 + final Iterator iterator = emits.iterator(); + checkEmit(iterator.next(), 2, 3, "he"); + checkEmit(iterator.next(), 1, 3, "she"); + checkEmit(iterator.next(), 2, 5, "hers"); + } - @Test - public void test_UshersTestByCallback() { - final Trie trie = trie( PRONOUNS ); - final List emits = new ArrayList<>(); - final EmitHandler emitHandler = emit -> { - emits.add( emit ); - return true; - }; - trie.parseText( "ushers", emitHandler ); - assertEquals( 3, emits.size() ); // she @ 3, he @ 3, hers @ 5 - final Iterator iterator = emits.iterator(); - checkEmit( iterator.next(), 2, 3, "he" ); - checkEmit( iterator.next(), 1, 3, "she" ); - checkEmit( iterator.next(), 2, 5, "hers" ); - } - @Test - public void test_MisleadingTest() { - final Trie trie = trie( "hers" ); - final Collection emits = trie.parseText( "h he her hers" ); - final Iterator iterator = emits.iterator(); - checkEmit( iterator.next(), 9, 12, "hers" ); - } + @Test + public void test_UshersTestWithCapitalKeywords() { - @Test - public void test_MisleadingTestFirstMatch() { - final Trie trie = trie( "hers" ); - final Emit firstMatch = trie.firstMatch( "h he her hers" ); - checkEmit( firstMatch, 9, 12, "hers" ); - } + final Trie trie = Trie.builder().ignoreCase().addKeyword("HERS").addKeyword("HIS").addKeyword("SHE").addKeyword("HE").build(); + final Collection emits = trie.parseText("ushers"); + assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5 + final Iterator iterator = emits.iterator(); + checkEmit(iterator.next(), 2, 3, "HE"); + checkEmit(iterator.next(), 1, 3, "SHE"); + checkEmit(iterator.next(), 2, 5, "HERS"); + } - @Test - public void test_Recipes() { - final Trie trie = trie( FOOD ); - final Collection emits = trie.parseText( - "2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli" ); - final Iterator iterator = emits.iterator(); - checkEmit( iterator.next(), 2, 12, "cauliflower" ); - checkEmit( iterator.next(), 18, 25, "tomatoes" ); - checkEmit( iterator.next(), 40, 43, "veal" ); - checkEmit( iterator.next(), 51, 58, "broccoli" ); - } - @Test - public void test_RecipesFirstMatch() { - final Trie trie = trie( FOOD ); - final Emit firstMatch = trie.firstMatch( - "2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli" ); + @Test + public void test_UshersTestFirstMatch() { - checkEmit( firstMatch, 2, 12, "cauliflower" ); - } + final Trie trie = trie(PRONOUNS); + final Emit firstMatch = trie.firstMatch("ushers"); + checkEmit(firstMatch, 2, 3, "he"); + } - @Test - public void test_LongAndShortOverlappingMatch() { - final Trie trie = Trie.builder() - .addKeyword( "he" ) - .addKeyword( "hehehehe" ) - .build(); - final Collection emits = trie.parseText( "hehehehehe" ); - final Iterator iterator = emits.iterator(); - checkEmit( iterator.next(), 0, 1, "he" ); - checkEmit( iterator.next(), 2, 3, "he" ); - checkEmit( iterator.next(), 4, 5, "he" ); - checkEmit( iterator.next(), 6, 7, "he" ); - checkEmit( iterator.next(), 0, 7, "hehehehe" ); - checkEmit( iterator.next(), 8, 9, "he" ); - checkEmit( iterator.next(), 2, 9, "hehehehe" ); - } - @Test - public void test_NonOverlapping() { - final Trie trie = Trie.builder() - .ignoreOverlaps() - .addKeyword( "ab" ) - .addKeyword( "cba" ) - .addKeyword( "ababc" ) - .build(); - final Collection emits = trie.parseText( "ababcbab" ); - assertEquals( 2, emits.size() ); - final Iterator iterator = emits.iterator(); - // With overlaps: ab@1, ab@3, ababc@4, cba@6, ab@7 - checkEmit( iterator.next(), 0, 4, "ababc" ); - checkEmit( iterator.next(), 6, 7, "ab" ); - } + @Test + public void test_UshersTestByCallback() { - @Test - public void test_NonOverlappingFirstMatch() { - final Trie trie = Trie.builder() - .ignoreOverlaps() - .addKeyword( "ab" ) - .addKeyword( "cba" ) - .addKeyword( "ababc" ) - .build(); - final Emit firstMatch = trie.firstMatch( "ababcbab" ); + final Trie trie = trie(PRONOUNS); + final List emits = new ArrayList<>(); + final EmitHandler emitHandler = emit -> { + emits.add(emit); + return true; + }; + trie.parseText("ushers", emitHandler); + assertEquals(3, emits.size()); // she @ 3, he @ 3, hers @ 5 + final Iterator iterator = emits.iterator(); + checkEmit(iterator.next(), 2, 3, "he"); + checkEmit(iterator.next(), 1, 3, "she"); + checkEmit(iterator.next(), 2, 5, "hers"); + } - checkEmit( firstMatch, 0, 4, "ababc" ); - } - @Test - public void test_ContainsMatch() { - final Trie trie = Trie.builder() - .ignoreOverlaps() - .addKeyword( "ab" ) - .addKeyword( "cba" ) - .addKeyword( "ababc" ) - .build(); - assertTrue( trie.containsMatch( "ababcbab" ) ); - } + @Test + public void test_MisleadingTest() { - @Test - public void test_StartOfChurchillSpeech() { - final Trie trie = Trie.builder() - .ignoreOverlaps() - .addKeyword( "T" ) - .addKeyword( "u" ) - .addKeyword( "ur" ) - .addKeyword( "r" ) - .addKeyword( "urn" ) - .addKeyword( "ni" ) - .addKeyword( "i" ) - .addKeyword( "in" ) - .addKeyword( "n" ) - .addKeyword( "urning" ) - .build(); - final Collection emits = trie.parseText( "Turning" ); - assertEquals( 2, emits.size() ); - } + final Trie trie = trie("hers"); + final Collection emits = trie.parseText("h he her hers"); + final Iterator iterator = emits.iterator(); + checkEmit(iterator.next(), 9, 12, "hers"); + } - @Test - public void test_PartialMatch() { - final Trie trie = Trie.builder() - .onlyWholeWords() - .addKeyword( "sugar" ) - .build(); - final Collection emits = trie.parseText( - "sugarcane sugarcane sugar canesugar" ); // left, middle, right test - assertEquals( 1, emits.size() ); // Match must not be made - checkEmit( emits.iterator().next(), 20, 24, "sugar" ); - } - @Test - public void test_PartialMatchFirstMatch() { - final Trie trie = Trie.builder() - .onlyWholeWords() - .addKeyword( "sugar" ) - .build(); + @Test + public void test_MisleadingTestFirstMatch() { - // left, middle, right test - final Emit firstMatch = - trie.firstMatch( "sugarcane sugarcane sugar canesugar" ); + final Trie trie = trie("hers"); + final Emit firstMatch = trie.firstMatch("h he her hers"); + checkEmit(firstMatch, 9, 12, "hers"); + } - checkEmit( firstMatch, 20, 24, "sugar" ); - } - @Test - public void test_TokenizeFullSentence() { - final Trie trie = trie( GREEK_LETTERS ); - final Collection tokens = trie.tokenize( - "Hear: Alpha team first, Beta from the rear, Gamma in reserve" ); - assertEquals( 7, tokens.size() ); - final Iterator tokensIt = tokens.iterator(); - assertEquals( "Hear: ", tokensIt.next().getFragment() ); - assertEquals( "Alpha", tokensIt.next().getFragment() ); - assertEquals( " team first, ", tokensIt.next().getFragment() ); - assertEquals( "Beta", tokensIt.next().getFragment() ); - assertEquals( " from the rear, ", tokensIt.next().getFragment() ); - assertEquals( "Gamma", tokensIt.next().getFragment() ); - assertEquals( " in reserve", tokensIt.next().getFragment() ); - } + @Test + public void test_Recipes() { - /** - * Test boundary check with case-insensitive matches with whole words. - */ - @Test - public void test_StringIndexOutOfBoundsException() { - final Trie trie = Trie.builder() - .ignoreCase() - .onlyWholeWords() - .addKeywords( UNICODE ) - .build(); - final Collection emits = trie.parseText( "TurninG OnCe AgAiN BÖRKÜ" ); - assertEquals( 4, emits.size() ); // Match must not be made - final Iterator it = emits.iterator(); - checkEmit( it.next(), 0, 6, "turning" ); - checkEmit( it.next(), 8, 11, "once" ); - checkEmit( it.next(), 13, 17, "again" ); - checkEmit( it.next(), 19, 23, "börkü" ); - } + final Trie trie = trie(FOOD); + final Collection emits = trie.parseText("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli"); + final Iterator iterator = emits.iterator(); + checkEmit(iterator.next(), 2, 12, "cauliflower"); + checkEmit(iterator.next(), 18, 25, "tomatoes"); + checkEmit(iterator.next(), 40, 43, "veal"); + checkEmit(iterator.next(), 51, 58, "broccoli"); + } - @Test - public void test_IgnoreCase() { - final Trie trie = Trie.builder() - .ignoreCase() - .addKeywords( UNICODE ) - .build(); - final Collection emits = trie.parseText( "TurninG OnCe AgAiN BÖRKÜ" ); - assertEquals( 4, emits.size() ); // Match must not be made - final Iterator it = emits.iterator(); - checkEmit( it.next(), 0, 6, "turning" ); - checkEmit( it.next(), 8, 11, "once" ); - checkEmit( it.next(), 13, 17, "again" ); - checkEmit( it.next(), 19, 23, "börkü" ); - } - @Test - public void test_IgnoreCaseFirstMatch() { - final Trie trie = Trie.builder() - .ignoreCase() - .addKeywords( UNICODE ) - .build(); - final Emit firstMatch = trie.firstMatch( "TurninG OnCe AgAiN BÖRKÜ" ); + @Test + public void test_RecipesFirstMatch() { - checkEmit( firstMatch, 0, 6, "turning" ); - } + final Trie trie = trie(FOOD); + final Emit firstMatch = trie.firstMatch("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli"); - @Test - public void test_TokenizeTokensInSequence() { - final Trie trie = trie( GREEK_LETTERS ); - final Collection tokens = trie.tokenize( "Alpha Beta Gamma" ); - assertEquals( 5, tokens.size() ); - } + checkEmit(firstMatch, 2, 12, "cauliflower"); + } - /** - * Fix adding a word of size 0 ("") as a dictionary. A bug in the dictionary - * parsing code (at end of line) caused it to generate words of 0 length, - * which were being added to the trie. Removing the additional commas - * resolved the issue. - */ - @Test - public void test_ZeroLength() { - final Trie trie = Trie.builder() - .ignoreOverlaps() - .onlyWholeWords() - .ignoreCase() - .addKeyword( "" ) - .build(); - trie.tokenize( - "Try a natural lip and subtle bronzer to keep all the focus on those " + - "big bright eyes with NARS Eyeshadow Duo in Rated R And the " + - "winner is... Boots No7 Advanced Renewal Anti-ageing Glycolic " + - "Peel Kit ($25 amazon.com) won most-appealing peel." ); - } - @Test - public void test_Emit_PunctuatedKeyword_AllOffsetsFound() { - final String keyword = "{{var}}"; - final int len = keyword.length() - 1; - final Trie trie = builder() - .ignoreOverlaps() - .addKeyword( keyword ) - .build(); + @Test + public void test_LongAndShortOverlappingMatch() { - final Collection emits = trie.parseText( - format( "__%s__ **%s** {{%s}} %s%s", - keyword, keyword, keyword, keyword, keyword ) - ); + final Trie trie = Trie.builder().addKeyword("he").addKeyword("hehehehe").build(); + final Collection emits = trie.parseText("hehehehehe"); + final Iterator iterator = emits.iterator(); + checkEmit(iterator.next(), 0, 1, "he"); + checkEmit(iterator.next(), 2, 3, "he"); + checkEmit(iterator.next(), 4, 5, "he"); + checkEmit(iterator.next(), 6, 7, "he"); + checkEmit(iterator.next(), 0, 7, "hehehehe"); + checkEmit(iterator.next(), 8, 9, "he"); + checkEmit(iterator.next(), 2, 9, "hehehehe"); + } - assertEquals( 5, emits.size() ); - final Iterator it = emits.iterator(); - checkEmit( it.next(), 2, 2 + len, keyword ); - checkEmit( it.next(), 14, 14 + len, keyword ); - checkEmit( it.next(), 26, 26 + len, keyword ); - checkEmit( it.next(), 36, 36 + len, keyword ); - checkEmit( it.next(), 43, 43 + len, keyword ); - } + @Test + public void test_NonOverlapping() { - /** - * Notice the capital I with a dot. The code used to compute the offsets - * at (6, 9), which caused {@link Trie#tokenize(String)} to crash because - * 9 is past the end of the string. That character is two bytes wide, so it - * pushes the offset calculation off. - */ - @Test - public void test_Unicode1() { - // The second character ('İ') is - // Unicode, which was read by AC as a 2-byte char - final String target = "LİKE THIS"; - // Java does it the right way - assertEquals( "THIS", - target.substring( 5, 9 ) ); - final Trie trie = Trie.builder().ignoreCase().onlyWholeWords() - .addKeyword( "this" ) - .build(); - final Collection emits = trie.parseText( target ); - assertEquals( 1, emits.size() ); - final Iterator it = emits.iterator(); - checkEmit( it.next(), 5, 8, "this" ); - } + final Trie trie = Trie.builder().ignoreOverlaps().addKeyword("ab").addKeyword("cba").addKeyword("ababc").build(); + final Collection emits = trie.parseText("ababcbab"); + assertEquals(2, emits.size()); + final Iterator iterator = emits.iterator(); + // With overlaps: ab@1, ab@3, ababc@4, cba@6, ab@7 + checkEmit(iterator.next(), 0, 4, "ababc"); + checkEmit(iterator.next(), 6, 7, "ab"); + } - /** - * Notice the capital I with a dot. The code used to compute the offsets - * at (6, 9), which caused {@link Trie#tokenize(String)} to crash because - * 9 is past the end of the string. That character is two bytes wide, so it - * pushes the offset calculation off. - */ - @Test - public void test_Unicode2() { - // The second character ('İ') is - // Unicode, which was read by AC as a 2-byte char - final String target = "LİKE THIS"; - final Trie trie = Trie.builder() - .ignoreCase() - .onlyWholeWords() - .addKeyword( "this" ) - .build(); - // Java does it the right way - assertEquals( "THIS", - target.substring( 5, 9 ) ); - final Emit firstMatch = trie.firstMatch( target ); - checkEmit( firstMatch, 5, 8, "this" ); - } - @Test - public void test_PartialMatchWhiteSpaces() { - final Trie trie = Trie.builder() - .onlyWholeWordsWhiteSpaceSeparated() - .addKeyword( "#sugar-123" ) - .build(); - final Collection emits = - trie.parseText( "#sugar-123 #sugar-1234" ); // left, middle, right test - assertEquals( 1, emits.size() ); // Match must not be made - checkEmit( emits.iterator().next(), 0, 9, "#sugar-123" ); - } + @Test + public void test_NonOverlappingFirstMatch() { - @Test - public void test_LargeString() { - final int interval = 100; - final int textSize = 1000000; - final String keyword = FOOD[ 1 ]; - final StringBuilder text = randomNumbers( textSize ); + final Trie trie = Trie.builder().ignoreOverlaps().addKeyword("ab").addKeyword("cba").addKeyword("ababc").build(); + final Emit firstMatch = trie.firstMatch("ababcbab"); - injectKeyword( text, keyword, interval ); + checkEmit(firstMatch, 0, 4, "ababc"); + } - final Trie trie = Trie.builder() - .onlyWholeWords() - .addKeyword( keyword ) - .build(); - final Collection emits = trie.parseText( text ); + @Test + public void test_ContainsMatch() { - assertEquals( textSize / interval, emits.size() ); - } + final Trie trie = Trie.builder().ignoreOverlaps().addKeyword("ab").addKeyword("cba").addKeyword("ababc").build(); + assertTrue(trie.containsMatch("ababcbab")); + } - @Test - public void test_UnicodeIssueBug39ReportedByHumanzz() { - // Problem: "İ".length => 1, "İ".toLowerCase().length => 2. This causes - // all sorts of unexpected behaviors - // and bugs where the Emit will have a size different from the original - // string. - // Soln: As in issue #8, convert at character level Character.toLowerCase - // ('İ') => 'i' + make sure - // that emit gets the properly cased keyword. - final String upperLengthOne = "İnt"; - final Trie trie = Trie.builder() - .ignoreCase() - .onlyWholeWords() - .addKeyword( upperLengthOne ) - .build(); - final Collection emits = trie.parseText( "İnt is good" ); - assertEquals( 1, emits.size() ); - checkEmit( emits.iterator().next(), 0, 2, upperLengthOne ); - } - @Test(timeout = 30_000) - public void test_ParallelSearch() throws InterruptedException { - final int interval = 100; - final int textSize = 1000000; - final String keyword = FOOD[ 1 ]; - final StringBuilder matchingText = randomNumbers( textSize ); - injectKeyword( matchingText, keyword, interval ); - final StringBuilder nonMatchingText = randomNumbers( textSize ); - injectKeyword( nonMatchingText, - keyword.substring( 0, keyword.length() - 1 ), - interval ); + @Test + public void test_StartOfChurchillSpeech() { - final Trie trie = Trie.builder() - .onlyWholeWords() - .addKeyword( keyword ) - .build(); + final Trie trie = Trie.builder() + .ignoreOverlaps() + .addKeyword("T") + .addKeyword("u") + .addKeyword("ur") + .addKeyword("r") + .addKeyword("urn") + .addKeyword("ni") + .addKeyword("i") + .addKeyword("in") + .addKeyword("n") + .addKeyword("urning") + .build(); + final Collection emits = trie.parseText("Turning"); + assertEquals(2, emits.size()); + } - final AtomicInteger matchCount = new AtomicInteger( 0 ); - final Runnable matchingTask = () -> matchCount.set( - trie.parseText( matchingText ).size() ); - final AtomicInteger nonMatchCount = new AtomicInteger( 0 ); - final Runnable nonMatchingTask = () -> nonMatchCount.set( trie.parseText( - nonMatchingText ).size() ); - final Thread matchingThread = new Thread( matchingTask ); - final Thread nonMatchingThread = new Thread( nonMatchingTask ); - matchingThread.start(); - nonMatchingThread.start(); - matchingThread.join(); - nonMatchingThread.join(); + @Test + public void test_PartialMatch() { - assertEquals( textSize / interval, matchCount.get() ); - assertEquals( 0, nonMatchCount.get() ); - } + final Trie trie = Trie.builder().onlyWholeWords().addKeyword("sugar").build(); + final Collection emits = trie.parseText("sugarcane sugarcane sugar canesugar"); // left, middle, right test + assertEquals(1, emits.size()); // Match must not be made + checkEmit(emits.iterator().next(), 20, 24, "sugar"); + } + + + @Test + public void test_PartialMatchFirstMatch() { + + final Trie trie = Trie.builder().onlyWholeWords().addKeyword("sugar").build(); + + // left, middle, right test + final Emit firstMatch = trie.firstMatch("sugarcane sugarcane sugar canesugar"); + + checkEmit(firstMatch, 20, 24, "sugar"); + } + + + @Test + public void test_TokenizeFullSentence() { + + final Trie trie = trie(GREEK_LETTERS); + final Collection tokens = trie.tokenize("Hear: Alpha team first, Beta from the rear, Gamma in reserve"); + assertEquals(7, tokens.size()); + final Iterator tokensIt = tokens.iterator(); + assertEquals("Hear: ", tokensIt.next().getFragment()); + assertEquals("Alpha", tokensIt.next().getFragment()); + assertEquals(" team first, ", tokensIt.next().getFragment()); + assertEquals("Beta", tokensIt.next().getFragment()); + assertEquals(" from the rear, ", tokensIt.next().getFragment()); + assertEquals("Gamma", tokensIt.next().getFragment()); + assertEquals(" in reserve", tokensIt.next().getFragment()); + } + + + /** + * Test boundary check with case-insensitive matches with whole words. + */ + @Test + public void test_StringIndexOutOfBoundsException() { + + final Trie trie = Trie.builder().ignoreCase().onlyWholeWords().addKeywords(UNICODE).build(); + final Collection emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ"); + assertEquals(4, emits.size()); // Match must not be made + final Iterator it = emits.iterator(); + checkEmit(it.next(), 0, 6, "turning"); + checkEmit(it.next(), 8, 11, "once"); + checkEmit(it.next(), 13, 17, "again"); + checkEmit(it.next(), 19, 23, "börkü"); + } + + + @Test + public void test_IgnoreCase() { + + final Trie trie = Trie.builder().ignoreCase().addKeywords(UNICODE).build(); + final Collection emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ"); + assertEquals(4, emits.size()); // Match must not be made + final Iterator it = emits.iterator(); + checkEmit(it.next(), 0, 6, "turning"); + checkEmit(it.next(), 8, 11, "once"); + checkEmit(it.next(), 13, 17, "again"); + checkEmit(it.next(), 19, 23, "börkü"); + } + + + @Test + public void test_IgnoreCaseFirstMatch() { + + final Trie trie = Trie.builder().ignoreCase().addKeywords(UNICODE).build(); + final Emit firstMatch = trie.firstMatch("TurninG OnCe AgAiN BÖRKÜ"); + + checkEmit(firstMatch, 0, 6, "turning"); + } + + + @Test + public void test_TokenizeTokensInSequence() { + + final Trie trie = trie(GREEK_LETTERS); + final Collection tokens = trie.tokenize("Alpha Beta Gamma"); + assertEquals(5, tokens.size()); + } + + + /** + * Fix adding a word of size 0 ("") as a dictionary. A bug in the dictionary + * parsing code (at end of line) caused it to generate words of 0 length, + * which were being added to the trie. Removing the additional commas + * resolved the issue. + */ + @Test + public void test_ZeroLength() { + + final Trie trie = Trie.builder().ignoreOverlaps().onlyWholeWords().ignoreCase().addKeyword("").build(); + trie.tokenize("Try a natural lip and subtle bronzer to keep all the focus on those " + + "big bright eyes with NARS Eyeshadow Duo in Rated R And the " + + "winner is... Boots No7 Advanced Renewal Anti-ageing Glycolic " + + "Peel Kit ($25 amazon.com) won most-appealing peel."); + } + + + @Test + public void test_Emit_PunctuatedKeyword_AllOffsetsFound() { + + final String keyword = "{{var}}"; + final int len = keyword.length() - 1; + final Trie trie = builder().ignoreOverlaps().addKeyword(keyword).build(); + + final Collection emits = trie.parseText(format("__%s__ **%s** {{%s}} %s%s", keyword, keyword, keyword, keyword, keyword)); + + assertEquals(5, emits.size()); + final Iterator it = emits.iterator(); + + checkEmit(it.next(), 2, 2 + len, keyword); + checkEmit(it.next(), 14, 14 + len, keyword); + checkEmit(it.next(), 26, 26 + len, keyword); + checkEmit(it.next(), 36, 36 + len, keyword); + checkEmit(it.next(), 43, 43 + len, keyword); + } + + + /** + * Notice the capital I with a dot. The code used to compute the offsets + * at (6, 9), which caused {@link Trie#tokenize(String)} to crash because + * 9 is past the end of the string. That character is two bytes wide, so it + * pushes the offset calculation off. + */ + @Test + public void test_Unicode1() { + // The second character ('İ') is + // Unicode, which was read by AC as a 2-byte char + final String target = "LİKE THIS"; + // Java does it the right way + assertEquals("THIS", target.substring(5, 9)); + final Trie trie = Trie.builder().ignoreCase().onlyWholeWords().addKeyword("this").build(); + final Collection emits = trie.parseText(target); + assertEquals(1, emits.size()); + final Iterator it = emits.iterator(); + checkEmit(it.next(), 5, 8, "this"); + } + + + /** + * Notice the capital I with a dot. The code used to compute the offsets + * at (6, 9), which caused {@link Trie#tokenize(String)} to crash because + * 9 is past the end of the string. That character is two bytes wide, so it + * pushes the offset calculation off. + */ + @Test + public void test_Unicode2() { + // The second character ('İ') is + // Unicode, which was read by AC as a 2-byte char + final String target = "LİKE THIS"; + final Trie trie = Trie.builder().ignoreCase().onlyWholeWords().addKeyword("this").build(); + // Java does it the right way + assertEquals("THIS", target.substring(5, 9)); + final Emit firstMatch = trie.firstMatch(target); + checkEmit(firstMatch, 5, 8, "this"); + } + + + @Test + public void test_PartialMatchWhiteSpaces() { + + final Trie trie = Trie.builder().onlyWholeWordsWhiteSpaceSeparated().addKeyword("#sugar-123").build(); + final Collection emits = trie.parseText("#sugar-123 #sugar-1234"); // left, middle, right test + assertEquals(1, emits.size()); // Match must not be made + checkEmit(emits.iterator().next(), 0, 9, "#sugar-123"); + } + + + @Test + public void test_LargeString() { + + final int interval = 100; + final int textSize = 1000000; + final String keyword = FOOD[1]; + final StringBuilder text = randomNumbers(textSize); + + injectKeyword(text, keyword, interval); + + final Trie trie = Trie.builder().onlyWholeWords().addKeyword(keyword).build(); + + final Collection emits = trie.parseText(text); + + assertEquals(textSize / interval, emits.size()); + } + + + @Test + public void test_UnicodeIssueBug39ReportedByHumanzz() { + // Problem: "İ".length => 1, "İ".toLowerCase().length => 2. This causes + // all sorts of unexpected behaviors + // and bugs where the Emit will have a size different from the original + // string. + // Soln: As in issue #8, convert at character level Character.toLowerCase + // ('İ') => 'i' + make sure + // that emit gets the properly cased keyword. + final String upperLengthOne = "İnt"; + final Trie trie = Trie.builder().ignoreCase().onlyWholeWords().addKeyword(upperLengthOne).build(); + final Collection emits = trie.parseText("İnt is good"); + assertEquals(1, emits.size()); + checkEmit(emits.iterator().next(), 0, 2, upperLengthOne); + } + + + @Test(timeout = 30_000) + public void test_ParallelSearch() throws InterruptedException { + + final int interval = 100; + final int textSize = 1000000; + final String keyword = FOOD[1]; + final StringBuilder matchingText = randomNumbers(textSize); + injectKeyword(matchingText, keyword, interval); + final StringBuilder nonMatchingText = randomNumbers(textSize); + injectKeyword(nonMatchingText, keyword.substring(0, keyword.length() - 1), interval); + + final Trie trie = Trie.builder().onlyWholeWords().addKeyword(keyword).build(); + + final AtomicInteger matchCount = new AtomicInteger(0); + final Runnable matchingTask = () -> matchCount.set(trie.parseText(matchingText).size()); + + final AtomicInteger nonMatchCount = new AtomicInteger(0); + final Runnable nonMatchingTask = () -> nonMatchCount.set(trie.parseText(nonMatchingText).size()); + final Thread matchingThread = new Thread(matchingTask); + final Thread nonMatchingThread = new Thread(nonMatchingTask); + matchingThread.start(); + nonMatchingThread.start(); + matchingThread.join(); + nonMatchingThread.join(); + + assertEquals(textSize / interval, matchCount.get()); + assertEquals(0, nonMatchCount.get()); + } + + + private void checkEmit(Emit next, int expectedStart, int expectedEnd, String expectedKeyword) { + + assertEquals("Start of emit should have been " + expectedStart, expectedStart, next.getStart()); + assertEquals("End of emit should have been " + expectedEnd, expectedEnd, next.getEnd()); + assertEquals(expectedKeyword, next.getKeyword()); + } - private void checkEmit( Emit next, int expectedStart, int expectedEnd, - String expectedKeyword ) { - assertEquals( "Start of emit should have been " + expectedStart, - expectedStart, - next.getStart() ); - assertEquals( "End of emit should have been " + expectedEnd, - expectedEnd, - next.getEnd() ); - assertEquals( expectedKeyword, next.getKeyword() ); - } }