Pull request #394: RED-3800 String Performance matching test

Merge in RED/redaction-service from RED-3800-string-performance-test to master

* commit '21d717f0837c9cc9b2f5b251d372db88b25a7b6d':
  RED-3800 String Performance matching test
  RED-3800 String Performance matching test
This commit is contained in:
Timo Bejan 2022-05-24 11:09:08 +02:00
commit 77584c9a5a
3 changed files with 7002 additions and 0 deletions

View File

@ -0,0 +1,85 @@
package com.iqser.red.service.redaction.v1.server.stringmatching;
import lombok.AllArgsConstructor;
import lombok.EqualsAndHashCode;
import lombok.SneakyThrows;
import org.ahocorasick.trie.Trie;
import org.apache.commons.io.IOUtils;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.core.io.ClassPathResource;
import org.springframework.test.context.junit4.SpringRunner;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
/**
 * Compares three ways of locating every dictionary entry inside one large text and
 * verifies that all of them report exactly the same set of match positions:
 * <ol>
 *   <li>repeated {@link String#indexOf(String, int)} calls ("naive"),</li>
 *   <li>one quoted literal {@link Pattern} per entry (labelled "Boyer Moore" in the output),</li>
 *   <li>a single Aho-Corasick {@link Trie} built from the whole dictionary.</li>
 * </ol>
 * Timings are printed to standard output for manual inspection; only the equality of the
 * results is asserted.
 * <p>
 * NOTE(review): no Spring bean is used by this test, so the {@code SpringRunner} looks
 * unnecessary - confirm before removing it.
 */
@RunWith(SpringRunner.class)
public class StringMatchingPerformanceTest {

    private static final String TEXT_RESOURCE = "stringmatching/hamlet.txt";
    private static final String DICTIONARY_RESOURCE = "stringmatching/names.txt";


    @Test
    @SneakyThrows
    public void testStringPerformance() {

        String text = loadText();
        Set<String> dictionary = loadDictionary();

        System.out.println("Loaded text has a length of " + text.length() + " symbols");
        System.out.println("Dictionary has " + dictionary.size() + " entries");

        // Pattern compilation and trie construction happen before the timers start,
        // so only the search phase of each approach is measured.
        var patterns = dictionary.stream()
                .map(entry -> Pattern.compile(Pattern.quote(entry)))
                .collect(Collectors.toList());
        var trie = Trie.builder().ignoreCase().addKeywords(dictionary).build();

        // 1. Naive approach
        long started = System.nanoTime();
        Set<Index> naiveIndexes = findWithIndexOf(text, dictionary);
        System.out.println("Naive approach found " + naiveIndexes.size() + " entries in " + elapsedMillis(started) + "ms");

        // 2. Boyer Moore
        started = System.nanoTime();
        Set<Index> boyerMooreIndexes = findWithPatterns(text, patterns);
        System.out.println("Boyer Moore found " + boyerMooreIndexes.size() + " entries in " + elapsedMillis(started) + "ms");

        // 3. Aho Corasick
        started = System.nanoTime();
        Set<Index> ahoCorasickIndexes = trie.parseText(text)
                .stream()
                // the emit's end is inclusive, Index uses an exclusive end
                .map(emit -> new Index(emit.getStart(), emit.getEnd() + 1))
                .collect(Collectors.toSet());
        System.out.println("Aho Corasick found " + ahoCorasickIndexes.size() + " entries in " + elapsedMillis(started) + "ms");

        // Assert that all algorithms are equal
        assertThat(boyerMooreIndexes).as("Boyer Moore vs. naive").isEqualTo(naiveIndexes);
        assertThat(ahoCorasickIndexes).as("Aho Corasick vs. naive").isEqualTo(naiveIndexes);
    }


    /**
     * Reads the text to search in and lower-cases it.
     * Assumes the resource is UTF-8 encoded - TODO confirm.
     */
    @SneakyThrows
    private static String loadText() {

        try (var in = new ClassPathResource(TEXT_RESOURCE).getInputStream()) {
            return IOUtils.toString(in, StandardCharsets.UTF_8).toLowerCase(Locale.ROOT);
        }
    }


    /**
     * Reads the dictionary (one entry per line), lower-cases every entry and drops empty ones.
     * Empty entries are removed because an empty needle matches at every position and would
     * make the index-based scan loop forever.
     * Assumes the resource is UTF-8 encoded - TODO confirm.
     */
    @SneakyThrows
    private static Set<String> loadDictionary() {

        try (var in = new ClassPathResource(DICTIONARY_RESOURCE).getInputStream()) {
            return IOUtils.readLines(in, StandardCharsets.UTF_8)
                    .stream()
                    .map(line -> line.toLowerCase(Locale.ROOT))
                    .filter(line -> !line.isEmpty())
                    .collect(Collectors.toSet());
        }
    }


    /**
     * Finds all (including overlapping) occurrences of every entry via {@code String.indexOf}.
     *
     * @param text       the text to search in
     * @param dictionary the entries to look for; empty entries are skipped
     * @return the positions of all matches, end exclusive
     */
    private static Set<Index> findWithIndexOf(String text, Collection<String> dictionary) {

        Set<Index> matches = new HashSet<>();
        for (String entry : dictionary) {
            if (entry.isEmpty()) {
                continue;
            }
            // Start at offset 0 so a hit at the very beginning of the text is not skipped,
            // then advance by one character to also catch overlapping occurrences.
            for (int at = text.indexOf(entry); at != -1; at = text.indexOf(entry, at + 1)) {
                matches.add(new Index(at, at + entry.length()));
            }
        }
        return matches;
    }


    /**
     * Finds all (including overlapping) occurrences of every pattern in the text.
     * {@code Matcher.results()} would only yield non-overlapping matches, which disagrees with
     * the other two approaches for self-overlapping entries, so the search is restarted one
     * character after each match start instead.
     *
     * @param text     the text to search in
     * @param patterns the compiled literal patterns; expected not to match the empty string
     * @return the positions of all matches, end exclusive
     */
    private static Set<Index> findWithPatterns(String text, Collection<Pattern> patterns) {

        Set<Index> matches = new HashSet<>();
        for (Pattern pattern : patterns) {
            var matcher = pattern.matcher(text);
            int from = 0;
            // find(int) throws if the offset exceeds the text length, hence the bound check
            while (from <= text.length() && matcher.find(from)) {
                matches.add(new Index(matcher.start(), matcher.end()));
                from = matcher.start() + 1;
            }
        }
        return matches;
    }


    /** Milliseconds elapsed since the given {@link System#nanoTime()} reading. */
    private static long elapsedMillis(long startedNanos) {

        return (System.nanoTime() - startedNanos) / 1_000_000L;
    }


    /**
     * Position of a single match inside the text: {@code start} inclusive, {@code end} exclusive.
     * Equality is based on both offsets so matches can be compared across algorithms.
     */
    @AllArgsConstructor
    @EqualsAndHashCode(of = {"start", "end"})
    public static class Index {

        int start;
        int end;


        /** Readable form so that assertion failures show the offending offsets. */
        @Override
        public String toString() {

            return "[" + start + ", " + end + ")";
        }

    }

}