RED-3800 String Performance matching test

This commit is contained in:
Timo Bejan 2022-05-24 11:58:34 +03:00
parent 7967453c67
commit c85ce25ed4
3 changed files with 6994 additions and 0 deletions

View File

@ -0,0 +1,77 @@
package com.iqser.red.service.redaction.v1.server.stringmatching;
import lombok.AllArgsConstructor;
import lombok.EqualsAndHashCode;
import lombok.SneakyThrows;
import org.ahocorasick.trie.Trie;
import org.apache.commons.io.IOUtils;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.core.io.ClassPathResource;
import org.springframework.test.context.junit4.SpringRunner;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
 * Manual benchmark that compares three ways of locating every dictionary entry
 * inside a large text: repeated {@link String#indexOf(String, int)}, one
 * pre-compiled literal {@link Pattern} per entry, and a single Aho-Corasick
 * {@link Trie} pass. Hit counts and elapsed times are printed to stdout; the
 * test makes no assertions.
 */
@RunWith(SpringRunner.class)
public class StringMatchingPerformanceTest {

    private static final String TEXT_RESOURCE = "stringmatching/hamlet.txt";
    private static final String DICTIONARY_RESOURCE = "stringmatching/names.txt";

    /**
     * Loads the text and the dictionary, prepares the matchers outside of the
     * timed sections, then times each matching approach and prints its result.
     */
    @Test
    @SneakyThrows
    public void testStringPerformance() {
        String text = loadText();
        Set<String> dictionary = loadDictionary();

        System.out.println("Loaded text has a length of " + text.length() + " symbols");
        System.out.println("Dictionary has " + dictionary.size() + " entries");

        // Matcher construction is deliberately kept out of the timed sections.
        List<Pattern> patterns = dictionary.stream()
                .map(p -> Pattern.compile(Pattern.quote(p)))
                .collect(Collectors.toList());
        Trie trie = Trie.builder().ignoreCase().addKeywords(dictionary).build();

        // 1. Naive approach
        long startNanos = System.nanoTime();
        Set<Index> naiveIndexes = findWithIndexOf(text, dictionary);
        report("Naive approach", naiveIndexes.size(), startNanos);

        // 2. Boyer Moore
        // NOTE(review): this is java.util.regex on quoted literals; the label presumably
        // refers to the JDK's internal literal-search optimisation - confirm.
        startNanos = System.nanoTime();
        Set<Index> boyerMooreIndexes = findWithPatterns(text, patterns);
        report("Boyer Moore", boyerMooreIndexes.size(), startNanos);

        // 3. Aho Corasick
        startNanos = System.nanoTime();
        Set<Index> ahoCorasickIndexes = findWithTrie(text, trie);
        report("Aho Corasick", ahoCorasickIndexes.size(), startNanos);
    }

    /**
     * Reads the text fixture from the classpath and lower-cases it.
     *
     * @return the lower-cased text
     * @throws IOException if the resource cannot be opened or read
     */
    private static String loadText() throws IOException {
        // Assumes the fixture is UTF-8 encoded - TODO confirm.
        try (InputStream in = new ClassPathResource(TEXT_RESOURCE).getInputStream()) {
            return IOUtils.toString(in, StandardCharsets.UTF_8).toLowerCase(Locale.ROOT);
        }
    }

    /**
     * Reads the dictionary fixture (one entry per line) from the classpath,
     * lower-cases every entry and drops empty lines.
     *
     * @return the distinct, non-empty, lower-cased entries
     * @throws IOException if the resource cannot be opened or read
     */
    private static Set<String> loadDictionary() throws IOException {
        // Assumes the fixture is UTF-8 encoded - TODO confirm.
        try (InputStream in = new ClassPathResource(DICTIONARY_RESOURCE).getInputStream()) {
            return IOUtils.readLines(in, StandardCharsets.UTF_8)
                    .stream()
                    .map(line -> line.toLowerCase(Locale.ROOT))
                    // An empty keyword matches at every offset and carries no information.
                    .filter(entry -> !entry.isEmpty())
                    .collect(Collectors.toSet());
        }
    }

    /**
     * Finds every (possibly overlapping) occurrence of every dictionary entry
     * by repeatedly calling {@link String#indexOf(String, int)}.
     *
     * @param text       the text to search
     * @param dictionary the entries to look for
     * @return one {@link Index} per occurrence, end exclusive
     */
    private static Set<Index> findWithIndexOf(String text, Set<String> dictionary) {
        Set<Index> hits = new HashSet<>();
        for (String entry : dictionary) {
            if (entry.isEmpty()) {
                // indexOf("", n) never returns -1, so the scan below would not terminate.
                continue;
            }
            // Start at offset 0 so that a match at the very beginning of the text is found.
            int from = 0;
            int hit;
            while ((hit = text.indexOf(entry, from)) != -1) {
                hits.add(new Index(hit, hit + entry.length()));
                from = hit + 1;
            }
        }
        return hits;
    }

    /**
     * Finds the occurrences of every pre-compiled pattern in the text. Matches
     * of one pattern do not overlap each other, because each regex search
     * resumes after the end of the previous match.
     *
     * @param text     the text to search
     * @param patterns the pre-compiled literal patterns
     * @return one {@link Index} per match, end exclusive
     */
    private static Set<Index> findWithPatterns(String text, List<Pattern> patterns) {
        Set<Index> hits = new HashSet<>();
        for (Pattern pattern : patterns) {
            pattern.matcher(text)
                    .results()
                    .forEach(match -> hits.add(new Index(match.start(), match.end())));
        }
        return hits;
    }

    /**
     * Finds the occurrences of all trie keywords in a single pass over the text.
     *
     * @param text the text to search
     * @param trie the trie built from the dictionary
     * @return one {@link Index} per emit; 1 is added to the emit's end so that it
     *         lines up with the exclusive ends produced by the other approaches
     */
    private static Set<Index> findWithTrie(String text, Trie trie) {
        return trie.parseText(text)
                .stream()
                .map(emit -> new Index(emit.getStart(), emit.getEnd() + 1))
                .collect(Collectors.toSet());
    }

    /**
     * Prints the number of hits and the time elapsed since {@code startNanos}.
     *
     * @param label      name of the approach
     * @param hitCount   number of hits found
     * @param startNanos {@link System#nanoTime()} value taken before the approach ran
     */
    private static void report(String label, int hitCount, long startNanos) {
        long elapsedMillis = (System.nanoTime() - startNanos) / 1_000_000L;
        System.out.println(label + " found " + hitCount + " entries in " + elapsedMillis + "ms");
    }

    /**
     * Position of a match inside the text. Has value semantics, so hash sets
     * collapse duplicates and result sets from different approaches can be
     * compared with {@code equals}.
     */
    @AllArgsConstructor
    @EqualsAndHashCode
    public static class Index {

        /** Offset of the first matched character. */
        final int start;

        /** Offset just past the last matched character. */
        final int end;
    }
}