RED-3800 String Performance matching test
This commit is contained in:
parent
7967453c67
commit
c85ce25ed4
@ -0,0 +1,77 @@
|
||||
package com.iqser.red.service.redaction.v1.server.stringmatching;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import org.ahocorasick.trie.Trie;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
import org.springframework.test.context.junit4.SpringRunner;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@RunWith(SpringRunner.class)
|
||||
public class StringMatchingPerformanceTest {
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testStringPerformance() {
|
||||
|
||||
String text = IOUtils.toString(new ClassPathResource("stringmatching/hamlet.txt").getInputStream()).toLowerCase();
|
||||
Set<String> dictionary = IOUtils.readLines(new ClassPathResource("stringmatching/names.txt").getInputStream())
|
||||
.stream()
|
||||
.map(String::toLowerCase).collect(Collectors.toSet());
|
||||
|
||||
System.out.println("Loaded text has a length of " + text.length() + " symbols");
|
||||
System.out.println("Dictionary has " + dictionary.size() + " entries");
|
||||
|
||||
var patterns = dictionary.stream()
|
||||
.map(p -> Pattern.compile(Pattern.quote(p))).collect(Collectors.toList());
|
||||
var trie = Trie.builder().ignoreCase().addKeywords(dictionary).build();
|
||||
|
||||
|
||||
// 1. Naive approach
|
||||
long t1 = System.currentTimeMillis();
|
||||
var naiveIndexes = new HashSet<Index>();
|
||||
for (var entry : dictionary) {
|
||||
var startIndex = 0;
|
||||
do {
|
||||
startIndex = text.indexOf(entry, startIndex + 1);
|
||||
if (startIndex != -1) {
|
||||
naiveIndexes.add(new Index(startIndex, startIndex + entry.length()));
|
||||
}
|
||||
} while (startIndex != -1);
|
||||
}
|
||||
long t2 = System.currentTimeMillis();
|
||||
System.out.println("Naive approach found " + naiveIndexes.size() + " entires in " + (t2 - t1) + "ms");
|
||||
|
||||
|
||||
// 2. Boyer Moore
|
||||
t1 = System.currentTimeMillis();
|
||||
var boyerMooreIndexes = new HashSet<Index>();
|
||||
for (var pattern : patterns) {
|
||||
boyerMooreIndexes.addAll(pattern.matcher(text).results().map(r -> new Index(r.start(), r.end())).collect(Collectors.toList()));
|
||||
}
|
||||
t2 = System.currentTimeMillis();
|
||||
System.out.println("Boyer Moore found " + boyerMooreIndexes.size() + " entires in " + (t2 - t1) + "ms");
|
||||
|
||||
|
||||
// 3. Aho Corasick
|
||||
t1 = System.currentTimeMillis();
|
||||
var result = trie.parseText(text);
|
||||
var ahoCorasickIndexes = result.stream().map(r -> new Index(r.getStart(), r.getEnd() + 1)).collect(Collectors.toSet());
|
||||
t2 = System.currentTimeMillis();
|
||||
System.out.println("Aho Corasick found " + ahoCorasickIndexes.size() + " entires in " + (t2 - t1) + "ms");
|
||||
|
||||
}
|
||||
|
||||
@AllArgsConstructor
|
||||
public static class Index {
|
||||
int start;
|
||||
int end;
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user