RED-8715: Improve NearestNeighbor Algorithm in LayoutParser

* replaced the old algorithm with an algorithm based on a kd-tree
This commit is contained in:
maverickstuder 2024-03-11 14:42:28 +01:00
parent f4cae8a7dc
commit 16be2467fd
3 changed files with 14 additions and 105 deletions

View File

@ -25,4 +25,5 @@ dependencies {
implementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:${jacksonVersion}")
implementation("org.springframework.boot:spring-boot-starter-web:3.1.3")
implementation("org.jgrapht:jgrapht-core:1.5.2")
implementation("org.tinspin:tinspin-indexes:2.1.3")
}

View File

@ -1,10 +1,10 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import org.springframework.stereotype.Service;
import org.tinspin.index.Index;
import org.tinspin.index.kdtree.KDTree;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Neighbor;
@ -13,115 +13,23 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Neigh
public class NearestNeighbourService {
private static final int NUMBER_OF_NEIGHBOURS = 8;
private static final double STEP = 16.0;
public void findNearestNeighbors(List<Character> characters) {
KDTree<Character> kdTree = KDTree.create(2);
characters.forEach(c -> kdTree.insert(new double[]{c.getX(), c.getY()}, c));
if (characters.isEmpty() || characters.size() == 1) {
return;
}
characters.sort(Comparator.comparingDouble(Character::getX));
int maxNeighborCount = NUMBER_OF_NEIGHBOURS;
if (characters.size() <= NUMBER_OF_NEIGHBOURS) {
maxNeighborCount = characters.size() - 1;
}
for (int i = 0; i < characters.size(); i++) {
Neighbor[] candidates = new Neighbor[maxNeighborCount + 1];
int neighborInsertionIndex = 0;
int neighborCount = 0;
int start = i;
int end = i + 1;
double distance = Double.POSITIVE_INFINITY;
for (double searchDistance = 0; searchDistance < distance; ) {
searchDistance += STEP;
boolean newCandidatesFound = false;
while (start > 0 && characters.get(i).getX() - characters.get(start - 1).getX() < searchDistance) {
start--;
candidates[neighborInsertionIndex] = new Neighbor(characters.get(start), characters.get(i));
neighborCount++;
if (neighborCount > maxNeighborCount) {
neighborInsertionIndex = clearMostDistant(candidates);
neighborCount--;
} else {
neighborInsertionIndex++;
}
newCandidatesFound = true;
}
while (end < characters.size() && characters.get(end).getX() - characters.get(i).getX() < searchDistance) {
candidates[neighborInsertionIndex] = new Neighbor(characters.get(end), characters.get(i));
neighborCount++;
if (neighborCount > maxNeighborCount) {
neighborInsertionIndex = clearMostDistant(candidates);
neighborCount--;
} else {
neighborInsertionIndex++;
}
end++;
newCandidatesFound = true;
}
if (newCandidatesFound && neighborCount >= maxNeighborCount) {
distance = maxDistance(candidates);
}
for(Character c : characters) {
Index.PointIteratorKnn<Character> iterator = kdTree.queryKnn(new double[]{c.getX(), c.getY()}, NUMBER_OF_NEIGHBOURS + 1);
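// Skip the first result: c itself is stored in the tree, so the query at c's own coordinates is expected to return c first (distance 0).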
if(iterator.hasNext()) {
iterator.next();
}
if (neighborCount < maxNeighborCount) {
clearMostDistant(candidates);
}
List<Neighbor> candidatesList = new ArrayList<>(maxNeighborCount);
for (Neighbor candidate : candidates) {
if (candidate != null) {
candidatesList.add(candidate);
}
}
candidatesList.sort(Comparator.comparingDouble(Neighbor::getDistance));
assert candidatesList.size() == maxNeighborCount;
characters.get(i).setNeighbors(candidatesList);
}
}
private double maxDistance(Neighbor[] candidates) {
double maxDistance = 0;
for (Neighbor candidate : candidates) {
if (candidate == null) {
continue;
}
if (candidate.getDistance() > maxDistance) {
maxDistance = candidate.getDistance();
while(iterator.hasNext()) {
c.getNeighbors().add(new Neighbor(iterator.next().value(), c));
}
}
return maxDistance;
}
/**
 * Clears the most distant candidate by setting its array slot to {@code null}.
 *
 * <p>Null slots are skipped while scanning. The running maximum starts at 0 and is only
 * replaced by a strictly greater distance, so if no non-null candidate has a distance
 * greater than 0, slot 0 is the one that gets cleared and returned.
 *
 * @param candidates neighbor candidates, possibly containing {@code null} entries; modified in place
 * @return the index of the slot that was set to {@code null}
 */
private int clearMostDistant(Neighbor[] candidates) {
double maxDistance = 0;
int maxIndex = 0;
for (int i = 0; i < candidates.length; i++) {
Neighbor candidate = candidates[i];
if (candidate == null) {
// empty slot, nothing to compare
continue;
}
// strict '>' means the first of several equally distant candidates wins
if (candidate.getDistance() > maxDistance) {
maxDistance = candidate.getDistance();
maxIndex = i;
}
}
// free the slot so the caller can reuse it for the next candidate
candidates[maxIndex] = null;
return maxIndex;
}
}

View File

@ -27,7 +27,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
@SneakyThrows
public void testViewerDocument() {
String fileName = "files/bdr/notMergedParagraphs.pdf";
String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
var documentFile = new ClassPathResource(fileName).getFile();
@ -35,7 +35,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
long start = System.currentTimeMillis();
Document document = buildGraph(fileName, LayoutParsingType.CLARIFYND);
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
}