From 16be2467fdbc8138dda6fc76526f9067f893d30c Mon Sep 17 00:00:00 2001 From: maverickstuder Date: Mon, 11 Mar 2024 14:42:28 +0100 Subject: [PATCH] RED-8715: Improve NearestNeighbor Algorithm in LayoutParser * replaced the old algorithm with an algorithm based on a kd-tree --- .../build.gradle.kts | 1 + .../service/NearestNeighbourService.java | 114 ++---------------- .../server/graph/ViewerDocumentTest.java | 4 +- 3 files changed, 14 insertions(+), 105 deletions(-) diff --git a/layoutparser-service/layoutparser-service-processor/build.gradle.kts b/layoutparser-service/layoutparser-service-processor/build.gradle.kts index e56f8b5..ed30bd3 100644 --- a/layoutparser-service/layoutparser-service-processor/build.gradle.kts +++ b/layoutparser-service/layoutparser-service-processor/build.gradle.kts @@ -25,4 +25,5 @@ dependencies { implementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:${jacksonVersion}") implementation("org.springframework.boot:spring-boot-starter-web:3.1.3") implementation("org.jgrapht:jgrapht-core:1.5.2") + implementation("org.tinspin:tinspin-indexes:2.1.3") } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/NearestNeighbourService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/NearestNeighbourService.java index 842e3df..826754a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/NearestNeighbourService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/NearestNeighbourService.java @@ -1,10 +1,10 @@ package com.knecon.fforesight.service.layoutparser.processor.docstrum.service; -import java.util.ArrayList; -import java.util.Comparator; import java.util.List; import org.springframework.stereotype.Service; +import org.tinspin.index.Index; +import org.tinspin.index.kdtree.KDTree; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Neighbor; @@ -13,115 +13,23 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Neigh public class NearestNeighbourService { private static final int NUMBER_OF_NEIGHBOURS = 8; - private static final double STEP = 16.0; public void findNearestNeighbors(List characters) { + KDTree kdTree = KDTree.create(2); + characters.forEach(c -> kdTree.insert(new double[]{c.getX(), c.getY()}, c)); - if (characters.isEmpty() || characters.size() == 1) { - return; - } - - characters.sort(Comparator.comparingDouble(Character::getX)); - - int maxNeighborCount = NUMBER_OF_NEIGHBOURS; - if (characters.size() <= NUMBER_OF_NEIGHBOURS) { - maxNeighborCount = characters.size() - 1; - } - - for (int i = 0; i < characters.size(); i++) { - - Neighbor[] candidates = new Neighbor[maxNeighborCount + 1]; - int neighborInsertionIndex = 0; - int neighborCount = 0; - int start = i; - int end = i + 1; - - double distance = Double.POSITIVE_INFINITY; - - for (double searchDistance = 0; searchDistance < distance; ) { - - searchDistance += STEP; - boolean newCandidatesFound = false; - - while (start > 0 && characters.get(i).getX() - characters.get(start - 1).getX() < searchDistance) { - start--; - candidates[neighborInsertionIndex] = new Neighbor(characters.get(start), characters.get(i)); - neighborCount++; - if (neighborCount > maxNeighborCount) { - neighborInsertionIndex = clearMostDistant(candidates); - neighborCount--; - } else { - neighborInsertionIndex++; - } - newCandidatesFound = true; - } - - while (end < characters.size() && characters.get(end).getX() - characters.get(i).getX() < searchDistance) { - candidates[neighborInsertionIndex] = new Neighbor(characters.get(end), characters.get(i)); - neighborCount++; - if (neighborCount > maxNeighborCount) { - neighborInsertionIndex = clearMostDistant(candidates); - neighborCount--; - } else { - neighborInsertionIndex++; - } - end++; - newCandidatesFound = true; - } - - if (newCandidatesFound && neighborCount >= maxNeighborCount) { - distance = maxDistance(candidates); - } + for(Character c : characters) { + Index.PointIteratorKnn iterator = kdTree.queryKnn(new double[]{c.getX(), c.getY()}, NUMBER_OF_NEIGHBOURS + 1); + // skip the first as this is identity + if(iterator.hasNext()) { + iterator.next(); } - if (neighborCount < maxNeighborCount) { - clearMostDistant(candidates); - } - - List candidatesList = new ArrayList<>(maxNeighborCount); - for (Neighbor candidate : candidates) { - if (candidate != null) { - candidatesList.add(candidate); - } - } - candidatesList.sort(Comparator.comparingDouble(Neighbor::getDistance)); - assert candidatesList.size() == maxNeighborCount; - characters.get(i).setNeighbors(candidatesList); - } - } - - - private double maxDistance(Neighbor[] candidates) { - - double maxDistance = 0; - for (Neighbor candidate : candidates) { - if (candidate == null) { - continue; - } - if (candidate.getDistance() > maxDistance) { - maxDistance = candidate.getDistance(); + while(iterator.hasNext()) { + c.getNeighbors().add(new Neighbor(iterator.next().value(), c)); } } - return maxDistance; - } - - private int clearMostDistant(Neighbor[] candidates) { - - double maxDistance = 0; - int maxIndex = 0; - for (int i = 0; i < candidates.length; i++) { - Neighbor candidate = candidates[i]; - if (candidate == null) { - continue; - } - if (candidate.getDistance() > maxDistance) { - maxDistance = candidate.getDistance(); - maxIndex = i; - } - } - candidates[maxIndex] = null; - return maxIndex; } } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index 9adf903..3153952 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -27,7 +27,7 @@ public class ViewerDocumentTest extends BuildDocumentTest { @SneakyThrows public void testViewerDocument() { - String fileName = "files/bdr/notMergedParagraphs.pdf"; + String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; var documentFile = new ClassPathResource(fileName).getFile(); @@ -35,7 +35,7 @@ public class ViewerDocumentTest extends BuildDocumentTest { LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService); long start = System.currentTimeMillis(); - Document document = buildGraph(fileName, LayoutParsingType.CLARIFYND); + Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER); layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true); System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000); }