RED-8715: Improve NearestNeighbor Algorithm in LayoutParser
* Replaced the old nearest-neighbour search with one based on a k-d tree.
This commit is contained in:
parent
f4cae8a7dc
commit
16be2467fd
@ -25,4 +25,5 @@ dependencies {
|
||||
implementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:${jacksonVersion}")
|
||||
implementation("org.springframework.boot:spring-boot-starter-web:3.1.3")
|
||||
implementation("org.jgrapht:jgrapht-core:1.5.2")
|
||||
implementation("org.tinspin:tinspin-indexes:2.1.3")
|
||||
}
|
||||
|
||||
@ -1,10 +1,10 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.tinspin.index.Index;
|
||||
import org.tinspin.index.kdtree.KDTree;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Neighbor;
|
||||
@ -13,115 +13,23 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Neigh
|
||||
public class NearestNeighbourService {
|
||||
|
||||
private static final int NUMBER_OF_NEIGHBOURS = 8;
|
||||
private static final double STEP = 16.0;
|
||||
|
||||
|
||||
public void findNearestNeighbors(List<Character> characters) {
|
||||
KDTree<Character> kdTree = KDTree.create(2);
|
||||
characters.forEach(c -> kdTree.insert(new double[]{c.getX(), c.getY()}, c));
|
||||
|
||||
if (characters.isEmpty() || characters.size() == 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
characters.sort(Comparator.comparingDouble(Character::getX));
|
||||
|
||||
int maxNeighborCount = NUMBER_OF_NEIGHBOURS;
|
||||
if (characters.size() <= NUMBER_OF_NEIGHBOURS) {
|
||||
maxNeighborCount = characters.size() - 1;
|
||||
}
|
||||
|
||||
for (int i = 0; i < characters.size(); i++) {
|
||||
|
||||
Neighbor[] candidates = new Neighbor[maxNeighborCount + 1];
|
||||
int neighborInsertionIndex = 0;
|
||||
int neighborCount = 0;
|
||||
int start = i;
|
||||
int end = i + 1;
|
||||
|
||||
double distance = Double.POSITIVE_INFINITY;
|
||||
|
||||
for (double searchDistance = 0; searchDistance < distance; ) {
|
||||
|
||||
searchDistance += STEP;
|
||||
boolean newCandidatesFound = false;
|
||||
|
||||
while (start > 0 && characters.get(i).getX() - characters.get(start - 1).getX() < searchDistance) {
|
||||
start--;
|
||||
candidates[neighborInsertionIndex] = new Neighbor(characters.get(start), characters.get(i));
|
||||
neighborCount++;
|
||||
if (neighborCount > maxNeighborCount) {
|
||||
neighborInsertionIndex = clearMostDistant(candidates);
|
||||
neighborCount--;
|
||||
} else {
|
||||
neighborInsertionIndex++;
|
||||
}
|
||||
newCandidatesFound = true;
|
||||
}
|
||||
|
||||
while (end < characters.size() && characters.get(end).getX() - characters.get(i).getX() < searchDistance) {
|
||||
candidates[neighborInsertionIndex] = new Neighbor(characters.get(end), characters.get(i));
|
||||
neighborCount++;
|
||||
if (neighborCount > maxNeighborCount) {
|
||||
neighborInsertionIndex = clearMostDistant(candidates);
|
||||
neighborCount--;
|
||||
} else {
|
||||
neighborInsertionIndex++;
|
||||
}
|
||||
end++;
|
||||
newCandidatesFound = true;
|
||||
}
|
||||
|
||||
if (newCandidatesFound && neighborCount >= maxNeighborCount) {
|
||||
distance = maxDistance(candidates);
|
||||
}
|
||||
for(Character c : characters) {
|
||||
Index.PointIteratorKnn<Character> iterator = kdTree.queryKnn(new double[]{c.getX(), c.getY()}, NUMBER_OF_NEIGHBOURS + 1);
|
||||
// skip the first as this is identity
|
||||
if(iterator.hasNext()) {
|
||||
iterator.next();
|
||||
}
|
||||
if (neighborCount < maxNeighborCount) {
|
||||
clearMostDistant(candidates);
|
||||
}
|
||||
|
||||
List<Neighbor> candidatesList = new ArrayList<>(maxNeighborCount);
|
||||
for (Neighbor candidate : candidates) {
|
||||
if (candidate != null) {
|
||||
candidatesList.add(candidate);
|
||||
}
|
||||
}
|
||||
candidatesList.sort(Comparator.comparingDouble(Neighbor::getDistance));
|
||||
assert candidatesList.size() == maxNeighborCount;
|
||||
characters.get(i).setNeighbors(candidatesList);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private double maxDistance(Neighbor[] candidates) {
|
||||
|
||||
double maxDistance = 0;
|
||||
for (Neighbor candidate : candidates) {
|
||||
if (candidate == null) {
|
||||
continue;
|
||||
}
|
||||
if (candidate.getDistance() > maxDistance) {
|
||||
maxDistance = candidate.getDistance();
|
||||
while(iterator.hasNext()) {
|
||||
c.getNeighbors().add(new Neighbor(iterator.next().value(), c));
|
||||
}
|
||||
}
|
||||
return maxDistance;
|
||||
}
|
||||
|
||||
|
||||
private int clearMostDistant(Neighbor[] candidates) {
|
||||
|
||||
double maxDistance = 0;
|
||||
int maxIndex = 0;
|
||||
for (int i = 0; i < candidates.length; i++) {
|
||||
Neighbor candidate = candidates[i];
|
||||
if (candidate == null) {
|
||||
continue;
|
||||
}
|
||||
if (candidate.getDistance() > maxDistance) {
|
||||
maxDistance = candidate.getDistance();
|
||||
maxIndex = i;
|
||||
}
|
||||
}
|
||||
candidates[maxIndex] = null;
|
||||
return maxIndex;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -27,7 +27,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
@SneakyThrows
|
||||
public void testViewerDocument() {
|
||||
|
||||
String fileName = "files/bdr/notMergedParagraphs.pdf";
|
||||
String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
|
||||
var documentFile = new ClassPathResource(fileName).getFile();
|
||||
@ -35,7 +35,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
Document document = buildGraph(fileName, LayoutParsingType.CLARIFYND);
|
||||
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
|
||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
||||
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user