RED-7081: getBBox() Performance Improvement

This commit is contained in:
Kilian Schuettler 2023-07-04 20:19:53 +02:00 committed by Timo Bejan
parent 788613c92e
commit 15a6d46f5c
5 changed files with 117 additions and 33 deletions

View File

@ -208,7 +208,7 @@ public class AtomicTextBlock implements TextBlock {
List<Rectangle2D> rectanglesPerLine = stringBoundary.split(getAllLineBreaksInBoundary(stringBoundary)) List<Rectangle2D> rectanglesPerLine = stringBoundary.split(getAllLineBreaksInBoundary(stringBoundary))
.stream() .stream()
.map(this::getPositions) .map(this::getPositions)
.map(RectangleTransformations::rectangleUnionWithGaps) .map(RectangleTransformations::rectangleBBoxWithGaps)
.flatMap(Collection::stream) .flatMap(Collection::stream)
.toList(); .toList();
Map<Page, List<Rectangle2D>> rectanglePerLinePerPage = new HashMap<>(); Map<Page, List<Rectangle2D>> rectanglePerLinePerPage = new HashMap<>();

View File

@ -1,6 +1,5 @@
package com.knecon.fforesight.service.layoutparser.processor.utils; package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.awt.geom.Area;
import java.awt.geom.Rectangle2D; import java.awt.geom.Rectangle2D;
import java.awt.geom.RectangularShape; import java.awt.geom.RectangularShape;
import java.util.Collections; import java.util.Collections;
@ -19,11 +18,14 @@ import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlo
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.AtomicTextBlock; import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.AtomicTextBlock;
import lombok.AllArgsConstructor;
import lombok.NoArgsConstructor;
public class RectangleTransformations { public class RectangleTransformations {
public static PDRectangle toPDRectangleUnion(List<Rectangle> rectangles) { public static PDRectangle toPDRectangleBBox(List<Rectangle> rectangles) {
Rectangle2D rectangle2D = RectangleTransformations.bBoxUnionRectangle(rectangles); Rectangle2D rectangle2D = RectangleTransformations.rectangleBBox(rectangles);
PDRectangle annotationPosition = new PDRectangle(); PDRectangle annotationPosition = new PDRectangle();
annotationPosition.setLowerLeftX((float) rectangle2D.getMinX()); annotationPosition.setLowerLeftX((float) rectangle2D.getMinX());
@ -34,15 +36,15 @@ public class RectangleTransformations {
} }
public static Rectangle2D bBoxUnionAtomicTextBlock(List<AtomicTextBlock> atomicTextBlocks) { public static Rectangle2D atomicTextBlockBBox(List<AtomicTextBlock> atomicTextBlocks) {
return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DUnion()); return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DBBoxCollector());
} }
public static Rectangle2D bBoxUnionRectangle(List<Rectangle> rectangles) { public static Rectangle2D rectangleBBox(List<Rectangle> rectangles) {
return rectangles.stream().map(RectangleTransformations::toRectangle2D).collect(new Rectangle2DUnion()); return rectangles.stream().map(RectangleTransformations::toRectangle2D).collect(new Rectangle2DBBoxCollector());
} }
@ -64,9 +66,9 @@ public class RectangleTransformations {
} }
public static Rectangle2D rectangleUnion(List<Rectangle2D> rectangle2DList) { public static Rectangle2D rectangle2DBBox(List<Rectangle2D> rectangle2DList) {
return rectangle2DList.stream().collect(new Rectangle2DUnion()); return rectangle2DList.stream().collect(new Rectangle2DBBoxCollector());
} }
@ -76,7 +78,7 @@ public class RectangleTransformations {
* @param rectangle2DList A list of rectangles to combine * @param rectangle2DList A list of rectangles to combine
* @return A list of rectangles which are combined if they are closer than the split threshold * @return A list of rectangles which are combined if they are closer than the split threshold
*/ */
public static List<Rectangle2D> rectangleUnionWithGaps(List<Rectangle2D> rectangle2DList) { public static List<Rectangle2D> rectangleBBoxWithGaps(List<Rectangle2D> rectangle2DList) {
if (rectangle2DList.isEmpty()) { if (rectangle2DList.isEmpty()) {
return Collections.emptyList(); return Collections.emptyList();
@ -98,49 +100,87 @@ public class RectangleTransformations {
previousRectangle = currentRectangle; previousRectangle = currentRectangle;
} }
} }
return rectangleListsWithGaps.stream().map(RectangleTransformations::rectangleUnion).toList(); return rectangleListsWithGaps.stream().map(RectangleTransformations::rectangle2DBBox).toList();
} }
private static class Rectangle2DUnion implements Collector<Rectangle2D, Area, Rectangle2D> { private static class Rectangle2DBBoxCollector implements Collector<Rectangle2D, Rectangle2DBBoxCollector.BBox, Rectangle2D> {
@Override @Override
public Supplier<Area> supplier() { public Supplier<BBox> supplier() {
return Area::new; return BBox::new;
} }
@Override @Override
public BiConsumer<Area, Rectangle2D> accumulator() { public BiConsumer<BBox, Rectangle2D> accumulator() {
return (area, rectangle2D) -> area.add(new Area(rectangle2D)); return (bb, rect) -> bb.addRectangle(rect.getMinX(), rect.getMinY(), rect.getMaxX(), rect.getMaxY());
} }
@Override @Override
public BinaryOperator<Area> combiner() { public BinaryOperator<BBox> combiner() {
return (area1, area2) -> { return (b1, b2) -> new BBox(Math.min(b1.lowerLeftX, b2.lowerLeftX),
area1.add(area2); Math.min(b1.lowerLeftY, b2.lowerLeftY),
return area1; Math.max(b1.upperRightX, b2.upperRightX),
}; Math.max(b1.upperRightY, b2.upperRightY));
} }
@Override @Override
public Function<Area, Rectangle2D> finisher() { public Function<BBox, Rectangle2D> finisher() {
return Area::getBounds2D; return bb -> new Rectangle2D.Double(bb.lowerLeftX, bb.lowerLeftY, bb.upperRightX - bb.lowerLeftX, bb.upperRightY - bb.lowerLeftY);
} }
@Override @Override
public Set<Characteristics> characteristics() { public Set<Characteristics> characteristics() {
return Set.of(Characteristics.CONCURRENT, Characteristics.UNORDERED); return Set.of(Characteristics.UNORDERED);
}
@AllArgsConstructor
@NoArgsConstructor
private static class BBox {
Double lowerLeftX;
Double lowerLeftY;
Double upperRightX;
Double upperRightY;
/**
 * Extends this bounding box so that it also covers the rectangle described by the given corners.
 * A coordinate that is still unset ({@code null}) is taken over from the argument; otherwise the
 * lower-left corner only moves to smaller values and the upper-right corner only to larger ones.
 *
 * @param lowerLeftX  minimum x of the rectangle to include
 * @param lowerLeftY  minimum y of the rectangle to include
 * @param upperRightX maximum x of the rectangle to include
 * @param upperRightY maximum y of the rectangle to include
 */
public void addRectangle(double lowerLeftX, double lowerLeftY, double upperRightX, double upperRightY) {

    // Lower-left corner: keep the smallest value seen so far.
    if (this.lowerLeftX == null || lowerLeftX < this.lowerLeftX) {
        this.lowerLeftX = lowerLeftX;
    }
    if (this.lowerLeftY == null || lowerLeftY < this.lowerLeftY) {
        this.lowerLeftY = lowerLeftY;
    }

    // Upper-right corner: keep the largest value seen so far.
    if (this.upperRightX == null || upperRightX > this.upperRightX) {
        this.upperRightX = upperRightX;
    }
    if (this.upperRightY == null || upperRightY > this.upperRightY) {
        this.upperRightY = upperRightY;
    }
}
} }
} }
} }

View File

@ -35,12 +35,16 @@ public class BuildDocumentGraphTest extends BaseTest {
@SneakyThrows @SneakyThrows
protected Document buildGraph(String filename) { protected Document buildGraph(String filename) {
if (filename.equals("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06")) { if (!filename.endsWith(".pdf")) {
prepareStorage(filename + ".pdf", "cv_table_parsing_response/empty.json", "image_service_response/S-Metolachlor_RAR_01_Volume_1_2018-09-06.IMAGE_INFO.json"); filename = filename + ".pdf";
} else {
prepareStorage(filename + ".pdf");
} }
ClassPathResource fileResource = new ClassPathResource(filename + ".pdf");
if (filename.equals("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06")) {
prepareStorage(filename, "cv_table_parsing_response/empty.json", "image_service_response/S-Metolachlor_RAR_01_Volume_1_2018-09-06.IMAGE_INFO.json");
} else {
prepareStorage(filename);
}
ClassPathResource fileResource = new ClassPathResource(filename);
try (InputStream inputStream = fileResource.getInputStream()) { try (InputStream inputStream = fileResource.getInputStream()) {
PDDocument pdDocument = Loader.loadPDF(inputStream); PDDocument pdDocument = Loader.loadPDF(inputStream);

View File

@ -1,5 +1,45 @@
package com.knecon.fforesight.service.layoutparser.server.graph; package com.knecon.fforesight.service.layoutparser.server.graph;
public class DocumentDataTests { import java.io.File;
import java.io.FileOutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import org.junit.jupiter.api.Test;
import org.springframework.core.io.ClassPathResource;
import com.iqser.red.commons.jackson.ObjectMapperFactory;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
import com.knecon.fforesight.service.layoutparser.processor.mapper.redaction.DocumentDataMapper;
import lombok.SneakyThrows;
public class DocumentDataTests extends BuildDocumentGraphTest{
/**
 * Builds the document graph for every PDF found below the classpath directory {@code files},
 * maps it to {@link DocumentData} and writes the result as JSON below
 * {@code /tmp/document_data_output_layoutparser}, mirroring the relative directory layout.
 */
@Test
@SneakyThrows
public void createDocumentDataForAllFiles() {

    String outPath = "/tmp/document_data_output_layoutparser";
    ClassPathResource resource = new ClassPathResource("files");
    // Resolve the resource directory once instead of on every loop iteration.
    Path filesRoot = resource.getFile().toPath();

    List<String> pdfFileNames;
    // Files.walk holds open directory handles; the stream has to be closed explicitly.
    try (var paths = Files.walk(filesRoot)) {
        pdfFileNames = paths.filter(path -> path.getFileName().toString().endsWith(".pdf"))
                .map(Path::toAbsolutePath)
                .map(Path::toString)
                .toList();
    }

    System.out.printf("%d Files found%n", pdfFileNames.size());
    for (int i = 0; i < pdfFileNames.size(); i++) {
        System.out.printf("%d/%d: %s%n", i, pdfFileNames.size(), pdfFileNames.get(i));
    }

    for (String pdfFileName : pdfFileNames) {
        System.out.println(pdfFileName);
        Path pdfPath = Path.of(pdfFileName);
        // buildGraph receives the path relative to the parent of the "files" directory.
        DocumentData documentData = DocumentDataMapper.toDocumentData(buildGraph(filesRoot.getParent().relativize(pdfPath).toString()));

        Path mirroredPath = Path.of(outPath).resolve(filesRoot.relativize(pdfPath));
        // Fails loudly if the target directory cannot be created.
        Files.createDirectories(mirroredPath.getParent());

        // Swap only the file-name suffix so that ".pdf" inside directory names stays untouched.
        String pdfName = mirroredPath.getFileName().toString();
        Path jsonPath = mirroredPath.resolveSibling(pdfName.substring(0, pdfName.length() - ".pdf".length()) + ".json");
        try (var out = new FileOutputStream(jsonPath.toFile())) {
            ObjectMapperFactory.create().writeValue(out, documentData);
        }
    }
}
} }

View File

@ -20,7 +20,7 @@ public class DocumentGraphJsonWritingTest extends BuildDocumentGraphTest {
@Disabled @Disabled
public void writeJsonForFileTest() { public void writeJsonForFileTest() {
writeJsons("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06"); writeJsons("files/216");
} }
@SneakyThrows @SneakyThrows