From f4e93ef03b3e08626d182441f428f2fa4e2c783e Mon Sep 17 00:00:00 2001 From: yhampe Date: Tue, 28 Nov 2023 15:14:04 +0100 Subject: [PATCH] RED-7375: * using ViewerDocumentService to draw TableExtractorResponse into documents --- .../api/queue/LayoutParsingRequest.java | 3 + .../processor/LayoutParsingPipeline.java | 12 ++- .../LayoutParsingStorageService.java | 10 +++ .../model/table/ExtractedTable.java | 17 +++++ .../model/table/ExtractedTableData.java | 19 +++++ .../model/table/TableExtractorData.java | 25 ++++++ .../model/table/TableExtractorResponse.java | 22 ++++++ .../visualization/ViewerDocumentService.java | 57 ++++++++++++++ .../layoutparser/server/BdrJsonBuildTest.java | 3 +- .../HeadlinesGoldStandardIntegrationTest.java | 3 +- .../graph/DocumentGraphJsonWritingTest.java | 3 +- .../server/graph/ViewerDocumentTest.java | 76 ++++++++++++++++++- .../PdfSegmentationServiceTest.java | 3 +- .../services/RulingCleaningServiceTest.java | 5 +- .../server/utils/BuildDocumentTest.java | 3 +- 15 files changed, 251 insertions(+), 10 deletions(-) create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/ExtractedTable.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/ExtractedTableData.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/TableExtractorData.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/TableExtractorResponse.java diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingRequest.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingRequest.java index c364bbc..6da6471 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingRequest.java +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingRequest.java @@ -20,6 +20,9 @@ public record LayoutParsingRequest( @NonNull String originFileStorageId,// @Schema(description = "Optional Path to the table extraction file.")// Optional tablesFileStorageId,// + + @Schema(description= "Optional Path to the the table parsing service file") + Optional tableExtractorFileId, @Schema(description = "Optional Path to the image classification file.")// Optional imagesFileStorageId,// diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index a02f627..9063fb2 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -32,6 +32,7 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.C import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService; import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService; @@ -87,12 +88,17 @@ public class LayoutParsingPipeline { imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get()); } + TableExtractorResponse tableExtractorResponse = new TableExtractorResponse(); + if(layoutParsingRequest.tableExtractorFileId().isPresent()) { + tableExtractorResponse = layoutParsingStorageService.getExtractedTableFile(layoutParsingRequest.tableExtractorFileId().get()); + } + TableServiceResponse tableServiceResponse = new TableServiceResponse(); if (layoutParsingRequest.tablesFileStorageId().isPresent()) { tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get()); } - ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(), originDocument, imageServiceResponse, tableServiceResponse); + ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(), originDocument, imageServiceResponse, tableServiceResponse, tableExtractorResponse); Document documentGraph = DocumentGraphFactory.buildDocumentGraph(classificationDocument); int numberOfPages = originDocument.getNumberOfPages(); @@ -102,6 +108,7 @@ public class LayoutParsingPipeline { try (var out = new ByteArrayOutputStream()) { viewerDocumentService.createViewerDocument(originDocument, documentGraph, out, false); + viewerDocumentService.drawExtractedTables(originDocument,documentGraph,out,tableExtractorResponse.getExtractedTableData()); layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, out); } @@ -157,10 +164,11 @@ public class LayoutParsingPipeline { public ClassificationDocument parseLayout(LayoutParsingType layoutParsingType, PDDocument originDocument, ImageServiceResponse imageServiceResponse, - TableServiceResponse tableServiceResponse) { + TableServiceResponse tableServiceResponse, TableExtractorResponse tableExtractorResponse) { Map> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse); Map> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse); + //Hier muss ich die table cells einlesen ClassificationDocument classificationDocument = new ClassificationDocument(); List classificationPages = new ArrayList<>(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java index 6e35cbe..27b33b9 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java @@ -23,6 +23,8 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.Si import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorData; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.tenantcommons.TenantContext; @@ -62,6 +64,14 @@ public class LayoutParsingStorageService { } } + public TableExtractorResponse getExtractedTableFile(String storageId) throws IOException { + try (InputStream inputStream = getObject(storageId)) { + TableExtractorResponse tableExtractorResponse = objectMapper.readValue(inputStream,TableExtractorResponse.class); + inputStream.close(); + return tableExtractorResponse; + } + } + public TableServiceResponse getTablesFile(String storageId) throws IOException { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/ExtractedTable.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/ExtractedTable.java new file mode 100644 index 0000000..df5838a --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/ExtractedTable.java @@ -0,0 +1,17 @@ +package com.knecon.fforesight.service.layoutparser.processor.python_api.model.table; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; + +@Data +@Builder +@NoArgsConstructor +@AllArgsConstructor +public class ExtractedTable { + private boolean rotated; + private ExtractedTableData extractedTableValue; + + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/ExtractedTableData.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/ExtractedTableData.java new file mode 100644 index 0000000..f3756a3 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/ExtractedTableData.java @@ -0,0 +1,19 @@ +package com.knecon.fforesight.service.layoutparser.processor.python_api.model.table; + +import java.util.List; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; + +@Data +@Builder +@NoArgsConstructor +@AllArgsConstructor +public class ExtractedTableData { + private String label; + private float score; + private List boundingBox; + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/TableExtractorData.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/TableExtractorData.java new file mode 100644 index 0000000..6c937fe --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/TableExtractorData.java @@ -0,0 +1,25 @@ +package com.knecon.fforesight.service.layoutparser.processor.python_api.model.table; + +import java.util.List; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; + +@Data +@Builder +@NoArgsConstructor +@AllArgsConstructor +public class TableExtractorData { + + private int pageNumber; + private int pageRotation; + private int imageHeigth; + private int imageWidth; + private float pdfHeight; + private float pdfWidth; + private int dpi; + private List tables; + private List objects; +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/TableExtractorResponse.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/TableExtractorResponse.java new file mode 100644 index 0000000..335afa4 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/TableExtractorResponse.java @@ -0,0 +1,22 @@ +package com.knecon.fforesight.service.layoutparser.processor.python_api.model.table; + +import java.util.List; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; + +@Data +@Builder +@NoArgsConstructor +@AllArgsConstructor +public class TableExtractorResponse { private String dossierId; + private String fileId; + private String targetFileExtension; + private String responseFileExtension; + private String X_TENANT_ID; + private List extractedTableData; + + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/ViewerDocumentService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/ViewerDocumentService.java index 2290eed..aa178bb 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/ViewerDocumentService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/ViewerDocumentService.java @@ -1,12 +1,22 @@ package com.knecon.fforesight.service.layoutparser.processor.services.visualization; +import java.awt.Color; import java.awt.geom.AffineTransform; import java.awt.geom.Rectangle2D; +import java.io.File; +import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; import java.util.HashSet; +import java.util.List; import java.util.Set; +import javax.print.Doc; + +import org.apache.pdfbox.Loader; import org.apache.pdfbox.cos.COSDictionary; import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.pdmodel.PDDocument; @@ -23,13 +33,21 @@ import org.apache.pdfbox.pdmodel.graphics.state.PDExtendedGraphicsState; import org.apache.pdfbox.util.Matrix; import org.springframework.stereotype.Service; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle; import com.knecon.fforesight.service.layoutparser.processor.model.visualization.ColoredLine; import com.knecon.fforesight.service.layoutparser.processor.model.visualization.ColoredRectangle; import com.knecon.fforesight.service.layoutparser.processor.model.visualization.FilledRectangle; import com.knecon.fforesight.service.layoutparser.processor.model.visualization.LayoutGrid; import com.knecon.fforesight.service.layoutparser.processor.model.visualization.PlacedText; import com.knecon.fforesight.service.layoutparser.processor.model.visualization.VisualizationsOnPage; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.ExtractedTable; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.ExtractedTableData; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorData; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse; +import com.knecon.fforesight.service.layoutparser.processor.utils.PdfVisualisationUtility; import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; @@ -47,6 +65,45 @@ public class ViewerDocumentService { private final LayoutGridService layoutGridService; + @SneakyThrows + public void drawExtractedTables(PDDocument pdDocument, Document document, OutputStream outputStream, List tableExtractorData) { + + for (TableExtractorData tableExtractorDatum : tableExtractorData) { + int pageNumber = tableExtractorDatum.getPageNumber(); + List tableRectangles = new ArrayList<>(); + List objectRectangles = new ArrayList<>(); + for (ExtractedTable table : tableExtractorDatum.getTables()) { + List boundingBox = table.getExtractedTableValue().getBoundingBox(); + float x0 = boundingBox.get(0); + float x1 = boundingBox.get(2); + float y0 = boundingBox.get(1); + float y1 = boundingBox.get(3); + Rectangle2D tableRectangle = new Rectangle(y0, x0, x1 - x0, y1 - y0); + tableRectangles.add(tableRectangle); + } + for (ExtractedTableData object : tableExtractorDatum.getObjects()) { + List boundingBox = object.getBoundingBox(); + float x0 = boundingBox.get(0); + float x1 = boundingBox.get(2); + float y0 = boundingBox.get(1); + float y1 = boundingBox.get(3); + Rectangle2D objectRectangle = new Rectangle(y0, x0, x1 - x0, y1 - y0); + objectRectangles.add(objectRectangle); + } + PdfVisualisationUtility.drawRectangle2DList(pdDocument, + pageNumber, + tableRectangles, + PdfVisualisationUtility.Options.builder().strokeColor(Color.PINK).strokeWidth(1).stroke(true).build()); + PdfVisualisationUtility.drawRectangle2DList(pdDocument, + pageNumber, + objectRectangles, + PdfVisualisationUtility.Options.builder().strokeColor(Color.CYAN).strokeWidth(1).stroke(true).build()); + } + pdDocument.save(outputStream); + + + } + @SneakyThrows public void createViewerDocument(PDDocument pdDocument, Document document, OutputStream outputStream, boolean layerVisibilityDefaultValue) { diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java index 8e6255d..a100e33 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java @@ -26,6 +26,7 @@ import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipelin import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper; @@ -50,7 +51,7 @@ public class BdrJsonBuildTest extends AbstractTest { return DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.TAAS, pdDocument, new ImageServiceResponse(), - new TableServiceResponse())); + new TableServiceResponse(), new TableExtractorResponse())); } } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java index b2e35d6..dea30e9 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java @@ -33,6 +33,7 @@ import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipelin import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.tenantcommons.TenantsClient; @@ -98,7 +99,7 @@ public class HeadlinesGoldStandardIntegrationTest { Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, Loader.loadPDF(pdfFileResource.getFile()), new ImageServiceResponse(), - new TableServiceResponse())); + new TableServiceResponse(), new TableExtractorResponse())); var foundHeadlines = documentGraph.streamAllSubNodes() .map(SemanticNode::getHeadline) diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java index 973e0b9..2877e89 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java @@ -16,6 +16,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.Do import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper; @@ -58,7 +59,7 @@ public class DocumentGraphJsonWritingTest extends BuildDocumentTest { Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, Loader.loadPDF(filename.toFile()), new ImageServiceResponse(), - new TableServiceResponse())); + new TableServiceResponse(), new TableExtractorResponse())); DocumentData documentData = DocumentDataMapper.toDocumentData(documentGraph); ObjectMapper mapper = ObjectMapperFactory.create(); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index e8dd8d6..c666223 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -1,27 +1,50 @@ package com.knecon.fforesight.service.layoutparser.server.graph; +import static com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw.drawRectangle2DList; + +import java.awt.Color; +import java.awt.geom.Rectangle2D; +import java.awt.geom.RectangularShape; +import java.io.File; import java.io.FileOutputStream; +import java.nio.file.Files; import java.nio.file.Path; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.LinkedList; import java.util.List; +import java.util.Map; import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; +import org.json.JSONArray; +import org.json.JSONObject; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.core.io.ClassPathResource; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation; +import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle; import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; +import com.knecon.fforesight.service.layoutparser.processor.services.GapDetectionService; +import com.knecon.fforesight.service.layoutparser.processor.services.GapsAcrossLinesService; +import com.knecon.fforesight.service.layoutparser.processor.services.LineDetectionService; import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService; import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; @@ -29,7 +52,9 @@ import com.knecon.fforesight.service.layoutparser.processor.services.mapper.Docu import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper; import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService; import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService; +import com.knecon.fforesight.service.layoutparser.processor.utils.PdfVisualisationUtility; import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest; +import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; import lombok.SneakyThrows; @@ -55,12 +80,61 @@ public class ViewerDocumentTest extends BuildDocumentTest { } } + @Test + @SneakyThrows + public void testLayoutParsingServiceResults() { + String tableSourceFileName ="C:\\Users\\YannikHampe\\Downloads\\3875a78f1db6ff94b05e38446e65ba9a.EXTRACTED_TABLES.json\\3875a78f1db6ff94b05e38446e65ba9a.EXTRACTED_TABLES.json"; + Path pdfFileResource = Path.of("C:\\Users\\YannikHampe\\Downloads\\2009-1048395_50pages_tables.pdf"); + String tmpFileName = "C:/Users/YANNIK~1/AppData/Local/Temp/rectangles."+pdfFileResource.getFileName(); + ObjectMapper objectMapper = new ObjectMapper(); + PDDocument pdDocument = Loader.loadPDF(pdfFileResource.toFile()); + JsonNode jsonNode = objectMapper.readTree(new String(Files.readAllBytes(new File(tableSourceFileName).toPath()))); + JsonNode dataNode = jsonNode.get("data"); + + dataNode.forEach(node -> { + List rectangles = new ArrayList<>(); + int pageNumber = node.get("page_number").asInt()+1; + JsonNode tables = node.get("tables"); + tables.forEach(entry -> { + JsonNode table = entry.get("table"); + //table bounding box + if(Float.valueOf(String.valueOf(table.get("score"))) < 0.99) { + return; + } + JsonNode tableBox = table.get("bbox"); + float x0 = Float.valueOf(tableBox.get(0).toString()); + float x1 = Float.valueOf(tableBox.get(2).toString()); + float y0 = Float.valueOf(tableBox.get(1).toString()); + float y1 = Float.valueOf(tableBox.get(3).toString()); + Rectangle2D rectangle2D = new Rectangle(y0, x0, x1 - x0, y1 - y0); + rectangles.add(rectangle2D); + //columns and rows + JsonNode rowsAndColumns = entry.get("objects"); + rowsAndColumns.forEach(rowOrColumn -> { + JsonNode bbox = rowOrColumn.get("bbox"); + float rx0 = Float.valueOf(bbox.get(0).toString()); + float rx1 = Float.valueOf(bbox.get(2).toString()); + float ry0 = Float.valueOf(bbox.get(1).toString()); + float ry1 = Float.valueOf(bbox.get(3).toString()); + Rectangle2D rowOrColumnRectangle = new Rectangle(ry0, rx0, rx1 - rx0, ry1 - ry0); + rectangles.add(rowOrColumnRectangle); + }); + }); + PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, rectangles, PdfVisualisationUtility.Options.builder().strokeColor(Color.GREEN).strokeWidth(2).stroke(true).build()); + + }); + try (var out = new FileOutputStream(tmpFileName)) { + pdDocument.save(out); + } + + } + public ClassificationDocument buildClassificationDocument(PDDocument originDocument) { ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, originDocument, new ImageServiceResponse(), - new TableServiceResponse()); + new TableServiceResponse(), new TableExtractorResponse()); redactManagerClassificationService.classifyDocument(classificationDocument); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java index a35b401..493ec6e 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java @@ -33,6 +33,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePag import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter; import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService; import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService; @@ -66,7 +67,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, originDocument, new ImageServiceResponse(), - new TableServiceResponse()); + new TableServiceResponse(), new TableExtractorResponse()); redactManagerClassificationService.classifyDocument(classificationDocument); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java index ae7e418..06f73f7 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java @@ -20,6 +20,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Do import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor; import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService; @@ -79,11 +80,11 @@ public class RulingCleaningServiceTest extends BuildDocumentTest { Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, Loader.loadPDF(filename.toFile()), new ImageServiceResponse(), - new TableServiceResponse())); + new TableServiceResponse(), new TableExtractorResponse())); Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, Loader.loadPDF(filename.toFile()), new ImageServiceResponse(), - new TableServiceResponse())); + new TableServiceResponse(), new TableExtractorResponse())); DocumentData documentDataBefore = DocumentDataMapper.toDocumentData(documentGraphBefore); DocumentData documentDataAfter = DocumentDataMapper.toDocumentData(documentGraphAfter); if (!compareStructures(documentDataBefore.getDocumentStructure(), documentDataAfter.getDocumentStructure(), filename.getFileName().toString())) { diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java index dc9d0d6..5f15ec5 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java @@ -11,6 +11,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; @@ -28,7 +29,7 @@ public abstract class BuildDocumentTest extends AbstractTest { ClassPathResource fileResource = new ClassPathResource(filename); prepareStorage(filename); try (InputStream inputStream = fileResource.getInputStream(); PDDocument pdDocument = Loader.loadPDF(inputStream.readAllBytes())) { - return layoutParsingPipeline.parseLayout(layoutParsingType, pdDocument, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse()); + return layoutParsingPipeline.parseLayout(layoutParsingType, pdDocument, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse(), new TableExtractorResponse()); } }