From ff3dc76caaf79bdd05577fa134a002642cc76b5d Mon Sep 17 00:00:00 2001 From: yhampe Date: Thu, 1 Feb 2024 09:28:10 +0100 Subject: [PATCH] RED-7375 table extractor prototype created new branch because old one broke --- .../api/queue/LayoutParsingRequest.java | 3 + .../processor/LayoutParsingPipeline.java | 13 +++- .../LayoutParsingStorageService.java | 9 +++ .../TableExtractorResponseAdapter.java | 74 +++++++++++++++++++ .../model/table/ExtractedTable.java | 20 +++++ .../model/table/ExtractedTableData.java | 20 +++++ .../model/table/TableExtractorCells.java | 21 ++++++ .../model/table/TableExtractorData.java | 19 +++++ .../model/table/TableExtractorResponse.java | 23 ++++++ .../visualization/ViewerDocumentService.java | 33 ++++++++- 10 files changed, 232 insertions(+), 3 deletions(-) create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/adapter/TableExtractorResponseAdapter.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/ExtractedTable.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/ExtractedTableData.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/TableExtractorCells.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/TableExtractorData.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/TableExtractorResponse.java diff --git 
a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingRequest.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingRequest.java index c364bbc..71aa830 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingRequest.java +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingRequest.java @@ -20,6 +20,9 @@ public record LayoutParsingRequest( @NonNull String originFileStorageId,// @Schema(description = "Optional Path to the table extraction file.")// Optional tablesFileStorageId,// + + @Schema(description= "Optional Path to the the table parsing service file") + Optional tableExtractorFileId,// @Schema(description = "Optional Path to the image classification file.")// Optional imagesFileStorageId,// diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index ba1b1c2..739ded9 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -33,8 +33,11 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageB import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import 
com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter; import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter; +import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.TableExtractorResponseAdapter; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorCells; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService; import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService; @@ -71,6 +74,7 @@ import lombok.extern.slf4j.Slf4j; public class LayoutParsingPipeline { ImageServiceResponseAdapter imageServiceResponseAdapter; + TableExtractorResponseAdapter tableExtractorResponseAdapter; CvTableParsingAdapter cvTableParsingAdapter; LayoutParsingStorageService layoutParsingStorageService; SectionsBuilderService sectionsBuilderService; @@ -101,6 +105,11 @@ public class LayoutParsingPipeline { imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get()); } + TableExtractorResponse tableExtractorResponse = new TableExtractorResponse(); + if (layoutParsingRequest.tableExtractorFileId().isPresent()) { + tableExtractorResponse = layoutParsingStorageService.getExtractedTableFile(layoutParsingRequest.tableExtractorFileId().get()); + } + TableServiceResponse tableServiceResponse = new TableServiceResponse(); if (layoutParsingRequest.tablesFileStorageId().isPresent()) { tableServiceResponse = 
layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get()); @@ -119,8 +128,10 @@ public class LayoutParsingPipeline { layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph)); layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph)); + Map> extractedTableCells = tableExtractorResponseAdapter.buildExtractedTablesPerPage(tableExtractorResponse); + log.info("Creating viewer document for {}", layoutParsingRequest.identifier()); - viewerDocumentService.createViewerDocument(originFile, documentGraph, viewerDocumentFile, false); + viewerDocumentService.createViewerDocument(originFile, documentGraph, viewerDocumentFile, extractedTableCells,false); layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile); if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.TAAS)) { log.info("Building research document data for {}", layoutParsingRequest.identifier()); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java index 3082d54..3cb9766 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java @@ -20,6 +20,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.Si import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData; import 
com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.tenantcommons.TenantContext; @@ -62,6 +63,14 @@ public class LayoutParsingStorageService { } } + public TableExtractorResponse getExtractedTableFile(String storageId) throws IOException { + + try (InputStream inputStream = getObject(storageId)) { + TableExtractorResponse tableExtractorResponse = objectMapper.readValue(inputStream, TableExtractorResponse.class); + inputStream.close(); + return tableExtractorResponse; + } + } public TableServiceResponse getTablesFile(String storageId) throws IOException { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/adapter/TableExtractorResponseAdapter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/adapter/TableExtractorResponseAdapter.java new file mode 100644 index 0000000..4b8d553 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/adapter/TableExtractorResponseAdapter.java @@ -0,0 +1,74 @@ +package com.knecon.fforesight.service.layoutparser.processor.python_api.adapter; + +import java.awt.geom.Rectangle2D; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType; +import 
com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.ExtractedTable; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.ExtractedTableData; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableData; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorCells; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse; + +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; + +@Service +@Slf4j +@RequiredArgsConstructor +public class TableExtractorResponseAdapter { + + public Map> buildExtractedTablesPerPage(TableExtractorResponse tableExtractorResponse) { + Map> tableCells = new HashMap<>(); + java.util.Optional.ofNullable(tableExtractorResponse.getData()).orElse(List.of()) /* data is null for a default-constructed response (no table-extractor file supplied); treat it as "no tables" instead of throwing */ + .forEach(tableData -> tableCells.computeIfAbsent(tableData.getPage_number(), tableCell -> new ArrayList<>()) + .addAll(convertTableCells(tableData.getTables()))); + + return tableCells; + + } + + public List convertTableCells(List tableObjects) { + + List parsedTableCells = new ArrayList<>(); + + tableObjects.stream().forEach(t -> { + TableExtractorCells tableCells = new TableExtractorCells(); + tableCells.setX0(t.getTable().getBbox().get(0)); + tableCells.setX1(t.getTable().getBbox().get(2)); + tableCells.setY0(t.getTable().getBbox().get(1)); + tableCells.setY1(t.getTable().getBbox().get(3)); + tableCells.setWidth(tableCells.getX1()- tableCells.getX0()); + tableCells.setHeight(tableCells.getY1()- tableCells.getY0()); + tableCells.setLabel(t.getTable().getLabel()); + log.info("Parsed table cell {} with label {}",tableCells, tableCells.getLabel()); + 
parsedTableCells.add(tableCells); + t.getObjects().forEach(o -> { + TableExtractorCells objectCell = new TableExtractorCells(); + objectCell.setX0(o.getBbox().get(0)); /* each object is drawn from its own bbox, not the enclosing table's */ + objectCell.setX1(o.getBbox().get(2)); + objectCell.setY0(o.getBbox().get(1)); + objectCell.setY1(o.getBbox().get(3)); + objectCell.setWidth(objectCell.getX1()- objectCell.getX0()); + objectCell.setHeight(objectCell.getY1()- objectCell.getY0()); + objectCell.setLabel(o.getLabel()); + log.info("Parsed object cell {} with label {}",objectCell, objectCell.getLabel()); + parsedTableCells.add(objectCell); + }); + }); + + return parsedTableCells; + + } + + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/ExtractedTable.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/ExtractedTable.java new file mode 100644 index 0000000..12c594b --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/ExtractedTable.java @@ -0,0 +1,20 @@ +package com.knecon.fforesight.service.layoutparser.processor.python_api.model.table; + +import java.util.List; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; + +@Data +@Builder +@NoArgsConstructor +@AllArgsConstructor +public class ExtractedTable { + + private boolean rotated; + private ExtractedTableData table; + private List objects; + +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/ExtractedTableData.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/ExtractedTableData.java new file mode 100644 index 0000000..1e6d58d --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/ExtractedTableData.java @@ -0,0 +1,20 @@ +package com.knecon.fforesight.service.layoutparser.processor.python_api.model.table; + +import java.util.List; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; + +@Data +@Builder +@NoArgsConstructor +@AllArgsConstructor +public class ExtractedTableData { + + private String label; + private float score; + private List bbox; + +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/TableExtractorCells.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/TableExtractorCells.java new file mode 100644 index 0000000..74e9730 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/TableExtractorCells.java @@ -0,0 +1,21 @@ +package com.knecon.fforesight.service.layoutparser.processor.python_api.model.table; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; + +@Data +@Builder +@NoArgsConstructor +@AllArgsConstructor +public class TableExtractorCells { + private float x0; + private float y0; + private float x1; + private float y1; + private float width; + private float height; + private String label; + +} \ No newline at end of file diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/TableExtractorData.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/TableExtractorData.java new file mode 100644 index 0000000..61af72e --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/TableExtractorData.java @@ -0,0 +1,19 @@ +package com.knecon.fforesight.service.layoutparser.processor.python_api.model.table; + +import java.util.List; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; + +@Data +@Builder +@NoArgsConstructor +@AllArgsConstructor +public class TableExtractorData { + + private int page_number; + private int image; + private List tables; +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/TableExtractorResponse.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/TableExtractorResponse.java new file mode 100644 index 0000000..5f0a527 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/model/table/TableExtractorResponse.java @@ -0,0 +1,23 @@ +package com.knecon.fforesight.service.layoutparser.processor.python_api.model.table; + +import java.util.List; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; + +@Data +@Builder +@NoArgsConstructor +@AllArgsConstructor +public class TableExtractorResponse { + + private String dossierId; + private String fileId; + 
private String targetFileExtension; + private String responseFileExtension; + private String X_TENANT_ID; + private List data; + +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/ViewerDocumentService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/ViewerDocumentService.java index 2335517..56391af 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/ViewerDocumentService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/ViewerDocumentService.java @@ -1,5 +1,6 @@ package com.knecon.fforesight.service.layoutparser.processor.services.visualization; +import java.awt.Color; import java.awt.geom.AffineTransform; import java.awt.geom.Rectangle2D; import java.io.File; @@ -7,6 +8,8 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardCopyOption; +import java.util.List; +import java.util.Map; import org.apache.pdfbox.Loader; import org.apache.pdfbox.cos.COSName; @@ -31,6 +34,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.visualization. 
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.LayoutGrid; import com.knecon.fforesight.service.layoutparser.processor.model.visualization.PlacedText; import com.knecon.fforesight.service.layoutparser.processor.model.visualization.VisualizationsOnPage; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorCells; import io.micrometer.observation.Observation; import io.micrometer.observation.ObservationRegistry; @@ -54,13 +58,14 @@ public class ViewerDocumentService { @SneakyThrows @Observed(name = "ViewerDocumentService", contextualName = "create-viewer-document") - public void createViewerDocument(File originFile, Document document, File destinationFile, boolean layerVisibilityDefaultValue) { + public void createViewerDocument(File originFile, Document document, File destinationFile, Map> extractedTableCells, boolean layerVisibilityDefaultValue) { Path tmpFile = Files.createTempFile("tmpViewerDocument", ".pdf"); PDDocument pdDocument = openPDDocument(originFile); LayoutGrid layoutGrid = layoutGridService.createLayoutGrid(document); - PDOptionalContentGroup layer = addLayerToDocument(pdDocument, layerVisibilityDefaultValue); + PDOptionalContentGroup layer = addLayerToDocument(pdDocument, false); + PDOptionalContentGroup visualLayoutParsingLayer = addLayerToDocument(pdDocument, true); PDFont font = new PDType1Font(Standard14Fonts.FontName.HELVETICA); for (int pageNumber = 0; pageNumber < pdDocument.getNumberOfPages(); pageNumber++) { @@ -120,6 +125,30 @@ public class ViewerDocumentService { } contentStream.restoreGraphicsState(); contentStream.endMarkedContent(); + + contentStream.beginMarkedContent(COSName.OC, visualLayoutParsingLayer); + contentStream.saveGraphicsState(); + + contentStream.setLineWidth(LINE_WIDTH); + for (TableExtractorCells tableCells : extractedTableCells.getOrDefault(pageNumber, List.of())) { /* pages without extracted tables have no map entry; iterate nothing instead of dereferencing null */ + contentStream.setStrokingColor(new Color(0xB700FF)); + contentStream.addRect((float) 
tableCells.getX0(), (float) tableCells.getY0(), (float) tableCells.getWidth(), (float) tableCells.getHeight()); + contentStream.stroke(); + contentStream.setFont(font, FONT_SIZE); + contentStream.beginText(); + Matrix textMatrix = new Matrix((float) textDeRotationMatrix.getScaleX(), + (float) textDeRotationMatrix.getShearX(), + (float) textDeRotationMatrix.getShearY(), + (float) textDeRotationMatrix.getScaleY(), + tableCells.getX0() , + tableCells.getY0()); + textMatrix.translate(-((font.getStringWidth(tableCells.getLabel()) / 1000) * FONT_SIZE + (2 * LINE_WIDTH) + 4), -FONT_SIZE); + contentStream.setTextMatrix(textMatrix); + contentStream.showText(tableCells.getLabel()); + contentStream.endText(); + } + contentStream.restoreGraphicsState(); + contentStream.endMarkedContent(); } if (pageNumber % 500 == 0 && pageNumber != 0) { // re-open document every once in a while to save on RAM