From 3a700aecd433bb40e5f0ba2d2c52604a57e235f1 Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Wed, 18 Dec 2024 15:07:31 +0100 Subject: [PATCH] RED-8670: add table detection from idp result * some 'slight' refactoring --- .../api/queue/LayoutParsingRequest.java | 45 +-- .../build.gradle.kts | 2 + .../processor/LayoutParserSettings.java | 2 + .../processor/LayoutParsingPipeline.java | 142 +++---- .../LayoutParsingStorageService.java | 19 +- .../docstrum/DocstrumSegmentationService.java | 46 +-- .../processor/docstrum/model/BoundingBox.java | 33 +- .../docstrum/model/TextBoundingBox.java | 19 +- .../docstrum/service/LineBuilderService.java | 2 +- .../docstrum/service/ReadingOrderService.java | 81 +++- .../docstrum/service/ZoneBuilderService.java | 2 +- .../DividingColumnDetectionService.java | 11 +- .../GapDetectionService.java | 3 +- .../GapInformation.java | 2 +- .../GapsAcrossLinesService.java | 5 +- .../InvisibleTableDetectionService.java | 4 +- .../LineDetectionService.java | 4 +- .../LineInformation.java | 2 +- .../processor/model/AbstractPageBlock.java | 11 +- .../model/ClassificationDocument.java | 2 + .../processor/model/ClassificationPage.java | 49 ++- .../processor/model/PageContents.java | 9 +- .../processor/model/PageInformation.java | 67 +++- .../processor/model/SectionIdentifier.java | 3 +- .../{docstrum => }/model/UnionFind.java | 2 +- .../outline/OutlineExtractorService.java | 29 +- .../SectionTreeEnhancementService.java | 4 +- .../processor/model/table/Cell.java | 69 ++-- .../processor/model/table/CleanRulings.java | 6 + .../processor/model/table/Ruling.java | 9 + .../processor/model/table/TablePageBlock.java | 343 +++-------------- .../{ => text}/ClassificationFooter.java | 4 +- .../{ => text}/ClassificationHeader.java | 4 +- .../processor/model/text/RedTextPosition.java | 5 +- .../processor/model/text/TextPageBlock.java | 74 +++- .../processor/model/text/Word.java | 19 +- .../adapter/ImageServiceResponseAdapter.java | 95 +++-- 
.../services/BodyTextFrameService.java | 4 +- .../MainBodyTextFrameExtractionService.java | 25 -- .../services/PageContentExtractor.java | 218 ++++++++--- .../services/PageInformationService.java | 24 -- .../services/RulingCleaningService.java | 86 +++-- .../services/SectionsBuilderService.java | 29 +- .../services/TableExtractionService.java | 159 -------- .../BlockificationPostprocessingService.java | 2 +- .../blockification/BlockificationService.java | 42 +++ .../DocstrumBlockificationService.java | 95 ++--- .../DocuMineBlockificationService.java | 15 +- .../RedactManagerBlockificationService.java | 16 +- .../TableOfContentsClassificationService.java | 2 +- .../factory/DocumentGraphFactory.java | 21 +- .../SearchTextWithTextPositionFactory.java | 10 +- .../services/factory/SectionNodeFactory.java | 125 ++++--- .../services/factory/TableNodeFactory.java | 50 +-- .../services/factory/TextBlockFactory.java | 8 +- .../services/graphics/FindGraphicsRaster.java | 32 +- .../graphics/GraphicExtractorService.java | 36 +- .../services/mapper/OutlineMapper.java | 2 +- .../parsing/PDFLinesTextStripper.java | 13 +- .../services/tables/AreaSweepGridifier.java | 138 +++++++ .../RectangularIntersectionFinder.java | 11 +- .../tables}/RulingIntersectionFinder.java | 5 +- .../tables}/RulingTextDirAdjustUtil.java | 4 +- .../services/tables/TableAreaFiller.java | 109 ++++++ .../tables/TableExtractionService.java | 270 ++++++++++++++ .../tables/TableFromCellsExtractor.java | 133 +++++++ .../tables/TableGridStructureCalculator.java | 353 ++++++++++++++++++ .../tables}/TableMergingUtility.java | 2 +- .../visualization/IdpResultLayer.java | 113 ++++++ .../visualization/LayoutGridService.java | 17 +- .../services/visualization/LineUtils.java | 125 +++++++ .../processor/utils/BBoxMergingUtility.java | 34 -- .../processor/utils/CoordinateTransforms.java | 2 + .../processor/utils/GeometricComparators.java | 13 + .../processor/utils/PageInformation.java | 59 --- 
.../processor/utils/ProtobufUtil.java | 42 --- .../utils/RectangleTransformations.java | 10 +- .../utils/TextPositionOperations.java | 8 +- .../processor/utils/UnionFind.java | 44 --- .../visualization/LayoutDebugLayer.java | 11 +- .../processor/visualization/LayoutGrid.java | 25 +- .../services/tables/TableAreaFillerTest.java | 60 +++ .../layoutparser/server/AbstractTest.java | 9 + .../layoutparser/server/BdrJsonBuildTest.java | 12 +- .../server/BuildDocumentTest.java | 3 + .../HeadlinesGoldStandardIntegrationTest.java | 2 + .../server/LayoutparserEnd2EndTest.java | 31 +- .../server/OutlineDetectionTest.java | 3 + .../server/SimplifiedTextServiceTest.java | 2 + .../graph/DocumentGraphJsonWritingTest.java | 12 +- .../graph/DocumentGraphMappingTest.java | 2 +- .../server/graph/ViewerDocumentTest.java | 4 +- .../PdfSegmentationServiceTest.java | 197 +++++----- .../GapAcrossLinesDetectionServiceTest.java | 36 +- .../InvisibleTableDetectionServiceTest.java | 66 ---- ...ainBodyTextFrameExtractionServiceTest.java | 3 +- .../services/PageContentExtractorTest.java | 5 +- .../services/PageInformationServiceTest.java | 63 ---- .../services/RulingCleaningServiceTest.java | 9 +- .../services/RulingsClassifierTest.java | 15 +- .../service/viewerdoc/LayerIdentifier.java | 3 +- .../viewerdoc/layers/IdpLayerConfig.java | 1 + .../layers/LayoutDebugLayerConfig.java | 3 +- .../layers/LayoutGridLayerConfig.java | 3 +- .../model/Standard14EmbeddableFont.java | 5 +- 105 files changed, 2593 insertions(+), 1726 deletions(-) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{services => experimental}/DividingColumnDetectionService.java (89%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{services => experimental}/GapDetectionService.java (97%) rename 
layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{model => experimental}/GapInformation.java (87%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{services => experimental}/GapsAcrossLinesService.java (96%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{services => experimental}/InvisibleTableDetectionService.java (90%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{services => experimental}/LineDetectionService.java (96%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{model => experimental}/LineInformation.java (88%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{docstrum => }/model/UnionFind.java (91%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/{ => text}/ClassificationFooter.java (73%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/{ => text}/ClassificationHeader.java (73%) delete mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/MainBodyTextFrameExtractionService.java delete mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageInformationService.java delete mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java create mode 100644 
layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationService.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/AreaSweepGridifier.java rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{utils => services/tables}/RectangularIntersectionFinder.java (91%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{utils => services/tables}/RulingIntersectionFinder.java (97%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{utils => services/tables}/RulingTextDirAdjustUtil.java (92%) create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableAreaFiller.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableExtractionService.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableFromCellsExtractor.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableGridStructureCalculator.java rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{utils => services/tables}/TableMergingUtility.java (98%) create mode 100644 
layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/IdpResultLayer.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LineUtils.java delete mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/BBoxMergingUtility.java delete mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PageInformation.java delete mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/ProtobufUtil.java delete mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/UnionFind.java create mode 100644 layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableAreaFillerTest.java delete mode 100644 layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/InvisibleTableDetectionServiceTest.java delete mode 100644 layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageInformationServiceTest.java diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingRequest.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingRequest.java index 89f98b6..9bcb97a 100644 --- 
a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingRequest.java +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingRequest.java @@ -10,38 +10,23 @@ import lombok.NonNull; @Builder @Schema(description = "Object containing all storage paths the service needs to know.") public record LayoutParsingRequest( - @Schema(description = "Enum specifying the type of layout parsing to be performed.", allowableValues = "{RedactManager, DocuMine, TAAS}")// - @NonNull LayoutParsingType layoutParsingType, + @Schema(description = "Enum specifying the type of layout parsing to be performed.", allowableValues = "{RedactManager, DocuMine, TAAS}") @NonNull LayoutParsingType layoutParsingType, + @Schema(description = "General purpose identifiers. They are not changed by the service at all and are returned as is in the response queue.") Map identifier, + @Schema(description = "Path to the original PDF file.") @NonNull String originFileStorageId, - @Schema(description = "General purpose identifiers. 
They are not changed by the service at all and are returned as is in the response queue.")// - Map identifier, + @Schema(description = "Optional Path to the table extraction file.") Optional tablesFileStorageId, + @Schema(description = "Optional Path to the image classification file.") Optional imagesFileStorageId, + @Schema(description = "Path where the IDP Result File is stored.") Optional idpResultStorageId, + @Schema(description = "Optional Path to the the visual layout parsing service file") Optional visualLayoutParsingFileId, - @Schema(description = "Path to the original PDF file.")// - @NonNull String originFileStorageId,// - - @Schema(description = "Optional Path to the table extraction file.")// - Optional tablesFileStorageId,// - @Schema(description = "Optional Path to the image classification file.")// - Optional imagesFileStorageId,// - - @Schema(description = "Optional Path to the the visual layout parsing service file") Optional visualLayoutParsingFileId,// - - @Schema(description = "Path where the Document Structure File will be stored.")// - @NonNull String structureFileStorageId,// - @Schema(description = "Path where the Research Data File will be stored.")// - String researchDocumentStorageId,// - @Schema(description = "Path where the Document Text File will be stored.")// - @NonNull String textBlockFileStorageId,// - @Schema(description = "Path where the Document Positions File will be stored.")// - @NonNull String positionBlockFileStorageId,// - @Schema(description = "Path where the Document Pages File will be stored.")// - @NonNull String pageFileStorageId,// - @Schema(description = "Path where the Document Markdown File will be stored.")// - Optional documentMarkdownFileStorageId,// - @Schema(description = "Path where the Simplified Text File will be stored.")// - @NonNull String simplifiedTextStorageId,// - @Schema(description = "Path where the Viewer Document PDF will be stored.")// - @NonNull String viewerDocumentStorageId + 
@Schema(description = "Path where the Document Structure File will be stored.") @NonNull String structureFileStorageId, + @Schema(description = "Path where the Research Data File will be stored.") String researchDocumentStorageId, + @Schema(description = "Path where the Document Text File will be stored.") @NonNull String textBlockFileStorageId, + @Schema(description = "Path where the Document Positions File will be stored.") @NonNull String positionBlockFileStorageId, + @Schema(description = "Path where the Document Pages File will be stored.") @NonNull String pageFileStorageId, + @Schema(description = "Path where the Document Markdown File will be stored.") Optional documentMarkdownFileStorageId, + @Schema(description = "Path where the Simplified Text File will be stored.") @NonNull String simplifiedTextStorageId, + @Schema(description = "Path where the Viewer Document PDF will be stored.") @NonNull String viewerDocumentStorageId ) { } diff --git a/layoutparser-service/layoutparser-service-processor/build.gradle.kts b/layoutparser-service/layoutparser-service-processor/build.gradle.kts index cec9b41..4b50384 100644 --- a/layoutparser-service/layoutparser-service-processor/build.gradle.kts +++ b/layoutparser-service/layoutparser-service-processor/build.gradle.kts @@ -23,6 +23,8 @@ dependencies { } implementation("com.iqser.red.commons:storage-commons:2.50.0") + api("com.knecon.fforesight:azure-ocr-service-api:0.23.0") + implementation("org.apache.pdfbox:pdfbox:${pdfBoxVersion}") implementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}") implementation("com.fasterxml.jackson.module:jackson-module-afterburner:${jacksonVersion}") diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParserSettings.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParserSettings.java index f9ff6e4..a9b480a 100644 
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParserSettings.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParserSettings.java @@ -17,4 +17,6 @@ public class LayoutParserSettings { boolean debug; LayoutParsingType layoutParsingTypeOverride; + String pdftronLicense; + int extractionThreads = 1; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index 8d4f05d..57440d7 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -14,39 +14,39 @@ import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.concurrent.atomic.AtomicReference; +import java.util.function.Function; +import java.util.stream.Collectors; import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; -import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent; import org.springframework.beans.factory.annotation.Value; import org.springframework.stereotype.Service; import com.iqser.red.service.redaction.v1.server.mapper.DocumentDataMapper; -import com.iqser.red.service.redaction.v1.server.model.document.nodes.ImageType; import com.iqser.red.service.redaction.v1.server.model.document.nodes.NodeType; import 
com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character; -import com.knecon.fforesight.service.layoutparser.processor.model.DocumentWithVisualization; -import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; -import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection; -import com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationService; -import com.knecon.fforesight.service.layoutparser.processor.services.mapper.MarkdownMapper; +import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ReadingOrderService; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; - +import com.knecon.fforesight.service.layoutparser.processor.model.DocumentWithVisualization; +import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService; +import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTree; import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeBuilderService; import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeEnhancementService; -import 
com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTree; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter; @@ -56,24 +56,26 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.ima import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse; +import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor; import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService; import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService; import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService; -import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService; +import com.knecon.fforesight.service.layoutparser.processor.services.tables.TableExtractionService; import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier; import 
com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService; -import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService; -import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService; -import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService; +import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationService; +import com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationService; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; -import com.knecon.fforesight.service.layoutparser.processor.services.graphics.Box; import com.knecon.fforesight.service.layoutparser.processor.services.graphics.GraphicExtractorService; +import com.knecon.fforesight.service.layoutparser.processor.services.mapper.MarkdownMapper; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper; import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper; import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService; import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; -import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation; +import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation; import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations; +import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult; +import com.knecon.fforesight.service.ocr.v1.api.model.Table; import io.micrometer.observation.Observation; import io.micrometer.observation.ObservationRegistry; @@ -98,10 +100,8 @@ public class LayoutParsingPipeline { final 
SimplifiedSectionTextService simplifiedSectionTextService; final RulingCleaningService rulingCleaningService; final TableExtractionService tableExtractionService; - final DocuMineBlockificationService docuMineBlockificationService; - final RedactManagerBlockificationService redactManagerBlockificationService; + final BlockificationService blockificationService; final BlockificationPostprocessingService blockificationPostprocessingService; - final DocstrumBlockificationService docstrumBlockificationService; final LayoutGridService layoutGridService; final ObservationRegistry observationRegistry; final VisualLayoutParsingAdapter visualLayoutParsingAdapter; @@ -111,11 +111,11 @@ public class LayoutParsingPipeline { final SectionTreeEnhancementService sectionTreeEnhancementService; final LayoutParserSettings settings; final ClassificationService classificationService; + final ReadingOrderService readingOrderService; @Value("${LAYOUT_PARSER_VERSION:}") private String layoutParserVersion; - public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException { long start = System.currentTimeMillis(); @@ -134,14 +134,16 @@ public class LayoutParsingPipeline { TableServiceResponse tableServiceResponse = layoutParsingRequest.tablesFileStorageId() .map(layoutParsingStorageService::getTablesFile) .orElse(new TableServiceResponse()); + IdpResult idpResult = layoutParsingRequest.idpResultStorageId() + .map(layoutParsingStorageService::getIdpResultFile).orElse(IdpResult.empty()); - LayoutParsingType layoutParsingType = settings.getLayoutParsingTypeOverride() == null // - ? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(); + LayoutParsingType layoutParsingType = settings.getLayoutParsingTypeOverride() == null ? 
layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(); ClassificationDocument classificationDocument = parseLayout(layoutParsingType, originFile, imageServiceResponse, tableServiceResponse, + idpResult, visualLayoutParsingResponse, layoutParsingRequest.identifier()); @@ -159,7 +161,8 @@ public class LayoutParsingPipeline { if (layoutParsingRequest.documentMarkdownFileStorageId() .isPresent()) { layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId() - .get(), new MarkdownMapper().toMarkdownContent(documentWithVisualization.document())); + .get(), + new MarkdownMapper().toMarkdownContent(documentWithVisualization.document())); } layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentWithVisualization.document())); layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile); @@ -237,15 +240,22 @@ public class LayoutParsingPipeline { File originFile, ImageServiceResponse imageServiceResponse, TableServiceResponse tableServiceResponse, + IdpResult idpResult, VisualLayoutParsingResponse visualLayoutParsingResponse, Map identifier) { - PDDocument originDocument = openDocument(originFile); - addNumberOfPagesToTrace(originDocument.getNumberOfPages(), Files.size(originFile.toPath())); + PageContentExtractor extractor = new PageContentExtractor(originFile, settings.getExtractionThreads()); + extractor.startAsync(); + int pageCount = extractor.getPageCount(); + addNumberOfPagesToTrace(pageCount, Files.size(originFile.toPath())); Map> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse); - Map> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse); + Map> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse, idpResult); Map> signatures = 
visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse); + Function pageNumberExtractor = table -> table.bboxes().get(0).pageNumber(); + Map> idpTablesPerPage = idpResult.tables() + .stream() + .collect(Collectors.groupingBy(pageNumberExtractor)); ClassificationDocument classificationDocument = new ClassificationDocument(); @@ -255,32 +265,20 @@ public class LayoutParsingPipeline { List classificationPages = new ArrayList<>(); - classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument)); - - long pageCount = originDocument.getNumberOfPages(); + classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originFile)); for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) { - - if (pageNumber % 100 == 0) { - // re-open document every once in a while to save on RAM. This has no significant performance impact. - // This is due to PDFBox caching all images and some other stuff with Soft References. This dereferences them and forces the freeing of memory. 
- originDocument.close(); - originDocument = openDocument(originFile); - } - + PageContents pageContents = extractor.awaitPageContents(pageNumber); if (pageNumber % 100 == 0 || pageNumber == pageCount || pageNumber == 1) { - log.info("Extracting text on Page {} for {}", pageNumber, identifier); + log.info("Processing text on Page {} for {}", pageNumber, identifier); } classificationDocument.setPages(classificationPages); PDFLinesTextStripper stripper = new PDFLinesTextStripper(); - PDPage pdPage = originDocument.getPage(pageNumber - 1); - stripper.setPageNumber(pageNumber); - stripper.setStartPage(pageNumber); - stripper.setEndPage(pageNumber); - stripper.setPdpage(pdPage); - stripper.getText(originDocument); - List words = stripper.getWords(); + + List words = pageContents.getWords(); + List rulings = pageContents.getRulings(); + PageInformation pageInformation = pageContents.getPageInformation(); // rotateDirAdjExactly(words, pdPage); // works really well for many highly rotated documents (e.g. 
VV-331340.pdf), but it decreases the headline performance by 1.3%, so I am leaving it out for now @@ -291,39 +289,23 @@ public class LayoutParsingPipeline { } classificationDocument.getLayoutDebugLayer().addTextVisualizations(words, pageNumber); - PDRectangle pdr = pdPage.getMediaBox(); - - List rulings = stripper.getRulings(); classificationDocument.getLayoutDebugLayer().addRulingVisualization(rulings, pageNumber); CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), rulings); - PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, pdPage); List emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), pageInformation); - classificationDocument.getLayoutDebugLayer().addCellVisualizations(emptyTableCells, pageNumber); - + classificationDocument.getLayoutDebugLayer().addCellVisualizations(emptyTableCells, pageNumber, null); TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings); - List graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getWords(), false); + List tables = tableExtractionService.extractTables(emptyTableCells, words, pageInformation, idpTablesPerPage.get(pageNumber), layoutParsingType, classificationDocument.getLayoutDebugLayer()); - pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>()) - .addAll(graphics.stream() - .map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), - ImageType.GRAPHIC, - false, - stripper.getPageNumber(), - "")) - .toList()); + List graphics = graphicExtractorService.extractPathElementGraphics(pageContents.getGraphicBBoxes(), pageNumber, cleanRulings); + pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>()).addAll(graphics); - ClassificationPage classificationPage = switch (layoutParsingType) { - case REDACT_MANAGER_OLD -> 
redactManagerBlockificationService.blockify(stripper.getWords(), cleanRulings, classificationDocument.getLayoutDebugLayer()); - case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings); - case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> - docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getLayoutDebugLayer(), layoutParsingType); - case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> - docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getLayoutDebugLayer(), layoutParsingType); - }; + List textBlocks = blockificationService.blockify(layoutParsingType, words, cleanRulings, classificationDocument.getLayoutDebugLayer()); - updateClassificationPage(pdPage, pdr, classificationPage, cleanRulings, pageNumber, pageInformation); + List blocks = readingOrderService.resolve(textBlocks, tables); + + ClassificationPage classificationPage = new ClassificationPage(blocks, pageInformation, cleanRulings); blockificationPostprocessingService.findHeadlinesFromOutline(classificationDocument, pageNumber, classificationPage, pageInformation); @@ -345,16 +327,12 @@ public class LayoutParsingPipeline { } } - tableExtractionService.extractTables(emptyTableCells, classificationPage); - buildPageStatistics(classificationPage); increaseDocumentStatistics(classificationPage, classificationDocument); classificationPages.add(classificationPage); } - originDocument.close(); - classificationService.classify(classificationDocument, layoutParsingType, identifier); SectionTree sectionTree = sectionTreeBuilderService.createSectionTree(classificationDocument); @@ -371,24 +349,6 @@ public class LayoutParsingPipeline { } - private static void updateClassificationPage(PDPage pdPage, - PDRectangle pdr, - ClassificationPage classificationPage, - CleanRulings cleanRulings, - int pageNumber, - PageInformation pageInformation) { - - int rotation = 
pdPage.getRotation(); - boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270); - classificationPage.setCleanRulings(cleanRulings); - classificationPage.setRotation(rotation); - classificationPage.setLandscape(isLandscape); - classificationPage.setPageNumber(pageNumber); - classificationPage.setPageWidth((float) pageInformation.width()); - classificationPage.setPageHeight((float) pageInformation.height()); - } - - private static void rotateDirAdjExactly(List words, PDPage pdPage) { for (TextDirection dir : TextDirection.values()) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java index 857382b..3a1c363 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java @@ -25,6 +25,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse; +import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult; import com.knecon.fforesight.service.viewerdoc.service.ViewerDocVersioningUtility; import com.knecon.fforesight.tenantcommons.TenantContext; @@ -95,7 +96,23 @@ public class 
LayoutParsingStorageService { } -@SneakyThrows + + @SneakyThrows + public IdpResult getIdpResultFile(String storageId) { + + if (!storageService.objectExists(TenantContext.getTenantId(), storageId)) { + return IdpResult.empty(); + } + try (var idpResultStream = getObject(storageId)) { + + IdpResult idpResult = objectMapper.readValue(idpResultStream, IdpResult.class); + idpResultStream.close(); + return idpResult; + } + } + + + @SneakyThrows public VisualLayoutParsingResponse getVisualLayoutParsingFile(String storageId) { try (InputStream inputStream = getObject(storageId)) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/DocstrumSegmentationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/DocstrumSegmentationService.java index 476d4c3..d7f5dcd 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/DocstrumSegmentationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/DocstrumSegmentationService.java @@ -1,9 +1,7 @@ package com.knecon.fforesight.service.layoutparser.processor.docstrum; import java.util.ArrayList; -import java.util.EnumMap; import java.util.List; -import java.util.stream.Collectors; import org.springframework.stereotype.Service; @@ -16,10 +14,8 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.Rea import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.SpacingService; import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ZoneBuilderService; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; -import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; 
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; -import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer; import lombok.RequiredArgsConstructor; @@ -27,7 +23,6 @@ import lombok.RequiredArgsConstructor; @RequiredArgsConstructor public class DocstrumSegmentationService { - public static final double SAME_DIRECTION_THRESHOLD = 0.9; private final NearestNeighbourService nearestNeighbourService; private final SpacingService spacingService; private final LineBuilderService lineBuilderService; @@ -35,52 +30,27 @@ public class DocstrumSegmentationService { private final ReadingOrderService readingOrderService; - public List segmentPage(List textPositions, boolean xyOrder, CleanRulings usedRulings, LayoutDebugLayer visualizations) { + public List segmentPage(List words, boolean xyOrder, CleanRulings usedRulings) { - EnumMap directionCounts = new EnumMap<>(TextDirection.class); - - List newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.ZERO); - directionCounts.put(TextDirection.ZERO, newZones.size()); + List newZones = computeZones(words, usedRulings, TextDirection.ZERO); List zones = new ArrayList<>(newZones); - newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.QUARTER_CIRCLE); - directionCounts.put(TextDirection.QUARTER_CIRCLE, newZones.size()); + newZones = computeZones(words, usedRulings, TextDirection.QUARTER_CIRCLE); zones.addAll(newZones); - newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.HALF_CIRCLE); - directionCounts.put(TextDirection.HALF_CIRCLE, newZones.size()); + newZones = computeZones(words, usedRulings, TextDirection.HALF_CIRCLE); zones.addAll(newZones); - newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.THREE_QUARTER_CIRCLE); - 
directionCounts.put(TextDirection.THREE_QUARTER_CIRCLE, newZones.size()); + newZones = computeZones(words, usedRulings, TextDirection.THREE_QUARTER_CIRCLE); zones.addAll(newZones); - return readingOrderService.resolve(zones, xyOrder, mostSameDirection(directionCounts)); + return readingOrderService.resolve(zones, xyOrder); } - private boolean mostSameDirection(EnumMap directionCounts) { + private List computeZones(List words, CleanRulings rulings, TextDirection direction) { - int total = directionCounts.values() - .stream() - .mapToInt(i -> i).sum(); - - if ((double) directionCounts.get(TextDirection.ZERO) / total > SAME_DIRECTION_THRESHOLD) { - return true; - } else if ((double) directionCounts.get(TextDirection.QUARTER_CIRCLE) / total > SAME_DIRECTION_THRESHOLD) { - return true; - } else if ((double) directionCounts.get(TextDirection.HALF_CIRCLE) / total > SAME_DIRECTION_THRESHOLD) { - return true; - } else if ((double) directionCounts.get(TextDirection.THREE_QUARTER_CIRCLE) / total > SAME_DIRECTION_THRESHOLD) { - return true; - } - return false; - } - - - private List computeZones(List textPositions, CleanRulings rulings, LayoutDebugLayer visualizations, TextDirection direction) { - - List characters = textPositions.stream() + List characters = words.stream() .filter(t -> t.getDir() == direction) .map(Word::getCharacters) .flatMap(List::stream) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/BoundingBox.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/BoundingBox.java index f282671..5da554e 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/BoundingBox.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/BoundingBox.java @@ -1,7 +1,6 @@ package com.knecon.fforesight.service.layoutparser.processor.docstrum.model; import java.awt.geom.Rectangle2D; -import java.util.Comparator; import java.util.List; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; @@ -25,8 +24,6 @@ public abstract class BoundingBox { // Also, these are definitely correct and should be used whenever possible. protected Rectangle2D bBoxPdf; - protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f; - public double getX() { @@ -204,23 +201,22 @@ public abstract class BoundingBox { } - public double verticalOverlap(BoundingBox other) { + public double verticalOverlapPdf(BoundingBox other) { return Math.max(0, Math.min(this.getPdfMaxY(), other.getPdfMaxY()) - Math.max(this.getPdfMinY(), other.getPdfMinY())); } - public static final Comparator ILL_DEFINED_ORDER = (o1, o2) -> { + public double verticalOverlap(BoundingBox other) { - if (o1.equals(o2)) { - return 0; - } - if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD * ((o1.getHeight() + o2.getHeight()) / 2)) { - return Double.compare(o1.getPdfMinX(), o2.getPdfMinX()); - } else { - return Double.compare(o1.getPdfMaxY(), o2.getPdfMaxY()); - } - }; + return Math.max(0, Math.min(this.getMaxY(), other.getMaxY()) - Math.max(this.getMinY(), other.getMinY())); + } + + + public double horizontalOverlap(BoundingBox other) { + + return Math.max(0, Math.min(this.getMaxX(), other.getMaxX()) - Math.max(this.getMinX(), other.getMinX())); + } public double horizontalDistance(BoundingBox other) { @@ -276,4 +272,13 @@ public abstract class BoundingBox { return this.intersectsX(other) && this.getMinY() >= other.getMaxY(); } + + public double intersectedArea(BoundingBox r2) { + + double xOverlap = horizontalOverlap(r2); + double yOverlap = verticalOverlap(r2); + + return xOverlap 
* yOverlap; + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/TextBoundingBox.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/TextBoundingBox.java index 842cc7b..17bf4e1 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/TextBoundingBox.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/TextBoundingBox.java @@ -2,7 +2,9 @@ package com.knecon.fforesight.service.layoutparser.processor.docstrum.model; import java.awt.geom.Rectangle2D; import java.util.List; -import java.util.Set; +import java.util.Map; +import java.util.Optional; +import java.util.function.Function; import java.util.stream.Collectors; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection; @@ -36,19 +38,16 @@ public abstract class TextBoundingBox extends BoundingBox { .map(TextBoundingBox::getBBoxDirAdj) .collect(RectangleTransformations.collectBBox()); - Set textDirections = components.stream() + Optional mostCommonDir = components.stream() .filter(c -> c instanceof TextBoundingBox) .map(c -> (TextBoundingBox) c) .map(TextBoundingBox::getDir) - .collect(Collectors.toSet()); + .collect(Collectors.groupingBy(Function.identity(), Collectors.counting())).entrySet() + .stream() + .max(Map.Entry.comparingByValue()) + .map(Map.Entry::getKey); - if (textDirections.isEmpty()) { - dir = TextDirection.ZERO; - } else if (textDirections.size() > 1) { - throw new IllegalArgumentException("More than one text direction found"); - } else { - dir = textDirections.iterator().next(); - } + dir = mostCommonDir.orElse(TextDirection.ZERO); } diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/LineBuilderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/LineBuilderService.java index 31ccbcb..6c9f987 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/LineBuilderService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/LineBuilderService.java @@ -9,7 +9,7 @@ import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.AngleFilter; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line; -import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind; +import com.knecon.fforesight.service.layoutparser.processor.model.UnionFind; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; @Service diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ReadingOrderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ReadingOrderService.java index 2f57594..5b407c6 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ReadingOrderService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ReadingOrderService.java @@ -3,6 +3,7 @@ package 
com.knecon.fforesight.service.layoutparser.processor.docstrum.service; import java.awt.geom.Rectangle2D; import java.util.ArrayList; import java.util.Comparator; +import java.util.EnumMap; import java.util.HashMap; import java.util.List; import java.util.ListIterator; @@ -12,25 +13,43 @@ import java.util.stream.Collectors; import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox; -import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone; import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils; +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; @Service public class ReadingOrderService { private static final double THRESHOLD = 5; public static final double MULTI_COLUMN_DETECTION_THRESHOLD = 1.5; + public static final double SAME_DIRECTION_THRESHOLD = 0.9; private static final Comparator COMPARATOR = // - Comparator.comparing(TextBoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) - .thenComparing(TextBoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)); + Comparator.comparing(TextBoundingBox::getY, + (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) + .thenComparing(TextBoundingBox::getX, + (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)); private static final Comparator COMPARATOR_DIR_ADJ = // - Comparator.comparing(TextBoundingBox::getYDirAdj, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) + Comparator.comparing(TextBoundingBox::getYDirAdj, + (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) .thenComparing(TextBoundingBox::getXDirAdj, (o1, o2) -> 
DoubleUtils.compareDouble(o1, o2, THRESHOLD)); - public List resolve(List zones, boolean xyReadingOrder, boolean useDirAdjCoords) { + public List resolve(List textBlocks, List tables) { + + List unsortedBlocks = new ArrayList<>(textBlocks.size() + tables.size()); + unsortedBlocks.addAll(textBlocks); + unsortedBlocks.addAll(tables); + return resolve(unsortedBlocks, false); + } + + + public List resolve(List zones, boolean xyReadingOrder) { + + boolean useDirAdjCoords = mostSameDirection(zones); if (zones.isEmpty() || zones.size() == 1) { return zones; @@ -41,7 +60,7 @@ public class ReadingOrderService { } Map histogram = new HashMap<>(); - for (Zone zone : zones) { + for (TextBoundingBox zone : zones) { Rectangle2D bbox = useDirAdjCoords ? zone.getBBoxDirAdj() : zone.getBBox(); long minY = Math.round(bbox.getMinY()); long maxY = Math.round(bbox.getMaxY()); @@ -52,8 +71,7 @@ public class ReadingOrderService { if (histogram.values() .stream() - .mapToInt(Integer::intValue).average() - .orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) { + .mapToInt(Integer::intValue).average().orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) { return resolveSingleColumnReadingOrder(zones, useDirAdjCoords); } else { @@ -63,7 +81,7 @@ public class ReadingOrderService { } - private static List resolveSingleColumnReadingOrder(List zones, boolean useDirAdjCoords) { + private static List resolveSingleColumnReadingOrder(List zones, boolean useDirAdjCoords) { if (useDirAdjCoords) { return zones.stream() @@ -71,7 +89,7 @@ public class ReadingOrderService { .stream() .flatMap(words -> words.stream() .sorted(COMPARATOR_DIR_ADJ)) - .toList(); + .collect(Collectors.toList()); } zones.sort(COMPARATOR); @@ -79,7 +97,7 @@ public class ReadingOrderService { } - private List resolveMultiColumnReadingOder(List zones, boolean useDirAdjCoords) { + private List resolveMultiColumnReadingOder(List zones, boolean useDirAdjCoords) { // Simple reading order resolver for multi column page layout as described here : 
https://pub.towardsai.net/advanced-rag-02-unveiling-pdf-parsing-b84ae866344e // TODO implement a more fancy reading order resolver see https://github.com/BobLd/DocumentLayoutAnalysis/blob/master/README.md#reading-order @@ -87,7 +105,7 @@ public class ReadingOrderService { double minX = Double.POSITIVE_INFINITY; double maxX = Double.NEGATIVE_INFINITY; - for (Zone zone : zones) { + for (T zone : zones) { Rectangle2D bbox = useDirAdjCoords ? zone.getBBoxDirAdj() : zone.getBBox(); if (bbox.getX() < minX) { minX = zone.getXDirAdj(); @@ -99,11 +117,11 @@ public class ReadingOrderService { double midLineXCoordinate = (minX + maxX) / 2; - List leftOf = new ArrayList<>(); - List rightOf = new ArrayList<>(); - List middle = new ArrayList<>(); + List leftOf = new ArrayList<>(); + List rightOf = new ArrayList<>(); + List middle = new ArrayList<>(); - for (Zone zone : zones) { + for (T zone : zones) { Rectangle2D bbox = useDirAdjCoords ? zone.getBBoxDirAdj() : zone.getBBox(); if (bbox.getX() < midLineXCoordinate && bbox.getX() + bbox.getWidth() < midLineXCoordinate) { leftOf.add(zone); @@ -166,14 +184,14 @@ public class ReadingOrderService { middle.addAll(leftNotIntersecting); middle.addAll(rightNotIntersecting); */ - List sortedZones = new ArrayList<>(); + List sortedZones = new ArrayList<>(); sortedZones.addAll(leftOf); sortedZones.addAll(rightOf); - ListIterator itty = middle.listIterator(); + ListIterator itty = middle.listIterator(); while (itty.hasNext()) { - Zone current = itty.next(); + T current = itty.next(); Rectangle2D bbox = useDirAdjCoords ? 
current.getBBoxDirAdj() : current.getBBox(); for (int i = 0; i < sortedZones.size(); i++) { if (bbox.getY() < sortedZones.get(i).getY()) { @@ -189,4 +207,29 @@ public class ReadingOrderService { return sortedZones; } + + private boolean mostSameDirection(List zones) { + + EnumMap directionCounts = new EnumMap<>(TextDirection.class); + + for (TextBoundingBox zone : zones) { + TextDirection dir = zone.getDir(); + directionCounts.put(dir, directionCounts.getOrDefault(dir, 0) + 1); + } + int total = directionCounts.values() + .stream() + .mapToInt(i -> i).sum(); + + if ((double) directionCounts.getOrDefault(TextDirection.ZERO, 0) / total > SAME_DIRECTION_THRESHOLD) { + return true; + } else if ((double) directionCounts.getOrDefault(TextDirection.QUARTER_CIRCLE, 0) / total > SAME_DIRECTION_THRESHOLD) { + return true; + } else if ((double) directionCounts.getOrDefault(TextDirection.HALF_CIRCLE, 0) / total > SAME_DIRECTION_THRESHOLD) { + return true; + } else if ((double) directionCounts.getOrDefault(TextDirection.THREE_QUARTER_CIRCLE, 0) / total > SAME_DIRECTION_THRESHOLD) { + return true; + } + return false; + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ZoneBuilderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ZoneBuilderService.java index bfdcd5c..1bfe263 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ZoneBuilderService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ZoneBuilderService.java @@ -10,7 +10,7 @@ import java.util.stream.Collectors; import org.springframework.stereotype.Service; import 
com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line; -import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind; +import com.knecon.fforesight.service.layoutparser.processor.model.UnionFind; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/DividingColumnDetectionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/experimental/DividingColumnDetectionService.java similarity index 89% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/DividingColumnDetectionService.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/experimental/DividingColumnDetectionService.java index f1eeeb1..f3f149b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/DividingColumnDetectionService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/experimental/DividingColumnDetectionService.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.services; +package com.knecon.fforesight.service.layoutparser.processor.experimental; import java.awt.geom.Line2D; import java.awt.geom.Rectangle2D; @@ -7,7 +7,6 @@ import java.util.LinkedList; import java.util.List; import java.util.stream.Stream; -import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation; import 
com.knecon.fforesight.service.layoutparser.processor.model.PageContents; import lombok.experimental.UtilityClass; @@ -23,13 +22,13 @@ public class DividingColumnDetectionService { public List detectColumns(PageContents pageContents) { - if (pageContents.getSortedWords().size() < 2) { - return List.of(pageContents.getCropBox()); + if (pageContents.getWords().size() < 2) { + return List.of(pageContents.getPageInformation().cropBox()); } - GapInformation linesWithGapInformation = GapDetectionService.findGapsInLines(pageContents.getSortedWords(), pageContents.getCropBox()); + GapInformation linesWithGapInformation = GapDetectionService.findGapsInLines(pageContents.getWords(), pageContents.getPageInformation().cropBox()); - return detectColumnsFromLines(linesWithGapInformation.getXGaps(), pageContents.getCropBox()); + return detectColumnsFromLines(linesWithGapInformation.getXGaps(), pageContents.getPageInformation().cropBox()); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapDetectionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/experimental/GapDetectionService.java similarity index 97% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapDetectionService.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/experimental/GapDetectionService.java index 5d8cab8..40423ff 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapDetectionService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/experimental/GapDetectionService.java @@ -1,10 +1,9 @@ -package 
com.knecon.fforesight.service.layoutparser.processor.services; +package com.knecon.fforesight.service.layoutparser.processor.experimental; import java.awt.geom.Rectangle2D; import java.util.LinkedList; import java.util.List; -import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import lombok.AllArgsConstructor; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/GapInformation.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/experimental/GapInformation.java similarity index 87% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/GapInformation.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/experimental/GapInformation.java index f445b63..de2a818 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/GapInformation.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/experimental/GapInformation.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.model; +package com.knecon.fforesight.service.layoutparser.processor.experimental; import java.awt.geom.Rectangle2D; import java.util.LinkedList; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapsAcrossLinesService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/experimental/GapsAcrossLinesService.java similarity index 96% rename from 
layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapsAcrossLinesService.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/experimental/GapsAcrossLinesService.java index 94bcce2..0b426a8 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapsAcrossLinesService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/experimental/GapsAcrossLinesService.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.services; +package com.knecon.fforesight.service.layoutparser.processor.experimental; import java.awt.geom.Rectangle2D; import java.awt.geom.RectangularShape; @@ -6,9 +6,6 @@ import java.util.LinkedList; import java.util.List; import java.util.Queue; import java.util.stream.Stream; -import com.iqser.red.commons.jackson.ObjectMapperFactory; - -import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation; import lombok.AllArgsConstructor; import lombok.Getter; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/InvisibleTableDetectionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/experimental/InvisibleTableDetectionService.java similarity index 90% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/InvisibleTableDetectionService.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/experimental/InvisibleTableDetectionService.java index b9ea1c4..c1e628f 100644 
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/InvisibleTableDetectionService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/experimental/InvisibleTableDetectionService.java @@ -1,12 +1,10 @@ -package com.knecon.fforesight.service.layoutparser.processor.services; +package com.knecon.fforesight.service.layoutparser.processor.experimental; import java.awt.geom.Rectangle2D; import java.awt.geom.RectangularShape; import java.util.LinkedList; import java.util.List; -import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation; -import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import lombok.experimental.UtilityClass; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/LineDetectionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/experimental/LineDetectionService.java similarity index 96% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/LineDetectionService.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/experimental/LineDetectionService.java index 62a4a6b..47b270d 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/LineDetectionService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/experimental/LineDetectionService.java @@ -1,11 +1,9 @@ -package 
com.knecon.fforesight.service.layoutparser.processor.services; +package com.knecon.fforesight.service.layoutparser.processor.experimental; import java.awt.geom.Rectangle2D; import java.util.LinkedList; import java.util.List; -import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation; -import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/LineInformation.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/experimental/LineInformation.java similarity index 88% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/LineInformation.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/experimental/LineInformation.java index 600db39..d793425 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/LineInformation.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/experimental/LineInformation.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.model; +package com.knecon.fforesight.service.layoutparser.processor.experimental; import java.awt.geom.Rectangle2D; import java.util.List; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/AbstractPageBlock.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/AbstractPageBlock.java index 5679e44..ee53bfd 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/AbstractPageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/AbstractPageBlock.java @@ -1,12 +1,15 @@ package com.knecon.fforesight.service.layoutparser.processor.model; import java.util.HashSet; +import java.util.List; import java.util.Set; import com.fasterxml.jackson.annotation.JsonIgnore; import com.iqser.red.service.redaction.v1.server.model.document.nodes.LayoutEngine; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; +import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import lombok.AllArgsConstructor; import lombok.Data; @@ -17,18 +20,18 @@ import lombok.NoArgsConstructor; @AllArgsConstructor @NoArgsConstructor @EqualsAndHashCode(callSuper = true) -public abstract class AbstractPageBlock extends BoundingBox { +public abstract class AbstractPageBlock extends TextBoundingBox { @JsonIgnore protected PageBlockType classification; - Set engines = new HashSet<>(); + protected Set engines = new HashSet<>(); @JsonIgnore protected int page; @JsonIgnore - private Orientation orientation = Orientation.NONE; + protected Orientation orientation = Orientation.NONE; public abstract String getText(); @@ -42,4 +45,6 @@ public abstract class AbstractPageBlock extends BoundingBox { public abstract boolean isEmpty(); + public abstract List getWords(); + } diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java index 49b9800..1f188ce 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java @@ -5,6 +5,8 @@ import java.util.List; import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree; import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTree; +import com.knecon.fforesight.service.layoutparser.processor.model.text.ClassificationFooter; +import com.knecon.fforesight.service.layoutparser.processor.model.text.ClassificationHeader; import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText; import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java index dcd9315..a7036b6 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java @@ 
-1,5 +1,6 @@ package com.knecon.fforesight.service.layoutparser.processor.model; +import java.awt.geom.AffineTransform; import java.awt.geom.Rectangle2D; import java.util.ArrayList; import java.util.HashMap; @@ -11,29 +12,38 @@ import com.knecon.fforesight.service.layoutparser.processor.model.image.Classifi import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; +import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms; import lombok.Data; import lombok.NonNull; -import lombok.RequiredArgsConstructor; @Data -@RequiredArgsConstructor - public class ClassificationPage { + public ClassificationPage(List pageBlocks, PageInformation pageInformation, CleanRulings cleanRulings) { + + this.cleanRulings = cleanRulings; + this.pageNumber = pageInformation.number(); + this.textBlocks = pageBlocks; + var mediaBox = pageInformation.mediabox(); + int rotation = pageInformation.rotationDegrees(); + this.landscape = mediaBox.getWidth() > mediaBox.getHeight() && (rotation == 0 || rotation == 180) // + || mediaBox.getHeight() > mediaBox.getWidth() && (rotation == 90 || rotation == 270); + this.pageInformation = pageInformation; + } + + + private PageInformation pageInformation; @NonNull private List textBlocks; private List outlineObjects = new ArrayList<>(); - private List headlines = new ArrayList<>(); - private List images = new ArrayList<>(); private Rectangle bodyTextFrame; private boolean landscape; - private int rotation; private int pageNumber; @@ -42,11 +52,32 @@ public class ClassificationPage { private StringFrequencyCounter fontCounter = new StringFrequencyCounter(); private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter(); - private float pageWidth; - private float pageHeight; - private CleanRulings 
cleanRulings; private Map> markedContentBboxPerType = new HashMap<>(); + + public AffineTransform getPdfToPageTransform() { + + return CoordinateTransforms.calculateInitialUserSpaceCoordsToPageCoords(getPageInformation()); + } + + + public int getRotation() { + + return pageInformation.rotationDegrees(); + } + + + public float getPageWidth() { + + return (float) pageInformation.width(); + } + + + public float getPageHeight() { + + return (float) pageInformation.height(); + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageContents.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageContents.java index d4617ce..06c2e16 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageContents.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageContents.java @@ -1,10 +1,10 @@ package com.knecon.fforesight.service.layoutparser.processor.model; -import java.awt.geom.Rectangle2D; import java.util.List; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; +import com.knecon.fforesight.service.layoutparser.processor.services.graphics.Box; import lombok.AllArgsConstructor; import lombok.Builder; @@ -15,8 +15,9 @@ import lombok.Getter; @AllArgsConstructor public class PageContents { - List sortedWords; - Rectangle2D cropBox; - Rectangle2D mediaBox; + PageInformation pageInformation; + List words; List rulings; + List graphicBBoxes; + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageInformation.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageInformation.java index 9080937..c2972cf 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageInformation.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageInformation.java @@ -2,16 +2,63 @@ package com.knecon.fforesight.service.layoutparser.processor.model; import java.awt.geom.Rectangle2D; -import lombok.AllArgsConstructor; -import lombok.Getter; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.common.PDRectangle; -@Getter -@AllArgsConstructor -public class PageInformation { +import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page; - PageContents pageContents; - LineInformation lineInformation; - Rectangle2D mainBodyTextFrame; - GapInformation gapInformation; +public record PageInformation(Rectangle2D mediabox, Rectangle2D cropBox, int number, int rotationDegrees) { -} + public static PageInformation fromPDPage(int pageNum, PDPage page) { + + PDRectangle mediaBox = page.getMediaBox(); + PDRectangle cropBox = page.getCropBox(); + return new PageInformation(new Rectangle2D.Double(mediaBox.getLowerLeftX(), mediaBox.getLowerLeftY(), mediaBox.getWidth(), mediaBox.getHeight()), + new Rectangle2D.Double(cropBox.getLowerLeftX(), cropBox.getLowerLeftY(), cropBox.getWidth(), cropBox.getHeight()), + pageNum, + page.getRotation()); + } + + + public static PageInformation fromPage(Page page) { + + return new PageInformation(new Rectangle2D.Double(0, 0, page.getWidth(), page.getHeight()), + new Rectangle2D.Double(0, 0, page.getWidth(), page.getHeight()), + page.getNumber(), + page.getRotation()); + } + + + public double height() { + + return mediabox.getHeight(); + } + + + public double heightRot() { + + if (rotationDegrees == 
90 || rotationDegrees == 270) { + return width(); + } + return height(); + } + + + public double width() { + + return mediabox.getWidth(); + } + + + public double minX() { + + return mediabox.getX(); + } + + + public double minY() { + + return mediabox.getY(); + } + +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/SectionIdentifier.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/SectionIdentifier.java index f828180..51c5a66 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/SectionIdentifier.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/SectionIdentifier.java @@ -4,6 +4,7 @@ import java.util.Collections; import java.util.LinkedList; import java.util.List; import java.util.Locale; +import java.util.Objects; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -77,7 +78,7 @@ public class SectionIdentifier { List identifiers = new LinkedList<>(); for (int i = 1; i <= 4; i++) { String numericalIdentifier = numericalIdentifierMatcher.group(i); - if (numericalIdentifier == null || numericalIdentifier.equals("0") || numericalIdentifier.isEmpty() || numericalIdentifier.isBlank()) { + if (numericalIdentifier == null || Objects.equals(numericalIdentifier, "0") || numericalIdentifier.isBlank()) { break; } identifiers.add(Integer.parseInt(numericalIdentifier.trim())); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/UnionFind.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/UnionFind.java similarity index 91% 
rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/UnionFind.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/UnionFind.java index 78e8142..bec0d49 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/UnionFind.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/UnionFind.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.docstrum.model; +package com.knecon.fforesight.service.layoutparser.processor.model; import java.util.Collection; import java.util.LinkedHashMap; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineExtractorService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineExtractorService.java index 80f2370..df7eb82 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineExtractorService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/OutlineExtractorService.java @@ -2,12 +2,14 @@ package com.knecon.fforesight.service.layoutparser.processor.model.outline; import java.awt.geom.AffineTransform; import java.awt.geom.Point2D; +import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Locale; import java.util.Optional; +import org.apache.pdfbox.Loader; import org.apache.pdfbox.cos.COSArray; import org.apache.pdfbox.cos.COSBase; 
import org.apache.pdfbox.cos.COSDictionary; @@ -28,7 +30,7 @@ import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlin import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms; -import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation; +import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation; import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; @@ -48,19 +50,22 @@ public class OutlineExtractorService { @SneakyThrows - public OutlineObjectTree getOutlineObjectTree(PDDocument document) { + public OutlineObjectTree getOutlineObjectTree(File documentFile) { - PDDocumentOutline documentOutline = document.getDocumentCatalog().getDocumentOutline(); + try (var document = Loader.loadPDF(documentFile)) { - List rootNodes = new ArrayList<>(); - if (documentOutline != null) { - for (PDOutlineItem child : documentOutline.children()) { - Optional outlineObjectWithChildren = createOutlineObjectWithChildren(child, document, 1); - outlineObjectWithChildren.ifPresent(rootNodes::add); + PDDocumentOutline documentOutline = document.getDocumentCatalog().getDocumentOutline(); + + List rootNodes = new ArrayList<>(); + if (documentOutline != null) { + for (PDOutlineItem child : documentOutline.children()) { + Optional outlineObjectWithChildren = createOutlineObjectWithChildren(child, document, 1); + outlineObjectWithChildren.ifPresent(rootNodes::add); + } } - } - return new OutlineObjectTree(rootNodes); + return new OutlineObjectTree(rootNodes); + } } @@ -128,9 +133,7 @@ public class OutlineExtractorService { log.info(String.format("Error occurred during position resolution for outline item on page %s with title %s: " + e, pageNumber, title)); } - return Optional.of(new OutlineObjectTreeNode(new OutlineObject(title, - pageNumber, - transformPointToPageCoords(outlinePosition, userSpaceToPageCoords), depth))); + return 
Optional.of(new OutlineObjectTreeNode(new OutlineObject(title, pageNumber, transformPointToPageCoords(outlinePosition, userSpaceToPageCoords), depth))); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/SectionTreeEnhancementService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/SectionTreeEnhancementService.java index af4d6a2..e3fb0a6 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/SectionTreeEnhancementService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/outline/SectionTreeEnhancementService.java @@ -10,8 +10,8 @@ import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; -import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter; -import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader; +import com.knecon.fforesight.service.layoutparser.processor.model.text.ClassificationFooter; +import com.knecon.fforesight.service.layoutparser.processor.model.text.ClassificationHeader; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Cell.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Cell.java index 9faee3b..99c4fbe 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Cell.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Cell.java @@ -4,13 +4,15 @@ import java.awt.geom.AffineTransform; import java.awt.geom.Point2D; import java.awt.geom.Rectangle2D; import java.util.ArrayList; -import java.util.Iterator; +import java.util.Collection; import java.util.List; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities; +import com.knecon.fforesight.service.ocr.v1.api.model.TableCell; import lombok.Data; import lombok.EqualsAndHashCode; @@ -22,7 +24,7 @@ import lombok.NoArgsConstructor; @NoArgsConstructor public class Cell extends BoundingBox { - private List textBlocks = new ArrayList<>(); + private List textBlocks = new ArrayList<>(); private List headerCells = new ArrayList<>(); @@ -33,17 +35,41 @@ public class Cell extends BoundingBox { private int pageNumber; - public Cell(Point2D topLeft, Point2D bottomRight) { + public Cell(Point2D topLeft, Point2D bottomRight, AffineTransform pdfToPageTransform) { this.bBoxPdf = new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), (bottomRight.getX() - topLeft.getX()), (bottomRight.getY() - topLeft.getY())); - this.bBox = bBoxPdf; + 
this.bBox = RectangleTransformations.transform(bBoxPdf, pdfToPageTransform); } - public Cell(Rectangle2D bBoxInitialUserSpace, AffineTransform initialUserSpaceToJava) { + public static Cell fromPageCoordinates(Point2D topLeft, Point2D bottomRight, AffineTransform pageToPdfTransform) { + + var bBox = new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), (bottomRight.getX() - topLeft.getX()), (bottomRight.getY() - topLeft.getY())); + return fromPageCoordinates(bBox, pageToPdfTransform); + } + + + public static Cell fromPageCoordinates(Rectangle2D r, AffineTransform pageToPdfTransform) { + + Cell cell = new Cell(); + var bBoxPdf = RectangleTransformations.transform(r, pageToPdfTransform); + cell.bBox = r; + cell.bBoxPdf = bBoxPdf; + return cell; + } + + + public Cell(TableCell tableCell, AffineTransform pdfToPageTransform) { + + this.bBoxPdf = tableCell.textRegion().region().bbox().get().getBounds2D(); + this.bBox = RectangleTransformations.transform(bBoxPdf, pdfToPageTransform); + } + + + public Cell(Rectangle2D bBoxInitialUserSpace, AffineTransform pdfToPageTransform) { this.bBoxPdf = bBoxInitialUserSpace; - this.bBox = initialUserSpaceToJava.createTransformedShape(bBoxInitialUserSpace).getBounds2D(); + this.bBox = RectangleTransformations.transform(bBoxPdf, pdfToPageTransform); } @@ -56,9 +82,12 @@ public class Cell extends BoundingBox { } - public void addTextBlock(TextPageBlock textBlock) { + public List getWords() { - textBlocks.add(textBlock); + return getTextBlocks().stream() + .map(AbstractPageBlock::getWords) + .flatMap(Collection::stream) + .toList(); } @@ -67,24 +96,12 @@ public class Cell extends BoundingBox { StringBuilder sb = new StringBuilder(); - Iterator itty = textBlocks.iterator(); - Word previous = null; - while (itty.hasNext()) { - - TextPageBlock textBlock = itty.next(); - - for (Word word : textBlock.getWords()) { - if (previous != null) { - if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) { - 
sb.append('\n'); - } else { - sb.append(' '); - } - } - sb.append(word.toString()); - previous = word; + for (int i = 0; i < textBlocks.size(); i++) { + AbstractPageBlock textBlock = textBlocks.get(i); + sb.append(textBlock); + if (i < textBlocks.size() - 1) { + sb.append("\n"); } - } return TextNormalizationUtilities.cleanString(sb.toString()); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/CleanRulings.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/CleanRulings.java index 8c698d8..333df96 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/CleanRulings.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/CleanRulings.java @@ -22,6 +22,12 @@ public class CleanRulings { List verticals; // unmodifiable sorted by X list + public static CleanRulings empty() { + + return new CleanRulings(Collections.emptyList(), Collections.emptyList()); + } + + public CleanRulings(List horizontals, List verticals) { this.horizontals = horizontals.stream() diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Ruling.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Ruling.java index e910ff1..d4938ad 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Ruling.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Ruling.java @@ -30,15 +30,24 @@ public class Ruling extends 
Line2D.Float { OTHER } + public enum Style { + SOLID, + DASHED + } + @Getter @Setter private Classification classification; + @Getter + @Setter + private Style style; public Ruling(Point2D p1, Point2D p2) { super(p1, p2); this.classification = Classification.OTHER; + this.style = Style.SOLID; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java index 6894336..dc593f2 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java @@ -1,48 +1,48 @@ package com.knecon.fforesight.service.layoutparser.processor.model.table; -import java.awt.geom.Point2D; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashSet; +import java.util.Collection; import java.util.List; -import java.util.Set; -import java.util.TreeMap; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; +import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import lombok.Getter; -import lombok.Setter; import lombok.extern.slf4j.Slf4j; 
@Slf4j +@Getter public class TablePageBlock extends AbstractPageBlock { - public static final double CELL_AREA_CONTAINED_THRESHOLD = 0.98; - private final TreeMap cellTreeMap = new TreeMap<>(); + private final TextPageBlock caption; - private final int rotation; - @Getter - @Setter - private String headline; - private int unrotatedRowCount; - private int unrotatedColCount; - private List> rows; - @Getter - @Setter - private List cells; + private final List> rows; - public TablePageBlock(List cells, int rotation) { + public TablePageBlock(TextPageBlock caption, List> rows) { - setToBBoxOfComponents(cells); - this.cells = cells; - addCells(cells); - classification = PageBlockType.TABLE; - this.rotation = rotation; + this.classification = PageBlockType.TABLE; + this.caption = caption; + this.rows = rows; + setBBoxes(); + } + + + private void setBBoxes() { + + List components = Stream.of(getCells().stream(), + getCells().stream() + .map(Cell::getTextBlocks) + .flatMap(Collection::stream)) + .flatMap(Function.identity()) + .map(o -> (BoundingBox) o) + .toList(); + setToBBoxOfComponents(components); } @@ -53,28 +53,19 @@ public class TablePageBlock extends AbstractPageBlock { } - public List> getRows() { - - if (rows == null) { - rows = computeRows(); - - // Ignore rows that does not contain any cells and values. 
- List> rowsToRemove = new ArrayList<>(); - for (List row : rows) { - if (row.size() == 1 && row.get(0).getTextBlocks().isEmpty()) { - rowsToRemove.add(row); - } - } - rows.removeAll(rowsToRemove); - - computeHeaders(); - } - - return rows; + @Override + public List getWords() { + return getCells().stream() + .map(Cell::getTextBlocks) + .flatMap(Collection::stream) + .map(AbstractPageBlock::getWords) + .flatMap(Collection::stream) + .toList(); } + public int getRowCount() { return getRows().size(); @@ -85,259 +76,16 @@ public class TablePageBlock extends AbstractPageBlock { return getRows().stream() .mapToInt(List::size) - .max() - .orElse(0); + .max().orElse(0); } - /** - * Detect header cells (either first row or first column): - * Column is marked as header if originalCell text is bold and row originalCell text is not bold. - * Defaults to row. - */ - private void computeHeaders() { - - if (rows == null) { - rows = computeRows(); - } - // A bold originalCell is a header originalCell as long as every originalCell to the left/top is bold, too - // we move from left to right and top to bottom - for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) { - List rowCells = rows.get(rowIndex); - if (rowCells.size() == 1) { - continue; - } - - for (int colIndex = 0; colIndex < rowCells.size(); colIndex++) { - Cell cell = rowCells.get(colIndex); - List cellsToTheLeft = rowCells.subList(0, colIndex); - Cell lastHeaderCell = null; - for (Cell leftCell : cellsToTheLeft) { - if (leftCell.isHeaderCell()) { - lastHeaderCell = leftCell; - } else { - break; - } - } - if (lastHeaderCell != null) { - cell.getHeaderCells().add(lastHeaderCell); - } - List cellsToTheTop = new ArrayList<>(); - for (int i = 0; i < rowIndex; i++) { - try { - cellsToTheTop.add(rows.get(i) - .get(colIndex)); - } catch (IndexOutOfBoundsException e) { - log.debug("No cell {} in row {}, ignoring.", colIndex, rowIndex); - } - } - for (Cell topCell : cellsToTheTop) { - if (topCell.isHeaderCell()) { - 
lastHeaderCell = topCell; - } else { - break; - } - } - if (lastHeaderCell != null) { - cell.getHeaderCells().add(lastHeaderCell); - } - if (!cell.getTextBlocks().isEmpty() && cell.getTextBlocks() - .get(0).getMostPopularWordStyle().equals("bold")) { - cell.setHeaderCell(true); - } - } - } - - } - - - private List> computeRows() { - - List> rows = new ArrayList<>(); - if (rotation == 90) { - for (int i = 0; i < unrotatedColCount; i++) { // rows - List lastRow = new ArrayList<>(); - for (int j = unrotatedRowCount - 1; j >= 0; j--) { // cols - Cell cell = cellTreeMap.get(new CellPosition(j, i)); - if (cell != null) { - lastRow.add(cell); - } - } - rows.add(lastRow); - } - } else if (rotation == 270) { - for (int i = unrotatedColCount - 1; i >= 0; i--) { // rows - List lastRow = new ArrayList<>(); - for (int j = 0; j < unrotatedRowCount; j++) { // cols - Cell cell = cellTreeMap.get(new CellPosition(j, i)); - if (cell != null) { - lastRow.add(cell); - } - } - rows.add(lastRow); - } - } else { - for (int i = 0; i < unrotatedRowCount; i++) { - List lastRow = new ArrayList<>(); - for (int j = 0; j < unrotatedColCount; j++) { - Cell cell = cellTreeMap.get(new CellPosition(i, j)); // JAVA_8 use getOrDefault() - if (cell != null) { - lastRow.add(cell); - } - } - rows.add(lastRow); - } - } - - return rows; - - } - - - private void addCells(List cells) { - - if (cells.isEmpty()) { - return; - } - - cells.removeIf(cell -> cell.getWidth() < 1.1 || cell.getHeight() < 1.1); - - List> rowsOfCellsMatrix = calculateTableStructure(cells); - - for (int i = 0; i < rowsOfCellsMatrix.size(); i++) { - for (int j = 0; j < rowsOfCellsMatrix.get(i).size(); j++) { - addCellToRowAndCol(rowsOfCellsMatrix.get(i) - .get(j), i, j); - } - } - - } - - - /** - * Calculates the structure of the table. For spanning rows and columns multiple cells with the same values will be inserted. 
- * - * @param cells The found cells - * @return TablePageBlock Structure as a rows of cells matrix - */ - private List> calculateTableStructure(List cells) { - - if (cells.isEmpty()) { - return new ArrayList<>(); - } - - Set uniqueX = new HashSet<>(); - Set uniqueY = new HashSet<>(); - cells.stream() - .filter(c -> !c.getTextBlocks().isEmpty() || c.getHeight() > 3 && c.getWidth() > 3) - .forEach(c -> { - uniqueX.add(c.getPdfMinX()); - uniqueX.add(c.getPdfMaxX()); - uniqueY.add(c.getPdfMinY()); - uniqueY.add(c.getPdfMaxY()); - }); - - var sortedUniqueX = uniqueX.stream() - .sorted() - .toList(); - var sortedUniqueY = uniqueY.stream() - .sorted() - .toList(); - - List> rowsOfCells = new ArrayList<>(); - - Double prevY = null; - - for (Double y : sortedUniqueY) { - - List row = new ArrayList<>(); - - Double prevX = null; - for (Double x : sortedUniqueX) { - - if (prevY != null && prevX != null) { - var cellFromGridStructure = new Cell(new Point2D.Double(prevX, prevY), new Point2D.Double(x, y)); - - if (cellFromGridStructure.hasMinimumSize()) { - - cells.stream() - .map(originalCell -> new CellWithIntersection(originalCell, - RectangleTransformations.calculateIntersectedArea(cellFromGridStructure.getBBoxPdf(), - originalCell.getBBoxPdf()))) - .filter(cellWithIntersection -> cellWithIntersection.intersectedArea > 0) - .filter(cellWithIntersection -> cellWithIntersection.originalCell.getArea() > cellWithIntersection.intersectedArea * CELL_AREA_CONTAINED_THRESHOLD) - .max(Comparator.comparing(CellWithIntersection::intersectedArea)) - .map(CellWithIntersection::originalCell) - .ifPresent(matchingCell -> cellFromGridStructure.getTextBlocks().addAll(matchingCell.getTextBlocks())); - - row.add(cellFromGridStructure); - } - } - prevX = x; - } - - // exclude empty rows and rows where all text blocks are empty - if (prevY != null && prevX != null && !row.isEmpty() && !row.stream() - .allMatch(cell -> cell.getTextBlocks().isEmpty())) { - - rowsOfCells.add(row); - } - prevY = y; 
- } - - Collections.reverse(rowsOfCells); - - // now cells are removed which are part of a column without any text blocks - // this is done by first computing the inverse matrix which contains call columns of cells - // then the column indices that have to be removed are determined - List> columnsOfCells = new ArrayList<>(); - int maxRowLength = rowsOfCells.stream() - .map(List::size) - .max(java.util.Comparator.naturalOrder()) - .orElse(0); - for (int i = 0; i < maxRowLength; i++) { - columnsOfCells.add(new ArrayList<>()); - } - - for (List row : rowsOfCells) { - for (int j = 0; j < row.size(); j++) { - columnsOfCells.get(j).add(row.get(j)); - } - } - - List columnIndicesToRemove = new ArrayList<>(); - int columnIndex = 0; - for (List col : columnsOfCells) { - if (col.stream() - .allMatch(cell -> cell.getTextBlocks().isEmpty())) { - columnIndicesToRemove.add(columnIndex); - } - columnIndex++; - } - columnIndicesToRemove.sort(Collections.reverseOrder()); - - // update all rows so that the values of the empty columns get removed - var rowsOfCellsBefore = new ArrayList<>(rowsOfCells); - rowsOfCells = new ArrayList<>(); - for (List row : rowsOfCellsBefore) { - var updatedRow = new ArrayList<>(row); - columnIndicesToRemove.forEach(idxToRemove -> updatedRow.remove(updatedRow.get(idxToRemove))); - rowsOfCells.add(updatedRow); - } - - return rowsOfCells; - } - - - private void addCellToRowAndCol(Cell cell, int row, int col) { - - unrotatedRowCount = Math.max(unrotatedRowCount, row + 1); - unrotatedColCount = Math.max(unrotatedColCount, col + 1); - - CellPosition cp = new CellPosition(row, col); - cellTreeMap.put(cp, cell); + public List getCells() { + return getRows().stream() + .flatMap(List::stream) + .collect(Collectors.toList()); } @@ -360,7 +108,7 @@ public class TablePageBlock extends AbstractPageBlock { } if (column != null && column.getTextBlocks() != null) { boolean first = true; - for (TextPageBlock textBlock : column.getTextBlocks()) { + for (AbstractPageBlock 
textBlock : column.getTextBlocks()) { if (!first) { sb.append("\n"); } @@ -392,7 +140,7 @@ public class TablePageBlock extends AbstractPageBlock { sb.append(i == 0 ? "\n" : "\n"); if (column != null && column.getTextBlocks() != null) { boolean first = true; - for (TextPageBlock textBlock : column.getTextBlocks()) { + for (AbstractPageBlock textBlock : column.getTextBlocks()) { if (!first) { sb.append("
"); } @@ -411,9 +159,4 @@ public class TablePageBlock extends AbstractPageBlock { return sb.toString(); } - - record CellWithIntersection(Cell originalCell, double intersectedArea) { - - } - } \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationFooter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/ClassificationFooter.java similarity index 73% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationFooter.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/ClassificationFooter.java index c910293..139c68f 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationFooter.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/ClassificationFooter.java @@ -1,9 +1,7 @@ -package com.knecon.fforesight.service.layoutparser.processor.model; +package com.knecon.fforesight.service.layoutparser.processor.model.text; import java.util.List; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; - import lombok.AllArgsConstructor; import lombok.Data; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationHeader.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/ClassificationHeader.java similarity index 73% rename from 
layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationHeader.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/ClassificationHeader.java index e161801..f1d634f 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationHeader.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/ClassificationHeader.java @@ -1,9 +1,7 @@ -package com.knecon.fforesight.service.layoutparser.processor.model; +package com.knecon.fforesight.service.layoutparser.processor.model.text; import java.util.List; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; - import lombok.AllArgsConstructor; import lombok.Data; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java index 9a34c3e..7b780ea 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java @@ -7,8 +7,7 @@ import org.apache.pdfbox.text.TextPosition; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox; import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2; -import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms; -import 
com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import lombok.AccessLevel; import lombok.AllArgsConstructor; @@ -65,7 +64,7 @@ public class RedTextPosition extends TextBoundingBox { pos.setBBoxDirAdj(dirAdjPosition); AffineTransform affineTransform = getRotationMatrix(TextDirection.fromDegrees(textPosition.getDir()), textPosition.getPageWidth(), textPosition.getPageHeight()); - Rectangle2D bBoxInitialUserSpace = affineTransform.createTransformedShape(dirAdjPosition).getBounds2D(); + Rectangle2D bBoxInitialUserSpace = RectangleTransformations.transform(dirAdjPosition, affineTransform); pos.setBBoxPdf(bBoxInitialUserSpace); // These are definitely correct diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java index 8de8fea..d3712f2 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java @@ -2,47 +2,62 @@ package com.knecon.fforesight.service.layoutparser.processor.model.text; import java.awt.geom.Rectangle2D; import java.util.ArrayList; +import java.util.Collection; import java.util.Collections; import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; import com.fasterxml.jackson.annotation.JsonIgnore; +import com.iqser.red.service.redaction.v1.server.model.document.nodes.LayoutEngine; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import 
com.knecon.fforesight.service.layoutparser.processor.model.Orientation; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; -import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities; -import lombok.AllArgsConstructor; -import lombok.Builder; import lombok.Data; import lombok.EqualsAndHashCode; import lombok.NoArgsConstructor; @EqualsAndHashCode(callSuper = true) @Data -@AllArgsConstructor -@Builder @NoArgsConstructor public class TextPageBlock extends AbstractPageBlock { - @Builder.Default + @EqualsAndHashCode.Exclude private List words = new ArrayList<>(); - @Builder.Default + @EqualsAndHashCode.Exclude private FrequencyCounters frequencyCounters = new FrequencyCounters(); - private Rectangle2D bBoxDirAdj; - private boolean underlined; private PageBlockType classification; private boolean toDuplicate; + @EqualsAndHashCode.Exclude private String text; private boolean changed; + public TextPageBlock(List words, int page, PageBlockType classification, Set engines, Orientation orientation) { + + this.page = page; + this.classification = classification; + this.engines = engines; + this.orientation = orientation; + setDefaultFields(words); + } + + public TextPageBlock(List words) { + setDefaultFields(words); + } + + + private void setDefaultFields(List words) { + this.words = new ArrayList<>(words); this.frequencyCounters = new FrequencyCounters(); @@ -73,10 +88,6 @@ public class TextPageBlock extends AbstractPageBlock { this.bBoxDirAdj = new Rectangle2D.Double(); return; } - this.bBoxDirAdj = words.stream() - .map(Word::getBBoxDirAdj) - .collect(RectangleTransformations.collectBBox()); - setToBBoxOfComponents(words); } @@ -87,7 +98,7 @@ public class TextPageBlock extends AbstractPageBlock { } - public static TextPageBlock merge(List textBlocksToMerge) { + public static TextPageBlock merge(Collection 
textBlocksToMerge) { if (textBlocksToMerge.isEmpty()) { throw new IllegalArgumentException("Need to provide at least one TextPageBlock."); @@ -98,14 +109,33 @@ public class TextPageBlock extends AbstractPageBlock { .count() != 1) { throw new IllegalArgumentException("Cannot merge textBlocks on different pages."); } + if (textBlocksToMerge.stream() + .map(AbstractPageBlock::getClassification) + .distinct() + .count() != 1) { + throw new IllegalArgumentException("Cannot merge textBlocks of different types."); + } + if (textBlocksToMerge.stream() + .map(AbstractPageBlock::getDir) + .distinct() + .count() != 1) { + throw new IllegalArgumentException("Cannot merge textBlocks of different directions."); + } List sequences = textBlocksToMerge.stream() .map(TextPageBlock::getWords) - .flatMap(java.util.Collection::stream) - .toList(); - sequences = new ArrayList<>(sequences); + .flatMap(Collection::stream) + .collect(Collectors.toList()); - return new TextPageBlock(sequences); + TextPageBlock first = textBlocksToMerge.iterator().next(); + return new TextPageBlock(sequences, + first.getPage(), + first.getClassification(), + textBlocksToMerge.stream() + .map(AbstractPageBlock::getEngines) + .flatMap(Collection::stream) + .collect(Collectors.toSet()), + Orientation.NONE); } @@ -172,6 +202,14 @@ public class TextPageBlock extends AbstractPageBlock { } + public void removeAll(List words) { + + changed = true; + this.words.removeAll(words); + setDefaultFields(this.words); + } + + public TextPageBlock copy() { return new TextPageBlock(new ArrayList<>(words)); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/Word.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/Word.java index 0e5a647..1c721dc 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/Word.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/Word.java @@ -15,6 +15,7 @@ import org.apache.pdfbox.text.TextPosition; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import lombok.AllArgsConstructor; import lombok.Builder; @@ -66,9 +67,9 @@ public class Word extends TextBoundingBox implements CharSequence { } - public Word(List textPositions, int page) { + public Word(List characters, int page) { - this.characters = new ArrayList<>(textPositions); + this.characters = new ArrayList<>(characters); this.page = page; calculateBBoxAndHashcode(); } @@ -101,12 +102,12 @@ public class Word extends TextBoundingBox implements CharSequence { @Override public Word subSequence(int start, int end) { - var textPositionSequence = new Word(); - textPositionSequence.characters = characters.subList(start, end); - textPositionSequence.page = page; - textPositionSequence.dir = dir; - textPositionSequence.setToBBoxOfComponents(getTextPositions()); - return textPositionSequence; + var word = new Word(); + word.characters = characters.subList(start, end); + word.page = page; + word.dir = dir; + word.setToBBoxOfComponents(getTextPositions()); + return word; } @@ -262,7 +263,7 @@ public class Word extends TextBoundingBox implements CharSequence { public void transform(AffineTransform rotateInstance) { for (RedTextPosition textPosition : getTextPositions()) { - Rectangle2D exactDirAdjCoordinates = rotateInstance.createTransformedShape(textPosition.getBBoxDirAdj()).getBounds2D(); + Rectangle2D exactDirAdjCoordinates = 
RectangleTransformations.transform(textPosition.getBBoxDirAdj(), rotateInstance); textPosition.setBBoxDirAdj(exactDirAdjCoordinates); } calculateBBoxAndHashcode(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/adapter/ImageServiceResponseAdapter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/adapter/ImageServiceResponseAdapter.java index 04af1fd..78666d8 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/adapter/ImageServiceResponseAdapter.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/python_api/adapter/ImageServiceResponseAdapter.java @@ -13,7 +13,10 @@ import com.iqser.red.service.redaction.v1.server.model.document.nodes.ImageType; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageMetadata; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; +import com.knecon.fforesight.service.ocr.v1.api.model.Figure; +import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult; import lombok.RequiredArgsConstructor; @@ -21,48 +24,78 @@ import lombok.RequiredArgsConstructor; @RequiredArgsConstructor public class ImageServiceResponseAdapter { - public Map> buildClassifiedImagesPerPage(ImageServiceResponse imageServiceResponse) { + public Map> buildClassifiedImagesPerPage(ImageServiceResponse imageServiceResponse, IdpResult idpResult) { Map> images = new HashMap<>(); - 
imageServiceResponse.getData().forEach(imageMetadata -> { - var classification = imageMetadata.getFilters().isAllPassed() ? ImageType.valueOf(imageMetadata.getClassification() - .getLabel() - .toUpperCase(Locale.ROOT)) : ImageType.OTHER; - images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>()) - .add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(), - imageMetadata.getPosition().getY1(), - imageMetadata.getGeometry().getWidth(), - imageMetadata.getGeometry().getHeight()), classification, imageMetadata.isAlpha(), imageMetadata.getPosition().getPageNumber(),imageMetadata.getRepresentation())); - }); - // Currently This is a copy but, it will be changed later because i don' t think that we should unclassified images. - imageServiceResponse.getDataCV().forEach(imageMetadata -> { - var classification = imageMetadata.getFilters().isAllPassed() ? ImageType.valueOf(imageMetadata.getClassification() - .getLabel() - .toUpperCase(Locale.ROOT)) : ImageType.OTHER; - images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>()) - .add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(), - imageMetadata.getPosition().getY1(), - imageMetadata.getGeometry().getWidth(), - imageMetadata.getGeometry().getHeight()), classification, imageMetadata.isAlpha(), imageMetadata.getPosition().getPageNumber(),imageMetadata.getRepresentation())); - }); + imageServiceResponse.getData() + .forEach(imageMetadata -> addImageMetaData(imageMetadata, images)); + imageServiceResponse.getDataCV() + .forEach(imageMetadata -> addImageMetaData(imageMetadata, images)); + idpResult.figures() + .forEach(figure -> addFigure(figure, images)); return images; } + private static void addFigure(Figure figure, Map> images) { + + var classification = ImageType.GRAPHIC; + ClassifiedImage image = new ClassifiedImage(figure.image().bbox().get().getBounds2D(), classification, false, 
figure.image().pageNumber(), ""); + getImagesOnPage(figure.image().pageNumber(), images).add(image); + } + + + private static void addImageMetaData(ImageMetadata imageMetadata, Map> images) { + + var image = new ClassifiedImage(getPosition(imageMetadata), + getImageType(imageMetadata), + imageMetadata.isAlpha(), + imageMetadata.getPosition().getPageNumber(), + imageMetadata.getRepresentation()); + getImagesOnPage(imageMetadata.getPosition().getPageNumber(), images).add(image); + } + + + private static Rectangle2D.Double getPosition(ImageMetadata imageMetadata) { + + return new Rectangle2D.Double(imageMetadata.getPosition().getX1(), + imageMetadata.getPosition().getY1(), + imageMetadata.getGeometry().getWidth(), + imageMetadata.getGeometry().getHeight()); + } + + + private static ImageType getImageType(ImageMetadata imageMetadata) { + + if (imageMetadata.getFilters().isAllPassed()) { + return ImageType.valueOf(imageMetadata.getClassification().getLabel().toUpperCase(Locale.ROOT)); + } else { + return ImageType.OTHER; + } + } + + + private static List getImagesOnPage(int pageNumber, Map> images) { + + return images.computeIfAbsent(pageNumber, x -> new ArrayList<>()); + } + + public void findOcr(ClassificationPage classificationPage) { - classificationPage.getImages().forEach(image -> { - if (image.getImageType().equals(ImageType.OTHER)) { - for (AbstractPageBlock textblock : classificationPage.getTextBlocks()) { - if (image.getPosition().contains(textblock.getBBoxPdf())) { - image.setImageType(ImageType.OCR); - return; + classificationPage.getImages() + .forEach(image -> { + if (image.getImageType().equals(ImageType.OTHER)) { + for (AbstractPageBlock textblock : classificationPage.getTextBlocks()) { + if (image.getPosition().contains(textblock.getBBoxPdf())) { + image.setImageType(ImageType.OCR); + return; + } + } } - } - } - }); + }); } } diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java index 0820a49..13fe712 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java @@ -183,7 +183,7 @@ public class BodyTextFrameService { if (cell == null || cell.getTextBlocks() == null) { continue; } - for (TextPageBlock textBlock : cell.getTextBlocks()) { + for (AbstractPageBlock textBlock : cell.getTextBlocks()) { expandRectangle(textBlock, page, expansionsRectangle); } } @@ -198,7 +198,7 @@ public class BodyTextFrameService { } - private void expandRectangle(TextPageBlock textBlock, ClassificationPage page, BodyTextFrameExpansionsRectangle expansionsRectangle) { + private void expandRectangle(AbstractPageBlock textBlock, ClassificationPage page, BodyTextFrameExpansionsRectangle expansionsRectangle) { if (page.getPageWidth() > page.getPageHeight() && page.getRotation() != 0) { if (textBlock.getPdfMinY() < expansionsRectangle.minX) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/MainBodyTextFrameExtractionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/MainBodyTextFrameExtractionService.java deleted file mode 100644 index 0cac3ee..0000000 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/MainBodyTextFrameExtractionService.java +++ /dev/null @@ -1,25 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.processor.services; - -import java.awt.geom.Rectangle2D; - -import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation; -import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; - -import lombok.experimental.UtilityClass; - -@UtilityClass -public class MainBodyTextFrameExtractionService { - - private static final double TEXT_FRAME_PAD_WIDTH = 0.0; - private static final double TEXT_FRAME_PAD_HEIGHT = 0.02; - - - public Rectangle2D calculateMainBodyTextFrame(LineInformation lineInformation) { - - Rectangle2D mainBodyTextFrame = lineInformation.getLineBBox().stream() - .collect(RectangleTransformations.collectBBox()); - - return RectangleTransformations.pad(mainBodyTextFrame, mainBodyTextFrame.getWidth() * TEXT_FRAME_PAD_WIDTH, mainBodyTextFrame.getHeight() * TEXT_FRAME_PAD_HEIGHT); - } - -} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageContentExtractor.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageContentExtractor.java index 6bd802e..3b18dd4 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageContentExtractor.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageContentExtractor.java @@ -1,73 +1,207 @@ package com.knecon.fforesight.service.layoutparser.processor.services; +import java.awt.geom.Rectangle2D; +import java.io.File; import java.io.IOException; -import java.util.Collection; -import 
java.util.LinkedList; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; import java.util.List; -import java.util.Map; +import java.util.concurrent.CountDownLatch; import java.util.stream.Collectors; import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; -import org.springframework.core.io.ClassPathResource; +import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; +import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; +import com.knecon.fforesight.service.layoutparser.processor.services.graphics.Box; +import com.knecon.fforesight.service.layoutparser.processor.services.graphics.FindGraphicsRaster; +import com.knecon.fforesight.service.layoutparser.processor.services.graphics.GraphicBBDetector; import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; -import lombok.experimental.UtilityClass; +import lombok.AccessLevel; +import lombok.Getter; +import lombok.SneakyThrows; +import lombok.experimental.FieldDefaults; -@UtilityClass +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) public class PageContentExtractor { - public List getSortedPageContents(String filename) throws IOException { + static boolean USE_IMAGE_BASED_GRAPHIC_DETECTION; + @Getter + int pageCount; + @Getter + File document; - List textPositionSequencesPerPage = new LinkedList<>(); - ClassPathResource pdfResource = new ClassPathResource(filename); + PageContents[] pageContents; + CountDownLatch[] finishedLookup; + List> pageNumberBatches; - try (PDDocument pdDocument = 
Loader.loadPDF(pdfResource.getFile())) { - for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) { + public PageContentExtractor(File document, int threads) { - PDFLinesTextStripper stripper = new PDFLinesTextStripper(); - PDPage pdPage = pdDocument.getPage(pageNumber - 1); - stripper.setPageNumber(pageNumber); - stripper.setSortByPosition(true); - stripper.setStartPage(pageNumber); - stripper.setEndPage(pageNumber); - stripper.setPdpage(pdPage); - stripper.getText(pdDocument); - - Map> sortedTextPositionSequencesPerDir = stripper.getWords() - .stream() - .collect(Collectors.groupingBy(textPositionSequence -> textPositionSequence.getDir().getDegrees())); - - var sortedTextPositionSequences = sortByDirAccordingToPageRotation(sortedTextPositionSequencesPerDir, pdPage.getRotation()); - - textPositionSequencesPerPage.add(new PageContents(sortedTextPositionSequences, - RectangleTransformations.toRectangle2D(pdPage.getCropBox()), - RectangleTransformations.toRectangle2D(pdPage.getMediaBox()), - stripper.getRulings())); - } + this.document = document; + this.pageCount = getPageCount(document); + this.pageContents = new PageContents[pageCount]; + this.finishedLookup = new CountDownLatch[pageCount]; + for (int i = 0; i < pageCount; i++) { + this.finishedLookup[i] = new CountDownLatch(1); + } + int actualThreads = Math.min(pageCount, threads); + pageNumberBatches = new ArrayList<>(actualThreads); + for (int i = 0; i < actualThreads; i++) { + pageNumberBatches.add(new ArrayList<>(pageCount / actualThreads)); + } + for (int i = 1; i <= pageCount; i++) { + pageNumberBatches.get(i % actualThreads).add(i); } - - return textPositionSequencesPerPage; } - public List sortByDirAccordingToPageRotation(Map> sortedTextPositionSequencesPerDir, int rotation) { + @SneakyThrows + private int getPageCount(File document) { - LinkedList sortedKeys = new LinkedList<>(sortedTextPositionSequencesPerDir.keySet().stream().sorted().toList()); - - for (int i = 0; i < 
sortedKeys.size(); i++) { - if (sortedKeys.get(i) < rotation) { - Float keyToSwap = sortedKeys.remove(i); - sortedKeys.addLast(keyToSwap); - } + try (var doc = openDocument(document)) { + return doc.getNumberOfPages(); } - return sortedKeys.stream().map(sortedTextPositionSequencesPerDir::get).flatMap(Collection::stream).toList(); + } + + + @SneakyThrows + public void startAsync() { + + for (List pageNumberBatch : pageNumberBatches) { + Thread thread = new Thread(() -> extractPages(pageNumberBatch)); + thread.start(); + } + } + + + @SneakyThrows + private void extractPages(List pageNumbers) { + + var doc = openDocument(document); + int count = 0; + var pageGetter = new PageGetter(doc.getPages() + .iterator(), pageCount); + for (Integer pageNumber : pageNumbers) { + count++; + if (count % 100 == 0) { + // As PDFBox caches all types of stuff, we need to close the document every once in a while to save on RAM + doc.close(); + doc = openDocument(document); + } + + extractPage(pageNumber, doc, pageGetter.getPage(pageNumber)); + } + doc.close(); + } + + + @SneakyThrows + private PDDocument openDocument(File originFile) { + + PDDocument document = Loader.loadPDF(originFile); + document.setAllSecurityToBeRemoved(true); + return document; + } + + + @SneakyThrows + public void extractPage(Integer pageNumber, PDDocument doc, PDPage pdPage) { + + PDFLinesTextStripper stripper = new PDFLinesTextStripper(); + stripper.setPageNumber(pageNumber); + stripper.setStartPage(pageNumber); + stripper.setEndPage(pageNumber); + stripper.setPdpage(pdPage); + stripper.getText(doc); + + PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, pdPage); + List words = stripper.getWords(); + List rulings = stripper.getRulings(); + List graphicBBoxes = findGraphicBBoxes(pageInformation, pdPage, doc, words); + + pageContents[pageNumber - 1] = new PageContents(pageInformation, words, rulings, graphicBBoxes); + finishedLookup[pageNumber - 1].countDown(); + } + + + private static List 
findGraphicBBoxes(PageInformation pageInformation, PDPage pdPage, PDDocument doc, List words) throws IOException { + + GraphicBBDetector graphicBBDetector = new GraphicBBDetector(pdPage, true); + List graphicBBoxes = graphicBBDetector.findGraphicBB(); + + if (USE_IMAGE_BASED_GRAPHIC_DETECTION) { + // This should only be used if ocr was performed, it is currently in an early stage and needs to be improved. + List wordIgnoreZones = words.stream() + .map(BoundingBox::getBBoxPdf) + .map(box -> RectangleTransformations.pad(box, 2, 2)) + .collect(Collectors.toList()); + graphicBBoxes.addAll(FindGraphicsRaster.findCCBoundingBoxes(doc, wordIgnoreZones, pageInformation)); + } + return graphicBBoxes; + } + + + public PageContents awaitPageContents(Integer pageNumber) throws InterruptedException { + + finishedLookup[pageNumber - 1].await(); + return pageContents[pageNumber - 1]; + } + + + public List awaitAllContents() throws InterruptedException { + + for (CountDownLatch countDownLatch : finishedLookup) { + countDownLatch.await(); + } + return Arrays.asList(pageContents); + } + + + @SneakyThrows + public static List getDocumentContents(File document, int threads) { + + PageContentExtractor extractor = new PageContentExtractor(document, threads); + extractor.startAsync(); + return extractor.awaitAllContents(); + } + + + private static class PageGetter { + + Iterator pageIterator; + int current; + int max; + + + PageGetter(Iterator pageIterator, int max) { + + this.pageIterator = pageIterator; + this.max = max; + this.current = 0; + } + + + public PDPage getPage(int pageNumber) { + + assert pageNumber >= current && pageNumber <= max; + int pagesToIterate = pageNumber - current; + PDPage page = null; + for (int i = 0; i < pagesToIterate; i++) { + page = pageIterator.next(); + } + current = pageNumber; + return page; + } + } } diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageInformationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageInformationService.java deleted file mode 100644 index dc98f4b..0000000 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageInformationService.java +++ /dev/null @@ -1,24 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.processor.services; - -import java.awt.geom.Rectangle2D; - -import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation; -import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation; -import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; -import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation; - -import lombok.experimental.UtilityClass; - -@UtilityClass -public class PageInformationService { - - public PageInformation build(PageContents pageContents) { - - LineInformation lineInformation = LineDetectionService.calculateLineInformation(pageContents.getSortedWords()); - Rectangle2D mainBodyTextFrame = MainBodyTextFrameExtractionService.calculateMainBodyTextFrame(lineInformation); - GapInformation gapInformation = GapDetectionService.findGapsInLines(pageContents.getSortedWords(), mainBodyTextFrame); - - return new PageInformation(pageContents, lineInformation, mainBodyTextFrame, gapInformation); - } - -} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java index 1188bd4..a90a9a0 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java @@ -5,18 +5,20 @@ import static com.knecon.fforesight.service.layoutparser.processor.utils.Geometr import java.awt.geom.Point2D; import java.awt.geom.Rectangle2D; import java.util.ArrayList; -import java.util.HashMap; +import java.util.Collection; +import java.util.HashSet; import java.util.List; -import java.util.Map; +import java.util.Objects; +import java.util.Set; import java.util.stream.Collectors; import org.springframework.stereotype.Service; +import com.knecon.fforesight.service.layoutparser.processor.model.UnionFind; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; -import com.knecon.fforesight.service.layoutparser.processor.utils.UnionFind; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; @@ -52,22 +54,22 @@ public class RulingCleaningService { private Rulings cleanRulings(Rulings rulings) { - List> groupedOverlappingVerticalRectangles = groupOverlappingRectangles(rulings.verticalLines.stream() - .map(RulingCleaningService::getOverlapRectangle) - .distinct() - .toList()); + var groupedOverlappingVerticalRectangles = groupOverlappingRectangles(rulings.verticalLines.stream() + .map(RulingCleaningService::getOverlapRectangle) + .distinct() + .toList()); List cleanedVerticalRulings = groupedOverlappingVerticalRectangles.stream() - .map(rectList -> getXCenteredRuling(RectangleTransformations.rectangle2DBBox(rectList))) + 
.map(RulingCleaningService::getXCenteredRuling) .filter(ruling -> ruling.length() > 0) .toList(); - List> groupedOverlappingHorizontalRectangles = groupOverlappingRectangles(rulings.horizontalLines.stream() - .map(RulingCleaningService::getOverlapRectangle) - .distinct() - .toList()); + var groupedOverlappingHorizontalRectangles = groupOverlappingRectangles(rulings.horizontalLines.stream() + .map(RulingCleaningService::getOverlapRectangle) + .distinct() + .toList()); List cleanedHorizontalRulings = groupedOverlappingHorizontalRectangles.stream() - .map(rectList -> getYCenteredRuling(RectangleTransformations.rectangle2DBBox(rectList))) + .map(RulingCleaningService::getYCenteredRuling) .filter(ruling -> ruling.length() > 0) .collect(Collectors.toList()); @@ -75,13 +77,40 @@ public class RulingCleaningService { } - private List> groupOverlappingRectangles(List rectangles) { + private static Ruling getXCenteredRuling(Set rectList) { - UnionFind unionFind = new UnionFind<>(); + Ruling ruling = getXCenteredRuling(rectList.stream() + .map(OverlapRectangle::rectangle2D) + .collect(RectangleTransformations.collectBBox())); + ruling.setStyle(rectList.iterator().next().style); + return ruling; + } + + + private static Ruling getYCenteredRuling(Set rectList) { + + Ruling ruling = getYCenteredRuling(rectList.stream() + .map(OverlapRectangle::rectangle2D) + .collect(RectangleTransformations.collectBBox())); + ruling.setStyle(rectList.iterator().next().style); + return ruling; + } + + + private Collection> groupOverlappingRectangles(List rectangles) { + + UnionFind unionFind = new UnionFind<>(new HashSet<>(rectangles)); for (int i = 0; i < rectangles.size(); i++) { for (int j = i + 1; j < rectangles.size(); j++) { - Rectangle2D rectangle1 = rectangles.get(i); - Rectangle2D rectangle2 = rectangles.get(j); + + OverlapRectangle overlapRectangle1 = rectangles.get(i); + OverlapRectangle overlapRectangle2 = rectangles.get(j); + + if (!Objects.equals(overlapRectangle1.style, 
overlapRectangle2.style)) { + continue; + } + Rectangle2D rectangle1 = overlapRectangle1.rectangle2D; + Rectangle2D rectangle2 = overlapRectangle2.rectangle2D; // we can stop early when we are too far off because of x-y-sorting if (rectangle1.getMaxX() < rectangle2.getMinX() && rectangle1.getMaxY() < rectangle2.getMinY()) { @@ -89,21 +118,16 @@ public class RulingCleaningService { } if (rectangle1.intersects(rectangle2)) { - unionFind.union(rectangle1, rectangle2); + unionFind.union(overlapRectangle1, overlapRectangle2); } } } - Map> groups = new HashMap<>(); - for (Rectangle2D rectangle : rectangles) { - Rectangle2D root = unionFind.find(rectangle); - groups.computeIfAbsent(root, k -> new ArrayList<>()).add(rectangle); - } - return new ArrayList<>(groups.values()); + return unionFind.getGroups(); } - private static Rectangle2D getOverlapRectangle(Ruling ruling) { + private static OverlapRectangle getOverlapRectangle(Ruling ruling) { float y; float x; @@ -124,12 +148,14 @@ public class RulingCleaningService { y = ruling.y2; h = ruling.y1 - ruling.y2; } - + Rectangle2D overlapRectangle; if (ruling.isHorizontal()) { - return new Rectangle2D.Double(x - THRESHOLD_X_HORIZONTAL, y - THRESHOLD_Y_HORIZONTAL, w + 2 * THRESHOLD_X_HORIZONTAL, h + 2 * THRESHOLD_Y_HORIZONTAL); + overlapRectangle = new Rectangle2D.Double(x - THRESHOLD_X_HORIZONTAL, y - THRESHOLD_Y_HORIZONTAL, w + 2 * THRESHOLD_X_HORIZONTAL, h + 2 * THRESHOLD_Y_HORIZONTAL); } else { - return new Rectangle2D.Double(x - THRESHOLD_X_VERTICAL, y - THRESHOLD_Y_VERTICAL, w + 2 * THRESHOLD_X_VERTICAL, h + 2 * THRESHOLD_Y_VERTICAL); + overlapRectangle = new Rectangle2D.Double(x - THRESHOLD_X_VERTICAL, y - THRESHOLD_Y_VERTICAL, w + 2 * THRESHOLD_X_VERTICAL, h + 2 * THRESHOLD_Y_VERTICAL); } + + return new OverlapRectangle(overlapRectangle, ruling.getStyle()); } @@ -243,4 +269,8 @@ public class RulingCleaningService { } + private record OverlapRectangle(Rectangle2D rectangle2D, Ruling.Style style) { + + } + } diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java index 7b2ada4..0952346 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java @@ -12,8 +12,8 @@ import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; -import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter; -import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader; +import com.knecon.fforesight.service.layoutparser.processor.model.text.ClassificationFooter; +import com.knecon.fforesight.service.layoutparser.processor.model.text.ClassificationHeader; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationSection; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; @@ -30,7 +30,6 @@ import lombok.extern.slf4j.Slf4j; @Deprecated public class SectionsBuilderService { - public void buildSections(ClassificationDocument document) { List chunkWords = new ArrayList<>(); @@ -73,8 +72,7 @@ public class SectionsBuilderService { chunkBlockList.add(chunkBlock); chunkWords = new ArrayList<>(); if (!chunkBlock.getTables().isEmpty()) { - previousTable = chunkBlock.getTables() - .get(chunkBlock.getTables().size() - 1); + previousTable = 
chunkBlock.getTables().get(chunkBlock.getTables().size() - 1); } } if (current instanceof TablePageBlock table) { @@ -236,12 +234,8 @@ public class SectionsBuilderService { List previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable); List tableNonHeaderRow = getRowWithNonHeaderCells(currentTable); // Allow merging of tables if header row is separated from first logical non-header row - if (previousTableNonHeaderRow.isEmpty() - && previousTable.getRowCount() == 1 - && previousTable.getRows() - .get(0).size() == tableNonHeaderRow.size()) { - previousTableNonHeaderRow = previousTable.getRows() - .get(0) + if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows().get(0).size() == tableNonHeaderRow.size()) { + previousTableNonHeaderRow = previousTable.getRows().get(0) .stream() .map(cell -> { Cell fakeCell = Cell.copy(cell); @@ -252,8 +246,7 @@ public class SectionsBuilderService { } if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) { for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table - List row = currentTable.getRows() - .get(i); + List row = currentTable.getRows().get(i); if (row.size() == tableNonHeaderRow.size() && row.stream() .allMatch(cell -> cell.getHeaderCells().isEmpty())) { for (int j = 0; j < row.size(); j++) { @@ -272,13 +265,6 @@ public class SectionsBuilderService { for (AbstractPageBlock container : wordBlockList) { if (container instanceof TablePageBlock table) { - - if (lastHeadline == null || lastHeadline.isEmpty()) { - table.setHeadline("Text in table"); - } else { - table.setHeadline("TablePageBlock in: " + lastHeadline); - } - section.getPageBlocks().add(table); continue; } @@ -310,8 +296,7 @@ public class SectionsBuilderService { private List getRowWithNonHeaderCells(TablePageBlock table) { for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table - List row = 
table.getRows() - .get(i); + List row = table.getRows().get(i); if (row.size() == 1) { continue; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java deleted file mode 100644 index a055bf9..0000000 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java +++ /dev/null @@ -1,159 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.processor.services; - -import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.CELL_SIZE_COMPARATOR; -import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.RECTANGLE_SIZE_COMPARATOR; - -import java.awt.geom.AffineTransform; -import java.awt.geom.Rectangle2D; -import java.util.ArrayList; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; - -import org.springframework.stereotype.Service; - -import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; -import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; -import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; -import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; -import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; -import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; -import 
com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms; -import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons; -import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation; -import com.knecon.fforesight.service.layoutparser.processor.utils.RectangularIntersectionFinder; -import com.knecon.fforesight.service.layoutparser.processor.utils.SpreadsheetFinder; - -import lombok.SneakyThrows; - -@Service -public class TableExtractionService { - - private static final int MAX_TABLE_CONTAINED_CELLS_WITH_TEXT = 1; - private static final double TABLE_UNIFORMITY_THRESHOLD = 0.7; - - - /** - * Finds tables on a page and moves textblocks into cells of the found tables. - * Note: This algorithm uses Pdf Coordinate System where {0,0} rotated with the page rotation. - * 0 -> LowerLeft - * 90 -> UpperLeft - * 180 -> UpperRight - * 270 -> LowerRight - *

- * DirAdj (Text direction adjusted) values can not be used here. - * - * @param emptyCells The cells used to build the table. - * @param page Page object that contains textblocks and statistics. - */ - - public void extractTables(List emptyCells, ClassificationPage page) { - - // sort cells by size (height * width) ascending so that textBlocks are always assigned to the smallest cells that contain them - emptyCells.sort(CELL_SIZE_COMPARATOR); - - for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) { - TextPageBlock textBlock = (TextPageBlock) abstractPageBlock; - for (Cell cell : emptyCells) { - if (cell.hasMinimumSize() && doesCellContainTextBlock(cell, textBlock)) { - cell.addTextBlock(textBlock); - break; - } - } - } - - List cells = new ArrayList<>(new HashSet<>(emptyCells)); - DoubleComparisons.sort(cells, BoundingBox.ILL_DEFINED_ORDER); - - List spreadsheetAreas = SpreadsheetFinder.findSpreadsheetsFromCells(cells); - // sort spreadsheetAreas by size (height * width) ascending so that cells are placed in the smallest tables first - // this way no cell duplication occurs when tables are contained in other tables and only the most inner table contains the cells - spreadsheetAreas.sort(RECTANGLE_SIZE_COMPARATOR); - - List tables = new ArrayList<>(); - for (Rectangle2D area : spreadsheetAreas) { - - List containedCells = new ArrayList<>(); - for (Cell c : cells) { - if (c.hasMinimumSize() && area.contains(c.getBBoxPdf())) { - containedCells.add(c); - } - } - - var containedCellsWithText = containedCells.stream() - .filter(cell -> !cell.getTextBlocks().isEmpty()) - .toList(); - - // verify if table would contain fewer cells with text than the threshold allows - if (containedCellsWithText.size() >= MAX_TABLE_CONTAINED_CELLS_WITH_TEXT && checkIfTableCellsAreUniform(containedCells)) { - tables.add(new TablePageBlock(containedCells, page.getRotation())); - cells.removeAll(containedCells); - } - } - - for (TablePageBlock table : tables) { - int position = 
-1; - - for (AbstractPageBlock pageBlock : page.getTextBlocks()) { - if (pageBlock instanceof TextPageBlock ? table.contains(pageBlock) : table.contains(pageBlock) && position == -1) { - position = page.getTextBlocks().indexOf(pageBlock); - } - } - if (position != -1) { - page.getTextBlocks().add(position, table); - - var toBeRemoved = table.getCells() - .stream() - .map(Cell::getTextBlocks) - .flatMap(List::stream) - .toList(); - // remove text blocks from the page that were also added with the table (from its contained cells) - page.getTextBlocks().removeAll(toBeRemoved); - } - } - } - - - private boolean checkIfTableCellsAreUniform(List containedCells) { - - if (containedCells.size() <= 2) { - return true; - } - - Map> cellsGroupedByRoundedWidth = containedCells.stream() - .map(BoundingBox::getWidth) - .map(size -> Math.round(size / 10.0) * 10) - .collect(Collectors.groupingBy(Long::longValue)); - - return (double) cellsGroupedByRoundedWidth.size() / containedCells.size() <= TABLE_UNIFORMITY_THRESHOLD; - } - - - private boolean doesCellContainTextBlock(Cell cell, TextPageBlock textBlock) { - - return cell.contains(textBlock, RedTextPosition.HEIGHT_PADDING); - } - - - @SneakyThrows - public static List findCells(List horizontalRulingLines, List verticalRulingLines, PageInformation pageInformation) { - - AffineTransform affineTransform = CoordinateTransforms.calculateInitialUserSpaceCoordsToImageCoords(pageInformation, 1); - /* - switch (pageInformation.rotationDegrees()) { - case 90 -> affineTransform.translate(RedTextPosition.HEIGHT_PADDING, 0); //although this is wrong, our text coordinates are wrong as well - case 180 -> affineTransform.translate(0, RedTextPosition.HEIGHT_PADDING); - case 270 -> affineTransform.translate(-RedTextPosition.HEIGHT_PADDING, 0); - default -> affineTransform.translate(0, -RedTextPosition.HEIGHT_PADDING); - } - */ - return RectangularIntersectionFinder.find(horizontalRulingLines, verticalRulingLines) - .stream() - .map(rect -> new 
Cell(rect, affineTransform)) - .collect(Collectors.toList()); - } - -} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java index f0c906a..b0a94ba 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java @@ -18,7 +18,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentif import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; -import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation; +import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation; import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities; import lombok.Data; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationService.java new file mode 100644 index 0000000..eddeb16 --- /dev/null +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationService.java @@ -0,0 +1,42 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.blockification; + +import java.util.Collections; +import java.util.List; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; +import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; +import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer; + +import lombok.AccessLevel; +import lombok.RequiredArgsConstructor; +import lombok.experimental.FieldDefaults; + +@Service +@RequiredArgsConstructor +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) +public class BlockificationService { + + RedactManagerBlockificationService redactManagerBlockificationService; + DocstrumBlockificationService docstrumBlockificationService; + DocuMineBlockificationService docuMineBlockificationService; + + + public List blockify(LayoutParsingType layoutParsingType, List words, CleanRulings cleanRulings, LayoutDebugLayer layoutDebugLayer) { + + if (words.isEmpty()) { + return Collections.emptyList(); + } + return switch (layoutParsingType) { + case REDACT_MANAGER_OLD -> redactManagerBlockificationService.blockify(words, cleanRulings, layoutDebugLayer); + case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings); + case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> + docstrumBlockificationService.blockify(words, cleanRulings, true, layoutDebugLayer, layoutParsingType); + case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> 
docstrumBlockificationService.blockify(words, cleanRulings, false, layoutDebugLayer, layoutParsingType); + }; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java index 48fb851..19247d3 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java @@ -10,7 +10,6 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi import com.knecon.fforesight.service.layoutparser.processor.docstrum.DocstrumSegmentationService; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; @@ -30,46 +29,39 @@ public class DocstrumBlockificationService { static final float THRESHOLD = 1f; - public ClassificationPage blockify(List textPositions, - CleanRulings rulings, - boolean xyOrder, - LayoutDebugLayer visualizations, - LayoutParsingType layoutParsingType) { + public List blockify(List words, CleanRulings rulings, boolean xyOrder, LayoutDebugLayer visualizations, LayoutParsingType 
layoutParsingType) { - CleanRulings usedRulings = rulings.withoutTextRulings(); + CleanRulings rulingsWithoutTextRulings = rulings.withoutTextRulings(); - List zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder, usedRulings, visualizations); + List zones = docstrumSegmentationService.segmentPage(words, xyOrder, rulingsWithoutTextRulings); - if (!textPositions.isEmpty()) { - visualizations.addZoneVisualizations(zones, textPositions.get(0).getPage()); - visualizations.addLineVisualizationsFromZones(zones, textPositions.get(0).getPage()); - visualizations.addCharactersWithNeighbours(zones, textPositions.get(0).getPage()); + if (!words.isEmpty() && visualizations != null) { + visualizations.addZoneVisualizations(zones, words.get(0).getPage()); + visualizations.addLineVisualizationsFromZones(zones, words.get(0).getPage()); + visualizations.addCharactersWithNeighbours(zones, words.get(0).getPage()); } var pageBlocks = toAbstractPageBlocks(zones); - var classificationPage = new ClassificationPage(pageBlocks); - classificationPage.setCleanRulings(rulings); - - mergeIntersectingBlocks(classificationPage, usedRulings, 0, 0); + mergeIntersectingBlocks(pageBlocks, rulingsWithoutTextRulings, 0, 0); if (layoutParsingType == LayoutParsingType.DOCUMINE || layoutParsingType == LayoutParsingType.REDACT_MANAGER || layoutParsingType == LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH) { - combineBlocks(classificationPage, layoutParsingType); + combineBlocks(pageBlocks, rulings, layoutParsingType); } if (layoutParsingType == LayoutParsingType.CLARIFYND) { - mergeIntersectingBlocks(classificationPage, usedRulings, 0, 0); + mergeIntersectingBlocks(pageBlocks, rulingsWithoutTextRulings, 0, 0); } - return classificationPage; + return pageBlocks; } - private List toAbstractPageBlocks(List zones) { + private List toAbstractPageBlocks(List zones) { - List abstractPageBlocks = new ArrayList<>(); + List abstractPageBlocks = new ArrayList<>(); zones.forEach(zone 
-> { List words = new ArrayList<>(); @@ -88,29 +80,23 @@ public class DocstrumBlockificationService { } - public void combineBlocks(ClassificationPage page, LayoutParsingType layoutParsingType) { + public void combineBlocks(List blocks, CleanRulings rulingsWithoutTextRulings, LayoutParsingType layoutParsingType) { TextPageBlock previous = new TextPageBlock(); - ListIterator itty = page.getTextBlocks().listIterator(); - CleanRulings usedRulings = page.getCleanRulings().withoutTextRulings(); + ListIterator itty = blocks.listIterator(); while (itty.hasNext()) { - AbstractPageBlock block = itty.next(); - if (block instanceof TablePageBlock) { - previous = new TextPageBlock(); - continue; - } - TextPageBlock current = (TextPageBlock) block; + TextPageBlock current = itty.next(); if (previous != null && !previous.getWords().isEmpty()) { - if (current.getDir() != previous.getDir() || usedRulings.lineBetween(current, previous)) { + if (current.getDir() != previous.getDir() || rulingsWithoutTextRulings.lineBetween(current, previous)) { previous = current; continue; } if (current.isHeadline() || previous.isHeadline()) { - if (intersectsYWithPreviousHavingMaxOneLine(previous, current, page)) { + if (intersectsYWithPreviousHavingMaxOneLine(previous, current)) { previous = combineBlocksAndResetIterator(previous, current, itty, false); } else { previous = current; @@ -119,7 +105,7 @@ public class DocstrumBlockificationService { continue; } - if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, page)) { + if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, blocks)) { // previous = combineBlocksAndResetIterator(previous, current, itty, true); previous = combineBlocksAndResetIterator(previous, current, itty, layoutParsingType != LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH); continue; @@ -130,12 +116,12 @@ public class DocstrumBlockificationService { continue; } - if 
(isSameTopOrBottomWithPreviousHavingMaxTwoLinesAndCurrentThanOneAndMax4OtherBlocksOnHeight(previous, current, page)) { + if (isSameTopOrBottomWithPreviousHavingMaxTwoLinesAndCurrentThanOneAndMax4OtherBlocksOnHeight(previous, current, blocks)) { previous = combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate()); continue; } - if (isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(previous, current, page)) { + if (isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(previous, current, blocks)) { previous = combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate()); continue; } @@ -144,43 +130,43 @@ public class DocstrumBlockificationService { previous = current; } - mergeIntersectingBlocks(page, usedRulings, 0, Y_THRESHOLD); + mergeIntersectingBlocks(blocks, rulingsWithoutTextRulings, 0, Y_THRESHOLD); } - private boolean isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) { + private boolean isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(TextPageBlock previous, TextPageBlock current, List allBlocks) { return current.intersectsY(previous) // && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1) // - && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 0; + && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, allBlocks) <= 0; } private boolean isSameTopOrBottomWithPreviousHavingMaxTwoLinesAndCurrentThanOneAndMax4OtherBlocksOnHeight(TextPageBlock previous, TextPageBlock current, - ClassificationPage page) { + List allBlocks) { return (Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) // && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() >= 2 && current.getNumberOfLines() == 1) // - 
&& !hasBetween(current, previous, page.getTextBlocks()) && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 4; + && !hasBetween(current, previous, allBlocks) && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, allBlocks) <= 4; } - private boolean intersectsYWithPreviousHavingMaxOneLine(TextPageBlock previous, TextPageBlock current, ClassificationPage page) { + private boolean intersectsYWithPreviousHavingMaxOneLine(TextPageBlock previous, TextPageBlock current) { return previous.intersectsY(current) && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1); } - private boolean areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) { + private boolean areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(TextPageBlock previous, TextPageBlock current, List allBlocks) { return previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 // && previous.intersectsY(current) // - && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) == 0; + && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, allBlocks) == 0; } - private TextPageBlock combineBlocksAndResetIterator(TextPageBlock previous, TextPageBlock current, ListIterator itty, boolean toDuplicate) { + private TextPageBlock combineBlocksAndResetIterator(TextPageBlock previous, TextPageBlock current, ListIterator itty, boolean toDuplicate) { previous.addAll(current.getWords()); previous = buildTextBlock(previous.getWords(), 0); @@ -196,7 +182,7 @@ public class DocstrumBlockificationService { } - private boolean hasBetween(TextPageBlock block, TextPageBlock other, List allBlocks) { + private boolean hasBetween(TextPageBlock block, TextPageBlock other, List allBlocks) { for (AbstractPageBlock current : allBlocks) { @@ -213,7 +199,7 @@ public class DocstrumBlockificationService { } - private int 
numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(TextPageBlock block, TextPageBlock other, List allBlocks) { + private int numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(TextPageBlock block, TextPageBlock other, List allBlocks) { double minY = Math.min(block.getMinY(), other.getMinY()); double maxY = Math.min(block.getMaxY(), other.getMaxY()); @@ -234,25 +220,18 @@ public class DocstrumBlockificationService { } - public void mergeIntersectingBlocks(ClassificationPage page, CleanRulings usedRulings, float xThreshold, float yThreshold) { + public void mergeIntersectingBlocks(List blocks, CleanRulings usedRulings, float xThreshold, float yThreshold) { - var blocks = page.getTextBlocks(); - ListIterator itty = blocks.listIterator(); + ListIterator itty = blocks.listIterator(); while (itty.hasNext()) { - AbstractPageBlock block = itty.next(); - if (block == null) { - continue; - } - if (block instanceof TablePageBlock) { + TextPageBlock current = itty.next(); + if (current == null) { continue; } - if (block.getClassification() != null && block.getClassification().isHeadline()) { + if (current.getClassification() != null && current.getClassification().isHeadline()) { continue; } - - TextPageBlock current = (TextPageBlock) block; - for (int i = 0; i < blocks.size(); i++) { AbstractPageBlock abstractPageBlock = blocks.get(i); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java index 6e757d7..b7da73a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java @@ -33,14 +33,14 @@ public class DocuMineBlockificationService { * This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this! * Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling. * - * @param textPositions The textPositions of a page. + * @param words The words of a page. * @param cleanRulings All rulings on a page * @return Page object that contains the Textblock and text statistics. */ - public ClassificationPage blockify(List textPositions, CleanRulings cleanRulings) { + public List blockify(List words, CleanRulings cleanRulings) { List chunkWords = new ArrayList<>(); - List textPageBlocks = new ArrayList<>(); + List textPageBlocks = new ArrayList<>(); CleanRulings usedRulings = cleanRulings.withoutTextRulings(); @@ -52,7 +52,7 @@ public class DocuMineBlockificationService { boolean wasSplitted = false; Double splitX1 = null; - for (Word word : textPositions) { + for (Word word : words) { boolean lineSeparation = prev != null && word.getYDirAdj() - prev.getMaxYDirAdj() > Math.min(word.getHeight(), prev.getHeight()) * 1.1; boolean startFromTop = prev != null && word.getYDirAdj() < prev.getYDirAdj() - prev.getTextHeight(); @@ -120,7 +120,7 @@ public class DocuMineBlockificationService { textPageBlocks.add(new TextPageBlock(chunkWords)); - return new ClassificationPage(textPageBlocks); + return textPageBlocks; } @@ -171,8 +171,9 @@ public class DocuMineBlockificationService { continue; } - if (current.getDir() == inner.getDir() && current.intersects(inner, yThreshold, xThreshold) && (current.getClassification() == null || current.getClassification() - .equals(inner.getClassification()))) { + if (current.getDir() == inner.getDir() &&// + 
current.intersects(inner, yThreshold, xThreshold) &&// + (current.getClassification() == null || current.getClassification().equals(inner.getClassification()))) { boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate(); current.addAll(inner.getWords()); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java index 68c9c97..29e6ac6 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java @@ -26,24 +26,24 @@ public class RedactManagerBlockificationService { * This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this! * Rulings (TablePageBlock lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling. * - * @param textPositions The words of a page. + * @param words The words of a page. * @param visualizations * @return Page object that contains the Textblock and text statistics. 
*/ - public ClassificationPage blockify(List textPositions, CleanRulings cleanRulings, LayoutDebugLayer visualizations) { + public List blockify(List words, CleanRulings cleanRulings, LayoutDebugLayer visualizations) { CleanRulings usedRulings = cleanRulings.withoutTextRulings(); int indexOnPage = 0; List chunkWords = new ArrayList<>(); - List chunkBlockList = new ArrayList<>(); + List chunkBlockList = new ArrayList<>(); double minX = 1000, maxX = 0, minY = 1000, maxY = 0; Word prev = null; boolean wasSplitted = false; Double splitX1 = null; - for (Word word : textPositions) { + for (Word word : words) { boolean lineSeparation = word.getYDirAdj() - maxY > word.getHeight() * 1.25; boolean startFromTop = prev != null && word.getYDirAdj() < prev.getYDirAdj() - prev.getTextHeight(); @@ -111,7 +111,7 @@ public class RedactManagerBlockificationService { chunkBlockList.add(cb1); } - Iterator itty = chunkBlockList.iterator(); + Iterator itty = chunkBlockList.iterator(); TextPageBlock previousLeft = null; TextPageBlock previousRight = null; @@ -159,12 +159,12 @@ public class RedactManagerBlockificationService { previous = block; } - if (!textPositions.isEmpty()) { + if (!words.isEmpty() && visualizations != null) { visualizations.addTextBlockVisualizations(chunkBlockList.stream() - .toList(), textPositions.get(0).getPage()); + .toList(), words.get(0).getPage()); } - return new ClassificationPage(chunkBlockList); + return chunkBlockList; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TableOfContentsClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TableOfContentsClassificationService.java index 36f91d1..4cb3f86 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TableOfContentsClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TableOfContentsClassificationService.java @@ -19,7 +19,7 @@ import java.util.stream.Collectors; import org.springframework.stereotype.Service; -import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind; +import com.knecon.fforesight.service.layoutparser.processor.model.UnionFind; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java index 1be92f2..8d1dc08 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java @@ -5,7 +5,6 @@ import static java.util.stream.Collectors.groupingBy; import static java.util.stream.Collectors.toList; import java.awt.geom.Rectangle2D; -import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.LinkedList; @@ -15,7 +14,6 @@ import java.util.NoSuchElementException; import java.util.Objects; import java.util.Optional; import 
java.util.Set; -import java.util.stream.Collectors; import com.iqser.red.service.redaction.v1.server.model.document.DocumentTree; import com.iqser.red.service.redaction.v1.server.model.document.nodes.AbstractSemanticNode; @@ -36,8 +34,8 @@ import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBl import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; -import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter; -import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader; +import com.knecon.fforesight.service.layoutparser.processor.model.text.ClassificationFooter; +import com.knecon.fforesight.service.layoutparser.processor.model.text.ClassificationHeader; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.DocumentWithVisualization; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; @@ -112,9 +110,7 @@ public class DocumentGraphFactory { public void addParagraphOrHeadline(GenericSemanticNode parentNode, TextPageBlock originalTextBlock, - Context context, - List textBlocksToMerge, - LayoutParsingType layoutParsingType) { + Context context, LayoutParsingType layoutParsingType) { Page page = context.getPage(originalTextBlock.getPage()); @@ -129,17 +125,10 @@ public class DocumentGraphFactory { node = Paragraph.builder().documentTree(context.getDocumentTree()).build(); } - List textBlocks = new ArrayList<>(); - textBlocks.add(originalTextBlock); - textBlocks.addAll(textBlocksToMerge); - - AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSort(textBlocks), node, context, page); + AtomicTextBlock textBlock = 
context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSort(originalTextBlock), node, context, page); if (node instanceof DuplicatedParagraph duplicatedParagraph) { - AtomicTextBlock unsortedTextBlock = context.textBlockFactory.buildAtomicTextBlock(textBlocks.stream() - .flatMap(tb -> tb.getWords() - .stream()) - .collect(Collectors.toList()), node, context, page); + AtomicTextBlock unsortedTextBlock = context.textBlockFactory.buildAtomicTextBlock(originalTextBlock.getWords(), node, context, page); duplicatedParagraph.setUnsortedLeafTextBlock(unsortedTextBlock); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java index 4e4e900..8f53440 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SearchTextWithTextPositionFactory.java @@ -29,19 +29,19 @@ public class SearchTextWithTextPositionFactory { public static final double LINEBREAK_DELTA_TOLERANCE = 1.5; - public SearchTextWithTextPositionDto buildSearchTextToTextPositionDto(List sequences) { + public SearchTextWithTextPositionDto buildSearchTextToTextPositionDto(List words) { - if (sequences.isEmpty() || sequences.stream() + if (words.isEmpty() || words.stream() .allMatch(sequence -> sequence.getCharacters().isEmpty())) { return SearchTextWithTextPositionDto.empty(); } Context context = new Context(); - RedTextPosition currentTextPosition = sequences.get(0).getCharacters().get(0).getTextPosition(); + 
RedTextPosition currentTextPosition = words.get(0).getCharacters().get(0).getTextPosition(); RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").bBoxDirAdj(currentTextPosition.getBBoxDirAdj()).build(); - for (Word word : sequences) { + for (Word word : words) { for (int i = 0; i < word.getCharacters().size(); ++i) { currentTextPosition = word.getCharacters().get(i).getTextPosition(); @@ -66,7 +66,7 @@ public class SearchTextWithTextPositionFactory { ++context.stringIdx; } - List positions = sequences.stream() + List positions = words.stream() .map(Word::getCharacters) .flatMap(Collection::stream) .map(Character::getTextPosition) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java index b2d4826..480bcd2 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java @@ -1,12 +1,12 @@ package com.knecon.fforesight.service.layoutparser.processor.services.factory; -import static java.lang.String.format; import static java.util.Collections.emptyList; import java.util.ArrayList; import java.util.HashSet; import java.util.LinkedList; import java.util.List; +import java.util.Objects; import java.util.Optional; import java.util.Set; @@ -17,12 +17,13 @@ import com.iqser.red.service.redaction.v1.server.model.document.nodes.Section; import com.iqser.red.service.redaction.v1.server.model.document.nodes.SuperSection; import com.iqser.red.service.redaction.v1.server.model.document.nodes.TableOfContents; 
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; +import com.knecon.fforesight.service.layoutparser.processor.model.UnionFind; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeEntry; import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.utils.TableMergingUtility; +import com.knecon.fforesight.service.layoutparser.processor.services.tables.TableMergingUtility; import lombok.experimental.UtilityClass; @@ -60,7 +61,7 @@ public class SectionNodeFactory { section.setTreeId(getTreeId(parentNode, context, section)); - addFirstHeadlineDirectlyToSection(layoutParsingType, pageBlocks, context, section, document); + addFirstHeadlineDirectlyToSection(layoutParsingType, pageBlocks, context, section); boolean containsTablesAndTextBlocks = containsTablesAndTextBlocks(pageBlocks); if (containsTablesAndTextBlocks) { splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(layoutParsingType, @@ -73,8 +74,13 @@ public class SectionNodeFactory { } else if (type.equals(SectionTreeEntry.Type.SUPER_SECTION)) { // If a SuperSection contains more blocks than just a headline, we add a Section which contains the remaining textblocks. 
addSection(layoutParsingType, section, SectionTreeEntry.Type.SECTION, pageBlocks, emptyList(), context, document); + } else if (!pageBlocks.isEmpty() && pageBlocks.get(0) instanceof TextPageBlock) { + List textPageBlocks = pageBlocks.stream() + .map(block -> (TextPageBlock) block) + .toList(); + addParagraphsAndHeadlinesToSection(layoutParsingType, textPageBlocks, context, section); } else { - addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, pageBlocks, context, section, document); + addTablesToSection(pageBlocks, context, section, document, layoutParsingType); } images.stream() @@ -85,6 +91,28 @@ public class SectionNodeFactory { } + private static void addTablesToSection(List pageBlocks, + DocumentGraphFactory.Context context, + AbstractSemanticNode section, + Document document, + LayoutParsingType layoutParsingType) { + + List remainingBlocks = new ArrayList<>(pageBlocks); + Set alreadyMerged = new HashSet<>(); + for (AbstractPageBlock abstractPageBlock : pageBlocks) { + if (alreadyMerged.contains(abstractPageBlock)) { + continue; + } + if (abstractPageBlock instanceof TablePageBlock tablePageBlock) { + List tablesToMerge = TableMergingUtility.findConsecutiveTablesWithSameColCountAndSameHeaders(tablePageBlock, remainingBlocks); + alreadyMerged.addAll(tablesToMerge); + remainingBlocks.removeAll(tablesToMerge); + TableNodeFactory.addTable(layoutParsingType, section, tablesToMerge, context, document); + } + } + } + + private List getTreeId(GenericSemanticNode parentNode, DocumentGraphFactory.Context context, AbstractSemanticNode section) { if (parentNode == null) { @@ -98,54 +126,63 @@ public class SectionNodeFactory { private void addFirstHeadlineDirectlyToSection(LayoutParsingType layoutParsingType, List pageBlocks, DocumentGraphFactory.Context context, - AbstractSemanticNode section, - Document document) { + AbstractSemanticNode section) { if (pageBlocks.get(0).isHeadline()) { - addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, 
List.of(pageBlocks.get(0)), context, section, document); + addParagraphsAndHeadlinesToSection(layoutParsingType, List.of((TextPageBlock) pageBlocks.get(0)), context, section); pageBlocks.remove(0); } } - private void addTablesAndParagraphsAndHeadlinesToSection(LayoutParsingType layoutParsingType, - List pageBlocks, - DocumentGraphFactory.Context context, - AbstractSemanticNode section, - Document document) { + private void addParagraphsAndHeadlinesToSection(LayoutParsingType layoutParsingType, + List pageBlocks, + DocumentGraphFactory.Context context, + AbstractSemanticNode section) { - Set alreadyMerged = new HashSet<>(); - List remainingBlocks = new LinkedList<>(pageBlocks); - for (AbstractPageBlock abstractPageBlock : pageBlocks) { + List mergedPageBlocks = pageBlocks; + if (pageBlocks.size() > 1 && (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD) || layoutParsingType.equals(LayoutParsingType.REDACT_MANAGER_OLD))) { + mergedPageBlocks = mergeBlocks(pageBlocks); + } - if (alreadyMerged.contains(abstractPageBlock)) { - continue; - } + for (TextPageBlock textPageBlock : mergedPageBlocks) { + DocumentGraphFactory.addParagraphOrHeadline(section, textPageBlock, context, layoutParsingType); + } + } - remainingBlocks.removeAll(alreadyMerged); - if (abstractPageBlock instanceof TextPageBlock) { + private static List mergeBlocks(List pageBlocks) { - switch (layoutParsingType) { - case REDACT_MANAGER, DOCUMINE, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> { - alreadyMerged.add(abstractPageBlock); - remainingBlocks.remove(abstractPageBlock); - DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, new ArrayList<>(), layoutParsingType); - } - default -> { - List textBlocks = findTextBlocksWithSameClassificationAndAlignsY((TextPageBlock) abstractPageBlock, remainingBlocks); - alreadyMerged.addAll(textBlocks); - DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) 
abstractPageBlock, context, textBlocks, layoutParsingType); - } + UnionFind blockUnionFind = new UnionFind<>(new HashSet<>(pageBlocks)); + for (int i = 0; i < pageBlocks.size(); i++) { + TextPageBlock textPageBlock1 = pageBlocks.get(i); + for (int j = i; j < pageBlocks.size(); j++) { + if (i == j) { + continue; } - } else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) { - List tablesToMerge = TableMergingUtility.findConsecutiveTablesWithSameColCountAndSameHeaders(tablePageBlock, remainingBlocks); - alreadyMerged.addAll(tablesToMerge); - TableNodeFactory.addTable(layoutParsingType, section, tablesToMerge, context, document); - } else { - throw new RuntimeException(format("Unhandled AbstractPageBlockType %s!", abstractPageBlock.getClass())); + var textPageBlock2 = pageBlocks.get(j); + if (!Objects.equals(textPageBlock2.getPage(), textPageBlock1.getPage())) { + continue; + } + if (!Objects.equals(textPageBlock2.getDir(), textPageBlock1.getDir())) { + continue; + } + if (!Objects.equals(textPageBlock2.getClassification(), textPageBlock1.getClassification())) { + continue; + } + if (!textPageBlock2.intersectsYPdf(textPageBlock1)) { + continue; + } + if (textPageBlock2.isToDuplicate()) { + continue; + } + blockUnionFind.union(textPageBlock2, textPageBlock1); } } + return blockUnionFind.getGroups() + .stream() + .map(TextPageBlock::merge) + .toList(); } @@ -222,18 +259,4 @@ public class SectionNodeFactory { return splitList; } - - private List findTextBlocksWithSameClassificationAndAlignsY(TextPageBlock atc, List pageBlocks) { - - return pageBlocks.stream() - .filter(abstractTextContainer -> !abstractTextContainer.equals(atc)) - .filter(abstractTextContainer -> abstractTextContainer.getPage() == atc.getPage()) - .filter(abstractTextContainer -> abstractTextContainer instanceof TextPageBlock) - .filter(abstractTextContainer -> abstractTextContainer.intersectsYPdf(atc)) - .map(abstractTextContainer -> (TextPageBlock) abstractTextContainer) - 
.filter(abstractTextContainer -> abstractTextContainer.getDir() == atc.getDir()) - .filter(abstractTextContainer -> !abstractTextContainer.isToDuplicate()) - .toList(); - } - } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java index 321fab5..a55b822 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java @@ -4,7 +4,6 @@ import static java.util.Collections.emptyList; import java.util.Collection; import java.util.List; -import java.util.stream.Collectors; import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document; import com.iqser.red.service.redaction.v1.server.model.document.nodes.GenericSemanticNode; @@ -17,6 +16,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeEntry; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations; @@ -50,8 +50,6 @@ public class TableNodeFactory { List treeId = context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, table); table.setTreeId(treeId); addTableCells(layoutParsingType, 
mergedRows, table, context, document); - - ifTableHasNoHeadersSetFirstRowAsHeaders(table); } @@ -76,16 +74,6 @@ public class TableNodeFactory { } - private void ifTableHasNoHeadersSetFirstRowAsHeaders(Table table) { - - if (table.streamHeaders() - .findAny().isEmpty()) { - table.streamRow(0) - .forEach(tableCellNode -> tableCellNode.setHeader(true)); - } - } - - private void addTableCells(LayoutParsingType layoutParsingType, List> rows, Table table, DocumentGraphFactory.Context context, Document document) { for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) { @@ -115,32 +103,32 @@ public class TableNodeFactory { TextBlock textBlock; if (cell.getTextBlocks().isEmpty()) { tableCell.setLeafTextBlock(context.getTextBlockFactory().emptyTextBlock(tableNode, context, page)); - } else if (cell.getTextBlocks().size() == 1) { - textBlock = context.getTextBlockFactory().buildAtomicTextBlock(cell.getTextBlocks().get(0).getWords(), tableCell, context, page); + } else if (cell.getTextBlocks().size() == 1 && cell.getTextBlocks().get(0) instanceof TextPageBlock textPageBlock) { + textBlock = context.getTextBlockFactory().buildAtomicTextBlock(textPageBlock.getWords(), tableCell, context, page); tableCell.setLeafTextBlock(textBlock); - } else if (firstTextBlockIsHeadline(cell)) { - SectionNodeFactory.addSection(layoutParsingType, - tableCell, - SectionTreeEntry.Type.SECTION, - cell.getTextBlocks() - .stream() - .map(tb -> (AbstractPageBlock) tb) - .collect(Collectors.toList()), - emptyList(), - context, - document); - } else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) { - List sequences = TextPositionOperations.mergeAndSort(cell.getTextBlocks()); - textBlock = context.getTextBlockFactory().buildAtomicTextBlock(sequences, tableCell, context, page); + } else if (firstTextBlockIsHeadline(cell) || containsTables(cell.getTextBlocks())) { + SectionNodeFactory.addSection(layoutParsingType, tableCell, SectionTreeEntry.Type.SECTION, cell.getTextBlocks(), emptyList(), 
context, document); + } else if (cellAreaIsSmallerThanThreshold(cell, page)) { + List words = TextPositionOperations.sort(cell.getWords()); + textBlock = context.getTextBlockFactory().buildAtomicTextBlock(words, tableCell, context, page); tableCell.setLeafTextBlock(textBlock); } else { cell.getTextBlocks() - .forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList(), layoutParsingType)); + .stream() + .map(block -> (TextPageBlock) block) + .forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, layoutParsingType)); } } - private boolean cellAreaIsSmallerThanPageAreaTimesThreshold(Cell cell, Page page) { + private boolean containsTables(List pageBlocks) { + + return pageBlocks.stream() + .anyMatch(pageBlock -> pageBlock instanceof TablePageBlock); + } + + + private boolean cellAreaIsSmallerThanThreshold(Cell cell, Page page) { return cell.getArea() < TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD * page.getHeight() * page.getWidth(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TextBlockFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TextBlockFactory.java index ad70005..d165fc3 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TextBlockFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TextBlockFactory.java @@ -18,16 +18,16 @@ public class TextBlockFactory { long textBlockIdx; - public AtomicTextBlock buildAtomicTextBlock(List sequences, SemanticNode parent, DocumentGraphFactory.Context context, Page page) { + public AtomicTextBlock buildAtomicTextBlock(List words, SemanticNode parent, 
DocumentGraphFactory.Context context, Page page) { Integer numberOnPage = context.getAndIncrementTextBlockNumberOnPage(page); - return buildAtomicTextBlock(sequences, parent, numberOnPage, page); + return buildAtomicTextBlock(words, parent, numberOnPage, page); } - public AtomicTextBlock buildAtomicTextBlock(List sequences, SemanticNode parent, Integer numberOnPage, Page page) { + public AtomicTextBlock buildAtomicTextBlock(List words, SemanticNode parent, Integer numberOnPage, Page page) { - SearchTextWithTextPositionDto searchTextWithTextPositionDto = SearchTextWithTextPositionFactory.buildSearchTextToTextPositionDto(sequences); + SearchTextWithTextPositionDto searchTextWithTextPositionDto = SearchTextWithTextPositionFactory.buildSearchTextToTextPositionDto(words); int offset = stringOffset; stringOffset += searchTextWithTextPositionDto.getSearchText().length(); long idx = textBlockIdx; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/FindGraphicsRaster.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/FindGraphicsRaster.java index c2a2426..22f51c0 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/FindGraphicsRaster.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/FindGraphicsRaster.java @@ -11,14 +11,15 @@ import java.util.stream.Collectors; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.rendering.ImageType; import org.apache.pdfbox.rendering.PDFRenderer; -import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms; -import 
com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation; +import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import lombok.SneakyThrows; +import lombok.experimental.UtilityClass; -@Service +@UtilityClass public class FindGraphicsRaster { // Pixels that are lighter then this threshold are ignored @@ -33,7 +34,8 @@ public class FindGraphicsRaster { var renderer = new PDFRenderer(doc); var img = renderer.renderImageWithDPI(pageInformation.number() - 1, DPI, ImageType.GRAY); - var imageCtm = CoordinateTransforms.calculateImageCoordsToInitialUserSpaceCoords(pageInformation, CoordinateTransforms.calculateScalingFactor(pageInformation, img.getWidth())); + var imageCtm = CoordinateTransforms.calculateImageCoordsToInitialUserSpaceCoords(pageInformation, + CoordinateTransforms.calculateScalingFactor(pageInformation, img.getWidth())); return findCCBoundingBoxes(img, remove, THRESHOLD, DPI / 72, imageCtm); } @@ -47,13 +49,15 @@ public class FindGraphicsRaster { var w = image.getWidth(); var pixels = new int[w * h]; image.getRaster().getPixels(0, 0, w, h, pixels); - remove.stream().map(rect -> inverseCTM.createTransformedShape(rect).getBounds2D()).forEach(box -> { - for (int y = (int) Math.floor(box.getMinY() / rescale); y <= (int) Math.min(Math.ceil(box.getMaxY() / rescale), h); y++) { - for (int x = (int) Math.floor(box.getMinX() / rescale); x <= (int) Math.min(Math.ceil(box.getMaxX() / rescale), w); x++) { - pixels[w * y + x] = grayScaleTresh; - } - } - }); + remove.stream() + .map(rect -> RectangleTransformations.transform(rect, inverseCTM)) + .forEach(box -> { + for (int y = (int) Math.floor(box.getMinY() / rescale); y <= (int) Math.min(Math.ceil(box.getMaxY() / rescale), h); y++) { + for (int x = (int) Math.floor(box.getMinX() / rescale); x <= (int) Math.min(Math.ceil(box.getMaxX() / rescale), w); x++) { + pixels[w * y + x] = 
grayScaleTresh; + } + } + }); // var image2 = createImageFromMatrix(pixels, w, h); @@ -130,8 +134,10 @@ public class FindGraphicsRaster { } } } - return boundingBoxes.stream().filter(box -> box.area() > 0).map(box -> box.transform(imageCTM)).collect(Collectors.toList()); + return boundingBoxes.stream() + .filter(box -> box.area() > 0) + .map(box -> box.transform(imageCTM)) + .collect(Collectors.toList()); } - } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/GraphicExtractorService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/GraphicExtractorService.java index 127c6e8..5fe98d7 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/GraphicExtractorService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/graphics/GraphicExtractorService.java @@ -4,15 +4,14 @@ import java.awt.geom.Rectangle2D; import java.util.List; import java.util.stream.Collectors; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.pdmodel.PDPage; import org.springframework.stereotype.Service; +import com.iqser.red.service.redaction.v1.server.model.document.nodes.ImageType; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; +import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; -import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation; import 
lombok.RequiredArgsConstructor; import lombok.SneakyThrows; @@ -25,32 +24,13 @@ public class GraphicExtractorService { private static final int MIN_GRAPHICS_AREA = 500; private final GraphicsClusteringService graphicsClusteringService; - private final FindGraphicsRaster findGraphicsRaster; @SneakyThrows - public List extractPathElementGraphics(PDDocument pdDocument, - PDPage pdPage, - int pageNumber, - CleanRulings cleanRulings, - List words, - boolean graphicsRaster) { + public List extractPathElementGraphics(List graphicBBoxes, int pageNumber, CleanRulings cleanRulings) { - List characterBBoxes = getCharacterBBoxes(words); List classifiedRulingsBoxes = getLineBBoxesOfAllClassifiedRulings(cleanRulings); - GraphicBBDetector graphicBBDetector = new GraphicBBDetector(pdPage, true); - List graphicBBoxes = graphicBBDetector.findGraphicBB(); - - if (graphicsRaster) { - // This should only be used if ocr was performed, it is currently in an early stage and needs to be improved. - graphicBBoxes.addAll(findGraphicsRaster.findCCBoundingBoxes(pdDocument, - characterBBoxes.stream() - .map(box -> new Rectangle2D.Double(box.x1 - 2, box.y1 - 2, box.width() + 4, box.height() + 4)) - .collect(Collectors.toList()), - PageInformation.fromPDPage(pageNumber, pdPage))); - } - List filteredGraphicBBoxes = graphicBBoxes.stream() .filter(box -> !box.intersectsAny(classifiedRulingsBoxes, 4)) .collect(Collectors.toList()); @@ -59,19 +39,11 @@ public class GraphicExtractorService { return clusters.stream() .filter(box -> box.area() > MIN_GRAPHICS_AREA && box.height() > MIN_GRAPHICS_SIDE_LENGTH && box.width() > MIN_GRAPHICS_SIDE_LENGTH) + .map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), ImageType.GRAPHIC, false, pageNumber, "")) .toList(); } - private List getCharacterBBoxes(List words) { - - return words.stream() - .map(BoundingBox::getBBoxPdf) - .map(Box::new) - .collect(Collectors.toList()); - } - - private List 
getLineBBoxesOfAllClassifiedRulings(CleanRulings cleanRulings) { return cleanRulings.buildAll() diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/OutlineMapper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/OutlineMapper.java index 87bcfa8..f415868 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/OutlineMapper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/OutlineMapper.java @@ -14,7 +14,7 @@ import com.iqser.red.service.redaction.v1.server.model.document.nodes.Section; import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode; import com.iqser.red.service.redaction.v1.server.model.document.nodes.SuperSection; import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms; -import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation; +import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation; import com.knecon.fforesight.service.viewerdoc.model.Outline; import lombok.SneakyThrows; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java index d05dd66..8cc460b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java @@ -199,11 +199,18 @@ public class PDFLinesTextStripper extends PDFTextStripper { } - private void addVisibleRulings(List path, boolean stroke) throws IOException { + private void addVisibleRulings(List path, boolean stroke) { try { if (stroke && !getGraphicsState().getStrokingColor().isPattern() && isBlack(getGraphicsState().getStrokingColor()) || // !stroke && !getGraphicsState().getNonStrokingColor().isPattern() && isBlack(getGraphicsState().getNonStrokingColor())) { + // see spec '8.4.3.6 Line dash pattern' + var dashPattern = getGraphicsState().getLineDashPattern(); + if (dashPattern != null && dashPattern.getDashArray().length > 0) { + path.forEach(r -> r.setStyle(Ruling.Style.DASHED)); + } else { + path.forEach(r -> r.setStyle(Ruling.Style.SOLID)); + } rulings.addAll(path); } } catch (UnsupportedOperationException e) { @@ -247,9 +254,7 @@ public class PDFLinesTextStripper extends PDFTextStripper { } if (!words.isEmpty()) { - previous = words.get(words.size() - 1) - .getCharacters() - .get(words.get(words.size() - 1).getCharacters().size() - 1).getTextPosition(); + previous = words.get(words.size() - 1).getCharacters().get(words.get(words.size() - 1).getCharacters().size() - 1).getTextPosition(); } if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0") || textPositions.get(i).getUnicode().equals("\t"))) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/AreaSweepGridifier.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/AreaSweepGridifier.java new file mode 100644 index 0000000..5029e6d --- /dev/null +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/AreaSweepGridifier.java @@ -0,0 +1,138 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.tables; + +import java.awt.geom.AffineTransform; +import java.awt.geom.Point2D; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Comparator; +import java.util.HashSet; +import java.util.List; +import java.util.Optional; +import java.util.Set; + +import com.knecon.fforesight.service.layoutparser.processor.model.UnionFind; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; + +import lombok.experimental.UtilityClass; + +@UtilityClass +public class AreaSweepGridifier { + + public static final double CELL_AREA_CONTAINED_THRESHOLD = 0.8; + public static final double MIN_SIZE_FACTOR = 0.5; + + + /** + * Calculates the grid structure of the table. For spanning rows and columns multiple cells with the same values will be inserted. + * Works well for perfectly straight tables, but fails as soon as the tables are slightly rotated. Then the area sweep will drop some cells or duplicate them unnecessarily. 
+ * + * @return the table structure as a matrix of cells, given as a list of rows + */ + public List> gridify(Set cells, AffineTransform pageToPdfTransform, double minCellWidth, double minCellHeight) { + + if (cells.isEmpty()) { + return new ArrayList<>(); + } + + var colDividers = getColDividers(cells, minCellWidth); + var rowDividers = getRowDividers(cells, minCellHeight); + + List> rowsOfCells = new ArrayList<>(); + + for (int i = 1; i < rowDividers.size(); i++) { + double prevY = rowDividers.get(i - 1); + double y = rowDividers.get(i); + + List row = new ArrayList<>(); + + for (int j = 1; j < colDividers.size(); j++) { + double prevX = colDividers.get(j - 1); + double x = colDividers.get(j); + + var cellFromGridStructure = Cell.fromPageCoordinates(new Point2D.Double(prevX, prevY), new Point2D.Double(x, y), pageToPdfTransform); + + if (!cellFromGridStructure.hasMinimumSize()) { + continue; + } + + Optional matchingCell = cells.stream() + .map(originalCell -> new CellWithIntersection(originalCell, + RectangleTransformations.calculateIntersectedArea(cellFromGridStructure.getBBox(), originalCell.getBBox()))) + .filter(cellWithIntersection -> cellWithIntersection.intersectedArea() > 0) + .filter(cellWithIntersection -> cellFromGridStructure.getArea() > cellWithIntersection.intersectedArea * CELL_AREA_CONTAINED_THRESHOLD) + .max(Comparator.comparing(CellWithIntersection::intersectedArea)) + .map(CellWithIntersection::originalCell); + + if (matchingCell.isPresent()) { + cellFromGridStructure.getTextBlocks().addAll(matchingCell.get().getTextBlocks()); + cellFromGridStructure.setHeaderCell(matchingCell.get().isHeaderCell()); + } + + row.add(cellFromGridStructure); + + } + + rowsOfCells.add(row); + } + + return rowsOfCells; + } + + + private List getRowDividers(Collection cells, double minCellHeight) { + + Set uniqueY = new HashSet<>(); + cells.stream() + .filter(c -> !c.getTextBlocks().isEmpty() || c.getHeight() > 3 && c.getWidth() > 3) + .forEach(c -> { + uniqueY.add(c.getMinY()); +
uniqueY.add(c.getMaxY()); + }); + + return deduplicate(uniqueY, minCellHeight * MIN_SIZE_FACTOR); + } + + + private List getColDividers(Collection cells, double minCellWidth) { + + Set uniqueX = new HashSet<>(); + cells.stream() + .filter(c -> !c.getTextBlocks().isEmpty() || c.getHeight() > 3 && c.getWidth() > 3) + .forEach(c -> { + uniqueX.add(c.getMinX()); + uniqueX.add(c.getMaxX()); + }); + + return deduplicate(uniqueX, minCellWidth * MIN_SIZE_FACTOR); + } + + + private List deduplicate(Set doubles, double minDistance) { + // finds all doubles less than the minDistance apart and replaces them with their average + UnionFind uf = new UnionFind<>(doubles); + for (Double x : doubles) { + for (Double x2 : doubles) { + if (x.equals(x2)) { + continue; + } + if (Math.abs(x - x2) < minDistance) { + uf.union(x, x2); + } + } + } + return uf.getGroups() + .stream() + .map(xs -> xs.stream() + .mapToDouble(Double::doubleValue).average() + .orElseThrow()) + .sorted() + .toList(); + } + + + record CellWithIntersection(Cell originalCell, double intersectedArea) { + + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangularIntersectionFinder.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/RectangularIntersectionFinder.java similarity index 91% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangularIntersectionFinder.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/RectangularIntersectionFinder.java index 60a19b9..e059538 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangularIntersectionFinder.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/RectangularIntersectionFinder.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.utils; +package com.knecon.fforesight.service.layoutparser.processor.services.tables; import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.Y_FIRST_POINT_COMPARATOR; @@ -14,15 +14,6 @@ public class RectangularIntersectionFinder { public static List find(List horizontalRulingLines, List verticalRulingLines) { -// // Fix for 211.pdf -// for (Ruling r : horizontalRulingLines) { -// if (r.getX2() < r.getX1()) { -// double a = r.getX2(); -// r.x2 = (float) r.getX1(); -// r.x1 = (float) a; -// } -// } - List foundRectangles = new ArrayList<>(); Map intersectionPoints = RulingIntersectionFinder.findNaive(horizontalRulingLines, verticalRulingLines); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RulingIntersectionFinder.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/RulingIntersectionFinder.java similarity index 97% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RulingIntersectionFinder.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/RulingIntersectionFinder.java index e69bcee..fc6e943 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RulingIntersectionFinder.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/RulingIntersectionFinder.java 
@@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.utils; +package com.knecon.fforesight.service.layoutparser.processor.services.tables; import java.awt.geom.Point2D; import java.util.Collections; @@ -10,6 +10,7 @@ import java.util.Optional; import java.util.TreeMap; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons; import lombok.experimental.UtilityClass; import lombok.extern.slf4j.Slf4j; @@ -33,7 +34,7 @@ public class RulingIntersectionFinder { */ /* * The algorithm assumes there are only horizontal and vertical lines which are unique in their coordinates. (E.g. no overlapping horizontal lines exist) - * As a high level overview, the algorithm uses a sweep line advancing from left to right. + * As a high level overview, the algorithm uses a sweep line advancing from left to right. * It dynamically updates the horizontal rulings which are intersected by the current sweep line. * When the sweep line hits a vertical line, it then checks for all intersections with the currently intersected horizontal rulings. * THe trick of the algorithm is using a binary search tree to store the currently intersected horizontal rulings. This way the lookup should be in O(log n).
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RulingTextDirAdjustUtil.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/RulingTextDirAdjustUtil.java similarity index 92% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RulingTextDirAdjustUtil.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/RulingTextDirAdjustUtil.java index 04ff106..9dd77cd 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RulingTextDirAdjustUtil.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/RulingTextDirAdjustUtil.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.utils; +package com.knecon.fforesight.service.layoutparser.processor.services.tables; import java.awt.geom.Line2D; import java.awt.geom.Point2D; @@ -12,7 +12,7 @@ public final class RulingTextDirAdjustUtil { /** * Converts a ruling (line of a table) the same way TextPositions are converted in PDFBox. - * This will get the y position of the text, adjusted so that 0,0 is upper left and it is adjusted based on the text direction. + * This will get the y position of the text, adjusted so that 0,0 is upper left and it is adjusted based on the text direction. *

* See org.apache.pdfbox.text.TextPosition */ diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableAreaFiller.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableAreaFiller.java new file mode 100644 index 0000000..3838e15 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableAreaFiller.java @@ -0,0 +1,109 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.tables; + +import java.awt.geom.AffineTransform; +import java.awt.geom.Rectangle2D; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; + +import lombok.SneakyThrows; +import lombok.experimental.UtilityClass; + +@UtilityClass +public class TableAreaFiller { + + public Set findMissingCells(List cells, Rectangle2D areaPDF, AffineTransform pdfToPageTransform) { + + var area = RectangleTransformations.transform(areaPDF, pdfToPageTransform); + + List rectangles = cells.stream() + .map(BoundingBox::getBBox) + .toList(); + Set unfilledRects = findMissingRects(rectangles, area); + + AffineTransform pageToPdfTransform = getInverse(pdfToPageTransform); + + return unfilledRects.stream() + .map(rect -> Cell.fromPageCoordinates(rect, pageToPdfTransform)) + .collect(Collectors.toSet()); + } + + + public static Set findMissingRects(List rectangles, Rectangle2D area) { + + double minWidth = rectangles.stream() + .mapToDouble(Rectangle2D::getWidth) + .min().orElse(0) * 0.95; + double minHeight = 
rectangles.stream() + .mapToDouble(Rectangle2D::getHeight) + .min().orElse(0) * 0.95; + + Set unfilledRects = new HashSet<>(); + unfilledRects.add(area); + for (Rectangle2D rectangle : rectangles) { + unfilledRects = fillWithRectangle(unfilledRects, rectangle, minWidth, minHeight); + } + return unfilledRects; + } + + + private Set fillWithRectangle(Set unfilledRects, Rectangle2D rectToAdd, double minWidth, double minHeight) { + + Set remainingUnfilledRects = new HashSet<>(); + for (Rectangle2D unfilledRect : unfilledRects) { + if (!rectToAdd.intersects(unfilledRect)) { + remainingUnfilledRects.add(unfilledRect); + continue; + } + + boolean topAdded = false; + boolean bottomAdded = false; + + // Top rectangle + double topHeight = rectToAdd.getY() - unfilledRect.getY(); + if (topHeight > minHeight) { + topAdded = true; + Rectangle2D topRect = new Rectangle2D.Double(unfilledRect.getX(), unfilledRect.getY(), unfilledRect.getWidth(), topHeight); + remainingUnfilledRects.add(topRect); + } + // Bottom rectangle + double bottomHeight = unfilledRect.getMaxY() - rectToAdd.getMaxY(); + if (bottomHeight > minHeight) { + bottomAdded = true; + Rectangle2D bottomRect = new Rectangle2D.Double(unfilledRect.getX(), rectToAdd.getMaxY(), unfilledRect.getWidth(), bottomHeight); + remainingUnfilledRects.add(bottomRect); + } + + double y = topAdded ? rectToAdd.getY() : unfilledRect.getY(); + double maxY = bottomAdded ? 
rectToAdd.getMaxY() : unfilledRect.getMaxY(); + double height = maxY - y; + + // Left rectangle + double leftWidth = rectToAdd.getX() - unfilledRect.getX(); + if (leftWidth > minWidth) { + Rectangle2D leftRect = new Rectangle2D.Double(unfilledRect.getX(), y, leftWidth, height); + remainingUnfilledRects.add(leftRect); + } + // Right rectangle + double rightWidth = unfilledRect.getMaxX() - rectToAdd.getMaxX(); + if (rightWidth > minWidth) { + Rectangle2D rightRect = new Rectangle2D.Double(rectToAdd.getMaxX(), y, rightWidth, height); + remainingUnfilledRects.add(rightRect); + } + } + return remainingUnfilledRects; + } + + + @SneakyThrows + private static AffineTransform getInverse(AffineTransform pdfToPageTransform) { + + return pdfToPageTransform.createInverse(); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableExtractionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableExtractionService.java new file mode 100644 index 0000000..4e94e4a --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableExtractionService.java @@ -0,0 +1,270 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.tables; + +import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.CELL_SIZE_COMPARATOR; +import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.RECTANGLE_SIZE_COMPARATOR; + +import java.awt.Color; +import java.awt.geom.AffineTransform; +import java.awt.geom.Point2D; +import java.awt.geom.Rectangle2D; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import 
java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.function.Function; +import java.util.stream.Collectors; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; +import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; +import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ReadingOrderService; +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; +import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationService; +import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms; +import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons; +import com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators; +import com.knecon.fforesight.service.layoutparser.processor.utils.SpreadsheetFinder; +import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer; +import com.knecon.fforesight.service.ocr.v1.api.model.Table; +import com.knecon.fforesight.service.ocr.v1.api.model.TableCell; +import com.knecon.fforesight.service.ocr.v1.api.model.TableCellType; + +import lombok.AccessLevel; +import lombok.RequiredArgsConstructor; +import lombok.SneakyThrows; +import 
lombok.experimental.FieldDefaults; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@Service +@RequiredArgsConstructor +@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true) +public class TableExtractionService { + + public static final int MAX_ROWS_OR_COLS = 500; + public static final int MAX_CELLS = MAX_ROWS_OR_COLS * MAX_ROWS_OR_COLS; + BlockificationService blockificationService; + ReadingOrderService readingOrderService; + static int MIN_TABLE_CONTAINED_CELLS_WITH_TEXT = 1; + static double TABLE_UNIFORMITY_THRESHOLD = 0.7; + + + public List extractTables(List emptyCells, + List words, + PageInformation pageInformation, + List idpTables, + LayoutParsingType layoutParsingType, + LayoutDebugLayer layoutDebugLayer) { + + AffineTransform pdfToPageTransform = CoordinateTransforms.calculateInitialUserSpaceCoordsToPageCoords(pageInformation); + List tablePageBlocks; + if (idpTables == null || idpTables.isEmpty()) { + tablePageBlocks = extractTables(emptyCells, words, pdfToPageTransform, layoutParsingType, layoutDebugLayer, pageInformation); + } else { + tablePageBlocks = buildTableFromIdpResult(idpTables, words, pdfToPageTransform, layoutParsingType); + } + return tablePageBlocks; + } + + + private List extractTables(List emptyCells, + List words, + AffineTransform pdfToPageTransform, + LayoutParsingType layoutParsingType, + LayoutDebugLayer layoutDebugLayer, + PageInformation pageInformation) { + + // sort cells by size (height * width) ascending so that textBlocks are always assigned to the smallest cells that contain them + emptyCells.sort(CELL_SIZE_COMPARATOR); + + List cells = new ArrayList<>(new HashSet<>(emptyCells)); + DoubleComparisons.sort(cells, GeometricComparators.CELL_SORTER); + + List spreadsheetAreas = SpreadsheetFinder.findSpreadsheetsFromCells(cells); + // sort spreadsheetAreas by size (height * width) ascending so that cells are placed in the smallest tables first + // this way no cell duplication occurs when tables are contained in other 
tables and only the most inner table contains the cells + spreadsheetAreas.sort(RECTANGLE_SIZE_COMPARATOR); + + List tables = new ArrayList<>(); + for (Rectangle2D area : spreadsheetAreas) { + + List containedCells = new ArrayList<>(); + for (Cell cell : cells) { + if (cell.hasMinimumSize() && area.contains(cell.getBBoxPdf())) { + containedCells.add(cell); + } + } + + if (containedCells.isEmpty()) { + continue; + } + // if cells are missing, for example a corner hasn't been recognized (See files/syngenta/CustomerFiles/SinglePages/T4_Page16_138 IDD0000261736.pdf), + // the LinkedCell based gridification can deal with this, but the transpose logic will then drop the entire column. + // That's why we compute the missing Cells from the spreadsheet area and fill them in. + Set missingCells = TableAreaFiller.findMissingCells(containedCells, area, pdfToPageTransform); + + layoutDebugLayer.addCellVisualizations(missingCells, pageInformation.number(), Color.RED); + layoutDebugLayer.addCellVisualizations(List.of(new Cell(area, pdfToPageTransform)), pageInformation.number(), Color.BLUE); + + containedCells.addAll(missingCells); + + Set wordsInTable = new HashSet<>(); // As docstrum blockfication recomputes the words, we need to remember the origin words to remove them from the overall list of words + for (Cell cell : containedCells) { + Function contains = p -> cell.getBBoxPdf().contains(p); + Function containsRect = r -> cell.getBBoxPdf().contains(r); + BlocksWithTheirWords blocksWithTheirWords = sortBlocksIntoCell(layoutParsingType, words, tables, contains, containsRect); + cell.setTextBlocks(blocksWithTheirWords.blocks()); + wordsInTable.addAll(blocksWithTheirWords.words()); + } + + if (containedCells.size() > MAX_CELLS) { + continue; + } + + var containedCellsWithText = containedCells.stream() + .filter(cell -> !cell.getTextBlocks().isEmpty()) + .toList(); + + // verify if table would contain fewer cells with text than the threshold allows + if 
(containedCellsWithText.size() >= MIN_TABLE_CONTAINED_CELLS_WITH_TEXT && checkIfTableCellsAreUniform(containedCells)) { + + TablePageBlock tablePageBlock = new TableFromCellsExtractor(containedCells, pdfToPageTransform).extract(); + cells.removeAll(containedCells); + addTableIfValid(words, tablePageBlock, tables, wordsInTable); + } + } + + return tables; + } + + + private static void removeWordsFromCells(List words, TablePageBlock tablePageBlock) { + + Set wordsFromCells = new HashSet<>(tablePageBlock.getWords()); + words.removeAll(wordsFromCells); + } + + + private List buildTableFromIdpResult(List
idpTables, List words, AffineTransform pdfToPageTransform, LayoutParsingType layoutParsingType) { + + if (idpTables == null || idpTables.isEmpty()) { + return Collections.emptyList(); + } + List tables = new ArrayList<>(); + for (Table idpTable : idpTables) { + if (idpTable.bboxes().size() != 1) { + // Should never happen, as IDP still looks at pages individually. (I think so, at least 😅) + log.error("IDP Table on multiple pages are not handled yet!"); + continue; + } + + List cells = new ArrayList<>(idpTable.cells().size()); + Set wordsInTable = new HashSet<>(); // As docstrum blockfication recomputes the words, we need to remember the origin words to remove them from the overall list of words + for (TableCell idpCell : idpTable.cells()) { + Cell cell = new Cell(idpCell, pdfToPageTransform); + if (idpCell.kind().equals(TableCellType.ROW_HEADER) || idpCell.kind().equals(TableCellType.COLUMN_HEADER)) { + cell.setHeaderCell(true); + } + cells.add(cell); + Function contains = p -> idpCell.textRegion().region().bbox().get().contains(p); + Function containsRect = r -> idpCell.textRegion().region().bbox().get().contains(r); + BlocksWithTheirWords blocksWithTheirWords = sortBlocksIntoCell(layoutParsingType, words, tables, contains, containsRect); + cell.setTextBlocks(blocksWithTheirWords.blocks); + wordsInTable.addAll(blocksWithTheirWords.words()); + } + + TableGridStructureCalculator calculator = new TableGridStructureCalculator(cells, pdfToPageTransform); + List> gridCells = calculator.gridify(); + TablePageBlock tablePageBlock = new TablePageBlock(null, gridCells); + addTableIfValid(words, tablePageBlock, tables, wordsInTable); + } + return tables; + } + + + private static void addTableIfValid(List words, TablePageBlock tablePageBlock, List tables, Set wordsInTable) { + + if (tablePageBlock.getRowCount() > MAX_ROWS_OR_COLS || tablePageBlock.getColCount() == 0 || tablePageBlock.getColCount() > MAX_ROWS_OR_COLS) { + return; + } + words.removeAll(wordsInTable); + 
tables.add(tablePageBlock); + } + + + private BlocksWithTheirWords sortBlocksIntoCell(LayoutParsingType layoutParsingType, + List words, + List tables, + Function contains, + Function containsRect) { + + List wordsInCell = new LinkedList<>(); + for (Word word : words) { + Rectangle2D bBoxPdf = word.getBBoxPdf(); + if (!contains.apply(new Point2D.Double(bBoxPdf.getCenterX(), bBoxPdf.getCenterY()))) { + continue; + } + wordsInCell.add(word); + } + List textBlocks = blockificationService.blockify(layoutParsingType, wordsInCell, CleanRulings.empty(), null); + List tablesInCell = new LinkedList<>(); + for (TablePageBlock table : tables) { + if (containsRect.apply(table.getBBoxPdf())) { + tablesInCell.add(table); + } + } + var blocks = readingOrderService.resolve(textBlocks, tablesInCell); + return new BlocksWithTheirWords(blocks, wordsInCell); + } + + + private boolean checkIfTableCellsAreUniform(List containedCells) { + + if (containedCells.size() <= 2) { + return true; + } + + Map> cellsGroupedByRoundedWidth = containedCells.stream() + .map(BoundingBox::getWidth) + .map(size -> Math.round(size / 10.0) * 10) + .collect(Collectors.groupingBy(Long::longValue)); + + return (double) cellsGroupedByRoundedWidth.size() / containedCells.size() <= TABLE_UNIFORMITY_THRESHOLD; + } + + + @SneakyThrows + public static List findCells(List horizontalRulingLines, List verticalRulingLines, PageInformation pageInformation) { + + var solidHorizontalRulingLines = horizontalRulingLines.stream() + .filter(r -> !Objects.equals(Ruling.Style.DASHED, r.getStyle())) + .toList(); + var solidVerticalRulingLines = verticalRulingLines.stream() + .filter(r -> !Objects.equals(Ruling.Style.DASHED, r.getStyle())) + .toList(); + AffineTransform affineTransform = CoordinateTransforms.calculateInitialUserSpaceCoordsToPageCoords(pageInformation); + return RectangularIntersectionFinder.find(solidHorizontalRulingLines, solidVerticalRulingLines) + .stream() + .map(rect -> new Cell(rect, affineTransform)) + 
.collect(Collectors.toList()); + } + + + private record BlocksWithTheirWords(List blocks, Collection words) { + + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableFromCellsExtractor.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableFromCellsExtractor.java new file mode 100644 index 0000000..fa2ad62 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableFromCellsExtractor.java @@ -0,0 +1,133 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.tables; + +import java.awt.geom.AffineTransform; +import java.util.ArrayList; +import java.util.Collection; +import java.util.LinkedList; +import java.util.List; + +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; + +import lombok.Getter; +import lombok.Setter; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +public class TableFromCellsExtractor { + + @JsonIgnore + protected PageBlockType classification; + private List> rows; + @Getter + @Setter + private final List originCells; + private final AffineTransform pdfToPageTransform; + + + public TableFromCellsExtractor(List originCells, AffineTransform pdfToPageTransform) { + + classification = PageBlockType.TABLE; + this.originCells = originCells; + this.pdfToPageTransform = pdfToPageTransform; + } + + + public TablePageBlock extract() { + + computeRows(originCells); + + computeHeaders(); + + return new 
TablePageBlock(null, rows); + } + + + /** + * Detect header cells (either first row or first column): + * Column is marked as header if originalCell text is bold and row originalCell text is not bold. + * Defaults to row. + */ + private void computeHeaders() { + + // A bold originalCell is a header originalCell as long as every originalCell to the lefts/top is bold, too + // we move from lefts to rights and top to bottom + for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) { + List rowCells = rows.get(rowIndex); + if (rowCells.size() == 1) { + continue; + } + + for (int colIndex = 0; colIndex < rowCells.size(); colIndex++) { + Cell cell = rowCells.get(colIndex); + List cellsToTheLeft = rowCells.subList(0, colIndex); + Cell lastHeaderCell = null; + for (Cell leftCell : cellsToTheLeft) { + if (leftCell.isHeaderCell()) { + lastHeaderCell = leftCell; + } else { + break; + } + } + if (lastHeaderCell != null) { + cell.getHeaderCells().add(lastHeaderCell); + } + List cellsToTheTop = new ArrayList<>(); + for (int i = 0; i < rowIndex; i++) { + try { + cellsToTheTop.add(rows.get(i).get(colIndex)); + } catch (IndexOutOfBoundsException e) { + log.debug("No cell {} in row {}, ignoring.", colIndex, rowIndex); + } + } + for (Cell topCell : cellsToTheTop) { + if (topCell.isHeaderCell()) { + lastHeaderCell = topCell; + } else { + break; + } + } + if (lastHeaderCell != null) { + cell.getHeaderCells().add(lastHeaderCell); + } + if (!cell.getTextBlocks().isEmpty() // + && cell.getTextBlocks().get(0) instanceof TextPageBlock textPageBlock // + && textPageBlock.getMostPopularWordStyle().equals("bold")) { + cell.setHeaderCell(true); + } + } + } + setFirstRowAsHeaderIfNoneFound(rows); + } + + + private void setFirstRowAsHeaderIfNoneFound(List> rows) { + + if (rows.isEmpty()) { + return; + } + + if (rows.stream() + .flatMap(Collection::stream) + .noneMatch(Cell::isHeaderCell)) { + rows.get(0) + .forEach(cell -> cell.setHeaderCell(true)); + } + + } + + + private void 
computeRows(List cells) { + + if (cells.isEmpty()) { + return; + } + + TableGridStructureCalculator calculator = new TableGridStructureCalculator(cells, pdfToPageTransform); + rows = calculator.gridify(); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableGridStructureCalculator.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableGridStructureCalculator.java new file mode 100644 index 0000000..b1c21d2 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableGridStructureCalculator.java @@ -0,0 +1,353 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.tables; + +import java.awt.geom.AffineTransform; +import java.awt.geom.Point2D; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; + +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +public class TableGridStructureCalculator { + + // multiplied with minimum cell height/width, Cells may be at most this apart in one dimension, and must overlap at least that much in the other dimension to be considered neighbours + private static final double DISTANCE_FACTOR = 0.5; + Set cells; + AffineTransform pageToPdfTransform; + double minCellHeight; + double minCellWidth; + + + @SneakyThrows + TableGridStructureCalculator(Collection cells, AffineTransform pdfToPageTransform) { + + this.cells = new HashSet<>(cells); + this.pageToPdfTransform = pdfToPageTransform.createInverse(); + this.minCellHeight = cells.stream() + 
.mapToDouble(cell -> cell.getBBox().getHeight()) + .min().orElse(0); + this.minCellWidth = cells.stream() + .mapToDouble(cell -> cell.getBBox().getWidth()) + .min().orElse(0); + } + + + /** + * Calculates the grid structure of the table. For spanning rows and columns multiple cells with the same values will be inserted. + * Checks if any cell has more than one neighbor in any direction, if it does, it splits the cell according to its neighbors. + * This is repeated until no more splits are necessary. Then the rows are computed using that very same linked neighbor structure starting with the top left cell. + * + * @return TablePageBlock Structure as a rows of cells matrix + */ + public List> gridify() { + + if (cellsHaveLargeOverlaps()) { + // If cells overlap significantly, the logic below will keep splitting them infinitely, so we revert to the simpler area sweep implementation. + List> rows = AreaSweepGridifier.gridify(cells, pageToPdfTransform, minCellWidth, minCellHeight); + rows = removeEmptyRows(rows); + rows = removeEmptyCols(rows); + return rows; + } + + var linkedCells = cells.stream() + .map(LinkedCell::new) + .collect(Collectors.toList()); + + computeNeighbours(linkedCells); + + while (linkedCells.stream() + .anyMatch(LinkedCell::needsSplit)) { + + List newCells = new LinkedList<>(); + for (LinkedCell linkedCell : linkedCells) { + if (linkedCell.needsSplit()) { + newCells.addAll(linkedCell.split()); + } else { + newCells.add(linkedCell); + } + } + computeNeighbours(newCells); + linkedCells = newCells; + } + return buildStructure(linkedCells); + } + + + private boolean cellsHaveLargeOverlaps() { + + for (Cell cell1 : cells) { + for (Cell cell2 : cells) { + if (cell1.equals(cell2)) { + continue; + } + if (cell1.horizontalOverlap(cell2) > minCellWidth * DISTANCE_FACTOR // + && cell1.verticalOverlap(cell2) > minCellHeight * DISTANCE_FACTOR) { + return true; + } + } + } + return false; + } + + + private List> buildStructure(List cells) { + + if 
(cells.isEmpty()) { + return Collections.emptyList(); + } + List> rows = buildRows(cells); + if (isNotRectangular(rows)) { + throw new AssertionError(); + } + rows = removeEmptyRows(rows); + rows = removeEmptyCols(rows); + return rows; + } + + + private boolean isNotRectangular(List> rows) { + + if (rows.isEmpty()) { + return true; + } + int n = rows.get(0).size(); + return rows.stream() + .anyMatch(row -> row.size() != n); + } + + + private List> buildRows(List cells) { + + List topLeftCandidates = cells.stream() + .filter(LinkedCell::isTopLeft) + .toList(); + + assert topLeftCandidates.size() == 1; + var cell = topLeftCandidates.get(0); + + List> rows = new ArrayList<>(); + rows.add(buildRow(cell)); + while (!cell.belows.isEmpty()) { + cell = cell.belows.get(0); + rows.add(buildRow(cell)); + } + if (isNotRectangular(rows)) { + throw new AssertionError(); + } + return rows; + } + + + private static List buildRow(LinkedCell cell) { + + List currentRow = new ArrayList<>(); + LinkedCell nextCell = cell; + currentRow.add(cell.originalCell); + while (!nextCell.rights.isEmpty()) { + nextCell = nextCell.rights.get(0); + currentRow.add(nextCell.originalCell); + } + return currentRow; + } + + + private void computeNeighbours(List cells) { + + for (LinkedCell cell : cells) { + cell.resetNeighbours(); + computeNeighbours(cell, cells); + } + + } + + + private void computeNeighbours(LinkedCell cell, List otherCells) { + + for (LinkedCell otherCell : otherCells) { + if (cell.equals(otherCell)) { + continue; + } + if (cell.originalCell.horizontalDistance(otherCell.originalCell) <= minCellWidth * DISTANCE_FACTOR + && cell.originalCell.verticalOverlap(otherCell.originalCell) >= minCellHeight * DISTANCE_FACTOR) { + if (cell.originalCell.getBBox().getCenterX() <= otherCell.originalCell.getBBox().getCenterX()) { + cell.rights.add(otherCell); + } else { + cell.lefts.add(otherCell); + } + } else if (cell.originalCell.verticalDistance(otherCell.originalCell) <= minCellHeight * 
DISTANCE_FACTOR + && cell.originalCell.horizontalOverlap(otherCell.originalCell) >= minCellWidth * DISTANCE_FACTOR) { + if (cell.originalCell.getBBox().getCenterY() <= otherCell.originalCell.getBBox().getCenterY()) { + cell.belows.add(otherCell); + } else { + cell.aboves.add(otherCell); + } + } + } + + } + + + static List> transpose(List> table) { + + List> ret = new ArrayList>(); + final int N = table.get(0).size(); + for (int i = 0; i < N; i++) { + List col = new ArrayList(); + for (List row : table) { + col.add(row.get(i)); + } + ret.add(col); + } + return ret; + } + + + private List> removeEmptyCols(List> rowsOfCells) { + + if (rowsOfCells.isEmpty()) { + return rowsOfCells; + } + + var colsOfCells = transpose(rowsOfCells); + colsOfCells = removeEmptyRows(colsOfCells); + return transpose(colsOfCells); + } + + + private List> removeEmptyRows(List> rowsOfCells) { + + return rowsOfCells.stream() + .filter(row -> row.stream() + .anyMatch(cell -> !cell.getTextBlocks().isEmpty())) + .collect(Collectors.toList()); + } + + + class LinkedCell { + + private final Cell originalCell; + private final List rights; + private final List lefts; + private final List aboves; + private final List belows; + + + LinkedCell(Cell cell) { + + this.originalCell = cell; + this.rights = new LinkedList<>(); + this.lefts = new LinkedList<>(); + this.aboves = new LinkedList<>(); + this.belows = new LinkedList<>(); + } + + + public boolean needsSplit() { + + return rights.size() > 1 || lefts.size() > 1 || aboves.size() > 1 || belows.size() > 1; + } + + + public boolean isTopLeft() { + + return lefts.isEmpty() && aboves.isEmpty(); + } + + + public String toString() { + + return originalCell.toString(); + } + + + public Collection split() { + + if (rights.size() > 1 && rights.size() >= lefts.size()) { + return splitY(rights); + } + if (lefts.size() > 1) { + return splitY(lefts); + } + if (aboves.size() > 1 && aboves.size() >= belows.size()) { + return splitX(aboves); + } + if (belows.size() > 1) 
{ + return splitX(belows); + } + return List.of(this); + } + + + private List splitY(List neighbours) { + + List splitCells = new LinkedList<>(); + List ySplit = neighbours.stream() + .map(right -> right.originalCell.getMaxY()) + .sorted() + .toList(); + Point2D topLeft = new Point2D.Double(originalCell.getBBox().getMinX(), originalCell.getBBox().getMinY()); + double maxX = originalCell.getBBox().getMaxX(); + double x = originalCell.getBBox().getX(); + double maxY = originalCell.getBBox().getMaxY(); + for (Double neighborY : ySplit) { + double y = Math.min(neighborY, maxY); + Point2D bottomRight = new Point2D.Double(maxX, y); + Cell cell = copyCell(topLeft, bottomRight); + splitCells.add(new LinkedCell(cell)); + topLeft = new Point2D.Double(x, y); + } + return splitCells; + } + + + private List splitX(List neighbours) { + + List splitCells = new LinkedList<>(); + List xSplit = neighbours.stream() + .map(right -> right.originalCell.getMaxX()) + .sorted() + .toList(); + Point2D topLeft = new Point2D.Double(originalCell.getBBox().getMinX(), originalCell.getBBox().getMinY()); + double maxY = originalCell.getBBox().getMaxY(); + double y = originalCell.getBBox().getY(); + double maxX = originalCell.getBBox().getMaxX(); + for (Double neighborX : xSplit) { + double x = Math.min(neighborX, maxX); + Point2D bottomRight = new Point2D.Double(x, maxY); + Cell cell = copyCell(topLeft, bottomRight); + splitCells.add(new LinkedCell(cell)); + topLeft = new Point2D.Double(x, y); + } + return splitCells; + } + + + private Cell copyCell(Point2D topLeft, Point2D bottomRight) { + + Cell cell = Cell.fromPageCoordinates(topLeft, bottomRight, pageToPdfTransform); + cell.setHeaderCell(originalCell.isHeaderCell()); + cell.setTextBlocks(originalCell.getTextBlocks()); + return cell; + } + + + public void resetNeighbours() { + + rights.clear(); + lefts.clear(); + aboves.clear(); + belows.clear(); + } + + } + +} diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TableMergingUtility.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableMergingUtility.java similarity index 98% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TableMergingUtility.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableMergingUtility.java index d33b63c..4a369d7 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TableMergingUtility.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableMergingUtility.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.utils; +package com.knecon.fforesight.service.layoutparser.processor.services.tables; import java.util.Collection; import java.util.Collections; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/IdpResultLayer.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/IdpResultLayer.java new file mode 100644 index 0000000..2990eba --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/IdpResultLayer.java @@ -0,0 +1,113 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.visualization; + +import java.awt.Color; +import java.util.Objects; + +import 
com.knecon.fforesight.service.ocr.v1.api.model.Figure; +import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult; +import com.knecon.fforesight.service.ocr.v1.api.model.KeyValuePair; +import com.knecon.fforesight.service.ocr.v1.api.model.QuadPoint; +import com.knecon.fforesight.service.ocr.v1.api.model.Region; +import com.knecon.fforesight.service.ocr.v1.api.model.Table; +import com.knecon.fforesight.service.ocr.v1.api.model.TableCell; +import com.knecon.fforesight.service.ocr.v1.api.model.TableCellType; +import com.knecon.fforesight.service.ocr.v1.api.model.TextRegion; +import com.knecon.fforesight.service.viewerdoc.layers.IdpLayerConfig; +import com.knecon.fforesight.service.viewerdoc.model.ColoredLine; +import com.knecon.fforesight.service.viewerdoc.model.FilledRectangle; +import com.knecon.fforesight.service.viewerdoc.model.Visualizations; + +import lombok.AccessLevel; +import lombok.experimental.FieldDefaults; + +@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true) +public class IdpResultLayer extends IdpLayerConfig { + + public static final int LINE_WIDTH = 1; + + + public IdpResultLayer(IdpResult result) { + + result.tables() + .forEach(this::addTable); + result.keyValuePairs() + .forEach(this::addKeyValue); + result.figures() + .forEach(this::addFigure); + } + + + private void addFigure(Figure figure) { + + addRegion(figure.image(), figures, IMAGE_COLOR); + if (figure.caption() != null) { + addRegion(figure.caption().region(), figures, IMAGE_COLOR); + } + } + + + private void addTable(Table table) { + + for (Region bbox : table.bboxes()) { + addRegion(bbox, tables, TABLE_COLOR); + } + for (TableCell cell : table.cells()) { + addRegion(cell.textRegion().region(), tables, INNER_LINES_COLOR); + if (Objects.equals(cell.kind(), TableCellType.ROW_HEADER) || Objects.equals(cell.kind(), TableCellType.COLUMN_HEADER)) { + addRegionAsFilledRect(cell.textRegion().region(), tables, HEADER_CELL_COLOR); + } + } + if (table.caption() != null) { + 
addRegion(table.caption().region(), tables, TABLE_COLOR); + } + for (TextRegion footnote : table.footnotes()) { + addRegion(footnote.region(), tables, FOOTNOTE_COLOR); + } + } + + + private void addQuadPoint(int pageNumber, QuadPoint bbox, Visualizations vis, Color color) { + + var visOnPage = getOrCreateVisualizationsOnPage(pageNumber, vis); + bbox.asLines() + .forEach(line -> visOnPage.getColoredLines().add(new ColoredLine(line, color, LINE_WIDTH))); + } + + + private void addRegion(Region region, Visualizations vis, Color color) { + + var sectionsOnPage = getOrCreateVisualizationsOnPage(region.pageNumber(), vis); + region.bbox().get().asLines() + .forEach(line -> sectionsOnPage.getColoredLines().add(new ColoredLine(line, color, LINE_WIDTH))); + } + + + private void addRegionAsFilledRect(Region region, Visualizations vis, Color color) { + + var sectionsOnPage = getOrCreateVisualizationsOnPage(region.pageNumber(), vis); + sectionsOnPage.getFilledRectangles().add(new FilledRectangle(region.bbox().get().getBounds2D(), color, 0.2f)); + } + + + public void addKeyValue(KeyValuePair keyValue) { + + if (keyValue.key() != null) { + addRegion(keyValue.key().region(), keyValuePairs, KEY_COLOR); + } + if (keyValue.value() != null) { + addRegion(keyValue.value().region(), keyValuePairs, VALUE_COLOR); + } + if (keyValue.key() != null && keyValue.value() != null) { + QuadPoint key = keyValue.key().region().bbox().get(); + QuadPoint value = keyValue.value().region().bbox().get(); + + var line = LineUtils.findClosestMidpointLine(key, value); + var arrowHead = LineUtils.createArrowHead(line, Math.min(LineUtils.length(line), 5)); + var linesOnPage = getOrCreateVisualizationsOnPage(keyValue.key().region().pageNumber(), keyValuePairs).getColoredLines(); + linesOnPage.add(new ColoredLine(line, KEY_VALUE_BBOX_COLOR, LINE_WIDTH)); + linesOnPage.add(new ColoredLine(arrowHead[0], KEY_VALUE_BBOX_COLOR, LINE_WIDTH)); + linesOnPage.add(new ColoredLine(arrowHead[1], KEY_VALUE_BBOX_COLOR, 
LINE_WIDTH)); + } + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java index d4f8f4e..4617101 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java @@ -1,6 +1,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services.visualization; import java.io.File; +import java.util.LinkedList; import java.util.List; import org.springframework.stereotype.Service; @@ -14,6 +15,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi import com.knecon.fforesight.service.layoutparser.processor.model.DocumentWithVisualization; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.OutlineMapper; import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutGrid; +import com.knecon.fforesight.service.viewerdoc.layers.LayerGroup; import com.knecon.fforesight.service.viewerdoc.model.Outline; import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService; @@ -48,16 +50,15 @@ public class LayoutGridService { document.layoutDebugLayer().addSentenceVisualization(document.document().getTextBlock()); document.layoutDebugLayer().addOutlineHeadlines(document.document()); + List layers = new LinkedList<>(); + layers.add(layoutGrid); if (document.layoutDebugLayer().isActive()) { - viewerDocumentService.addLayerGroups(originFile, - destinationFile, - List.of(layoutGrid, document.layoutDebugLayer()), - 
layoutParserVersion, - layoutParsingTypeName, - outline); - } else { - viewerDocumentService.addLayerGroups(originFile, destinationFile, List.of(layoutGrid), layoutParserVersion, layoutParsingTypeName, outline); + layers.add(document.layoutDebugLayer()); + } + + viewerDocumentService.addLayerGroups(originFile, destinationFile, layers, layoutParserVersion, layoutParsingTypeName, outline); + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LineUtils.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LineUtils.java new file mode 100644 index 0000000..93ce08c --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LineUtils.java @@ -0,0 +1,125 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.visualization; + +import java.awt.Color; +import java.awt.geom.AffineTransform; +import java.awt.geom.Line2D; +import java.awt.geom.Point2D; +import java.util.List; + +import com.knecon.fforesight.service.ocr.v1.api.model.QuadPoint; +import com.knecon.fforesight.service.viewerdoc.model.ColoredLine; + +import lombok.experimental.UtilityClass; + +@UtilityClass +public class LineUtils { + + public List quadPointAsLines(QuadPoint rect, boolean tight) { + + if (tight) { + return List.of(new ColoredLine(new Line2D.Double(rect.a(), rect.b()), Color.GREEN, 1), + new ColoredLine(new Line2D.Double(rect.b(), rect.c()), Color.GREEN, 1), + new ColoredLine(new Line2D.Double(rect.c(), rect.d()), Color.GREEN, 1), + new ColoredLine(new Line2D.Double(rect.d(), rect.a()), Color.GREEN, 1)); + } + + return List.of(new ColoredLine(new Line2D.Double(rect.a(), rect.b()), Color.BLUE, 1), + new ColoredLine(new Line2D.Double(rect.b(), rect.c()), Color.BLUE, 1), + new 
ColoredLine(new Line2D.Double(rect.c(), rect.d()), Color.BLUE, 1), + new ColoredLine(new Line2D.Double(rect.d(), rect.a()), Color.BLUE, 1)); + } + + + public List quadPointAsLines(QuadPoint rect, Color color) { + + return List.of(new ColoredLine(new Line2D.Double(rect.a(), rect.b()), color, 1), + new ColoredLine(new Line2D.Double(rect.b(), rect.c()), color, 1), + new ColoredLine(new Line2D.Double(rect.c(), rect.d()), color, 1), + new ColoredLine(new Line2D.Double(rect.d(), rect.a()), color, 1)); + } + + + public static Line2D transform(Line2D line2D, AffineTransform affineTransform) { + + var p1 = affineTransform.transform(line2D.getP1(), null); + var p2 = affineTransform.transform(line2D.getP2(), null); + return new Line2D.Double(p1, p2); + } + + + public static double length(Line2D line2D) { + + return line2D.getP1().distance(line2D.getP2()); + } + + + public static Line2D findClosestMidpointLine(QuadPoint quad1, QuadPoint quad2) { + + List lines1 = quad1.asLines() + .toList(); + List lines2 = quad2.asLines() + .toList(); + + Line2D closestLine1 = null; + Line2D closestLine2 = null; + double minDistance = Double.MAX_VALUE; + + for (Line2D line1 : lines1) { + for (Line2D line2 : lines2) { + double distance = lineDistance(line1, line2); + if (distance < minDistance) { + minDistance = distance; + closestLine1 = line1; + closestLine2 = line2; + } + } + } + + if (closestLine1 == null || closestLine2 == null) { + throw new IllegalStateException("Could not find closest lines"); + } + + Point2D midpoint1 = getMidpoint(closestLine1); + Point2D midpoint2 = getMidpoint(closestLine2); + + return new Line2D.Double(midpoint1, midpoint2); + } + + + private static double lineDistance(Line2D line1, Line2D line2) { + + return Math.abs(getMidpoint(line1).distance(getMidpoint(line2))); + } + + + private static Point2D getMidpoint(Line2D line) { + + double x = (line.getX1() + line.getX2()) / 2; + double y = (line.getY1() + line.getY2()) / 2; + return new Point2D.Double(x, y); + } + + 
+ public static Line2D[] createArrowHead(Line2D line, double arrowLength) { + + Point2D start = line.getP1(); + Point2D end = line.getP2(); + + // Calculate the angle of the line + double angle = Math.atan2(end.getY() - start.getY(), end.getX() - start.getX()); + + // Calculate the points for the two arrow lines + double arrowHeadAngle = Math.PI / 6; + double x1 = end.getX() - arrowLength * Math.cos(angle - arrowHeadAngle); + double y1 = end.getY() - arrowLength * Math.sin(angle - arrowHeadAngle); + double x2 = end.getX() - arrowLength * Math.cos(angle + arrowHeadAngle); + double y2 = end.getY() - arrowLength * Math.sin(angle + arrowHeadAngle); + + // Create and return the two arrow lines + Line2D arrow1 = new Line2D.Double(end, new Point2D.Double(x1, y1)); + Line2D arrow2 = new Line2D.Double(end, new Point2D.Double(x2, y2)); + + return new Line2D[]{arrow1, arrow2}; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/BBoxMergingUtility.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/BBoxMergingUtility.java deleted file mode 100644 index 4191450..0000000 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/BBoxMergingUtility.java +++ /dev/null @@ -1,34 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.processor.utils; - -import java.awt.geom.Rectangle2D; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; - -import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page; - -import lombok.experimental.UtilityClass; - -@UtilityClass -public class BBoxMergingUtility { - - public Map mergeBBoxes(List> bboxesToMerge) { - - Map bBoxPerPage = new HashMap<>(); - Set pages = bboxesToMerge.stream() - .flatMap(map -> 
map.keySet() - .stream()) - .collect(Collectors.toSet()); - for (Page page : pages) { - Rectangle2D bBoxOnPage = bboxesToMerge.stream() - .filter(childBboxPerPage -> childBboxPerPage.containsKey(page)) - .map(childBboxPerPage -> childBboxPerPage.get(page)) - .collect(RectangleTransformations.collectBBox()); - bBoxPerPage.put(page, bBoxOnPage); - } - return bBoxPerPage; - } - -} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/CoordinateTransforms.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/CoordinateTransforms.java index ce3f99f..4eefcfd 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/CoordinateTransforms.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/CoordinateTransforms.java @@ -2,6 +2,8 @@ package com.knecon.fforesight.service.layoutparser.processor.utils; import java.awt.geom.AffineTransform; +import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation; + import lombok.SneakyThrows; import lombok.experimental.UtilityClass; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/GeometricComparators.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/GeometricComparators.java index 7bd53a2..ca73e32 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/GeometricComparators.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/GeometricComparators.java @@ -4,12 +4,14 @@ 
import java.awt.geom.Point2D; import java.awt.geom.Rectangle2D; import java.util.Comparator; +import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; public class GeometricComparators { private static final int COMPARATOR_ROUNDING = 2; + static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f; public static final Comparator X_FIRST_POINT_COMPARATOR = (point1, point2) -> { @@ -58,6 +60,17 @@ public class GeometricComparators { return cell1Size.compareTo(cell2Size); }; + public static final Comparator CELL_SORTER = (o1, o2) -> { + + if (o1.equals(o2)) { + return 0; + } + if (o1.verticalOverlapPdf(o2) > VERTICAL_COMPARISON_THRESHOLD * ((o1.getHeight() + o2.getHeight()) / 2)) { + return Double.compare(o1.getMinX(), o2.getMinX()); + } else { + return Double.compare(o1.getMaxY(), o2.getMaxY()); + } + }; public static final Comparator RECTANGLE_SIZE_COMPARATOR = (rect1, rect2) -> { Double rect1Size = rect1.getHeight() * rect1.getWidth(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PageInformation.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PageInformation.java deleted file mode 100644 index 884c717..0000000 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PageInformation.java +++ /dev/null @@ -1,59 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.processor.utils; - -import java.awt.geom.Rectangle2D; - -import org.apache.pdfbox.pdmodel.PDPage; -import org.apache.pdfbox.pdmodel.common.PDRectangle; - -import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page; - -public record 
PageInformation(Rectangle2D mediabox, int number, int rotationDegrees) { - - public static PageInformation fromPDPage(int pageNum, PDPage page) { - - PDRectangle mediaBox = page.getMediaBox(); - return new PageInformation(new Rectangle2D.Double(mediaBox.getLowerLeftX(), mediaBox.getLowerLeftY(), mediaBox.getWidth(), mediaBox.getHeight()), - pageNum, - page.getRotation()); - } - - - public static PageInformation fromPage(Page page) { - - return new PageInformation(new Rectangle2D.Double(0, 0, page.getWidth(), page.getHeight()), page.getNumber(), page.getRotation()); - } - - - public double height() { - - return mediabox.getHeight(); - } - - - public double heightRot() { - - if (rotationDegrees == 90 || rotationDegrees == 270) { - return width(); - } - return height(); - } - - - public double width() { - - return mediabox.getWidth(); - } - - - public double minX() { - - return mediabox.getX(); - } - - - public double minY() { - - return mediabox.getY(); - } - -} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/ProtobufUtil.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/ProtobufUtil.java deleted file mode 100644 index 8d1a5b1..0000000 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/ProtobufUtil.java +++ /dev/null @@ -1,42 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.processor.utils; - -import java.io.BufferedOutputStream; -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.util.zip.GZIPOutputStream; - -import com.google.protobuf.Message; -import com.google.protobuf.MessageOrBuilder; -import com.google.protobuf.Struct; -import com.google.protobuf.util.JsonFormat; - -import lombok.SneakyThrows; -import 
lombok.experimental.UtilityClass; - -@UtilityClass -public class ProtobufUtil { - - public static String toJson(MessageOrBuilder messageOrBuilder) throws IOException { - return JsonFormat.printer().print(messageOrBuilder); - } - - @SuppressWarnings("unchecked") - public static Message fromJson(String json) throws IOException { - Message.Builder structBuilder = Struct.newBuilder(); - JsonFormat.parser().ignoringUnknownFields().merge(json, structBuilder); - return structBuilder.build(); - } - - - @SneakyThrows - public File serializeToTempFile(T any) { - var tempFile = File.createTempFile("storage-protobuf", ".data"); - - try (var fos = new GZIPOutputStream(new BufferedOutputStream(new FileOutputStream(tempFile)))) { - any.writeTo(fos); - return tempFile; - } - } - -} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java index 75ba59b..cc9f218 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java @@ -2,10 +2,12 @@ package com.knecon.fforesight.service.layoutparser.processor.utils; import static java.lang.String.format; +import java.awt.geom.AffineTransform; import java.awt.geom.Point2D; import java.awt.geom.Rectangle2D; import java.awt.geom.RectangularShape; import java.util.ArrayList; +import java.util.Collection; import java.util.Collections; import java.util.LinkedList; import java.util.List; @@ -125,7 +127,7 @@ public class RectangleTransformations { } - public static Rectangle2D rectangle2DBBox(List 
rectangle2DList) { + public static Rectangle2D rectangle2DBBox(Collection rectangle2DList) { return rectangle2DList.stream() .collect(new Rectangle2DBBoxCollector()); @@ -185,6 +187,12 @@ public class RectangleTransformations { } + public static Rectangle2D transform(Rectangle2D rect, AffineTransform transform) { + + return transform.createTransformedShape(rect).getBounds2D(); + } + + private static class Rectangle2DBBoxCollector implements Collector { @Override diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java index 0830a42..257560a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java @@ -13,7 +13,7 @@ import java.util.stream.Collectors; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.AngleFilter; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox; -import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind; +import com.knecon.fforesight.service.layoutparser.processor.model.UnionFind; import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils; import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; @@ -46,6 +46,12 @@ public class TextPositionOperations { return sortUsingLineDetection(sequences); } + public List 
mergeAndSort(TextPageBlock textBlocks) { + + var sequences = new HashSet<>(textBlocks.getWords()); + return sortUsingLineDetection(sequences); + } + public List sort(List sequences) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/UnionFind.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/UnionFind.java deleted file mode 100644 index d6af3fa..0000000 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/UnionFind.java +++ /dev/null @@ -1,44 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.processor.utils; - -import java.util.HashMap; -import java.util.Map; - -// simple implementation of a disjoint-set data structure -// https://en.wikipedia.org/wiki/Disjoint-set_data_structure -public class UnionFind { - - Map parents = new HashMap<>(); - Map numberOfObjects = new HashMap<>(); - - - public T find(T node) { - - if (!parents.containsKey(node)) { - parents.put(node, node); - numberOfObjects.put(node, 1); - } - if (!node.equals(parents.get(node))) { - parents.put(node, find(parents.get(node))); - } - return parents.get(node); - } - - - public void union(T node1, T node2) { - - T root1 = find(node1); - T root2 = find(node2); - - if (!root1.equals(root2)) { - if (numberOfObjects.getOrDefault(root1, 1) < numberOfObjects.getOrDefault(root2, 1)) { - parents.put(root1, root2); - numberOfObjects.put(root2, numberOfObjects.get(root2) + numberOfObjects.get(root1)); - } else { - parents.put(root2, root1); - numberOfObjects.put(root1, numberOfObjects.get(root1) + numberOfObjects.get(root2)); - } - } - } - -} - diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java index 7fc72c1..5866717 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutDebugLayer.java @@ -17,7 +17,6 @@ import java.util.concurrent.atomic.AtomicInteger; import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; -import com.iqser.red.service.redaction.v1.server.data.LayoutEngineProto; import com.iqser.red.service.redaction.v1.server.model.document.TextRange; import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document; import com.iqser.red.service.redaction.v1.server.model.document.nodes.LayoutEngine; @@ -36,7 +35,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import com.knecon.fforesight.service.layoutparser.processor.services.classification.NumberWord; import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms; import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; -import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation; +import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import com.knecon.fforesight.service.viewerdoc.layers.LayoutDebugLayerConfig; import com.knecon.fforesight.service.viewerdoc.model.ColoredLine; @@ -59,7 +58,7 @@ import lombok.experimental.FieldDefaults; @FieldDefaults(level = AccessLevel.PRIVATE) public class LayoutDebugLayer extends 
LayoutDebugLayerConfig { - boolean active; + boolean active = true; Map outlineObjectsWithoutPointsPerPage = new HashMap<>(); @@ -141,7 +140,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig { } - public void addCellVisualizations(List cells, int pageNumber) { + public void addCellVisualizations(Collection cells, int pageNumber, Color color) { if (!active) { return; @@ -149,7 +148,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig { VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.cells); visualizationsOnPage.getColoredRectangles() .addAll(cells.stream() - .map(cell -> new ColoredRectangle(cell.getBBoxPdf(), CELLS_COLOR, 1)) + .map(cell -> new ColoredRectangle(cell.getBBoxPdf(), color == null ? CELLS_COLOR : color, 1)) .toList()); } @@ -211,7 +210,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig { } - public void addTextBlockVisualizations(List textPageBlocks, int page) { + public void addTextBlockVisualizations(List textPageBlocks, int page) { if (!active) { return; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutGrid.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutGrid.java index fa6ea96..911acdc 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutGrid.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutGrid.java @@ -26,9 +26,8 @@ import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNo import com.iqser.red.service.redaction.v1.server.model.document.nodes.SuperSection; import com.iqser.red.service.redaction.v1.server.model.document.nodes.Table; import 
com.iqser.red.service.redaction.v1.server.model.document.nodes.TableCell; -import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; +import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation; import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms; -import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import com.knecon.fforesight.service.viewerdoc.layers.LayoutGridLayerConfig; import com.knecon.fforesight.service.viewerdoc.model.ColoredLine; @@ -93,19 +92,10 @@ public class LayoutGrid extends LayoutGridLayerConfig { public void addTreeId(SemanticNode semanticNode) { Page page = semanticNode.getFirstPage(); - if (semanticNode.getBBox() - .get(page) == null) { + if (semanticNode.getBBox().get(page) == null) { return; } - addPlacedText(page, - semanticNode.getBBox() - .get(page), - semanticNode.getBBox() - .get(page), - buildTreeIdString(semanticNode), - 1, - treeIds, - TREEID_COLOR); + addPlacedText(page, semanticNode.getBBox().get(page), semanticNode.getBBox().get(page), buildTreeIdString(semanticNode), 1, treeIds, TREEID_COLOR); } @@ -134,8 +124,7 @@ public class LayoutGrid extends LayoutGridLayerConfig { .toList(); Integer maxChildDepth = subSections.stream() .map(node -> node.getTreeId().size()) - .max(Integer::compareTo) - .orElse(section.getTreeId().size()); + .max(Integer::compareTo).orElse(section.getTreeId().size()); int ownDepth = section.getTreeId().size(); Page firstPage = section.getFirstPage(); @@ -321,8 +310,7 @@ public class LayoutGrid extends LayoutGridLayerConfig { Visualizations visualizations = semanticNode.getType().equals(NodeType.TABLE_OF_CONTENTS) ? 
toc : sections; List coloredLines = getOrCreateVisualizationsOnPage(page.getNumber(), visualizations).getColoredLines(); int lineWidthModifier = maxChildDepth - ownDepth; - Rectangle2D r = RectangleTransformations.pad(semanticNode.getBBox() - .get(page), LINE_WIDTH * (1 + lineWidthModifier), LINE_WIDTH * (1 + lineWidthModifier)); + Rectangle2D r = RectangleTransformations.pad(semanticNode.getBBox().get(page), LINE_WIDTH * (1 + lineWidthModifier), LINE_WIDTH * (1 + lineWidthModifier)); SemanticNode highestParent = semanticNode.getHighestParent(); Rectangle2D highestParentRect = rectangleMap.get(new RectangleIdentifier(highestParent.getTreeId(), page.getNumber())); @@ -371,8 +359,7 @@ public class LayoutGrid extends LayoutGridLayerConfig { List ys = yStream.collect(Collectors.toList()); ys.remove(0); - Rectangle2D tableBBox = table.getBBox() - .get(page); + Rectangle2D tableBBox = table.getBBox().get(page); List coloredLines = getOrCreateVisualizationsOnPage(page.getNumber(), tables).getColoredLines(); xs.forEach(x -> { diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableAreaFillerTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableAreaFillerTest.java new file mode 100644 index 0000000..95dbc58 --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/processor/services/tables/TableAreaFillerTest.java @@ -0,0 +1,60 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.tables; + +import static org.junit.jupiter.api.Assertions.*; + +import java.awt.geom.AffineTransform; +import java.awt.geom.Rectangle2D; +import java.util.Iterator; +import java.util.List; +import java.util.Set; +import java.util.stream.Stream; + +import org.junit.jupiter.api.Test; + +import 
com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; + +class TableAreaFillerTest { + + @Test + void findMissingCells() { + + Rectangle2D area = new Rectangle2D.Double(0, 0, 2, 2); + List rectangles = List.of(new Rectangle2D.Double(0, 0, 1, 1), new Rectangle2D.Double(1, 1, 1, 1), new Rectangle2D.Double(1, 0, 1, 1)); + Set missing = TableAreaFiller.findMissingRects(rectangles, area); + + assertEquals(1, missing.size()); + assertEquals(new Rectangle2D.Double(0, 1, 1, 1), missing.iterator().next()); + } + + + @Test + void findMissingCells2() { + + Rectangle2D area = new Rectangle2D.Double(0, 0, 3, 3); + List rectangles = List.of(new Rectangle2D.Double(0, 0, 1, 1), + new Rectangle2D.Double(1, 0, 1, 1), + new Rectangle2D.Double(2, 0, 1, 1), + new Rectangle2D.Double(0, 1, 1, 1), + new Rectangle2D.Double(1, 1, 1, 1), + new Rectangle2D.Double(2, 1, 1, 1)); + + + var missing = TableAreaFiller.findMissingRects(rectangles, area); + assertEquals(1, missing.size()); + assertEquals(new Rectangle2D.Double(0, 2, 3, 1), missing.iterator().next()); + } + + @Test + void findMissingCells3() { + + Rectangle2D area = new Rectangle2D.Double(0, 0, 2, 2); + List rectangles = List.of(new Rectangle2D.Double(0, 0, 1, 1)); + Set missing = TableAreaFiller.findMissingRects(rectangles, area); + + assertEquals(2, missing.size()); + Iterator iterator = missing.iterator(); + assertEquals(new Rectangle2D.Double(0, 1, 2, 1), iterator.next()); + assertEquals(new Rectangle2D.Double(1, 0, 1, 1), iterator.next()); + } + +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/AbstractTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/AbstractTest.java index 6505037..d8fea97 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/AbstractTest.java 
+++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/AbstractTest.java @@ -75,6 +75,7 @@ public abstract class AbstractTest { protected final static String TENANT_ID = "tenant"; protected final static String VIEWER_DOCUMENT_ID = "viewer"; protected final static String SIMPLIFIED_ID = "simplified"; + protected final static String IDP_ID = "idp"; protected LayoutParsingRequest buildStandardLayoutParsingRequest() { @@ -117,7 +118,14 @@ public abstract class AbstractTest { public static LayoutParsingRequest buildDefaultLayoutParsingRequest(String fileName, LayoutParsingType layoutParsingType, boolean debug) { + return buildDefaultLayoutParsingRequest(fileName, layoutParsingType, debug, false); + } + + + public static LayoutParsingRequest buildDefaultLayoutParsingRequest(String fileName, LayoutParsingType layoutParsingType, boolean debug, boolean withIdpResult) { + var identifier = debug ? Map.of("fileId", fileName, "debug", "true") : Map.of("fileId", fileName); + Optional idpResultStorageId = withIdpResult ? 
Optional.of(fileName + IDP_ID) : Optional.empty(); return LayoutParsingRequest.builder() .identifier(identifier) .layoutParsingType(layoutParsingType) @@ -132,6 +140,7 @@ public abstract class AbstractTest { .simplifiedTextStorageId(fileName + SIMPLIFIED_ID) .viewerDocumentStorageId(fileName + VIEWER_DOCUMENT_ID) .documentMarkdownFileStorageId(Optional.of(fileName + MARKDOWN_FILE_ID)) + .idpResultStorageId(idpResultStorageId) .build(); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java index 4dcfeae..e53d8e6 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java @@ -34,6 +34,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.factory.Doc import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper; import com.knecon.fforesight.service.layoutparser.processor.utils.DrawingOptions; import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; +import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult; import lombok.SneakyThrows; @@ -51,11 +52,12 @@ public class BdrJsonBuildTest extends AbstractTest { return DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.CLARIFYND, layoutParsingPipeline.parseLayout(LayoutParsingType.CLARIFYND, - file, - new ImageServiceResponse(), - new TableServiceResponse(), - new VisualLayoutParsingResponse(), - Map.of("file",file.toString()))).document(); + file, + new ImageServiceResponse(), + new TableServiceResponse(), + IdpResult.empty(), + new VisualLayoutParsingResponse(), + 
Map.of("file",file.toString()))).document(); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BuildDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BuildDocumentTest.java index 0feb7df..9d156a7 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BuildDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BuildDocumentTest.java @@ -16,6 +16,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.DocumentWithVi import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; +import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult; import lombok.SneakyThrows; @@ -39,6 +40,7 @@ public abstract class BuildDocumentTest extends AbstractTest { fileResource, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse(), + IdpResult.empty(), new VisualLayoutParsingResponse(), Map.of("file", filename, "debug", "true")); } @@ -63,6 +65,7 @@ public abstract class BuildDocumentTest extends AbstractTest { layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId() .get()), new TableServiceResponse(), + IdpResult.empty(), new VisualLayoutParsingResponse(), layoutParsingRequest.identifier())); } else { diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java 
b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java index 7fac654..7da3a68 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java @@ -30,6 +30,7 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.ima import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; +import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult; import com.knecon.fforesight.tenantcommons.TenantsClient; import lombok.AllArgsConstructor; @@ -106,6 +107,7 @@ public class HeadlinesGoldStandardIntegrationTest { pdfFileResource.getFile(), new ImageServiceResponse(), new TableServiceResponse(), + IdpResult.empty(), new VisualLayoutParsingResponse(), Map.of("file", filePath))).document(); var foundHeadlines = documentGraph.streamAllSubNodes() diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java index 14b96fa..8152a60 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java +++ 
b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java @@ -1,6 +1,7 @@ package com.knecon.fforesight.service.layoutparser.server; import java.io.File; +import java.io.FileInputStream; import java.nio.file.Files; import java.nio.file.Path; import java.util.Arrays; @@ -24,6 +25,7 @@ import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; @Slf4j +@Disabled public class LayoutparserEnd2EndTest extends AbstractTest { public static final LayoutParsingType LAYOUT_PARSING_TYPE = LayoutParsingType.DOCUMINE_OLD; @@ -33,15 +35,24 @@ public class LayoutparserEnd2EndTest extends AbstractTest { @Test - @Disabled public void testLayoutParserEndToEnd() { - String filePath = "/home/kschuettler/Dokumente/LayoutparsingEvaluation/VV-340050.pdf"; + String filePath = "/home/kschuettler/Downloads/2021-2048323.pdf"; runForFile(filePath); } + @Test + public void testLayoutParserEndToEndWithIdpResult() { + + String filePath = "/tmp/OCR_TEST/2009-1048395_50pages_tables.pdf/document.pdf"; + String idpResultPath = "/tmp/OCR_TEST/2009-1048395_50pages_tables.pdf/idpResult.json"; + + runForFile(filePath, idpResultPath); + } + + @Test @Disabled @SneakyThrows @@ -62,9 +73,15 @@ public class LayoutparserEnd2EndTest extends AbstractTest { } - @SneakyThrows private void runForFile(String filePath) { + runForFile(filePath, null); + } + + + @SneakyThrows + private void runForFile(String filePath, String idpResultPath) { + String fileName = Path.of(filePath).getFileName().toString(); File file; if (filePath.startsWith("files")) { // from resources @@ -73,7 +90,13 @@ public class LayoutparserEnd2EndTest extends AbstractTest { file = new File(filePath); } - LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LAYOUT_PARSING_TYPE, true); + LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LAYOUT_PARSING_TYPE, true, true); + + if 
(layoutParsingRequest.idpResultStorageId().isPresent() && idpResultPath != null) { + try (var in = new FileInputStream(idpResultPath)) { + storageService.storeObject(TENANT_ID, layoutParsingRequest.idpResultStorageId().get(), in); + } + } prepareStorage(layoutParsingRequest, file); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/OutlineDetectionTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/OutlineDetectionTest.java index 34a68e9..b631272 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/OutlineDetectionTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/OutlineDetectionTest.java @@ -32,6 +32,7 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService; +import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult; import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService; import lombok.SneakyThrows; @@ -192,6 +193,7 @@ public class OutlineDetectionTest extends AbstractTest { fileResource, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse(), + IdpResult.empty(), new VisualLayoutParsingResponse(), Map.of("file", filename, "debug", "true")); } @@ -209,6 +211,7 @@ public class OutlineDetectionTest extends AbstractTest { layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId() .get()), new TableServiceResponse(), + IdpResult.empty(), new 
VisualLayoutParsingResponse(), layoutParsingRequest.identifier())); } else { diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/SimplifiedTextServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/SimplifiedTextServiceTest.java index e40cae6..790907e 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/SimplifiedTextServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/SimplifiedTextServiceTest.java @@ -20,6 +20,7 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse; import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; +import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult; import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; @@ -61,6 +62,7 @@ public class SimplifiedTextServiceTest extends AbstractTest { file, new ImageServiceResponse(), new TableServiceResponse(), + IdpResult.empty(), new VisualLayoutParsingResponse(), Map.of("file", file.toString()))).document(); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java index d5cd172..c2750a2 100644 --- 
a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java @@ -21,6 +21,7 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.server.BuildDocumentTest; +import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult; import lombok.SneakyThrows; @@ -58,11 +59,12 @@ public class DocumentGraphJsonWritingTest extends BuildDocumentTest { Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, - filename.toFile(), - new ImageServiceResponse(), - new TableServiceResponse(), - new VisualLayoutParsingResponse(), - Map.of("file",filename.toFile().toString()))).document(); + filename.toFile(), + new ImageServiceResponse(), + new TableServiceResponse(), + IdpResult.empty(), + new VisualLayoutParsingResponse(), + Map.of("file",filename.toFile().toString()))).document(); DocumentData documentData = DocumentDataMapper.toDocumentData(documentGraph); ObjectMapper mapper = ObjectMapperFactory.create(); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphMappingTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphMappingTest.java index 5ad24f8..6c2cc50 100644 --- 
a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphMappingTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphMappingTest.java @@ -29,7 +29,7 @@ public class DocumentGraphMappingTest extends BuildDocumentTest { @SneakyThrows public void testGraphMapping() { - String filename = "files/syngenta/CustomerFiles/SYNGENTA_EFSA_sanitisation_GFL_v1_moreSections.pdf"; + String filename = "files/syngenta/CustomerFiles/Fludioxonil_duplicates.pdf"; Document document = buildGraph(filename); DocumentData documentData = DocumentDataMapper.toDocumentData(document); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index 3cecb90..37fefcc 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -17,8 +17,9 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService; -import com.knecon.fforesight.service.layoutparser.server.PDFNetInitializer; import com.knecon.fforesight.service.layoutparser.server.BuildDocumentTest; +import 
com.knecon.fforesight.service.layoutparser.server.PDFNetInitializer; +import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult; import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService; import lombok.SneakyThrows; @@ -74,6 +75,7 @@ public class ViewerDocumentTest extends BuildDocumentTest { documentFile, new ImageServiceResponse(), tableResponse, + IdpResult.empty(), new VisualLayoutParsingResponse(), Map.of("file", Path.of(fileName).getFileName().toFile().toString())); PDFTronViewerDocumentService viewerDocumentService = new PDFTronViewerDocumentService(null); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java index c303959..51d1c82 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java @@ -39,6 +39,8 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.server.AbstractTest; +import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult; +import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService; import lombok.SneakyThrows; @@ -58,6 +60,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { originDocument, new ImageServiceResponse(), tableServiceResponse, + IdpResult.empty(), 
new VisualLayoutParsingResponse(), Map.of("file", "document")); @@ -103,24 +106,19 @@ public class PdfSegmentationServiceTest extends AbstractTest { String textToSearch = "Annex to Regulation 283/2013 Annex to Regulation 284/2013"; ClassPathResource pdfFileResource = new ClassPathResource(fileName); - List textPositionPerPage = PageContentExtractor.getSortedPageContents(fileName); + List textPositionPerPage = PageContentExtractor.getDocumentContents(pdfFileResource.getFile(), 4); var textPositions = textPositionPerPage.stream() - .flatMap(t -> t.getSortedWords() + .flatMap(t -> t.getWords() .stream() .map(Word::toString)) .collect(Collectors.joining(" ")); - assertThat(textPositions.contains(textToSearch)).isFalse(); + assertThat(textPositions.contains(textToSearch)).isTrue(); ClassificationDocument classificationDocument = buildClassificationDocument(pdfFileResource.getFile()); - assertThat(classificationDocument.getHeaders() - .get(0).getTextBlocks().size()).isEqualTo(3); - assertThat(classificationDocument.getHeaders() - .get(0).getTextBlocks() - .get(0).getWords().size()).isEqualTo(8); - assertThat(classificationDocument.getHeaders() - .get(0).getTextBlocks() - .get(0).toString()).contains(textToSearch); + assertThat(classificationDocument.getHeaders().get(0).getTextBlocks().size()).isEqualTo(3); + assertThat(classificationDocument.getHeaders().get(0).getTextBlocks().get(0).getWords().size()).isEqualTo(8); + assertThat(classificationDocument.getHeaders().get(0).getTextBlocks().get(0).toString()).contains(textToSearch); Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, classificationDocument).document(); @@ -216,8 +214,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { .stream() .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) - .toList() - .get(0); + .toList().get(0); 
assertThat(table.getColCount()).isEqualTo(6); assertThat(table.getRowCount()).isEqualTo(13); assertThat(table.getRows() @@ -246,8 +243,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { .stream() .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) - .toList() - .get(0); + .toList().get(0); assertThat(firstTable.getColCount()).isEqualTo(8); assertThat(firstTable.getRowCount()).isEqualTo(1); TablePageBlock secondTable = document.getSectionTree().getAllTableOfContentItems() @@ -256,12 +252,10 @@ public class PdfSegmentationServiceTest extends AbstractTest { .stream() .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) - .toList() - .get(1); + .toList().get(1); assertThat(secondTable.getColCount()).isEqualTo(8); assertThat(secondTable.getRowCount()).isEqualTo(2); - List> firstTableHeaderCells = firstTable.getRows() - .get(0) + List> firstTableHeaderCells = firstTable.getRows().get(0) .stream() .map(Collections::singletonList) .collect(Collectors.toList()); @@ -293,8 +287,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { .stream() .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) - .toList() - .get(0); + .toList().get(0); assertThat(firstTable.getColCount()).isEqualTo(9); assertThat(firstTable.getRowCount()).isEqualTo(5); TablePageBlock secondTable = document.getSectionTree().getAllTableOfContentItems() @@ -303,12 +296,10 @@ public class PdfSegmentationServiceTest extends AbstractTest { .stream() .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) - .toList() - .get(1); + .toList().get(1); assertThat(secondTable.getColCount()).isEqualTo(9); assertThat(secondTable.getRowCount()).isEqualTo(6); - List> 
firstTableHeaderCells = firstTable.getRows() - .get(firstTable.getRowCount() - 1) + List> firstTableHeaderCells = firstTable.getRows().get(firstTable.getRowCount() - 1) .stream() .map(Cell::getHeaderCells) .collect(Collectors.toList()); @@ -340,8 +331,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { .stream() .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) - .toList() - .get(0); + .toList().get(0); assertThat(firstTable.getColCount()).isEqualTo(8); assertThat(firstTable.getRowCount()).isEqualTo(1); TablePageBlock secondTable = document.getSectionTree().getAllTableOfContentItems() @@ -350,12 +340,10 @@ public class PdfSegmentationServiceTest extends AbstractTest { .stream() .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) - .toList() - .get(1); + .toList().get(1); assertThat(secondTable.getColCount()).isEqualTo(8); assertThat(secondTable.getRowCount()).isEqualTo(6); - List> firstTableHeaderCells = firstTable.getRows() - .get(0) + List> firstTableHeaderCells = firstTable.getRows().get(0) .stream() .map(Collections::singletonList) .collect(Collectors.toList()); @@ -376,10 +364,10 @@ public class PdfSegmentationServiceTest extends AbstractTest { validateTableSize(document, 4); - validateTable(document, 0, 1, 1, 0, 0); - validateTable(document, 1, 2, 2, 0, 0); - validateTable(document, 2, 4, 19, 12, 0); - validateTable(document, 3, 2, 12, 0, 0); + validateTable(document, 0, 1, 1, 0); + validateTable(document, 1, 2, 2, 0); + validateTable(document, 2, 2, 12, 0); + validateTable(document, 3, 4, 19, 12); } @@ -393,10 +381,10 @@ public class PdfSegmentationServiceTest extends AbstractTest { validateTableSize(document, 4); - validateTable(document, 0, 5, 4, 0, 0); - validateTable(document, 1, 5, 15, 14, 0); - validateTable(document, 2, 5, 14, 11, 0); - validateTable(document, 3, 5, 
3, 0, 0); + validateTable(document, 0, 5, 4, 0); + validateTable(document, 1, 5, 15, 14); + validateTable(document, 2, 5, 14, 11); + validateTable(document, 3, 5, 3, 0); } @@ -410,7 +398,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { validateTableSize(document, 1); - validateTable(document, 0, 8, 8, 0, 0); + validateTable(document, 0, 8, 8, 0); List> values = Arrays.asList(Arrays.asList("Annex point Reference within DAR/RAR", "Author, date", @@ -455,10 +443,10 @@ public class PdfSegmentationServiceTest extends AbstractTest { validateTableSize(document, 4); - validateTable(document, 0, 3, 2, 0, 0); - validateTable(document, 1, 3, 2, 0, 0); - validateTable(document, 2, 3, 3, 0, 0); - validateTable(document, 3, 3, 3, 0, 0); + validateTable(document, 0, 3, 2, 0); + validateTable(document, 1, 3, 2, 0); + validateTable(document, 2, 3, 3, 0); + validateTable(document, 3, 3, 3, 0); } @@ -473,7 +461,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { validateTableSize(document, 1); - validateTable(document, 0, 7, 4, 0, 0); + validateTable(document, 0, 7, 4, 0); } @@ -486,7 +474,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { validateTableSize(document, 1); - validateTable(document, 0, 7, 4, 0, 0); + validateTable(document, 0, 7, 4, 0); } @@ -499,12 +487,12 @@ public class PdfSegmentationServiceTest extends AbstractTest { validateTableSize(document, 6); - validateTable(document, 0, 2, 1, 0, 0); - validateTable(document, 1, 2, 1, 0, 0); - validateTable(document, 2, 2, 5, 0, 0); - validateTable(document, 3, 2, 5, 0, 0); - validateTable(document, 4, 2, 4, 0, 0); - validateTable(document, 5, 2, 1, 0, 0); + validateTable(document, 0, 2, 1, 0); + validateTable(document, 1, 2, 1, 0); + validateTable(document, 2, 2, 5, 0); + validateTable(document, 3, 2, 5, 0); + validateTable(document, 4, 2, 4, 0); + validateTable(document, 5, 2, 1, 0); } @@ -518,9 +506,9 @@ public class PdfSegmentationServiceTest extends AbstractTest { 
validateTableSize(document, 3); - validateTable(document, 0, 7, 9, 0, 0); - validateTable(document, 1, 2, 1, 0, 0); - validateTable(document, 2, 2, 10, 0, 0); + validateTable(document, 0, 7, 9, 0); + validateTable(document, 1, 2, 1, 0); + validateTable(document, 2, 2, 10, 0); } @@ -533,7 +521,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); validateTableSize(document, 1); - validateTable(document, 0, 9, 9, 0, 0); + validateTable(document, 0, 9, 9, 0); } @@ -547,7 +535,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { validateTableSize(document, 1); - validateTable(document, 0, 9, 5, 6, 0); + validateTable(document, 0, 9, 5, 6); } @@ -560,7 +548,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); validateTableSize(document, 1); - validateTable(document, 0, 9, 6, 7, 0); + validateTable(document, 0, 9, 6, 7); } @@ -574,7 +562,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); validateTableSize(document, 1); - validateTable(document, 0, 10, 6, 0, 0); + validateTable(document, 0, 10, 6, 0); } @@ -588,8 +576,8 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); validateTableSize(document, 2); - validateTable(document, 0, 2, 2, 0, 0); - validateTable(document, 1, 1, 1, 0, 0); + validateTable(document, 0, 2, 2, 0); + validateTable(document, 1, 1, 1, 0); } @@ -604,8 +592,8 @@ public class PdfSegmentationServiceTest extends AbstractTest { validateTableSize(document, 2); - validateTable(document, 0, 7, 8, 1, 0); - validateTable(document, 1, 7, 8, 1, 0); + validateTable(document, 0, 7, 8, 1); + validateTable(document, 1, 7, 8, 1); } @@ 
-620,8 +608,8 @@ public class PdfSegmentationServiceTest extends AbstractTest { validateTableSize(document, 2); - validateTable(document, 0, 4, 17, 0, 0); - validateTable(document, 1, 7, 12, 0, 0); + validateTable(document, 0, 4, 17, 0); + validateTable(document, 1, 7, 12, 0); } @@ -636,8 +624,8 @@ public class PdfSegmentationServiceTest extends AbstractTest { validateTableSize(document, 2); - validateTable(document, 0, 5, 14, 4, 0); - validateTable(document, 1, 7, 12, 0, 0); + validateTable(document, 0, 5, 14, 4); + validateTable(document, 1, 7, 12, 0); } @@ -651,8 +639,8 @@ public class PdfSegmentationServiceTest extends AbstractTest { validateTableSize(document, 2); - validateTable(document, 0, 5, 17, 3, 0); - validateTable(document, 1, 5, 16, 2, 0); + validateTable(document, 0, 5, 17, 3); + validateTable(document, 1, 5, 16, 2); } @@ -666,10 +654,10 @@ public class PdfSegmentationServiceTest extends AbstractTest { validateTableSize(document, 4); - validateTable(document, 0, 4, 4, 0, 0); - validateTable(document, 1, 1, 1, 0, 0); - validateTable(document, 2, 2, 3, 0, 0); - validateTable(document, 3, 1, 1, 0, 0); + validateTable(document, 0, 4, 4, 0); + validateTable(document, 1, 1, 1, 0); + validateTable(document, 2, 2, 3, 0); + validateTable(document, 3, 1, 1, 0); } @@ -684,7 +672,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { validateTableSize(document, 1); - validateTable(document, 0, 11, 8, 0, 0); + validateTable(document, 0, 11, 8, 0); } @@ -699,8 +687,8 @@ public class PdfSegmentationServiceTest extends AbstractTest { validateTableSize(document, 2); - validateTable(document, 0, 6, 8, 0, 0); - validateTable(document, 1, 6, 8, 0, 0); + validateTable(document, 0, 6, 8, 0); + validateTable(document, 1, 6, 8, 0); } @@ -714,7 +702,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { validateTableSize(document, 1); - validateTable(document, 0, 9, 5, 2, 0); + validateTable(document, 0, 9, 5, 2); } @@ -728,7 +716,7 @@ public class 
PdfSegmentationServiceTest extends AbstractTest { validateTableSize(document, 1); - validateTable(document, 0, 3, 5, 0, 0); + validateTable(document, 0, 3, 5, 0); } @@ -742,7 +730,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { validateTableSize(document, 1); - validateTable(document, 0, 6, 8, 0, 0); + validateTable(document, 0, 6, 8, 0); } @@ -755,10 +743,10 @@ public class PdfSegmentationServiceTest extends AbstractTest { validateTableSize(document, 4); - validateTable(document, 0, 3, 3, 0, 0); - validateTable(document, 1, 3, 6, 2, 0); - validateTable(document, 2, 3, 3, 1, 0); - validateTable(document, 3, 3, 3, 0, 0); + validateTable(document, 0, 3, 6, 0); + validateTable(document, 1, 3, 3, 0); + validateTable(document, 2, 3, 3, 0); + validateTable(document, 3, 3, 3, 0); } @@ -772,12 +760,12 @@ public class PdfSegmentationServiceTest extends AbstractTest { validateTableSize(document, 6); - validateTable(document, 0, 5, 5, 0, 0); - validateTable(document, 1, 5, 6, 0, 0); - validateTable(document, 2, 5, 5, 0, 0); - validateTable(document, 3, 5, 5, 0, 0); - validateTable(document, 4, 5, 5, 0, 0); - validateTable(document, 5, 5, 5, 0, 0); + validateTable(document, 0, 5, 6, 0); + validateTable(document, 1, 5, 5, 0); + validateTable(document, 2, 5, 5, 0); + validateTable(document, 3, 5, 5, 0); + validateTable(document, 4, 5, 5, 0); + validateTable(document, 5, 5, 5, 0); } @@ -791,7 +779,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { validateTableSize(document, 1); - validateTable(document, 0, 6, 5, 0, 0); + validateTable(document, 0, 6, 5, 0); } @@ -805,7 +793,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { validateTableSize(document, 1); - validateTable(document, 0, 5, 8, 1, 0); + validateTable(document, 0, 5, 8, 0); } @@ -816,13 +804,14 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new 
ClassPathResource("files/syngenta/CustomerFiles/SinglePages/T5_Page16_VV-640252.pdf"); ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); - - validateTableSize(document, 5); - validateTable(document, 0, 1, 1, 0, 0); - validateTable(document, 1, 1, 1, 0, 0); - validateTable(document, 2, 1, 1, 0, 0); - validateTable(document, 3, 1, 1, 0, 0); - validateTable(document, 4, 1, 1, 0, 0); + validateTableSize(document, 6); + // does not make sense to assert anything here other than that it runs. This is not a Table and completely breaks the current table detection logic +// viewerDocumentService.addLayerGroups(pdfFileResource.getFile(), new File("/tmp/cellDebug.pdf"), List.of(document.getLayoutDebugLayer())); +// validateTable(document, 0, 1, 1, 0); +// validateTable(document, 1, 1, 1, 0); +// validateTable(document, 2, 1, 1, 3); +// validateTable(document, 3, 1, 1, 0); +// validateTable(document, 4, 1, 1, 0); } @@ -836,7 +825,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { validateTableSize(document, 1); - validateTable(document, 0, 6, 6, 5, 0); + validateTable(document, 0, 6, 6, 5); } @@ -869,7 +858,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { } - private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) { + private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect) { TablePageBlock table = document.getSectionTree().getAllTableOfContentItems() .stream() @@ -877,8 +866,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { .stream() .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) - .toList() - .get(tableIndex); + .toList().get(tableIndex); List> rows = table.getRows(); int emptyCellsFoundFound = rows.stream() @@ -891,7 
+879,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { for (List row : table.getRows()) { row.forEach(r -> System.out.println(r.toString())); } - assertThat(emptyCellsFoundFound).isEqualTo(emptyCellsCountCorrect + emptyCellsCountIncorrect); + assertThat(emptyCellsFoundFound).isEqualTo(emptyCellsCountCorrect); assertThat(table.getColCount()).isEqualTo(colCount); assertThat(table.getRowCount()).isEqualTo(rowCount); @@ -907,8 +895,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { .stream() .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)) .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock) - .toList() - .get(tableIndex); + .toList().get(tableIndex); List> rows = table.getRows(); List rowsFlattened = rows.stream() diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/GapAcrossLinesDetectionServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/GapAcrossLinesDetectionServiceTest.java index 63beced..7ef2477 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/GapAcrossLinesDetectionServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/GapAcrossLinesDetectionServiceTest.java @@ -6,14 +6,10 @@ import java.util.List; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; +import org.springframework.core.io.ClassPathResource; -import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation; import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; -import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation; -import 
com.knecon.fforesight.service.layoutparser.processor.services.DividingColumnDetectionService; -import com.knecon.fforesight.service.layoutparser.processor.services.GapDetectionService; -import com.knecon.fforesight.service.layoutparser.processor.services.GapsAcrossLinesService; -import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService; +import com.knecon.fforesight.service.layoutparser.processor.experimental.DividingColumnDetectionService; import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor; import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; @@ -21,32 +17,6 @@ import lombok.SneakyThrows; class GapAcrossLinesDetectionServiceTest { - @Test - @Disabled - @SneakyThrows - public void testGapBasedColumnDetection() { - - String filename = "files/basf/CustomerFiles/invisible_tables_test-two-pages_ocred.pdf"; - var tmpFileName = "/tmp/" + filename.split("/")[2] + "_COLUMNS.pdf"; - System.out.println("start TextPosition extraction"); - long start = System.currentTimeMillis(); - List pageInformations = PageContentExtractor.getSortedPageContents(filename).stream().map(PageInformationService::build).toList(); - List> columnsPerPage = new LinkedList<>(); - System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start); - System.out.println("start column detection"); - start = System.currentTimeMillis(); - for (PageInformation pageInformation : pageInformations) { - GapInformation gapInformation = GapDetectionService.findGapsInLines(pageInformation.getPageContents().getSortedWords(), pageInformation.getMainBodyTextFrame()); - columnsPerPage.add(GapsAcrossLinesService.detectXGapsAcrossLines(gapInformation, pageInformation.getMainBodyTextFrame())); - } - System.out.printf("Finished column detection in %d ms%n", System.currentTimeMillis() - start); - System.out.println("start draw rectangles"); - start = System.currentTimeMillis(); - 
PdfDraw.drawRectanglesPerPage(filename, columnsPerPage, tmpFileName); - System.out.printf("Finished drawing rectangles in %d ms%n", System.currentTimeMillis() - start); - } - - @Test @Disabled @SneakyThrows @@ -56,7 +26,7 @@ class GapAcrossLinesDetectionServiceTest { var tmpFileName = "/tmp/" + filename.split("/")[2] + "_COLUMNS.pdf"; System.out.println("start TextPosition extraction"); long start = System.currentTimeMillis(); - List sortedTextPositionSequencesPerPage = PageContentExtractor.getSortedPageContents(filename); + List sortedTextPositionSequencesPerPage = PageContentExtractor.getDocumentContents(new ClassPathResource(filename).getFile(), 4); List> columnsPerPage = new LinkedList<>(); System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start); System.out.println("start column detection"); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/InvisibleTableDetectionServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/InvisibleTableDetectionServiceTest.java deleted file mode 100644 index f08858f..0000000 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/InvisibleTableDetectionServiceTest.java +++ /dev/null @@ -1,66 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.server.services; - -import java.awt.geom.Rectangle2D; -import java.nio.file.Path; -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import java.util.stream.Collectors; - -import org.junit.jupiter.api.Test; - -import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation; -import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; -import com.knecon.fforesight.service.layoutparser.processor.services.InvisibleTableDetectionService; 
-import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService; -import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor; -import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; -import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; - -import lombok.SneakyThrows; - -class InvisibleTableDetectionServiceTest { - - @Test -// @Disabled - @SneakyThrows - public void detectInvisibleTableTest() { - - String fileName = "files/basf/CustomerFiles/invisible_tables_test-two-pages_ocred.pdf"; - var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TABLE.pdf").toString(); - List pageContents = PageContentExtractor.getSortedPageContents(fileName) - .stream() - .map(PageInformationService::build) - .collect(Collectors.toList()); - - int pageNumber = 1; - Rectangle2D tableBBox = pageContents.get(0).getPageContents().getSortedWords().subList(45, 152) - .stream() - .map(Word::getBBox) - .map(this::mirrorY) - .collect(RectangleTransformations.collectBBox()); - - List words = pageContents.get(0).getPageContents().getSortedWords() - .stream() - .filter(textPositionSequence -> tableBBox.contains(mirrorY(textPositionSequence.getBBox()))) - .toList(); - - var table = InvisibleTableDetectionService.detectTable(words, tableBBox); - - PdfDraw.drawRectanglesPerPage(fileName, - List.of(table.stream() - .flatMap(Collection::stream) - .toList(), Collections.emptyList()), - tmpFileName); - } - - - private Rectangle2D mirrorY(Rectangle2D rectangle2D) { - - if (rectangle2D.getHeight() >= 0) { - return rectangle2D; - } - return new Rectangle2D.Double(rectangle2D.getX(), rectangle2D.getY() + rectangle2D.getHeight(), rectangle2D.getWidth(), -rectangle2D.getHeight()); - } - -} \ No newline at end of file diff --git 
a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/MainBodyTextFrameExtractionServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/MainBodyTextFrameExtractionServiceTest.java index 1173f30..d799d82 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/MainBodyTextFrameExtractionServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/MainBodyTextFrameExtractionServiceTest.java @@ -5,6 +5,7 @@ import java.util.List; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; +import org.springframework.core.io.ClassPathResource; import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor; @@ -20,7 +21,7 @@ class MainBodyTextFrameExtractionServiceTest { String fileName = "files/basf/CustomerFiles/invisible_tables_test-two-pages_ocred.pdf"; String tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_MAIN_BODY.pdf").toString(); - List sortedTextPositionSequence = PageContentExtractor.getSortedPageContents(fileName); + List sortedTextPositionSequence = PageContentExtractor.getDocumentContents(new ClassPathResource(fileName).getFile(), 4); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageContentExtractorTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageContentExtractorTest.java index 14c4c64..891533b 100644 --- 
a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageContentExtractorTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageContentExtractorTest.java @@ -4,6 +4,7 @@ import java.nio.file.Path; import java.util.List; import org.junit.jupiter.api.Test; +import org.springframework.core.io.ClassPathResource; import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; @@ -21,11 +22,11 @@ class PageContentExtractorTest { String fileName = "files/syngenta/CustomerFiles/Documine/Flora/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica.pdf"; var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TEXT_POSITION_SEQUENCES.pdf").toString(); - List textPositionPerPage = PageContentExtractor.getSortedPageContents(fileName); + List textPositionPerPage = PageContentExtractor.getDocumentContents(new ClassPathResource(fileName).getFile(), 4); PdfDraw.drawRectanglesPerPageNumberedByLine(fileName, textPositionPerPage.stream() - .map(t -> t.getSortedWords() + .map(t -> t.getWords() .stream() .map(Word::getBBoxPdf) .map(List::of) diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageInformationServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageInformationServiceTest.java deleted file mode 100644 index 0da7c58..0000000 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageInformationServiceTest.java +++ /dev/null @@ -1,63 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.server.services; - -import java.util.Collection; -import java.util.List; - 
-import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; - -import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation; -import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService; -import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor; -import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; - -import lombok.SneakyThrows; - -class PageInformationServiceTest { - - @Test - @Disabled - @SneakyThrows - public void testGapDetection() { - - String filename = "files/basf/CustomerFiles/invisible_tables_test-two-pages_ocred.pdf"; - var tmpFileName = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf"; - System.out.println("start TextPosition extraction"); - long start = System.currentTimeMillis(); - List pageInformations = PageContentExtractor.getSortedPageContents(filename).stream().map(PageInformationService::build).toList(); - System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start); - System.out.println("start gap detection"); - start = System.currentTimeMillis(); - System.out.printf("Finished gap detection in %d ms%n", System.currentTimeMillis() - start); - System.out.println("start draw rectangles"); - start = System.currentTimeMillis(); - PdfDraw.drawRectanglesAndLinesPerPage(filename, - pageInformations.stream().map(PageInformation::getGapInformation).map(gaps -> gaps.getYGaps().stream().flatMap(Collection::stream).toList()).toList(), - pageInformations.stream().map(PageInformation::getGapInformation).map(gaps -> gaps.getXGaps().stream().flatMap(Collection::stream).toList()).toList(), - tmpFileName); - System.out.printf("Finished drawing rectangles in %d ms%n", System.currentTimeMillis() - start); - } - - @Test - @Disabled - @SneakyThrows - public void testLineDetection() { - - String filename = "files/basf/CustomerFiles/invisible_tables_test-two-pages_ocred.pdf"; - var tmpFileName = 
"/tmp/" + filename.split("/")[2] + "_GAPS.pdf"; - System.out.println("start TextPosition extraction"); - long start = System.currentTimeMillis(); - List pageInformations = PageContentExtractor.getSortedPageContents(filename).stream().map(PageInformationService::build).toList(); - System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start); - System.out.println("start gap detection"); - start = System.currentTimeMillis(); - System.out.printf("Finished gap detection in %d ms%n", System.currentTimeMillis() - start); - System.out.println("start draw rectangles"); - start = System.currentTimeMillis(); - PdfDraw.drawRectanglesPerPageNumberedByLine(filename, - pageInformations.stream().map(PageInformation::getLineInformation).map(gaps -> gaps.getBBoxWithGapsByLines().stream().toList()).toList(), - tmpFileName); - System.out.printf("Finished drawing rectangles in %d ms%n", System.currentTimeMillis() - start); - } - -} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java index 37658db..9c968a5 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java @@ -34,9 +34,10 @@ import com.knecon.fforesight.service.layoutparser.processor.services.PageContent import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import 
com.knecon.fforesight.service.layoutparser.processor.utils.DrawingOptions; -import com.knecon.fforesight.service.layoutparser.processor.utils.RectangularIntersectionFinder; +import com.knecon.fforesight.service.layoutparser.processor.services.tables.RectangularIntersectionFinder; import com.knecon.fforesight.service.layoutparser.server.BuildDocumentTest; import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; +import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult; import lombok.SneakyThrows; @@ -49,7 +50,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest { String fileName = "files/syngenta/CustomerFiles/SinglePages/T5_Page16_VV-640252.pdf"; String lineFileName = "/tmp/" + Path.of(fileName).getFileName().toString() + "_CELLS.pdf"; - List pageContents = PageContentExtractor.getSortedPageContents(fileName); + List pageContents = PageContentExtractor.getDocumentContents(new ClassPathResource(fileName).getFile(), 4); RulingCleaningService rulingCleaningService = new RulingCleaningService(); List> rectanglesPerPage = new LinkedList<>(); for (PageContents pageContent : pageContents) { @@ -69,7 +70,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest { String fileName = "files/syngenta/CustomerFiles/SinglePages/Page35_19 Chlorothalonil RAR 08 Volume 3CA B 6b metabolites Oct 2017.pdf"; String lineFileName = "/tmp/" + Path.of(fileName).getFileName().toString() + "_LINES.pdf"; - List pageContents = PageContentExtractor.getSortedPageContents(fileName); + List pageContents = PageContentExtractor.getDocumentContents(new ClassPathResource(fileName).getFile(), 4); RulingCleaningService rulingCleaningService = new RulingCleaningService(); List cleanRulingsPerPage = new LinkedList<>(); for (PageContents pageContent : pageContents) { @@ -110,6 +111,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest { filename.toFile(), new ImageServiceResponse(), new TableServiceResponse(), + 
IdpResult.empty(), new VisualLayoutParsingResponse(), Map.of("file", filename.toFile().toString()))).document(); Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, @@ -117,6 +119,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest { filename.toFile(), new ImageServiceResponse(), new TableServiceResponse(), + IdpResult.empty(), new VisualLayoutParsingResponse(), Map.of("file", filename.toFile().toString()))).document(); DocumentData documentDataBefore = DocumentDataMapper.toDocumentData(documentGraphBefore); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingsClassifierTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingsClassifierTest.java index 1576918..680404d 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingsClassifierTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingsClassifierTest.java @@ -7,6 +7,7 @@ import java.util.Collections; import java.util.List; import org.junit.jupiter.api.Test; +import org.springframework.core.io.ClassPathResource; import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; @@ -15,7 +16,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.Word; import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor; import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService; import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier; -import 
com.knecon.fforesight.service.layoutparser.processor.utils.RectangularIntersectionFinder; +import com.knecon.fforesight.service.layoutparser.processor.services.tables.RectangularIntersectionFinder; import lombok.SneakyThrows; @@ -26,19 +27,19 @@ public class RulingsClassifierTest { public void textRulingExtractionTest() { String fileName = "files/Minimal Examples/RotateTextWithRulingsTestFile.pdf"; - List pageContents = PageContentExtractor.getSortedPageContents(fileName); + List pageContents = PageContentExtractor.getDocumentContents(new ClassPathResource(fileName).getFile(), 4); RulingCleaningService rulingCleaningService = new RulingCleaningService(); for (PageContents pageContent : pageContents) { CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings()); RectangularIntersectionFinder.find(cleanRulings.getHorizontals(), cleanRulings.getVerticals()); - TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(pageContent.getSortedWords(), cleanRulings); + TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(pageContent.getWords(), cleanRulings); - assertTrue(pageContent.getSortedWords() + assertTrue(pageContent.getWords() .stream() .filter(word -> word.toString().equals("Underlined")) .allMatch(Word::isUnderline)); - assertTrue(pageContent.getSortedWords() + assertTrue(pageContent.getWords() .stream() .filter(word -> word.toString().equals("Striketrough")) .allMatch(Word::isStrikethrough)); @@ -64,13 +65,13 @@ public class RulingsClassifierTest { public void tableRulingExtractionTest() { String fileName = "files/SinglePages/AbsolutelyEnormousTable.pdf"; - List pageContents = PageContentExtractor.getSortedPageContents(fileName); + List pageContents = PageContentExtractor.getDocumentContents(new ClassPathResource(fileName).getFile(), 4); RulingCleaningService rulingCleaningService = new RulingCleaningService(); for (PageContents pageContent : pageContents) { CleanRulings 
cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings()); RectangularIntersectionFinder.find(cleanRulings.getHorizontals(), cleanRulings.getVerticals()); - TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(pageContent.getSortedWords(), cleanRulings); + TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(pageContent.getWords(), cleanRulings); assertEquals(30, cleanRulings.getHorizontals().size()); assertEquals(30, cleanRulings.getTableLines().getHorizontals().size()); diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/LayerIdentifier.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/LayerIdentifier.java index 13b0418..3cf295c 100644 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/LayerIdentifier.java +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/LayerIdentifier.java @@ -5,8 +5,7 @@ import org.apache.pdfbox.cos.COSName; /* These identifiers are used to mark content in the pdf, such that it may be found later. The markedContentName must therefore be unique. The String "name" is only used to display optional content in the optional content view in the pdf. -Therefore, it may be null, if optionalContent is false. -If optionalContent is false, the layer will not be created as a OCG, and will not be listed in the OCG view. +Therefore, it may be null, then the layer will not be created as a OCG, and will not be listed in the OCG view. 
*/ public record LayerIdentifier(String name, String markedContentName) { diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/IdpLayerConfig.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/IdpLayerConfig.java index 6441d99..9639163 100644 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/IdpLayerConfig.java +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/IdpLayerConfig.java @@ -38,6 +38,7 @@ public class IdpLayerConfig extends AbstractLayerGroup { protected static final Color KEY_COLOR = new Color(30, 92, 172); protected static final Color VALUE_COLOR = new Color(30, 172, 146); protected static final Color LINES_COLOR = new Color(152, 45, 179); + protected static final Color HEADER_CELL_COLOR = new Color(156, 21, 48); @Override diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayoutDebugLayerConfig.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayoutDebugLayerConfig.java index e871e8b..2860aae 100644 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayoutDebugLayerConfig.java +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayoutDebugLayerConfig.java @@ -16,7 +16,7 @@ public class LayoutDebugLayerConfig extends AbstractLayerGroup { public static final LayerGroup CONFIG_INSTANCE = new LayoutDebugLayerConfig(); - protected static final Standard14EmbeddableFont FONT = Standard14EmbeddableFont.helvetica(); + protected static final Standard14EmbeddableFont FONT = Standard14EmbeddableFont.HELVETICA_INSTANCE; public static final float LINE_WIDTH = 0.5f; @@ -66,6 +66,7 @@ public class 
LayoutDebugLayerConfig extends AbstractLayerGroup { protected final Visualizations listIdentifiers = Visualizations.builder().layer(LayerIdentifier.LIST_IDENTIFIERS).build(); protected final Visualizations outlineHeadlines = Visualizations.builder().layer(LayerIdentifier.OUTLINE_HEADLINES).build(); + public List getVisualizations() { return List.of(characters, // diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayoutGridLayerConfig.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayoutGridLayerConfig.java index eb56c67..2c4e3ba 100644 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayoutGridLayerConfig.java +++ b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/layers/LayoutGridLayerConfig.java @@ -16,9 +16,10 @@ public class LayoutGridLayerConfig extends AbstractLayerGroup { public static final LayerGroup CONFIG_INSTANCE = new LayoutGridLayerConfig(); + protected static final Standard14EmbeddableFont FONT = Standard14EmbeddableFont.HELVETICA_INSTANCE; + protected static final float FONT_SIZE = 10f; protected static final float LINE_WIDTH = 1f; - protected static final Standard14EmbeddableFont FONT = Standard14EmbeddableFont.helvetica(); protected static final Color INNER_LINES_COLOR = new Color(255, 175, 175); protected static final Color HEADER_CELL_COLOR = new Color(156, 21, 48); diff --git a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/Standard14EmbeddableFont.java b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/Standard14EmbeddableFont.java index a4b6926..6630324 100644 --- a/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/Standard14EmbeddableFont.java +++ 
b/layoutparser-service/viewer-doc-processor/src/main/java/com/knecon/fforesight/service/viewerdoc/model/Standard14EmbeddableFont.java @@ -1,7 +1,5 @@ package com.knecon.fforesight.service.viewerdoc.model; -import java.util.Objects; - import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.font.PDFont; import org.apache.pdfbox.pdmodel.font.PDType1Font; @@ -22,6 +20,9 @@ public class Standard14EmbeddableFont implements EmbeddableFont { private final int pdfTronIdentifier; + public static final Standard14EmbeddableFont HELVETICA_INSTANCE = helvetica(); + + public static Standard14EmbeddableFont helvetica() { return new Standard14EmbeddableFont(new PDType1Font(Standard14Fonts.FontName.HELVETICA), Font.e_helvetica);