From df9cbdc036e73baf845eb4d37a54e7c706bb2b78 Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Thu, 15 Jun 2023 12:51:35 +0200 Subject: [PATCH] RED-6725: Integrate new layout parser * ported current state from RedactManager --- .../api/data/AtomicTextBlockData.java | 4 + .../internal/api/data/DocumentData.java | 3 +- ...ontentsData.java => DocumentTreeData.java} | 25 +- .../api/{graph/nodes => data}/NodeType.java | 12 +- .../internal/api/graph/DocumentGraph.java | 101 ---- .../internal/api/graph/TableOfContents.java | 193 -------- .../internal/api/graph/entity/EntityNode.java | 76 --- .../api/graph/entity/EntityPosition.java | 45 -- .../internal/api/graph/nodes/FooterNode.java | 53 --- .../internal/api/graph/nodes/HeaderNode.java | 53 --- .../api/graph/nodes/HeadlineNode.java | 60 --- .../internal/api/graph/nodes/ImageNode.java | 87 ---- .../internal/api/graph/nodes/ImageType.java | 9 - .../internal/api/graph/nodes/PageNode.java | 71 --- .../api/graph/nodes/ParagraphNode.java | 51 -- .../internal/api/graph/nodes/SectionNode.java | 63 --- .../api/graph/nodes/SemanticNode.java | 275 ----------- .../api/graph/nodes/TableCellNode.java | 92 ---- .../internal/api/graph/nodes/TableNode.java | 73 --- .../api/graph/textblock/AtomicTextBlock.java | 131 ----- .../api/mapper/DocumentGraphMapper.java | 229 --------- .../internal/api/mapper/PropertiesMapper.java | 101 ---- .../api/services/EntityEnrichmentService.java | 10 - .../api/services/EntityInsertionService.java | 56 --- .../layoutparser-service-processor/pom.xml | 4 + .../processor/LayoutParsingService.java | 17 +- .../LayoutParsingStorageService.java | 12 +- .../adapter/CvTableParsingAdapter.java | 12 +- .../adapter/ImageServiceResponseAdapter.java | 6 +- .../adapter/model/image/Classification.java | 3 - .../adapter/model/image/FilterGeometry.java | 5 +- .../adapter/model/image/Filters.java | 3 - .../adapter/model/image/Geometry.java | 3 - .../image/{Format.java => ImageFormat.java} | 5 +- .../{Metadata.java => 
ImageMetadata.java} | 5 +- .../model/image/ImageServiceResponse.java | 11 +- .../adapter/model/image/ImageSize.java | 3 - .../adapter/model/image/Position.java | 3 - .../adapter/model/image/Probability.java | 3 - .../model/table/CvParsedTableModel.java | 17 - .../{CvParsedPageInfo.java => PageInfo.java} | 5 +- .../model/table/PdfTableCell.java} | 4 +- ...CvParsedTableCell.java => TableCells.java} | 7 +- .../adapter/model/table/TableData.java | 14 + .../model/table/TableServiceResponse.java | 5 +- .../dto/AbstractTextContainer.java | 71 --- .../classification/dto/table/TableCell.java | 38 -- .../model/AbstractPageBlock.java | 80 ++++ .../ClassificationDocument.java | 10 +- .../{dto => model}/ClassificationFooter.java | 6 +- .../{dto => model}/ClassificationHeader.java | 6 +- .../{dto => model}/ClassificationPage.java | 8 +- .../{dto => model}/ClassificationSection.java | 24 +- .../{dto => model}/FloatFrequencyCounter.java | 4 +- .../Orientation.java} | 4 +- .../classification/model/PageBlockType.java | 38 ++ .../{dto => model}/image/ClassifiedImage.java | 4 +- .../classification/model/table/Cell.java | 79 ++++ .../table/CellPosition.java} | 6 +- .../{dto => model}/table/CleanRulings.java | 2 +- .../{dto => model/table}/Rectangle.java | 2 +- .../{dto => model}/table/Ruling.java | 2 +- .../table/TablePageBlock.java} | 114 ++--- .../{dto => model}/text/RedTextPosition.java | 8 +- .../model/text/SearchableText.java | 48 ++ .../model/text/SimplifiedSectionText.java | 17 + .../model/text/SimplifiedText.java | 20 + .../text/StringFrequencyCounter.java | 2 +- .../{dto => model}/text/TextDirection.java | 9 +- .../text/TextPageBlock.java} | 60 ++- .../text/TextPositionSequence.java | 33 +- .../{dto => model}/text/UnclassifiedText.java | 4 +- .../parsing/PDFAreaTextStripper.java | 82 ---- .../parsing/PDFLinesTextStripper.java | 13 +- .../service/BlockificationService.java | 87 ++-- .../service/BodyTextFrameService.java | 78 +-- .../service/ClassificationService.java | 53 +-- 
.../service/PdfParsingService.java | 32 +- .../service/RulingCleaningService.java | 20 +- .../service/SectionsBuilderService.java | 104 ++-- .../service/TableExtractionService.java | 74 ++- .../utils/DoubleComparisons.java | 13 + .../classification/utils/FileUtils.java | 56 +++ .../classification/utils/PositionUtils.java | 16 +- .../utils/RulingTextDirAdjustUtil.java | 4 +- .../utils/TextNormalizationUtilities.java | 12 + .../factory/DocumentGraphFactory.java | 394 +++++----------- .../factory/RectangleTransformations.java | 105 ----- ...ava => SearchTextWithTextPositionDto.java} | 15 +- .../SearchTextWithTextPositionFactory.java | 125 +++-- .../processor/factory/SectionNodeFactory.java | 183 +++++++ .../processor/factory/TableNodeFactory.java | 136 ++++++ .../processor/factory/TextBlockFactory.java | 80 ++-- .../processor}/graph/Boundary.java | 47 +- .../processor/graph/DocumentTree.java | 217 +++++++++ .../processor/graph/entity/EntityType.java | 8 + .../graph/entity/RedactionEntity.java | 228 +++++++++ .../graph/entity/RedactionPosition.java | 24 + .../processor/graph/nodes/Document.java | 120 +++++ .../processor/graph/nodes/Footer.java | 65 +++ .../graph/nodes/GenericSemanticNode.java | 5 + .../processor/graph/nodes/Header.java | 65 +++ .../processor/graph/nodes/Headline.java | 72 +++ .../processor/graph/nodes/Image.java | 95 ++++ .../processor/graph/nodes/ImageType.java | 21 + .../processor/graph/nodes/Page.java | 87 ++++ .../processor/graph/nodes/Paragraph.java | 63 +++ .../processor/graph/nodes/Section.java | 77 +++ .../processor/graph/nodes/SemanticNode.java | 446 ++++++++++++++++++ .../processor/graph/nodes/Table.java | 316 +++++++++++++ .../processor/graph/nodes/TableCell.java | 91 ++++ .../graph/textblock/AtomicTextBlock.java | 215 +++++++++ .../textblock/ConcatenatedTextBlock.java | 48 +- .../processor}/graph/textblock/TextBlock.java | 25 +- .../graph/textblock/TextBlockCollector.java | 5 +- .../processor}/mapper/DocumentDataMapper.java | 81 ++-- 
.../processor/mapper/DocumentGraphMapper.java | 198 ++++++++ .../processor/mapper/PropertiesMapper.java | 112 +++++ .../services/RectangleTransformations.java | 4 +- .../processor/utils/IdBuilder.java | 41 ++ .../utils/PdfVisualisationUtility.java | 169 +++++++ .../utils/RectangleTransformations.java | 146 ++++++ .../processor/utils/TableMergingUtility.java | 42 ++ .../TextPositionOperations.java | 8 +- .../server/graph/BoundaryTest.java | 2 +- .../server/graph/BuildDocumentGraphTest.java | 6 +- .../DocumentGraphEntityInsertionTest.java | 280 ----------- .../graph/DocumentGraphJsonWritingTest.java | 14 +- .../graph/DocumentGraphMappingTest.java | 12 +- .../graph/DocumentGraphVisualizationTest.java | 11 +- .../layoutparser/server/utils/BaseTest.java | 10 - .../layoutparser/server/utils/TestEntity.java | 124 ----- .../utils/TestEntityEnrichmentService.java | 89 ---- .../server/utils/visualizations/PdfDraw.java | 30 +- 134 files changed, 4481 insertions(+), 3639 deletions(-) rename layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/{TableOfContentsData.java => DocumentTreeData.java} (71%) rename layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/{graph/nodes => data}/NodeType.java (52%) delete mode 100644 layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/DocumentGraph.java delete mode 100644 layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/TableOfContents.java delete mode 100644 layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/entity/EntityNode.java delete mode 100644 
layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/entity/EntityPosition.java delete mode 100644 layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/FooterNode.java delete mode 100644 layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/HeaderNode.java delete mode 100644 layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/HeadlineNode.java delete mode 100644 layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/ImageNode.java delete mode 100644 layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/ImageType.java delete mode 100644 layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/PageNode.java delete mode 100644 layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/ParagraphNode.java delete mode 100644 layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/SectionNode.java delete mode 100644 layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/SemanticNode.java delete mode 100644 layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/TableCellNode.java delete mode 100644 
layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/TableNode.java delete mode 100644 layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/textblock/AtomicTextBlock.java delete mode 100644 layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/mapper/DocumentGraphMapper.java delete mode 100644 layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/mapper/PropertiesMapper.java delete mode 100644 layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/services/EntityEnrichmentService.java delete mode 100644 layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/services/EntityInsertionService.java rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/{Format.java => ImageFormat.java} (71%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/{Metadata.java => ImageMetadata.java} (77%) delete mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/table/CvParsedTableModel.java rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/table/{CvParsedPageInfo.java => PageInfo.java} (70%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification/dto/table/CvParsedTableCell.java => 
adapter/model/table/PdfTableCell.java} (72%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/table/{CvParsedTableCell.java => TableCells.java} (73%) create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/table/TableData.java delete mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/AbstractTextContainer.java delete mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/TableCell.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/AbstractPageBlock.java rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/{dto => model}/ClassificationDocument.java (78%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/{dto => model}/ClassificationFooter.java (70%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/{dto => model}/ClassificationHeader.java (70%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/{dto => model}/ClassificationPage.java (87%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/{dto => model}/ClassificationSection.java (52%) rename 
layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/{dto => model}/FloatFrequencyCounter.java (98%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/{dto/text/TextBlockOrientation.java => model/Orientation.java} (63%) create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/PageBlockType.java rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/{dto => model}/image/ClassifiedImage.java (81%) create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/Cell.java rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/{dto/table/TableCellPosition.java => model/table/CellPosition.java} (67%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/{dto => model}/table/CleanRulings.java (90%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/{dto => model/table}/Rectangle.java (99%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/{dto => model}/table/Ruling.java (99%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/{dto/table/Table.java => model/table/TablePageBlock.java} (70%) rename 
layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/{dto => model}/text/RedTextPosition.java (89%) create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/SearchableText.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/SimplifiedSectionText.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/SimplifiedText.java rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/{dto => model}/text/StringFrequencyCounter.java (98%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/{dto => model}/text/TextDirection.java (89%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/{dto/text/ClassificationTextBlock.java => model/text/TextPageBlock.java} (84%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/{dto => model}/text/TextPositionSequence.java (93%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/{dto => model}/text/UnclassifiedText.java (73%) delete mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFAreaTextStripper.java create mode 100644 
layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/FileUtils.java delete mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/RectangleTransformations.java rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/{SearchTextWithTextPositionModel.java => SearchTextWithTextPositionDto.java} (51%) create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SectionNodeFactory.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/TableNodeFactory.java rename layoutparser-service/{layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api => layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor}/graph/Boundary.java (61%) create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/DocumentTree.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/entity/EntityType.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/entity/RedactionEntity.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/entity/RedactionPosition.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Document.java create mode 
100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Footer.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/GenericSemanticNode.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Header.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Headline.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Image.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/ImageType.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Page.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Paragraph.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Section.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/SemanticNode.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Table.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/TableCell.java create mode 100644 
layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/AtomicTextBlock.java rename layoutparser-service/{layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api => layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor}/graph/textblock/ConcatenatedTextBlock.java (70%) rename layoutparser-service/{layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api => layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor}/graph/textblock/TextBlock.java (73%) rename layoutparser-service/{layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api => layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor}/graph/textblock/TextBlockCollector.java (84%) rename layoutparser-service/{layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api => layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor}/mapper/DocumentDataMapper.java (57%) create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/DocumentGraphMapper.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/PropertiesMapper.java rename layoutparser-service/{layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api => layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor}/services/RectangleTransformations.java (93%) create mode 100644 
layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/IdBuilder.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PdfVisualisationUtility.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TableMergingUtility.java rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{factory => utils}/TextPositionOperations.java (72%) delete mode 100644 layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphEntityInsertionTest.java delete mode 100644 layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/TestEntity.java delete mode 100644 layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/TestEntityEnrichmentService.java diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/AtomicTextBlockData.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/AtomicTextBlockData.java index 80910e8..04349e4 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/AtomicTextBlockData.java +++ 
b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/AtomicTextBlockData.java @@ -1,5 +1,7 @@ package com.knecon.fforesight.service.layoutparser.internal.api.data; + + import lombok.AccessLevel; import lombok.AllArgsConstructor; import lombok.Builder; @@ -20,4 +22,6 @@ public class AtomicTextBlockData { int end; int[] lineBreaks; + + } diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/DocumentData.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/DocumentData.java index 166500c..9e85750 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/DocumentData.java +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/DocumentData.java @@ -15,6 +15,7 @@ public class DocumentData { PageData[] pages; AtomicTextBlockData[] atomicTextBlocks; AtomicPositionBlockData[] atomicPositionBlocks; - TableOfContentsData tableOfContents; + DocumentTreeData documentTreeData; + } diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/TableOfContentsData.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/DocumentTreeData.java similarity index 71% rename from layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/TableOfContentsData.java rename to layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/DocumentTreeData.java index c875969..3a14a37 100644 
--- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/TableOfContentsData.java +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/DocumentTreeData.java @@ -4,8 +4,6 @@ import java.util.List; import java.util.Map; import java.util.stream.Stream; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType; - import lombok.AccessLevel; import lombok.AllArgsConstructor; import lombok.Builder; @@ -19,7 +17,7 @@ import lombok.experimental.FieldDefaults; @AllArgsConstructor @NoArgsConstructor @FieldDefaults(level = AccessLevel.PRIVATE) -public class TableOfContentsData { +public class DocumentTreeData { EntryData root; @@ -29,9 +27,9 @@ public class TableOfContentsData { if (tocId.isEmpty()) { return root; } - EntryData entry = root.subEntries.get(tocId.get(0)); + EntryData entry = root.children.get(tocId.get(0)); for (int id : tocId.subList(1, tocId.size())) { - entry = entry.subEntries.get(id); + entry = entry.children.get(id); } return entry; } @@ -39,7 +37,7 @@ public class TableOfContentsData { public Stream streamAllEntries() { - return Stream.concat(Stream.of(root), root.subEntries.stream()).flatMap(TableOfContentsData::flatten); + return Stream.concat(Stream.of(root), root.children.stream()).flatMap(DocumentTreeData::flatten); } @@ -51,7 +49,7 @@ public class TableOfContentsData { private static Stream flatten(EntryData entry) { - return Stream.concat(Stream.of(entry), entry.subEntries.stream().flatMap(TableOfContentsData::flatten)); + return Stream.concat(Stream.of(entry), entry.children.stream().flatMap(DocumentTreeData::flatten)); } @@ -62,19 +60,18 @@ public class TableOfContentsData { public static class EntryData { NodeType type; - int[] tocId; - Long[] atomicBlocks; - Long[] pages; + int[] treeId; + Long[] atomicBlockIds; + Long[] pageNumbers; Map properties; - List 
subEntries; - + List children; @Override public String toString() { StringBuilder sb = new StringBuilder(); sb.append("["); - for (int i : tocId) { + for (int i : treeId) { sb.append(i); sb.append(","); } @@ -83,7 +80,7 @@ public class TableOfContentsData { sb.append(type); sb.append(" atbs = "); - sb.append(atomicBlocks.length); + sb.append(atomicBlockIds.length); return sb.toString(); } diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/NodeType.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/NodeType.java similarity index 52% rename from layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/NodeType.java rename to layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/NodeType.java index a85c4aa..91104f2 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/NodeType.java +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/NodeType.java @@ -1,4 +1,6 @@ -package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes; +package com.knecon.fforesight.service.layoutparser.internal.api.data; + +import java.util.Locale; public enum NodeType { DOCUMENT, @@ -9,5 +11,11 @@ public enum NodeType { TABLE_CELL, IMAGE, HEADER, - FOOTER + FOOTER; + + + public String toString() { + + return this.name().charAt(0) + this.name().substring(1).toLowerCase(Locale.ROOT); + } } diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/DocumentGraph.java 
b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/DocumentGraph.java deleted file mode 100644 index a5d5f12..0000000 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/DocumentGraph.java +++ /dev/null @@ -1,101 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.internal.api.graph; - -import java.awt.geom.Rectangle2D; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SectionNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector; - -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.NoArgsConstructor; -import lombok.experimental.FieldDefaults; - -@Data -@Builder -@AllArgsConstructor -@NoArgsConstructor -@FieldDefaults(level = AccessLevel.PRIVATE) -public class DocumentGraph implements SemanticNode { - - Set pages; - TableOfContents tableOfContents; - Integer numberOfPages; - TextBlock textBlock; - - - public TextBlock buildTextBlock() { - - return streamTerminalTextBlocksInOrder().collect(new TextBlockCollector()); - } - - - public List getMainSections() { - - return streamChildren().filter(node -> node instanceof 
SectionNode).map(node -> (SectionNode) node).collect(Collectors.toList()); - } - - - public Stream streamTerminalTextBlocksInOrder() { - - return streamAllNodes().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock); - } - - - public Set getEntities() { - - return streamAllSubNodes().map(SemanticNode::getEntities).flatMap(Set::stream).collect(Collectors.toUnmodifiableSet()); - } - - - @Override - public List getTocId() { - - return Collections.emptyList(); - } - - - @Override - public void setTocId(List tocId) { - - throw new UnsupportedOperationException("DocumentGraph is always the root of the Table of Contents"); - } - - - private Stream streamAllNodes() { - - return tableOfContents.streamAllEntriesInOrder().map(TableOfContents.Entry::getNode); - } - - - @Override - public String toString() { - - return NodeType.DOCUMENT + ": " + buildTextBlock().buildSummary(); - } - - - @Override - public Map getBBox() { - - Map bBox = new HashMap<>(); - for (PageNode page : pages) { - bBox.put(page, new Rectangle2D.Double(0, 0, page.getWidth(), page.getHeight())); - } - return bBox; - } - -} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/TableOfContents.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/TableOfContents.java deleted file mode 100644 index 800b683..0000000 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/TableOfContents.java +++ /dev/null @@ -1,193 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.internal.api.graph; - -import static java.lang.String.format; - -import java.nio.charset.StandardCharsets; -import java.util.Collections; -import java.util.LinkedList; -import java.util.List; -import java.util.stream.Stream; - -import com.google.common.hash.Hashing; 
-import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector; - -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.Getter; -import lombok.experimental.FieldDefaults; - -@Data -public class TableOfContents { - - private final Entry root; - - - public TableOfContents(DocumentGraph documentGraph) { - - root = Entry.builder().tocId(Collections.emptyList()).type(NodeType.DOCUMENT).children(new LinkedList<>()).node(documentGraph).build(); - } - - - public TextBlock buildTextBlock() { - - return streamAllEntriesInOrder().map(Entry::getNode).filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector()); - } - - - public List createNewMainEntryAndReturnId(NodeType nodeType, SemanticNode node) { - - return createNewChildEntryAndReturnId(Collections.emptyList(), nodeType, node); - } - - - public List createNewChildEntryAndReturnId(List parentId, NodeType nodeType, SemanticNode node) { - - if (!entryExists(parentId)) { - throw new UnsupportedOperationException(format("parentId %s does not exist!", parentId)); - } - - Entry parent = getEntryById(parentId); - List newId = new LinkedList<>(parentId); - newId.add(parent.children.size()); - parent.children.add(Entry.builder().tocId(newId).node(node).type(nodeType).children(new LinkedList<>()).build()); - - return newId; - } - - - private boolean entryExists(List tocId) { - - if (tocId.isEmpty()) { - return root != null; - } - Entry entry = root.children.get(tocId.get(0)); - for (int id : tocId.subList(1, tocId.size())) { - if (id >= entry.children.size() || 0 > id) { - return false; - } - entry = 
entry.children.get(id); - } - return true; - } - - - public Entry getParentEntryById(List tocId) { - - return getEntryById(getParentId(tocId)); - } - - - public boolean hasParentById(List tocId) { - - return entryExists(getParentId(tocId)); - } - - - public Stream streamChildrenNodes(List tocId) { - - return getEntryById(tocId).children.stream().map(Entry::getNode); - } - - - private static List getParentId(List tocId) { - - if (tocId.isEmpty()) { - throw new UnsupportedOperationException("Root has no parent!"); - } - if (tocId.size() < 2) { - return Collections.emptyList(); - } - return tocId.subList(0, tocId.size() - 1); - } - - - public Entry getEntryById(List tocId) { - - if (tocId.isEmpty()) { - return root; - } - Entry entry = root.children.get(tocId.get(0)); - for (int id : tocId.subList(1, tocId.size())) { - entry = entry.children.get(id); - } - return entry; - } - - - public Stream streamMainEntries() { - - return root.children.stream(); - } - - - public Stream streamAllEntriesInOrder() { - - return Stream.of(root).flatMap(TableOfContents::flatten); - } - - - public Stream streamAllSubEntriesInOrder(List parentId) { - - return getEntryById(parentId).getChildren().stream().flatMap(TableOfContents::flatten); - } - - - @Override - public String toString() { - - return String.join("\n", streamAllEntriesInOrder().map(Entry::toString).toList()); - } - - - public String toString(List id) { - - return String.join("\n", streamAllSubEntriesInOrder(id).map(Entry::toString).toList()); - } - - - private static Stream flatten(Entry entry) { - - return Stream.concat(Stream.of(entry), entry.children.stream().flatMap(TableOfContents::flatten)); - } - - - @Builder - @Getter - @AllArgsConstructor - @FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true) - public static class Entry { - - List tocId; - NodeType type; - SemanticNode node; - List children; - - - @Override - public String toString() { - - return node.toString(); - } - - - @Override - public int hashCode() { 
- - return Hashing.murmur3_32_fixed().hashString(toString(), StandardCharsets.UTF_8).hashCode(); - } - - - @Override - public boolean equals(Object o) { - - return o instanceof Entry && o.hashCode() == this.hashCode(); - } - - } - -} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/entity/EntityNode.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/entity/EntityNode.java deleted file mode 100644 index 286e52d..0000000 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/entity/EntityNode.java +++ /dev/null @@ -1,76 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.internal.api.graph.entity; - -import java.util.Collections; -import java.util.List; -import java.util.Set; - -import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode; - -public interface EntityNode { - - /** - * This represents the text, which is contained within the boundary of the Entity. - * - * @return String - */ - String getValue(); - - - /** - * The Boundary primarily defines the Entity, all other values may be inferred from it. - * - * @return Boundary, uniquely identifying this Entity - */ - Boundary getBoundary(); - - - /** - * The deepest fully containing node represents the node which is the deepest node in the document tree structure, - * whose boundary also fully contains the boundary of this entity. - * - * @return the deepest fully containing node - */ - SemanticNode getDeepestFullyContainingNode(); - - - /** - * The intersecting nodes represent all nodes, whose boundary intersects the boundary of this entity. 
- * - * @return all intersecting Nodes - */ - List getIntersectingNodes(); - - - void setDeepestFullyContainingNode(SemanticNode semanticNode); - - - void addIntersectingNode(SemanticNode semanticNode); - - - void setIntersectingNodes(List semanticNodes); - - - /** - * @return all pages this entity intersects. - */ - Set getPages(); - - - void setPages(Set pages); - - - /** - * removes all occurrences of this node in the graph and resets all graph specific fields. - */ - default void removeFromGraph() { - - getIntersectingNodes().forEach(node -> node.getEntities().remove(this)); - getPages().forEach(page -> page.getEntities().remove(this)); - setPages(Collections.emptySet()); - setDeepestFullyContainingNode(null); - setIntersectingNodes(Collections.emptyList()); - } - -} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/entity/EntityPosition.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/entity/EntityPosition.java deleted file mode 100644 index 4be7d3a..0000000 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/entity/EntityPosition.java +++ /dev/null @@ -1,45 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.internal.api.graph.entity; - -import java.awt.geom.Rectangle2D; -import java.nio.charset.StandardCharsets; -import java.util.List; - -import com.google.common.hash.Hashing; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode; - -import lombok.AccessLevel; -import lombok.Builder; -import lombok.Data; -import lombok.experimental.FieldDefaults; - -@Data -@Builder -@FieldDefaults(level = AccessLevel.PRIVATE) -public class EntityPosition { - - PageNode pageNode; - List rectanglePerLine; - - - public String getId() { - - return String.valueOf(hashCode()); - 
} - - - @Override - public int hashCode() { - - StringBuilder sb = new StringBuilder(); - sb.append(pageNode.getNumber()); - rectanglePerLine.forEach(r -> sb.append(r.getX()).append(r.getY()).append(r.getWidth()).append(r.getHeight())); - return Hashing.murmur3_128().hashString(sb.toString(), StandardCharsets.UTF_8).hashCode(); - } - - @Override - public boolean equals(Object o) { - - return o instanceof EntityPosition && o.hashCode() == this.hashCode(); - } - -} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/FooterNode.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/FooterNode.java deleted file mode 100644 index a497a0e..0000000 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/FooterNode.java +++ /dev/null @@ -1,53 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes; - -import java.util.HashSet; -import java.util.List; -import java.util.Set; - -import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock; - -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.EqualsAndHashCode; -import lombok.NoArgsConstructor; -import lombok.experimental.FieldDefaults; - -@Data -@Builder -@AllArgsConstructor -@NoArgsConstructor -@FieldDefaults(level = AccessLevel.PRIVATE) -public class FooterNode implements SemanticNode { - - List tocId; - TextBlock terminalTextBlock; - - @Builder.Default - boolean terminal = true; - - @EqualsAndHashCode.Exclude - TableOfContents tableOfContents; - - 
@Builder.Default - @EqualsAndHashCode.Exclude - Set entities = new HashSet<>(); - - - @Override - public TextBlock buildTextBlock() { - - return terminalTextBlock; - } - - - @Override - public String toString() { - - return tocId + ": " + NodeType.FOOTER + ": " + terminalTextBlock.buildSummary(); - } - -} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/HeaderNode.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/HeaderNode.java deleted file mode 100644 index 75794c5..0000000 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/HeaderNode.java +++ /dev/null @@ -1,53 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes; - -import java.util.HashSet; -import java.util.List; -import java.util.Set; - -import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock; - -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.EqualsAndHashCode; -import lombok.NoArgsConstructor; -import lombok.experimental.FieldDefaults; - -@Data -@Builder -@AllArgsConstructor -@NoArgsConstructor -@FieldDefaults(level = AccessLevel.PRIVATE) -public class HeaderNode implements SemanticNode { - - List tocId; - TextBlock terminalTextBlock; - - @Builder.Default - boolean terminal = true; - - @EqualsAndHashCode.Exclude - TableOfContents tableOfContents; - - @Builder.Default - @EqualsAndHashCode.Exclude - Set entities = new HashSet<>(); - - - @Override - public TextBlock buildTextBlock() { - - return terminalTextBlock; - } 
- - - @Override - public String toString() { - - return tocId + ": " + NodeType.HEADER + ": " + terminalTextBlock.buildSummary(); - } - -} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/HeadlineNode.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/HeadlineNode.java deleted file mode 100644 index 9c88cac..0000000 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/HeadlineNode.java +++ /dev/null @@ -1,60 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes; - -import java.util.HashSet; -import java.util.List; -import java.util.Set; - -import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock; - -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.EqualsAndHashCode; -import lombok.NoArgsConstructor; -import lombok.experimental.FieldDefaults; - -@Data -@Builder -@AllArgsConstructor -@NoArgsConstructor -@FieldDefaults(level = AccessLevel.PRIVATE) -public class HeadlineNode implements SemanticNode { - - List tocId; - TextBlock terminalTextBlock; - - @Builder.Default - boolean terminal = true; - - @EqualsAndHashCode.Exclude - TableOfContents tableOfContents; - - @Builder.Default - @EqualsAndHashCode.Exclude - Set entities = new HashSet<>(); - - - @Override - public TextBlock buildTextBlock() { - - return terminalTextBlock; - } - - - @Override - public String toString() { - - return tocId + ": " + NodeType.HEADLINE + ": " + terminalTextBlock.buildSummary(); - } - - - @Override - public 
SemanticNode getHeadline() { - - return this; - } - -} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/ImageNode.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/ImageNode.java deleted file mode 100644 index c428ed6..0000000 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/ImageNode.java +++ /dev/null @@ -1,87 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes; - -import java.awt.geom.Rectangle2D; -import java.util.Collections; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector; - -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.EqualsAndHashCode; -import lombok.NoArgsConstructor; -import lombok.experimental.FieldDefaults; - -@Data -@Builder -@AllArgsConstructor -@NoArgsConstructor -@FieldDefaults(level = AccessLevel.PRIVATE) -public class ImageNode implements SemanticNode { - - List tocId; - - ImageType imageType; - boolean transparency; - Rectangle2D position; - - - boolean redaction; - boolean ignored; - - @Builder.Default - String redactionReason = ""; - @Builder.Default - String legalBasis = ""; - @Builder.Default - int matchedRule = -1; - - @EqualsAndHashCode.Exclude - PageNode page; - - @EqualsAndHashCode.Exclude - TableOfContents 
tableOfContents; - - @Builder.Default - @EqualsAndHashCode.Exclude - Set entities = new HashSet<>(); - - @Override - public TextBlock buildTextBlock() { - - return streamAllSubNodes().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector()); - } - - - @Override - public Set getPages() { - - return Collections.singleton(page); - } - - - @Override - public String toString() { - - return tocId + ": " + NodeType.IMAGE + ": " + imageType.toString() + " " + position; - } - - - @Override - public Map getBBox() { - - Map bBoxPerPage = new HashMap<>(); - bBoxPerPage.put(page, position); - return bBoxPerPage; - } - -} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/ImageType.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/ImageType.java deleted file mode 100644 index 7c8afc8..0000000 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/ImageType.java +++ /dev/null @@ -1,9 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes; - -public enum ImageType { - LOGO, - FORMULA, - SIGNATURE, - OTHER, - OCR -} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/PageNode.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/PageNode.java deleted file mode 100644 index f4601c1..0000000 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/PageNode.java +++ /dev/null @@ -1,71 +0,0 @@ -package 
com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes; - -import java.util.HashSet; -import java.util.List; -import java.util.Set; - -import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector; - -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.EqualsAndHashCode; -import lombok.Getter; -import lombok.Setter; -import lombok.experimental.FieldDefaults; - -@Getter -@Setter -@Builder -@AllArgsConstructor -@FieldDefaults(level = AccessLevel.PRIVATE) -public class PageNode { - - Integer number; - Integer height; - Integer width; - Integer rotation; - - @EqualsAndHashCode.Exclude - List mainBody; - @EqualsAndHashCode.Exclude - HeaderNode header; - @EqualsAndHashCode.Exclude - FooterNode footer; - - @Builder.Default - @EqualsAndHashCode.Exclude - Set entities = new HashSet<>(); - - @Builder.Default - @EqualsAndHashCode.Exclude - Set images = new HashSet<>(); - - - public TextBlock getMainBodyTextBlock() { - - return mainBody.stream().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector()); - } - - - @Override - public String toString() { - - return String.valueOf(number); - } - - - @Override - public int hashCode() { - - return number; - } - - @Override - public boolean equals(Object o) { - - return o instanceof PageNode && o.hashCode() == this.hashCode(); - } -} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/ParagraphNode.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/ParagraphNode.java deleted file mode 100644 index ab7c594..0000000 --- 
a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/ParagraphNode.java +++ /dev/null @@ -1,51 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes; - -import java.util.HashSet; -import java.util.List; -import java.util.Set; - -import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock; - -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.EqualsAndHashCode; -import lombok.experimental.FieldDefaults; - -@Data -@Builder -@AllArgsConstructor -@FieldDefaults(level = AccessLevel.PRIVATE) -public class ParagraphNode implements SemanticNode { - - List tocId; - TextBlock terminalTextBlock; - - @Builder.Default - boolean terminal = true; - - @EqualsAndHashCode.Exclude - TableOfContents tableOfContents; - - @Builder.Default - @EqualsAndHashCode.Exclude - Set entities = new HashSet<>(); - - - @Override - public TextBlock buildTextBlock() { - - return terminalTextBlock; - } - - - @Override - public String toString() { - - return tocId + ": " + NodeType.PARAGRAPH + ": " + terminalTextBlock.buildSummary(); - } - -} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/SectionNode.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/SectionNode.java deleted file mode 100644 index 1b33ec1..0000000 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/SectionNode.java +++ /dev/null @@ -1,63 +0,0 @@ -package 
com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes; - -import java.util.HashSet; -import java.util.List; -import java.util.NoSuchElementException; -import java.util.Set; - -import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector; - -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.EqualsAndHashCode; -import lombok.experimental.FieldDefaults; - - -@Data -@Builder -@AllArgsConstructor -@FieldDefaults(level = AccessLevel.PRIVATE) -public class SectionNode implements SemanticNode { - - List tocId; - - TextBlock textBlock; - @EqualsAndHashCode.Exclude - TableOfContents tableOfContents; - - @Builder.Default - @EqualsAndHashCode.Exclude - Set entities = new HashSet<>(); - - - @Override - public TextBlock buildTextBlock() { - - if (textBlock == null) { - textBlock = streamAllSubNodes().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector()); - } - return textBlock; - } - - - @Override - public String toString() { - - return tocId.toString() + ": " + NodeType.SECTION + ": " + buildTextBlock().buildSummary(); - } - - - public HeadlineNode getHeadline() { - - return streamChildren().filter(node -> node instanceof HeadlineNode) - .map(node -> (HeadlineNode) node) - .findFirst() - .orElseThrow(() -> new NoSuchElementException("ClassificationSection has no Headline!")); - } - -} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/SemanticNode.java 
b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/SemanticNode.java deleted file mode 100644 index a5a4ba3..0000000 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/SemanticNode.java +++ /dev/null @@ -1,275 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes; - -import java.awt.geom.Rectangle2D; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock; -import com.knecon.fforesight.service.layoutparser.internal.api.services.RectangleTransformations; - -public interface SemanticNode { - - /** - * Searches all Nodes located underneath this Node in the TableOfContents and concatenates their AtomicTextBlocks into a single TextBlockEntity. - * So, for a ClassificationSection all TextBlocks of Subsections, Paragraphs, and Tables are concatenated into a single TextBlockEntity - * If the Node is Terminal, the TerminalTextBlock will be returned instead. - * - * @return ClassificationTextBlock containing all AtomicTextBlocks that are located under this Node. - */ - TextBlock buildTextBlock(); - - - /** - * Any Node maintains its own Set of Entities. - * This Set contains all Entities whose boundary intersects the boundary of this node. 
- * - * @return Set of all Entities associated with this Node - */ - Set getEntities(); - - - /** - * Each AtomicTextBlock is assigned a page, so to get the pages this node appears on, it collects the PageNodes from each AtomicTextBlock belonging to this node's ClassificationTextBlock. - * - * @return Set of PageNodes this node appears on. - */ - default Set getPages() { - - return buildTextBlock().getPages(); - } - - - /** - * @return the TableOfContents of the ClassificationDocument this node belongs to - */ - TableOfContents getTableOfContents(); - - - /** - * The id is a List of Integers uniquely identifying this node in the TableOfContents. - * - * @return the TableOfContents ID - */ - List getTocId(); - - - /** - * This should only be used during graph construction. - * - * @param tocId List of Integers - */ - void setTocId(List tocId); - - - /** - * Traverses the Tree up, until it hits a HeadlineNode or hits a SectionNode which will then return the first HeadlineNode from its children. - * Throws NotFoundException if no Headline is found this way - * - * @return First HeadlineNode found - */ - default SemanticNode getHeadline() { - - return getParent().getHeadline(); - } - - - /** - * @return boolean indicating wether this Node has a Parent in the TableOfContents - */ - default boolean hasParent() { - - return getTableOfContents().hasParentById(getTocId()); - } - - - /** - * @return The SemanticNode representing the Parent in the TableOfContents - * throws NotFoundException, when no parent is present - */ - default SemanticNode getParent() { - - return getTableOfContents().getParentEntryById(getTocId()).getNode(); - } - - - /** - * Terminal means a SemanticNode has direct access to a ClassificationTextBlock, by default this is false and must be overridden. - * Currently only Sections, Images, and Tables are not terminal. - * A TableCell might be Terminal depending on its area compared to the page. 
- * - * @return boolean, indicating if a Node has direct access to a ClassificationTextBlock - */ - default boolean isTerminal() { - - return false; - } - - - /** - * Terminal means a SemanticNode has direct access to a ClassificationTextBlock, by default this is false and must be overridden. - * Currently only Sections and Tables are not terminal. - * - * @return AtomicTextBlock - */ - default TextBlock getTerminalTextBlock() { - - throw new UnsupportedOperationException("Only terminal Nodes have access to TerminalTextBlocks!"); - } - - - default void setTerminalTextBlock(TextBlock textBlock) { - - throw new UnsupportedOperationException(); - } - - - /** - * Each AtomicTextBlock has an index on its page, this returns the number of the first AtomicTextBlock underneath this node. - * If this node does not have any AtomicTexBlocks underneath it, e.g. an empty TableCell. It returns -1. - * - * @return Integer representing the number on the page - */ - default Integer getNumberOnPage() { - - TextBlock textBlock = buildTextBlock(); - if (textBlock.getAtomicTextBlocks().size() > 0) { - return buildTextBlock().getAtomicTextBlocks().get(0).getNumberOnPage(); - } else { - return -1; - } - } - - - /** - * @return true, if this node's ClassificationTextBlock is not empty - */ - default boolean hasText() { - - return buildTextBlock().length() > 0; - } - - - /** - * @param string A String which the ClassificationTextBlock might contain - * @return true, if this node's ClassificationTextBlock contains the string - */ - default boolean containsString(String string) { - - return buildTextBlock().getSearchText().contains(string); - } - - - /** - * @param strings A List of Strings which the ClassificationTextBlock might contain - * @return true, if this node's ClassificationTextBlock contains any of the strings - */ - default boolean containsAnyString(List strings) { - - return strings.stream().anyMatch(this::containsString); - } - - - /** - * This function is used during insertion 
of EntityNodes into the graph, it checks if the boundary of the EntityNode intersects or even contains the EntityNode. - * It sets the fields accordingly and recursively calls this function on all its children. - * - * @param entityNode EntityNode, which is being inserted into the graph - */ - default void addThisToEntityIfIntersects(EntityNode entityNode) { - - TextBlock textBlock = buildTextBlock(); - if (textBlock.getBoundary().intersects(entityNode.getBoundary())) { - - if (textBlock.containsBoundary(entityNode.getBoundary())) { - entityNode.setDeepestFullyContainingNode(this); - } - - entityNode.addIntersectingNode(this); - streamChildren().forEach(node -> node.addThisToEntityIfIntersects(entityNode)); - } - } - - - /** - * Streams all children located directly underneath this node in the TableOfContents. - * - * @return Stream of all children - */ - default Stream streamChildren() { - - return getTableOfContents().streamChildrenNodes(getTocId()); - } - - - /** - * recursively streams all SemanticNodes located underneath this node in the TableOfContents in order. - * - * @return Stream of all SubNodes - */ - default Stream streamAllSubNodes() { - - return getTableOfContents().streamAllSubEntriesInOrder(getTocId()).map(TableOfContents.Entry::getNode); - } - - - /** - * @return Boundary of this Node's ClassificationTextBlock - */ - default Boundary getBoundary() { - - return buildTextBlock().getBoundary(); - } - - - /** - * If this Node is Terminal it will calculate the boundingBox of its TerminalTextBlock, otherwise it will calculate the Union of the BoundingBoxes of all its Children. - * If called on the ClassificationDocument, it will return the cropbox of each page - * - * @return Rectangle2D fully encapsulating this Node for each page. 
- */ - default Map getBBox() { - - Map bBoxPerPage = new HashMap<>(); - if (isTerminal()) { - return getBBoxFromTerminalTextBlock(bBoxPerPage); - } - - return getBBoxFromChildren(bBoxPerPage); - } - - - /** - * TODO this does not yet work for sections spanning multiple columns. - * - * @param bBoxPerPage initial empty BoundingBox - * @return The union of the BoundingBoxes of all children - */ - private Map getBBoxFromChildren(Map bBoxPerPage) { - - return streamChildren().map(SemanticNode::getBBox).reduce((map1, map2) -> { - map1.forEach((page, rectangle) -> map2.merge(page, rectangle, (rect1, rect2) -> rect1.createUnion(rect2).getBounds2D())); - return map2; - }).orElse(bBoxPerPage); - } - - - /** - * @param bBoxPerPage initial empty BoundingBox - * @return The union of all BoundingBoxes of the ClassificationTextBlock of this node - */ - private Map getBBoxFromTerminalTextBlock(Map bBoxPerPage) { - - Map> atomicTextBlockPerPage = buildTextBlock().getAtomicTextBlocks().stream().collect(Collectors.groupingBy(AtomicTextBlock::getPage)); - atomicTextBlockPerPage.forEach((page, atbs) -> bBoxPerPage.put(page, RectangleTransformations.bBoxUnionAtomicTextBlock(atbs))); - return bBoxPerPage; - } - -} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/TableCellNode.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/TableCellNode.java deleted file mode 100644 index af05599..0000000 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/TableCellNode.java +++ /dev/null @@ -1,92 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes; - -import java.awt.geom.Rectangle2D; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; 
-import java.util.Set; -import java.util.stream.Stream; - -import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector; - -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.EqualsAndHashCode; -import lombok.experimental.FieldDefaults; - -@Data -@Builder -@AllArgsConstructor -@FieldDefaults(level = AccessLevel.PRIVATE) -public class TableCellNode implements SemanticNode { - - List tocId; - int row; - int col; - boolean header; - - Rectangle2D bBox; - - @Builder.Default - boolean terminal = true; - TextBlock terminalTextBlock; - - TextBlock textBlock; - - @EqualsAndHashCode.Exclude - TableOfContents tableOfContents; - - @Builder.Default - @EqualsAndHashCode.Exclude - Set entities = new HashSet<>(); - - - @Override - public Map getBBox() { - - Map bBoxPerPage = new HashMap<>(); - getPages().forEach(page -> bBoxPerPage.put(page, bBox)); - return bBoxPerPage; - } - - - @Override - public TextBlock buildTextBlock() { - - if (terminal) { - return terminalTextBlock; - } - - if (textBlock == null) { - textBlock = streamAllSubNodes().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector()); - } - return textBlock; - } - - - @Override - public String toString() { - - return tocId + ": " + NodeType.TABLE_CELL + ": " + buildTextBlock().buildSummary(); - } - - - public boolean hasHeader(String headerString) { - - return getHeaders().anyMatch(header -> header.buildTextBlock().getSearchText().strip().equals(headerString)); - } - - - private Stream getHeaders() { - - TableNode tableNode = (TableNode) getParent(); - return tableNode.streamHeadersForCell(row, col); 
- } - -} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/TableNode.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/TableNode.java deleted file mode 100644 index a7ad031..0000000 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/nodes/TableNode.java +++ /dev/null @@ -1,73 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes; - -import java.util.HashSet; -import java.util.List; -import java.util.Set; -import java.util.stream.Stream; - -import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector; - -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.EqualsAndHashCode; -import lombok.experimental.FieldDefaults; - -@Data -@Builder -@AllArgsConstructor -@FieldDefaults(level = AccessLevel.PRIVATE) -public class TableNode implements SemanticNode { - - List tocId; - TableOfContents tableOfContents; - - Integer numberOfRows; - Integer numberOfCols; - - TextBlock textBlock; - - @Builder.Default - @EqualsAndHashCode.Exclude - Set entities = new HashSet<>(); - - - public Stream streamTableCells() { - - return streamChildren().map(node -> (TableCellNode) node); - } - - - public Stream streamHeaders() { - - return streamTableCells().filter(TableCellNode::isHeader); - } - - - public Stream streamHeadersForCell(int row, int col) { - - return streamHeaders().filter(cell -> cell.getRow() == row || 
cell.getCol() == col); - } - - - @Override - public TextBlock buildTextBlock() { - - if (textBlock == null) { - textBlock = streamAllSubNodes().filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector()); - } - return textBlock; - } - - - @Override - public String toString() { - - return tocId.toString() + ": " + NodeType.TABLE + ": " + buildTextBlock().buildSummary(); - } - -} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/textblock/AtomicTextBlock.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/textblock/AtomicTextBlock.java deleted file mode 100644 index 945f278..0000000 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/textblock/AtomicTextBlock.java +++ /dev/null @@ -1,131 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock; - -import java.awt.geom.Rectangle2D; -import java.util.List; - -import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityPosition; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode; -import com.knecon.fforesight.service.layoutparser.internal.api.services.RectangleTransformations; - -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.EqualsAndHashCode; -import lombok.experimental.FieldDefaults; - -@Data -@Builder -@AllArgsConstructor -@FieldDefaults(level = AccessLevel.PRIVATE) -public class AtomicTextBlock implements TextBlock { - - Long id; - Integer numberOnPage; - PageNode page; - - //string 
coordinates - Boundary boundary; - String searchText; - List lineBreaks; - - //position coordinates - List stringIdxToPositionIdx; - List positions; - - @EqualsAndHashCode.Exclude - SemanticNode parent; - - - @Override - public int numberOfLines() { - - return lineBreaks.size() + 1; - } - - - public CharSequence getLine(int lineNumber) { - - if (lineNumber >= numberOfLines() || lineNumber < 0) { - throw new IndexOutOfBoundsException(String.format("line %d out of range for AtomicTextBlock with %d lines", lineNumber, numberOfLines())); - } - if (lineNumber == 0) { - return subSequence(boundary.start(), lineBreaks.get(0) + boundary.start()); - } else if (lineNumber == numberOfLines() - 1) { - return subSequence(lineBreaks.get(lineBreaks.size() - 1) + boundary.start(), boundary.end()); - } - return subSequence(lineBreaks.get(lineNumber - 1) + boundary.start(), lineBreaks.get(lineNumber) + boundary.start()); - } - - - @Override - public List getAtomicTextBlocks() { - - return List.of(this); - } - - - @Override - public int getNextLinebreak(int fromIndex) { - - return lineBreaks.stream()// - .filter(linebreak -> linebreak > fromIndex - boundary.start()) // - .findFirst() // - .orElse(searchText.length()) + boundary.start(); - } - - - @Override - public int getPreviousLinebreak(int fromIndex) { - - return lineBreaks.stream()// - .filter(linebreak -> linebreak <= fromIndex - boundary.start())// - .reduce((a, b) -> b)// - .orElse(0) + boundary.start(); - } - - - @Override - public Rectangle2D getPosition(int stringIdx) { - - return positions.get(stringIdxToPositionIdx.get(stringIdx - boundary.start())); - } - - - @Override - public List getPositions(Boundary stringBoundary) { - - if (!containsBoundary(stringBoundary)) { - throw new IndexOutOfBoundsException(String.format("%s is out of bounds for %s", stringBoundary, this.boundary)); - } - - if (stringBoundary.end() == this.boundary.end()) { - return positions.subList(stringIdxToPositionIdx.get(stringBoundary.start() - 
this.boundary.start()), positions.size()); - } - - return positions.subList(stringIdxToPositionIdx.get(stringBoundary.start() - this.boundary.start()), - stringIdxToPositionIdx.get(stringBoundary.end() - this.boundary.start())); - } - - - public List getEntityPositionsPerPage(Boundary stringBoundary) { - - List positionsPerLine = stringBoundary.split(getLineBreaks().stream().map(lb -> lb + boundary.start()).filter(stringBoundary::contains).toList()) - .stream() - .map(this::getPositions) - .map(RectangleTransformations::rectangleUnion) - .toList(); - - return List.of(EntityPosition.builder().rectanglePerLine(positionsPerLine).pageNode(page).build()); - } - - - @Override - public String toString() { - - return searchText; - } - -} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/mapper/DocumentGraphMapper.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/mapper/DocumentGraphMapper.java deleted file mode 100644 index d007e82..0000000 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/mapper/DocumentGraphMapper.java +++ /dev/null @@ -1,229 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.internal.api.mapper; - -import static com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType.FOOTER; -import static com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType.HEADER; - -import java.awt.geom.Rectangle2D; -import java.util.Arrays; -import java.util.HashSet; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.NoSuchElementException; - -import com.google.common.primitives.Ints; -import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData; -import 
com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.PageData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.TableOfContentsData; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.FooterNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.HeaderNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.HeadlineNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ParagraphNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SectionNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableCellNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector; - -import lombok.experimental.UtilityClass; - -@UtilityClass -public class DocumentGraphMapper { - - public DocumentGraph toDocumentGraph(DocumentData documentData) { - - - DocumentGraph documentGraph = new DocumentGraph(); - Context 
context = new Context(documentData, - new TableOfContents(documentGraph), - new LinkedList<>(), - new LinkedList<>(), - Arrays.stream(documentData.getAtomicTextBlocks()).toList(), - Arrays.stream(documentData.getAtomicPositionBlocks()).toList()); - - context.pages.addAll(Arrays.stream(documentData.getPages()).map(DocumentGraphMapper::buildPage).toList()); - - context.tableOfContents.getRoot().getChildren().addAll(buildEntries(documentData.getTableOfContents().getRoot().getSubEntries(), context)); - - documentGraph.setTableOfContents(context.tableOfContents); - documentGraph.setPages(new HashSet<>(context.pages)); - documentGraph.setNumberOfPages(documentData.getPages().length); - - documentGraph.setTextBlock(documentGraph.buildTextBlock()); - return documentGraph; - } - - - private List buildEntries(List entries, - Context context) { - - List newEntries = new LinkedList<>(); - for (TableOfContentsData.EntryData entryData : entries) { - - boolean terminal = isTerminal(entryData); - List pages = Arrays.stream(entryData.getPages()).map(pageNumber -> getPage(pageNumber, context)).toList(); - - SemanticNode node = switch (entryData.getType()) { - case SECTION -> buildSection(context); - case PARAGRAPH -> buildParagraph(context, terminal); - case HEADLINE -> buildHeadline(context, terminal); - case HEADER -> buildHeader(context, terminal); - case FOOTER -> buildFooter(context, terminal); - case TABLE -> buildTable(context, entryData.getProperties()); - case TABLE_CELL -> buildTableCell(context, entryData.getProperties(), terminal); - case IMAGE -> buildImage(context, entryData.getProperties()); - default -> throw new UnsupportedOperationException("Not yet implemented for type " + entryData.getType()); - }; - - if (node.isTerminal()) { - TextBlock textBlock = toTextBlock(entryData.getAtomicBlocks(), context, node); - node.setTerminalTextBlock(textBlock); - } - List tocId = Arrays.stream(entryData.getTocId()).boxed().toList(); - node.setTocId(tocId); - - if 
(entryData.getType() == HEADER) { - pages.forEach(page -> page.setHeader((HeaderNode) node)); - } else if (entryData.getType() == FOOTER) { - pages.forEach(page -> page.setFooter((FooterNode) node)); - } else { - pages.forEach(page -> page.getMainBody().add(node)); - } - newEntries.add(TableOfContents.Entry.builder().tocId(tocId).type(entryData.getType()).children(buildEntries(entryData.getSubEntries(), context)).node(node).build()); - } - return newEntries; - } - - - private HeadlineNode buildHeadline(Context context, boolean terminal) { - - return HeadlineNode.builder().terminal(terminal).tableOfContents(context.tableOfContents()).build(); - } - - - private static boolean isTerminal(TableOfContentsData.EntryData entryData) { - - return entryData.getAtomicBlocks().length > 0; - } - - - private ImageNode buildImage(Context context, Map properties) { - - var builder = ImageNode.builder(); - PropertiesMapper.parseImageProperties(properties, builder); - return builder.tableOfContents(context.tableOfContents()).build(); - } - - - private TableCellNode buildTableCell(Context context, Map properties, boolean terminal) { - - TableCellNode.TableCellNodeBuilder builder = TableCellNode.builder(); - PropertiesMapper.parseTableCellProperties(properties, builder); - return builder.terminal(terminal).tableOfContents(context.tableOfContents()).build(); - } - - - private TableNode buildTable(Context context, Map properties) { - - TableNode.TableNodeBuilder builder = TableNode.builder(); - PropertiesMapper.parseTableProperties(properties, builder); - return TableNode.builder().tableOfContents(context.tableOfContents()).build(); - } - - - private FooterNode buildFooter(Context context, boolean terminal) { - - return FooterNode.builder().terminal(terminal).tableOfContents(context.tableOfContents()).build(); - } - - - private HeaderNode buildHeader(Context context, boolean terminal) { - - return 
HeaderNode.builder().terminal(terminal).tableOfContents(context.tableOfContents()).build(); - } - - - private SectionNode buildSection(Context context) { - - return SectionNode.builder().tableOfContents(context.tableOfContents()).build(); - - } - - - private ParagraphNode buildParagraph(Context context, boolean terminal) { - - return ParagraphNode.builder().terminal(terminal).tableOfContents(context.tableOfContents()).build(); - } - - - private TextBlock toTextBlock(Long[] atomicTextBlockIds, Context context, SemanticNode parent) { - - return Arrays.stream(atomicTextBlockIds) - .map(atomicTextBlockId -> toAtomicTextBlock(context.atomicTextBlockData.get(Math.toIntExact(atomicTextBlockId)), - context.atomicPositionBlockData.get(Math.toIntExact(atomicTextBlockId)), - parent, - context)) - .collect(new TextBlockCollector()); - } - - - private PageNode buildPage(PageData p) { - - return PageNode.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).mainBody(new LinkedList<>()).build(); - } - - - private AtomicTextBlock toAtomicTextBlock(AtomicTextBlockData atomicTextBlockData, - AtomicPositionBlockData atomicPositionBlockData, - SemanticNode parent, - Context context) { - - return AtomicTextBlock.builder() - .id(atomicTextBlockData.getId()) - .numberOnPage(atomicTextBlockData.getNumberOnPage()) - .page(getPage(atomicTextBlockData.getPage(), context)) - .boundary(new Boundary(atomicTextBlockData.getStart(), atomicTextBlockData.getEnd())) - .searchText(atomicTextBlockData.getSearchText()) - .lineBreaks(Ints.asList(atomicTextBlockData.getLineBreaks())) - .stringIdxToPositionIdx(Ints.asList(atomicPositionBlockData.getStringIdxToPositionIdx())) - .positions(toRectangle2DList(atomicPositionBlockData.getPositions())) - .parent(parent) - .build(); - } - - - private static List toRectangle2DList(float[][] positions) { - - return Arrays.stream(positions).map(floatArr -> (Rectangle2D) new Rectangle2D.Float(floatArr[0], floatArr[1], 
floatArr[2], floatArr[3])).toList(); - } - - - private PageNode getPage(Long pageIndex, Context context) { - - return context.pages.stream() - .filter(page -> page.getNumber() == Math.toIntExact(pageIndex)) - .findFirst() - .orElseThrow(() -> new NoSuchElementException(String.format("ClassificationPage with number %d not found", pageIndex))); - } - - - record Context( - DocumentData layoutParsingModel, - TableOfContents tableOfContents, - List pages, - List sections, - List atomicTextBlockData, - List atomicPositionBlockData) { - - } - -} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/mapper/PropertiesMapper.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/mapper/PropertiesMapper.java deleted file mode 100644 index 8c9e370..0000000 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/mapper/PropertiesMapper.java +++ /dev/null @@ -1,101 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.internal.api.mapper; - -import java.awt.geom.Rectangle2D; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageType; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableCellNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableNode; -import com.knecon.fforesight.service.layoutparser.internal.api.services.RectangleTransformations; - -public class PropertiesMapper { - - public static Map buildImageProperties(ImageNode image) { - - Map properties = new HashMap<>(); - properties.put("imageType", image.getImageType().toString()); - 
properties.put("transparency", String.valueOf(image.isTransparency())); - properties.put("position", RectangleTransformations.toString(image.getPosition())); - return properties; - } - - - public static Map buildTableCellProperties(TableCellNode tableCell) { - - Map properties = new HashMap<>(); - properties.put("row", String.valueOf(tableCell.getRow())); - properties.put("col", String.valueOf(tableCell.getCol())); - properties.put("header", String.valueOf(tableCell.isHeader())); - - if (tableCell.getPages().size() > 1 || tableCell.getBBox().keySet().size() > 1) { - throw new IllegalArgumentException("TableCell can only occur on a single page!"); - } - String bBoxString = RectangleTransformations.toString(tableCell.getBBox().get(tableCell.getPages().stream().findFirst().get())); - properties.put("bBox", bBoxString); - - return properties; - } - - - public static Map buildTableProperties(TableNode table) { - - Map properties = new HashMap<>(); - properties.put("numberOfRows", String.valueOf(table.getNumberOfRows())); - properties.put("numberOfCols", String.valueOf(table.getNumberOfCols())); - return properties; - } - - - public static void parseImageProperties(Map properties, ImageNode.ImageNodeBuilder builder) { - - builder.imageType(parseImageType(properties.get("imageType"))); - builder.transparency(Boolean.parseBoolean(properties.get("transparency"))); - builder.position(parseRectangle2D(properties.get("position"))); - } - - - public static void parseTableCellProperties(Map properties, TableCellNode.TableCellNodeBuilder builder) { - - builder.row(Integer.parseInt(properties.get("row"))); - builder.col(Integer.parseInt(properties.get("col"))); - builder.header(Boolean.parseBoolean(properties.get("header"))); - builder.bBox(parseRectangle2D(properties.get("bBox"))); - } - - - public static void parseTableProperties(Map properties, TableNode.TableNodeBuilder builder) { - - builder.numberOfRows(Integer.parseInt(properties.get("numberOfRows"))); - 
builder.numberOfCols(Integer.parseInt(properties.get("numberOfCols"))); - } - - - private static ImageType parseImageType(String imageType) { - - return switch (imageType) { - case "LOGO" -> ImageType.LOGO; - case "FORMULA" -> ImageType.FORMULA; - case "SIGNATURE" -> ImageType.SIGNATURE; - case "OCR" -> ImageType.OCR; - default -> ImageType.OTHER; - }; - } - - - public static String toString(Rectangle2D rectangle2D) { - - return String.format("%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight()); - } - - - public static Rectangle2D parseRectangle2D(String bBox) { - - List floats = Arrays.stream(bBox.split(",")).map(Float::parseFloat).toList(); - return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3)); - } - -} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/services/EntityEnrichmentService.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/services/EntityEnrichmentService.java deleted file mode 100644 index a0d079c..0000000 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/services/EntityEnrichmentService.java +++ /dev/null @@ -1,10 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.internal.api.services; - -import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock; - -public interface EntityEnrichmentService { - - void enrichEntity(EntityNode entity, TextBlock textBlock); - -} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/services/EntityInsertionService.java 
b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/services/EntityInsertionService.java deleted file mode 100644 index 5e116da..0000000 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/services/EntityInsertionService.java +++ /dev/null @@ -1,56 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.internal.api.services; - -import java.util.Collections; -import java.util.NoSuchElementException; -import java.util.Set; - -import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock; - -import lombok.RequiredArgsConstructor; - -@RequiredArgsConstructor -public class EntityInsertionService { - - private final EntityEnrichmentService entityEnrichmentService; - - - public void addEntityToGraph(EntityNode entity, TableOfContents tableOfContents) { - - try { - SemanticNode containingNode = tableOfContents.streamChildrenNodes(Collections.emptyList()) - .filter(node -> node.buildTextBlock().containsBoundary(entity.getBoundary())) - .findFirst() - .orElseThrow(() -> new NoSuchElementException("No containing Node found!")); - - containingNode.addThisToEntityIfIntersects(entity); - - TextBlock textBlock = entity.getDeepestFullyContainingNode().buildTextBlock(); - entityEnrichmentService.enrichEntity(entity, textBlock); - - addToPages(entity); - addToNodeEntitySets(entity); - - } catch (NoSuchElementException e) { - entity.removeFromGraph(); - } - } - - - private void addToPages(EntityNode entity) { - - Set pages = 
entity.getDeepestFullyContainingNode().getPages(); - entity.getPages().addAll(pages); - pages.forEach(page -> page.getEntities().add(entity)); - } - - - private void addToNodeEntitySets(EntityNode entity) { - - entity.getIntersectingNodes().forEach(node -> node.getEntities().add(entity)); - } - -} diff --git a/layoutparser-service/layoutparser-service-processor/pom.xml b/layoutparser-service/layoutparser-service-processor/pom.xml index b62a423..3118ed5 100644 --- a/layoutparser-service/layoutparser-service-processor/pom.xml +++ b/layoutparser-service/layoutparser-service-processor/pom.xml @@ -77,6 +77,10 @@ spring-boot-starter-amqp ${spring.version} + + org.junit.platform + junit-platform-commons + diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingService.java index e4c2273..33f309b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingService.java @@ -7,19 +7,19 @@ import java.io.IOException; import org.apache.pdfbox.pdmodel.PDDocument; import org.springframework.stereotype.Service; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph; -import com.knecon.fforesight.service.layoutparser.internal.api.mapper.DocumentDataMapper; +import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent; +import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; import com.knecon.fforesight.service.layoutparser.processor.adapter.CvTableParsingAdapter; import 
com.knecon.fforesight.service.layoutparser.processor.adapter.ImageServiceResponseAdapter; import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument; import com.knecon.fforesight.service.layoutparser.processor.classification.service.ClassificationService; import com.knecon.fforesight.service.layoutparser.processor.classification.service.PdfParsingService; import com.knecon.fforesight.service.layoutparser.processor.classification.service.SectionsBuilderService; import com.knecon.fforesight.service.layoutparser.processor.factory.DocumentGraphFactory; -import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent; -import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; +import com.knecon.fforesight.service.layoutparser.processor.mapper.DocumentDataMapper; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; @@ -35,7 +35,6 @@ public class LayoutParsingService { private final PdfParsingService pdfParsingService; private final ClassificationService classificationService; private final SectionsBuilderService sectionsBuilderService; - private final DocumentGraphFactory documentGraphFactory; public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException { @@ -53,7 +52,7 @@ public class LayoutParsingService { tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.pageFileStorageId()); } - DocumentGraph documentGraph = parseLayout(originDocument, 
imageServiceResponse, tableServiceResponse); + Document documentGraph = parseLayout(originDocument, imageServiceResponse, tableServiceResponse); int numberOfPages = originDocument.getNumberOfPages(); originDocument.close(); @@ -72,7 +71,7 @@ public class LayoutParsingService { } - public DocumentGraph parseLayout(PDDocument originDocument, ImageServiceResponse imageServiceResponse, TableServiceResponse tableServiceResponse) { + public Document parseLayout(PDDocument originDocument, ImageServiceResponse imageServiceResponse, TableServiceResponse tableServiceResponse) { ClassificationDocument classificationDocument = pdfParsingService.parseDocument(originDocument, cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse), @@ -82,7 +81,7 @@ public class LayoutParsingService { sectionsBuilderService.buildSections(classificationDocument); - return documentGraphFactory.buildDocumentGraph(classificationDocument); + return DocumentGraphFactory.buildDocumentGraph(classificationDocument); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java index 7d284f9..bcdfdbd 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java @@ -17,12 +17,12 @@ import com.iqser.red.storage.commons.service.StorageService; import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData; import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData; import 
com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentTreeData; import com.knecon.fforesight.service.layoutparser.internal.api.data.PageData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.TableOfContentsData; +import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.multitenancy.TenantContext; -import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; @@ -69,7 +69,7 @@ public class LayoutParsingStorageService { public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, DocumentData documentData) throws IOException { - storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), documentData.getTableOfContents()); + storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), documentData.getDocumentTreeData()); storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.textBlockFileStorageId(), documentData.getAtomicTextBlocks()); storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.positionBlockFileStorageId(), documentData.getAtomicPositionBlocks()); storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.pageFileStorageId(), documentData.getPages()); @@ -86,12 +86,12 @@ public class LayoutParsingStorageService { AtomicPositionBlockData[] atomicPositionBlockData = storageService.readJSONObject(TenantContext.getTenantId(), 
layoutParsingRequest.positionBlockFileStorageId(), AtomicPositionBlockData[].class); - TableOfContentsData tableOfContentsData = storageService.readJSONObject(TenantContext.getTenantId(), + DocumentTreeData tableOfContentsData = storageService.readJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), - TableOfContentsData.class); + DocumentTreeData.class); return DocumentData.builder() - .tableOfContents(tableOfContentsData) + .documentTreeData(tableOfContentsData) .atomicPositionBlocks(atomicPositionBlockData) .atomicTextBlocks(atomicTextBlockData) .pages(pageData) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/CvTableParsingAdapter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/CvTableParsingAdapter.java index 31bf171..0fd3131 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/CvTableParsingAdapter.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/CvTableParsingAdapter.java @@ -8,7 +8,7 @@ import java.util.Map; import org.springframework.stereotype.Service; -import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.CvParsedTableCell; +import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableCells; import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse; import lombok.RequiredArgsConstructor; @@ -19,9 +19,9 @@ import lombok.extern.slf4j.Slf4j; @RequiredArgsConstructor public class CvTableParsingAdapter { - public Map> buildCvParsedTablesPerPage(TableServiceResponse tableServiceResponse) { + public Map> buildCvParsedTablesPerPage(TableServiceResponse tableServiceResponse) { - Map> 
tableCells = new HashMap<>(); + Map> tableCells = new HashMap<>(); tableServiceResponse.getData() .forEach(tableData -> tableCells.computeIfAbsent(tableData.getPageInfo().getNumber(), tableCell -> new ArrayList<>()) .addAll(convertTableCells(tableData.getTableCells()))); @@ -30,11 +30,11 @@ public class CvTableParsingAdapter { } - private Collection convertTableCells(List tableCells) { + private Collection convertTableCells(List tableCells) { - List cvParsedTableCells = new ArrayList<>(); + List cvParsedTableCells = new ArrayList<>(); - tableCells.forEach(t -> cvParsedTableCells.add(com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell.builder() + tableCells.forEach(t -> cvParsedTableCells.add(TableCells.builder() .y0(t.getY0()) .x1(t.getX1()) .y1(t.getY1()) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/ImageServiceResponseAdapter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/ImageServiceResponseAdapter.java index 5517f3a..29898c9 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/ImageServiceResponseAdapter.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/ImageServiceResponseAdapter.java @@ -9,10 +9,10 @@ import java.util.Map; import org.springframework.stereotype.Service; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageType; import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage; +import 
com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.ImageType; import lombok.RequiredArgsConstructor; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/Classification.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/Classification.java index a743e5b..f482f81 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/Classification.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/Classification.java @@ -3,12 +3,9 @@ package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image import java.util.HashMap; import java.util.Map; -import com.dslplatform.json.CompiledJson; - import lombok.Data; @Data -@CompiledJson public class Classification { private Map probabilities = new HashMap<>(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/FilterGeometry.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/FilterGeometry.java index ea02ade..3f3c75b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/FilterGeometry.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/FilterGeometry.java @@ -1,14 +1,11 @@ package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image; -import com.dslplatform.json.CompiledJson; - import lombok.Data; @Data -@CompiledJson public class FilterGeometry { private ImageSize imageSize; - private Format imageFormat; + private ImageFormat imageFormat; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/Filters.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/Filters.java index 9258f89..cf3a755 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/Filters.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/Filters.java @@ -1,11 +1,8 @@ package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image; -import com.dslplatform.json.CompiledJson; - import lombok.Data; @Data -@CompiledJson public class Filters { private FilterGeometry geometry; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/Geometry.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/Geometry.java index 4395be7..7380c48 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/Geometry.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/Geometry.java @@ -1,11 +1,8 @@ package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image; -import com.dslplatform.json.CompiledJson; - import lombok.Data; @Data -@CompiledJson public class Geometry { private float width; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/Format.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/ImageFormat.java similarity index 71% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/Format.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/ImageFormat.java index 92263f8..842f1b7 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/Format.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/ImageFormat.java @@ -1,12 +1,9 @@ package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image; -import com.dslplatform.json.CompiledJson; - import lombok.Data; @Data -@CompiledJson -public class Format { +public class ImageFormat { private float quotient; private boolean tooTall; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/Metadata.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/ImageMetadata.java similarity index 77% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/Metadata.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/ImageMetadata.java index e354652..ab37c0e 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/Metadata.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/ImageMetadata.java @@ -1,12 +1,9 @@ package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image; -import com.dslplatform.json.CompiledJson; - import lombok.Data; @Data -@CompiledJson -public class Metadata { +public class ImageMetadata { private Classification classification; private Position position; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/ImageServiceResponse.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/ImageServiceResponse.java index 9083787..bad3701 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/ImageServiceResponse.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/ImageServiceResponse.java @@ -3,15 +3,12 @@ package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image import 
java.util.ArrayList; import java.util.List; -import com.dslplatform.json.CompiledJson; -import com.dslplatform.json.JsonAttribute; import com.fasterxml.jackson.annotation.JsonAlias; import com.fasterxml.jackson.annotation.JsonProperty; import lombok.Data; @Data -@CompiledJson public class ImageServiceResponse { private String dossierId; @@ -19,15 +16,13 @@ public class ImageServiceResponse { @JsonProperty(value = "imageMetadata") @JsonAlias("data") - @JsonAttribute(alternativeNames = {"imageMetadata"}) - private List data = new ArrayList<>(); + private List data = new ArrayList<>(); - private List dataCV = new ArrayList<>(); + private List dataCV = new ArrayList<>(); @JsonProperty(value = "imageMetadata") @JsonAlias("data") - @JsonAttribute(alternativeNames = {"imageMetadata"}) - public void setData(List data) {this.data = data;} + public void setData(List data) {this.data = data;} } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/ImageSize.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/ImageSize.java index cafd0b5..871e942 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/ImageSize.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/ImageSize.java @@ -1,11 +1,8 @@ package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image; -import com.dslplatform.json.CompiledJson; - import lombok.Data; @Data -@CompiledJson public class ImageSize { private float quotient; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/Position.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/Position.java index 297499e..595d9ce 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/Position.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/Position.java @@ -1,11 +1,8 @@ package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image; -import com.dslplatform.json.CompiledJson; - import lombok.Data; @Data -@CompiledJson public class Position { private float x1; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/Probability.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/Probability.java index 961f76a..7762e7d 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/Probability.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/image/Probability.java @@ -1,11 +1,8 @@ package com.knecon.fforesight.service.layoutparser.processor.adapter.model.image; -import com.dslplatform.json.CompiledJson; - import lombok.Data; @Data -@CompiledJson public class Probability { private boolean unconfident; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/table/CvParsedTableModel.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/table/CvParsedTableModel.java 
deleted file mode 100644 index 01902cf..0000000 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/table/CvParsedTableModel.java +++ /dev/null @@ -1,17 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table; - -import java.util.ArrayList; -import java.util.List; - -import com.dslplatform.json.CompiledJson; - -import lombok.Data; - -@Data -@CompiledJson -public class CvParsedTableModel { - - private CvParsedPageInfo pageInfo; - private List tableCells = new ArrayList<>(); - -} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/table/CvParsedPageInfo.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/table/PageInfo.java similarity index 70% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/table/CvParsedPageInfo.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/table/PageInfo.java index 5f4bde3..b098cb3 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/table/CvParsedPageInfo.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/table/PageInfo.java @@ -1,12 +1,9 @@ package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table; -import com.dslplatform.json.CompiledJson; - import lombok.Data; @Data -@CompiledJson -public class CvParsedPageInfo { +public class PageInfo { private int number; private int rotation; diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/CvParsedTableCell.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/table/PdfTableCell.java similarity index 72% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/CvParsedTableCell.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/table/PdfTableCell.java index 0b11042..808a572 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/CvParsedTableCell.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/table/PdfTableCell.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table; +package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table; import lombok.AllArgsConstructor; import lombok.Builder; @@ -9,7 +9,7 @@ import lombok.RequiredArgsConstructor; @Builder @AllArgsConstructor @RequiredArgsConstructor -public class CvParsedTableCell { +public class PdfTableCell { private float x0; private float y0; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/table/CvParsedTableCell.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/table/TableCells.java similarity index 73% rename from 
layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/table/CvParsedTableCell.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/table/TableCells.java index 7aa369f..8410179 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/table/CvParsedTableCell.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/table/TableCells.java @@ -1,12 +1,11 @@ package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table; -import com.dslplatform.json.CompiledJson; - +import lombok.Builder; import lombok.Data; @Data -@CompiledJson -public class CvParsedTableCell { +@Builder +public class TableCells { private float x0; private float y0; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/table/TableData.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/table/TableData.java new file mode 100644 index 0000000..f82911f --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/table/TableData.java @@ -0,0 +1,14 @@ +package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table; + +import java.util.ArrayList; +import java.util.List; + +import lombok.Data; + +@Data +public class TableData { + + private PageInfo pageInfo; + private List tableCells = new ArrayList<>(); + +} diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/table/TableServiceResponse.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/table/TableServiceResponse.java index 484f7e7..8d5e506 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/table/TableServiceResponse.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/model/table/TableServiceResponse.java @@ -3,12 +3,9 @@ package com.knecon.fforesight.service.layoutparser.processor.adapter.model.table import java.util.ArrayList; import java.util.List; -import com.dslplatform.json.CompiledJson; - import lombok.Data; @Data -@CompiledJson public class TableServiceResponse { private String dossierId; @@ -17,6 +14,6 @@ public class TableServiceResponse { private String targetFileExtension; private String responseFileExtension; - private List data = new ArrayList<>(); + private List data = new ArrayList<>(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/AbstractTextContainer.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/AbstractTextContainer.java deleted file mode 100644 index 84325a0..0000000 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/AbstractTextContainer.java +++ /dev/null @@ -1,71 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.dto; - -import java.awt.geom.Rectangle2D; - -import com.dslplatform.json.JsonAttribute; -import 
com.fasterxml.jackson.annotation.JsonIgnore; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextBlockOrientation; - -import lombok.AllArgsConstructor; -import lombok.Data; -import lombok.NoArgsConstructor; - -@Data -@AllArgsConstructor -@NoArgsConstructor -public abstract class AbstractTextContainer { - - protected float minX; - protected float maxX; - protected float minY; - protected float maxY; - protected String classification; - protected int page; - - private TextBlockOrientation orientation = TextBlockOrientation.NONE; - - - public abstract String getText(); - - - public boolean containsBlock(ClassificationTextBlock other) { - - return this.minX <= other.getPdfMinX() && this.maxX >= other.getPdfMaxX() && this.minY >= other.getPdfMinY() && this.maxY <= other.getPdfMaxY(); - } - - - public boolean contains(AbstractTextContainer other) { - - return this.minX <= other.minX && this.maxX >= other.maxX && this.minY >= other.minY && this.maxY <= other.maxY; - } - - - public boolean contains(Rectangle2D other) { - - return other.contains(minX, minY, getWidth(), getHeight()); - } - - - @JsonIgnore - @JsonAttribute(ignore = true) - public float getHeight() { - - return maxY - minY; - } - - - @JsonIgnore - @JsonAttribute(ignore = true) - public float getWidth() { - - return maxX - minX; - } - - - public boolean intersectsY(AbstractTextContainer atc) { - - return this.minY <= atc.getMaxY() && this.maxY >= atc.getMinY(); - } - -} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/TableCell.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/TableCell.java deleted file mode 100644 index 578371f..0000000 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/TableCell.java +++ /dev/null @@ -1,38 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table; - -import java.awt.geom.Point2D; -import java.util.ArrayList; -import java.util.List; - -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.Rectangle; - -import lombok.Data; -import lombok.EqualsAndHashCode; -import lombok.NoArgsConstructor; - -@SuppressWarnings("serial") -@Data -@EqualsAndHashCode(callSuper = true) -@NoArgsConstructor -public class TableCell extends Rectangle { - - private List textBlocks = new ArrayList<>(); - - private List headerCells = new ArrayList<>(); - - private boolean isHeaderCell; - - - public TableCell(Point2D topLeft, Point2D bottomRight) { - - super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), (float) (bottomRight.getY() - topLeft.getY())); - } - - - public void addTextBlock(ClassificationTextBlock textBlock) { - - textBlocks.add(textBlock); - } - -} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/AbstractPageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/AbstractPageBlock.java new file mode 100644 index 0000000..821a3f6 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/AbstractPageBlock.java @@ -0,0 +1,80 @@ +package com.knecon.fforesight.service.layoutparser.processor.classification.model; + +import com.fasterxml.jackson.annotation.JsonIgnore; +import 
com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; + +import lombok.AllArgsConstructor; +import lombok.Data; +import lombok.NoArgsConstructor; + +@Data +@AllArgsConstructor +@NoArgsConstructor +public abstract class AbstractPageBlock { + + @JsonIgnore + protected float minX; + @JsonIgnore + protected float maxX; + @JsonIgnore + protected float minY; + @JsonIgnore + protected float maxY; + @JsonIgnore + protected PageBlockType classification; + @JsonIgnore + protected int page; + + @JsonIgnore + private Orientation orientation = Orientation.NONE; + + + public abstract String getText(); + + + public boolean isHeadline() { + + return this instanceof TextPageBlock && this.getClassification() != null && this.getClassification().isHeadline(); + } + + + public boolean containsBlock(TextPageBlock other) { + + return this.minX <= other.getPdfMinX() && this.maxX >= other.getPdfMaxX() && this.minY >= other.getPdfMinY() && this.maxY <= other.getPdfMaxY(); + } + + + public boolean contains(AbstractPageBlock other) { + + return this.minX <= other.minX && this.maxX >= other.maxX && this.minY >= other.minY && this.maxY <= other.maxY; + } + + + public boolean contains(Rectangle other) { + + return page == other.getPage() && this.minX <= other.getTopLeft().getX() && this.maxX >= other.getTopLeft().getX() + other.getWidth() && this.minY <= other.getTopLeft() + .getY() && this.maxY >= other.getTopLeft().getY() + other.getHeight(); + } + + + @JsonIgnore + public float getHeight() { + + return maxY - minY; + } + + + @JsonIgnore + public float getWidth() { + + return maxX - minX; + } + + + public boolean intersectsY(AbstractPageBlock atc) { + + return this.minY <= atc.getMaxY() && this.maxY >= atc.getMinY(); + } + +} diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/ClassificationDocument.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationDocument.java similarity index 78% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/ClassificationDocument.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationDocument.java index df2af76..1ce5a1c 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/ClassificationDocument.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationDocument.java @@ -1,10 +1,11 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.dto; +package com.knecon.fforesight.service.layoutparser.processor.classification.model; import java.util.ArrayList; import java.util.List; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.StringFrequencyCounter; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.UnclassifiedText; +import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionGrid; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.StringFrequencyCounter; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.UnclassifiedText; import lombok.Data; import lombok.NoArgsConstructor; @@ -24,4 +25,7 @@ public class ClassificationDocument { private StringFrequencyCounter 
fontStyleCounter = new StringFrequencyCounter(); private boolean headlines; + private SectionGrid sectionGrid = new SectionGrid(); + private long rulesVersion; + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/ClassificationFooter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationFooter.java similarity index 70% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/ClassificationFooter.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationFooter.java index f72b1bc..2aad008 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/ClassificationFooter.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationFooter.java @@ -1,8 +1,8 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.dto; +package com.knecon.fforesight.service.layoutparser.processor.classification.model; import java.util.List; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; import lombok.AllArgsConstructor; import lombok.Data; @@ -11,6 +11,6 @@ import lombok.Data; @AllArgsConstructor public class ClassificationFooter { - private List textBlocks; + private List textBlocks; } diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/ClassificationHeader.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationHeader.java similarity index 70% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/ClassificationHeader.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationHeader.java index aef421a..be4447d 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/ClassificationHeader.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationHeader.java @@ -1,8 +1,8 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.dto; +package com.knecon.fforesight.service.layoutparser.processor.classification.model; import java.util.List; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; import lombok.AllArgsConstructor; import lombok.Data; @@ -11,6 +11,6 @@ import lombok.Data; @AllArgsConstructor public class ClassificationHeader { - private List textBlocks; + private List textBlocks; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/ClassificationPage.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationPage.java similarity index 87% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/ClassificationPage.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationPage.java index f1c2a61..91dfd79 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/ClassificationPage.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationPage.java @@ -1,11 +1,11 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.dto; +package com.knecon.fforesight.service.layoutparser.processor.classification.model; import java.util.ArrayList; import java.util.List; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.StringFrequencyCounter; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.StringFrequencyCounter; import lombok.Data; import lombok.NonNull; @@ -16,7 +16,7 @@ import lombok.RequiredArgsConstructor; public class ClassificationPage { @NonNull - private List textBlocks; + private List textBlocks; private List images = new ArrayList<>(); diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/ClassificationSection.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationSection.java similarity index 52% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/ClassificationSection.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationSection.java index 4e1b6fa..8de2007 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/ClassificationSection.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationSection.java @@ -1,38 +1,32 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.dto; +package com.knecon.fforesight.service.layoutparser.processor.classification.model; import java.util.ArrayList; import java.util.List; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Table; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock; import lombok.Data; import lombok.NoArgsConstructor; @Data @NoArgsConstructor -public class ClassificationSection implements Comparable { +public class ClassificationSection { - private List pageBlocks = new ArrayList<>(); + private List pageBlocks = new ArrayList<>(); 
private List images = new ArrayList<>(); private String headline; - public List getTables() { + public List getTables() { - List
tables = new ArrayList<>(); + List tables = new ArrayList<>(); pageBlocks.forEach(block -> { - if (block instanceof Table) { - tables.add((Table) block); + if (block instanceof TablePageBlock) { + tables.add((TablePageBlock) block); } }); return tables; } - - @Override - public int compareTo(Object o) { - - return 0; - } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/FloatFrequencyCounter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/FloatFrequencyCounter.java similarity index 98% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/FloatFrequencyCounter.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/FloatFrequencyCounter.java index f970120..80bcbf6 100755 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/FloatFrequencyCounter.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/FloatFrequencyCounter.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.dto; +package com.knecon.fforesight.service.layoutparser.processor.classification.model; import java.util.ArrayList; import java.util.Collections; @@ -9,9 +9,9 @@ import java.util.stream.Collectors; import lombok.Getter; +@Getter public class FloatFrequencyCounter { - @Getter Map countPerValue = new HashMap<>(); diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/TextBlockOrientation.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/Orientation.java similarity index 63% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/TextBlockOrientation.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/Orientation.java index 5ff10a4..75ae7bd 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/TextBlockOrientation.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/Orientation.java @@ -1,6 +1,6 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text; +package com.knecon.fforesight.service.layoutparser.processor.classification.model; -public enum TextBlockOrientation { +public enum Orientation { NONE, LEFT, diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/PageBlockType.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/PageBlockType.java new file mode 100644 index 0000000..9740979 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/PageBlockType.java @@ -0,0 +1,38 @@ +package com.knecon.fforesight.service.layoutparser.processor.classification.model; + +public 
enum PageBlockType { + H1, + H2, + H3, + H4, + H5, + H6, + HEADER, + FOOTER, + TITLE, + PARAGRAPH, + PARAGRAPH_BOLD, + PARAGRAPH_ITALIC, + PARAGRAPH_UNKNOWN, + OTHER, + TABLE; + + + public static PageBlockType getHeadlineType(int i) { + + return switch (i) { + case 1 -> PageBlockType.H1; + case 2 -> PageBlockType.H2; + case 3 -> PageBlockType.H3; + case 4 -> PageBlockType.H4; + case 5 -> PageBlockType.H5; + default -> PageBlockType.H6; + }; + } + + + public boolean isHeadline() { + + return this.equals(H1) || this.equals(H2) || this.equals(H3) || this.equals(H4) || this.equals(H5) || this.equals(H6); + } +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/image/ClassifiedImage.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/image/ClassifiedImage.java similarity index 81% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/image/ClassifiedImage.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/image/ClassifiedImage.java index 0ed4851..3670100 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/image/ClassifiedImage.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/image/ClassifiedImage.java @@ -1,8 +1,8 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.dto.image; +package com.knecon.fforesight.service.layoutparser.processor.classification.model.image; import java.awt.geom.Rectangle2D; -import 
com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageType; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.ImageType; import lombok.Data; import lombok.NonNull; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/Cell.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/Cell.java new file mode 100644 index 0000000..42ac4be --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/Cell.java @@ -0,0 +1,79 @@ +package com.knecon.fforesight.service.layoutparser.processor.classification.model.table; + +import java.awt.geom.Point2D; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.classification.utils.TextNormalizationUtilities; + +import lombok.Data; +import lombok.EqualsAndHashCode; +import lombok.NoArgsConstructor; + +@SuppressWarnings("serial") +@Data +@EqualsAndHashCode(callSuper = true) +@NoArgsConstructor +public class Cell extends Rectangle { + + private List textBlocks = new ArrayList<>(); + + private List headerCells = new ArrayList<>(); + + private boolean isHeaderCell; + + private static final int MIN_SIZE = 1; + + private int pageNumber; + + + public Cell(Point2D topLeft, Point2D bottomRight) { + + super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), (float) (bottomRight.getY() - topLeft.getY())); + } + + + public void addTextBlock(TextPageBlock textBlock) { 
+ + textBlocks.add(textBlock); + } + + + @Override + public String toString() { + + StringBuilder sb = new StringBuilder(); + + Iterator itty = textBlocks.iterator(); + TextPositionSequence previous = null; + while (itty.hasNext()) { + + TextPageBlock textBlock = itty.next(); + + for (TextPositionSequence word : textBlock.getSequences()) { + if (previous != null) { + if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) { + sb.append('\n'); + } else { + sb.append(' '); + } + } + sb.append(word.toString()); + previous = word; + } + + } + + return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString()).replaceAll("\n", " ").replaceAll(" {2}", " "); + } + + + public boolean hasMinimumSize() { + + return this.getHeight() >= MIN_SIZE && this.getWidth() >= MIN_SIZE; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/TableCellPosition.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/CellPosition.java similarity index 67% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/TableCellPosition.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/CellPosition.java index 42cb649..2b5ef89 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/TableCellPosition.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/CellPosition.java @@ -1,11 +1,11 @@ -package 
com.knecon.fforesight.service.layoutparser.processor.classification.dto.table; +package com.knecon.fforesight.service.layoutparser.processor.classification.model.table; import lombok.RequiredArgsConstructor; import lombok.Value; @Value @RequiredArgsConstructor -public class TableCellPosition implements Comparable { +public class CellPosition implements Comparable { int row; @@ -13,7 +13,7 @@ public class TableCellPosition implements Comparable { @Override - public int compareTo(TableCellPosition other) { + public int compareTo(CellPosition other) { int rowDiff = row - other.row; return rowDiff != 0 ? rowDiff : col - other.col; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/CleanRulings.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/CleanRulings.java similarity index 90% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/CleanRulings.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/CleanRulings.java index 9da8aa0..daa1055 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/CleanRulings.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/CleanRulings.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table; +package com.knecon.fforesight.service.layoutparser.processor.classification.model.table; import java.util.List; diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/Rectangle.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/Rectangle.java similarity index 99% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/Rectangle.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/Rectangle.java index c3323fd..4ce30df 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/Rectangle.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/Rectangle.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.dto; +package com.knecon.fforesight.service.layoutparser.processor.classification.model.table; import java.awt.geom.Point2D; import java.awt.geom.Rectangle2D; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/Ruling.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/Ruling.java similarity index 99% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/Ruling.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/Ruling.java index 109a06f..9759960 100644 
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/Ruling.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/Ruling.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table; +package com.knecon.fforesight.service.layoutparser.processor.classification.model.table; import java.awt.geom.Line2D; import java.awt.geom.Point2D; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/Table.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/TablePageBlock.java similarity index 70% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/Table.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/TablePageBlock.java index cee62ef..23e5631 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/table/Table.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/TablePageBlock.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.dto.table; +package com.knecon.fforesight.service.layoutparser.processor.classification.model.table; import java.awt.geom.Point2D; import java.util.ArrayList; @@ -7,20 +7,19 @@ import java.util.HashSet; import java.util.List; import java.util.Set; import 
java.util.TreeMap; -import java.util.stream.Collectors; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.Rectangle; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; import lombok.Getter; import lombok.Setter; import lombok.extern.slf4j.Slf4j; @Slf4j -public class Table extends AbstractTextContainer { +public class TablePageBlock extends AbstractPageBlock { - private final TreeMap cells = new TreeMap<>(); + private final TreeMap cells = new TreeMap<>(); private final int rotation; @Getter @@ -28,32 +27,29 @@ public class Table extends AbstractTextContainer { private String headline; private int unrotatedRowCount; private int unrotatedColCount; - private int rowCount = -1; - private int colCount = -1; - private List> rows; + private List> rows; - public Table(List cells, Rectangle area, int rotation) { + public TablePageBlock(List cells, Rectangle area, int rotation) { addCells(cells); minX = area.getLeft(); minY = area.getBottom(); maxX = area.getRight(); maxY = area.getTop(); - classification = "Table"; + classification = PageBlockType.TABLE; this.rotation = rotation; - } - public List> getRows() { + public List> getRows() { if (rows == null) { rows = computeRows(); // Ignore rows that does not contain any cells and values. 
- List> rowsToRemove = new ArrayList<>(); - for (List row : rows) { + List> rowsToRemove = new ArrayList<>(); + for (List row : rows) { if (row.size() == 1 && row.get(0).getTextBlocks().isEmpty()) { rowsToRemove.add(row); } @@ -70,19 +66,13 @@ public class Table extends AbstractTextContainer { public int getRowCount() { - if (rowCount == -1) { - rowCount = getRows().size(); - } - return rowCount; + return getRows().size(); } public int getColCount() { - if (colCount == -1) { - colCount = getRows().stream().mapToInt(List::size).max().orElse(0); - } - return colCount; + return getRows().stream().mapToInt(List::size).max().orElse(0); } @@ -100,16 +90,16 @@ public class Table extends AbstractTextContainer { // A bold cell is a header cell as long as every cell to the left/top is bold, too // we move from left to right and top to bottom for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) { - List rowCells = rows.get(rowIndex); + List rowCells = rows.get(rowIndex); if (rowCells.size() == 1) { continue; } for (int colIndex = 0; colIndex < rowCells.size(); colIndex++) { - TableCell cell = rowCells.get(colIndex); - List cellsToTheLeft = rowCells.subList(0, colIndex); - TableCell lastHeaderCell = null; - for (TableCell leftCell : cellsToTheLeft) { + Cell cell = rowCells.get(colIndex); + List cellsToTheLeft = rowCells.subList(0, colIndex); + Cell lastHeaderCell = null; + for (Cell leftCell : cellsToTheLeft) { if (leftCell.isHeaderCell()) { lastHeaderCell = leftCell; } else { @@ -119,7 +109,7 @@ public class Table extends AbstractTextContainer { if (lastHeaderCell != null) { cell.getHeaderCells().add(lastHeaderCell); } - List cellsToTheTop = new ArrayList<>(); + List cellsToTheTop = new ArrayList<>(); for (int i = 0; i < rowIndex; i++) { try { cellsToTheTop.add(rows.get(i).get(colIndex)); @@ -127,7 +117,7 @@ public class Table extends AbstractTextContainer { log.debug("No cell {} in row {}, ignoring.", colIndex, rowIndex); } } - for (TableCell topCell : cellsToTheTop) { 
+ for (Cell topCell : cellsToTheTop) { if (topCell.isHeaderCell()) { lastHeaderCell = topCell; } else { @@ -146,14 +136,14 @@ public class Table extends AbstractTextContainer { } - private List> computeRows() { + private List> computeRows() { - List> rows = new ArrayList<>(); + List> rows = new ArrayList<>(); if (rotation == 90) { for (int i = 0; i < unrotatedColCount; i++) { // rows - List lastRow = new ArrayList<>(); + List lastRow = new ArrayList<>(); for (int j = unrotatedRowCount - 1; j >= 0; j--) { // cols - TableCell cell = cells.get(new TableCellPosition(j, i)); + Cell cell = cells.get(new CellPosition(j, i)); if (cell != null) { lastRow.add(cell); } @@ -162,9 +152,9 @@ public class Table extends AbstractTextContainer { } } else if (rotation == 270) { for (int i = unrotatedColCount - 1; i >= 0; i--) { // rows - List lastRow = new ArrayList<>(); + List lastRow = new ArrayList<>(); for (int j = 0; j < unrotatedRowCount; j++) { // cols - TableCell cell = cells.get(new TableCellPosition(j, i)); + Cell cell = cells.get(new CellPosition(j, i)); if (cell != null) { lastRow.add(cell); } @@ -173,9 +163,9 @@ public class Table extends AbstractTextContainer { } } else { for (int i = 0; i < unrotatedRowCount; i++) { - List lastRow = new ArrayList<>(); + List lastRow = new ArrayList<>(); for (int j = 0; j < unrotatedColCount; j++) { - TableCell cell = cells.get(new TableCellPosition(i, j)); // JAVA_8 use getOrDefault() + Cell cell = cells.get(new CellPosition(i, j)); // JAVA_8 use getOrDefault() if (cell != null) { lastRow.add(cell); } @@ -189,18 +179,18 @@ public class Table extends AbstractTextContainer { } - private void add(TableCell chunk, int row, int col) { + private void add(Cell chunk, int row, int col) { unrotatedRowCount = Math.max(unrotatedRowCount, row + 1); unrotatedColCount = Math.max(unrotatedColCount, col + 1); - TableCellPosition cp = new TableCellPosition(row, col); + CellPosition cp = new CellPosition(row, col); cells.put(cp, chunk); } - private void 
addCells(List cells) { + private void addCells(List cells) { if (cells.isEmpty()) { return; @@ -208,7 +198,7 @@ public class Table extends AbstractTextContainer { cells.removeIf(cell -> cell.getWidth() < 1.1 || cell.getHeight() < 1.1); - List> rowsOfCells = calculateStructure(cells); + List> rowsOfCells = calculateStructure(cells); for (int i = 0; i < rowsOfCells.size(); i++) { for (int j = 0; j < rowsOfCells.get(i).size(); j++) { @@ -223,11 +213,11 @@ public class Table extends AbstractTextContainer { * Calculates the structure of the table. For spanning rows and columns multiple cells with the same values will be inserted. * * @param cells The found cells - * @return Table Structure + * @return TablePageBlock Structure */ - private List> calculateStructure(List cells) { + private List> calculateStructure(List cells) { - List> matrix = new ArrayList<>(); + List> matrix = new ArrayList<>(); if (cells.isEmpty()) { return matrix; @@ -242,30 +232,30 @@ public class Table extends AbstractTextContainer { uniqueY.add(c.getTop()); }); - var sortedUniqueX = uniqueX.stream().sorted().collect(Collectors.toList()); - var sortedUniqueY = uniqueY.stream().sorted().collect(Collectors.toList()); + var sortedUniqueX = uniqueX.stream().sorted().toList(); + var sortedUniqueY = uniqueY.stream().sorted().toList(); Float prevY = null; for (Float y : sortedUniqueY) { - List row = new ArrayList<>(); + List row = new ArrayList<>(); Float prevX = null; for (Float x : sortedUniqueX) { if (prevY != null && prevX != null) { - var cell = new TableCell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y)); + var cell = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y)); var intersectionCell = cells.stream().filter(c -> cell.intersects(c) && cell.overlapRatio(c) > 0.1f).findFirst(); - if (intersectionCell.isPresent()) { - cell.getTextBlocks().addAll(intersectionCell.get().getTextBlocks()); + intersectionCell.ifPresent(value -> 
cell.getTextBlocks().addAll(value.getTextBlocks())); + if (cell.hasMinimumSize()) { + row.add(cell); } - row.add(cell); } prevX = x; } - if (prevY != null && prevX != null) { + if (prevY != null && prevX != null && !row.isEmpty()) { matrix.add(row); } prevY = y; @@ -281,22 +271,22 @@ public class Table extends AbstractTextContainer { public String getText() { StringBuilder sb = new StringBuilder(); - List> rows = getRows(); + List> rows = getRows(); int i = 0; - for (List row : rows) { + for (List row : rows) { if (i != 0) { sb.append("\n"); } if (!row.isEmpty()) { boolean firstColumn = true; - for (TableCell column : row) { + for (Cell column : row) { if (!firstColumn) { sb.append(","); } if (column != null && column.getTextBlocks() != null) { boolean first = true; - for (ClassificationTextBlock textBlock : column.getTextBlocks()) { + for (TextPageBlock textBlock : column.getTextBlocks()) { if (!first) { sb.append("\n"); } @@ -317,18 +307,18 @@ public class Table extends AbstractTextContainer { public String getTextAsHtml() { StringBuilder sb = new StringBuilder(); - List> rows = getRows(); + List> rows = getRows(); sb.append("
"); int i = 0; - for (List row : rows) { + for (List row : rows) { sb.append("\n"); if (!row.isEmpty()) { - for (TableCell column : row) { + for (Cell column : row) { sb.append(i == 0 ? "\n
" : "\n"); if (column != null && column.getTextBlocks() != null) { boolean first = true; - for (ClassificationTextBlock textBlock : column.getTextBlocks()) { + for (TextPageBlock textBlock : column.getTextBlocks()) { if (!first) { sb.append("
"); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/RedTextPosition.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/RedTextPosition.java similarity index 89% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/RedTextPosition.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/RedTextPosition.java index 1266286..2a8de35 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/RedTextPosition.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/RedTextPosition.java @@ -1,10 +1,8 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text; +package com.knecon.fforesight.service.layoutparser.processor.classification.model.text; import org.apache.pdfbox.text.TextPosition; import org.springframework.beans.BeanUtils; -import com.dslplatform.json.CompiledJson; -import com.dslplatform.json.JsonAttribute; import com.fasterxml.jackson.annotation.JsonIgnore; import lombok.AllArgsConstructor; @@ -17,7 +15,6 @@ import lombok.SneakyThrows; @Builder @NoArgsConstructor @AllArgsConstructor -@CompiledJson public class RedTextPosition { private String textMatrix; @@ -39,17 +36,14 @@ public class RedTextPosition { // not used in reanalysis @JsonIgnore - @JsonAttribute(ignore = true) private float widthOfSpace; // not used in reanalysis @JsonIgnore - @JsonAttribute(ignore = true) private float fontSizeInPt; // not used in reanalysis 
@JsonIgnore - @JsonAttribute(ignore = true) private String fontName; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/SearchableText.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/SearchableText.java new file mode 100644 index 0000000..b8081be --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/SearchableText.java @@ -0,0 +1,48 @@ +package com.knecon.fforesight.service.layoutparser.processor.classification.model.text; + +import java.util.ArrayList; +import java.util.List; +import com.knecon.fforesight.service.layoutparser.processor.classification.utils.TextNormalizationUtilities; + +import lombok.Getter; + +@Getter +public class SearchableText { + + private final List sequences = new ArrayList<>(); + + + public void add(TextPositionSequence textPositionSequence) { + + sequences.add(textPositionSequence); + } + + + public void addAll(List textPositionSequences) { + + sequences.addAll(textPositionSequences); + } + + + @Override + public String toString() { + + return buildString(sequences); + } + + + public static String buildString(List sequences) { + + StringBuilder sb = new StringBuilder(); + for (TextPositionSequence word : sequences) { + sb.append(word); + sb.append(' '); + } + String text = sb.toString(); + text = TextNormalizationUtilities.removeHyphenLineBreaks(text); + text = TextNormalizationUtilities.removeLineBreaks(text); + text = TextNormalizationUtilities.removeRepeatingWhitespaces(text); + return text; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/SimplifiedSectionText.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/SimplifiedSectionText.java new file mode 100644 index 0000000..beb8d8f --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/SimplifiedSectionText.java @@ -0,0 +1,17 @@ +package com.knecon.fforesight.service.layoutparser.processor.classification.model.text; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; + +@Data +@Builder +@NoArgsConstructor +@AllArgsConstructor +public class SimplifiedSectionText { + + private int sectionNumber; + private String text; + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/SimplifiedText.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/SimplifiedText.java new file mode 100644 index 0000000..ea9b7ca --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/SimplifiedText.java @@ -0,0 +1,20 @@ +package com.knecon.fforesight.service.layoutparser.processor.classification.model.text; + +import java.util.ArrayList; +import java.util.List; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; + +@Data +@Builder +@NoArgsConstructor +@AllArgsConstructor +public class SimplifiedText { + + private int numberOfPages; + private List sectionTexts = new ArrayList<>(); + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/StringFrequencyCounter.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/StringFrequencyCounter.java similarity index 98% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/StringFrequencyCounter.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/StringFrequencyCounter.java index 4c6d3d3..a210116 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/StringFrequencyCounter.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/StringFrequencyCounter.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text; +package com.knecon.fforesight.service.layoutparser.processor.classification.model.text; import java.util.HashMap; import java.util.Map; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/TextDirection.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/TextDirection.java similarity index 89% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/TextDirection.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/TextDirection.java index ef31669..e555301 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/TextDirection.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/TextDirection.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text; +package com.knecon.fforesight.service.layoutparser.processor.classification.model.text; import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonValue; @@ -33,13 +33,6 @@ public enum TextDirection { } - @com.dslplatform.json.JsonValue - public float jsonValue() { - - return getDegrees(); - } - - @JsonCreator(mode = JsonCreator.Mode.DELEGATING) public static TextDirection fromDegrees(float degrees) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/ClassificationTextBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/TextPageBlock.java similarity index 84% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/ClassificationTextBlock.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/TextPageBlock.java index 1076cf8..cbf6214 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/ClassificationTextBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/TextPageBlock.java @@ -1,57 
+1,67 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text; +package com.knecon.fforesight.service.layoutparser.processor.classification.model.text; import java.util.ArrayList; import java.util.List; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer; +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.classification.utils.TextNormalizationUtilities; import lombok.AllArgsConstructor; import lombok.Builder; import lombok.Data; -import lombok.EqualsAndHashCode; import lombok.NoArgsConstructor; -@EqualsAndHashCode(callSuper = true) @AllArgsConstructor @Builder @Data @NoArgsConstructor -public class ClassificationTextBlock extends AbstractTextContainer { +public class TextPageBlock extends AbstractPageBlock { @Builder.Default private List sequences = new ArrayList<>(); + @JsonIgnore private int rotation; - private int indexOnPage; - + @JsonIgnore private String mostPopularWordFont; + @JsonIgnore private String mostPopularWordStyle; + @JsonIgnore private float mostPopularWordFontSize; + @JsonIgnore private float mostPopularWordHeight; + @JsonIgnore private float mostPopularWordSpaceWidth; + @JsonIgnore private float highestFontSize; - private String classification; + @JsonIgnore + private PageBlockType classification; + @JsonIgnore public TextDirection getDir() { return sequences.get(0).getDir(); } + + @JsonIgnore private float getPageHeight() { return sequences.get(0).getPageHeight(); } + @JsonIgnore private float getPageWidth() { return sequences.get(0).getPageWidth(); @@ -68,6 +78,7 @@ public class ClassificationTextBlock extends AbstractTextContainer { * * @return the minX value in pdf coordinate system */ + @JsonIgnore public 
float getPdfMinX() { if (getDir().getDegrees() == 90) { @@ -83,6 +94,7 @@ public class ClassificationTextBlock extends AbstractTextContainer { } } + /** * Returns the maxX value in pdf coordinate system. * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation. @@ -93,6 +105,7 @@ public class ClassificationTextBlock extends AbstractTextContainer { * * @return the maxX value in pdf coordinate system */ + @JsonIgnore public float getPdfMaxX() { if (getDir().getDegrees() == 90) { @@ -118,6 +131,7 @@ public class ClassificationTextBlock extends AbstractTextContainer { * * @return the minY value in pdf coordinate system */ + @JsonIgnore public float getPdfMinY() { if (getDir().getDegrees() == 90) { @@ -144,6 +158,7 @@ public class ClassificationTextBlock extends AbstractTextContainer { * * @return the maxY value in pdf coordinate system */ + @JsonIgnore public float getPdfMaxY() { if (getDir().getDegrees() == 90) { @@ -159,35 +174,34 @@ public class ClassificationTextBlock extends AbstractTextContainer { } - public ClassificationTextBlock(float minX, float maxX, float minY, float maxY, List sequences, int rotation, int indexOnPage) { - super(); - this.indexOnPage = indexOnPage; - super.minX = minX; - super.maxX = maxX; - super.minY = minY; - super.maxY = maxY; + public TextPageBlock(float minX, float maxX, float minY, float maxY, List sequences, int rotation) { + + this.minX = minX; + this.maxX = maxX; + this.minY = minY; + this.maxY = maxY; this.sequences = sequences; this.rotation = rotation; } - public ClassificationTextBlock union(TextPositionSequence r) { + public TextPageBlock union(TextPositionSequence r) { - ClassificationTextBlock union = this.copy(); + TextPageBlock union = this.copy(); union.add(r); return union; } - public ClassificationTextBlock union(ClassificationTextBlock r) { + public TextPageBlock union(TextPageBlock r) { - ClassificationTextBlock union = this.copy(); + TextPageBlock union = this.copy(); union.add(r); 
return union; } - public void add(ClassificationTextBlock r) { + public void add(TextPageBlock r) { if (r.getMinX() < minX) { minX = r.getMinX(); @@ -222,9 +236,9 @@ public class ClassificationTextBlock extends AbstractTextContainer { } - public ClassificationTextBlock copy() { + public TextPageBlock copy() { - return new ClassificationTextBlock(minX, maxX, minY, maxY, sequences, rotation, indexOnPage); + return new TextPageBlock(minX, maxX, minY, maxY, sequences, rotation); } @@ -263,6 +277,7 @@ public class ClassificationTextBlock extends AbstractTextContainer { @Override + @JsonIgnore public String getText() { StringBuilder sb = new StringBuilder(); @@ -283,4 +298,5 @@ public class ClassificationTextBlock extends AbstractTextContainer { return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString()); } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/TextPositionSequence.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/TextPositionSequence.java similarity index 93% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/TextPositionSequence.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/TextPositionSequence.java index ac525d5..fa1b243 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/TextPositionSequence.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/TextPositionSequence.java @@ -1,4 +1,4 @@ -package 
com.knecon.fforesight.service.layoutparser.processor.classification.dto.text; +package com.knecon.fforesight.service.layoutparser.processor.classification.model.text; import java.awt.geom.AffineTransform; import java.awt.geom.Point2D; @@ -8,8 +8,8 @@ import java.util.stream.Collectors; import org.apache.pdfbox.text.TextPosition; -import com.dslplatform.json.JsonAttribute; import com.fasterxml.jackson.annotation.JsonIgnore; +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; @@ -25,6 +25,7 @@ import lombok.extern.slf4j.Slf4j; @Builder @NoArgsConstructor @AllArgsConstructor +@JsonIgnoreProperties({"empty"}) public class TextPositionSequence implements CharSequence { public static final int HEIGHT_PADDING = 2; @@ -37,6 +38,12 @@ public class TextPositionSequence implements CharSequence { private float pageWidth; + public TextPositionSequence(int page) { + + this.page = page; + } + + public TextPositionSequence(List textPositions, int page) { this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList()); @@ -64,6 +71,14 @@ public class TextPositionSequence implements CharSequence { } + public char charAt(int index, boolean caseInSensitive) { + + RedTextPosition textPosition = textPositionAt(index); + String text = textPosition.getUnicode(); + return caseInSensitive ? 
text.toLowerCase().charAt(0) : text.charAt(0); + } + + @Override public TextPositionSequence subSequence(int start, int end) { @@ -126,7 +141,6 @@ public class TextPositionSequence implements CharSequence { * @return the text direction adjusted minX value */ @JsonIgnore - @JsonAttribute(ignore = true) public float getMinXDirAdj() { return textPositions.get(0).getXDirAdj(); @@ -141,7 +155,6 @@ public class TextPositionSequence implements CharSequence { * @return the text direction adjusted maxX value */ @JsonIgnore - @JsonAttribute(ignore = true) public float getMaxXDirAdj() { return textPositions.get(textPositions.size() - 1).getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidthDirAdj() + HEIGHT_PADDING; @@ -156,7 +169,6 @@ public class TextPositionSequence implements CharSequence { * @return the text direction adjusted minY value. The upper border of the bounding box of the word. */ @JsonIgnore - @JsonAttribute(ignore = true) public float getMinYDirAdj() { return textPositions.get(0).getYDirAdj() - getTextHeight(); @@ -171,7 +183,6 @@ public class TextPositionSequence implements CharSequence { * @return the text direction adjusted maxY value. The lower border of the bounding box of the word. 
*/ @JsonIgnore - @JsonAttribute(ignore = true) public float getMaxYDirAdj() { return textPositions.get(0).getYDirAdj(); @@ -180,7 +191,6 @@ public class TextPositionSequence implements CharSequence { @JsonIgnore - @JsonAttribute(ignore = true) public float getTextHeight() { return textPositions.get(0).getHeightDir() + HEIGHT_PADDING; @@ -188,7 +198,6 @@ public class TextPositionSequence implements CharSequence { @JsonIgnore - @JsonAttribute(ignore = true) public float getHeight() { return getMaxYDirAdj() - getMinYDirAdj(); @@ -196,7 +205,6 @@ public class TextPositionSequence implements CharSequence { @JsonIgnore - @JsonAttribute(ignore = true) public float getWidth() { return getMaxXDirAdj() - getMinXDirAdj(); @@ -204,7 +212,6 @@ public class TextPositionSequence implements CharSequence { @JsonIgnore - @JsonAttribute(ignore = true) public String getFont() { return textPositions.get(0).getFontName().toLowerCase().replaceAll(",bold", "").replaceAll(",italic", ""); @@ -212,7 +219,6 @@ public class TextPositionSequence implements CharSequence { @JsonIgnore - @JsonAttribute(ignore = true) public String getFontStyle() { String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase(); @@ -231,7 +237,6 @@ public class TextPositionSequence implements CharSequence { @JsonIgnore - @JsonAttribute(ignore = true) public float getFontSize() { return textPositions.get(0).getFontSizeInPt(); @@ -239,7 +244,6 @@ public class TextPositionSequence implements CharSequence { @JsonIgnore - @JsonAttribute(ignore = true) public float getSpaceWidth() { return textPositions.get(0).getWidthOfSpace(); @@ -256,11 +260,10 @@ public class TextPositionSequence implements CharSequence { * @return bounding box of the word in Pdf Coordinate System */ @JsonIgnore - @JsonAttribute(ignore = true) @SneakyThrows public Rectangle getRectangle() { - log.debug("ClassificationPage: '{}', Word: '{}', Rotation: '{}', textRotation {}", page, this, rotation, dir); + log.debug("Page: '{}', Word: '{}', 
Rotation: '{}', textRotation {}", page, this, rotation, dir); float textHeight = getTextHeight(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/UnclassifiedText.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/UnclassifiedText.java similarity index 73% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/UnclassifiedText.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/UnclassifiedText.java index 16be334..0d9bfb4 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/dto/text/UnclassifiedText.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/UnclassifiedText.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.dto.text; +package com.knecon.fforesight.service.layoutparser.processor.classification.model.text; import java.util.List; @@ -9,6 +9,6 @@ import lombok.Data; @AllArgsConstructor public class UnclassifiedText { - private List textBlocks; + private List textBlocks; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFAreaTextStripper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFAreaTextStripper.java deleted file mode 100644 index b799434..0000000 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFAreaTextStripper.java +++ /dev/null @@ -1,82 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.parsing; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; - -import org.apache.pdfbox.text.PDFTextStripperByArea; -import org.apache.pdfbox.text.TextPosition; - -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence; - -import lombok.Getter; -import lombok.Setter; - -public class PDFAreaTextStripper extends PDFTextStripperByArea { - - @Getter - private List textPositionSequences = new ArrayList<>(); - - @Setter - private int pageNumber; - - - public PDFAreaTextStripper() throws IOException { - - } - - - @Override - public void writeString(String text, List textPositions) throws IOException { - - int startIndex = 0; - for (int i = 0; i <= textPositions.size() - 1; i++) { - - if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0"))) { - startIndex++; - continue; - } - - // Strange but sometimes this is happening, for example: Metolachlor2.pdf - if (i > 0 && textPositions.get(i).getX() < textPositions.get(i - 1).getX()) { - List sublist = textPositions.subList(startIndex, i); - if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) { - textPositionSequences.add(new TextPositionSequence(sublist, pageNumber)); - } - startIndex = i; - } - - if (textPositions.get(i).getRotation() == 0 && i > 0 && textPositions.get(i).getX() > textPositions.get(i - 1).getEndX() + 1) { - List sublist = textPositions.subList(startIndex, i); - if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) { - 
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber)); - } - startIndex = i; - } - - if (i > 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0")) && i <= textPositions.size() - 2) { - List sublist = textPositions.subList(startIndex, i); - if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) { - textPositionSequences.add(new TextPositionSequence(sublist, pageNumber)); - } - startIndex = i + 1; - } - } - - List sublist = textPositions.subList(startIndex, textPositions.size()); - if (!sublist.isEmpty() && (sublist.get(sublist.size() - 1).getUnicode().equals(" ") || sublist.get(sublist.size() - 1).getUnicode().equals("\u00A0"))) { - sublist = sublist.subList(0, sublist.size() - 1); - } - if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0")))) { - textPositionSequences.add(new TextPositionSequence(sublist, pageNumber)); - } - super.writeString(text); - } - - - public void clearPositions() { - - textPositionSequences = new ArrayList<>(); - } - -} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFLinesTextStripper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFLinesTextStripper.java index ae5c958..223b0ba 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFLinesTextStripper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFLinesTextStripper.java @@ -34,31 +34,26 @@ import org.apache.pdfbox.pdmodel.PDDocument; 
import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.text.TextPosition; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.RedTextPosition; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.RedTextPosition; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence; import lombok.Getter; import lombok.Setter; import lombok.extern.slf4j.Slf4j; +@Getter @Slf4j public class PDFLinesTextStripper extends PDFTextStripper { - @Getter private final List textPositionSequences = new ArrayList<>(); - @Getter private final List rulings = new ArrayList<>(); private final List graphicsPath = new ArrayList<>(); @Setter protected PDPage pdpage; - @Getter private int minCharWidth; - @Getter private int maxCharWidth; - @Getter private int minCharHeight; - @Getter private int maxCharHeight; private float path_x; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/BlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/BlockificationService.java index 244f6ab..c657ffc 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/BlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/BlockificationService.java @@ -9,14 +9,14 @@ import 
java.util.List; import org.springframework.stereotype.Service; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.FloatFrequencyCounter; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextBlockOrientation; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.StringFrequencyCounter; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.FloatFrequencyCounter; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.Orientation; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.StringFrequencyCounter; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.classification.utils.RulingTextDirAdjustUtil; @Service @@ -29,18 +29,18 @@ public class BlockificationService { /** * This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not 
split by the conditions. * This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this! - * Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling. + * Rulings (TablePageBlock lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling. * * @param textPositions The words of a page. * @param horizontalRulingLines Horizontal table lines. * @param verticalRulingLines Vertical table lines. - * @return ClassificationPage object that contains the Textblock and text statistics. + * @return Page object that contains the Textblock and text statistics. */ public ClassificationPage blockify(List textPositions, List horizontalRulingLines, List verticalRulingLines) { int indexOnPage = 0; List chunkWords = new ArrayList<>(); - List chunkBlockList1 = new ArrayList<>(); + List chunkBlockList = new ArrayList<>(); float minX = 1000, maxX = 0, minY = 1000, maxY = 0; TextPositionSequence prev = null; @@ -59,27 +59,27 @@ public class BlockificationService { if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) { - TextBlockOrientation prevOrientation = null; - if (!chunkBlockList1.isEmpty()) { - prevOrientation = chunkBlockList1.get(chunkBlockList1.size() - 1).getOrientation(); + Orientation prevOrientation = null; + if (!chunkBlockList.isEmpty()) { + prevOrientation = chunkBlockList.get(chunkBlockList.size() - 1).getOrientation(); } - ClassificationTextBlock cb1 = buildTextBlock(chunkWords, indexOnPage); + TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage); indexOnPage++; - chunkBlockList1.add(cb1); + chunkBlockList.add(cb1); chunkWords = new ArrayList<>(); if (splitByX && !isSplitByRuling) { wasSplitted = true; - cb1.setOrientation(TextBlockOrientation.LEFT); + cb1.setOrientation(Orientation.LEFT); splitX1 = word.getMinXDirAdj(); } else if (newLineAfterSplit && 
!isSplitByRuling) { wasSplitted = false; - cb1.setOrientation(TextBlockOrientation.RIGHT); + cb1.setOrientation(Orientation.RIGHT); splitX1 = null; - } else if (prevOrientation != null && prevOrientation.equals(TextBlockOrientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) { - cb1.setOrientation(TextBlockOrientation.LEFT); + } else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) { + cb1.setOrientation(Orientation.LEFT); } minX = 1000; @@ -106,19 +106,19 @@ public class BlockificationService { } } - ClassificationTextBlock cb1 = buildTextBlock(chunkWords, indexOnPage); + TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage); if (cb1 != null) { - chunkBlockList1.add(cb1); + chunkBlockList.add(cb1); } - Iterator itty = chunkBlockList1.iterator(); + Iterator itty = chunkBlockList.iterator(); - ClassificationTextBlock previousLeft = null; - ClassificationTextBlock previousRight = null; + TextPageBlock previousLeft = null; + TextPageBlock previousRight = null; while (itty.hasNext()) { - ClassificationTextBlock block = (ClassificationTextBlock) itty.next(); + TextPageBlock block = (TextPageBlock) itty.next(); - if (previousLeft != null && block.getOrientation().equals(TextBlockOrientation.LEFT)) { + if (previousLeft != null && block.getOrientation().equals(Orientation.LEFT)) { if (previousLeft.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousLeft.getMinY()) { previousLeft.add(block); itty.remove(); @@ -126,7 +126,7 @@ public class BlockificationService { } } - if (previousRight != null && block.getOrientation().equals(TextBlockOrientation.RIGHT)) { + if (previousRight != null && block.getOrientation().equals(Orientation.RIGHT)) { if (previousRight.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > 
previousRight.getMinY()) { previousRight.add(block); itty.remove(); @@ -134,21 +134,21 @@ public class BlockificationService { } } - if (block.getOrientation().equals(TextBlockOrientation.LEFT)) { + if (block.getOrientation().equals(Orientation.LEFT)) { previousLeft = block; - } else if (block.getOrientation().equals(TextBlockOrientation.RIGHT)) { + } else if (block.getOrientation().equals(Orientation.RIGHT)) { previousRight = block; } } - itty = chunkBlockList1.iterator(); - ClassificationTextBlock previous = null; + itty = chunkBlockList.iterator(); + TextPageBlock previous = null; while (itty.hasNext()) { - ClassificationTextBlock block = (ClassificationTextBlock) itty.next(); + TextPageBlock block = (TextPageBlock) itty.next(); - if (previous != null && previous.getOrientation().equals(TextBlockOrientation.LEFT) && block.getOrientation().equals(TextBlockOrientation.LEFT) && equalsWithThreshold(block.getMaxY(), - previous.getMaxY()) || previous != null && previous.getOrientation().equals(TextBlockOrientation.LEFT) && block.getOrientation() - .equals(TextBlockOrientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) { + if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(), + previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation() + .equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) { previous.add(block); itty.remove(); continue; @@ -157,7 +157,7 @@ public class BlockificationService { previous = block; } - return new ClassificationPage(chunkBlockList1); + return new ClassificationPage(chunkBlockList); } @@ -167,9 +167,9 @@ public class BlockificationService { } - private ClassificationTextBlock buildTextBlock(List wordBlockList, int indexOnPage) { + private TextPageBlock buildTextBlock(List wordBlockList, int indexOnPage) { - 
ClassificationTextBlock textBlock = null; + TextPageBlock textBlock = null; FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter(); FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter(); @@ -186,15 +186,14 @@ public class BlockificationService { styleFrequencyCounter.add(wordBlock.getFontStyle()); if (textBlock == null) { - textBlock = new ClassificationTextBlock(wordBlock.getMinXDirAdj(), + textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(), wordBlock.getMaxXDirAdj(), wordBlock.getMinYDirAdj(), wordBlock.getMaxYDirAdj(), wordBlockList, - wordBlock.getRotation(), - indexOnPage); + wordBlock.getRotation()); } else { - ClassificationTextBlock spatialEntity = textBlock.union(wordBlock); + TextPageBlock spatialEntity = textBlock.union(wordBlock); textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight()); } } @@ -254,7 +253,7 @@ public class BlockificationService { verticalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), - word.getPageHeight()); // + word.getPageHeight()); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/BodyTextFrameService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/BodyTextFrameService.java index 2860222..ded5d93 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/BodyTextFrameService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/BodyTextFrameService.java @@ -1,52 +1,56 @@ package com.knecon.fforesight.service.layoutparser.processor.classification.service; + import java.util.List; import 
org.springframework.stereotype.Service; -import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.FloatFrequencyCounter; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.TableCell; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Table; +import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.FloatFrequencyCounter; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.classification.utils.PositionUtils; @Service public class BodyTextFrameService { + private static final float APPROXIMATE_HEADER_LINE_COUNT = 2.9f; + + /** - * Adjusts and sets the body text frame to a classificationPage. - * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the classificationPage rotation. + * Adjusts and sets the body text frame to a page. 
+ * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation. * 0 -> LowerLeft * 90 -> UpperLeft * 180 -> UpperRight * 270 -> LowerRight - * The aspect ratio of the classificationPage is also regarded. + * The aspect ratio of the page is also regarded. * - * @param classificationPage The classificationPage + * @param page The page * @param bodyTextFrame frame that contains the main text on portrait pages * @param landscapeBodyTextFrame frame that contains the main text on landscape pages */ - public void setBodyTextFrameAdjustedToPage(ClassificationPage classificationPage, Rectangle bodyTextFrame, Rectangle landscapeBodyTextFrame) { + public void setBodyTextFrameAdjustedToPage(ClassificationPage page, Rectangle bodyTextFrame, Rectangle landscapeBodyTextFrame) { - Rectangle textFrame = classificationPage.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame; + Rectangle textFrame = page.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame; - if (classificationPage.getPageWidth() > classificationPage.getPageHeight() && classificationPage.getRotation() == 270) { - textFrame = new Rectangle(new Point(textFrame.getTopLeft().getY(), classificationPage.getPageHeight() - textFrame.getTopLeft().getX() - textFrame.getWidth()), + if (page.getPageWidth() > page.getPageHeight() && page.getRotation() == 270) { + textFrame = new Rectangle(new Point(textFrame.getTopLeft().getY(), page.getPageHeight() - textFrame.getTopLeft().getX() - textFrame.getWidth()), textFrame.getHeight(), textFrame.getWidth(), 0); - } else if (classificationPage.getPageWidth() > classificationPage.getPageHeight() && classificationPage.getRotation() != 0) { - textFrame = new Rectangle(new Point(textFrame.getTopLeft().getY(), textFrame.getTopLeft().getX()), textFrame.getHeight(), textFrame.getWidth(), classificationPage.getPageNumber()); - } else if (classificationPage.getRotation() == 180) { - textFrame = new Rectangle(new Point(textFrame.getTopLeft().getX(), 
classificationPage.getPageHeight() - textFrame.getTopLeft().getY() - textFrame.getHeight()), + } else if (page.getPageWidth() > page.getPageHeight() && page.getRotation() != 0) { + textFrame = new Rectangle(new Point(textFrame.getTopLeft().getY(), textFrame.getTopLeft().getX()), textFrame.getHeight(), textFrame.getWidth(), page.getPageNumber()); + } else if (page.getRotation() == 180) { + textFrame = new Rectangle(new Point(textFrame.getTopLeft().getX(), page.getPageHeight() - textFrame.getTopLeft().getY() - textFrame.getHeight()), textFrame.getWidth(), textFrame.getHeight(), 0); } - classificationPage.setBodyTextFrame(textFrame); + page.setBodyTextFrame(textFrame); } @@ -59,50 +63,50 @@ public class BodyTextFrameService { * 270 -> LowerRight * The aspect ratio of the page is also regarded. * - * @param classificationPages List of all classificationPages + * @param pages List of all pages * @param documentFontSizeCounter Statistics of the document * @param landscape Calculate for landscape or portrait * @return Rectangle of the text frame */ - public Rectangle calculateBodyTextFrame(List classificationPages, FloatFrequencyCounter documentFontSizeCounter, boolean landscape) { + public Rectangle calculateBodyTextFrame(List pages, FloatFrequencyCounter documentFontSizeCounter, boolean landscape) { BodyTextFrameExpansionsRectangle expansionsRectangle = new BodyTextFrameExpansionsRectangle(); - for (ClassificationPage classificationPage : classificationPages) { + for (ClassificationPage page : pages) { - if (classificationPage.getTextBlocks().isEmpty() || landscape != classificationPage.isLandscape()) { + if (page.getTextBlocks().isEmpty() || landscape != page.isLandscape()) { continue; } - for (AbstractTextContainer container : classificationPage.getTextBlocks()) { + for (AbstractPageBlock container : page.getTextBlocks()) { - if (container instanceof ClassificationTextBlock) { - ClassificationTextBlock textBlock = (ClassificationTextBlock) container; + if (container 
instanceof TextPageBlock) { + TextPageBlock textBlock = (TextPageBlock) container; if (textBlock.getMostPopularWordFont() == null || textBlock.getMostPopularWordStyle() == null) { continue; } float approxLineCount = PositionUtils.getApproxLineCount(textBlock); - if (approxLineCount < 2.9f) { + if (approxLineCount < APPROXIMATE_HEADER_LINE_COUNT) { continue; } if (documentFontSizeCounter.getMostPopular() != null && textBlock.getMostPopularWordFontSize() >= documentFontSizeCounter.getMostPopular()) { - expandRectangle(textBlock, classificationPage, expansionsRectangle); + expandRectangle(textBlock, page, expansionsRectangle); } } - if (container instanceof Table) { - Table table = (Table) container; - for (List row : table.getRows()) { - for (TableCell cell : row) { + if (container instanceof TablePageBlock) { + TablePageBlock table = (TablePageBlock) container; + for (List row : table.getRows()) { + for (Cell cell : row) { if (cell == null || cell.getTextBlocks() == null) { continue; } - for (ClassificationTextBlock textBlock : cell.getTextBlocks()) { - expandRectangle(textBlock, classificationPage, expansionsRectangle); + for (TextPageBlock textBlock : cell.getTextBlocks()) { + expandRectangle(textBlock, page, expansionsRectangle); } } } @@ -116,9 +120,9 @@ public class BodyTextFrameService { } - private void expandRectangle(ClassificationTextBlock textBlock, ClassificationPage classificationPage, BodyTextFrameExpansionsRectangle expansionsRectangle) { + private void expandRectangle(TextPageBlock textBlock, ClassificationPage page, BodyTextFrameExpansionsRectangle expansionsRectangle) { - if (classificationPage.getPageWidth() > classificationPage.getPageHeight() && classificationPage.getRotation() != 0) { + if (page.getPageWidth() > page.getPageHeight() && page.getRotation() != 0) { if (textBlock.getPdfMinY() < expansionsRectangle.minX) { expansionsRectangle.minX = textBlock.getPdfMinY(); } diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/ClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/ClassificationService.java index 02cbb83..263b7eb 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/ClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/ClassificationService.java @@ -6,10 +6,11 @@ import java.util.regex.Pattern; import org.springframework.stereotype.Service; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationDocument; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.classification.utils.PositionUtils; import lombok.RequiredArgsConstructor; @@ -31,43 +32,43 @@ public 
class ClassificationService { log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue()); - for (ClassificationPage classificationPage : document.getPages()) { - bodyTextFrameService.setBodyTextFrameAdjustedToPage(classificationPage, bodyTextFrame, landscapeBodyTextFrame); - classifyPage(classificationPage, document, headlineFontSizes); + for (ClassificationPage page : document.getPages()) { + bodyTextFrameService.setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame); + classifyPage(page, document, headlineFontSizes); } } - public void classifyPage(ClassificationPage classificationPage, ClassificationDocument document, List headlineFontSizes) { + public void classifyPage(ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { - for (AbstractTextContainer textBlock : classificationPage.getTextBlocks()) { - if (textBlock instanceof ClassificationTextBlock) { - classifyBlock((ClassificationTextBlock) textBlock, classificationPage, document, headlineFontSizes); + for (AbstractPageBlock textBlock : page.getTextBlocks()) { + if (textBlock instanceof TextPageBlock) { + classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes); } } } - public void classifyBlock(ClassificationTextBlock textBlock, ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { + public void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { var bodyTextFrame = page.getBodyTextFrame(); if (document.getFontSizeCounter().getMostPopular() == null) { - textBlock.setClassification("Other"); + textBlock.setClassification(PageBlockType.OTHER); return; } if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) { - 
textBlock.setClassification("Header"); + textBlock.setClassification(PageBlockType.HEADER); } else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) { - textBlock.setClassification("Footer"); + textBlock.setClassification(PageBlockType.FOOTER); } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks() .size() == 1)) { if (!Pattern.matches("[0-9]+", textBlock.toString())) { - textBlock.setClassification("Title"); + textBlock.setClassification(PageBlockType.TITLE); } } else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter() .getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && (textBlock.getMostPopularWordStyle().equals("bold") || !document.getFontStyleCounter() @@ -80,36 +81,34 @@ public class ClassificationService { for (int i = 1; i <= headlineFontSizes.size(); i++) { if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) { - textBlock.setClassification("H " + i); + textBlock.setClassification(PageBlockType.getHeadlineType(i)); document.setHeadlines(true); } } - } else if (!textBlock.getText().startsWith("Table ") && !textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, - textBlock) && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter() - .getMostPopular() - .equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences() + } else if (!textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle() + .equals("bold") 
&& !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences() .get(0) .getTextPositions() .get(0) .getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { - textBlock.setClassification("H " + (headlineFontSizes.size() + 1)); + textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1)); document.setHeadlines(true); } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter() .getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) { - textBlock.setClassification("TextBlock Bold"); + textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD); } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont() .equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle() .equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) { - textBlock.setClassification("TextBlock"); + textBlock.setClassification(PageBlockType.PARAGRAPH); } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter() .getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter() .getMostPopular() .equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) { - textBlock.setClassification("TextBlock Italic"); + textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC); } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) { - textBlock.setClassification("TextBlock Unknown"); + textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN); } else { - textBlock.setClassification("Other"); + 
textBlock.setClassification(PageBlockType.OTHER); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/PdfParsingService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/PdfParsingService.java index cde9a8b..e6c22e2 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/PdfParsingService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/PdfParsingService.java @@ -9,16 +9,16 @@ import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.springframework.stereotype.Service; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationDocument; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage; import com.knecon.fforesight.service.layoutparser.processor.adapter.ImageServiceResponseAdapter; +import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableCells; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage; +import 
com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.CleanRulings; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.classification.parsing.PDFLinesTextStripper; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CleanRulings; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell; import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; @@ -35,7 +35,7 @@ public class PdfParsingService { private final ImageServiceResponseAdapter imageServiceResponseAdapter; - public ClassificationDocument parseDocument(PDDocument originDocument, Map> pdfTableCells, Map> pdfImages) { + public ClassificationDocument parseDocument(PDDocument originDocument, Map> pdfTableCells, Map> pdfImages) { ClassificationDocument document = new ClassificationDocument(); List classificationPages = new ArrayList<>(); @@ -56,7 +56,7 @@ public class PdfParsingService { @SneakyThrows private void parsePage(Map> pdfImages, PDDocument pdDocument, - Map> pdfTableCells, + Map> pdfTableCells, ClassificationDocument document, List classificationPages, int pageNumber) { @@ -93,7 +93,7 @@ public class PdfParsingService { imageServiceResponseAdapter.findOcr(classificationPage); } - tableExtractionService.removeRedundantTableCells(cleanRulings, classificationPage); + tableExtractionService.extractTables(cleanRulings, classificationPage); buildPageStatistics(classificationPage); increaseDocumentStatistics(classificationPage, document); @@ -115,12 +115,12 @@ public class 
PdfParsingService { private void buildPageStatistics(ClassificationPage classificationPage) { // Collect all statistics for the classificationPage, except from blocks inside tables, as tables will always be added to BodyTextFrame. - for (AbstractTextContainer textBlock : classificationPage.getTextBlocks()) { - if (textBlock instanceof ClassificationTextBlock) { - if (((ClassificationTextBlock) textBlock).getSequences() == null) { + for (AbstractPageBlock textBlock : classificationPage.getTextBlocks()) { + if (textBlock instanceof TextPageBlock) { + if (((TextPageBlock) textBlock).getSequences() == null) { continue; } - for (TextPositionSequence word : ((ClassificationTextBlock) textBlock).getSequences()) { + for (TextPositionSequence word : ((TextPageBlock) textBlock).getSequences()) { classificationPage.getTextHeightCounter().add(word.getTextHeight()); classificationPage.getFontCounter().add(word.getFont()); classificationPage.getFontSizeCounter().add(word.getFontSize()); @@ -132,3 +132,5 @@ public class PdfParsingService { } } + + diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/RulingCleaningService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/RulingCleaningService.java index 5dd0985..8e8de6f 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/RulingCleaningService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/RulingCleaningService.java @@ -12,9 +12,9 @@ import java.util.Map; import org.springframework.stereotype.Service; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CleanRulings; -import 
com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CvParsedTableCell; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableCells; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.CleanRulings; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons; import lombok.RequiredArgsConstructor; @@ -25,7 +25,7 @@ import lombok.extern.slf4j.Slf4j; @RequiredArgsConstructor public class RulingCleaningService { - public CleanRulings getCleanRulings(List cvParsedTableCells, List rulings, float minCharWidth, float maxCharHeight) { + public CleanRulings getCleanRulings(List tableCells, List rulings, float minCharWidth, float maxCharHeight) { if (!rulings.isEmpty()) { snapPoints(rulings, minCharWidth, maxCharHeight); @@ -38,7 +38,7 @@ public class RulingCleaningService { } } if (vrs.isEmpty()) { - vrs.addAll(extractVerticalRulings(cvParsedTableCells)); + vrs.addAll(extractVerticalRulings(tableCells)); } List verticalRulingLines = collapseOrientedRulings(vrs); @@ -49,7 +49,7 @@ public class RulingCleaningService { } } if (hrs.isEmpty()) { - hrs.addAll(extractHorizontalRulings(cvParsedTableCells)); + hrs.addAll(extractHorizontalRulings(tableCells)); } List horizontalRulingLines = collapseOrientedRulings(hrs); @@ -132,12 +132,12 @@ public class RulingCleaningService { } - private Collection extractVerticalRulings(List cvParsedTableCells) { + private Collection extractVerticalRulings(List cvParsedTableCells) { List vrs = new ArrayList<>(); if (cvParsedTableCells != null) { - for (CvParsedTableCell cvParsedTableCell : cvParsedTableCells) { + for (TableCells cvParsedTableCell : cvParsedTableCells) { Ruling leftLine = createRuling(cvParsedTableCell.getX0(), 
cvParsedTableCell.getX0(), cvParsedTableCell.getY0(), cvParsedTableCell.getY1()); Ruling rightLine = createRuling(cvParsedTableCell.getX1(), cvParsedTableCell.getX1(), cvParsedTableCell.getY0(), cvParsedTableCell.getY1()); vrs.add(leftLine); @@ -148,12 +148,12 @@ public class RulingCleaningService { } - private Collection extractHorizontalRulings(List cvParsedTableCells) { + private Collection extractHorizontalRulings(List cvParsedTableCells) { List hrs = new ArrayList<>(); if (cvParsedTableCells != null) { - for (CvParsedTableCell cvParsedTableCell : cvParsedTableCells) { + for (TableCells cvParsedTableCell : cvParsedTableCells) { Ruling topLine = createRuling(cvParsedTableCell.getX0(), cvParsedTableCell.getX1(), cvParsedTableCell.getY1(), cvParsedTableCell.getY1()); Ruling baseLine = createRuling(cvParsedTableCell.getX0(), cvParsedTableCell.getX1(), cvParsedTableCell.getY0(), cvParsedTableCell.getY0()); hrs.add(topLine); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/SectionsBuilderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/SectionsBuilderService.java index a8309ba..044e98b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/SectionsBuilderService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/SectionsBuilderService.java @@ -9,17 +9,18 @@ import java.util.stream.Collectors; import org.springframework.stereotype.Service; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationDocument; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationFooter; -import 
com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationSection; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationHeader; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.UnclassifiedText; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.TableCell; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Table; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationFooter; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationHeader; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationSection; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock; +import 
com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.UnclassifiedText; import lombok.extern.slf4j.Slf4j; @@ -29,44 +30,44 @@ public class SectionsBuilderService { public void buildSections(ClassificationDocument document) { - List chunkWords = new ArrayList<>(); + List chunkWords = new ArrayList<>(); List chunkBlockList = new ArrayList<>(); List headers = new ArrayList<>(); List footers = new ArrayList<>(); List unclassifiedTexts = new ArrayList<>(); - AbstractTextContainer prev = null; + AbstractPageBlock prev = null; String lastHeadline = ""; - Table previousTable = null; - for (ClassificationPage classificationPage : document.getPages()) { - List header = new ArrayList<>(); - List footer = new ArrayList<>(); - List unclassifiedText = new ArrayList<>(); - for (AbstractTextContainer current : classificationPage.getTextBlocks()) { + TablePageBlock previousTable = null; + for (ClassificationPage page : document.getPages()) { + List header = new ArrayList<>(); + List footer = new ArrayList<>(); + List unclassifiedText = new ArrayList<>(); + for (AbstractPageBlock current : page.getTextBlocks()) { if (current.getClassification() == null) { continue; } - current.setPage(classificationPage.getPageNumber()); + current.setPage(page.getPageNumber()); - if (current.getClassification().equals("Header")) { - header.add((ClassificationTextBlock) current); + if (current.getClassification().equals(PageBlockType.HEADER)) { + header.add((TextPageBlock) current); continue; } - if (current.getClassification().equals("Footer")) { - footer.add((ClassificationTextBlock) current); + if (current.getClassification().equals(PageBlockType.FOOTER)) { + footer.add((TextPageBlock) current); continue; } - if (current.getClassification().equals("Other")) { - unclassifiedText.add((ClassificationTextBlock) current); + if 
(current.getClassification().equals(PageBlockType.OTHER)) { + unclassifiedText.add((TextPageBlock) current); continue; } - if (prev != null && current.getClassification().startsWith("H ") && !prev.getClassification().startsWith("H ") || !document.isHeadlines()) { + if (prev != null && current.getClassification().isHeadline() && !prev.getClassification().isHeadline() || !document.isHeadlines()) { ClassificationSection chunkBlock = buildTextBlock(chunkWords, lastHeadline); chunkBlock.setHeadline(lastHeadline); if (document.isHeadlines()) { @@ -78,7 +79,7 @@ public class SectionsBuilderService { previousTable = chunkBlock.getTables().get(chunkBlock.getTables().size() - 1); } } - if (current instanceof Table table) { + if (current instanceof TablePageBlock table) { // Distribute header information for subsequent tables mergeTableMetadata(table, previousTable); previousTable = table; @@ -106,15 +107,14 @@ public class SectionsBuilderService { document.setHeaders(headers); document.setFooters(footers); document.setUnclassifiedTexts(unclassifiedTexts); - addImagesToSections(document); } - private void addImagesToSections(ClassificationDocument document) { + public void addImagesToSections(ClassificationDocument document) { Map> sectionMap = new HashMap<>(); for (ClassificationSection section : document.getSections()) { - for (AbstractTextContainer container : section.getPageBlocks()) { + for (AbstractPageBlock container : section.getPageBlocks()) { List sectionsOnPage = sectionMap.computeIfAbsent(container.getPage(), c -> new ArrayList<>()); if (sectionsOnPage.contains(section)) { @@ -138,11 +138,11 @@ public class SectionsBuilderService { sectionMap.computeIfAbsent(1, x -> new ArrayList<>()).add(section); } - for (ClassificationPage classificationPage : document.getPages()) { - for (ClassifiedImage image : classificationPage.getImages()) { - List sectionsOnPage = sectionMap.get(classificationPage.getPageNumber()); + for (ClassificationPage page : document.getPages()) { + 
for (ClassifiedImage image : page.getImages()) { + List sectionsOnPage = sectionMap.get(page.getPageNumber()); if (sectionsOnPage == null) { - int i = classificationPage.getPageNumber(); + int i = page.getPageNumber(); while (sectionsOnPage == null) { sectionsOnPage = sectionMap.get(i); i--; @@ -154,8 +154,8 @@ public class SectionsBuilderService { Float xMax = null; Float yMax = null; - for (AbstractTextContainer abs : section.getPageBlocks()) { - if (abs.getPage() != classificationPage.getPageNumber()) { + for (AbstractPageBlock abs : section.getPageBlocks()) { + if (abs.getPage() != page.getPageNumber()) { continue; } @@ -212,23 +212,23 @@ public class SectionsBuilderService { } - private void mergeTableMetadata(Table currentTable, Table previousTable) { + private void mergeTableMetadata(TablePageBlock currentTable, TablePageBlock previousTable) { // Distribute header information for subsequent tables if (previousTable != null && hasInvalidHeaderInformation(currentTable) && hasValidHeaderInformation(previousTable)) { - List previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable); - List tableNonHeaderRow = getRowWithNonHeaderCells(currentTable); + List previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable); + List tableNonHeaderRow = getRowWithNonHeaderCells(currentTable); // Allow merging of tables if header row is separated from first logical non-header row if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows().get(0).size() == tableNonHeaderRow.size()) { previousTableNonHeaderRow = previousTable.getRows().get(0).stream().map(cell -> { - TableCell fakeCell = new TableCell(cell.getPoints()[0], cell.getPoints()[2]); + Cell fakeCell = new Cell(cell.getPoints()[0], cell.getPoints()[2]); fakeCell.setHeaderCells(Collections.singletonList(cell)); return fakeCell; }).collect(Collectors.toList()); } if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) { for (int i = 
currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table - List row = currentTable.getRows().get(i); + List row = currentTable.getRows().get(i); if (row.size() == tableNonHeaderRow.size() && row.stream().allMatch(cell -> cell.getHeaderCells().isEmpty())) { for (int j = 0; j < row.size(); j++) { row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells()); @@ -240,52 +240,52 @@ public class SectionsBuilderService { } - private ClassificationSection buildTextBlock(List wordBlockList, String lastHeadline) { + private ClassificationSection buildTextBlock(List wordBlockList, String lastHeadline) { ClassificationSection section = new ClassificationSection(); - for (AbstractTextContainer container : wordBlockList) { - if (container instanceof Table table) { + for (AbstractPageBlock container : wordBlockList) { + if (container instanceof TablePageBlock table) { if (lastHeadline == null || lastHeadline.isEmpty()) { table.setHeadline("Text in table"); } else { - table.setHeadline("Table in: " + lastHeadline); + table.setHeadline("TablePageBlock in: " + lastHeadline); } section.getPageBlocks().add(table); continue; } - ClassificationTextBlock wordBlock = (ClassificationTextBlock) container; + TextPageBlock wordBlock = (TextPageBlock) container; section.getPageBlocks().add(wordBlock); } return section; } - private boolean hasValidHeaderInformation(Table table) { + private boolean hasValidHeaderInformation(TablePageBlock table) { return !hasInvalidHeaderInformation(table); } - private boolean hasInvalidHeaderInformation(Table table) { + private boolean hasInvalidHeaderInformation(TablePageBlock table) { return table.getRows().stream().flatMap(row -> row.stream().filter(cell -> !cell.getHeaderCells().isEmpty())).findAny().isEmpty(); } - private List getRowWithNonHeaderCells(Table table) { + private List getRowWithNonHeaderCells(TablePageBlock table) { for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header 
rows are most likely at bottom of table - List row = table.getRows().get(i); + List row = table.getRows().get(i); if (row.size() == 1) { continue; } boolean allNonHeader = true; - for (TableCell cell : row) { + for (Cell cell : row) { if (cell.isHeaderCell()) { allNonHeader = false; break; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/TableExtractionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/TableExtractionService.java index 515e4cc..c11cca0 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/TableExtractionService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/TableExtractionService.java @@ -9,20 +9,18 @@ import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; -import java.util.stream.Collectors; import org.springframework.stereotype.Service; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.TableCell; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.CleanRulings; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Ruling; -import com.knecon.fforesight.service.layoutparser.processor.classification.utils.QuickSort; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock; +import 
com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.CleanRulings; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Rectangle; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.Rectangle; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Table; @Service public class TableExtractionService { @@ -68,28 +66,28 @@ public class TableExtractionService { /** - * Finds tables on a classificationPage and moves textblocks into cells of the found tables. - * Note: This algorithm uses Pdf Coordinate System where {0,0} rotated with the classificationPage rotation. + * Finds tables on a page and moves textblocks into cells of the found tables. + * Note: This algorithm uses Pdf Coordinate System where {0,0} rotated with the page rotation. * 0 -> LowerLeft * 90 -> UpperLeft * 180 -> UpperRight * 270 -> LowerRight - * + *

* DirAdj (Text direction adjusted) values can not be used here. * * @param cleanRulings The lines used to build the table. - * @param classificationPage ClassificationPage object that contains textblocks and statistics. + * @param page Page object that contains textblocks and statistics. */ - public void removeRedundantTableCells(CleanRulings cleanRulings, ClassificationPage classificationPage) { + public void extractTables(CleanRulings cleanRulings, ClassificationPage page) { - List cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical()); + List cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical()); - List toBeRemoved = new ArrayList<>(); + List toBeRemoved = new ArrayList<>(); - for (AbstractTextContainer abstractTextContainer : classificationPage.getTextBlocks()) { - ClassificationTextBlock textBlock = (ClassificationTextBlock) abstractTextContainer; - for (TableCell cell : cells) { - if (cell.intersects(textBlock.getPdfMinX(), + for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) { + TextPageBlock textBlock = (TextPageBlock) abstractPageBlock; + for (Cell cell : cells) { + if (cell.hasMinimumSize() && cell.intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getPdfMaxX() - textBlock.getPdfMinX(), textBlock.getPdfMaxY() - textBlock.getPdfMinY())) { @@ -101,44 +99,44 @@ public class TableExtractionService { } cells = new ArrayList<>(new HashSet<>(cells)); - QuickSort.sort(cells, Rectangle.ILL_DEFINED_ORDER); + DoubleComparisons.sort(cells, Rectangle.ILL_DEFINED_ORDER); - List spreadsheetAreas = findSpreadsheetsFromCells(cells).stream().filter(r -> r.getWidth() > 0f && r.getHeight() > 0f).collect(Collectors.toList()); + List spreadsheetAreas = findSpreadsheetsFromCells(cells).stream().filter(r -> r.getWidth() > 0f && r.getHeight() > 0f).toList(); - List tables = new ArrayList<>(); + List tables = new ArrayList<>(); for (Rectangle area : spreadsheetAreas) { - List overlappingCells = new 
ArrayList<>(); - for (TableCell c : cells) { - if (c.intersects(area)) { + List overlappingCells = new ArrayList<>(); + for (Cell c : cells) { + if (c.hasMinimumSize() && c.intersects(area)) { overlappingCells.add(c); } } - tables.add(new Table(overlappingCells, area, classificationPage.getRotation())); + tables.add(new TablePageBlock(overlappingCells, area, page.getRotation())); } - for (Table table : tables) { + for (TablePageBlock table : tables) { int position = -1; - Iterator itty = classificationPage.getTextBlocks().iterator(); + Iterator itty = page.getTextBlocks().iterator(); while (itty.hasNext()) { - AbstractTextContainer textBlock = itty.next(); - if (textBlock instanceof ClassificationTextBlock ? table.containsBlock((ClassificationTextBlock) textBlock) : table.contains(textBlock) && position == -1) { - position = classificationPage.getTextBlocks().indexOf(textBlock); + AbstractPageBlock textBlock = itty.next(); + if (textBlock instanceof TextPageBlock ? table.containsBlock((TextPageBlock) textBlock) : table.contains(textBlock) && position == -1) { + position = page.getTextBlocks().indexOf(textBlock); } } if (position != -1) { - classificationPage.getTextBlocks().add(position, table); + page.getTextBlocks().add(position, table); } } - classificationPage.getTextBlocks().removeAll(toBeRemoved); + page.getTextBlocks().removeAll(toBeRemoved); } - public List findCells(List horizontalRulingLines, List verticalRulingLines) { + public List findCells(List horizontalRulingLines, List verticalRulingLines) { - List cellsFound = new ArrayList<>(); + List cellsFound = new ArrayList<>(); Map intersectionPoints = Ruling.findIntersections(horizontalRulingLines, verticalRulingLines); List intersectionPointsList = new ArrayList<>(intersectionPoints.keySet()); intersectionPointsList.sort(POINT_COMPARATOR); @@ -174,7 +172,7 @@ public class TableExtractionService { Point2D btmRight = new Point2D.Float((float) yPoint.getX(), (float) xPoint.getY()); if 
(intersectionPoints.containsKey(btmRight) && intersectionPoints.get(btmRight)[0].equals(intersectionPoints.get(xPoint)[0]) && intersectionPoints.get(btmRight)[1].equals( intersectionPoints.get(yPoint)[1])) { - cellsFound.add(new TableCell(topLeft, btmRight)); + cellsFound.add(new Cell(topLeft, btmRight)); break outer; } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/DoubleComparisons.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/DoubleComparisons.java index 36ef41b..05fe8ad 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/DoubleComparisons.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/DoubleComparisons.java @@ -1,6 +1,8 @@ package com.knecon.fforesight.service.layoutparser.processor.classification.utils; import java.math.BigDecimal; +import java.util.Comparator; +import java.util.List; import lombok.experimental.UtilityClass; import lombok.extern.slf4j.Slf4j; @@ -20,11 +22,22 @@ public final class DoubleComparisons { public static float round(double d, int decimalPlace) { + BigDecimal bd = BigDecimal.valueOf(d); bd = bd.setScale(decimalPlace, BigDecimal.ROUND_HALF_UP); return bd.floatValue(); } + public static void sort(List list, Comparator comparator) { + + try { + QuickSort.sort(list, comparator); + } catch (IllegalArgumentException e) { + // This should not happen since we use QuickSort from PDFBox + log.warn(e.getMessage()); + } + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/FileUtils.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/FileUtils.java new file mode 100644 index 0000000..8196f3b --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/FileUtils.java @@ -0,0 +1,56 @@ +package com.knecon.fforesight.service.layoutparser.processor.classification.utils; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; + +import lombok.experimental.UtilityClass; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@UtilityClass +public class FileUtils { + + public File createTempFile(String filenamePrefix, String filenameSuffix) throws IOException { + + File tempFile = Files.createTempFile(filenamePrefix, filenameSuffix).toFile(); + setRWPermissionsOnlyForOwner(tempFile); + + return tempFile; + } + + + /** + * Deletes a file; logs a message with the reason if the deletion fails. + * This method is null-safe. + * + * @param file The file to delete. Can be null. + */ + public void deleteFile(File file) { + + if (file != null) { + try { + Files.deleteIfExists(file.toPath()); + } catch (IOException ex) { + log.warn("Could not delete file!", ex); + } + } + } + + + // We don't need to check the results of the permission setters below, + // since we're manipulating a file we created ourselves. + @SuppressWarnings({"ResultOfMethodCallIgnored", "squid:S899"}) + private void setRWPermissionsOnlyForOwner(File tempFile) { + + try { + tempFile.setReadable(true, true); + tempFile.setWritable(true, true); + tempFile.setExecutable(false); + } catch (SecurityException ex) { + // This should never happen since we're creating a temp file ourselves. + log.warn("Caught an exception during temp file creation. This should not happend. 
Check the code.", ex); + } + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/PositionUtils.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/PositionUtils.java index 765ea6c..8b52b74 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/PositionUtils.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/PositionUtils.java @@ -1,7 +1,7 @@ package com.knecon.fforesight.service.layoutparser.processor.classification.utils; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; import lombok.experimental.UtilityClass; @@ -11,7 +11,7 @@ public final class PositionUtils { // TODO This currently uses pdf coord system. In the futher this should use java coord system. // Note: DirAdj (TextDirection Adjusted) can not be user for this. - public boolean isWithinBodyTextFrame(Rectangle btf, ClassificationTextBlock textBlock) { + public boolean isWithinBodyTextFrame(Rectangle btf, TextPageBlock textBlock) { if (btf == null || textBlock == null) { return false; @@ -32,7 +32,7 @@ public final class PositionUtils { // TODO This currently uses pdf coord system. In the futher this should use java coord system. // Note: DirAdj (TextDirection Adjusted) can not be user for this. 
- public boolean isOverBodyTextFrame(Rectangle btf, ClassificationTextBlock textBlock, int rotation) { + public boolean isOverBodyTextFrame(Rectangle btf, TextPageBlock textBlock, int rotation) { if (btf == null || textBlock == null) { return false; @@ -58,9 +58,10 @@ public final class PositionUtils { } + // TODO This currently uses pdf coord system. In the futher this should use java coord system. // Note: DirAdj (TextDirection Adjusted) can not be user for this. - public boolean isUnderBodyTextFrame(Rectangle btf, ClassificationTextBlock textBlock, int rotation) { + public boolean isUnderBodyTextFrame(Rectangle btf, TextPageBlock textBlock, int rotation) { if (btf == null || textBlock == null) { return false; @@ -86,9 +87,10 @@ public final class PositionUtils { } + // TODO This currently uses pdf coord system. In the futher this should use java coord system. // Note: DirAdj (TextDirection Adjusted) can not be user for this. - public boolean isTouchingUnderBodyTextFrame(Rectangle btf, ClassificationTextBlock textBlock) { + public boolean isTouchingUnderBodyTextFrame(Rectangle btf, TextPageBlock textBlock) { //TODO Currently this is not working for rotated pages. 
@@ -105,13 +107,13 @@ public final class PositionUtils { } - public float getHeightDifferenceBetweenChunkWordAndDocumentWord(ClassificationTextBlock textBlock, Float documentMostPopularWordHeight) { + public float getHeightDifferenceBetweenChunkWordAndDocumentWord(TextPageBlock textBlock, Float documentMostPopularWordHeight) { return textBlock.getMostPopularWordHeight() - documentMostPopularWordHeight; } - public Float getApproxLineCount(ClassificationTextBlock textBlock) { + public Float getApproxLineCount(TextPageBlock textBlock) { return textBlock.getHeight() / textBlock.getMostPopularWordHeight(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/RulingTextDirAdjustUtil.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/RulingTextDirAdjustUtil.java index ce3ac2b..7931d65 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/RulingTextDirAdjustUtil.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/RulingTextDirAdjustUtil.java @@ -3,7 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.classification.util import java.awt.geom.Line2D; import java.awt.geom.Point2D; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling; import lombok.experimental.UtilityClass; @@ -13,7 +13,7 @@ public final class RulingTextDirAdjustUtil { /** * Converts a ruling (line of a table) the same way TextPositions are converted in PDFBox. 
* This will get the y position of the text, adjusted so that 0,0 is upper left and it is adjusted based on the text direction. - * + *

* See org.apache.pdfbox.text.TextPosition */ public Line2D.Float convertToDirAdj(Ruling ruling, float dir, float pageWidth, float pageHeight) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/TextNormalizationUtilities.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/TextNormalizationUtilities.java index fa08958..9cec075 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/TextNormalizationUtilities.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/TextNormalizationUtilities.java @@ -16,4 +16,16 @@ public final class TextNormalizationUtilities { return text.replaceAll("([^\\s\\d\\-]{2,500})[\\-\\u00AD]\\R", "$1"); } + + public static String removeLineBreaks(String text) { + + return text.replaceAll("\n", " "); + } + + + public static String removeRepeatingWhitespaces(String text) { + + return text.replaceAll(" {2}", " "); + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/DocumentGraphFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/DocumentGraphFactory.java index 95d54b6..4897aa2 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/DocumentGraphFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/DocumentGraphFactory.java @@ -4,286 +4,124 @@ import static java.lang.String.format; import static 
java.util.stream.Collectors.groupingBy; import static java.util.stream.Collectors.toList; -import java.util.Collections; +import java.awt.geom.Rectangle2D; +import java.util.ArrayList; import java.util.HashMap; -import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.NoSuchElementException; import java.util.Set; -import java.util.concurrent.atomic.AtomicInteger; -import org.springframework.stereotype.Service; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationFooter; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationHeader; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Footer; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.GenericSemanticNode; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Header; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Headline; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Image; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Paragraph; +import 
com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Section; +import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.AtomicTextBlock; +import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder; +import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.FooterNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.HeaderNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.HeadlineNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ParagraphNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SectionNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableCellNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationDocument; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationFooter; -import 
com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationHeader; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationPage; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Table; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.TableCell; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence; +import lombok.AccessLevel; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Getter; +import lombok.experimental.FieldDefaults; +import lombok.experimental.UtilityClass; - -@Service +@UtilityClass public class DocumentGraphFactory { - public static final double TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD = 0.05; + public Document buildDocumentGraph(ClassificationDocument document) { + Document documentGraph = new Document(); + Context context = new Context(documentGraph); - public DocumentGraph buildDocumentGraph(ClassificationDocument document) { - - TextBlockFactory textBlockFactory = new TextBlockFactory(); - DocumentGraph documentGraph = new DocumentGraph(); - Context context = new Context(new TableOfContents(documentGraph), new HashMap<>(), new LinkedList<>(), new LinkedList<>(), textBlockFactory); - - document.getPages().stream().map(this::buildPage).forEach(page -> context.pages().put(page, new AtomicInteger(1))); - document.getSections().stream().flatMap(section -> section.getImages().stream()).forEach(image -> context.images().add(image)); + document.getPages().forEach(context::buildAndAddPageWithCounter); + document.getSections().stream().flatMap(section -> section.getImages().stream()).forEach(image -> context.getImages().add(image)); 
addSections(document, context); addHeaderAndFooterToEachPage(document, context); documentGraph.setNumberOfPages(context.pages.size()); documentGraph.setPages(context.pages.keySet()); - documentGraph.setTableOfContents(context.tableOfContents); - documentGraph.setTextBlock(documentGraph.buildTextBlock()); + documentGraph.setDocumentTree(context.documentTree); + documentGraph.setTextBlock(documentGraph.getTextBlock()); return documentGraph; } private void addSections(ClassificationDocument document, Context context) { - document.getSections().forEach(section -> addSection(null, section.getPageBlocks(), section.getImages(), context)); + document.getSections().forEach(section -> SectionNodeFactory.addSection(null, section.getPageBlocks(), section.getImages(), context)); } - private void addSection(SemanticNode parentNode, List pageBlocks, List images, Context context) { + public void addParagraphOrHeadline(GenericSemanticNode parentNode, TextPageBlock originalTextBlock, Context context, List textBlocksToMerge) { - Map> blocksPerPage = pageBlocks.stream().collect(groupingBy(AbstractTextContainer::getPage)); - SectionNode sectionNode = SectionNode.builder().entities(new HashSet<>()).tableOfContents(context.tableOfContents()).build(); + Page page = context.getPage(originalTextBlock.getPage()); - context.sections().add(sectionNode); - blocksPerPage.keySet().forEach(pageNumber -> addSectionNodeToPageNode(context, sectionNode, pageNumber)); - - List tocId; - if (parentNode == null) { - tocId = context.tableOfContents.createNewMainEntryAndReturnId(NodeType.SECTION, sectionNode); + GenericSemanticNode node; + if (originalTextBlock.isHeadline()) { + node = Headline.builder().documentTree(context.getDocumentTree()).build(); } else { - tocId = context.tableOfContents.createNewChildEntryAndReturnId(parentNode.getTocId(), NodeType.SECTION, sectionNode); - } - sectionNode.setTocId(tocId); - Set alreadyMerged = new HashSet<>(); - for (AbstractTextContainer abstractTextContainer : 
pageBlocks) { - - if (alreadyMerged.contains(abstractTextContainer)) { - continue; - } - - if (abstractTextContainer instanceof ClassificationTextBlock) { - List textBlocks = findTextBlocksWithSameClassificationAndAlignsY(abstractTextContainer, pageBlocks); - alreadyMerged.addAll(textBlocks); - addParagraphOrHeadline(sectionNode, (ClassificationTextBlock) abstractTextContainer, context, textBlocks); - } - if (abstractTextContainer instanceof Table) { - addTable(sectionNode, (Table) abstractTextContainer, context); - } - } - for (ClassifiedImage image : images) { - - addImage(sectionNode, image, context); - } - } - - - private static List findTextBlocksWithSameClassificationAndAlignsY(AbstractTextContainer atc, List pageBlocks) { - - return pageBlocks.stream() - .filter(abstractTextContainer -> !abstractTextContainer.equals(atc)) - .filter(abstractTextContainer -> abstractTextContainer.getPage() == atc.getPage()) - .filter(abstractTextContainer -> abstractTextContainer instanceof ClassificationTextBlock) - .filter(abstractTextContainer -> abstractTextContainer.intersectsY(atc)) - .map(abstractTextContainer -> (ClassificationTextBlock) abstractTextContainer) - .toList(); - } - - - private void addSectionNodeToPageNode(Context context, SectionNode sectionNode, Integer pageNumber) { - - PageNode page = getPage(pageNumber, context); - page.getMainBody().add(sectionNode); - } - - - private void addTable(SemanticNode parentNode, Table table, Context context) { - - PageNode page = getPage(table.getPage(), context); - TableNode tableNode = TableNode.builder().tableOfContents(context.tableOfContents()).numberOfCols(table.getColCount()).numberOfRows(table.getRowCount()).build(); - - if (!page.getMainBody().contains(parentNode)) { - parentNode.getPages().add(page); - } - - page.getMainBody().add(tableNode); - - List tocId = context.tableOfContents().createNewChildEntryAndReturnId(parentNode.getTocId(), NodeType.TABLE, tableNode); - tableNode.setTocId(tocId); - - 
addTableCells(table.getRows(), tableNode, context, table.getPage()); - } - - - private void addTableCells(List> rows, SemanticNode parentNode, Context context, int pageNumber) { - - for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) { - for (int colIndex = 0; colIndex < rows.get(rowIndex).size(); colIndex++) { - addTableCell(rows.get(rowIndex).get(colIndex), rowIndex, colIndex, parentNode, pageNumber, context); - } - } - } - - - private void addTableCell(TableCell cell, int rowIndex, int colIndex, SemanticNode parentNode, int pageNumber, Context context) { - - PageNode page = getPage(pageNumber, context); - cell.getTextBlocks().stream().filter(tb -> tb.getPage() == 0).forEach(tb -> tb.setPage(pageNumber)); - - TableCellNode tableCellNode = TableCellNode.builder() - .tableOfContents(context.tableOfContents()) - .row(rowIndex) - .col(colIndex) - .header(cell.isHeaderCell()) - .bBox(cell.getBounds2D()) - .build(); - page.getMainBody().add(tableCellNode); - - TextBlock textBlock; - - List tocId = context.tableOfContents().createNewChildEntryAndReturnId(parentNode.getTocId(), NodeType.TABLE_CELL, tableCellNode); - tableCellNode.setTocId(tocId); - - if (cell.getTextBlocks().isEmpty()) { - tableCellNode.setTerminalTextBlock(context.textBlockFactory.emptyTextBlock(parentNode, context, page)); - tableCellNode.setTerminal(true); - - } else if (cell.getTextBlocks().size() == 1) { - textBlock = context.textBlockFactory().buildAtomicTextBlock(cell.getTextBlocks().get(0).getSequences(), tableCellNode, context, page); - tableCellNode.setTerminalTextBlock(textBlock); - tableCellNode.setTerminal(true); - - } else if (firstTextBlockIsHeadline(cell)) { - addSection(tableCellNode, cell.getTextBlocks().stream().map(tb -> (AbstractTextContainer) tb).toList(), Collections.emptyList(), context); - tableCellNode.setTerminal(false); - - } else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) { - List sequences = 
TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(cell.getTextBlocks()); - textBlock = context.textBlockFactory().buildAtomicTextBlock(sequences, tableCellNode, context, page); - tableCellNode.setTerminalTextBlock(textBlock); - tableCellNode.setTerminal(true); - - } else { - cell.getTextBlocks().forEach(tb -> addParagraphOrHeadline(tableCellNode, tb, context)); - tableCellNode.setTerminal(false); - } - - } - - - private static boolean cellAreaIsSmallerThanPageAreaTimesThreshold(TableCell cell, PageNode page) { - - return cell.getArea() < TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD * page.getHeight() * page.getWidth(); - } - - - private static boolean firstTextBlockIsHeadline(TableCell cell) { - - String classification = cell.getTextBlocks().get(0).getClassification(); - return classification != null && classification.startsWith("H"); - } - - - private void addParagraphOrHeadline(SemanticNode parentNode, ClassificationTextBlock originalTextBlock, Context context) { - - addParagraphOrHeadline(parentNode, originalTextBlock, context, Collections.emptyList()); - } - - - private void addParagraphOrHeadline(SemanticNode parentNode, ClassificationTextBlock originalTextBlock, Context context, List textBlocksToMerge) { - - PageNode page = getPage(originalTextBlock.getPage(), context); - - SemanticNode node; - if (originalTextBlock.getClassification() != null && originalTextBlock.getClassification().startsWith("H")) { - node = HeadlineNode.builder().tableOfContents(context.tableOfContents()).build(); - } else { - node = ParagraphNode.builder().tableOfContents(context.tableOfContents()).build(); + node = Paragraph.builder().documentTree(context.getDocumentTree()).build(); } page.getMainBody().add(node); - List textBlocks = new LinkedList<>(textBlocksToMerge); + List textBlocks = new ArrayList<>(textBlocksToMerge); textBlocks.add(originalTextBlock); AtomicTextBlock textBlock = 
context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), node, context, page); - - if (node instanceof HeadlineNode headlineNode) { - List tocId = context.tableOfContents.createNewChildEntryAndReturnId(parentNode.getTocId(), NodeType.HEADLINE, node); - headlineNode.setTerminalTextBlock(textBlock); - headlineNode.setTocId(tocId); - } - if (node instanceof ParagraphNode paragraphNode) { - List tocId = context.tableOfContents.createNewChildEntryAndReturnId(parentNode.getTocId(), NodeType.PARAGRAPH, node); - paragraphNode.setTerminalTextBlock(textBlock); - paragraphNode.setTocId(tocId); - } + List treeId = context.documentTree.createNewChildEntryAndReturnId(parentNode, node); + node.setLeafTextBlock(textBlock); + node.setTreeId(treeId); } - private void addImage(SectionNode sectionNode, ClassifiedImage image, Context context) { + public void addImage(Section section, ClassifiedImage image, Context context) { - PageNode page = getPage(image.getPage(), context); - ImageNode imageNode = ImageNode.builder() + Rectangle2D position = image.getPosition(); + Page page = context.getPage(image.getPage()); + Image imageNode = Image.builder() + .id(IdBuilder.buildId(Set.of(page), List.of(position))) .imageType(image.getImageType()) - .position(image.getPosition()) - .transparency(image.isHasTransparency()) + .position(position) + .transparent(image.isHasTransparency()) .page(page) - .tableOfContents(context.tableOfContents()) + .documentTree(context.getDocumentTree()) .build(); page.getMainBody().add(imageNode); - List tocId = context.tableOfContents().createNewChildEntryAndReturnId(sectionNode.getTocId(), NodeType.IMAGE, imageNode); - imageNode.setTocId(tocId); + List tocId = context.getDocumentTree().createNewChildEntryAndReturnId(section, imageNode); + imageNode.setTreeId(tocId); } private void addHeaderAndFooterToEachPage(ClassificationDocument document, Context context) { - Map> headers = document.getHeaders() 
+ Map> headers = document.getHeaders() .stream() .map(ClassificationHeader::getTextBlocks) .flatMap(List::stream) - .collect(groupingBy(AbstractTextContainer::getPage, toList())); + .collect(groupingBy(AbstractPageBlock::getPage, toList())); - Map> footers = document.getFooters() + Map> footers = document.getFooters() .stream() .map(ClassificationFooter::getTextBlocks) .flatMap(List::stream) - .collect(groupingBy(AbstractTextContainer::getPage, toList())); + .collect(groupingBy(AbstractPageBlock::getPage, toList())); for (int pageIndex = 1; pageIndex <= document.getPages().size(); pageIndex++) { if (headers.containsKey(pageIndex)) { @@ -303,85 +141,105 @@ public class DocumentGraphFactory { } - private void addFooter(List textBlocks, Context context) { + private void addFooter(List textBlocks, Context context) { - PageNode page = getPage(textBlocks.get(0).getPage(), context); - FooterNode footer = FooterNode.builder().tableOfContents(context.tableOfContents()).build(); + Page page = context.getPage(textBlocks.get(0).getPage()); + Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build(); AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), footer, context, page); - List tocId = context.tableOfContents().createNewMainEntryAndReturnId(NodeType.FOOTER, footer); - footer.setTocId(tocId); - footer.setTerminalTextBlock(textBlock); + List tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer); + footer.setTreeId(tocId); + footer.setLeafTextBlock(textBlock); page.setFooter(footer); } - public void addHeader(List textBlocks, Context context) { + public void addHeader(List textBlocks, Context context) { - PageNode page = getPage(textBlocks.get(0).getPage(), context); - HeaderNode header = HeaderNode.builder().tableOfContents(context.tableOfContents()).build(); - AtomicTextBlock textBlock = 
context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), - header, - context, - 0, - page); - List tocId = context.tableOfContents().createNewMainEntryAndReturnId(NodeType.HEADER, header); - header.setTocId(tocId); - header.setTerminalTextBlock(textBlock); + Page page = context.getPage(textBlocks.get(0).getPage()); + Header header = Header.builder().documentTree(context.getDocumentTree()).build(); + AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), header, 0, page); + List tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header); + header.setTreeId(tocId); + header.setLeafTextBlock(textBlock); page.setHeader(header); } private void addEmptyFooter(int pageIndex, Context context) { - PageNode page = getPage(pageIndex, context); - FooterNode footer = FooterNode.builder().tableOfContents(context.tableOfContents()).build(); + Page page = context.getPage(pageIndex); + Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build(); AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(footer, context, page); - List tocId = context.tableOfContents().createNewMainEntryAndReturnId(NodeType.FOOTER, footer); - footer.setTocId(tocId); - footer.setTerminalTextBlock(textBlock); + List tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer); + footer.setTreeId(tocId); + footer.setLeafTextBlock(textBlock); page.setFooter(footer); } private void addEmptyHeader(int pageIndex, Context context) { - PageNode page = getPage(pageIndex, context); - HeaderNode header = HeaderNode.builder().tableOfContents(context.tableOfContents()).build(); + Page page = context.getPage(pageIndex); + Header header = Header.builder().documentTree(context.getDocumentTree()).build(); AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(header, 0, page); - List tocId = 
context.tableOfContents().createNewMainEntryAndReturnId(NodeType.HEADER, header); - header.setTocId(tocId); - header.setTerminalTextBlock(textBlock); + List tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header); + header.setTreeId(tocId); + header.setLeafTextBlock(textBlock); page.setHeader(header); } - private PageNode buildPage(ClassificationPage p) { + @Getter + @Builder + @AllArgsConstructor + @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) + public final class Context { - return PageNode.builder() - .height((int) p.getPageHeight()) - .width((int) p.getPageWidth()) - .number(p.getPageNumber()) - .rotation(p.getRotation()) - .mainBody(new LinkedList<>()) - .build(); - } + DocumentTree documentTree; + Map pages; + List

sections; + List images; + TextBlockFactory textBlockFactory; - private PageNode getPage(int pageIndex, Context context) { + public Context(Document document) { - return context.pages.keySet() - .stream() - .filter(page -> page.getNumber() == pageIndex) - .findFirst() - .orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex))); - } + documentTree = new DocumentTree(document); + pages = new HashMap<>(); + sections = new LinkedList<>(); + images = new LinkedList<>(); + textBlockFactory = new TextBlockFactory(); + } - record Context( - TableOfContents tableOfContents, Map pages, List sections, List images, TextBlockFactory textBlockFactory) { + public void buildAndAddPageWithCounter(ClassificationPage classificationPage) { + + Page page = Page.fromClassificationPage(classificationPage); + //this counter counts the TextBlocks per page + //initial value is set to 1, because 0 is reserved for Header + pages.put(page, 1); + } + + + public int getAndIncrementTextBlockNumberOnPage(Page page) { + + Integer textBlockNumberOnPage = pages.get(page); + pages.merge(page, 1, Integer::sum); + return textBlockNumberOnPage; + } + + + public Page getPage(int pageIndex) { + + return pages.keySet() + .stream() + .filter(page -> page.getNumber() == pageIndex) + .findFirst() + .orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex))); + } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/RectangleTransformations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/RectangleTransformations.java deleted file mode 100644 index 062b49d..0000000 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/RectangleTransformations.java +++ 
/dev/null @@ -1,105 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.processor.factory; - -import static java.lang.String.format; - -import java.awt.geom.Area; -import java.awt.geom.Rectangle2D; -import java.util.Arrays; -import java.util.List; -import java.util.Set; -import java.util.function.BiConsumer; -import java.util.function.BinaryOperator; -import java.util.function.Function; -import java.util.function.Supplier; -import java.util.stream.Collector; - -import org.apache.pdfbox.pdmodel.common.PDRectangle; - -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer; - -public class RectangleTransformations { - - public static Rectangle2D pad(Rectangle2D rectangle2D, int deltaX, int deltaY) { - - return new Rectangle2D.Double(rectangle2D.getMinX() - deltaX, rectangle2D.getMinY() - deltaY, rectangle2D.getWidth() + 2 * deltaX, rectangle2D.getHeight() + 2 * deltaY); - } - - - public static Rectangle2D bBoxUnionAbstractTextContainer(List abstractTextContainers) { - - return abstractTextContainers.stream().map(RectangleTransformations::toRectangle2D).collect(new Rectangle2DUnion()); - } - - public static Rectangle2D rectangleUnion(List rectangle2DList) { - - return rectangle2DList.stream().collect(new Rectangle2DUnion()); - } - - - public static Rectangle2D toRectangle2D(AbstractTextContainer abstractTextContainer) { - - return new Rectangle2D.Float(abstractTextContainer.getMinX(), abstractTextContainer.getMinY(), abstractTextContainer.getWidth(), abstractTextContainer.getHeight()); - } - - - public static Rectangle2D toRectangle2D(PDRectangle rectangle) { - - return new Rectangle2D.Double(rectangle.getLowerLeftX(), rectangle.getLowerLeftY(), rectangle.getWidth(), rectangle.getHeight()); - } - - - public static String toString(Rectangle2D rectangle2D) { - - return format("%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight()); - } - - - public static Rectangle2D 
parseRectangle2D(String bBox) { - - List floats = Arrays.stream(bBox.split(",")).map(Float::parseFloat).toList(); - return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3)); - } - - - private static class Rectangle2DUnion implements Collector { - - @Override - public Supplier supplier() { - - return Area::new; - } - - - @Override - public BiConsumer accumulator() { - - return (area, rectangle2D) -> area.add(new Area(rectangle2D)); - } - - - @Override - public BinaryOperator combiner() { - - return (area1, area2) -> { - area1.add(area2); - return area1; - }; - } - - - @Override - public Function finisher() { - - return Area::getBounds2D; - } - - - @Override - public Set characteristics() { - - return Set.of(Characteristics.CONCURRENT, Characteristics.UNORDERED); - } - - } - -} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SearchTextWithTextPositionModel.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SearchTextWithTextPositionDto.java similarity index 51% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SearchTextWithTextPositionModel.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SearchTextWithTextPositionDto.java index a9d9f3d..223492c 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SearchTextWithTextPositionModel.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SearchTextWithTextPositionDto.java @@ -1,6 +1,7 @@ package com.knecon.fforesight.service.layoutparser.processor.factory; import 
java.awt.geom.Rectangle2D; +import java.util.Collections; import java.util.List; import lombok.AccessLevel; @@ -11,10 +12,22 @@ import lombok.experimental.FieldDefaults; @Builder @Getter @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) -public class SearchTextWithTextPositionModel { +public class SearchTextWithTextPositionDto { String searchText; List lineBreaks; List stringCoordsToPositionCoords; List positions; + + + public static SearchTextWithTextPositionDto empty() { + + return SearchTextWithTextPositionDto.builder() + .searchText("") + .lineBreaks(Collections.emptyList()) + .positions(Collections.emptyList()) + .stringCoordsToPositionCoords(Collections.emptyList()) + .build(); + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SearchTextWithTextPositionFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SearchTextWithTextPositionFactory.java index 76a1583..98033f1 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SearchTextWithTextPositionFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SearchTextWithTextPositionFactory.java @@ -2,38 +2,35 @@ package com.knecon.fforesight.service.layoutparser.processor.factory; import java.awt.geom.AffineTransform; import java.awt.geom.Rectangle2D; -import java.util.Collections; import java.util.LinkedList; import java.util.List; import java.util.Objects; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.RedTextPosition; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextDirection; -import 
com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.RedTextPosition; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextDirection; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence; +import lombok.experimental.UtilityClass; + +@UtilityClass public class SearchTextWithTextPositionFactory { - public static final int HEIGHT_PADDING = 2; + public final int HEIGHT_PADDING = 2; + // when checking for a hyphen linebreak, we need to check after a linebreak if the last hyphen was less than three symbols away. + // We detect a linebreak as either a "\n" character or if two adjacent symbol's position differ in y-coordinates by at least one character height. + // If there is a hyphen linebreak, the hyphen will be 1 position in front of a "\n" or 2 positions in front of the character which has a lower y-coordinate + // This is why, we need to initialize this to < -2, otherwise, if the very first symbol is a \n we would detect a hyphen linebreak that isn't there. + // Also, Integer.MIN_VALUE is a bad idea due to potential overflow during arithmetic operations. This is why the default should be -3. 
+ public final int MAX_HYPHEN_LINEBREAK_DISTANCE = 3; - public static SearchTextWithTextPositionModel buildSearchTextToTextPositionModel(List sequences) { + public SearchTextWithTextPositionDto buildSearchTextToTextPositionModel(List sequences) { if (sequences.isEmpty() || sequences.stream().allMatch(sequence -> sequence.getTextPositions().isEmpty())) { - return SearchTextWithTextPositionModel.builder() - .searchText("") - .lineBreaks(Collections.emptyList()) - .positions(Collections.emptyList()) - .stringCoordsToPositionCoords(Collections.emptyList()) - .build(); + return SearchTextWithTextPositionDto.empty(); } - List stringIdxToPositionIdx = new LinkedList<>(); - List lineBreaksStringIdx = new LinkedList<>(); - StringBuilder sb = new StringBuilder(); - - int stringIdx = 0; - int positionIdx = 0; - int lastHyphenIdx = -3; + Context context = new Context(); RedTextPosition currentTextPosition = sequences.get(0).getTextPositions().get(0); RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").position(currentTextPosition.getPosition()).build(); @@ -42,60 +39,78 @@ public class SearchTextWithTextPositionFactory { for (int i = 0; i < word.getTextPositions().size(); ++i) { currentTextPosition = word.getTextPositions().get(i); - if (isLineBreak(currentTextPosition, previousTextPosition)) { - - if (stringIdx - lastHyphenIdx < 3) { - sb.delete(lastHyphenIdx, sb.length()); - stringIdxToPositionIdx = stringIdxToPositionIdx.subList(0, lastHyphenIdx); - stringIdx = lastHyphenIdx; - lastHyphenIdx = -3; - } - lineBreaksStringIdx.add(stringIdx); + removeHyphenLinebreaks(context); + context.lineBreaksStringIdx.add(context.stringIdx); } if (!isRepeatedWhitespace(currentTextPosition.getUnicode(), previousTextPosition.getUnicode())) { - if (isHyphen(currentTextPosition.getUnicode())) { - lastHyphenIdx = stringIdx; + context.lastHyphenIdx = context.stringIdx; } - sb.append(currentTextPosition.getUnicode()); - stringIdxToPositionIdx.add(positionIdx); - 
++stringIdx; + appendCurrentTextPosition(context, currentTextPosition); } previousTextPosition = currentTextPosition; - - ++positionIdx; + ++context.positionIdx; } previousTextPosition = RedTextPosition.builder().unicode(" ").position(previousTextPosition.getPosition()).build(); - sb.append(previousTextPosition.getUnicode()); - stringIdxToPositionIdx.add(positionIdx); - ++stringIdx; + context.stringBuilder.append(" "); + context.stringIdxToPositionIdx.add(context.positionIdx); + ++context.stringIdx; } - assert sb.length() == stringIdxToPositionIdx.size(); + assert context.stringBuilder.length() == context.stringIdxToPositionIdx.size(); List positions = sequences.stream() .flatMap(sequence -> sequence.getTextPositions().stream().map(textPosition -> mapRedTextPositionToInitialUserSpace(textPosition, sequence))) .toList(); - return SearchTextWithTextPositionModel.builder() - .searchText(sb.toString()) - .lineBreaks(lineBreaksStringIdx) - .stringCoordsToPositionCoords(stringIdxToPositionIdx) + return SearchTextWithTextPositionDto.builder() + .searchText(context.stringBuilder.toString()) + .lineBreaks(context.lineBreaksStringIdx) + .stringCoordsToPositionCoords(context.stringIdxToPositionIdx) .positions(positions) .build(); } - private static boolean isLineBreak(RedTextPosition currentTextPosition, RedTextPosition previousTextPosition) { + private void appendCurrentTextPosition(Context context, RedTextPosition currentTextPosition) { + + context.stringBuilder.append(currentTextPosition.getUnicode()); + + // unicode characters with more than 16-bit encoding have a length > 1 in java strings + for (int j = 0; j < currentTextPosition.getUnicode().length(); j++) { + context.stringIdxToPositionIdx.add(context.positionIdx); + } + context.stringIdx += currentTextPosition.getUnicode().length(); + } + + + private void removeHyphenLinebreaks(Context context) { + + if (lastHyphenDirectlyBeforeLineBreak(context)) { + context.stringBuilder.delete(context.lastHyphenIdx, 
context.stringBuilder.length()); + context.stringIdxToPositionIdx = context.stringIdxToPositionIdx.subList(0, context.lastHyphenIdx); + context.stringIdx = context.lastHyphenIdx; + context.lastHyphenIdx = -MAX_HYPHEN_LINEBREAK_DISTANCE; + } + } + + + private boolean lastHyphenDirectlyBeforeLineBreak(Context context) { + + return context.stringIdx - context.lastHyphenIdx < MAX_HYPHEN_LINEBREAK_DISTANCE; + } + + + private boolean isLineBreak(RedTextPosition currentTextPosition, RedTextPosition previousTextPosition) { return Objects.equals(currentTextPosition.getUnicode(), "\n") || isDeltaYLargerThanTextHeight(currentTextPosition, previousTextPosition); } - private static boolean isDeltaYLargerThanTextHeight(RedTextPosition currentPosition, RedTextPosition previousPosition) { + private boolean isDeltaYLargerThanTextHeight(RedTextPosition currentPosition, RedTextPosition previousPosition) { if (previousPosition == null) { return false; @@ -106,13 +121,13 @@ public class SearchTextWithTextPositionFactory { } - private static boolean isRepeatedWhitespace(String currentUnicode, String previousUnicode) { + private boolean isRepeatedWhitespace(String currentUnicode, String previousUnicode) { return Objects.equals(previousUnicode, " ") && Objects.equals(currentUnicode, " "); } - private static boolean isHyphen(String unicodeCharacter) { + private boolean isHyphen(String unicodeCharacter) { return Objects.equals(unicodeCharacter, "-") || // Objects.equals(unicodeCharacter, "~") || // @@ -128,7 +143,7 @@ public class SearchTextWithTextPositionFactory { } - private static Rectangle2D mapRedTextPositionToInitialUserSpace(RedTextPosition textPosition, TextPositionSequence sequence) { + private Rectangle2D mapRedTextPositionToInitialUserSpace(RedTextPosition textPosition, TextPositionSequence sequence) { float textHeight = sequence.getTextHeight() + HEIGHT_PADDING; Rectangle2D rectangle2D = new Rectangle2D.Double(textPosition.getXDirAdj(), @@ -153,4 +168,18 @@ public class 
SearchTextWithTextPositionFactory { return transform.createTransformedShape(rectangle2D).getBounds2D(); } + + private class Context { + + List stringIdxToPositionIdx = new LinkedList<>(); + List lineBreaksStringIdx = new LinkedList<>(); + StringBuilder stringBuilder = new StringBuilder(); + + int stringIdx; + int positionIdx; + + int lastHyphenIdx = -MAX_HYPHEN_LINEBREAK_DISTANCE; + + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SectionNodeFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SectionNodeFactory.java new file mode 100644 index 0000000..d3942fa --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SectionNodeFactory.java @@ -0,0 +1,183 @@ +package com.knecon.fforesight.service.layoutparser.processor.factory; + +import static java.lang.String.format; +import static java.util.Collections.emptyList; +import static java.util.stream.Collectors.groupingBy; + +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.GenericSemanticNode; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Section; +import 
com.knecon.fforesight.service.layoutparser.processor.utils.TableMergingUtility; + +import lombok.experimental.UtilityClass; + +@UtilityClass +public class SectionNodeFactory { + + public void addSection(GenericSemanticNode parentNode, List pageBlocks, List images, DocumentGraphFactory.Context context) { + + if (pageBlocks.isEmpty()) { + return; + } + Map> blocksPerPage = pageBlocks.stream().collect(groupingBy(AbstractPageBlock::getPage)); + Section section = Section.builder().documentTree(context.getDocumentTree()).build(); + + context.getSections().add(section); + blocksPerPage.keySet().forEach(pageNumber -> addSectionNodeToPageNode(context, section, pageNumber)); + + section.setTreeId(getTreeId(parentNode, context, section)); + + addFirstHeadlineDirectlyToSection(pageBlocks, context, section); + if (containsTablesAndTextBlocks(pageBlocks)) { + splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(section, subSectionPageBlocks, emptyList(), context)); + } else { + addTablesAndParagraphsAndHeadlinesToSection(pageBlocks, context, section); + } + + images.stream().distinct().forEach(image -> DocumentGraphFactory.addImage(section, image, context)); + } + + + private List getTreeId(GenericSemanticNode parentNode, DocumentGraphFactory.Context context, Section section) { + + if (parentNode == null) { + return context.getDocumentTree().createNewMainEntryAndReturnId(section); + } else { + return context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, section); + } + } + + + private void addFirstHeadlineDirectlyToSection(List pageBlocks, DocumentGraphFactory.Context context, Section section) { + + if (pageBlocks.get(0).isHeadline()) { + addTablesAndParagraphsAndHeadlinesToSection(List.of(pageBlocks.get(0)), context, section); + pageBlocks.remove(0); + } + } + + + private void addTablesAndParagraphsAndHeadlinesToSection(List pageBlocks, DocumentGraphFactory.Context context, Section section) { + + Set alreadyMerged = new 
HashSet<>(); + List remainingBlocks = new LinkedList<>(pageBlocks); + for (AbstractPageBlock abstractPageBlock : pageBlocks) { + + if (alreadyMerged.contains(abstractPageBlock)) { + continue; + } + + remainingBlocks.removeAll(alreadyMerged); + + if (abstractPageBlock instanceof TextPageBlock) { + List textBlocks = findTextBlocksWithSameClassificationAndAlignsY(abstractPageBlock, remainingBlocks); + alreadyMerged.addAll(textBlocks); + DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocks); + } else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) { + List tablesToMerge = TableMergingUtility.findConsecutiveTablesWithSameColCountAndSameHeaders(tablePageBlock, remainingBlocks); + alreadyMerged.addAll(tablesToMerge); + TableNodeFactory.addTable(section, tablesToMerge, context); + } else { + throw new RuntimeException(format("Unhandled AbstractPageBlockType %s!", abstractPageBlock.getClass())); + } + } + } + + + private boolean containsTablesAndTextBlocks(List pageBlocks) { + + return pageBlocks.stream().anyMatch(pageBlock -> pageBlock instanceof TablePageBlock) && pageBlocks.stream().anyMatch(pageBlock -> pageBlock instanceof TextPageBlock); + } + + + /** + * This function splits the list of PageBlocks around TablePageBlocks, such that SubSections can be created, that don't include tables. + * This is needed so we can execute rules on sections, that do not contain tables. + * See: document structure wiki + * + * @param pageBlocks a List of AbstractPageBlocks, which have at least one TablePageBlock and one ClassificationTextBlock + * @return List of Lists of AbstractPageBlocks, which include either a single Headline ClassificationTextBlock and a TablePageBlock or only ClassificationTextBlocks. 
+ */ + private List> splitPageBlocksIntoSubSections(List pageBlocks) { + + List> splitList = splitIntoCoherentList(pageBlocks); + movePrecedingHeadlineToTableList(splitList); + return splitList.stream().filter(list -> !list.isEmpty()).toList(); + } + + + /* If the list directly in front of a tables-only list ends with a headline, that headline is moved to the front of the table list. */ + private void movePrecedingHeadlineToTableList(List> splitList) { + + for (int i = 0; i < splitList.size(); i++) { + if (listIsTablesOnly(splitList.get(i)) && i > 0) { + List previousList = splitList.get(i - 1); + AbstractPageBlock lastPageBlockInPreviousList = previousList.get(previousList.size() - 1); + if (lastPageBlockInPreviousList.isHeadline()) { + /* remove the headline itself (the last element); i - 1 is an index into splitList, not into previousList */ + previousList.remove(previousList.size() - 1); + splitList.get(i).add(0, lastPageBlockInPreviousList); + } + } + } + } + + + private boolean listIsTablesOnly(List abstractPageBlocks) { + + return abstractPageBlocks.stream().allMatch(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock); + } + + + /** + * @param pageBlocks a List of AbstractPageBlocks, which have at least one TablePageBlock and one ClassificationTextBlock + * @return List of Lists of AbstractPageBlocks, which are exclusively of type ClassificationTextBlock or TablePageBlock + */ + private List> splitIntoCoherentList(List pageBlocks) { + + List> splitList = new LinkedList<>(); + List currentList = new LinkedList<>(); + splitList.add(currentList); + + Class lastPageBlockClass = pageBlocks.get(0).getClass(); + for (AbstractPageBlock pageBlock : pageBlocks) { + if (lastPageBlockClass.isInstance(pageBlock)) { + currentList.add(pageBlock); + } else { + currentList = new LinkedList<>(); + currentList.add(pageBlock); + splitList.add(currentList); + lastPageBlockClass = pageBlock.getClass(); + } + } + return splitList; + } + + + private List findTextBlocksWithSameClassificationAndAlignsY(AbstractPageBlock atc, List pageBlocks) { + + return pageBlocks.stream() + .filter(abstractTextContainer -> !abstractTextContainer.equals(atc)) + .filter(abstractTextContainer -> abstractTextContainer.getPage() == atc.getPage()) +
.filter(abstractTextContainer -> abstractTextContainer instanceof TextPageBlock) + .filter(abstractTextContainer -> abstractTextContainer.intersectsY(atc)) + .map(abstractTextContainer -> (TextPageBlock) abstractTextContainer) + .toList(); + } + + + private void addSectionNodeToPageNode(DocumentGraphFactory.Context context, Section section, Integer pageNumber) { + + Page page = context.getPage(pageNumber); + page.getMainBody().add(section); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/TableNodeFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/TableNodeFactory.java new file mode 100644 index 0000000..0124ec1 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/TableNodeFactory.java @@ -0,0 +1,136 @@ +package com.knecon.fforesight.service.layoutparser.processor.factory; + +import static java.util.Collections.emptyList; + +import java.util.Collection; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.GenericSemanticNode; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Table; +import 
com.knecon.fforesight.service.layoutparser.processor.graph.nodes.TableCell; +import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; +import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations; + +import lombok.experimental.UtilityClass; + +@UtilityClass +public class TableNodeFactory { + + public final double TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD = 0.05; + + + public void addTable(GenericSemanticNode parentNode, List tablesToMerge, DocumentGraphFactory.Context context) { + + setPageNumberInCells(tablesToMerge); + Set pages = tablesToMerge.stream().map(AbstractPageBlock::getPage).map(context::getPage).collect(Collectors.toSet()); + List> mergedRows = tablesToMerge.stream().map(TablePageBlock::getRows).flatMap(Collection::stream).toList(); + Table table = Table.builder().documentTree(context.getDocumentTree()).numberOfCols(mergedRows.get(0).size()).numberOfRows(mergedRows.size()).build(); + + pages.forEach(page -> addTableToPage(page, parentNode, table)); + + List treeId = context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, table); + table.setTreeId(treeId); + addTableCells(mergedRows, table, context); + + ifTableHasNoHeadersSetFirstRowAsHeaders(table); + } + + + private void setPageNumberInCells(List tablesToMerge) { + + // For some reason I can't figure out, in some table cells, the ClassificationTextBlocks have 0 as page number + // So I am fixing this here, but this should actually be fixed upstream. 
+ tablesToMerge.forEach(table -> table.getRows() + .stream() + .flatMap(Collection::stream) + .peek(cell -> cell.setPageNumber(table.getPage())) + .forEach(cell -> setPageNumberInTextBlocksWithPageNumberSetTo0(table, cell))); + } + + + private void setPageNumberInTextBlocksWithPageNumberSetTo0(TablePageBlock table, Cell cell) { + + cell.getTextBlocks().stream()// + .filter(tb -> tb.getPage() == 0)// + .forEach(tb -> tb.setPage(table.getPage())); + } + + + @SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong + private void addTableToPage(Page page, SemanticNode parentNode, Table table) { + + if (!page.getMainBody().contains(parentNode)) { + parentNode.getPages().add(page); + } + + page.getMainBody().add(table); + } + + + private void ifTableHasNoHeadersSetFirstRowAsHeaders(Table table) { + + if (table.streamHeaders().findAny().isEmpty()) { + table.streamRow(0).forEach(tableCellNode -> tableCellNode.setHeader(true)); + } + } + + + private void addTableCells(List> rows, Table table, DocumentGraphFactory.Context context) { + + for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) { + for (int colIndex = 0; colIndex < rows.get(rowIndex).size(); colIndex++) { + addTableCell(rows.get(rowIndex).get(colIndex), rowIndex, colIndex, table, context); + } + } + } + + + @SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong + private void addTableCell(Cell cell, int rowIndex, int colIndex, Table tableNode, DocumentGraphFactory.Context context) { + + Page page = context.getPage(cell.getPageNumber()); + + TableCell tableCell = TableCell.builder().documentTree(context.getDocumentTree()).row(rowIndex).col(colIndex).header(cell.isHeaderCell()).bBox(cell.getBounds2D()).build(); + page.getMainBody().add(tableCell); + + List treeId = context.getDocumentTree().createNewTableChildEntryAndReturnId(tableNode, tableCell); + tableCell.setTreeId(treeId); + + TextBlock textBlock; + if (cell.getTextBlocks().isEmpty()) { + 
tableCell.setLeafTextBlock(context.getTextBlockFactory().emptyTextBlock(tableNode, context, page)); /* NOTE(review): parent here is tableNode while every other branch passes tableCell - confirm this is intended */ + } else if (cell.getTextBlocks().size() == 1) { + textBlock = context.getTextBlockFactory().buildAtomicTextBlock(cell.getTextBlocks().get(0).getSequences(), tableCell, context, page); + tableCell.setLeafTextBlock(textBlock); + } else if (firstTextBlockIsHeadline(cell)) { + /* addSection removes the leading headline from the list it receives, so it must get a mutable copy; Stream.toList() returns an unmodifiable list */ + SectionNodeFactory.addSection(tableCell, cell.getTextBlocks().stream().map(tb -> (AbstractPageBlock) tb).collect(Collectors.toCollection(java.util.ArrayList::new)), emptyList(), context); + } else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) { + /* small cells: all text blocks are merged into a single atomic text block */ + List sequences = TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(cell.getTextBlocks()); + textBlock = context.getTextBlockFactory().buildAtomicTextBlock(sequences, tableCell, context, page); + tableCell.setLeafTextBlock(textBlock); + } else { + cell.getTextBlocks().forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList())); + } + } + + + private boolean cellAreaIsSmallerThanPageAreaTimesThreshold(Cell cell, Page page) { + + return cell.getArea() < TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD * page.getHeight() * page.getWidth(); + } + + + private boolean firstTextBlockIsHeadline(Cell cell) { + + return cell.getTextBlocks().get(0).isHeadline(); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/TextBlockFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/TextBlockFactory.java index 46a950a..caf01f9 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/TextBlockFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/TextBlockFactory.java @@ -1,79 +1,53 @@ package
com.knecon.fforesight.service.layoutparser.processor.factory; -import java.util.Collections; import java.util.List; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.concurrent.atomic.AtomicLong; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode; +import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.AtomicTextBlock; +import lombok.AccessLevel; +import lombok.experimental.FieldDefaults; +@FieldDefaults(level = AccessLevel.PRIVATE) public class TextBlockFactory { - AtomicInteger stringOffset; - AtomicLong textBlockIdx; + int stringOffset; + long textBlockIdx; - public TextBlockFactory() { + public AtomicTextBlock buildAtomicTextBlock(List sequences, SemanticNode parent, DocumentGraphFactory.Context context, Page page) { - stringOffset = new AtomicInteger(); - textBlockIdx = new AtomicLong(); + Integer numberOnPage = context.getAndIncrementTextBlockNumberOnPage(page); + return buildAtomicTextBlock(sequences, parent, numberOnPage, page); } - public AtomicTextBlock buildAtomicTextBlock(List sequences, SemanticNode parent, DocumentGraphFactory.Context context, PageNode page) { + public AtomicTextBlock buildAtomicTextBlock(List sequences, SemanticNode parent, Integer numberOnPage, Page page) { - Integer numberOnPage = 
context.pages().get(page).getAndIncrement(); - return buildAtomicTextBlock(sequences, parent, context, numberOnPage, page); + SearchTextWithTextPositionDto searchTextWithTextPositionDto = SearchTextWithTextPositionFactory.buildSearchTextToTextPositionModel(sequences); + int offset = stringOffset; + stringOffset += searchTextWithTextPositionDto.getSearchText().length(); + long idx = textBlockIdx; + textBlockIdx++; + return AtomicTextBlock.fromSearchTextWithTextPositionDto(searchTextWithTextPositionDto, parent, offset, idx, numberOnPage, page); } - public AtomicTextBlock buildAtomicTextBlock(List sequences, - SemanticNode parent, - DocumentGraphFactory.Context context, - Integer numberOnPage, - PageNode page) { + public AtomicTextBlock emptyTextBlock(SemanticNode parent, DocumentGraphFactory.Context context, Page page) { - SearchTextWithTextPositionModel searchTextWithTextPositionModel = SearchTextWithTextPositionFactory.buildSearchTextToTextPositionModel(sequences); - int offset = stringOffset.getAndAdd(searchTextWithTextPositionModel.getSearchText().length()); - - return AtomicTextBlock.builder() - .id(textBlockIdx.getAndIncrement()) - .parent(parent) - .searchText(searchTextWithTextPositionModel.getSearchText()) - .numberOnPage(numberOnPage) - .page(page) - .lineBreaks(searchTextWithTextPositionModel.getLineBreaks()) - .positions(searchTextWithTextPositionModel.getPositions()) - .stringIdxToPositionIdx(searchTextWithTextPositionModel.getStringCoordsToPositionCoords()) - .boundary(new Boundary(offset, offset + searchTextWithTextPositionModel.getSearchText().length())) - .build(); + long idx = textBlockIdx; + textBlockIdx++; + return AtomicTextBlock.empty(idx, stringOffset, page, context.getAndIncrementTextBlockNumberOnPage(page), parent); } - public AtomicTextBlock emptyTextBlock(SemanticNode parent, DocumentGraphFactory.Context context, PageNode page) { + public AtomicTextBlock emptyTextBlock(SemanticNode parent, Integer numberOnPage, Page page) { - return 
emptyTextBlock(parent, context.pages().get(page).getAndIncrement(), page); - } - - - public AtomicTextBlock emptyTextBlock(SemanticNode parent, Integer numberOnPage, PageNode page) { - - return AtomicTextBlock.builder() - .id(textBlockIdx.getAndIncrement()) - .boundary(new Boundary(stringOffset.get(), stringOffset.get())) - .searchText("") - .lineBreaks(Collections.emptyList()) - .page(page) - .numberOnPage(numberOnPage) - .stringIdxToPositionIdx(Collections.emptyList()) - .positions(Collections.emptyList()) - .parent(parent) - .build(); + long idx = textBlockIdx; + textBlockIdx++; + return AtomicTextBlock.empty(idx, stringOffset, page, numberOnPage, parent); } } diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/Boundary.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/Boundary.java similarity index 61% rename from layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/Boundary.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/Boundary.java index ece48e5..463b7a6 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/Boundary.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/Boundary.java @@ -1,11 +1,18 @@ -package com.knecon.fforesight.service.layoutparser.internal.api.graph; +package com.knecon.fforesight.service.layoutparser.processor.graph; +import static java.lang.String.format; + +import java.util.Collection; import java.util.LinkedList; import java.util.List; +import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; + 
+import lombok.EqualsAndHashCode; import lombok.Setter; @Setter +@EqualsAndHashCode public class Boundary implements Comparable { private int start; @@ -15,7 +22,7 @@ public class Boundary implements Comparable { public Boundary(int start, int end) { if (start > end) { - throw new IllegalArgumentException(String.format("start: %d > end: %d", start, end)); + throw new IllegalArgumentException(format("start: %d > end: %d", start, end)); } this.start = start; this.end = end; @@ -55,7 +62,7 @@ public class Boundary implements Comparable { public boolean contains(int start, int end) { if (start > end) { - throw new IllegalArgumentException(String.format("start: %d > end: %d", start, end)); + throw new IllegalArgumentException(format("start: %d > end: %d", start, end)); } return this.start <= start && end <= this.end; } @@ -64,7 +71,7 @@ public class Boundary implements Comparable { public boolean containedBy(int start, int end) { if (start > end) { - throw new IllegalArgumentException(String.format("start: %d > end: %d", start, end)); + throw new IllegalArgumentException(format("start: %d > end: %d", start, end)); } return start <= this.start && this.end <= end; } @@ -78,14 +85,14 @@ public class Boundary implements Comparable { public boolean intersects(Boundary boundary) { - return contains(boundary.start()) || contains(boundary.end() - 1); + return boundary.start() < this.end && this.start < boundary.end(); } public List split(List splitIndices) { if (splitIndices.stream().anyMatch(idx -> !this.contains(idx))) { - throw new IndexOutOfBoundsException(String.format("%s splitting indices are out of range for %s", splitIndices.stream().filter(idx -> !this.contains(idx)).toList(), this)); + throw new IndexOutOfBoundsException(format("%s splitting indices are out of range for %s", splitIndices.stream().filter(idx -> !this.contains(idx)).toList(), this)); } List splitBoundaries = new LinkedList<>(); int previousIndex = start; @@ -103,7 +110,7 @@ public class Boundary 
implements Comparable { } - public static Boundary merge(List boundaries) { + public static Boundary merge(Collection boundaries) { int minStart = boundaries.stream().mapToInt(Boundary::start).min().orElseThrow(IllegalArgumentException::new); int maxEnd = boundaries.stream().mapToInt(Boundary::end).max().orElseThrow(IllegalArgumentException::new); @@ -114,7 +121,7 @@ public class Boundary implements Comparable { @Override public String toString() { - return String.format("Boundary [%d|%d)", start, end); + return format("Boundary [%d|%d)", start, end); } @@ -132,17 +139,25 @@ public class Boundary implements Comparable { } - @Override - public int hashCode() { + /** + * Shrinks the boundary, such that textBlock.subSequence(boundary) returns a string without leading or trailing whitespace. A boundary covering only whitespace collapses to an empty boundary. + * + * @param textBlock TextBlock to check whitespaces against + * @return a new, trimmed boundary that never extends beyond this boundary + */ + public Boundary trim(TextBlock textBlock) { - return toString().hashCode(); - } + int trimmedStart = this.start; + while (trimmedStart < this.end && Character.isWhitespace(textBlock.charAt(trimmedStart))) { /* stop at end, so an all-whitespace boundary cannot run into the following text */ + trimmedStart++; + } + int trimmedEnd = this.end; + while (trimmedEnd > trimmedStart && Character.isWhitespace(textBlock.charAt(trimmedEnd - 1))) { /* never step below the trimmed start */ + trimmedEnd--; + } - @Override - public boolean equals(Object object) { - - return hashCode() == object.hashCode(); + return new Boundary(trimmedStart, Math.max(trimmedEnd, trimmedStart)); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/DocumentTree.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/DocumentTree.java new file mode 100644 index 0000000..cea2557 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/DocumentTree.java @@ -0,0 +1,217 @@ +package com.knecon.fforesight.service.layoutparser.processor.graph; + +import static
java.lang.String.format; + +import java.util.Collections; +import java.util.LinkedList; +import java.util.List; +import java.util.stream.Stream; + +import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.GenericSemanticNode; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Table; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.TableCell; +import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; +import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlockCollector; + +import lombok.AccessLevel; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.EqualsAndHashCode; +import lombok.Getter; +import lombok.experimental.FieldDefaults; + +@Data +@EqualsAndHashCode +public class DocumentTree { + + private final Entry root; + + + public DocumentTree(Document document) { + + root = Entry.builder().treeId(Collections.emptyList()).children(new LinkedList<>()).node(document).build(); + } + + + public TextBlock buildTextBlock() { + + return allEntriesInOrder().map(Entry::getNode).filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector()); + } + + + public List createNewMainEntryAndReturnId(GenericSemanticNode node) { + + return createNewChildEntryAndReturnIdImpl(Collections.emptyList(), node); + } + + + public List createNewChildEntryAndReturnId(GenericSemanticNode parentNode, GenericSemanticNode node) { + + return createNewChildEntryAndReturnIdImpl(parentNode.getTreeId(), node); + } + + + public List createNewChildEntryAndReturnId(GenericSemanticNode parentNode, Table node) { + + return 
createNewChildEntryAndReturnIdImpl(parentNode.getTreeId(), node); + } + + + public List createNewTableChildEntryAndReturnId(Table parentTable, TableCell tableCell) { + + return createNewChildEntryAndReturnIdImpl(parentTable.getTreeId(), tableCell); + } + + + @SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong + private List createNewChildEntryAndReturnIdImpl(List parentId, SemanticNode node) { + + if (!entryExists(parentId)) { + throw new IllegalArgumentException(format("parentId %s does not exist!", parentId)); + } + + Entry parent = getEntryById(parentId); + List newId = new LinkedList<>(parentId); + newId.add(parent.children.size()); + parent.children.add(Entry.builder().treeId(newId).node(node).build()); + + return newId; + } + + + /* Returns true if every index of treeId addresses an existing child while walking down from the root; the empty id addresses the root itself. */ + private boolean entryExists(List treeId) { + + if (treeId.isEmpty()) { + return root != null; + } + Entry entry = root; /* start at the root so the first index is range-checked like all the others */ + for (int id : treeId) { + if (id >= entry.children.size() || 0 > id) { + return false; + } + entry = entry.children.get(id); + } + return true; + } + + + public Entry getParentEntryById(List treeId) { + + return getEntryById(getParentId(treeId)); + } + + + public boolean hasParentById(List treeId) { + + return !treeId.isEmpty(); + } + + + public Stream childNodes(List treeId) { + + return getEntryById(treeId).children.stream().map(Entry::getNode); + } + + + public Stream childNodesOfType(List treeId, NodeType nodeType) { + + return getEntryById(treeId).children.stream().filter(entry -> entry.node.getType().equals(nodeType)).map(Entry::getNode); + } + + + private static List getParentId(List treeId) { + + if (treeId.isEmpty()) { + throw new UnsupportedOperationException("Root has no parent!"); + } + if (treeId.size() < 2) { + return Collections.emptyList(); + } + return treeId.subList(0, treeId.size() - 1); + } + + + public Entry getEntryById(List treeId) { + + if (treeId.isEmpty()) { + return root; + } + Entry entry =
root.children.get(treeId.get(0)); + for (int id : treeId.subList(1, treeId.size())) { + entry = entry.children.get(id); + } + return entry; + } + + + public Stream mainEntries() { + + return root.children.stream(); + } + + + public Stream allEntriesInOrder() { + + return Stream.of(root).flatMap(DocumentTree::flatten); + } + + + public Stream allSubEntriesInOrder(List parentId) { + + return getEntryById(parentId).children.stream().flatMap(DocumentTree::flatten); + } + + + @Override + public String toString() { + + return String.join("\n", allEntriesInOrder().map(Entry::toString).toList()); + } + + + private static Stream flatten(Entry entry) { + + return Stream.concat(Stream.of(entry), entry.children.stream().flatMap(DocumentTree::flatten)); + } + + + public SemanticNode getHighestParentById(List treeId) { + + if (treeId.isEmpty()) { + return root.node; + } + return root.children.get(treeId.get(0)).node; + } + + + @Builder + @Getter + @AllArgsConstructor + @FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true) + public static class Entry { + + List treeId; + SemanticNode node; + @Builder.Default + List children = new LinkedList<>(); + + + @Override + public String toString() { + + return node.toString(); + } + + + public NodeType getType() { + + return node.getType(); + } + + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/entity/EntityType.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/entity/EntityType.java new file mode 100644 index 0000000..cbd484d --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/entity/EntityType.java @@ -0,0 +1,8 @@ +package com.knecon.fforesight.service.layoutparser.processor.graph.entity; + +public enum EntityType { + ENTITY, + RECOMMENDATION, + FALSE_POSITIVE, 
+ FALSE_RECOMMENDATION +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/entity/RedactionEntity.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/entity/RedactionEntity.java new file mode 100644 index 0000000..5a1c686 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/entity/RedactionEntity.java @@ -0,0 +1,228 @@ +package com.knecon.fforesight.service.layoutparser.processor.graph.entity; + +import java.awt.geom.Rectangle2D; +import java.util.Collection; +import java.util.Comparator; +import java.util.Deque; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Engine; +import com.knecon.fforesight.service.layoutparser.processor.graph.Boundary; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode; +import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder; + +import lombok.AccessLevel; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.EqualsAndHashCode; +import lombok.experimental.FieldDefaults; + +@Data +@Builder +@AllArgsConstructor +@FieldDefaults(level = AccessLevel.PRIVATE) +@EqualsAndHashCode(onlyExplicitlyIncluded = true) +public class RedactionEntity { + + // initial values + @EqualsAndHashCode.Include + final Boundary boundary; + @EqualsAndHashCode.Include + final String type; + @EqualsAndHashCode.Include + final EntityType entityType; + + // empty defaults + boolean redaction; + boolean removed; + boolean ignored; + boolean resized; + boolean 
skipRemoveEntitiesContainedInLarger; + boolean dictionaryEntry; + boolean dossierDictionaryEntry; + Set engines; + Set references; + @Builder.Default + Deque matchedRules = new LinkedList<>(); + String redactionReason; + String legalBasis; + + // inferred on graph insertion + @EqualsAndHashCode.Include + String value; + String textBefore; + String textAfter; + @Builder.Default + Set pages = new HashSet<>(); + List redactionPositionsPerPage; + @Builder.Default + List intersectingNodes = new LinkedList<>(); + SemanticNode deepestFullyContainingNode; + + + public static RedactionEntity initialEntityNode(Boundary boundary, String type, EntityType entityType) { + + return RedactionEntity.builder().type(type).entityType(entityType).boundary(boundary).engines(new HashSet<>()).references(new HashSet<>()).build(); + } + + + public boolean occursInNodeOfType(Class clazz) { + + return intersectingNodes.stream().anyMatch(clazz::isInstance); + } + + + public boolean occursInNode(SemanticNode semanticNode) { + + return intersectingNodes.stream().anyMatch(node -> node.equals(semanticNode)); + } + + + public boolean isType(String type) { + + return this.type.equals(type); + } + + + public boolean isAnyType(List types) { + + return types.contains(type); + } + + + public void addIntersectingNode(SemanticNode containingNode) { + + intersectingNodes.add(containingNode); + } + + + public void removeFromGraph() { + + intersectingNodes.forEach(node -> node.getEntities().remove(this)); + pages.forEach(page -> page.getEntities().remove(this)); + intersectingNodes = new LinkedList<>(); + deepestFullyContainingNode = null; + pages = new HashSet<>(); + removed = true; + ignored = true; + } + + + public void addMatchedRule(int ruleNumber) { + + matchedRules.add(ruleNumber); + } + + + public int getMatchedRule() { + + if (matchedRules.isEmpty()) { + return 0; + } + return matchedRules.getLast(); + } + + + public List getRedactionPositionsPerPage() { + + if (redactionPositionsPerPage == null || 
redactionPositionsPerPage.isEmpty()) { + Map> rectanglesPerLinePerPage = deepestFullyContainingNode.getTextBlock().getPositionsPerPage(boundary); + + Page firstPage = rectanglesPerLinePerPage.keySet() + .stream() + .min(Comparator.comparingInt(Page::getNumber)) + .orElseThrow(() -> new RuntimeException("No Positions found on any page!")); + String id = IdBuilder.buildId(pages, rectanglesPerLinePerPage.values().stream().flatMap(Collection::stream).toList()); + redactionPositionsPerPage = rectanglesPerLinePerPage.entrySet().stream().map(entry -> buildRedactionPosition(firstPage, id, entry)).toList(); + } + return redactionPositionsPerPage; + } + + + private static RedactionPosition buildRedactionPosition(Page firstPage, String id, Map.Entry> entry) { + + if (entry.getKey().equals(firstPage)) { + return new RedactionPosition(id, entry.getKey(), entry.getValue()); + } else { + return new RedactionPosition(id + "-" + entry.getKey().getNumber(), entry.getKey(), entry.getValue()); + } + } + + + public boolean containedBy(RedactionEntity redactionEntity) { + + return this.boundary.containedBy(redactionEntity.getBoundary()); + } + + + public boolean contains(RedactionEntity redactionEntity) { + + return this.boundary.contains(redactionEntity.getBoundary()); + } + + + public boolean intersects(RedactionEntity redactionEntity) { + + return this.boundary.intersects(redactionEntity.getBoundary()); + } + + + public void addEngine(Engine engine) { + + engines.add(engine); + } + + + public void addEngines(Set engines) { + + this.engines.addAll(engines); + } + + + public void addReference(RedactionEntity reference) { + + references.add(reference); + } + + + public void addReferences(List references) { + + this.references.addAll(references); + } + + + public boolean matchesAnnotationId(String manualRedactionId) { + + return getRedactionPositionsPerPage().stream().anyMatch(entityPosition -> entityPosition.getId().equals(manualRedactionId)); + } + + + @Override + public String 
toString() { + + /* Renders as: Entity["value", boundary, pages[n, m], type = "type", EntityType.X] */ + StringBuilder sb = new StringBuilder(); + sb.append("Entity[\""); + sb.append(value); + sb.append("\", "); + sb.append(boundary); + sb.append(", pages["); + pages.forEach(page -> { + sb.append(page.getNumber()); + sb.append(", "); + }); + if (!pages.isEmpty()) { sb.delete(sb.length() - 2, sb.length()); } /* strip the trailing ", " only if a page was appended; otherwise this would eat the "s[" of "pages[" */ + sb.append("], type = \""); + sb.append(type); + sb.append("\", EntityType."); + sb.append(entityType); + sb.append("]"); + return sb.toString(); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/entity/RedactionPosition.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/entity/RedactionPosition.java new file mode 100644 index 0000000..5d050d4 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/entity/RedactionPosition.java @@ -0,0 +1,24 @@ +package com.knecon.fforesight.service.layoutparser.processor.graph.entity; + +import java.awt.geom.Rectangle2D; +import java.util.List; + +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page; + +import lombok.AccessLevel; +import lombok.AllArgsConstructor; +import lombok.Data; +import lombok.experimental.FieldDefaults; + +@Data +@AllArgsConstructor +@FieldDefaults(level = AccessLevel.PRIVATE) +public class RedactionPosition { + + final String id; + Page page; + // Each entry in this list corresponds to an entry in the redaction log, this means: + // An entity might be represented by multiple redaction log entries + List rectanglePerLine; + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Document.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Document.java new
file mode 100644 index 0000000..9ebcce6 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Document.java @@ -0,0 +1,120 @@ +package com.knecon.fforesight.service.layoutparser.processor.graph.nodes; + +import java.awt.geom.Rectangle2D; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import com.amazonaws.services.kms.model.NotFoundException; +import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType; +import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; +import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity; +import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; +import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlockCollector; + +import lombok.AccessLevel; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; +import lombok.experimental.FieldDefaults; + +@Data +@Builder +@AllArgsConstructor +@NoArgsConstructor +@FieldDefaults(level = AccessLevel.PRIVATE) +public class Document implements GenericSemanticNode { + + Set pages; + DocumentTree documentTree; + Integer numberOfPages; + TextBlock textBlock; + @Builder.Default + Set entities = new HashSet<>(); + + + @Override + public NodeType getType() { + + return NodeType.DOCUMENT; + } + + + public TextBlock getTextBlock() { + + if (textBlock == null) { + textBlock = streamTerminalTextBlocksInOrder().collect(new TextBlockCollector()); + } + return textBlock; + } + + + public List
getMainSections() { + + return streamChildrenOfType(NodeType.SECTION).map(node -> (Section) node).collect(Collectors.toList()); + } + + + public Stream streamTerminalTextBlocksInOrder() { + + return streamAllNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock); + } + + + @Override + public List getTreeId() { + + return Collections.emptyList(); + } + + + @Override + public void setTreeId(List tocId) { + + throw new UnsupportedOperationException("Document is always the root of the TablePageBlock of Contents"); + } + + + @Override + public Headline getHeadline() { + + return streamAllSubNodesOfType(NodeType.HEADLINE).map(node -> (Headline) node).findFirst().orElseThrow(() -> new NotFoundException("No Headlines found in this document!")); + } + + + private Stream streamAllNodes() { + + return documentTree.allEntriesInOrder().map(DocumentTree.Entry::getNode); + } + + + public Stream streamAllImages() { + + return streamAllSubNodesOfType(NodeType.IMAGE).map(node -> (Image) node); + } + + + @Override + public String toString() { + + return NodeType.DOCUMENT + ": " + this.getTextBlock().buildSummary(); + } + + + @Override + public Map getBBox() { + + Map bBox = new HashMap<>(); + for (Page page : pages) { + bBox.put(page, new Rectangle2D.Double(0, 0, page.getWidth(), page.getHeight())); + } + return bBox; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Footer.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Footer.java new file mode 100644 index 0000000..59813b4 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Footer.java @@ -0,0 +1,65 @@ +package com.knecon.fforesight.service.layoutparser.processor.graph.nodes; + +import java.util.HashSet; +import 
java.util.List; +import java.util.Set; + +import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType; +import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; +import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity; +import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; + +import lombok.AccessLevel; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.EqualsAndHashCode; +import lombok.NoArgsConstructor; +import lombok.experimental.FieldDefaults; + +@Data +@Builder +@AllArgsConstructor +@NoArgsConstructor +@FieldDefaults(level = AccessLevel.PRIVATE) +public class Footer implements GenericSemanticNode { + + List treeId; + TextBlock leafTextBlock; + + @EqualsAndHashCode.Exclude + DocumentTree documentTree; + + @Builder.Default + @EqualsAndHashCode.Exclude + Set entities = new HashSet<>(); + + + @Override + public NodeType getType() { + + return NodeType.FOOTER; + } + + + @Override + public boolean isLeaf() { + + return true; + } + + + @Override + public TextBlock getTextBlock() { + + return leafTextBlock; + } + + + @Override + public String toString() { + + return treeId + ": " + NodeType.FOOTER + ": " + leafTextBlock.buildSummary(); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/GenericSemanticNode.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/GenericSemanticNode.java new file mode 100644 index 0000000..fbffa94 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/GenericSemanticNode.java @@ -0,0 +1,5 @@ +package com.knecon.fforesight.service.layoutparser.processor.graph.nodes; + +public interface GenericSemanticNode extends 
SemanticNode { + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Header.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Header.java new file mode 100644 index 0000000..a9dfce2 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Header.java @@ -0,0 +1,65 @@ +package com.knecon.fforesight.service.layoutparser.processor.graph.nodes; + +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType; +import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; +import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity; +import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; + +import lombok.AccessLevel; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.EqualsAndHashCode; +import lombok.NoArgsConstructor; +import lombok.experimental.FieldDefaults; + +@Data +@Builder +@AllArgsConstructor +@NoArgsConstructor +@FieldDefaults(level = AccessLevel.PRIVATE) +public class Header implements GenericSemanticNode { + + List treeId; + TextBlock leafTextBlock; + + @EqualsAndHashCode.Exclude + DocumentTree documentTree; + + @Builder.Default + @EqualsAndHashCode.Exclude + Set entities = new HashSet<>(); + + + @Override + public boolean isLeaf() { + + return true; + } + + + @Override + public NodeType getType() { + + return NodeType.HEADER; + } + + + @Override + public TextBlock getTextBlock() { + + return leafTextBlock; + } + + + @Override + public String toString() { + + return treeId + ": " + NodeType.HEADER + ": " + leafTextBlock.buildSummary(); + } + +} diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Headline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Headline.java new file mode 100644 index 0000000..99e1adc --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Headline.java @@ -0,0 +1,72 @@ +package com.knecon.fforesight.service.layoutparser.processor.graph.nodes; + +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType; +import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; +import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity; +import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; + +import lombok.AccessLevel; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.EqualsAndHashCode; +import lombok.NoArgsConstructor; +import lombok.experimental.FieldDefaults; + +@Data +@Builder +@AllArgsConstructor +@NoArgsConstructor +@FieldDefaults(level = AccessLevel.PRIVATE) +public class Headline implements GenericSemanticNode { + + List treeId; + TextBlock leafTextBlock; + + @EqualsAndHashCode.Exclude + DocumentTree documentTree; + + @Builder.Default + @EqualsAndHashCode.Exclude + Set entities = new HashSet<>(); + + + @Override + public NodeType getType() { + + return NodeType.HEADLINE; + } + + + @Override + public boolean isLeaf() { + + return true; + } + + + @Override + public TextBlock getTextBlock() { + + return leafTextBlock; + } + + + @Override + public String toString() { + + return treeId + ": " + NodeType.HEADLINE + ": " + leafTextBlock.buildSummary(); + } + + + @Override + public Headline getHeadline() { + + 
return this; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Image.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Image.java new file mode 100644 index 0000000..7ec9926 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Image.java @@ -0,0 +1,95 @@ +package com.knecon.fforesight.service.layoutparser.processor.graph.nodes; + +import java.awt.geom.Rectangle2D; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType; +import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; +import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity; +import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; +import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlockCollector; + +import lombok.AccessLevel; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.EqualsAndHashCode; +import lombok.NoArgsConstructor; +import lombok.experimental.FieldDefaults; + +@Data +@Builder +@AllArgsConstructor +@NoArgsConstructor +@FieldDefaults(level = AccessLevel.PRIVATE) +public class Image implements GenericSemanticNode { + + List treeId; + String id; + + ImageType imageType; + boolean transparent; + Rectangle2D position; + + boolean redaction; + boolean ignored; + @Builder.Default + String redactionReason = ""; + @Builder.Default + String legalBasis = ""; + @Builder.Default + int matchedRule = -1; + + @EqualsAndHashCode.Exclude + Page page; + + @EqualsAndHashCode.Exclude + DocumentTree 
documentTree; + + @Builder.Default + @EqualsAndHashCode.Exclude + Set entities = new HashSet<>(); + + + @Override + public NodeType getType() { + + return NodeType.IMAGE; + } + + + @Override + public TextBlock getTextBlock() { + + return streamAllSubNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector()); + } + + + @Override + public Set getPages() { + + return Collections.singleton(page); + } + + + @Override + public String toString() { + + return treeId + ": " + NodeType.IMAGE + ": " + imageType.toString() + " " + position; + } + + + @Override + public Map getBBox() { + + Map bBoxPerPage = new HashMap<>(); + bBoxPerPage.put(page, position); + return bBoxPerPage; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/ImageType.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/ImageType.java new file mode 100644 index 0000000..49566d1 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/ImageType.java @@ -0,0 +1,21 @@ +package com.knecon.fforesight.service.layoutparser.processor.graph.nodes; + +public enum ImageType { + LOGO, + FORMULA, + SIGNATURE, + OTHER, + OCR; + + + public static ImageType fromString(String imageType) { + + return switch (imageType.toLowerCase()) { + case "logo" -> ImageType.LOGO; + case "formula" -> ImageType.FORMULA; + case "signature" -> ImageType.SIGNATURE; + case "ocr" -> ImageType.OCR; + default -> ImageType.OTHER; + }; + } +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Page.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Page.java new file mode 100644 index 0000000..f01cc38 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Page.java @@ -0,0 +1,87 @@ +package com.knecon.fforesight.service.layoutparser.processor.graph.nodes; + +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Set; + +import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity; +import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; +import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlockCollector; + +import lombok.AccessLevel; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.EqualsAndHashCode; +import lombok.Getter; +import lombok.Setter; +import lombok.experimental.FieldDefaults; + +@Getter +@Setter +@Builder +@AllArgsConstructor +@FieldDefaults(level = AccessLevel.PRIVATE) +public class Page { + + Integer number; + Integer height; + Integer width; + Integer rotation; + + @EqualsAndHashCode.Exclude + List mainBody; + @EqualsAndHashCode.Exclude + Header header; + @EqualsAndHashCode.Exclude + Footer footer; + + @Builder.Default + @EqualsAndHashCode.Exclude + Set entities = new HashSet<>(); + + @Builder.Default + @EqualsAndHashCode.Exclude + Set images = new HashSet<>(); + + + public static Page fromClassificationPage(ClassificationPage classificationPage) { + + return Page.builder() + .height((int) classificationPage.getPageHeight()) + .width((int) classificationPage.getPageWidth()) + .number(classificationPage.getPageNumber()) + .rotation(classificationPage.getRotation()) + .mainBody(new LinkedList<>()) + .build(); + } + + + 
public TextBlock getMainBodyTextBlock() { + + return mainBody.stream().filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector()); + } + + + @Override + public String toString() { + + return String.valueOf(number); + } + + + @Override + public int hashCode() { + + return number; + } + + + @Override + public boolean equals(Object o) { + + return o instanceof Page && o.hashCode() == this.hashCode(); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Paragraph.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Paragraph.java new file mode 100644 index 0000000..8943d56 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Paragraph.java @@ -0,0 +1,63 @@ +package com.knecon.fforesight.service.layoutparser.processor.graph.nodes; + +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType; +import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; +import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity; +import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; + +import lombok.AccessLevel; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.EqualsAndHashCode; +import lombok.experimental.FieldDefaults; + +@Data +@Builder +@AllArgsConstructor +@FieldDefaults(level = AccessLevel.PRIVATE) +public class Paragraph implements GenericSemanticNode { + + List treeId; + TextBlock leafTextBlock; + + @EqualsAndHashCode.Exclude + DocumentTree documentTree; + + @Builder.Default + @EqualsAndHashCode.Exclude + Set entities = new HashSet<>(); + + + 
@Override + public NodeType getType() { + + return NodeType.PARAGRAPH; + } + + + @Override + public boolean isLeaf() { + + return true; + } + + + @Override + public TextBlock getTextBlock() { + + return leafTextBlock; + } + + + @Override + public String toString() { + + return treeId + ": " + NodeType.PARAGRAPH + ": " + leafTextBlock.buildSummary(); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Section.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Section.java new file mode 100644 index 0000000..76e6f08 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Section.java @@ -0,0 +1,77 @@ +package com.knecon.fforesight.service.layoutparser.processor.graph.nodes; + +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType; +import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; +import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity; +import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; +import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlockCollector; + +import lombok.AccessLevel; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.EqualsAndHashCode; +import lombok.experimental.FieldDefaults; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@Data +@Builder +@AllArgsConstructor +@FieldDefaults(level = AccessLevel.PRIVATE) +public class Section implements GenericSemanticNode { + + List treeId; + + TextBlock textBlock; + @EqualsAndHashCode.Exclude + DocumentTree documentTree; + + @Builder.Default + @EqualsAndHashCode.Exclude + 
Set entities = new HashSet<>(); + + + @Override + public NodeType getType() { + + return NodeType.SECTION; + } + + + public boolean hasTables() { + + return streamAllSubNodesOfType(NodeType.TABLE).findAny().isPresent(); + } + + + @Override + public TextBlock getTextBlock() { + + if (textBlock == null) { + textBlock = streamAllSubNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector()); + } + return textBlock; + } + + + @Override + public String toString() { + + return treeId.toString() + ": " + NodeType.SECTION + ": " + this.getTextBlock().buildSummary(); + } + + + public Headline getHeadline() { + + return streamChildrenOfType(NodeType.HEADLINE)// + .map(node -> (Headline) node)// + .findFirst()// + .orElseGet(() -> getParent().getHeadline()); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/SemanticNode.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/SemanticNode.java new file mode 100644 index 0000000..a9e753f --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/SemanticNode.java @@ -0,0 +1,446 @@ +package com.knecon.fforesight.service.layoutparser.processor.graph.nodes; + +import static java.lang.String.format; + +import java.awt.geom.Rectangle2D; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType; +import com.knecon.fforesight.service.layoutparser.processor.graph.Boundary; +import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; +import 
com.knecon.fforesight.service.layoutparser.processor.graph.entity.EntityType; +import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity; +import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.AtomicTextBlock; +import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; +import com.knecon.fforesight.service.layoutparser.processor.services.RectangleTransformations; + +public interface SemanticNode { + + /** + * Returns the type of this node, such as Section, Paragraph, etc. + * + * @return NodeType of this node + */ + NodeType getType(); + + + /** + * Searches all Nodes located underneath this Node in the DocumentTree and concatenates their AtomicTextBlocks into a single TextBlock. + * So, for a Section all TextBlocks of Subsections, Paragraphs, and Tables are concatenated into a single TextBlock + * If the Node is a Leaf, the LeafTextBlock will be returned instead. + * + * @return TextBlock containing all AtomicTextBlocks that are located under this Node. + */ + TextBlock getTextBlock(); + + + /** + * Any Node maintains its own Set of Entities. + * This Set contains all Entities whose boundary intersects the boundary of this node. + * + * @return Set of all Entities associated with this Node + */ + Set getEntities(); + + + /** + * Each AtomicTextBlock is assigned a page, so to get the pages this node appears on, it collects the PageNodes from each AtomicTextBlock belonging to this node's TextBlock. + * + * @return Set of PageNodes this node appears on. + */ + default Set getPages() { + + return getTextBlock().getPages(); + } + + + /** + * Each AtomicTextBlock is assigned a page, so to get the pages for this boundary, it collects the PageNodes from each AtomicTextBlock belonging to this node's TextBlock. + * + * @return Set of PageNodes this node appears on. 
+ */ + default Set getPages(Boundary boundary) { + + if (!getBoundary().contains(boundary)) { + throw new IllegalArgumentException(format("%s which was used to query for pages is not contained in the %s of this node!", boundary, getBoundary())); + } + return getTextBlock().getPages(boundary); + } + + + default boolean isOnPage(int pageNumber) { + + return getPages().stream().anyMatch(page -> page.getNumber() == pageNumber); + } + + + /** + * Returns the DocumentTree Object. + * + * @return the DocumentTree of the Document this node belongs to + */ + DocumentTree getDocumentTree(); + + + /** + * The id is a List of Integers uniquely identifying this node in the DocumentTree. + * + * @return the DocumentTree ID + */ + List getTreeId(); + + + /** + * This should only be used during graph construction. + * + * @param tocId List of Integers + */ + void setTreeId(List tocId); + + + /** + * Traverses the Tree up, until it hits a Headline or hits a Section which will then return the first Headline from its children. + * Throws NotFoundException if no Headline is found this way + * + * @return First Headline found + */ + default Headline getHeadline() { + + return getParent().getHeadline(); + } + + + /** + * Checks if its TocId has a length greater than zero. + * + * @return boolean indicating whether this Node has a Parent in the DocumentTree + */ + default boolean hasParent() { + + return getDocumentTree().hasParentById(getTreeId()); + } + + + /** + * @return The SemanticNode representing the Parent in the DocumentTree + * throws NotFoundException, when no parent is present + */ + default SemanticNode getParent() { + + return getDocumentTree().getParentEntryById(getTreeId()).getNode(); + } + + + /** + * @return The SemanticNode which is directly underneath the document and also under which this node is. + * if this is the highest child node or the document itself, it returns itself. 
+ */ + default SemanticNode getHighestParent() { + + return getDocumentTree().getHighestParentById(getTreeId()); + } + + + /** + * Leaf means a SemanticNode has direct access to a TextBlock, by default this is false and must be overridden. + * Currently only Sections, Images, and Tables are not leaves. + * A TableCell might be a leaf depending on its area compared to the page. + * + * @return boolean, indicating if a Node has direct access to a TextBlock + */ + default boolean isLeaf() { + + return false; + } + + + /** + * Leaf means a SemanticNode has direct access to a TextBlock, by default this is false and must be overridden. + * Currently only Sections and Tables are no leaves. + * + * @return AtomicTextBlock + */ + default TextBlock getLeafTextBlock() { + + throw new UnsupportedOperationException("Only leaf Nodes have access to LeafTextBlocks!"); + } + + + /** + * Should only be used during construction of the Graph. Sets the LeafTextBlock of this SemanticNode. + * + * @param textBlock the TextBlock to set as the LeafTextBlock of this SemanticNode + */ + default void setLeafTextBlock(TextBlock textBlock) { + + throw new UnsupportedOperationException(); + } + + + /** + * Checks whether this SemanticNode has any Entity with EntityType.ENTITY of the provided type. + * + * @param type string representing the type of entity to check for + * @return true, if this SemanticNode has at least one Entity of the provided type + */ + default boolean hasEntitiesOfType(String type) { + + return getEntities().stream().filter(entity -> entity.getEntityType().equals(EntityType.ENTITY)).anyMatch(redactionEntity -> redactionEntity.getType().equals(type)); + } + + + /** + * Returns a List of Entities in this SemanticNode which are of the provided type such as "CBI_author". 
+ * + * @param type string representing the type of entities to return + * @return List of RedactionEntities of any the type + */ + default List getEntitiesOfType(String type) { + + return getEntities().stream().filter(redactionEntity -> redactionEntity.getType().equals(type)).toList(); + } + + + /** + * Returns a List of Entities in this SemanticNode which have any of the provided types such as "CBI_author". + * + * @param types A list of strings representing the types of entities to return + * @return List of RedactionEntities of any provided type + */ + default List getEntitiesOfType(List types) { + + return getEntities().stream().filter(redactionEntity -> redactionEntity.isAnyType(types)).toList(); + } + + + /** + * Each AtomicTextBlock has an index on its page, this returns the number of the first AtomicTextBlock underneath this node. + * If this node does not have any AtomicTexBlocks underneath it, e.g. an empty TableCell. It returns -1. + * + * @return Integer representing the number on the page + */ + default Integer getNumberOnPage() { + + TextBlock textBlock = getTextBlock(); + if (!textBlock.getAtomicTextBlocks().isEmpty()) { + return getTextBlock().getAtomicTextBlocks().get(0).getNumberOnPage(); + } else { + return -1; + } + } + + + /** + * Checks if the SemanticNode contains any text. + * + * @return true, if this node's TextBlock is not empty + */ + default boolean hasText() { + + return !getTextBlock().isEmpty(); + } + + + /** + * Checks whether this SemanticNode contains the provided String. + * + * @param string A String which the TextBlock might contain + * @return true, if this node's TextBlock contains the string + */ + default boolean containsString(String string) { + + return getTextBlock().getSearchText().contains(string); + } + + + /** + * Checks whether this SemanticNode contains all the provided Strings. 
+ * + * @param strings A List of Strings which the TextBlock might contain + * @return true, if this node's TextBlock contains all strings + */ + default boolean containsStrings(List strings) { + + return strings.stream().allMatch(this::containsString); + } + + + /** + * Checks whether this SemanticNode contains all the provided Strings ignoring case. + * + * @param string A String which the TextBlock might contain + * @return true, if this node's TextBlock contains the string ignoring case + */ + default boolean containsStringIgnoreCase(String string) { + + return getTextBlock().getSearchText().toLowerCase().contains(string.toLowerCase()); + } + + + /** + * Checks whether this SemanticNode contains any of the provided Strings. + * + * @param strings A List of Strings which the TextBlock might contain + * @return true, if this node's TextBlock contains any of the strings + */ + default boolean containsAnyString(List strings) { + + return strings.stream().anyMatch(this::containsString); + } + + + /** + * Checks whether this SemanticNode contains any of the provided Strings ignoring case. + * + * @param strings A List of Strings which the TextBlock might contain + * @return true, if this node's TextBlock contains any of the strings + */ + default boolean containsAnyStringIgnoreCase(List strings) { + + return strings.stream().anyMatch(this::containsStringIgnoreCase); + } + + + + /** + * This function is used during insertion of EntityNodes into the graph, it checks if the boundary of the RedactionEntity intersects or even contains the RedactionEntity. + * It sets the fields accordingly and recursively calls this function on all its children. 
+ * + * @param redactionEntity RedactionEntity, which is being inserted into the graph + */ + default void addThisToEntityIfIntersects(RedactionEntity redactionEntity) { + + TextBlock textBlock = getTextBlock(); + if (textBlock.getBoundary().intersects(redactionEntity.getBoundary())) { + if (textBlock.containsBoundary(redactionEntity.getBoundary())) { + redactionEntity.setDeepestFullyContainingNode(this); + } + + redactionEntity.addIntersectingNode(this); + streamChildren().filter(semanticNode -> semanticNode.getBoundary().intersects(redactionEntity.getBoundary())) + .forEach(node -> node.addThisToEntityIfIntersects(redactionEntity)); + } + } + + + /** + * Streams all children located directly underneath this node in the DocumentTree. + * + * @return Stream of all children + */ + default Stream streamChildren() { + + return getDocumentTree().childNodes(getTreeId()); + } + + + /** + * Streams all children located directly underneath this node in the DocumentTree of the provided type. + * + * @return Stream of all children + */ + default Stream streamChildrenOfType(NodeType nodeType) { + + return getDocumentTree().childNodesOfType(getTreeId(), nodeType); + } + + + /** + * Recursively streams all SemanticNodes located underneath this node in the DocumentTree in order. + * + * @return Stream of all SubNodes + */ + default Stream streamAllSubNodes() { + + return getDocumentTree().allSubEntriesInOrder(getTreeId()).map(DocumentTree.Entry::getNode); + } + + + /** + * Recursively streams all SemanticNodes of the provided type located underneath this node in the DocumentTree in order. + * + * @return Stream of all SubNodes + */ + default Stream streamAllSubNodesOfType(NodeType nodeType) { + + return getDocumentTree().allSubEntriesInOrder(getTreeId()).filter(entry -> entry.getType().equals(nodeType)).map(DocumentTree.Entry::getNode); + } + + + /** + * The Boundary is the start and end string offsets in the reading order of the document. 
+ * + * @return Boundary of this Node's TextBlock + */ + default Boundary getBoundary() { + + return getTextBlock().getBoundary(); + } + + + /** + * If this Node is a Leaf it will calculate the boundingBox of its LeafTextBlock, otherwise it will calculate the Union of the BoundingBoxes of all its Children. + * If called on the Document, it will return the cropbox of each page + * + * @return Rectangle2D fully encapsulating this Node for each page. + */ + default Map getBBox() { + + Map bBoxPerPage = new HashMap<>(); + if (isLeaf()) { + return getBBoxFromLeafTextBlock(bBoxPerPage); + } + + return getBBoxFromChildren(bBoxPerPage); + } + + + /** + * Checks whether the Bounding Box of this SemanticNode contains the provided rectangle on the provided page. + * + * @param rectangle2D The rectangle to check if it is contained + * @param pageNumber The Page number on which the rectangle should be checked + * @return true, if a Page with the provided number is among this node's pages and this node's Bounding Box on that page contains the rectangle; false, if the page is not among this node's pages or the rectangle is not contained + */ + default boolean containsRectangle(Rectangle2D rectangle2D, Integer pageNumber) { + + Page helperPage = Page.builder().number(pageNumber).build(); + if (!getPages().contains(helperPage)) { + return false; + } + return getBBox().get(helperPage).contains(rectangle2D); + } + + + /** + * TODO: this produces unwanted results for sections spanning multiple columns. 
+ * + * @param bBoxPerPage initial empty BoundingBox + * @return The union of the BoundingBoxes of all children + */ + private Map getBBoxFromChildren(Map bBoxPerPage) { + + return streamChildren().map(SemanticNode::getBBox).reduce((map1, map2) -> { + map1.forEach((page, rectangle) -> map2.merge(page, rectangle, (rect1, rect2) -> rect1.createUnion(rect2).getBounds2D())); + return map2; + }).orElse(bBoxPerPage); + } + + + /** + * @param bBoxPerPage initial empty BoundingBox + * @return The union of all BoundingBoxes of the TextBlock of this node + */ + private Map getBBoxFromLeafTextBlock(Map bBoxPerPage) { + + Map> atomicTextBlockPerPage = getTextBlock().getAtomicTextBlocks().stream().collect(Collectors.groupingBy(AtomicTextBlock::getPage)); + atomicTextBlockPerPage.forEach((page, atbs) -> bBoxPerPage.put(page, RectangleTransformations.bBoxUnionAtomicTextBlock(atbs))); + return bBoxPerPage; + } + +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Table.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Table.java new file mode 100644 index 0000000..37c55bd --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Table.java @@ -0,0 +1,316 @@ +package com.knecon.fforesight.service.layoutparser.processor.graph.nodes; + +import static java.lang.String.format; + +import java.util.Collection; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.stream.IntStream; +import java.util.stream.Stream; + +import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType; +import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; +import 
com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity; +import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; +import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlockCollector; + +import lombok.AccessLevel; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.EqualsAndHashCode; +import lombok.experimental.FieldDefaults; + +@Data +@Builder +@AllArgsConstructor +@FieldDefaults(level = AccessLevel.PRIVATE) +public class Table implements SemanticNode { + + List treeId; + DocumentTree documentTree; + + int numberOfRows; + int numberOfCols; + + TextBlock textBlock; + + @Builder.Default + @EqualsAndHashCode.Exclude + Set entities = new HashSet<>(); + + + /** + * Streams all entities in this table, that appear in a row, which contains all the provided strings ignoring case. + * + * @param strings Strings to check whether a row contains all of them, compared ignoring case + * @return Stream of all entities in this table, that appear in a row, which contains all the provided strings ignoring case + */ + public Stream streamEntitiesWhereRowContainsStringsIgnoreCase(List strings) { + + return IntStream.range(0, numberOfRows) + .boxed() + .filter(row -> rowContainsStringsIgnoreCase(row, strings)) + .flatMap(this::streamRow) + .map(TableCell::getEntities) + .flatMap(Collection::stream); + } + + + /** + * Checks whether the specified row contains all the provided strings. 
+ * + * @param row the row to check as an Integer, must be smaller than numberOfRows + * @param strings a list of strings to check for + * @return true, if all strings appear in the provided row + */ + public boolean rowContainsStringsIgnoreCase(Integer row, List strings) { + + String rowText = streamRow(row).map(TableCell::getTextBlock).collect(new TextBlockCollector()).getSearchText().toLowerCase(); + return strings.stream().map(String::toLowerCase).allMatch(rowText::contains); + } + + + /** + * Streams all entities which appear in a row where at least one cell has the provided header and the provided value. + * + * @param header the header value to search for + * @param value the string which the table cell should contain + * @return a stream of all entities, which appear in a row where at least one cell has the provided header and the provided value. + */ + public Stream streamEntitiesWhereRowHasHeaderAndValue(String header, String value) { + + List vertebrateStudyCols = streamHeaders().filter(headerNode -> headerNode.containsString(header)).map(TableCell::getCol).toList(); + return streamTableCells().filter(tableCellNode -> vertebrateStudyCols.stream() + .anyMatch(vertebrateStudyCol -> getCell(tableCellNode.getRow(), vertebrateStudyCol).containsString(value))).map(TableCell::getEntities).flatMap(Collection::stream); + } + + + /** + * Streams all entities which appear in a row where at least one cell has the provided header and any provided value. + * + * @param header the header value to search for + * @param values the strings which the table cell should contain + * @return a stream of all entities, which appear in a row where at least one cell has the provided header and any provided value. 
+ */ + public Stream streamEntitiesWhereRowHasHeaderAndAnyValue(String header, List values) { + + List colsWithHeader = streamHeaders().filter(headerNode -> headerNode.containsString(header)).map(TableCell::getCol).toList(); + return streamTableCells().filter(tableCellNode -> colsWithHeader.stream() + .anyMatch(colWithHeader -> getCell(tableCellNode.getRow(), colWithHeader).containsAnyString(values))).map(TableCell::getEntities).flatMap(Collection::stream); + } + + + /** + * Streams all entities in this table, that appear in a row, which contains at least one entity with any of the provided types. + * + * @param types type strings to check whether a row contains an entity like them + * @return Stream of all entities in this table, that appear in a row, which contains at least one entity with any of the provided types. + */ + public Stream streamEntitiesWhereRowContainsEntitiesOfType(List types) { + + List rowsWithEntityOfType = IntStream.range(0, numberOfRows) + .boxed() + .filter(rowNumber -> streamEntityTypesInRow(rowNumber).anyMatch(existingType -> types.stream().anyMatch(typeToCheck -> typeToCheck.equals(existingType)))) + .toList(); + + return rowsWithEntityOfType.stream().flatMap(this::streamRow).map(TableCell::getEntities).flatMap(Collection::stream); + } + + + /** + * Streams all entities in this table, that appear in a row, which does not contain any entity with any of the provided types. + * + * @param types type strings to check whether a row doesn't contain an entity like it + * @return Stream of all entities in this table, that appear in a row, which does not contain any entity with any of the provided types. 
+ */ + public Stream streamEntitiesWhereRowContainsNoEntitiesOfType(List types) { + + List rowsWithNoEntityOfType = IntStream.range(0, numberOfRows) + .boxed() + .filter(rowNumber -> streamEntityTypesInRow(rowNumber).noneMatch(existingType -> types.stream().anyMatch(typeToCheck -> typeToCheck.equals(existingType)))) + .toList(); + + return rowsWithNoEntityOfType.stream().flatMap(this::streamRow).map(TableCell::getEntities).flatMap(Collection::stream); + } + + + private Stream streamEntityTypesInRow(Integer rowNumber) { + + return streamRow(rowNumber).map(TableCell::getEntities).flatMap(Collection::stream).map(RedactionEntity::getType).distinct(); + } + + + /** + * Returns the TableCell at the provided zero-based row and column location; throws an IllegalArgumentException if the location lies outside the table, instead of resolving it to a different cell. + * + * @param row int representing the row, must be at least 0 and smaller than numberOfRows + * @param col int representing the col, must be at least 0 and smaller than numberOfCols + * @return TableCell at the provided location in the table + */ + public TableCell getCell(int row, int col) { + + if (row < 0 || row >= numberOfRows || col < 0 || col >= numberOfCols) { + throw new IllegalArgumentException(format("row %d, col %d is out of bounds for number of rows of %d and number of cols %d", row, col, numberOfRows, numberOfCols)); + } + int idx = row * numberOfCols + col; + return (TableCell) documentTree.getEntryById(treeId).getChildren().get(idx).getNode(); + } + + + /** + * Streams all TableCells in this Table row-wise. + * + * @return Stream of all TableCells + */ + public Stream streamTableCells() { + + return streamChildrenOfType(NodeType.TABLE_CELL).map(node -> (TableCell) node); + } + + + /** + * Streams all TableCells in this Table which have the provided header row-wise. 
+ * + * @return Stream of all TableCells which have the provided header + */ + public Stream streamTableCellsWithHeader(String header) { + + return streamHeaders().filter(tableCellNode -> tableCellNode.getTextBlock().getSearchText().contains(header)) + .map(TableCell::getCol) + .flatMap(this::streamCol) + .filter(tableCellNode -> !tableCellNode.isHeader()); + } + + + /** + * Streams all TableCells belonging to the provided column from top down. + * + * @param col int representing the column + * @return Stream of all TableCell in the provided column + */ + public Stream streamCol(int col) { + + return IntStream.range(0, numberOfRows).boxed().map(row -> getCell(row, col)); + } + + + /** + * Streams all TableCells belonging to the provided row from left to right. + * + * @param row int representing the row + * @return Stream of all TableCell in the provided row + */ + public Stream streamRow(int row) { + + return IntStream.range(0, numberOfCols).boxed().map(col -> getCell(row, col)); + } + + + /** + * Streams all TableCells row-wise and filters them with header == true. + * + * @return Stream of all TableCells with header == true + */ + public Stream streamHeaders() { + + return streamTableCells().filter(TableCell::isHeader); + } + + + /** + * Streams all TableCells of the provided row and column and filters them with header == true. + * + * @param row int representing the row + * @param col int representing the column + * @return Stream of all TableCells with header == true in the provided row or col + */ + public Stream streamHeadersForCell(int row, int col) { + + return Stream.concat(streamRow(row), streamCol(col)).filter(TableCell::isHeader); + } + + + /** + * Streams all Headers and checks if any equal the provided string. 
+ * + * @param header string to check the headers for + * @return true, if at least one header equals the provided string + */ + public boolean hasHeader(String header) { + + return streamHeaders().anyMatch(tableCellNode -> tableCellNode.getTextBlock().getSearchText().strip().equals(header)); + } + + + /** + * Checks if this table has a column with the provided header and any of the table cells in that column contain the provided value. + * + * @param header string to find header cells + * @param value string to check cells with provided header + * @return true, if this table has a column with the provided header and any of the table cells in that column contain the provided value + */ + public boolean hasRowWithHeaderAndValue(String header, String value) { + + return streamTableCellsWithHeader(header).anyMatch(tableCellNode -> tableCellNode.containsString(value)); + } + + + /** + * Checks if this table has a column with the provided header and any of the table cells in that column contains any of the provided values. + * + * @param header string to find header cells + * @param values List of strings to check cells with provided header + * @return true, if this table has a column with the provided header and any of the table cells in that column contains any of the provided values. + */ + public boolean hasRowWithHeaderAndAnyValue(String header, List values) { + + return streamTableCellsWithHeader(header).anyMatch(tableCellNode -> tableCellNode.containsAnyString(values)); + } + + + /** + * Finds all entities of the provided type, which appear in the same row that the provided entity appears in. + * + * @param type the type of entities to search for + * @param redactionEntity the entity, which appears in the row to search + * @return List of all entities of the provided type, which appear in the same row that the provided entity appears in. 
+ */ + public List getEntitiesOfTypeInSameRow(String type, RedactionEntity redactionEntity) { + + return redactionEntity.getIntersectingNodes() + .stream() + .filter(node -> node instanceof TableCell) + .map(node -> (TableCell) node) + .flatMap(tableCellNode -> streamRow(tableCellNode.getRow())) + .map(cell -> cell.getEntitiesOfType(type)) + .flatMap(Collection::stream) + .toList(); + } + + + @Override + public NodeType getType() { + + return NodeType.TABLE; + } + + + @Override + public TextBlock getTextBlock() { + + if (textBlock == null) { + textBlock = streamAllSubNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector()); + } + return textBlock; + } + + + @Override + public String toString() { + + return treeId.toString() + ": " + NodeType.TABLE + ": #cols: " + numberOfCols + ", #rows: " + numberOfRows + ", " + this.getTextBlock().buildSummary(); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/TableCell.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/TableCell.java new file mode 100644 index 0000000..1a4f8a3 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/TableCell.java @@ -0,0 +1,91 @@ +package com.knecon.fforesight.service.layoutparser.processor.graph.nodes; + +import java.awt.geom.Rectangle2D; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType; +import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; +import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity; +import 
com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; +import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlockCollector; + +import lombok.AccessLevel; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.EqualsAndHashCode; +import lombok.experimental.FieldDefaults; + +@Data +@Builder +@AllArgsConstructor +@FieldDefaults(level = AccessLevel.PRIVATE) +public class TableCell implements GenericSemanticNode { + + List treeId; + int row; + int col; + boolean header; + + Rectangle2D bBox; + + TextBlock leafTextBlock; + + TextBlock textBlock; + + @EqualsAndHashCode.Exclude + DocumentTree documentTree; + + @Builder.Default + @EqualsAndHashCode.Exclude + Set entities = new HashSet<>(); + + + @Override + public Map getBBox() { + + Map bBoxPerPage = new HashMap<>(); + getPages().forEach(page -> bBoxPerPage.put(page, bBox)); + return bBoxPerPage; + } + + + @Override + public NodeType getType() { + + return NodeType.TABLE_CELL; + } + + + @Override + public boolean isLeaf() { + + return getDocumentTree().getEntryById(getTreeId()).getChildren().isEmpty(); + } + + + @Override + public TextBlock getTextBlock() { + + if (isLeaf()) { + return leafTextBlock; + } + + if (textBlock == null) { + textBlock = streamAllSubNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector()); + } + return textBlock; + } + + + @Override + public String toString() { + + return treeId + ": " + NodeType.TABLE_CELL + ": " + this.getTextBlock().buildSummary(); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/AtomicTextBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/AtomicTextBlock.java new file mode 100644 index 0000000..37eaf19 --- /dev/null +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/AtomicTextBlock.java @@ -0,0 +1,215 @@ +package com.knecon.fforesight.service.layoutparser.processor.graph.textblock; + +import static java.lang.String.format; + +import java.awt.geom.Rectangle2D; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData; +import com.knecon.fforesight.service.layoutparser.processor.factory.SearchTextWithTextPositionDto; +import com.knecon.fforesight.service.layoutparser.processor.graph.Boundary; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; + +import lombok.AccessLevel; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.EqualsAndHashCode; +import lombok.experimental.FieldDefaults; + +@Data +@Builder +@AllArgsConstructor +@FieldDefaults(level = AccessLevel.PRIVATE) +public class AtomicTextBlock implements TextBlock { + + Long id; + Integer numberOnPage; + Page page; + + //string coordinates + Boundary boundary; + String searchText; + List lineBreaks; + + //position coordinates + List stringIdxToPositionIdx; + List positions; + + @EqualsAndHashCode.Exclude + SemanticNode parent; + + + @Override + public int numberOfLines() { + + return lineBreaks.size() + 1; + } + + + public static AtomicTextBlock fromSearchTextWithTextPositionDto(SearchTextWithTextPositionDto searchTextWithTextPositionDto, + SemanticNode parent, + int stringOffset, + Long textBlockIdx, + Integer 
numberOnPage, + Page page) { + + return AtomicTextBlock.builder() + .id(textBlockIdx) + .parent(parent) + .searchText(searchTextWithTextPositionDto.getSearchText()) + .numberOnPage(numberOnPage) + .page(page) + .lineBreaks(searchTextWithTextPositionDto.getLineBreaks()) + .positions(searchTextWithTextPositionDto.getPositions()) + .stringIdxToPositionIdx(searchTextWithTextPositionDto.getStringCoordsToPositionCoords()) + .boundary(new Boundary(stringOffset, stringOffset + searchTextWithTextPositionDto.getSearchText().length())) + .build(); + } + + + public static AtomicTextBlock empty(Long textBlockIdx, int stringOffset, Page page, int numberOnPage, SemanticNode parent) { + + return AtomicTextBlock.builder() + .id(textBlockIdx) + .boundary(new Boundary(stringOffset, stringOffset)) + .searchText("") + .lineBreaks(Collections.emptyList()) + .page(page) + .numberOnPage(numberOnPage) + .stringIdxToPositionIdx(Collections.emptyList()) + .positions(Collections.emptyList()) + .parent(parent) + .build(); + } + + + public static AtomicTextBlock fromAtomicTextBlockData(AtomicTextBlockData atomicTextBlockData, + AtomicPositionBlockData atomicPositionBlockData, + SemanticNode parent, + Page page) { + + return AtomicTextBlock.builder() + .id(atomicTextBlockData.getId()) + .numberOnPage(atomicTextBlockData.getNumberOnPage()) + .page(page) + .boundary(new Boundary(atomicTextBlockData.getStart(), atomicTextBlockData.getEnd())) + .searchText(atomicTextBlockData.getSearchText()) + .lineBreaks(Arrays.stream(atomicTextBlockData.getLineBreaks()).boxed().toList()) + .stringIdxToPositionIdx(Arrays.stream(atomicPositionBlockData.getStringIdxToPositionIdx()).boxed().toList()) + .positions(toRectangle2DList(atomicPositionBlockData.getPositions())) + .parent(parent) + .build(); + } + + + private static List toRectangle2DList(float[][] positions) { + + return Arrays.stream(positions).map(floatArr -> (Rectangle2D) new Rectangle2D.Float(floatArr[0], floatArr[1], floatArr[2], 
floatArr[3])).toList(); + } + + + public CharSequence getLine(int lineNumber) { + + if (lineNumber >= numberOfLines() || lineNumber < 0) { + throw new IndexOutOfBoundsException(format("line %d out of range for AtomicTextBlock with %d lines", lineNumber, numberOfLines())); + } + if (lineNumber == 0) { // without any line break the first line is the only line and ends at boundary.end() + return subSequence(boundary.start(), lineBreaks.isEmpty() ? boundary.end() : lineBreaks.get(0) + boundary.start()); + } else if (lineNumber == numberOfLines() - 1) { + return subSequence(lineBreaks.get(lineBreaks.size() - 1) + boundary.start(), boundary.end()); + } + return subSequence(lineBreaks.get(lineNumber - 1) + boundary.start(), lineBreaks.get(lineNumber) + boundary.start()); + } + + + @Override + public List getAtomicTextBlocks() { + + return List.of(this); + } + + + @Override + public int getNextLinebreak(int fromIndex) { + + return lineBreaks.stream()// + .filter(linebreak -> linebreak > fromIndex - boundary.start()) // + .findFirst() // + .orElse(searchText.length()) + boundary.start(); + } + + + @Override + public int getPreviousLinebreak(int fromIndex) { + + return lineBreaks.stream()// + .filter(linebreak -> linebreak <= fromIndex - boundary.start())// + .reduce((a, b) -> b)// + .orElse(0) + boundary.start(); + } + + + @Override + public Rectangle2D getPosition(int stringIdx) { + + return positions.get(stringIdxToPositionIdx.get(stringIdx - boundary.start())); + } + + + @Override + public List getPositions(Boundary stringBoundary) { + + if (!containsBoundary(stringBoundary)) { + throw new IndexOutOfBoundsException(format("%s is out of bounds for %s", stringBoundary, this.boundary)); + } + if (stringBoundary.length() == 0) { + return Collections.emptyList(); + } + + int startPositionIdx = stringIdxToPositionIdx.get(stringBoundary.start() - this.boundary.start()); + + if (stringBoundary.end() == this.boundary.end()) { + return positions.subList(startPositionIdx, 
positions.size()); + } + + return positions.subList(startPositionIdx, stringIdxToPositionIdx.get(stringBoundary.end() - 
this.boundary.start())); + + } + + + public Map> getPositionsPerPage(Boundary stringBoundary) { + + List rectanglesPerLine = stringBoundary.split(getAllLineBreaksInBoundary(stringBoundary)) + .stream() + .map(this::getPositions) + .map(RectangleTransformations::rectangleUnionWithGaps) + .flatMap(Collection::stream) + .toList(); + Map> rectanglePerLinePerPage = new HashMap<>(); + rectanglePerLinePerPage.put(page, rectanglesPerLine); + return rectanglePerLinePerPage; + } + + + private List getAllLineBreaksInBoundary(Boundary boundary) { + + return getLineBreaks().stream().map(linebreak -> linebreak + this.boundary.start()).filter(boundary::contains).toList(); + } + + + @Override + public String toString() { + + return searchText; + } + +} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/textblock/ConcatenatedTextBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/ConcatenatedTextBlock.java similarity index 70% rename from layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/textblock/ConcatenatedTextBlock.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/ConcatenatedTextBlock.java index 1b4ab75..69e0473 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/textblock/ConcatenatedTextBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/ConcatenatedTextBlock.java @@ -1,14 +1,17 @@ -package com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock; +package 
com.knecon.fforesight.service.layoutparser.processor.graph.textblock; + +import static java.lang.String.format; import java.awt.geom.Rectangle2D; +import java.util.Collections; +import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; -import java.util.stream.Collectors; +import java.util.stream.Stream; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityPosition; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode; +import com.knecon.fforesight.service.layoutparser.processor.graph.Boundary; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page; import lombok.AccessLevel; import lombok.Data; @@ -23,6 +26,12 @@ public class ConcatenatedTextBlock implements TextBlock { Boundary boundary; + public static ConcatenatedTextBlock empty() { + + return new ConcatenatedTextBlock(Collections.emptyList()); + } + + public ConcatenatedTextBlock(List atomicTextBlocks) { this.atomicTextBlocks = new LinkedList<>(); @@ -44,10 +53,11 @@ public class ConcatenatedTextBlock implements TextBlock { boundary.setStart(textBlock.getBoundary().start()); boundary.setEnd(textBlock.getBoundary().end()); } else if (boundary.end() != textBlock.getBoundary().start()) { - throw new UnsupportedOperationException(String.format("Can only concat consecutive TextBlocks, trying to concat %s and %s", boundary, textBlock.getBoundary())); + throw new UnsupportedOperationException(format("Can only concat consecutive TextBlocks, trying to concat %s and %s", boundary, textBlock.getBoundary())); } this.atomicTextBlocks.addAll(textBlock.getAtomicTextBlocks()); boundary.setEnd(textBlock.getBoundary().end()); + this.searchText = null; return this; } @@ -135,38 +145,34 @@ public class ConcatenatedTextBlock implements TextBlock { @Override - public List getEntityPositionsPerPage(Boundary stringBoundary) { + 
public Map> getPositionsPerPage(Boundary stringBoundary) { List textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringBoundary); if (textBlocks.size() == 1) { - return textBlocks.get(0).getEntityPositionsPerPage(stringBoundary); + return textBlocks.get(0).getPositionsPerPage(stringBoundary); } AtomicTextBlock firstTextBlock = textBlocks.get(0); - List positions = new LinkedList<>(firstTextBlock.getEntityPositionsPerPage(new Boundary(stringBoundary.start(), firstTextBlock.getBoundary().end()))); + Map> rectanglesPerLinePerPage = firstTextBlock.getPositionsPerPage(new Boundary(stringBoundary.start(), firstTextBlock.getBoundary().end())); for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) { - positions.addAll(textBlock.getEntityPositionsPerPage(textBlock.getBoundary())); + rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage, textBlock.getPositionsPerPage(textBlock.getBoundary())); } AtomicTextBlock lastTextBlock = textBlocks.get(textBlocks.size() - 1); - positions.addAll(lastTextBlock.getEntityPositionsPerPage(new Boundary(lastTextBlock.getBoundary().start(), stringBoundary.end()))); + rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage, + lastTextBlock.getPositionsPerPage(new Boundary(lastTextBlock.getBoundary().start(), stringBoundary.end()))); - return mergeEntityPositionsWithSamePageNode(positions); + return rectanglesPerLinePerPage; } - private List mergeEntityPositionsWithSamePageNode(List positions) { - - Map> entityPositionsPerPage = positions.stream().collect(// - Collectors.groupingBy(EntityPosition::getPageNode, // - Collectors.flatMapping(entityPosition -> entityPosition.getRectanglePerLine().stream(), Collectors.toList()))); - - return entityPositionsPerPage.entrySet().stream()// - .map(entry -> EntityPosition.builder().pageNode(entry.getKey()).rectanglePerLine(entry.getValue()).build())// - .toList(); + private Map> 
mergeEntityPositionsWithSamePageNode(Map> map1, Map> map2) { + Map> mergedMap = new HashMap<>(map1); + map2.forEach((pageNode, rectangles) -> mergedMap.merge(pageNode, rectangles, (l1, l2) -> Stream.concat(l1.stream(), l2.stream()).toList())); + return mergedMap; } diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/textblock/TextBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/TextBlock.java similarity index 73% rename from layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/textblock/TextBlock.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/TextBlock.java index 3312650..34a0f7a 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/textblock/TextBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/TextBlock.java @@ -1,15 +1,17 @@ -package com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock; +package com.knecon.fforesight.service.layoutparser.processor.graph.textblock; + +import static java.lang.String.format; import java.awt.geom.Rectangle2D; import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.Map; import java.util.Set; import java.util.stream.Collectors; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityPosition; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode; +import 
com.knecon.fforesight.service.layoutparser.processor.graph.Boundary; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page; public interface TextBlock extends CharSequence { @@ -37,7 +39,7 @@ public interface TextBlock extends CharSequence { List getPositions(Boundary stringBoundary); - List getEntityPositionsPerPage(Boundary stringBoundary); + Map> getPositionsPerPage(Boundary stringBoundary); int numberOfLines(); @@ -49,12 +51,21 @@ public interface TextBlock extends CharSequence { } - default Set getPages() { + default Set getPages() { return getAtomicTextBlocks().stream().map(AtomicTextBlock::getPage).collect(Collectors.toUnmodifiableSet()); } + default Set getPages(Boundary boundary) { + + return getAtomicTextBlocks().stream() + .filter(atomicTextBlock -> atomicTextBlock.getBoundary().intersects(boundary)) + .map(AtomicTextBlock::getPage) + .collect(Collectors.toUnmodifiableSet()); + } + + default int indexOf(String searchTerm, int startOffset) { int start = getSearchText().indexOf(searchTerm, startOffset - getBoundary().start()); @@ -74,7 +85,7 @@ public interface TextBlock extends CharSequence { default boolean containsBoundary(Boundary boundary) { if (boundary.end() < boundary.start()) { - throw new IllegalArgumentException(String.format("Invalid %s, StartIndex must be smaller than EndIndex", boundary)); + throw new IllegalArgumentException(format("Invalid %s, StartIndex must be smaller than EndIndex", boundary)); } return getBoundary().contains(boundary); } diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/textblock/TextBlockCollector.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/TextBlockCollector.java similarity index 84% rename from 
layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/textblock/TextBlockCollector.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/TextBlockCollector.java index 3a9ba1d..7e066c4 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/graph/textblock/TextBlockCollector.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/TextBlockCollector.java @@ -1,6 +1,5 @@ -package com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock; +package com.knecon.fforesight.service.layoutparser.processor.graph.textblock; -import java.util.Collections; import java.util.Set; import java.util.function.BiConsumer; import java.util.function.BinaryOperator; @@ -16,7 +15,7 @@ public class TextBlockCollector implements Collector supplier() { - return () -> new ConcatenatedTextBlock(Collections.emptyList()); + return ConcatenatedTextBlock::empty; } diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/mapper/DocumentDataMapper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/DocumentDataMapper.java similarity index 57% rename from layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/mapper/DocumentDataMapper.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/DocumentDataMapper.java index 13bdf1b..08f182d 100644 --- 
a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/mapper/DocumentDataMapper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/DocumentDataMapper.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.internal.api.mapper; +package com.knecon.fforesight.service.layoutparser.processor.mapper; import java.awt.geom.Rectangle2D; import java.util.HashMap; @@ -9,75 +9,75 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositi import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData; import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData; import com.knecon.fforesight.service.layoutparser.internal.api.data.PageData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.TableOfContentsData; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ImageNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableCellNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock; +import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentTreeData; +import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; +import 
com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Image; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Table; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.TableCell; +import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.AtomicTextBlock; +import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; import lombok.experimental.UtilityClass; @UtilityClass public class DocumentDataMapper { - public DocumentData toDocumentData(DocumentGraph documentGraph) { + public DocumentData toDocumentData(Document document) { - List atomicTextBlockData = documentGraph.streamTerminalTextBlocksInOrder() + List atomicTextBlockData = document.streamTerminalTextBlocksInOrder() .flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream()) .distinct() .map(DocumentDataMapper::toAtomicTextBlockData) .toList(); - List atomicPositionBlockData = documentGraph.streamTerminalTextBlocksInOrder() + List atomicPositionBlockData = document.streamTerminalTextBlocksInOrder() .flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream()) .distinct() .map(DocumentDataMapper::toAtomicPositionBlockData) .toList(); - List pageData = documentGraph.getPages().stream().map(DocumentDataMapper::toPageData).toList(); - TableOfContentsData tableOfContentsData = toTableOfContentsData(documentGraph.getTableOfContents()); + List pageData = document.getPages().stream().map(DocumentDataMapper::toPageData).toList(); + DocumentTreeData tableOfContentsData = toDocumentTreeData(document.getDocumentTree()); return DocumentData.builder() .atomicTextBlocks(atomicTextBlockData.toArray(new AtomicTextBlockData[0])) .atomicPositionBlocks(atomicPositionBlockData.toArray(new AtomicPositionBlockData[0])) .pages(pageData.toArray(new PageData[0])) - .tableOfContents(tableOfContentsData) + .documentTreeData(tableOfContentsData) .build(); 
} - private TableOfContentsData toTableOfContentsData(TableOfContents tableOfContents) { + private DocumentTreeData toDocumentTreeData(DocumentTree documentTree) { - return new TableOfContentsData(toEntryData(tableOfContents.getRoot())); + return new DocumentTreeData(toEntryData(documentTree.getRoot())); } - private TableOfContentsData.EntryData toEntryData(TableOfContents.Entry entry) { + private DocumentTreeData.EntryData toEntryData(DocumentTree.Entry entry) { Long[] atomicTextBlocks; - if (entry.getNode().isTerminal()) { - atomicTextBlocks = toAtomicTextBlockIds(entry.getNode().getTerminalTextBlock()); + if (entry.getNode().isLeaf()) { + atomicTextBlocks = toAtomicTextBlockIds(entry.getNode().getLeafTextBlock()); } else { atomicTextBlocks = new Long[]{}; } Map properties = switch (entry.getType()) { - case TABLE -> PropertiesMapper.buildTableProperties((TableNode) entry.getNode()); - case TABLE_CELL -> PropertiesMapper.buildTableCellProperties((TableCellNode) entry.getNode()); - case IMAGE -> PropertiesMapper.buildImageProperties((ImageNode) entry.getNode()); + case TABLE -> PropertiesMapper.buildTableProperties((Table) entry.getNode()); + case TABLE_CELL -> PropertiesMapper.buildTableCellProperties((TableCell) entry.getNode()); + case IMAGE -> PropertiesMapper.buildImageProperties((Image) entry.getNode()); default -> new HashMap<>(); }; - return TableOfContentsData.EntryData.builder() - .tocId(toPrimitiveIntArray(entry.getTocId())) - .subEntries(entry.getChildren().stream().map(DocumentDataMapper::toEntryData).toList()) + return DocumentTreeData.EntryData.builder() + .treeId(toPrimitiveIntArray(entry.getTreeId())) + .children(entry.getChildren().stream().map(DocumentDataMapper::toEntryData).toList()) .type(entry.getType()) - .atomicBlocks(atomicTextBlocks) - .pages(entry.getNode().getPages().stream().map(PageNode::getNumber).map(Integer::longValue).toArray(Long[]::new)) + .atomicBlockIds(atomicTextBlocks) + 
.pageNumbers(entry.getNode().getPages().stream().map(Page::getNumber).map(Integer::longValue).toArray(Long[]::new)) .properties(properties) .build(); } @@ -89,7 +89,7 @@ public class DocumentDataMapper { } - private PageData toPageData(PageNode p) { + private PageData toPageData(Page p) { return PageData.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).build(); } @@ -119,28 +119,25 @@ public class DocumentDataMapper { } - private float[][] toPrimitiveFloatMatrix(List positions) { + private static float[][] toPrimitiveFloatMatrix(List positions) { float[][] positionMatrix = new float[positions.size()][]; for (int i = 0; i < positions.size(); i++) { - float[] singlePositions = new float[4]; - singlePositions[0] = (float) positions.get(i).getMinX(); - singlePositions[1] = (float) positions.get(i).getMinY(); - singlePositions[2] = (float) positions.get(i).getWidth(); - singlePositions[3] = (float) positions.get(i).getHeight(); - positionMatrix[i] = singlePositions; + positionMatrix[i] = toArray(positions.get(i)); } return positionMatrix; } + private static float[] toArray(Rectangle2D positions) { + + return new float[]{(float) positions.getMinX(), (float) positions.getMinY(), (float) positions.getWidth(), (float) positions.getHeight()}; + } + + private int[] toPrimitiveIntArray(List list) { - int[] array = new int[list.size()]; - for (int i = 0; i < list.size(); i++) { - array[i] = list.get(i); - } - return array; + return list.stream().mapToInt(Integer::intValue).toArray(); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/DocumentGraphMapper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/DocumentGraphMapper.java new file mode 100644 index 0000000..80973e1 --- /dev/null +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/DocumentGraphMapper.java @@ -0,0 +1,198 @@ +package com.knecon.fforesight.service.layoutparser.processor.mapper; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.NoSuchElementException; + +import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentTreeData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.PageData; +import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Footer; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Header; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Headline; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Image; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Paragraph; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Section; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Table; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.TableCell; +import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.AtomicTextBlock; +import 
com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; +import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlockCollector; + +import lombok.experimental.UtilityClass; + +@UtilityClass +public class DocumentGraphMapper { + + public Document toDocumentGraph(DocumentData documentData) { + + Document document = new Document(); + DocumentTree documentTree = new DocumentTree(document); + Context context = new Context(documentData, documentTree); + + context.pages.addAll(Arrays.stream(documentData.getPages()).map(DocumentGraphMapper::buildPage).toList()); + + context.documentTree.getRoot().getChildren().addAll(buildEntries(documentData.getDocumentTreeData().getRoot().getChildren(), context)); + + document.setDocumentTree(context.documentTree); + document.setPages(new HashSet<>(context.pages)); + document.setNumberOfPages(documentData.getPages().length); + + document.setTextBlock(document.getTextBlock()); + return document; + } + + + private List buildEntries(List entries, Context context) { + + List newEntries = new LinkedList<>(); + for (DocumentTreeData.EntryData entryData : entries) { + + List pages = Arrays.stream(entryData.getPageNumbers()).map(pageNumber -> getPage(pageNumber, context)).toList(); + + SemanticNode node = switch (entryData.getType()) { + case SECTION -> buildSection(context); + case PARAGRAPH -> buildParagraph(context); + case HEADLINE -> buildHeadline(context); + case HEADER -> buildHeader(context); + case FOOTER -> buildFooter(context); + case TABLE -> buildTable(context, entryData.getProperties()); + case TABLE_CELL -> buildTableCell(context, entryData.getProperties()); + case IMAGE -> buildImage(context, entryData.getProperties(), entryData.getPageNumbers()); + default -> throw new UnsupportedOperationException("Not yet implemented for type " + entryData.getType()); + }; + + if (entryData.getAtomicBlockIds().length > 0) { + TextBlock textBlock = toTextBlock(entryData.getAtomicBlockIds(), 
context, node); + node.setLeafTextBlock(textBlock); + } + List treeId = Arrays.stream(entryData.getTreeId()).boxed().toList(); + node.setTreeId(treeId); + + switch (entryData.getType()) { + case HEADER -> pages.forEach(page -> page.setHeader((Header) node)); + case FOOTER -> pages.forEach(page -> page.setFooter((Footer) node)); + default -> pages.forEach(page -> page.getMainBody().add(node)); + } + + newEntries.add(DocumentTree.Entry.builder().treeId(treeId).children(buildEntries(entryData.getChildren(), context)).node(node).build()); + } + return newEntries; + } + + + private Headline buildHeadline(Context context) { + + return Headline.builder().documentTree(context.documentTree).build(); + } + + + private Image buildImage(Context context, Map properties, Long[] pageNumbers) { + + assert pageNumbers.length == 1; + Page page = getPage(pageNumbers[0], context); + var builder = Image.builder(); + PropertiesMapper.parseImageProperties(properties, builder); + return builder.documentTree(context.documentTree).page(page).build(); + } + + + private TableCell buildTableCell(Context context, Map properties) { + + TableCell.TableCellBuilder builder = TableCell.builder(); + PropertiesMapper.parseTableCellProperties(properties, builder); + return builder.documentTree(context.documentTree).build(); + } + + + private Table buildTable(Context context, Map properties) { + + Table.TableBuilder builder = Table.builder(); + PropertiesMapper.parseTableProperties(properties, builder); + return builder.documentTree(context.documentTree).build(); + } + + + private Footer buildFooter(Context context) { + + return Footer.builder().documentTree(context.documentTree).build(); + } + + + private Header buildHeader(Context context) { + + return Header.builder().documentTree(context.documentTree).build(); + } + + + private Section buildSection(Context context) { + + return Section.builder().documentTree(context.documentTree).build(); + } + + + private Paragraph buildParagraph(Context context) { 
+ + return Paragraph.builder().documentTree(context.documentTree).build(); + } + + + private TextBlock toTextBlock(Long[] atomicTextBlockIds, Context context, SemanticNode parent) { + + return Arrays.stream(atomicTextBlockIds).map(atomicTextBlockId -> getAtomicTextBlock(context, parent, atomicTextBlockId)).collect(new TextBlockCollector()); + } + + + private AtomicTextBlock getAtomicTextBlock(Context context, SemanticNode parent, Long atomicTextBlockId) { + + return AtomicTextBlock.fromAtomicTextBlockData(context.atomicTextBlockData.get(Math.toIntExact(atomicTextBlockId)), + context.atomicPositionBlockData.get(Math.toIntExact(atomicTextBlockId)), + parent, + getPage(context.atomicTextBlockData.get(Math.toIntExact(atomicTextBlockId)).getPage(), context)); + } + + + private Page buildPage(PageData p) { + + return Page.builder().rotation(p.getRotation()).height(p.getHeight()).width(p.getWidth()).number(p.getNumber()).mainBody(new LinkedList<>()).build(); + } + + + private Page getPage(Long pageIndex, Context context) { + + return context.pages.stream() + .filter(page -> page.getNumber() == Math.toIntExact(pageIndex)) + .findFirst() + .orElseThrow(() -> new NoSuchElementException(String.format("ClassificationPage with number %d not found", pageIndex))); + } + + + static final class Context { + + private final DocumentTree documentTree; + private final List pages; + private final List atomicTextBlockData; + private final List atomicPositionBlockData; + + + Context(DocumentData documentData, DocumentTree documentTree) { + + this.documentTree = documentTree; + this.pages = new LinkedList<>(); + this.atomicTextBlockData = Arrays.stream(documentData.getAtomicTextBlocks()).toList(); + this.atomicPositionBlockData = Arrays.stream(documentData.getAtomicPositionBlocks()).toList(); + + } + + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/PropertiesMapper.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/PropertiesMapper.java new file mode 100644 index 0000000..cbb6d49 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/PropertiesMapper.java @@ -0,0 +1,112 @@ +package com.knecon.fforesight.service.layoutparser.processor.mapper; + +import java.awt.geom.Rectangle2D; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Image; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.ImageType; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.TableCell; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Table; +import com.knecon.fforesight.service.layoutparser.processor.services.RectangleTransformations; + +public class PropertiesMapper { + + public static final String TRANSPARENT = "transparent"; + public static final String IMAGE_TYPE = "imageType"; + public static final String POSITION = "position"; + public static final String ROW = "row"; + public static final String COL = "col"; + public static final String HEADER = "header"; + public static final String B_BOX = "bBox"; + public static final String NUMBER_OF_ROWS = "numberOfRows"; + public static final String NUMBER_OF_COLS = "numberOfCols"; + + + public static Map buildImageProperties(Image image) { + + Map properties = new HashMap<>(); + properties.put(IMAGE_TYPE, image.getImageType().toString()); + properties.put(TRANSPARENT, String.valueOf(image.isTransparent())); + properties.put(POSITION, RectangleTransformations.toString(image.getPosition())); + return properties; + } + + + public static Map buildTableCellProperties(TableCell tableCell) { + + Map properties = new HashMap<>(); + properties.put(ROW, 
String.valueOf(tableCell.getRow())); + properties.put(COL, String.valueOf(tableCell.getCol())); + properties.put(HEADER, String.valueOf(tableCell.isHeader())); + + if (tableCell.getPages().size() > 1 || tableCell.getBBox().keySet().size() > 1) { + throw new IllegalArgumentException("TableCell can only occur on a single page!"); + } + String bBoxString = RectangleTransformations.toString(tableCell.getBBox().get(tableCell.getPages().stream().findFirst().get())); + properties.put(B_BOX, bBoxString); + + return properties; + } + + + public static Map buildTableProperties(Table table) { + + Map properties = new HashMap<>(); + properties.put(NUMBER_OF_ROWS, String.valueOf(table.getNumberOfRows())); + properties.put(NUMBER_OF_COLS, String.valueOf(table.getNumberOfCols())); + return properties; + } + + + public static void parseImageProperties(Map properties, Image.ImageBuilder builder) { + + builder.imageType(parseImageType(properties.get(IMAGE_TYPE))); + builder.transparent(Boolean.parseBoolean(properties.get(TRANSPARENT))); + builder.position(parseRectangle2D(properties.get(POSITION))); + } + + + public static void parseTableCellProperties(Map properties, TableCell.TableCellBuilder builder) { + + builder.row(Integer.parseInt(properties.get(ROW))); + builder.col(Integer.parseInt(properties.get(COL))); + builder.header(Boolean.parseBoolean(properties.get(HEADER))); + builder.bBox(parseRectangle2D(properties.get(B_BOX))); + } + + + public static void parseTableProperties(Map properties, Table.TableBuilder builder) { + + builder.numberOfRows(Integer.parseInt(properties.get(NUMBER_OF_ROWS))); + builder.numberOfCols(Integer.parseInt(properties.get(NUMBER_OF_COLS))); + } + + + private static ImageType parseImageType(String imageType) { + + return switch (imageType) { + case "LOGO" -> ImageType.LOGO; + case "FORMULA" -> ImageType.FORMULA; + case "SIGNATURE" -> ImageType.SIGNATURE; + case "OCR" -> ImageType.OCR; + default -> ImageType.OTHER; + }; + } + + + public static String 
toString(Rectangle2D rectangle2D) { + + return String.format("%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight()); + } + + + public static Rectangle2D parseRectangle2D(String bBox) { + + List floats = Arrays.stream(bBox.split(",")).map(Float::parseFloat).toList(); + return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3)); + } + +} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/services/RectangleTransformations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RectangleTransformations.java similarity index 93% rename from layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/services/RectangleTransformations.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RectangleTransformations.java index 6e0d68e..3618dbd 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/services/RectangleTransformations.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RectangleTransformations.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.internal.api.services; +package com.knecon.fforesight.service.layoutparser.processor.services; import static java.lang.String.format; @@ -13,7 +13,7 @@ import java.util.function.Function; import java.util.function.Supplier; import java.util.stream.Collector; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock; +import 
com.knecon.fforesight.service.layoutparser.processor.graph.textblock.AtomicTextBlock; import lombok.experimental.UtilityClass; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/IdBuilder.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/IdBuilder.java new file mode 100644 index 0000000..8c05bb7 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/IdBuilder.java @@ -0,0 +1,41 @@ +package com.knecon.fforesight.service.layoutparser.processor.utils; + +import java.awt.geom.Rectangle2D; +import java.nio.charset.StandardCharsets; +import java.util.Comparator; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +import com.google.common.hash.HashFunction; +import com.google.common.hash.Hashing; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page; + +import lombok.experimental.UtilityClass; + +@UtilityClass +public final class IdBuilder { + + private final HashFunction hashFunction = Hashing.murmur3_128(); + + + public String buildId(Set pages, List rectanglesPerLine) { + + return buildId(pages.stream().map(Page::getNumber).collect(Collectors.toList()), rectanglesPerLine); + } + + + public String buildId(List pageNumbers, List rectanglesPerLine) { + + StringBuilder sb = new StringBuilder(); + List sortedPageNumbers = pageNumbers.stream().sorted(Comparator.comparingInt(Integer::intValue)).toList(); + sortedPageNumbers.forEach(sb::append); + rectanglesPerLine.forEach(rectangle2D -> sb.append(Math.round(rectangle2D.getX())) + .append(Math.round(rectangle2D.getY())) + .append(Math.round(rectangle2D.getWidth())) + .append(Math.round(rectangle2D.getHeight()))); + + return hashFunction.hashString(sb.toString(), StandardCharsets.UTF_8).toString(); + } + +} diff 
--git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PdfVisualisationUtility.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PdfVisualisationUtility.java
new file mode 100644
index 0000000..2616560
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PdfVisualisationUtility.java
@@ -0,0 +1,187 @@
+package com.knecon.fforesight.service.layoutparser.processor.utils;
+
+import java.awt.Color;
+import java.awt.geom.Point2D;
+import java.awt.geom.Rectangle2D;
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.PDPageContentStream;
+import org.apache.pdfbox.pdmodel.font.PDType1Font;
+import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
+
+import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType;
+import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
+import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
+import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
+import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.AtomicTextBlock;
+import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
+
+import lombok.AccessLevel;
+import lombok.Builder;
+import lombok.Getter;
+import lombok.SneakyThrows;
+import lombok.experimental.FieldDefaults;
+import lombok.experimental.UtilityClass;
+import lombok.extern.slf4j.Slf4j;
+
+/**
+ * Helpers that draw document tree entries and text block positions onto the pages of a PDF.
+ */
+@Slf4j
+@UtilityClass
+public class PdfVisualisationUtility {
+
+    /** Draws every entry of the document tree onto the given PDF. */
+    public void drawDocumentGraph(PDDocument document, Document documentGraph) {
+
+        documentGraph.getDocumentTree().allEntriesInOrder().forEach(entry -> drawNode(document, entry));
+    }
+
+
+    /** Draws the bounding box and label of a single tree entry, colored by its node type. */
+    public void drawNode(PDDocument document, DocumentTree.Entry entry) {
+
+        Options options = buildStandardOptionsForNodes(entry);
+
+        drawBBoxAndLabelAndNumberOnPage(document, entry, options);
+
+    }
+
+
+    /** Draws the positions of all atomic text blocks of the given text block. */
+    public void drawTextBlock(PDDocument document, TextBlock textBlock, Options options) {
+
+        textBlock.getAtomicTextBlocks().forEach(atb -> drawAtomicTextBlock(document, atb, options));
+    }
+
+
+    /** Draws the positions of one atomic text block on the page it belongs to. */
+    public void drawAtomicTextBlock(PDDocument document, AtomicTextBlock atomicTextBlock, Options options) {
+
+        drawRectangle2DList(document, atomicTextBlock.getPage().getNumber(), atomicTextBlock.getPositions().stream().toList(), options);
+
+    }
+
+
+    /**
+     * Writes the string in 10pt Helvetica at the given location, colored with the stroke color of the options.
+     *
+     * @param pageNumber page number, starting at 1
+     */
+    @SneakyThrows
+    public void drawText(String string, PDDocument document, Point2D location, Integer pageNumber, Options options) {
+
+        var pdPage = document.getPage(pageNumber - 1);
+        // try-with-resources closes the content stream even if one of the drawing calls throws
+        try (var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true)) {
+            contentStream.setNonStrokingColor(options.getStrokeColor());
+            contentStream.setLineWidth(options.getStrokeWidth());
+
+            contentStream.beginText();
+            contentStream.newLineAtOffset((float) location.getX(), (float) location.getY());
+            contentStream.setFont(new PDType1Font(Standard14Fonts.FontName.HELVETICA), 10);
+            contentStream.showText(string);
+            contentStream.endText();
+        }
+    }
+
+
+    /**
+     * Draws the rectangles on the given page, stroked and/or filled according to the options.
+     *
+     * @param pageNumber page number, starting at 1
+     */
+    @SneakyThrows
+    public void drawRectangle2DList(PDDocument document, int pageNumber, List<Rectangle2D> rectCollection, Options options) {
+
+        var pdPage = document.getPage(pageNumber - 1);
+        drawRectangle2DList(document, rectCollection, options, pdPage);
+    }
+
+
+    private void drawRectangle2DList(PDDocument document, List<Rectangle2D> rectCollection, Options options, PDPage pdPage) throws IOException {
+
+        // try-with-resources closes the content stream even if one of the drawing calls throws
+        try (var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true)) {
+            contentStream.setStrokingColor(options.getStrokeColor());
+            contentStream.setNonStrokingColor(options.getFillColor());
+            contentStream.setLineWidth(options.getStrokeWidth());
+
+            for (var r : rectCollection) {
+                contentStream.addRect((float) r.getMinX(), (float) r.getMinY(), (float) r.getWidth(), (float) r.getHeight());
+
+                if (options.isStroke() && options.isFill()) {
+                    contentStream.fillAndStroke();
+                } else if (options.isStroke()) {
+                    contentStream.stroke();
+                } else if (options.isFill()) {
+                    contentStream.fill();
+                }
+            }
+        }
+    }
+
+
+    private Options buildStandardOptionsForNodes(DocumentTree.Entry entry) {
+
+        return Options.builder().stroke(true).strokeColor(switch (entry.getType()) {
+            case DOCUMENT -> Color.LIGHT_GRAY;
+            case HEADER, FOOTER -> Color.GREEN;
+            case PARAGRAPH -> Color.BLUE;
+            case HEADLINE -> Color.RED;
+            case SECTION -> Color.BLACK;
+            case TABLE -> Color.ORANGE;
+            case TABLE_CELL -> Color.GRAY;
+            case IMAGE -> Color.MAGENTA;
+        }).build();
+    }
+
+
+    private void drawBBoxAndLabelAndNumberOnPage(PDDocument document, DocumentTree.Entry entry, Options options) {
+
+        Map<Page, Rectangle2D> rectanglesPerPage = entry.getNode().getBBox();
+        rectanglesPerPage.forEach((page, rectangle2D) -> {
+            Rectangle2D paddedRectangle2D = rectangle2D;
+            if (entry.getType() == NodeType.SECTION) {
+                paddedRectangle2D = pad(rectangle2D, 10, 10);
+            }
+            drawRectangle2DList(document, page.getNumber(), List.of(paddedRectangle2D), options);
+            drawText(buildString(entry), document, new Point2D.Double(paddedRectangle2D.getMinX(), paddedRectangle2D.getMaxY() + 2), page.getNumber(), options);
+        });
+    }
+
+
+    private static Rectangle2D pad(Rectangle2D rectangle2D, int deltaX, int deltaY) {
+
+        return new Rectangle2D.Double(rectangle2D.getMinX() - deltaX, rectangle2D.getMinY() - deltaY, rectangle2D.getWidth() + 2 * deltaX, rectangle2D.getHeight() + 2 * deltaY);
+    }
+
+
+    private String buildString(DocumentTree.Entry entry) {
+
+        return entry.getNode().getNumberOnPage() + ": " + entry.getTreeId() + ": " + entry.getType().toString();
+    }
+
+
+    /** Drawing options; stroke and fill are off unless set, colors default to black and the stroke width to 1. */
+    @Builder
+    @Getter
+    @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
+    public static class Options {
+
+        boolean fill;
+        boolean stroke;
+        @Builder.Default
+        Color strokeColor = Color.BLACK;
+        @Builder.Default
+        float strokeWidth = 1f;
+        @Builder.Default
+        Color fillColor = Color.BLACK;
+
+    }
+
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java
new file mode 100644
index 0000000..d5617a3
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java
@@ -0,0 +1,162 @@
+package com.knecon.fforesight.service.layoutparser.processor.utils;
+
+import java.awt.geom.Area;
+import java.awt.geom.Rectangle2D;
+import java.awt.geom.RectangularShape;
+import java.util.Collections;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Set;
+import java.util.function.BiConsumer;
+import java.util.function.BinaryOperator;
+import java.util.function.Function;
+import java.util.function.Supplier;
+import java.util.stream.Collector;
+
+import org.apache.pdfbox.pdmodel.common.PDRectangle;
+
+import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
+import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
+import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.AtomicTextBlock;
+
+/**
+ * Conversions between redaction log rectangles, {@link Rectangle2D} and {@link PDRectangle}, plus bounding box unions.
+ */
+public class RectangleTransformations {
+
+    /** Returns the bounding box of all given redaction log rectangles as a {@link PDRectangle}. */
+    public static PDRectangle toPDRectangleUnion(List<Rectangle> rectangles) {
+
+        Rectangle2D rectangle2D = RectangleTransformations.bBoxUnionRectangle(rectangles);
+
+        PDRectangle annotationPosition = new PDRectangle();
+        annotationPosition.setLowerLeftX((float) rectangle2D.getMinX());
+        annotationPosition.setLowerLeftY((float) rectangle2D.getMinY());
+        annotationPosition.setUpperRightX((float) rectangle2D.getMaxX());
+        annotationPosition.setUpperRightY((float) rectangle2D.getMaxY());
+        return annotationPosition;
+    }
+
+
+    /** Returns the bounding box of all positions of the given atomic text blocks. */
+    public static Rectangle2D bBoxUnionAtomicTextBlock(List<AtomicTextBlock> atomicTextBlocks) {
+
+        return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DUnion());
+    }
+
+
+    /** Returns the bounding box of all given redaction log rectangles. */
+    public static Rectangle2D bBoxUnionRectangle(List<Rectangle> rectangles) {
+
+        return rectangles.stream().map(RectangleTransformations::toRectangle2D).collect(new Rectangle2DUnion());
+    }
+
+
+    /**
+     * Converts a redaction log rectangle: y becomes top-left y plus height, and the height is negated.
+     */
+    public static Rectangle2D toRectangle2D(Rectangle redactionLogRectangle) {
+
+        return new Rectangle2D.Double(redactionLogRectangle.getTopLeft().getX(),
+                redactionLogRectangle.getTopLeft().getY() + redactionLogRectangle.getHeight(),
+                redactionLogRectangle.getWidth(),
+                -redactionLogRectangle.getHeight());
+    }
+
+
+    /**
+     * Converts to a redaction log rectangle: the point is (minX, minY + height), and the height is negated.
+     */
+    public static Rectangle toRedactionLogRectangle(Rectangle2D rectangle2D, int pageNumber) {
+
+        return new Rectangle(new Point((float) rectangle2D.getMinX(), (float) (rectangle2D.getMinY() + rectangle2D.getHeight())),
+                (float) rectangle2D.getWidth(),
+                -(float) rectangle2D.getHeight(),
+                pageNumber);
+    }
+
+
+    /** Returns the bounding box of all given rectangles. */
+    public static Rectangle2D rectangleUnion(List<Rectangle2D> rectangle2DList) {
+
+        return rectangle2DList.stream().collect(new Rectangle2DUnion());
+    }
+
+
+    /**
+     * If two rectangles are further apart than five times the average width of a rectangle, a gap is inserted.
+     *
+     * @param rectangle2DList A list of rectangles to combine
+     * @return A list of rectangles which are combined if they are closer than the split threshold
+     */
+    public static List<Rectangle2D> rectangleUnionWithGaps(List<Rectangle2D> rectangle2DList) {
+
+        if (rectangle2DList.isEmpty()) {
+            return Collections.emptyList();
+        }
+        double splitThreshold = rectangle2DList.stream().mapToDouble(RectangularShape::getWidth).average().orElse(5) * 5.0;
+
+        List<List<Rectangle2D>> rectangleListsWithGaps = new LinkedList<>();
+        List<Rectangle2D> rectangleListWithoutGaps = new LinkedList<>();
+        rectangleListsWithGaps.add(rectangleListWithoutGaps);
+        Rectangle2D previousRectangle = rectangle2DList.get(0);
+        // The first rectangle always opens the first group. Comparing it with itself (i.e. with its own width) left
+        // an empty leading group, and thus a bogus empty rectangle, whenever it was wider than the threshold.
+        rectangleListWithoutGaps.add(previousRectangle);
+        for (Rectangle2D currentRectangle : rectangle2DList.subList(1, rectangle2DList.size())) {
+            if (Math.abs(currentRectangle.getMinX() - previousRectangle.getMaxX()) > splitThreshold) {
+                rectangleListWithoutGaps = new LinkedList<>();
+                rectangleListsWithGaps.add(rectangleListWithoutGaps);
+            }
+            rectangleListWithoutGaps.add(currentRectangle);
+            previousRectangle = currentRectangle;
+        }
+        return rectangleListsWithGaps.stream().map(RectangleTransformations::rectangleUnion).toList();
+    }
+
+
+    /** Collector that adds every rectangle to one {@link Area} and returns the bounds of that area. */
+    private static class Rectangle2DUnion implements Collector<Rectangle2D, Area, Rectangle2D> {
+
+        @Override
+        public Supplier<Area> supplier() {
+
+            return Area::new;
+        }
+
+
+        @Override
+        public BiConsumer<Area, Rectangle2D> accumulator() {
+
+            return (area, rectangle2D) -> area.add(new Area(rectangle2D));
+        }
+
+
+        @Override
+        public BinaryOperator<Area> combiner() {
+
+            return (area1, area2) -> {
+                area1.add(area2);
+                return area1;
+            };
+        }
+
+
+        @Override
+        public Function<Area, Rectangle2D> finisher() {
+
+            return Area::getBounds2D;
+        }
+
+
+        @Override
+        public Set<Characteristics> characteristics() {
+
+            // Not CONCURRENT: Area is not thread-safe, so a parallel stream must not share one container.
+            // The union does not depend on encounter order, hence UNORDERED.
+            return Set.of(Characteristics.UNORDERED);
+        }
+
+    }
+
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TableMergingUtility.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TableMergingUtility.java
new file mode 100644
index 0000000..fd59588
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TableMergingUtility.java
@@ -0,0 +1,56 @@
+package com.knecon.fforesight.service.layoutparser.processor.utils;
+
+import java.util.Collection;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.stream.Stream;
+
+import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
+import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell;
+import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
+
+import lombok.experimental.UtilityClass;
+
+/**
+ * Finds the tables that follow a given table and match its column count.
+ */
+@UtilityClass
+public class TableMergingUtility {
+
+    /**
+     * Collects the original table plus the leading run of other tables that have the same column count and
+     * contain no header cell.
+     * <p>
+     * Iteration stops at the first table that has a different column count or contains a header cell.
+     *
+     * @param originalTablePageBlock the table to start from; it is always the first element of the result
+     * @param pageBlocks             page blocks to inspect in order; every element is cast to {@link TablePageBlock}
+     * @return the original table followed by the matching tables
+     */
+    public List<TablePageBlock> findConsecutiveTablesWithSameColCountAndSameHeaders(TablePageBlock originalTablePageBlock, List<AbstractPageBlock> pageBlocks) {
+
+        List<TablePageBlock> consecutiveTables = pageBlocks.stream()
+                .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
+                .filter(tablePageBlock -> !tablePageBlock.equals(originalTablePageBlock))
+                .toList();
+        // only checked when assertions are enabled: the original table is expected to be in pageBlocks exactly once
+        assert consecutiveTables.size() == pageBlocks.size() - 1;
+
+        List<TablePageBlock> consecutiveTablesWithSameColCountAndHeaders = new LinkedList<>();
+        for (TablePageBlock consecutiveTable : consecutiveTables) {
+            if (consecutiveTable.getColCount() == originalTablePageBlock.getColCount() && !hasTableHeader(consecutiveTable)) {
+                consecutiveTablesWithSameColCountAndHeaders.add(consecutiveTable);
+            } else {
+                break;
+            }
+        }
+        return Stream.concat(Stream.of(originalTablePageBlock), consecutiveTablesWithSameColCountAndHeaders.stream()).toList();
+    }
+
+
+    private boolean hasTableHeader(TablePageBlock table) {
+
+        return table.getRows().stream().flatMap(Collection::stream).anyMatch(Cell::isHeaderCell);
+    }
+
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/TextPositionOperations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java similarity index 72% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/TextPositionOperations.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java index 006436d..c4c0eba 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/TextPositionOperations.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java @@ -1,14 +1,14 @@ -package com.knecon.fforesight.service.layoutparser.processor.factory; +package com.knecon.fforesight.service.layoutparser.processor.utils; import java.util.Comparator; import java.util.List; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence; public class TextPositionOperations { - public static List mergeAndSortTextPositionSequenceByYThenX(List textBlocks) { + public static List mergeAndSortTextPositionSequenceByYThenX(List textBlocks) { return textBlocks.stream()// .flatMap(tb -> tb.getSequences().stream())// diff --git 
a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BoundaryTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BoundaryTest.java index 3d97c8a..504d3f2 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BoundaryTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BoundaryTest.java @@ -11,7 +11,7 @@ import java.util.List; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary; +import com.knecon.fforesight.service.layoutparser.processor.graph.Boundary; class BoundaryTest { diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BuildDocumentGraphTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BuildDocumentGraphTest.java index a264ecf..d74d2e0 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BuildDocumentGraphTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BuildDocumentGraphTest.java @@ -10,9 +10,9 @@ import org.junit.jupiter.api.Test; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.core.io.ClassPathResource; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph; import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingService; import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse; +import 
com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.server.utils.BaseTest; import lombok.SneakyThrows; @@ -25,7 +25,7 @@ public class BuildDocumentGraphTest extends BaseTest { @Test public void buildMetolachlor() { - DocumentGraph documentGraph = buildGraph("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06"); + Document documentGraph = buildGraph("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06"); assertEquals(221, documentGraph.getPages().size()); assertEquals(220 , documentGraph.getPages().stream().filter(page -> page.getHeader().hasText()).count()); assertEquals(0 , documentGraph.getPages().stream().filter(page -> page.getFooter().hasText()).count()); @@ -33,7 +33,7 @@ public class BuildDocumentGraphTest extends BaseTest { @SneakyThrows - protected DocumentGraph buildGraph(String filename) { + protected Document buildGraph(String filename) { if (filename.equals("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06")) { prepareStorage(filename + ".pdf", "cv_table_parsing_response/empty.json", "image_service_response/S-Metolachlor_RAR_01_Volume_1_2018-09-06.IMAGE_INFO.json"); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphEntityInsertionTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphEntityInsertionTest.java deleted file mode 100644 index 0b385b6..0000000 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphEntityInsertionTest.java +++ /dev/null @@ -1,280 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.server.graph; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertInstanceOf; -import static org.wildfly.common.Assert.assertTrue; - -import 
java.util.List; - -import org.junit.jupiter.api.Test; -import org.springframework.beans.factory.annotation.Autowired; - -import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.HeadlineNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.ParagraphNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableCellNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.TableNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock; -import com.knecon.fforesight.service.layoutparser.internal.api.services.EntityInsertionService; -import com.knecon.fforesight.service.layoutparser.server.utils.TestEntity; - -public class DocumentGraphEntityInsertionTest extends BuildDocumentGraphTest { - - @Autowired - private EntityInsertionService entityInsertionService; - - - @Test - public void assertCollectAllEntitiesWorks() { - - DocumentGraph documentGraph = buildGraph("files/crafted document"); - createAndInsertEntity(documentGraph, "Clarissa"); - createAndInsertEntity(documentGraph, "Lastname"); - createAndInsertEntity(documentGraph, "David Ksenia"); - createAndInsertEntity(documentGraph, "Michael N."); - createAndInsertEntity(documentGraph, "Page-Footer"); - createAndInsertEntity(documentGraph, "CTL/with dictionary entry 1234 with Slash"); - 
assertEquals(6, documentGraph.getEntities().size()); - } - - - private TestEntity createAndInsertEntity(DocumentGraph documentGraph, String searchTerm) { - - int start = documentGraph.getTextBlock().indexOf(searchTerm); - assert start != -1; - - Boundary boundary = new Boundary(start, start + searchTerm.length()); - TestEntity entityNode = TestEntity.initialEntityNode(boundary, "123", "123"); - entityInsertionService.addEntityToGraph(entityNode, documentGraph.getTableOfContents()); - return entityNode; - } - - - @Test - public void assertTextBeforeAndTextAfterForParagraphCrafted() { - - DocumentGraph documentGraph = buildGraph("files/crafted document"); - String searchTerm = "Clarissa"; - TestEntity entityNode = createAndInsertEntity(documentGraph, searchTerm); - - assertEquals("Expand to Hint ", entityNode.getTextBefore()); - assertEquals("’s Donut ←", entityNode.getTextAfter()); - assertEquals(searchTerm, entityNode.getValue()); - assertEquals("Rule 5: Do not redact genitive CBI_authors (Entries based on Dict) ", - entityNode.getDeepestFullyContainingNode().getHeadline().buildTextBlock().getSearchText()); - assertEquals(2, entityNode.getIntersectingNodes().size()); - assertEquals(5, entityNode.getDeepestFullyContainingNode().getNumberOnPage()); - assertInstanceOf(ParagraphNode.class, entityNode.getDeepestFullyContainingNode()); - - assertSameOffsetInAllIntersectingNodes(searchTerm, entityNode); - } - - - @Test - public void assertTextBeforeAndTextAfterForHeadlineCrafted() { - - DocumentGraph documentGraph = buildGraph("files/crafted document"); - String searchTerm = "Rule 39:"; - TestEntity entityNode = createAndInsertEntity(documentGraph, searchTerm); - - assertEquals("", entityNode.getTextBefore()); - assertEquals(" Purity Hint", entityNode.getTextAfter()); - assertEquals(searchTerm, entityNode.getValue()); - assertEquals("Rule 39: Purity Hint ", entityNode.getDeepestFullyContainingNode().getHeadline().buildTextBlock().getSearchText()); - assertEquals(2, 
entityNode.getIntersectingNodes().size()); - assertEquals(6, entityNode.getDeepestFullyContainingNode().getNumberOnPage()); - assertInstanceOf(HeadlineNode.class, entityNode.getDeepestFullyContainingNode()); - - assertSameOffsetInAllIntersectingNodes(searchTerm, entityNode); - } - - - @Test - public void assertTextBeforeAndTextAfterForTableCellCrafted() { - - DocumentGraph documentGraph = buildGraph("files/crafted document"); - String searchTerm = "1998"; - TestEntity entityNode = createAndInsertEntity(documentGraph, searchTerm); - - assertEquals("", entityNode.getTextBefore()); - assertEquals("", entityNode.getTextAfter()); - assertEquals(searchTerm, entityNode.getValue()); - assertEquals("Rule 6-11 (Authors Table) ", entityNode.getDeepestFullyContainingNode().getHeadline().buildTextBlock().getSearchText()); - assertEquals(3, entityNode.getIntersectingNodes().size()); - assertEquals(15, entityNode.getDeepestFullyContainingNode().getNumberOnPage()); - assertInstanceOf(TableCellNode.class, entityNode.getDeepestFullyContainingNode()); - - assertSameOffsetInAllIntersectingNodes(searchTerm, entityNode); - } - - - @Test - public void findAndCheckMultipleSearchTermsCrafted() { - - DocumentGraph documentGraph = buildGraph("files/crafted document"); - assertValueAndPageAndIntersectingNodes(documentGraph, "David", 1); - assertValueAndPageAndIntersectingNodes(documentGraph, "Weyland Industries", 2); - assertValueAndPageAndIntersectingNodes(documentGraph, "Desiree", 3); - assertValueAndPageAndIntersectingNodes(documentGraph, "kawasaki@me.com", 4); - assertValueAndPageAndIntersectingNodes(documentGraph, "Central Research Industry", 5); - } - - - @Test - public void assertTableStructure() { - - DocumentGraph documentGraph = buildGraph("files/crafted document"); - TableNode table = (TableNode) documentGraph.getTableOfContents()// - .streamAllEntriesInOrder()// - .filter(entry -> entry.getType().equals(NodeType.TABLE))// - .map(TableOfContents.Entry::getNode)// - 
.findFirst().orElseThrow(); - assertEquals(5, table.getNumberOfCols()); - assertEquals(4, table.getNumberOfRows()); - assertEquals(5, table.streamHeaders().toList().size()); - CharSequence firstHeader = table.streamHeadersForCell(1, 1).map(TableCellNode::buildTextBlock).map(TextBlock::getSearchText).findFirst().orElseThrow(); - assertEquals("Author(s)", firstHeader.toString().stripTrailing()); - } - - - @Test - public void findAndCheckMultipleSearchTermsMetolachlor() { - - DocumentGraph documentGraph = buildGraph("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06"); - assertValueAndPageAndIntersectingNodes(documentGraph, "sideeffects", 4); - assertValueAndPageAndIntersectingNodes(documentGraph, "Commission Regulation", 9); - assertValueAndPageAndIntersectingNodes(documentGraph, "Pre-emergence", 15); - assertValueAndPageAndIntersectingNodes(documentGraph, "LiChrosorb CN +", 22); - assertValueAndPageAndIntersectingNodes(documentGraph, "RCC856132", 22); - assertValueAndPageAndIntersectingNodes(documentGraph, "Number of references included", 33); - } - - - @Test - public void assertTableStructureMetolachlor() { - - DocumentGraph documentGraph = buildGraph("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06"); - TableNode table = (TableNode) documentGraph.getTableOfContents() - .streamAllEntriesInOrder() - .filter(entry -> entry.getNode().getPages().stream().anyMatch(page -> page.getNumber() == 22)) - .filter(entry -> entry.getType().equals(NodeType.TABLE)) - .map(TableOfContents.Entry::getNode) - .findFirst() - .orElseThrow(); - assertEquals(5, table.getNumberOfCols()); - assertEquals(14, table.getNumberOfRows()); - assertEquals(10, table.streamHeaders().toList().size()); - List twoHeaders = table.streamHeadersForCell(2, 1).map(TableCellNode::buildTextBlock).map(TextBlock::getSearchText).toList(); - assertEquals(2, twoHeaders.size()); - assertEquals("Component of residue definition: S-Metolachlor", twoHeaders.get(0).stripTrailing()); - assertEquals("Method type", 
twoHeaders.get(1).stripTrailing()); - } - - - @Test - public void assertTextBeforeAndTextAfterForParagraphMetolachlor() { - - DocumentGraph documentGraph = buildGraph("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06"); - String searchTerm = "Cucurbit"; - TestEntity entityNode = createAndInsertEntity(documentGraph, searchTerm); - - assertEquals("except Cranberry; Vegetable, ", entityNode.getTextBefore()); - assertEquals(", Group 9;", entityNode.getTextAfter()); - assertEquals("1.1.4 Evaluations carried out under other regulatory contexts ", entityNode.getDeepestFullyContainingNode().getHeadline().buildTextBlock().getSearchText()); - assertEquals(searchTerm, entityNode.getValue()); - assertEquals(2, entityNode.getIntersectingNodes().size()); - assertEquals(5, entityNode.getDeepestFullyContainingNode().getNumberOnPage()); - assertTrue(entityNode.getPages().stream().allMatch(pageNode -> pageNode.getNumber() == 10)); - assertInstanceOf(ParagraphNode.class, entityNode.getDeepestFullyContainingNode()); - - assertSameOffsetInAllIntersectingNodes(searchTerm, entityNode); - } - - - @Test - public void assertTextBeforeAndTextAfterForHeadlineMetolachlor() { - - DocumentGraph documentGraph = buildGraph("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06"); - String searchTerm = "absorption, distribution, metabolism"; - int start = documentGraph.getTextBlock().indexOf(searchTerm); - assert start != -1; - start = documentGraph.getTextBlock().indexOf(searchTerm, start + 1); - assert start != -1; - - Boundary boundary = new Boundary(start, start + searchTerm.length()); - TestEntity entityNode = TestEntity.initialEntityNode(boundary, "123", "123"); - entityInsertionService.addEntityToGraph(entityNode, documentGraph.getTableOfContents()); - - assertEquals("2.6.1 Summary of ", entityNode.getTextBefore()); - assertEquals(" and excretion in", entityNode.getTextAfter()); - assertEquals("2.6.1 Summary of absorption, distribution, metabolism and excretion in mammals ", - 
entityNode.getDeepestFullyContainingNode().getHeadline().buildTextBlock().getSearchText()); - assertEquals(searchTerm, entityNode.getValue()); - assertEquals(2, entityNode.getIntersectingNodes().size()); - assertEquals(4, entityNode.getDeepestFullyContainingNode().getNumberOnPage()); - assertTrue(entityNode.getPages().stream().allMatch(pageNode -> pageNode.getNumber() == 33)); - assertInstanceOf(HeadlineNode.class, entityNode.getDeepestFullyContainingNode()); - - assertSameOffsetInAllIntersectingNodes(searchTerm, entityNode); - } - - - @Test - public void assertTextBeforeAndTextAfterForTableCellMetolachlor() { - - DocumentGraph documentGraph = buildGraph("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06"); - String searchTerm = "N-deacetylation product"; - TestEntity entityNode = createAndInsertEntity(documentGraph, searchTerm); - - assertEquals("2-[(2-(1-hydroxy-ethyl)-6methyl-phenyl-amino]propan-1-ol (", entityNode.getTextBefore()); - assertEquals(" of metabolite of", entityNode.getTextAfter()); - assertEquals(searchTerm, entityNode.getValue()); - assertEquals(3, entityNode.getIntersectingNodes().size()); - assertEquals("2.7.2 Summary of metabolism, distribution and expression of residues in plants, poultry, lactating ruminants, pigs and fish ", - entityNode.getDeepestFullyContainingNode().getHeadline().buildTextBlock().getSearchText()); - assertTrue(entityNode.getPages().stream().allMatch(pageNode -> pageNode.getNumber() == 54)); - assertEquals(26, entityNode.getDeepestFullyContainingNode().getNumberOnPage()); - - assertInstanceOf(TableCellNode.class, entityNode.getDeepestFullyContainingNode()); - - assertSameOffsetInAllIntersectingNodes(searchTerm, entityNode); - } - - - private static void assertSameOffsetInAllIntersectingNodes(String searchTerm, EntityNode entityNode) { - - List paragraphStart = entityNode.getIntersectingNodes().stream()// - .map(SemanticNode::buildTextBlock)// - .map(textBlock -> textBlock.indexOf(searchTerm))// - .toList(); - - 
paragraphStart.forEach(nodeStart -> assertEquals(entityNode.getBoundary().start(), nodeStart)); - } - - - private void assertValueAndPageAndIntersectingNodes(DocumentGraph documentGraph, String searchTerm, int pageNumber) { - - int start = documentGraph.getTextBlock().indexOf(searchTerm); - - assert start != -1; - - Boundary boundary = new Boundary(start, start + searchTerm.length()); - TestEntity entityNode = TestEntity.initialEntityNode(boundary, "123", "123"); - entityInsertionService.addEntityToGraph(entityNode, documentGraph.getTableOfContents()); - PageNode pageNode = documentGraph.getPages().stream().filter(page -> page.getNumber() == pageNumber).findFirst().orElseThrow(); - - assertEquals(entityNode.getValue(), searchTerm); - assertTrue(pageNode.getEntities().contains(entityNode)); - assertTrue(documentGraph.getPages().stream().filter(page -> page != pageNode).noneMatch(page -> page.getEntities().contains(entityNode))); - assertTrue(entityNode.getPages().contains(pageNode)); - assertSameOffsetInAllIntersectingNodes(searchTerm, entityNode); - assertTrue(entityNode.getIntersectingNodes().stream().allMatch(node -> node.getEntities().contains(entityNode))); - } - -} diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java index a9d1521..0874336 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java @@ -9,8 +9,8 @@ import org.junit.jupiter.api.Test; import com.fasterxml.jackson.databind.ObjectMapper; import 
com.iqser.red.commons.jackson.ObjectMapperFactory; import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph; -import com.knecon.fforesight.service.layoutparser.internal.api.mapper.DocumentDataMapper; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; +import com.knecon.fforesight.service.layoutparser.processor.mapper.DocumentDataMapper; import lombok.SneakyThrows; @@ -26,13 +26,13 @@ public class DocumentGraphJsonWritingTest extends BuildDocumentGraphTest { @SneakyThrows private void writeJsons(String filename) { - DocumentGraph documentGraph = buildGraph(filename); + Document documentGraph = buildGraph(filename); DocumentData documentData = DocumentDataMapper.toDocumentData(documentGraph); ObjectMapper mapper = ObjectMapperFactory.create(); - mapper.writeValue(new FileOutputStream(File.createTempFile(filename + "_structure", "json")), documentData.getTableOfContents()); - mapper.writeValue(new FileOutputStream(File.createTempFile(filename + "_text", "json")), documentData.getTableOfContents()); - mapper.writeValue(new FileOutputStream(File.createTempFile(filename + "_positions", "json")), documentData.getTableOfContents()); - mapper.writeValue(new FileOutputStream(File.createTempFile(filename + "_pages", "json")), documentData.getTableOfContents()); + mapper.writeValue(new FileOutputStream(File.createTempFile(filename + "_structure", ".json")), documentData.getDocumentTreeData()); + mapper.writeValue(new FileOutputStream(File.createTempFile(filename + "_text", ".json")), documentData.getAtomicTextBlocks()); + mapper.writeValue(new FileOutputStream(File.createTempFile(filename + "_positions", ".json")), documentData.getAtomicPositionBlocks()); + mapper.writeValue(new FileOutputStream(File.createTempFile(filename + "_pages", ".json")), documentData.getPages()); } } diff --git 
a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphMappingTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphMappingTest.java index c7ac68b..ccf6f82 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphMappingTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphMappingTest.java @@ -3,10 +3,10 @@ package com.knecon.fforesight.service.layoutparser.server.graph; import org.junit.jupiter.api.Test; import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph; -import com.knecon.fforesight.service.layoutparser.internal.api.mapper.DocumentDataMapper; -import com.knecon.fforesight.service.layoutparser.internal.api.mapper.DocumentGraphMapper; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; +import com.knecon.fforesight.service.layoutparser.processor.mapper.DocumentDataMapper; +import com.knecon.fforesight.service.layoutparser.processor.mapper.DocumentGraphMapper; import lombok.SneakyThrows; @@ -16,16 +16,16 @@ public class DocumentGraphMappingTest extends BuildDocumentGraphTest { @SneakyThrows public void testGraphMapping() { - DocumentGraph document = buildGraph("files/crafted document"); + Document document = buildGraph("files/crafted document"); LayoutParsingRequest layoutParsingRequest = buildStandardLayoutParsingRequest(); DocumentData documentData = DocumentDataMapper.toDocumentData(document); layoutParsingStorageService.storeDocumentData(layoutParsingRequest, documentData); 
DocumentData documentData2 = layoutParsingStorageService.readDocumentData(layoutParsingRequest); - DocumentGraph newDocumentGraph = DocumentGraphMapper.toDocumentGraph(documentData2); + Document newDocumentGraph = DocumentGraphMapper.toDocumentGraph(documentData2); assert document.toString().equals(newDocumentGraph.toString()); - assert document.getTableOfContents().toString().equals(newDocumentGraph.getTableOfContents().toString()); + assert document.getDocumentTree().toString().equals(newDocumentGraph.getDocumentTree().toString()); } } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphVisualizationTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphVisualizationTest.java index b8f5d5c..5d46535 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphVisualizationTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphVisualizationTest.java @@ -10,8 +10,8 @@ import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.springframework.core.io.ClassPathResource; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; +import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; import lombok.SneakyThrows; @@ -51,15 +51,14 @@ public class DocumentGraphVisualizationTest extends BuildDocumentGraphTest { @SneakyThrows private void visualizePdf(String filename) { - 
DocumentGraph documentGraph = buildGraph(filename); - TextBlock textBlock = documentGraph.buildTextBlock(); + Document documentGraph = buildGraph(filename); + TextBlock textBlock = documentGraph.getTextBlock(); visualizeSemanticNodes(filename, documentGraph, textBlock); - } - private static void visualizeSemanticNodes(String filename, DocumentGraph documentGraph, TextBlock textBlock) throws IOException { + private static void visualizeSemanticNodes(String filename, Document documentGraph, TextBlock textBlock) throws IOException { File tmpFile = File.createTempFile(filename, "SEMANTIC_NODES_BBOX.pdf"); ClassPathResource fileResource = new ClassPathResource(filename + ".pdf"); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BaseTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BaseTest.java index 542937e..962ab2f 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BaseTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BaseTest.java @@ -21,8 +21,6 @@ import org.springframework.core.io.ClassPathResource; import org.springframework.test.context.junit.jupiter.SpringExtension; import com.iqser.red.storage.commons.service.StorageService; -import com.knecon.fforesight.service.layoutparser.internal.api.services.EntityEnrichmentService; -import com.knecon.fforesight.service.layoutparser.internal.api.services.EntityInsertionService; import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingStorageService; import com.knecon.fforesight.service.layoutparser.processor.multitenancy.TenantContext; import com.knecon.fforesight.service.layoutparser.processor.multitenancy.TenantsClient; @@ -160,14 +158,6 @@ public class BaseTest { return new 
FileSystemBackedStorageService(); } - - @Bean - @Autowired - public EntityInsertionService entityInsertionService(EntityEnrichmentService entityEnrichmentService) { - - return new EntityInsertionService(entityEnrichmentService); - } - } } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/TestEntity.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/TestEntity.java deleted file mode 100644 index 1876606..0000000 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/TestEntity.java +++ /dev/null @@ -1,124 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.server.utils; - -import java.nio.charset.StandardCharsets; -import java.util.HashSet; -import java.util.LinkedList; -import java.util.List; -import java.util.Set; - -import com.google.common.hash.Hashing; -import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Engine; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityPosition; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.SemanticNode; - -import lombok.AccessLevel; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.experimental.FieldDefaults; - -@Data -@Builder -@AllArgsConstructor -@FieldDefaults(level = AccessLevel.PRIVATE) -public class TestEntity implements EntityNode { - - // initial values - final Boundary boundary; - final String type; - final String entityType; - - // empty defaults - boolean redaction; - boolean removed; - boolean ignored; - 
boolean resized; - boolean skipRemoveEntitiesContainedInLarger; - boolean dictionaryEntry; - boolean dossierDictionaryEntry; - Set engines; - Set references; - int matchedRule; - String redactionReason; - String legalBasis; - - // inferred on graph insertion - String value; - CharSequence textBefore; - CharSequence textAfter; - @Builder.Default - Set pages = new HashSet<>(); - List entityPositionsPerPage; - @Builder.Default - List intersectingNodes = new LinkedList<>(); - SemanticNode deepestFullyContainingNode; - - - public static TestEntity initialEntityNode(Boundary boundary, String type, String entityType) { - - return TestEntity.builder() - .type(type) - .entityType(entityType) - .boundary(boundary) - .redaction(false) - .removed(false) - .ignored(false) - .resized(false) - .skipRemoveEntitiesContainedInLarger(false) - .dictionaryEntry(false) - .dossierDictionaryEntry(false) - .engines(new HashSet<>()) - .references(new HashSet<>()) - .matchedRule(-1) - .redactionReason("") - .legalBasis("") - .build(); - } - - - public void addIntersectingNode(SemanticNode containingNode) { - - intersectingNodes.add(containingNode); - } - - - @Override - public String toString() { - - StringBuilder sb = new StringBuilder(); - sb.append("Entity[\""); - sb.append(value); - sb.append("\", "); - sb.append(boundary); - sb.append(", pages["); - pages.forEach(page -> { - sb.append(page.getNumber()); - sb.append(", "); - }); - sb.delete(sb.length() - 2, sb.length()); - sb.append("], type = \""); - sb.append(type); - sb.append("\", EntityType."); - sb.append(entityType); - sb.append("]"); - return sb.toString(); - } - - - @Override - public int hashCode() { - - return Hashing.murmur3_128().hashString(toString(), StandardCharsets.UTF_8).hashCode(); - } - - - @Override - public boolean equals(Object o) { - - return o instanceof TestEntity && o.hashCode() == hashCode(); - } - -} diff --git 
a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/TestEntityEnrichmentService.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/TestEntityEnrichmentService.java deleted file mode 100644 index 5336872..0000000 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/TestEntityEnrichmentService.java +++ /dev/null @@ -1,89 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.server.utils; - -import java.util.Arrays; -import java.util.List; -import java.util.Objects; - -import org.springframework.stereotype.Service; - -import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock; -import com.knecon.fforesight.service.layoutparser.internal.api.services.EntityEnrichmentService; - -import lombok.RequiredArgsConstructor; - -@Service -@RequiredArgsConstructor -public class TestEntityEnrichmentService implements EntityEnrichmentService { - - public void enrichEntity(EntityNode entity, TextBlock textBlock) { - if (entity instanceof TestEntity) { - TestEntity entity2 = (TestEntity) entity; - entity2.setValue(textBlock.subSequence(entity.getBoundary()).toString()); - entity2.setTextAfter(findTextAfter(entity.getBoundary().end(), textBlock)); - entity2.setTextBefore(findTextBefore(entity.getBoundary().start(), textBlock)); - } - } - - - private CharSequence findTextAfter(int index, TextBlock textBlock) { - - int endOffset = Math.min(index + 100, textBlock.getBoundary().end()); - String textAfter = textBlock.subSequence(index, endOffset).toString(); - if (!textAfter.isBlank()) { - List wordsAfter = splitToWordsAndRemoveEmptyWords(textAfter); - int numberOfWordsAfter = Math.min(wordsAfter.size(), 3); - if (wordsAfter.size() > 0) { - return 
concatWordsAfter(wordsAfter.subList(0, numberOfWordsAfter), textAfter.startsWith(" ")); - } - } - return ""; - } - - - private CharSequence findTextBefore(int index, TextBlock textBlock) { - - int offsetBefore = Math.max(index - 100, textBlock.getBoundary().start()); - String textBefore = textBlock.subSequence(offsetBefore, index).toString(); - if (!textBefore.isBlank()) { - List wordsBefore = splitToWordsAndRemoveEmptyWords(textBefore); - int numberOfWordsBefore = Math.min(wordsBefore.size(), 3); - if (wordsBefore.size() > 0) { - return concatWordsBefore(wordsBefore.subList(wordsBefore.size() - numberOfWordsBefore, wordsBefore.size()), textBefore.endsWith(" ")); - } - } - return ""; - } - - - private static List splitToWordsAndRemoveEmptyWords(String textAfter) { - - return Arrays.stream(textAfter.split(" ")).filter(word -> !Objects.equals("", word)).toList(); - } - - - private static String concatWordsBefore(List words, boolean endWithSpace) { - - StringBuilder sb = new StringBuilder(); - - for (String word : words) { - sb.append(word).append(" "); - } - - String result = sb.toString().trim(); - return endWithSpace ? result + " " : result; - } - - - private static String concatWordsAfter(List words, boolean startWithSpace) { - - StringBuilder sb = new StringBuilder(); - - for (String word : words) { - sb.append(word).append(" "); - } - - String result = sb.toString().trim(); - return startWithSpace ? 
" " + result : result; - } -} diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/visualizations/PdfDraw.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/visualizations/PdfDraw.java index b47b459..e8fd284 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/visualizations/PdfDraw.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/visualizations/PdfDraw.java @@ -14,13 +14,13 @@ import org.apache.pdfbox.pdmodel.font.PDType1Font; import org.apache.pdfbox.pdmodel.font.Standard14Fonts; import org.apache.pdfbox.util.Matrix; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.DocumentGraph; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.NodeType; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.AtomicTextBlock; -import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock; -import com.knecon.fforesight.service.layoutparser.processor.factory.RectangleTransformations; +import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType; +import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page; +import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.AtomicTextBlock; +import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; +import 
com.knecon.fforesight.service.layoutparser.processor.services.RectangleTransformations; import lombok.AccessLevel; import lombok.AllArgsConstructor; @@ -34,13 +34,13 @@ import lombok.experimental.UtilityClass; @UtilityClass public class PdfDraw { - public static void drawDocumentGraph(PDDocument document, DocumentGraph documentGraph) { + public static void drawDocumentGraph(PDDocument document, Document documentGraph) { - documentGraph.getTableOfContents().streamAllEntriesInOrder().forEach(entry -> drawNode(document, entry)); + documentGraph.getDocumentTree().allEntriesInOrder().forEach(entry -> drawNode(document, entry)); } - public static void drawNode(PDDocument document, TableOfContents.Entry entry) { + public static void drawNode(PDDocument document, DocumentTree.Entry entry) { Options options = buildStandardOptionsForNodes(entry); @@ -136,7 +136,7 @@ public class PdfDraw { } - private static Options buildStandardOptionsForNodes(TableOfContents.Entry entry) { + private static Options buildStandardOptionsForNodes(DocumentTree.Entry entry) { return Options.builder().stroke(true).strokeColor(switch (entry.getType()) { case DOCUMENT -> Color.LIGHT_GRAY; @@ -151,9 +151,9 @@ public class PdfDraw { } - private static void drawBBoxAndLabelAndNumberOnPage(PDDocument document, TableOfContents.Entry entry, Options options) { + private static void drawBBoxAndLabelAndNumberOnPage(PDDocument document, DocumentTree.Entry entry, Options options) { - Map rectanglesPerPage = entry.getNode().getBBox(); + Map rectanglesPerPage = entry.getNode().getBBox(); rectanglesPerPage.forEach((page, rectangle2D) -> { if (entry.getType() == NodeType.SECTION) { rectangle2D = RectangleTransformations.pad(rectangle2D, 10, 10); @@ -164,9 +164,9 @@ public class PdfDraw { } - private static String buildString(TableOfContents.Entry entry) { + private static String buildString(DocumentTree.Entry entry) { - return entry.getNode().getNumberOnPage() + ": " + entry.getTocId() + ": " + entry.getType(); + 
return entry.getNode().getNumberOnPage() + ": " + entry.getTreeId() + ": " + entry.getType(); } } \ No newline at end of file