From 241a32cb4f61b79ebbc5ecbf8a6d590529ef0c74 Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Mon, 24 Jul 2023 15:48:28 +0200 Subject: [PATCH] TAAS-41/ RED-6725: integrate layoutparser into redactmanager --- .../api/queue/LayoutParsingRequest.java | 4 +- .../internal/api/queue/LayoutParsingType.java | 7 + .../src/test/resources/log4j2-test.xml | 16 + .../layoutparser-service-processor/pom.xml | 6 + ...ervice.java => LayoutParsingPipeline.java} | 59 +++- .../LayoutParsingStorageService.java | 19 +- .../adapter/ImageServiceResponseAdapter.java | 4 +- .../classification/model/Orientation.java | 8 - .../factory/DocumentGraphFactory.java | 17 +- .../SearchTextWithTextPositionFactory.java | 6 +- .../processor/factory/SectionNodeFactory.java | 15 +- .../processor/factory/TableNodeFactory.java | 8 +- .../processor/factory/TextBlockFactory.java | 2 +- .../processor/graph/Boundary.java | 5 + .../processor/graph/nodes/Page.java | 2 +- .../processor/graph/nodes/SemanticNode.java | 2 +- .../mapper/redaction/PropertiesMapper.java | 2 +- .../model/AbstractPageBlock.java | 4 +- .../model/ClassificationDocument.java | 6 +- .../model/ClassificationFooter.java | 4 +- .../model/ClassificationHeader.java | 4 +- .../model/ClassificationPage.java | 6 +- .../model/ClassificationSection.java | 6 +- .../model/FloatFrequencyCounter.java | 2 +- .../processor/model/GapInformation.java | 23 ++ .../processor/model/LineInformation.java | 5 + .../processor/model/Orientation.java | 8 + .../model/PageBlockType.java | 2 +- .../processor/model/PageContents.java | 20 ++ .../processor/model/PageInformation.java | 5 + .../processor/model/SectionIdentifier.java | 123 ++++++++ .../model/image/ClassifiedImage.java | 2 +- .../model/table/Cell.java | 8 +- .../model/table/CellPosition.java | 2 +- .../model/table/CleanRulings.java | 2 +- .../model/table/Rectangle.java | 2 +- .../model/table/Ruling.java | 6 +- .../model/table/TablePageBlock.java | 8 +- .../model/text/RedTextPosition.java | 2 +- 
.../model/text/SearchableText.java | 4 +- .../model/text/SimplifiedSectionText.java | 2 +- .../model/text/SimplifiedText.java | 2 +- .../model/text/StringFrequencyCounter.java | 2 +- .../model/text/TextDirection.java | 2 +- .../model/text/TextPageBlock.java | 12 +- .../model/text/TextPositionSequence.java | 2 +- .../model/text/UnclassifiedText.java | 2 +- .../parsing/LegacyPDFStreamEngine.java | 2 +- .../parsing/PDFLinesTextStripper.java | 8 +- .../parsing/PDFTextStripper.java | 2 +- .../processor/queue/MessageHandler.java | 7 +- .../BodyTextFrameService.java | 16 +- .../DividingColumnDetectionService.java | 149 ++++++++++ .../services/GapDetectionService.java | 169 +++++++++++ .../services/GapsAcrossLinesService.java | 199 +++++++++++++ .../InvisibleTableDetectionService.java | 5 + .../services/LineDetectionService.java | 122 ++++++++ .../MainBodyTextFrameExtractionService.java | 5 + .../services/PageInformationService.java | 2 + .../PdfParsingService.java | 43 ++- .../services/RectangleTransformations.java | 95 ------ .../RulingCleaningService.java | 8 +- .../services/SectionGridCreatorService.java | 146 +++++++++ .../SectionsBuilderService.java | 26 +- .../TableExtractionService.java | 20 +- .../services/TextPositionSequenceSorter.java | 75 +++++ .../DocuMineBlockificationService.java | 229 +++++++++++++++ .../RedactManagerBlockificationService.java | 278 ++++++++++++++++++ .../TaasBlockificationService.java} | 22 +- .../DocuMineClassificationService.java | 117 ++++++++ .../RedactManagerClassificationService.java | 116 ++++++++ .../TaasClassificationService.java} | 17 +- .../utils/CohenSutherlandClipping.java | 2 +- .../utils/DoubleComparisons.java | 2 +- .../utils/PdfVisualisationUtility.java | 27 ++ .../utils/PositionUtils.java | 4 +- .../{classification => }/utils/QuickSort.java | 2 +- .../utils/RectangleTransformations.java | 33 +++ .../utils/RulingTextDirAdjustUtil.java | 4 +- .../processor/utils/TableMergingUtility.java | 18 +- 
.../utils/TextNormalizationUtilities.java | 2 +- .../utils/TextPositionOperations.java | 4 +- .../utils/TextPositionSequenceComparator.java | 72 +++++ .../src/test/resources/log4j2-test.xml | 16 + .../layoutparser/server/BdrJsonBuildTest.java | 7 +- .../server/graph/BuildDocumentGraphTest.java | 7 +- .../graph/DocumentGraphMappingTest.java | 3 +- .../server/model/SectionIdentifierTest.java | 58 ++++ .../GapAcrossLinesDetectionServiceTest.java | 71 +++++ .../InvisibleTableDetectionServiceTest.java | 23 ++ ...ainBodyTextFrameExtractionServiceTest.java | 7 + .../services/PageInformationServiceTest.java | 50 ++++ .../TextPositionSequenceSorterTest.java | 39 +++ .../server/utils/visualizations/PdfDraw.java | 98 +++++- .../src/test/resources/application.yml | 37 +++ .../src/test/resources/log4j2-test.xml | 16 + layoutparser-service/pom.xml | 1 + 97 files changed, 2641 insertions(+), 300 deletions(-) create mode 100644 layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingType.java create mode 100644 layoutparser-service/layoutparser-service-internal-api/src/test/resources/log4j2-test.xml rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{LayoutParsingService.java => LayoutParsingPipeline.java} (59%) delete mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/Orientation.java rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification => }/model/AbstractPageBlock.java (91%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification => }/model/ClassificationDocument.java (78%) rename 
layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification => }/model/ClassificationFooter.java (50%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification => }/model/ClassificationHeader.java (50%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification => }/model/ClassificationPage.java (75%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification => }/model/ClassificationSection.java (68%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification => }/model/FloatFrequencyCounter.java (96%) create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/GapInformation.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/LineInformation.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/Orientation.java rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification => }/model/PageBlockType.java (89%) create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageContents.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageInformation.java create mode 100644 
layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/SectionIdentifier.java rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification => }/model/image/ClassifiedImage.java (84%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification => }/model/table/Cell.java (82%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification => }/model/table/CellPosition.java (79%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification => }/model/table/CleanRulings.java (65%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification => }/model/table/Rectangle.java (98%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification => }/model/table/Ruling.java (97%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification => }/model/table/TablePageBlock.java (96%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification => }/model/text/RedTextPosition.java (95%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification => }/model/text/SearchableText.java (84%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification => }/model/text/SimplifiedSectionText.java (74%) rename 
layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification => }/model/text/SimplifiedText.java (80%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification => }/model/text/StringFrequencyCounter.java (93%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification => }/model/text/TextDirection.java (92%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification => }/model/text/TextPageBlock.java (94%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification => }/model/text/TextPositionSequence.java (99%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification => }/model/text/UnclassifiedText.java (67%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification => }/parsing/LegacyPDFStreamEngine.java (99%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification => }/parsing/PDFLinesTextStripper.java (97%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification => }/parsing/PDFTextStripper.java (99%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification/service => services}/BodyTextFrameService.java (89%) create mode 100644 
layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/DividingColumnDetectionService.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapDetectionService.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapsAcrossLinesService.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/InvisibleTableDetectionService.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/LineDetectionService.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/MainBodyTextFrameExtractionService.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageInformationService.java rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification/service => services}/PdfParsingService.java (66%) delete mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RectangleTransformations.java rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification/service => services}/RulingCleaningService.java (95%) create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionGridCreatorService.java rename 
layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification/service => services}/SectionsBuilderService.java (90%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification/service => services}/TableExtractionService.java (92%) create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TextPositionSequenceSorter.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification/service/BlockificationService.java => services/blockification/TaasBlockificationService.java} (92%) create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification/service/ClassificationService.java => services/classification/TaasClassificationService.java} (89%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification => 
}/utils/CohenSutherlandClipping.java (97%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification => }/utils/DoubleComparisons.java (91%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification => }/utils/PositionUtils.java (95%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification => }/utils/QuickSort.java (96%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification => }/utils/RulingTextDirAdjustUtil.java (91%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/{classification => }/utils/TextNormalizationUtilities.java (88%) create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionSequenceComparator.java create mode 100644 layoutparser-service/layoutparser-service-processor/src/test/resources/log4j2-test.xml create mode 100644 layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/model/SectionIdentifierTest.java create mode 100644 layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/GapAcrossLinesDetectionServiceTest.java create mode 100644 layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/InvisibleTableDetectionServiceTest.java create mode 100644 layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/MainBodyTextFrameExtractionServiceTest.java create mode 100644 
layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageInformationServiceTest.java create mode 100644 layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/TextPositionSequenceSorterTest.java create mode 100644 layoutparser-service/layoutparser-service-server/src/test/resources/application.yml create mode 100644 layoutparser-service/layoutparser-service-server/src/test/resources/log4j2-test.xml diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingRequest.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingRequest.java index caa8bba..f4177c5 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingRequest.java +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingRequest.java @@ -8,6 +8,7 @@ import lombok.Builder; @Builder public record LayoutParsingRequest( + LayoutParsingType layoutParsingType, Map identifier, String originFileStorageId, Optional tablesFileStorageId, @@ -16,6 +17,7 @@ public record LayoutParsingRequest( String researchDocumentStorageId, String textBlockFileStorageId, String positionBlockFileStorageId, - String pageFileStorageId) { + String pageFileStorageId, + String sectionGridStorageId) { } diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingType.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingType.java new file 
mode 100644 index 0000000..7598d29 --- /dev/null +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingType.java @@ -0,0 +1,7 @@ +package com.knecon.fforesight.service.layoutparser.internal.api.queue; + +public enum LayoutParsingType { + REDACT_MANAGER, + TAAS, + DOCUMINE +} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/test/resources/log4j2-test.xml b/layoutparser-service/layoutparser-service-internal-api/src/test/resources/log4j2-test.xml new file mode 100644 index 0000000..b4895cf --- /dev/null +++ b/layoutparser-service/layoutparser-service-internal-api/src/test/resources/log4j2-test.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/layoutparser-service/layoutparser-service-processor/pom.xml b/layoutparser-service/layoutparser-service-processor/pom.xml index f976cd6..8bcaeb5 100644 --- a/layoutparser-service/layoutparser-service-processor/pom.xml +++ b/layoutparser-service/layoutparser-service-processor/pom.xml @@ -60,6 +60,12 @@ org.springframework.boot spring-boot-starter-amqp + + org.junit.jupiter + junit-jupiter + RELEASE + test + diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java similarity index 59% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingService.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index a6a6915..f945b6e 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -7,21 +7,24 @@ import java.io.IOException; import org.apache.pdfbox.pdmodel.PDDocument; import org.springframework.stereotype.Service; -import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; +import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.adapter.CvTableParsingAdapter; import com.knecon.fforesight.service.layoutparser.processor.adapter.ImageServiceResponseAdapter; import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument; -import com.knecon.fforesight.service.layoutparser.processor.classification.service.ClassificationService; -import com.knecon.fforesight.service.layoutparser.processor.classification.service.PdfParsingService; -import com.knecon.fforesight.service.layoutparser.processor.classification.service.SectionsBuilderService; import com.knecon.fforesight.service.layoutparser.processor.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.mapper.redaction.DocumentDataMapper; import com.knecon.fforesight.service.layoutparser.processor.mapper.taas.TaasDocumentDataMapper; +import 
com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.services.PdfParsingService; +import com.knecon.fforesight.service.layoutparser.processor.services.SectionGridCreatorService; +import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService; +import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService; +import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService; +import com.knecon.fforesight.service.layoutparser.processor.services.classification.TaasClassificationService; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; @@ -29,14 +32,17 @@ import lombok.extern.slf4j.Slf4j; @Slf4j @Service @RequiredArgsConstructor -public class LayoutParsingService { +public class LayoutParsingPipeline { private final ImageServiceResponseAdapter imageServiceResponseAdapter; private final CvTableParsingAdapter cvTableParsingAdapter; private final LayoutParsingStorageService layoutParsingStorageService; private final PdfParsingService pdfParsingService; - private final ClassificationService classificationService; private final SectionsBuilderService sectionsBuilderService; + private final SectionGridCreatorService sectionGridCreatorService; + private final TaasClassificationService taasClassificationService; + private final RedactManagerClassificationService redactManagerClassificationService; + private final DocuMineClassificationService docuMineClassificationService; public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException { @@ -54,13 +60,17 @@ public class LayoutParsingService { tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.pageFileStorageId()); } - Document documentGraph = parseLayout(originDocument, 
imageServiceResponse, tableServiceResponse); + Document documentGraph = parseLayout(layoutParsingRequest.layoutParsingType(), originDocument, imageServiceResponse, tableServiceResponse); int numberOfPages = originDocument.getNumberOfPages(); originDocument.close(); - var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentGraph); + layoutParsingStorageService.storeSectionGrid(layoutParsingRequest, sectionGridCreatorService.createSectionGrid(documentGraph)); + layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph)); - layoutParsingStorageService.storeDocumentData(layoutParsingRequest, researchDocumentData, DocumentDataMapper.toDocumentData(documentGraph)); + if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.TAAS)) { + var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentGraph); + layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData); + } return LayoutParsingFinishedEvent.builder() .identifier(layoutParsingRequest.identifier()) @@ -75,13 +85,21 @@ public class LayoutParsingService { } - public Document parseLayout(PDDocument originDocument, ImageServiceResponse imageServiceResponse, TableServiceResponse tableServiceResponse) { + public Document parseLayout(LayoutParsingType layoutParsingType, + PDDocument originDocument, + ImageServiceResponse imageServiceResponse, + TableServiceResponse tableServiceResponse) { - ClassificationDocument classificationDocument = pdfParsingService.parseDocument(originDocument, + ClassificationDocument classificationDocument = pdfParsingService.parseDocument(layoutParsingType, + originDocument, cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse), imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse)); - classificationService.classifyDocument(classificationDocument); + switch (layoutParsingType) { + case TAAS -> 
taasClassificationService.classifyDocument(classificationDocument); + case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument); + case REDACT_MANAGER -> redactManagerClassificationService.classifyDocument(classificationDocument); + } sectionsBuilderService.buildSections(classificationDocument); @@ -89,16 +107,25 @@ public class LayoutParsingService { } - public Document parseLayoutWithTimer(PDDocument originDocument, ImageServiceResponse imageServiceResponse, TableServiceResponse tableServiceResponse) { + public Document parseLayoutWithTimer(LayoutParsingType layoutParsingType, + PDDocument originDocument, + ImageServiceResponse imageServiceResponse, + TableServiceResponse tableServiceResponse) { long start = System.currentTimeMillis(); - ClassificationDocument classificationDocument = pdfParsingService.parseDocument(originDocument, + + ClassificationDocument classificationDocument = pdfParsingService.parseDocument(layoutParsingType, originDocument, cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse), imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse)); + System.out.printf("parsed %d ms", System.currentTimeMillis() - start); start = System.currentTimeMillis(); - classificationService.classifyDocument(classificationDocument); + switch (layoutParsingType) { + case TAAS -> taasClassificationService.classifyDocument(classificationDocument); + case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument); + case REDACT_MANAGER -> redactManagerClassificationService.classifyDocument(classificationDocument); + } System.out.printf(", classified %d ms", System.currentTimeMillis() - start); start = System.currentTimeMillis(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java index f3738fd..4a82ff8 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java @@ -13,6 +13,7 @@ import org.apache.pdfbox.pdmodel.PDDocument; import org.springframework.stereotype.Service; import com.fasterxml.jackson.databind.ObjectMapper; +import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionGrid; import com.iqser.red.storage.commons.service.StorageService; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicPositionBlockData; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicTextBlockData; @@ -68,14 +69,24 @@ public class LayoutParsingStorageService { } - public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, ResearchDocumentData researchDocumentData, DocumentData documentData) { + public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, DocumentData documentData) { - storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.researchDocumentStorageId(), researchDocumentData); storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), documentData.getDocumentTreeData()); storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.textBlockFileStorageId(), documentData.getAtomicTextBlocks()); storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.positionBlockFileStorageId(), documentData.getAtomicPositionBlocks()); storageService.storeJSONObject(TenantContext.getTenantId(), 
layoutParsingRequest.pageFileStorageId(), documentData.getPages()); + } + + public void storeSectionGrid(LayoutParsingRequest layoutParsingRequest, SectionGrid sectionGrid) { + + storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.sectionGridStorageId(), sectionGrid); + } + + + public void storeResearchDocumentData(LayoutParsingRequest layoutParsingRequest, ResearchDocumentData researchDocumentData) { + + storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.researchDocumentStorageId(), researchDocumentData); } @@ -88,9 +99,7 @@ public class LayoutParsingStorageService { AtomicPositionBlockData[] atomicPositionBlockData = storageService.readJSONObject(TenantContext.getTenantId(), layoutParsingRequest.positionBlockFileStorageId(), AtomicPositionBlockData[].class); - DocumentTreeData tableOfContentsData = storageService.readJSONObject(TenantContext.getTenantId(), - layoutParsingRequest.structureFileStorageId(), - DocumentTreeData.class); + DocumentTreeData tableOfContentsData = storageService.readJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), DocumentTreeData.class); return DocumentData.builder() .documentTreeData(tableOfContentsData) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/ImageServiceResponseAdapter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/ImageServiceResponseAdapter.java index 29898c9..4b64f1c 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/ImageServiceResponseAdapter.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/ImageServiceResponseAdapter.java @@ -10,8 +10,8 @@ import java.util.Map; import 
org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.ImageType; import lombok.RequiredArgsConstructor; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/Orientation.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/Orientation.java deleted file mode 100644 index 75ae7bd..0000000 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/Orientation.java +++ /dev/null @@ -1,8 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model; - -public enum Orientation { - - NONE, - LEFT, - RIGHT -} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/DocumentGraphFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/DocumentGraphFactory.java index 4897aa2..8e56f49 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/DocumentGraphFactory.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/DocumentGraphFactory.java @@ -13,13 +13,13 @@ import java.util.Map; import java.util.NoSuchElementException; import java.util.Set; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationFooter; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationHeader; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Footer; @@ -81,8 +81,9 @@ public class DocumentGraphFactory { page.getMainBody().add(node); - List textBlocks = new ArrayList<>(textBlocksToMerge); + List 
textBlocks = new ArrayList<>(); textBlocks.add(originalTextBlock); + textBlocks.addAll(textBlocksToMerge); AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), node, context, page); List treeId = context.documentTree.createNewChildEntryAndReturnId(parentNode, node); node.setLeafTextBlock(textBlock); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SearchTextWithTextPositionFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SearchTextWithTextPositionFactory.java index 1d005f1..afc179a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SearchTextWithTextPositionFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SearchTextWithTextPositionFactory.java @@ -7,9 +7,9 @@ import java.util.LinkedList; import java.util.List; import java.util.Objects; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.RedTextPosition; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextDirection; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.graph.Boundary; import lombok.experimental.UtilityClass; diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SectionNodeFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SectionNodeFactory.java index d3942fa..01eea15 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SectionNodeFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SectionNodeFactory.java @@ -10,10 +10,10 @@ import java.util.List; import java.util.Map; import java.util.Set; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.GenericSemanticNode; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Section; @@ -80,7 +80,7 @@ public class SectionNodeFactory { remainingBlocks.removeAll(alreadyMerged); if (abstractPageBlock instanceof TextPageBlock) { - List textBlocks = findTextBlocksWithSameClassificationAndAlignsY(abstractPageBlock, 
remainingBlocks); + List textBlocks = findTextBlocksWithSameClassificationAndAlignsY((TextPageBlock) abstractPageBlock, remainingBlocks); alreadyMerged.addAll(textBlocks); DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocks); } else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) { @@ -123,7 +123,7 @@ public class SectionNodeFactory { List previousList = splitList.get(i - 1); AbstractPageBlock lastPageBlockInPreviousList = previousList.get(previousList.size() - 1); if (lastPageBlockInPreviousList.isHeadline()) { - previousList.remove(i - 1); + previousList.remove(previousList.size() - 1); splitList.get(i).add(0, lastPageBlockInPreviousList); } } @@ -162,7 +162,7 @@ public class SectionNodeFactory { } - private List findTextBlocksWithSameClassificationAndAlignsY(AbstractPageBlock atc, List pageBlocks) { + private List findTextBlocksWithSameClassificationAndAlignsY(TextPageBlock atc, List pageBlocks) { return pageBlocks.stream() .filter(abstractTextContainer -> !abstractTextContainer.equals(atc)) @@ -170,6 +170,7 @@ public class SectionNodeFactory { .filter(abstractTextContainer -> abstractTextContainer instanceof TextPageBlock) .filter(abstractTextContainer -> abstractTextContainer.intersectsY(atc)) .map(abstractTextContainer -> (TextPageBlock) abstractTextContainer) + .filter(abstractTextContainer -> abstractTextContainer.getDir() == atc.getDir()) .toList(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/TableNodeFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/TableNodeFactory.java index 0124ec1..13977a1 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/TableNodeFactory.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/TableNodeFactory.java @@ -7,10 +7,10 @@ import java.util.List; import java.util.Set; import java.util.stream.Collectors; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.GenericSemanticNode; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/TextBlockFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/TextBlockFactory.java index 69c71e4..399c9d5 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/TextBlockFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/TextBlockFactory.java @@ -2,7 +2,7 @@ package 
com.knecon.fforesight.service.layoutparser.processor.factory; import java.util.List; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode; import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.AtomicTextBlock; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/Boundary.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/Boundary.java index ff7366d..5808e20 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/Boundary.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/Boundary.java @@ -5,6 +5,7 @@ import static java.lang.String.format; import java.util.Collection; import java.util.LinkedList; import java.util.List; +import java.util.stream.IntStream; import lombok.EqualsAndHashCode; import lombok.Setter; @@ -107,6 +108,10 @@ public class Boundary implements Comparable { return splitBoundaries; } + public IntStream intStream() { + + return IntStream.range(start, end); + } public static Boundary merge(Collection boundaries) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Page.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Page.java index f01cc38..5fcfa51 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Page.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Page.java @@ -5,7 +5,7 @@ import java.util.LinkedList; import java.util.List; import java.util.Set; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlockCollector; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/SemanticNode.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/SemanticNode.java index 139464f..d76b7a2 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/SemanticNode.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/SemanticNode.java @@ -18,7 +18,7 @@ import com.knecon.fforesight.service.layoutparser.processor.graph.entity.EntityT import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.AtomicTextBlock; import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; -import com.knecon.fforesight.service.layoutparser.processor.services.RectangleTransformations; +import 
com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; public interface SemanticNode { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/redaction/PropertiesMapper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/redaction/PropertiesMapper.java index a7a5aeb..fdff5c1 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/redaction/PropertiesMapper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/redaction/PropertiesMapper.java @@ -10,7 +10,7 @@ import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Image; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.ImageType; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.TableCell; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Table; -import com.knecon.fforesight.service.layoutparser.processor.services.RectangleTransformations; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; public class PropertiesMapper { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/AbstractPageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/AbstractPageBlock.java similarity index 91% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/AbstractPageBlock.java rename to 
layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/AbstractPageBlock.java index 821a3f6..42ef081 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/AbstractPageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/AbstractPageBlock.java @@ -1,8 +1,8 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model; +package com.knecon.fforesight.service.layoutparser.processor.model; import com.fasterxml.jackson.annotation.JsonIgnore; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import lombok.AllArgsConstructor; import lombok.Data; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationDocument.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java similarity index 78% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationDocument.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java index 1ce5a1c..5062790 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationDocument.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java @@ -1,11 +1,11 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model; +package com.knecon.fforesight.service.layoutparser.processor.model; import java.util.ArrayList; import java.util.List; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionGrid; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.StringFrequencyCounter; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.UnclassifiedText; +import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; +import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText; import lombok.Data; import lombok.NoArgsConstructor; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationFooter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationFooter.java similarity index 50% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationFooter.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationFooter.java index 2aad008..c910293 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationFooter.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationFooter.java @@ -1,8 +1,8 
@@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model; +package com.knecon.fforesight.service.layoutparser.processor.model; import java.util.List; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import lombok.AllArgsConstructor; import lombok.Data; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationHeader.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationHeader.java similarity index 50% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationHeader.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationHeader.java index be4447d..e161801 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationHeader.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationHeader.java @@ -1,8 +1,8 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model; +package com.knecon.fforesight.service.layoutparser.processor.model; import java.util.List; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import lombok.AllArgsConstructor; import lombok.Data; diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationPage.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java similarity index 75% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationPage.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java index 91dfd79..21796c8 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationPage.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java @@ -1,11 +1,11 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model; +package com.knecon.fforesight.service.layoutparser.processor.model; import java.util.ArrayList; import java.util.List; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.StringFrequencyCounter; +import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; import lombok.Data; import lombok.NonNull; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationSection.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationSection.java similarity index 68% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationSection.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationSection.java index 8de2007..7074282 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationSection.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationSection.java @@ -1,10 +1,10 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model; +package com.knecon.fforesight.service.layoutparser.processor.model; import java.util.ArrayList; import java.util.List; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; import lombok.Data; import lombok.NoArgsConstructor; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/FloatFrequencyCounter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/FloatFrequencyCounter.java similarity index 96% rename from 
layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/FloatFrequencyCounter.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/FloatFrequencyCounter.java index 80bcbf6..a3d7917 100755 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/FloatFrequencyCounter.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/FloatFrequencyCounter.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model; +package com.knecon.fforesight.service.layoutparser.processor.model; import java.util.ArrayList; import java.util.Collections; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/GapInformation.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/GapInformation.java new file mode 100644 index 0000000..5531a43 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/GapInformation.java @@ -0,0 +1,23 @@ +package com.knecon.fforesight.service.layoutparser.processor.model; + +import java.awt.geom.Rectangle2D; +import java.util.LinkedList; +import java.util.List; + +import lombok.AccessLevel; +import lombok.AllArgsConstructor; +import lombok.Getter; +import lombok.experimental.FieldDefaults; + +@Getter +@AllArgsConstructor +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) +public class GapInformation { + List<List<Rectangle2D>> xGaps ; + List<List<Rectangle2D>> yGaps ; + + public GapInformation() { + xGaps = new LinkedList<>(); + yGaps = new LinkedList<>(); + } +} diff --git
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/LineInformation.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/LineInformation.java new file mode 100644 index 0000000..a8da688 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/LineInformation.java @@ -0,0 +1,5 @@ +package com.knecon.fforesight.service.layoutparser.processor.model; + +public class LineInformation { + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/Orientation.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/Orientation.java new file mode 100644 index 0000000..eddfa8e --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/Orientation.java @@ -0,0 +1,8 @@ +package com.knecon.fforesight.service.layoutparser.processor.model; + +public enum Orientation { + + NONE, + LEFT, + RIGHT +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/PageBlockType.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageBlockType.java similarity index 89% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/PageBlockType.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageBlockType.java index 9740979..1292138 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/PageBlockType.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageBlockType.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model; +package com.knecon.fforesight.service.layoutparser.processor.model; public enum PageBlockType { H1, diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageContents.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageContents.java new file mode 100644 index 0000000..966d669 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageContents.java @@ -0,0 +1,20 @@ +package com.knecon.fforesight.service.layoutparser.processor.model; + +import java.awt.geom.Rectangle2D; +import java.util.List; + +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Getter; + +@Getter +@Builder +@AllArgsConstructor +public class PageContents { + + List<TextPositionSequence> sortedTextPositionSequences; + Rectangle2D cropBox; + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageInformation.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageInformation.java new file mode 100644 index 0000000..e04819a --- /dev/null +++
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageInformation.java @@ -0,0 +1,5 @@ +package com.knecon.fforesight.service.layoutparser.processor.model; + +public class PageInformation { + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/SectionIdentifier.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/SectionIdentifier.java new file mode 100644 index 0000000..7b6f8c4 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/SectionIdentifier.java @@ -0,0 +1,123 @@ +package com.knecon.fforesight.service.layoutparser.processor.model; + +import java.util.Collections; +import java.util.LinkedList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import lombok.AccessLevel; +import lombok.AllArgsConstructor; +import lombok.experimental.FieldDefaults; + +@AllArgsConstructor +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) +public class SectionIdentifier { + + static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?"); + + private enum Format { + EMPTY, + NUMERICAL, + DOCUMENT + } + + Format format; + String identifierString; + List identifiers; + boolean asChild; + + + public static SectionIdentifier fromSearchText(String headline) { + + if (headline == null || headline.isEmpty() || headline.isBlank()) { + return SectionIdentifier.empty(); + } + + Matcher numericalIdentifierMatcher = numericalIdentifierPattern.matcher(headline); + if (numericalIdentifierMatcher.find()) { + return buildNumericalSectionIdentifier(headline, numericalIdentifierMatcher); + } + // more formats here + return SectionIdentifier.empty(); + 
} + + + public static SectionIdentifier asChildOf(SectionIdentifier sectionIdentifier) { + + return new SectionIdentifier(sectionIdentifier.format, sectionIdentifier.toString(), sectionIdentifier.identifiers, true); + } + + + public static SectionIdentifier document() { + + return new SectionIdentifier(Format.DOCUMENT, "document", Collections.emptyList(), false); + } + + + public static SectionIdentifier empty() { + + return new SectionIdentifier(Format.EMPTY, "empty", Collections.emptyList(), false); + } + + + private static SectionIdentifier buildNumericalSectionIdentifier(String headline, Matcher numericalIdentifierMatcher) { + + String identifierString = headline.substring(numericalIdentifierMatcher.start(), numericalIdentifierMatcher.end()); + List identifiers = new LinkedList<>(); + for (int i = 1; i <= 4; i++) { + String numericalIdentifier = numericalIdentifierMatcher.group(i); + if (numericalIdentifier == null || numericalIdentifier.equals("0") || numericalIdentifier.isEmpty() || numericalIdentifier.isBlank()) { + break; + } + identifiers.add(Integer.parseInt(numericalIdentifier.trim())); + } + return new SectionIdentifier(Format.NUMERICAL, identifierString, identifiers.stream().toList(), false); + } + + + /** + * Determines if the current section is the parent of the given section. + * + * @param sectionIdentifier The section identifier to compare against. + * @return true if the current section is the parent of the given section, false otherwise. 
+ */ + public boolean isParentOf(SectionIdentifier sectionIdentifier) { + + if (this.format.equals(Format.EMPTY)) { + return false; + } + if (this.format.equals(Format.DOCUMENT)) { + return true; + } + if (!this.format.equals(sectionIdentifier.format)) { + return false; + } + if (this.identifiers.size() >= sectionIdentifier.identifiers.size() && !(this.identifiers.size() == sectionIdentifier.identifiers.size() && sectionIdentifier.asChild)) { + return false; + } + for (int i = 0; i < this.identifiers.size(); i++) { + if (!this.identifiers.get(i).equals(sectionIdentifier.identifiers.get(i))) { + return false; + } + } + return true; + } + + + public boolean isChildOf(SectionIdentifier sectionIdentifier) { + + if (this.format.equals(Format.DOCUMENT) || this.format.equals(Format.EMPTY)) { + return false; + } + return sectionIdentifier.isParentOf(this); + } + + + @Override + public String toString() { + + return identifierString; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/image/ClassifiedImage.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/image/ClassifiedImage.java similarity index 84% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/image/ClassifiedImage.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/image/ClassifiedImage.java index 3670100..b0da3b9 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/image/ClassifiedImage.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/image/ClassifiedImage.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model.image; +package com.knecon.fforesight.service.layoutparser.processor.model.image; import java.awt.geom.Rectangle2D; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/Cell.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Cell.java similarity index 82% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/Cell.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Cell.java index 42ac4be..2f324a4 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/Cell.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Cell.java @@ -1,13 +1,13 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model.table; +package com.knecon.fforesight.service.layoutparser.processor.model.table; import java.awt.geom.Point2D; import java.util.ArrayList; import java.util.Iterator; import java.util.List; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence; -import com.knecon.fforesight.service.layoutparser.processor.classification.utils.TextNormalizationUtilities; +import 
com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities; import lombok.Data; import lombok.EqualsAndHashCode; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/CellPosition.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/CellPosition.java similarity index 79% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/CellPosition.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/CellPosition.java index 2b5ef89..a47e928 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/CellPosition.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/CellPosition.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model.table; +package com.knecon.fforesight.service.layoutparser.processor.model.table; import lombok.RequiredArgsConstructor; import lombok.Value; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/CleanRulings.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/CleanRulings.java similarity index 65% rename from 
layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/CleanRulings.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/CleanRulings.java index daa1055..735d7a5 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/CleanRulings.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/CleanRulings.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model.table; +package com.knecon.fforesight.service.layoutparser.processor.model.table; import java.util.List; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/Rectangle.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Rectangle.java similarity index 98% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/Rectangle.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Rectangle.java index 4ce30df..c357ab7 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/Rectangle.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Rectangle.java @@ -1,4 +1,4 @@ -package 
com.knecon.fforesight.service.layoutparser.processor.classification.model.table; +package com.knecon.fforesight.service.layoutparser.processor.model.table; import java.awt.geom.Point2D; import java.awt.geom.Rectangle2D; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/Ruling.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Ruling.java similarity index 97% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/Ruling.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Ruling.java index 9759960..f2deee6 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/Ruling.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Ruling.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model.table; +package com.knecon.fforesight.service.layoutparser.processor.model.table; import java.awt.geom.Line2D; import java.awt.geom.Point2D; @@ -11,8 +11,8 @@ import java.util.List; import java.util.Map; import java.util.TreeMap; -import com.knecon.fforesight.service.layoutparser.processor.classification.utils.CohenSutherlandClipping; -import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons; +import com.knecon.fforesight.service.layoutparser.processor.utils.CohenSutherlandClipping; +import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons; import lombok.extern.slf4j.Slf4j; diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/TablePageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java similarity index 96% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/TablePageBlock.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java index 23e5631..10331fe 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/TablePageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model.table; +package com.knecon.fforesight.service.layoutparser.processor.model.table; import java.awt.geom.Point2D; import java.util.ArrayList; @@ -8,9 +8,9 @@ import java.util.List; import java.util.Set; import java.util.TreeMap; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import lombok.Getter; import lombok.Setter; diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/RedTextPosition.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java similarity index 95% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/RedTextPosition.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java index 2a8de35..92059ae 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/RedTextPosition.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model.text; +package com.knecon.fforesight.service.layoutparser.processor.model.text; import org.apache.pdfbox.text.TextPosition; import org.springframework.beans.BeanUtils; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/SearchableText.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/SearchableText.java similarity index 84% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/SearchableText.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/SearchableText.java 
index b8081be..c0ef4e3 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/SearchableText.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/SearchableText.java @@ -1,8 +1,8 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model.text; +package com.knecon.fforesight.service.layoutparser.processor.model.text; import java.util.ArrayList; import java.util.List; -import com.knecon.fforesight.service.layoutparser.processor.classification.utils.TextNormalizationUtilities; +import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities; import lombok.Getter; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/SimplifiedSectionText.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/SimplifiedSectionText.java similarity index 74% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/SimplifiedSectionText.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/SimplifiedSectionText.java index beb8d8f..21d71be 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/SimplifiedSectionText.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/SimplifiedSectionText.java @@ -1,4 +1,4 @@ -package 
com.knecon.fforesight.service.layoutparser.processor.classification.model.text; +package com.knecon.fforesight.service.layoutparser.processor.model.text; import lombok.AllArgsConstructor; import lombok.Builder; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/SimplifiedText.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/SimplifiedText.java similarity index 80% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/SimplifiedText.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/SimplifiedText.java index ea9b7ca..d021c52 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/SimplifiedText.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/SimplifiedText.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model.text; +package com.knecon.fforesight.service.layoutparser.processor.model.text; import java.util.ArrayList; import java.util.List; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/StringFrequencyCounter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/StringFrequencyCounter.java similarity index 93% rename from 
layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/StringFrequencyCounter.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/StringFrequencyCounter.java index a210116..934b1b3 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/StringFrequencyCounter.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/StringFrequencyCounter.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model.text; +package com.knecon.fforesight.service.layoutparser.processor.model.text; import java.util.HashMap; import java.util.Map; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/TextDirection.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextDirection.java similarity index 92% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/TextDirection.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextDirection.java index e555301..8d1fa97 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/TextDirection.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextDirection.java @@ -1,4 +1,4 @@ 
-package com.knecon.fforesight.service.layoutparser.processor.classification.model.text; +package com.knecon.fforesight.service.layoutparser.processor.model.text; import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonValue; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/TextPageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java similarity index 94% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/TextPageBlock.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java index 892d13a..b9c816a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/TextPageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model.text; +package com.knecon.fforesight.service.layoutparser.processor.model.text; import static java.util.stream.Collectors.toSet; @@ -7,11 +7,11 @@ import java.util.Comparator; import java.util.List; import com.fasterxml.jackson.annotation.JsonIgnore; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.FloatFrequencyCounter; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType; -import 
com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons; -import com.knecon.fforesight.service.layoutparser.processor.classification.utils.TextNormalizationUtilities; +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter; +import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; +import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons; +import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities; import lombok.AllArgsConstructor; import lombok.Builder; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/TextPositionSequence.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java similarity index 99% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/TextPositionSequence.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java index 8b73a42..d4e58e8 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/TextPositionSequence.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model.text; +package com.knecon.fforesight.service.layoutparser.processor.model.text; import java.awt.geom.AffineTransform; 
import java.awt.geom.Point2D; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/UnclassifiedText.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/UnclassifiedText.java similarity index 67% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/UnclassifiedText.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/UnclassifiedText.java index 0d9bfb4..7da98c5 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/UnclassifiedText.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/UnclassifiedText.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model.text; +package com.knecon.fforesight.service.layoutparser.processor.model.text; import java.util.List; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/LegacyPDFStreamEngine.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/parsing/LegacyPDFStreamEngine.java similarity index 99% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/LegacyPDFStreamEngine.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/parsing/LegacyPDFStreamEngine.java index 
307d442..f2ece49 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/LegacyPDFStreamEngine.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/parsing/LegacyPDFStreamEngine.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package com.knecon.fforesight.service.layoutparser.processor.classification.parsing; +package com.knecon.fforesight.service.layoutparser.processor.parsing; import java.io.IOException; import java.io.InputStream; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFLinesTextStripper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/parsing/PDFLinesTextStripper.java similarity index 97% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFLinesTextStripper.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/parsing/PDFLinesTextStripper.java index e6bee7e..549f726 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFLinesTextStripper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/parsing/PDFLinesTextStripper.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.parsing; +package com.knecon.fforesight.service.layoutparser.processor.parsing; import java.awt.color.CMMException; import java.awt.geom.Point2D; 
@@ -35,9 +35,9 @@ import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.graphics.color.PDColor; import org.apache.pdfbox.text.TextPosition; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.RedTextPosition; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import lombok.Getter; import lombok.Setter; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFTextStripper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/parsing/PDFTextStripper.java similarity index 99% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFTextStripper.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/parsing/PDFTextStripper.java index 49e6a78..b7fe3e3 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFTextStripper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/parsing/PDFTextStripper.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package com.knecon.fforesight.service.layoutparser.processor.classification.parsing; +package com.knecon.fforesight.service.layoutparser.processor.parsing; import java.io.BufferedInputStream; import java.io.IOException; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/queue/MessageHandler.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/queue/MessageHandler.java index 1af9169..a834a25 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/queue/MessageHandler.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/queue/MessageHandler.java @@ -10,11 +10,10 @@ import org.springframework.amqp.rabbit.annotation.RabbitListener; import org.springframework.amqp.rabbit.core.RabbitTemplate; import org.springframework.stereotype.Service; -import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; -import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingService; +import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline; import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; @@ -25,7 +24,7 @@ import lombok.extern.slf4j.Slf4j; @RequiredArgsConstructor public class MessageHandler { - private final LayoutParsingService layoutParsingService; + private final LayoutParsingPipeline layoutParsingPipeline; private final ObjectMapper objectMapper; private final RabbitTemplate rabbitTemplate; @@ -42,7 +41,7 @@ public class MessageHandler { throw new 
AmqpRejectAndDontRequeueException(String.format("Error during last layout parsing of request with identifier: %s, do not retry.", layoutParsingRequest.identifier())); } - LayoutParsingFinishedEvent layoutParsingFinishedEvent = layoutParsingService.parseLayoutAndSaveFilesToStorage(layoutParsingRequest); + LayoutParsingFinishedEvent layoutParsingFinishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest); sendLayoutParsingFinishedEvent(layoutParsingFinishedEvent); log.info("Layout parsing finished {} in {} ms", layoutParsingRequest.identifier(), layoutParsingFinishedEvent.duration()); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/BodyTextFrameService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java similarity index 89% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/BodyTextFrameService.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java index bf8d597..bcb7ef4 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/BodyTextFrameService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.service; +package com.knecon.fforesight.service.layoutparser.processor.services; import java.util.List; @@ -6,13 +6,13 @@ import org.springframework.stereotype.Service; import 
com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.FloatFrequencyCounter; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.utils.PositionUtils; +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; @Service public class BodyTextFrameService { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/DividingColumnDetectionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/DividingColumnDetectionService.java new file mode 100644 index 0000000..5da1822 --- /dev/null +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/DividingColumnDetectionService.java @@ -0,0 +1,149 @@ +package com.knecon.fforesight.service.layoutparser.processor.services; + +import java.awt.geom.Rectangle2D; +import java.util.Collection; +import java.util.Comparator; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.stream.IntStream; + +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; + +import lombok.experimental.UtilityClass; + +@UtilityClass +public class DividingColumnDetectionService { + + private static final double SPLITTABLE_LINE_PERCENTAGE_THRESHOLD = 0.6; + private static final int MAX_NUMBER_OF_COLUMNS = 4; + + + public List detectColumns(List textPositionSequences, Rectangle2D mainBodyTextFrame) { + + if (textPositionSequences.size() < 2) { + return List.of(mainBodyTextFrame); + } + + List> linesWithGaps = LineDetectionService.findTextBlockInLines(textPositionSequences); + + Map> linesWithMatchingGapIndices = new HashMap<>(); + for (int numberOfColumns = 2; numberOfColumns <= MAX_NUMBER_OF_COLUMNS; numberOfColumns++) { + linesWithMatchingGapIndices.put(numberOfColumns, findConsecutiveLinesWithMatchingGaps(linesWithGaps, mainBodyTextFrame.getWidth(), numberOfColumns)); + } + + int optimalNumberOfColumns = findOptimalNumberOfColumns(linesWithMatchingGapIndices, linesWithGaps.size()); + if (optimalNumberOfColumns == 1) { + return List.of(mainBodyTextFrame); + } + return buildColumns(mainBodyTextFrame, getLinesWithMatchingGaps(linesWithMatchingGapIndices.get(optimalNumberOfColumns), linesWithGaps), optimalNumberOfColumns); + } + + + private static List findConsecutiveLinesWithMatchingGaps(List> linesWithGaps, double width, int numberOfColumns) { + + List booleans = lineHasMatchingGap(linesWithGaps, width, numberOfColumns); + return 
findConsecutiveTrueIndicesWithMaxLengthRun(booleans); + } + + + private List lineHasMatchingGap(List> linesWithGaps, double width, int numberOfColumns) { + + return linesWithGaps.stream() + .map(blocksWithGaps -> IntStream.range(1, numberOfColumns) + .allMatch(columnIndex -> noBlocksIntersectX(blocksWithGaps, calculateGapLocation(width, numberOfColumns, columnIndex)))) + .toList(); + } + + + private List findConsecutiveTrueIndicesWithMaxLengthRun(List booleans) { + + List maxConsecutiveTrueIndices = new LinkedList<>(); + List currentConsecutiveTrueIndices = new LinkedList<>(); + for (int i = 0; i < booleans.size(); i++) { + if (!booleans.get(i)) { + if (currentConsecutiveTrueIndices.isEmpty()) { + continue; + } + if (currentConsecutiveTrueIndices.size() > maxConsecutiveTrueIndices.size()) { + maxConsecutiveTrueIndices = currentConsecutiveTrueIndices; + } + currentConsecutiveTrueIndices = new LinkedList<>(); + continue; + } + currentConsecutiveTrueIndices.add(i); + } + if (currentConsecutiveTrueIndices.size() > maxConsecutiveTrueIndices.size()) { + return currentConsecutiveTrueIndices; + } + return maxConsecutiveTrueIndices; + } + + + private static int findOptimalNumberOfColumns(Map> linesWithMatchingGapIndices, Integer numberOfLines) { + + return linesWithMatchingGapIndices.entrySet() + .stream() + .max(comparePercentages(numberOfLines)) + .filter(entry -> percentageIsAboveThreshold(entry, numberOfLines)) + .map(Map.Entry::getKey) + .orElse(1); + } + + + private List buildColumns(Rectangle2D mainBodyTextFrame, List rectanglesToMerge, int optimalColumnCount) { + + if (optimalColumnCount == 1 || rectanglesToMerge.isEmpty()) { + return List.of(mainBodyTextFrame); + } + + double maxY = rectanglesToMerge.get(0).getMaxY(); + double minY = rectanglesToMerge.get(rectanglesToMerge.size() - 1).getMinY(); + + List columns = new LinkedList<>(); + double width = mainBodyTextFrame.getWidth() / optimalColumnCount; + double height = maxY - minY; + for (int i = 0; i < 
optimalColumnCount; i++) { + columns.add(new Rectangle2D.Double(mainBodyTextFrame.getMinY() + i * width, minY, width, height)); + } + return columns; + } + + + private Comparator>> comparePercentages(Integer numberOfLines) { + + return Comparator.comparingDouble(entry -> calculatePercentage(entry.getValue().size(), numberOfLines)); + } + + + private List getLinesWithMatchingGaps(List linesWithMatchingGapIndices, List> linesWithGaps) { + + return linesWithMatchingGapIndices.stream().map(linesWithGaps::get).flatMap(Collection::stream).toList(); + } + + + private boolean percentageIsAboveThreshold(Map.Entry> entry, Integer numberOfLines) { + + return calculatePercentage(entry.getValue().size(), numberOfLines) > SPLITTABLE_LINE_PERCENTAGE_THRESHOLD; + } + + + private double calculatePercentage(Integer numberOfMatchingLines, Integer numberOfLines) { + + return ((double) numberOfMatchingLines) / ((double) numberOfLines); + } + + + private double calculateGapLocation(double pageWidth, int numberOfColumns, int columnIndex) { + + return (pageWidth / numberOfColumns) * columnIndex; + } + + + private Boolean noBlocksIntersectX(List blocksWithGaps, double x) { + + return blocksWithGaps.stream().noneMatch(rect -> rect.getMaxX() > x && rect.getMinX() < x); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapDetectionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapDetectionService.java new file mode 100644 index 0000000..6d89762 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapDetectionService.java @@ -0,0 +1,169 @@ +package com.knecon.fforesight.service.layoutparser.processor.services; + +import java.awt.geom.Rectangle2D; +import java.util.LinkedList; +import java.util.List; + 
+import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; + +import lombok.AllArgsConstructor; +import lombok.experimental.UtilityClass; + +@UtilityClass +public class GapDetectionService { + + private static final double X_GAP_FACTOR = 0.3; // multiplied with average text height, determines the minimum distance of gaps in lines + private static final double Y_GAP_FACTOR = 1; + private static final double NEW_LINE_FACTOR = 0.2; + + + public static Gaps findGapsInLines(List sortedTextPositionSequences, Rectangle2D mainBodyTextFrame) { + + if (sortedTextPositionSequences.isEmpty()) { + return new Gaps(); + } + //assertAllTextPositionsHaveSameDir(textPositionSequences); + + final double avgTextPositionHeight = getAvgTextPositionHeight(sortedTextPositionSequences); + + XGapsContext xGapContext = XGapsContext.init(mainBodyTextFrame); + YGapsContext yGapContext = YGapsContext.init(mainBodyTextFrame); + + var previousTextPosition = sortedTextPositionSequences.get(0); + Rectangle2D rectangle = toRectangle2D(previousTextPosition); + + yGapContext.addGapFromTopOfMainBody(rectangle); + xGapContext.addGapFromLeftEdgeOfMainBody(rectangle); + + for (TextPositionSequence currentTextPosition : sortedTextPositionSequences.subList(1, sortedTextPositionSequences.size())) { + + double yGap = Math.abs(currentTextPosition.getMaxYDirAdj() - previousTextPosition.getMaxYDirAdj()); + double xGap = Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getMinXDirAdj()); + Rectangle2D previousTextPositionBBox = toRectangle2D(previousTextPosition); + Rectangle2D currentTextPositionBBox = toRectangle2D(currentTextPosition); + + if (yGap > avgTextPositionHeight * Y_GAP_FACTOR) { + + yGapContext.addGap(mainBodyTextFrame.getMinX(), currentTextPositionBBox.getMaxY(), mainBodyTextFrame.getWidth(), yGap); + + } + if (yGap > avgTextPositionHeight * NEW_LINE_FACTOR) { + + xGapContext.addGapToRightEdgeOfMainBody(previousTextPositionBBox); + 
xGapContext.gapsInCurrentLine = new LinkedList<>(); + xGapContext.gapsPerLine.add(xGapContext.gapsInCurrentLine); + xGapContext.addGapFromLeftEdgeOfMainBody(currentTextPositionBBox); + + + } else if (xGap <= avgTextPositionHeight * X_GAP_FACTOR) { + addGapToLine(currentTextPositionBBox, previousTextPositionBBox, xGapContext); + } + previousTextPosition = currentTextPosition; + } + xGapContext.addGapToRightEdgeOfMainBody(toRectangle2D(sortedTextPositionSequences.get(sortedTextPositionSequences.size() - 1))); + xGapContext.gapsPerLine.add(xGapContext.gapsInCurrentLine); + + return new Gaps(xGapContext.gapsPerLine, yGapContext.gapsPerLine); + } + + + private static Rectangle2D toRectangle2D(TextPositionSequence textPosition) { + + return RectangleTransformations.toRectangle2D(textPosition.getRectangle()); + } + + + private static void addGapToLine(Rectangle2D currentTextPosition, Rectangle2D previousTextPosition, XGapsContext context) { + + context.gapsInCurrentLine.add(new Rectangle2D.Double(previousTextPosition.getMaxX(), + previousTextPosition.getMinY(), + currentTextPosition.getMinX() - previousTextPosition.getMaxX(), + (previousTextPosition.getHeight() + currentTextPosition.getHeight()) / 2)); + } + + + private static void assertAllTextPositionsHaveSameDir(List textPositionSequences) { + + assert textPositionSequences.stream().map(TextPositionSequence::getDir).allMatch(a -> a.equals(textPositionSequences.get(0).getDir())); + } + + + private static double getAvgTextPositionHeight(List textPositionSequences) { + + return textPositionSequences.stream().mapToDouble(TextPositionSequence::getHeight).average().orElseThrow(); + } + + + @AllArgsConstructor + private static class YGapsContext { + + List> gapsPerLine; + List gapsInCurrentLine; + Rectangle2D mainBodyTextFrame; + + + public static YGapsContext init(Rectangle2D mainBodyTextFrame) { + + List> initialLinesWithGaps = new LinkedList<>(); + List initialBlocksInLine = new LinkedList<>(); + 
initialLinesWithGaps.add(initialBlocksInLine); + return new YGapsContext(initialLinesWithGaps, initialBlocksInLine, mainBodyTextFrame); + } + + + public void addGapFromTopOfMainBody(Rectangle2D rectangle) { + + gapsInCurrentLine.add(new Rectangle2D.Double(mainBodyTextFrame.getMinX(), + rectangle.getMaxY(), + mainBodyTextFrame.getWidth(), + mainBodyTextFrame.getMaxY() - rectangle.getMaxY())); + } + + + + public void addGap(double x, double y, double w, double h) { + + gapsInCurrentLine.add(new Rectangle2D.Double(x, y, w, h)); + } + + } + + @AllArgsConstructor + private static class XGapsContext { + + List> gapsPerLine; + List gapsInCurrentLine; + Rectangle2D mainBodyTextFrame; + + + public static XGapsContext init(Rectangle2D mainBodyTextFrame) { + + List> initialLinesWithGaps = new LinkedList<>(); + List initialBlocksInLine = new LinkedList<>(); + initialLinesWithGaps.add(initialBlocksInLine); + return new XGapsContext(initialLinesWithGaps, initialBlocksInLine, mainBodyTextFrame); + } + + + public void addGapToRightEdgeOfMainBody(Rectangle2D textPosition) { + + Rectangle2D leftGap = new Rectangle2D.Double(textPosition.getMaxX(), + textPosition.getMinY(), + mainBodyTextFrame.getMaxX() - textPosition.getMaxX(), + textPosition.getHeight()); + gapsInCurrentLine.add(leftGap); + } + + + public void addGapFromLeftEdgeOfMainBody(Rectangle2D textPosition) { + + Rectangle2D leftGap = new Rectangle2D.Double(mainBodyTextFrame.getMinX(), + textPosition.getMinY(), + textPosition.getMinX() - mainBodyTextFrame.getMinX(), + textPosition.getHeight()); + gapsInCurrentLine.add(leftGap); + } + + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapsAcrossLinesService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapsAcrossLinesService.java new file mode 100644 index 0000000..49d31f3 --- /dev/null +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapsAcrossLinesService.java @@ -0,0 +1,199 @@ +package com.knecon.fforesight.service.layoutparser.processor.services; + +import java.awt.geom.Rectangle2D; +import java.awt.geom.RectangularShape; +import java.util.LinkedList; +import java.util.List; +import java.util.Queue; +import java.util.stream.Stream; + +import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation; + +import lombok.AllArgsConstructor; +import lombok.Getter; +import lombok.RequiredArgsConstructor; +import lombok.experimental.UtilityClass; + +@UtilityClass +public class GapFindingColumnDetectionService implements ColumnDetectionService { + + private static final double GAP_WIDTH_THRESHOLD_FACTOR = 0.01; // multiplied with avg text height + private static final double LINE_COUNT_THRESHOLD_FACTOR = 0.3; // multiplied with average line count per page + + + public List detectColumns(GapInformation gapInformation, Rectangle2D mainBodyTextFrame) { + + if (gapInformation.getXGaps().size() < 2) { + return List.of(mainBodyTextFrame); + } + double avgHeight = gapInformation.getXGaps() + .stream() + .filter(gaps -> !gaps.isEmpty()) + .map(gaps -> gaps.get(0)) + .mapToDouble(RectangularShape::getHeight) + .average() + .orElseThrow(); + + ColumnFactory columnFactory = ColumnFactory.init(avgHeight, gapInformation.getXGaps().size()); + gapInformation.getXGaps().get(0).stream().map(Column::new).forEach(columnFactory::addToQueue); + List> xGaps = gapInformation.getXGaps(); + for (var gaps : xGaps.subList(1, xGaps.size())) { + + while (columnFactory.hasColumnsToProcess()) { + Column column = columnFactory.getNext(); + rememberColumnIfValid(columnFactory, column); + elongateColumnsAndFilterForWidth(column, gaps, columnFactory).forEach(columnFactory::setToStillInProgress); + } + columnFactory.addStillInProgressToQueue(); + columnFactory.addGapsToQueue(gaps); + } + + 
return columnFactory.outputColumns.stream() + .filter(column -> columnFactory.outputColumns.stream().filter(column::intersectsX).noneMatch(column1 -> column1.lineCount > column.lineCount)) + .filter(column -> ) + .map(Column::getRectangle2D) + .toList(); + } + + + private static void rememberColumnIfValid(ColumnFactory columnFactory, Column column) { + + if (column.lineCount >= LINE_COUNT_THRESHOLD_FACTOR * (double) columnFactory.lineCount) { + columnFactory.outputColumns.add(column); + } + } + + + private static Stream elongateColumnsAndFilterForWidth(Column column, List gaps, ColumnFactory columnFactory) { + + return gaps.stream()// + .filter(gap -> column.getIntersectionWidth(gap) > GAP_WIDTH_THRESHOLD_FACTOR * columnFactory.avgHeight)// + .map(column::addNewLineAndShrink); + + } + + + private static Rectangle2D correctRectangle(Rectangle2D rectangle2D) { + + double minX = Math.min(rectangle2D.getMinX(), rectangle2D.getMaxX()); + double minY = Math.min(rectangle2D.getMinY(), rectangle2D.getMaxY()); + double maxX = Math.max(rectangle2D.getMinX(), rectangle2D.getMaxX()); + double maxY = Math.max(rectangle2D.getMinY(), rectangle2D.getMaxY()); + return new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY); + } + + + @Getter + @AllArgsConstructor + private class Column { + + Rectangle2D rectangle2D; + int lineCount = 1; + + + public Column(Rectangle2D rectangle2D) { + + this.rectangle2D = correctRectangle(rectangle2D); + } + + + public boolean intersectsX(Rectangle2D rectangle2D) { + + return rectangle2D.getMinX() < this.rectangle2D.getMaxX() && this.rectangle2D.getMinX() < rectangle2D.getMaxX(); + } + + + public boolean intersectsX(Column column) { + + return this.intersectsX(column.getRectangle2D()); + } + + + public double getIntersectionWidth(Rectangle2D rectangle2D) { + + if (!intersectsX(rectangle2D)) { + return -1; + } + double min_x = Math.max(rectangle2D.getMinX(), this.rectangle2D.getMinX()); + double max_x = Math.min(rectangle2D.getMaxX(), 
this.rectangle2D.getMaxX()); + return max_x - min_x; + } + + + public Column addNewLineAndShrink(Rectangle2D rectangle2D) { + + var correctedRectangle = correctRectangle(rectangle2D); + double min_x = Math.max(correctedRectangle.getMinX(), this.rectangle2D.getMinX()); + double max_x = Math.min(correctedRectangle.getMaxX(), this.rectangle2D.getMaxX()); + double min_y = correctedRectangle.getMinY(); + double max_y = this.rectangle2D.getMaxY(); + double width = max_x - min_x; + double height = max_y - min_y; + return new Column(new Rectangle2D.Double(min_x, min_y, width, height), lineCount + 1); + } + + } + + @RequiredArgsConstructor + private class ColumnFactory { + + final double avgHeight; + final int lineCount; + + List outputColumns = new LinkedList<>(); + Queue columnQueue = new LinkedList<>(); + List columnsToQueue = new LinkedList<>(); + + + public static ColumnFactory init(double avgHeight, int lineCount) { + + return new ColumnFactory(Math.abs(avgHeight), lineCount); + } + + + public Column getNext() { + + return columnQueue.remove(); + } + + + public void addToQueue(Column column) { + + columnQueue.add(column); + } + + + public void addToQueue(Rectangle2D gap) { + + columnQueue.add(new Column(gap)); + } + + + private boolean hasColumnsToProcess() { + + return columnQueue.peek() != null; + } + + + public void setToStillInProgress(Column column) { + + columnsToQueue.add(column); + } + + + private void addStillInProgressToQueue() { + + for (int i = columnsToQueue.size() - 1; i >= 0; i--) { + columnQueue.add(columnsToQueue.remove(i)); + } + } + + + public void addGapsToQueue(List gaps) { + + gaps.forEach(this::addToQueue); + } + + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/InvisibleTableDetectionService.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/InvisibleTableDetectionService.java new file mode 100644 index 0000000..d5391a8 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/InvisibleTableDetectionService.java @@ -0,0 +1,5 @@ +package com.knecon.fforesight.service.layoutparser.processor.services; + +public class InvisibleTableDetectionService { + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/LineDetectionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/LineDetectionService.java new file mode 100644 index 0000000..378748c --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/LineDetectionService.java @@ -0,0 +1,122 @@ +package com.knecon.fforesight.service.layoutparser.processor.services; + +import java.awt.geom.Rectangle2D; +import java.util.Collections; +import java.util.LinkedList; +import java.util.List; + +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionSequenceComparator; + +import lombok.AllArgsConstructor; +import lombok.experimental.UtilityClass; + +@UtilityClass +public class LineDetectionService { + + private static final double X_GAP_FACTOR = 1; // multiplied with average text height, determines the minimum distance of gaps in lines + + + public static List> findTextBlockInLines(List textPositionSequences) { + + if (textPositionSequences.isEmpty()) { + return Collections.emptyList(); + } + + final double avgTextPositionHeight = 
getAvgTextPositionHeight(textPositionSequences); + + TextBlockContext context = TextBlockContext.init(); + + List sortedTextPositionSequence = textPositionSequences.stream().sorted(new TextPositionSequenceComparator()).toList(); + + var previousTextPosition = sortedTextPositionSequence.get(0); + context.textPositionsToMerge.add(previousTextPosition); + for (TextPositionSequence currentTextPosition : sortedTextPositionSequence.subList(1, sortedTextPositionSequence.size())) { + if (isNewLine(currentTextPosition, previousTextPosition, avgTextPositionHeight) || isSplitByOrientation(currentTextPosition, previousTextPosition)) { + addBlockToLine(context); + startNewLine(currentTextPosition, context); + } else if (isXGap(currentTextPosition, previousTextPosition, avgTextPositionHeight)) { + addBlockToLine(context); + startNewBlock(currentTextPosition, context); + } else { + context.textPositionsToMerge.add(currentTextPosition); + } + previousTextPosition = currentTextPosition; + } + addBlockToLine(context); + return context.textBlocksInLines; + } + + + + private static double getAvgTextPositionHeight(List textPositionSequences) { + + return textPositionSequences.stream().mapToDouble(TextPositionSequence::getHeight).average().orElseThrow(); + } + + + private static boolean isXGap(TextPositionSequence currentTextPosition, TextPositionSequence previousTextPosition, double avgTextPositionHeight) { + + return Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getMinXDirAdj()) > (avgTextPositionHeight * X_GAP_FACTOR); + } + + + private static boolean isSplitByOrientation(TextPositionSequence currentTextPosition, TextPositionSequence previousTextPosition) { + + return !previousTextPosition.getDir().equals(currentTextPosition.getDir()); + } + + + private static boolean isNewLine(TextPositionSequence currentTextPosition, TextPositionSequence previousTextPosition, double avgTextPositionHeight) { + + return Math.abs(previousTextPosition.getMinYDirAdj() - 
currentTextPosition.getMinYDirAdj()) > avgTextPositionHeight; + } + + + private static void startNewBlock(TextPositionSequence currentTextPosition, TextBlockContext context) { + + context.textPositionsToMerge = new LinkedList<>(); + context.textPositionsToMerge.add(currentTextPosition); + } + + + private static void addBlockToLine(TextBlockContext context) { + + context.blocksInCurrentLine.add(textPositionBBox(context.textPositionsToMerge)); + } + + + private static void startNewLine(TextPositionSequence current, TextBlockContext context) { + + context.blocksInCurrentLine = new LinkedList<>(); + startNewBlock(current, context); + context.textBlocksInLines.add(context.blocksInCurrentLine); + } + + + private Rectangle2D textPositionBBox(List textPositionSequences) { + + return RectangleTransformations.rectangleBBox(textPositionSequences.stream().map(TextPositionSequence::getRectangle).toList()); + } + + + @AllArgsConstructor + private class TextBlockContext { + + List> textBlocksInLines; + List blocksInCurrentLine; + List textPositionsToMerge; + + + public static TextBlockContext init() { + + List> initialLinesWithGaps = new LinkedList<>(); + List initialBlocksInLine = new LinkedList<>(); + initialLinesWithGaps.add(initialBlocksInLine); + return new TextBlockContext(initialLinesWithGaps, initialBlocksInLine, new LinkedList<>()); + } + + } + + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/MainBodyTextFrameExtractionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/MainBodyTextFrameExtractionService.java new file mode 100644 index 0000000..6473717 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/MainBodyTextFrameExtractionService.java @@ -0,0 +1,5 @@ +package 
com.knecon.fforesight.service.layoutparser.processor.services; + +public class MainBodyTextFrameExtractionService { + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageInformationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageInformationService.java new file mode 100644 index 0000000..fce800b --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageInformationService.java @@ -0,0 +1,2 @@ +package com.knecon.fforesight.service.layoutparser.processor.services;public class PageInformationService { +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/PdfParsingService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PdfParsingService.java similarity index 66% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/PdfParsingService.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PdfParsingService.java index e6c22e2..5b7fb23 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/PdfParsingService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PdfParsingService.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.service; +package 
com.knecon.fforesight.service.layoutparser.processor.services; import java.util.ArrayList; import java.util.List; @@ -9,16 +9,20 @@ import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.springframework.stereotype.Service; +import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.adapter.ImageServiceResponseAdapter; import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableCells; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.CleanRulings; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence; -import com.knecon.fforesight.service.layoutparser.processor.classification.parsing.PDFLinesTextStripper; +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import 
com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.parsing.PDFLinesTextStripper; +import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService; +import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService; +import com.knecon.fforesight.service.layoutparser.processor.services.blockification.TaasBlockificationService; import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; @@ -31,11 +35,16 @@ public class PdfParsingService { private final RulingCleaningService rulingCleaningService; private final TableExtractionService tableExtractionService; - private final BlockificationService blockificationService; private final ImageServiceResponseAdapter imageServiceResponseAdapter; + private final TaasBlockificationService taasBlockificationService; + private final DocuMineBlockificationService docuMineBlockificationService; + private final RedactManagerBlockificationService redactManagerBlockificationService; - public ClassificationDocument parseDocument(PDDocument originDocument, Map> pdfTableCells, Map> pdfImages) { + public ClassificationDocument parseDocument(LayoutParsingType layoutParsingType, + PDDocument originDocument, + Map> pdfTableCells, + Map> pdfImages) { ClassificationDocument document = new ClassificationDocument(); List classificationPages = new ArrayList<>(); @@ -44,7 +53,7 @@ public class PdfParsingService { long pageCount = originDocument.getNumberOfPages(); for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) { - parsePage(pdfImages, originDocument, pdfTableCells, document, classificationPages, pageNumber); + parsePage(layoutParsingType, pdfImages, originDocument, pdfTableCells, document, classificationPages, pageNumber); } document.setPages(classificationPages); @@ -54,7 +63,8 @@ public class PdfParsingService { @SneakyThrows - 
private void parsePage(Map> pdfImages, + private void parsePage(LayoutParsingType layoutParsingType, + Map> pdfImages, PDDocument pdDocument, Map> pdfTableCells, ClassificationDocument document, @@ -79,7 +89,12 @@ public class PdfParsingService { stripper.getRulings(), stripper.getMinCharWidth(), stripper.getMaxCharHeight()); - ClassificationPage classificationPage = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); + + ClassificationPage classificationPage = switch (layoutParsingType) { + case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); + case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); + case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); + }; classificationPage.setRotation(rotation); classificationPage.setLandscape(isLandscape); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RectangleTransformations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RectangleTransformations.java deleted file mode 100644 index 3618dbd..0000000 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RectangleTransformations.java +++ /dev/null @@ -1,95 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.processor.services; - -import static java.lang.String.format; - -import java.awt.geom.Area; -import java.awt.geom.Rectangle2D; -import java.util.Arrays; -import java.util.List; -import java.util.Set; -import java.util.function.BiConsumer; 
-import java.util.function.BinaryOperator; -import java.util.function.Function; -import java.util.function.Supplier; -import java.util.stream.Collector; - -import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.AtomicTextBlock; - -import lombok.experimental.UtilityClass; - -@UtilityClass -public class RectangleTransformations { - - public static Rectangle2D pad(Rectangle2D rectangle2D, int deltaX, int deltaY) { - - return new Rectangle2D.Double(rectangle2D.getMinX() - deltaX, rectangle2D.getMinY() - deltaY, rectangle2D.getWidth() + 2 * deltaX, rectangle2D.getHeight() + 2 * deltaY); - } - - - public static Rectangle2D bBoxUnionAtomicTextBlock(List atomicTextBlocks) { - - return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DUnion()); - } - - - public static Rectangle2D rectangleUnion(List rectangle2DList) { - - return rectangle2DList.stream().collect(new Rectangle2DUnion()); - } - - - public static String toString(Rectangle2D rectangle2D) { - - return format("%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight()); - } - - - public static Rectangle2D parseRectangle2D(String bBox) { - - List floats = Arrays.stream(bBox.split(",")).map(Float::parseFloat).toList(); - return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3)); - } - - - private static class Rectangle2DUnion implements Collector { - - @Override - public Supplier supplier() { - - return Area::new; - } - - - @Override - public BiConsumer accumulator() { - - return (area, rectangle2D) -> area.add(new Area(rectangle2D)); - } - - - @Override - public BinaryOperator combiner() { - - return (area1, area2) -> { - area1.add(area2); - return area1; - }; - } - - - @Override - public Function finisher() { - - return Area::getBounds2D; - } - - - @Override - public Set characteristics() { - - return Set.of(Characteristics.CONCURRENT, 
Characteristics.UNORDERED); - } - - } - -} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/RulingCleaningService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java similarity index 95% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/RulingCleaningService.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java index 8e8de6f..bb102c9 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/RulingCleaningService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.service; +package com.knecon.fforesight.service.layoutparser.processor.services; import java.awt.geom.Line2D; import java.awt.geom.Point2D; @@ -13,9 +13,9 @@ import java.util.Map; import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableCells; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.CleanRulings; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling; -import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons; +import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; 
+import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionGridCreatorService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionGridCreatorService.java new file mode 100644 index 0000000..5a4a40f --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionGridCreatorService.java @@ -0,0 +1,146 @@ +package com.knecon.fforesight.service.layoutparser.processor.services; + +import java.awt.geom.Rectangle2D; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.function.BiConsumer; +import java.util.function.BinaryOperator; +import java.util.function.Function; +import java.util.function.Supplier; +import java.util.stream.Collector; +import java.util.stream.Stream; + +import org.springframework.stereotype.Service; + +import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point; +import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.CellRectangle; +import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionGrid; +import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionRectangle; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode; +import 
com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Table; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.TableCell; + +import lombok.RequiredArgsConstructor; + +@Service +@RequiredArgsConstructor +public class SectionGridCreatorService { + + public SectionGrid createSectionGrid(Document document) { + + Map> sectionBBox = document.streamAllSubNodesOfType(NodeType.SECTION).map(SemanticNode::getBBox).collect(new SectionGridCollector()); + Map> paragraphBBox = document.streamAllSubNodesOfType(NodeType.PARAGRAPH).map(SemanticNode::getBBox).collect(new SectionGridCollector()); + Map> headlineBBox = document.streamAllSubNodesOfType(NodeType.HEADLINE).map(SemanticNode::getBBox).collect(new SectionGridCollector()); + Map> tableBBox = document.streamAllSubNodesOfType(NodeType.TABLE).map(node -> (Table) node).collect(new TableGridCollector()); + var sectionGrid = new SectionGrid(); + + sectionGrid.setRectanglesPerPage(mergeMapsByConcatenatingLists(// + mergeMapsByConcatenatingLists(paragraphBBox, headlineBBox), // + mergeMapsByConcatenatingLists(sectionBBox, tableBBox))); + + return sectionGrid; + } + + + private static abstract class GridCollector implements Collector>, Map>> { + + @Override + public Supplier>> supplier() { + + return HashMap::new; + } + + + @Override + public Function>, Map>> finisher() { + + return Function.identity(); + } + + + @Override + public BinaryOperator>> combiner() { + + return SectionGridCreatorService::mergeMapsByConcatenatingLists; + } + + + @Override + public Set characteristics() { + + return Set.of(Characteristics.IDENTITY_FINISH, Characteristics.CONCURRENT, Characteristics.UNORDERED); + } + + } + + private static class TableGridCollector extends GridCollector { + + @Override + public BiConsumer>, Table> accumulator() { + + return (map, table) -> table.getPages() + .forEach(page -> map.merge(page.getNumber(), List.of(toSectionRectangle(table, page, table.getPages().size())), 
SectionGridCreatorService::concatLists)); + } + + + private static SectionRectangle toSectionRectangle(Table table, Page page, int numberOfParts) { + + Rectangle2D rect = table.getBBox().get(page); + List tableCellRectangles = table.streamTableCells() + .map(TableCell::getBBox) + .map(map -> map.get(page)) + .filter(Objects::nonNull) + .map(rectangle2D -> new CellRectangle(new Point((float) rectangle2D.getX(), (float) rectangle2D.getY()), + (float) rectangle2D.getWidth(), + (float) rectangle2D.getHeight())) + .toList(); + return new SectionRectangle(new Point((float) rect.getX(), (float) rect.getY()), + (float) rect.getWidth(), + (float) rect.getHeight(), + 1, + numberOfParts, + tableCellRectangles); + } + + } + + private static class SectionGridCollector extends GridCollector> { + + @Override + public BiConsumer>, Map> accumulator() { + + return (mapToKeep, mapToMerge) -> mapToMerge.forEach((page, rectangle) -> mapToKeep.merge(page.getNumber(), + List.of(toSectionRectangle(rectangle, mapToMerge.values().size())), + SectionGridCreatorService::concatLists)); + + } + + + private static SectionRectangle toSectionRectangle(Rectangle2D rect, int numberOfParts) { + + return new SectionRectangle(new Point((float) rect.getX(), (float) rect.getY()), (float) rect.getWidth(), (float) rect.getHeight(), 1, numberOfParts, null); + } + + } + + + private static Map> mergeMapsByConcatenatingLists(Map> mapToKeep, + Map> mapToMerge) { + + mapToMerge.forEach((page, rectangle) -> mapToKeep.merge(page, rectangle, SectionGridCreatorService::concatLists)); + return mapToKeep; + } + + + private static List concatLists(List l1, List l2) { + + return Stream.concat(l1.stream(), l2.stream()).toList(); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/SectionsBuilderService.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java similarity index 90% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/SectionsBuilderService.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java index 044e98b..04cc930 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/SectionsBuilderService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.service; +package com.knecon.fforesight.service.layoutparser.processor.services; import java.util.ArrayList; import java.util.Collections; @@ -9,18 +9,18 @@ import java.util.stream.Collectors; import org.springframework.stereotype.Service; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationFooter; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationHeader; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationSection; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType; -import 
com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.UnclassifiedText; +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationSection; +import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; +import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText; import lombok.extern.slf4j.Slf4j; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/TableExtractionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java similarity index 92% rename from 
layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/TableExtractionService.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java index 4983220..c89db54 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/TableExtractionService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.service; +package com.knecon.fforesight.service.layoutparser.processor.services; import java.awt.geom.Point2D; import java.util.ArrayList; @@ -12,15 +12,15 @@ import java.util.Set; import org.springframework.stereotype.Service; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.CleanRulings; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Rectangle; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons; +import 
com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons; @Service public class TableExtractionService { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TextPositionSequenceSorter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TextPositionSequenceSorter.java new file mode 100644 index 0000000..353ab44 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TextPositionSequenceSorter.java @@ -0,0 +1,75 @@ +package com.knecon.fforesight.service.layoutparser.processor.services; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Collection; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.springframework.core.io.ClassPathResource; + +import com.knecon.fforesight.service.layoutparser.processor.model.TextPositionsWithPage; +import 
com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.parsing.PDFLinesTextStripper; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; +import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionSequenceComparator; + +import lombok.experimental.UtilityClass; + +@UtilityClass +public class TextPositionSequenceExtractionService { + + public List getSortedTextPositionsWithPages(String filename) throws IOException { + + List textPositionSequencesPerPage = new LinkedList<>(); + try (InputStream inputStream = new ClassPathResource(filename).getInputStream()) { + + PDDocument pdDocument = Loader.loadPDF(inputStream); + + for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) { + + PDFLinesTextStripper stripper = new PDFLinesTextStripper(); + PDPage pdPage = pdDocument.getPage(pageNumber - 1); + stripper.setPageNumber(pageNumber); + stripper.setStartPage(pageNumber); + stripper.setEndPage(pageNumber); + stripper.setPdpage(pdPage); + stripper.getText(pdDocument); + + // var sortedTextPositionSequences = stripper.getTextPositionSequences(); + Map> sortedTextPositionSequencesPerDir = stripper.getTextPositionSequences() + .stream() + .sorted(new TextPositionSequenceComparator()) + .collect(Collectors.groupingBy(textPositionSequence -> textPositionSequence.getDir().getDegrees())); + + var sortedTextPositionSequences = sortByDirAccordingToPageRotation(sortedTextPositionSequencesPerDir, pdPage.getRotation()); + + textPositionSequencesPerPage.add(new TextPositionsWithPage(sortedTextPositionSequences, RectangleTransformations.toRectangle2D(pdPage.getCropBox()))); + } + + pdDocument.close(); + } + + return textPositionSequencesPerPage; + } + + + public List sortByDirAccordingToPageRotation(Map> sortedTextPositionSequencesPerDir, int rotation) { + + LinkedList sortedKeys = new 
LinkedList<>(sortedTextPositionSequencesPerDir.keySet().stream().sorted().toList()); + + for (int i = 0; i < sortedKeys.size(); i++) { + if (sortedKeys.get(i) < rotation) { + Float keyToSwap = sortedKeys.remove(i); + sortedKeys.addLast(keyToSwap); + } + } + return sortedKeys.stream().map(sortedTextPositionSequencesPerDir::get).flatMap(Collection::stream).toList(); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java new file mode 100644 index 0000000..d5bd90d --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java @@ -0,0 +1,229 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.blockification; + +import static java.util.stream.Collectors.toSet; + +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter; +import com.knecon.fforesight.service.layoutparser.processor.model.Orientation; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import 
com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil; + +@Service +public class DocuMineBlockificationService { + + static final float THRESHOLD = 1f; + + + /** + * This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions. + * This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this! + * Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling. + * + * @param textPositions The words of a page. + * @param horizontalRulingLines Horizontal table lines. + * @param verticalRulingLines Vertical table lines. + * @return Page object that contains the Textblock and text statistics. + */ + public ClassificationPage blockify(List textPositions, List horizontalRulingLines, List verticalRulingLines) { + + List chunkWords = new ArrayList<>(); + List chunkBlockList1 = new ArrayList<>(); + + float minX = 1000, maxX = 0, minY = 1000, maxY = 0; + TextPositionSequence prev = null; + + boolean wasSplitted = false; + Float splitX1 = null; + for (TextPositionSequence word : textPositions) { + + boolean lineSeparation = prev != null && word.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * 1.25; + boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight(); + boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj(); + boolean negativeXGap = prev != null && word.getMinXDirAdj() - minX < -5; + boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj(); + boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines); + boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir()); + boolean 
splitByOtherFontAndOtherY = prev != null && prev.getMaxYDirAdj() != word.getMaxYDirAdj() && (word.getFontStyle().contains("bold") && !prev.getFontStyle() + .contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold")); + + if (prev != null && (lineSeparation || startFromTop || splitByDir || isSplitByRuling || splitByOtherFontAndOtherY || negativeXGap)) { + + Orientation prevOrientation = null; + if (!chunkBlockList1.isEmpty()) { + prevOrientation = chunkBlockList1.get(chunkBlockList1.size() - 1).getOrientation(); + } + + TextPageBlock cb1 = buildTextBlock(chunkWords); + chunkBlockList1.add(cb1); + chunkWords = new ArrayList<>(); + + if (splitByX && !isSplitByRuling) { + wasSplitted = true; + cb1.setOrientation(Orientation.LEFT); + splitX1 = word.getMinXDirAdj(); + } else if (newLineAfterSplit && !isSplitByRuling) { + wasSplitted = false; + cb1.setOrientation(Orientation.RIGHT); + splitX1 = null; + } else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) { + cb1.setOrientation(Orientation.LEFT); + } + + minX = 1000; + maxX = 0; + minY = 1000; + maxY = 0; + prev = null; + } + + chunkWords.add(word); + + prev = word; + if (word.getMinXDirAdj() < minX) { + minX = word.getMinXDirAdj(); + } + if (word.getMaxXDirAdj() > maxX) { + maxX = word.getMaxXDirAdj(); + } + if (word.getMinYDirAdj() < minY) { + minY = word.getMinYDirAdj(); + } + if (word.getMaxYDirAdj() > maxY) { + maxY = word.getMaxYDirAdj(); + } + } + + TextPageBlock cb1 = buildTextBlock(chunkWords); + if (cb1 != null) { + chunkBlockList1.add(cb1); + } + + return new ClassificationPage(chunkBlockList1); + } + + + private boolean equalsWithThreshold(float f1, float f2) { + + return Math.abs(f1 - f2) < THRESHOLD; + } + + + private TextPageBlock buildTextBlock(List wordBlockList) { + + TextPageBlock textBlock = null; + + FloatFrequencyCounter 
lineHeightFrequencyCounter = new FloatFrequencyCounter(); + FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter(); + FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter(); + StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter(); + StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter(); + + for (TextPositionSequence wordBlock : wordBlockList) { + + lineHeightFrequencyCounter.add(wordBlock.getTextHeight()); + fontSizeFrequencyCounter.add(wordBlock.getFontSize()); + spaceFrequencyCounter.add(wordBlock.getSpaceWidth()); + fontFrequencyCounter.add(wordBlock.getFont()); + styleFrequencyCounter.add(wordBlock.getFontStyle()); + + if (textBlock == null) { + textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(), + wordBlock.getMaxXDirAdj(), + wordBlock.getMinYDirAdj(), + wordBlock.getMaxYDirAdj(), + wordBlockList, + wordBlock.getRotation()); + } else { + TextPageBlock spatialEntity = textBlock.union(wordBlock); + textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight()); + } + } + + if (textBlock != null) { + textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular()); + textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest()); + } + + if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) { + textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj)); + } + return textBlock; + } + + + private boolean isSplitByRuling(float minX, + float minY, + float 
maxX, + float maxY, + TextPositionSequence word, + List horizontalRulingLines, + List verticalRulingLines) { + + return isSplitByRuling(maxX, + minY, + word.getMinXDirAdj(), + word.getMinYDirAdj(), + verticalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()) // + || isSplitByRuling(minX, + minY, + word.getMinXDirAdj(), + word.getMaxYDirAdj(), + horizontalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()) // + || isSplitByRuling(maxX, + minY, + word.getMinXDirAdj(), + word.getMinYDirAdj(), + horizontalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()) // + || isSplitByRuling(minX, + minY, + word.getMinXDirAdj(), + word.getMaxYDirAdj(), + verticalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()); // + } + + + private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List rulingLines, float dir, float pageWidth, float pageHeight) { + + for (Ruling ruling : rulingLines) { + var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight); + if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) { + return true; + } + } + return false; + } + + + private double round(float value, int decimalPoints) { + + var d = Math.pow(10, decimalPoints); + return Math.round(value * d) / d; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java new file mode 100644 index 0000000..3062c78 --- /dev/null +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java @@ -0,0 +1,278 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.blockification; + +import static java.util.stream.Collectors.toSet; + +import java.util.ArrayList; +import java.util.Comparator; +import java.util.Iterator; +import java.util.List; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter; +import com.knecon.fforesight.service.layoutparser.processor.model.Orientation; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil; + +@SuppressWarnings("all") +@Service +public class RedactManagerBlockificationService { + + static final float THRESHOLD = 1f; + + + /** + * This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions. + * This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this! + * Rulings (TablePageBlock lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling. + * + * @param textPositions The words of a page. + * @param horizontalRulingLines Horizontal table lines. + * @param verticalRulingLines Vertical table lines. 
+ * @return Page object that contains the Textblock and text statistics. + */ + public ClassificationPage blockify(List textPositions, List horizontalRulingLines, List verticalRulingLines) { + + int indexOnPage = 0; + List chunkWords = new ArrayList<>(); + List chunkBlockList = new ArrayList<>(); + + float minX = 1000, maxX = 0, minY = 1000, maxY = 0; + TextPositionSequence prev = null; + + boolean wasSplitted = false; + Float splitX1 = null; + for (TextPositionSequence word : textPositions) { + + boolean lineSeparation = word.getMinYDirAdj() - maxY > word.getHeight() * 1.25; + boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight(); + boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj(); + boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX; + boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj(); + boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines); + boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir()); + + if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) { + + Orientation prevOrientation = null; + if (!chunkBlockList.isEmpty()) { + prevOrientation = chunkBlockList.get(chunkBlockList.size() - 1).getOrientation(); + } + + TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage); + indexOnPage++; + + chunkBlockList.add(cb1); + chunkWords = new ArrayList<>(); + + if (splitByX && !isSplitByRuling) { + wasSplitted = true; + cb1.setOrientation(Orientation.LEFT); + splitX1 = word.getMinXDirAdj(); + } else if (newLineAfterSplit && !isSplitByRuling) { + wasSplitted = false; + cb1.setOrientation(Orientation.RIGHT); + splitX1 = null; + } else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && 
(lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) { + cb1.setOrientation(Orientation.LEFT); + } + + minX = 1000; + maxX = 0; + minY = 1000; + maxY = 0; + prev = null; + } + + chunkWords.add(word); + + prev = word; + if (word.getMinXDirAdj() < minX) { + minX = word.getMinXDirAdj(); + } + if (word.getMaxXDirAdj() > maxX) { + maxX = word.getMaxXDirAdj(); + } + if (word.getMinYDirAdj() < minY) { + minY = word.getMinYDirAdj(); + } + if (word.getMaxYDirAdj() > maxY) { + maxY = word.getMaxYDirAdj(); + } + } + + TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage); + if (cb1 != null) { + chunkBlockList.add(cb1); + } + + Iterator itty = chunkBlockList.iterator(); + + TextPageBlock previousLeft = null; + TextPageBlock previousRight = null; + while (itty.hasNext()) { + TextPageBlock block = (TextPageBlock) itty.next(); + + if (previousLeft != null && block.getOrientation().equals(Orientation.LEFT)) { + if (previousLeft.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousLeft.getMinY()) { + previousLeft.add(block); + itty.remove(); + continue; + } + } + + if (previousRight != null && block.getOrientation().equals(Orientation.RIGHT)) { + if (previousRight.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousRight.getMinY()) { + previousRight.add(block); + itty.remove(); + continue; + } + } + + if (block.getOrientation().equals(Orientation.LEFT)) { + previousLeft = block; + } else if (block.getOrientation().equals(Orientation.RIGHT)) { + previousRight = block; + } + } + + itty = chunkBlockList.iterator(); + TextPageBlock previous = null; + while (itty.hasNext()) { + TextPageBlock block = (TextPageBlock) itty.next(); + + if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(), + previous.getMaxY()) || previous != null && 
previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation() + .equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) { + previous.add(block); + itty.remove(); + continue; + } + + previous = block; + } + + return new ClassificationPage(chunkBlockList); + } + + + private boolean equalsWithThreshold(float f1, float f2) { + + return Math.abs(f1 - f2) < THRESHOLD; + } + + + private TextPageBlock buildTextBlock(List wordBlockList, int indexOnPage) { + + TextPageBlock textBlock = null; + + FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter(); + FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter(); + FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter(); + StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter(); + StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter(); + + for (TextPositionSequence wordBlock : wordBlockList) { + + lineHeightFrequencyCounter.add(wordBlock.getTextHeight()); + fontSizeFrequencyCounter.add(wordBlock.getFontSize()); + spaceFrequencyCounter.add(wordBlock.getSpaceWidth()); + fontFrequencyCounter.add(wordBlock.getFont()); + styleFrequencyCounter.add(wordBlock.getFontStyle()); + + if (textBlock == null) { + textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(), + wordBlock.getMaxXDirAdj(), + wordBlock.getMinYDirAdj(), + wordBlock.getMaxYDirAdj(), + wordBlockList, + wordBlock.getRotation()); + } else { + TextPageBlock spatialEntity = textBlock.union(wordBlock); + textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight()); + } + } + + if (textBlock != null) { + textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular()); + 
textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular()); + textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest()); + } + + if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) { + textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj)); + } + return textBlock; + } + + + private boolean isSplitByRuling(float minX, + float minY, + float maxX, + float maxY, + TextPositionSequence word, + List horizontalRulingLines, + List verticalRulingLines) { + + return isSplitByRuling(maxX, + minY, + word.getMinXDirAdj(), + word.getMinYDirAdj(), + verticalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()) // + || isSplitByRuling(minX, + minY, + word.getMinXDirAdj(), + word.getMaxYDirAdj(), + horizontalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()) // + || isSplitByRuling(maxX, + minY, + word.getMinXDirAdj(), + word.getMinYDirAdj(), + horizontalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()) // + || isSplitByRuling(minX, + minY, + word.getMinXDirAdj(), + word.getMaxYDirAdj(), + verticalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()); + } + + + private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List rulingLines, float dir, float pageWidth, float pageHeight) { + + for (Ruling ruling : rulingLines) { + var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight); + if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) { + return true; + } + } + return false; + } + + + private double round(float value, int decimalPoints) { + + var d = Math.pow(10, decimalPoints); + return 
Math.round(value * d) / d; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/BlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/TaasBlockificationService.java similarity index 92% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/BlockificationService.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/TaasBlockificationService.java index 9281e2b..abcbcac 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/BlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/TaasBlockificationService.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.service; +package com.knecon.fforesight.service.layoutparser.processor.services.blockification; import java.util.ArrayList; import java.util.Iterator; @@ -9,17 +9,17 @@ import java.util.regex.Pattern; import org.springframework.stereotype.Service; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.Orientation; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling; -import 
com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence; -import com.knecon.fforesight.service.layoutparser.processor.classification.utils.RulingTextDirAdjustUtil; +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.model.Orientation; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil; @Service @SuppressWarnings("all") -public class BlockificationService { +public class TaasBlockificationService { private static final float THRESHOLD = 1f; private static final float Y_GAP_SPLIT_HEIGHT_MODIFIER = 1.25f; @@ -137,7 +137,7 @@ public class BlockificationService { float minX = 1000, maxX = 0, minY = 1000, maxY = 0; TextPositionSequence prev = null; - + // TODO: make static final constant var listIdentitifier = Pattern.compile("\\b(?:[1-9]|1\\d|20|[ivxlc]|[a-z])\\s*(?:[.)])", Pattern.CASE_INSENSITIVE); boolean wasSplitted = false; @@ -146,7 +146,7 @@ public class BlockificationService { Matcher listIdentifierPattern = listIdentitifier.matcher(word.toString()); - boolean yGap = word.getMinYDirAdj() - maxY > word.getHeight() * Y_GAP_SPLIT_HEIGHT_MODIFIER; + boolean yGap = prev != null && word.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * Y_GAP_SPLIT_HEIGHT_MODIFIER; boolean sameLine = prev != null && equalsWithThreshold(prev.getMinYDirAdj(), word.getMinYDirAdj()); boolean positiveXGapInline = prev != null && maxX + 
X_GAP_SPLIT_CONSTANT < word.getMinXDirAdj() && sameLine; boolean negativeXGap = prev != null && word.getMinXDirAdj() - minX < -5; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java new file mode 100644 index 0000000..3cedb20 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java @@ -0,0 +1,117 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.classification; + +import java.util.List; +import java.util.Locale; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.springframework.stereotype.Service; + +import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService; +import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; + +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; + + +@Slf4j +@Service +@RequiredArgsConstructor +public class DocuMineClassificationService { + + private final BodyTextFrameService bodyTextFrameService; + private static final Pattern pattern = 
Pattern.compile("^(\\d{1,1}\\.?){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z\\[\\]\\-]{2,50}", Pattern.CASE_INSENSITIVE); + private static final Pattern pattern2 = Pattern.compile(".*\\d{4}$", Pattern.CASE_INSENSITIVE); + private static final Pattern pattern3 = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*"); + + + public void classifyDocument(ClassificationDocument document) { + + Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false); + Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true); + List headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular(); + + log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue()); + + for (ClassificationPage page : document.getPages()) { + bodyTextFrameService.setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame); + classifyPage(page, document, headlineFontSizes); + } + } + + + private void classifyPage(ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { + + for (AbstractPageBlock textBlock : page.getTextBlocks()) { + if (textBlock instanceof TextPageBlock) { + classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes); + } + } + } + + + private void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { + + log.debug("headlineFontSizes: {}", headlineFontSizes); + var bodyTextFrame = page.getBodyTextFrame(); + + Matcher matcher = pattern.matcher(textBlock.toString()); + Matcher matcher2 = pattern2.matcher(textBlock.toString()); + Matcher matcher3 = pattern3.matcher(textBlock.toString()); + + if (document.getFontSizeCounter().getMostPopular() == null) { + textBlock.setClassification(PageBlockType.OTHER); + return; + } + if (textBlock.getText().length() > 6 && 
(textBlock.getMostPopularWordHeight() > document.getTextHeightCounter() + .getMostPopular() || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) && PositionUtils.getApproxLineCount(textBlock) < 5.9 + + && (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && !matcher2.matches() && !textBlock.toString() + .contains(":") || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && !matcher2.matches() && !textBlock.toString() + .contains(":") || textBlock.toString().startsWith("APPENDIX") || textBlock.toString().startsWith("FIGURE") || textBlock.toString() + .startsWith("TABLE")) && !textBlock.toString().endsWith(":")) { + textBlock.setClassification(PageBlockType.getHeadlineType(1)); + document.setHeadlines(true); + + } else if (matcher.find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && !matcher3.matches() && !matcher2.matches()) { + textBlock.setClassification(PageBlockType.getHeadlineType(2)); + document.setHeadlines(true); + } else if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() + .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) { + textBlock.setClassification(PageBlockType.HEADER); + + } else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() + .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) { + textBlock.setClassification(PageBlockType.FOOTER); + } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, + document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks() + .size() == 1)) { + if (!Pattern.matches("[0-9]+", 
textBlock.toString())) { + textBlock.setClassification(PageBlockType.TITLE); + } + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter() + .getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) { + textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD); + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont() + .equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle() + .equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) { + textBlock.setClassification(PageBlockType.PARAGRAPH); + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter() + .getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter() + .getMostPopular() + .equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) { + textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC); + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) { + textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN); + } else { + textBlock.setClassification(PageBlockType.OTHER); + } + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java new file mode 100644 index 0000000..6150cba --- /dev/null +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java @@ -0,0 +1,116 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.classification; + +import java.util.List; +import java.util.regex.Pattern; + +import org.springframework.stereotype.Service; + +import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService; +import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; + +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@Service +@RequiredArgsConstructor +public class RedactManagerClassificationService { + + private final BodyTextFrameService bodyTextFrameService; + + + public void classifyDocument(ClassificationDocument document) { + + Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false); + Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true); + List headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular(); + + log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue()); + + for (ClassificationPage page : document.getPages()) { + bodyTextFrameService.setBodyTextFrameAdjustedToPage(page, 
bodyTextFrame, landscapeBodyTextFrame); + classifyPage(page, document, headlineFontSizes); + } + } + + + private void classifyPage(ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { + + for (AbstractPageBlock textBlock : page.getTextBlocks()) { + if (textBlock instanceof TextPageBlock) { + classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes); + } + } + } + + + private void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { + + var bodyTextFrame = page.getBodyTextFrame(); + + if (document.getFontSizeCounter().getMostPopular() == null) { + textBlock.setClassification(PageBlockType.OTHER); + return; + } + if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() + .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) { + textBlock.setClassification(PageBlockType.HEADER); + + } else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() + .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) { + textBlock.setClassification(PageBlockType.FOOTER); + } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, + document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks() + .size() == 1)) { + if (!Pattern.matches("[0-9]+", textBlock.toString())) { + textBlock.setClassification(PageBlockType.TITLE); + } + } else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter() + .getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && (textBlock.getMostPopularWordStyle().equals("bold") || !document.getFontStyleCounter() + 
.getCountPerValue() + .containsKey("bold") && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1) && textBlock.getSequences() + .get(0) + .getTextPositions() + .get(0) + .getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { + + for (int i = 1; i <= headlineFontSizes.size(); i++) { + if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) { + textBlock.setClassification(PageBlockType.getHeadlineType(i)); + document.setHeadlines(true); + } + } + } else if (!textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle() + .equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences() + .get(0) + .getTextPositions() + .get(0) + .getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { + textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1)); + document.setHeadlines(true); + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter() + .getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) { + textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD); + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont() + .equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle() + .equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) { + textBlock.setClassification(PageBlockType.PARAGRAPH); + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter() + 
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter() + .getMostPopular() + .equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) { + textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC); + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) { + textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN); + } else { + textBlock.setClassification(PageBlockType.OTHER); + } + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/ClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TaasClassificationService.java similarity index 89% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/ClassificationService.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TaasClassificationService.java index 2060ace..76f2e63 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/ClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TaasClassificationService.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.service; +package com.knecon.fforesight.service.layoutparser.processor.services.classification; import java.util.List; import java.util.regex.Pattern; @@ -6,12 +6,13 @@ import java.util.regex.Pattern; import org.springframework.stereotype.Service; import 
com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.utils.PositionUtils; +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService; +import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; @@ -19,7 +20,7 @@ import lombok.extern.slf4j.Slf4j; @Slf4j @Service @RequiredArgsConstructor -public class ClassificationService { +public class TaasClassificationService { private final BodyTextFrameService bodyTextFrameService; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/CohenSutherlandClipping.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/CohenSutherlandClipping.java similarity index 97% rename from 
layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/CohenSutherlandClipping.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/CohenSutherlandClipping.java index 6c424c9..b1a409a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/CohenSutherlandClipping.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/CohenSutherlandClipping.java @@ -9,7 +9,7 @@ * This program is free software under the LGPL (>=v2.1) * Read the file LICENSE.txt coming with the sources for details. */ -package com.knecon.fforesight.service.layoutparser.processor.classification.utils; +package com.knecon.fforesight.service.layoutparser.processor.utils; import java.awt.geom.Line2D; import java.awt.geom.Rectangle2D; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/DoubleComparisons.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/DoubleComparisons.java similarity index 91% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/DoubleComparisons.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/DoubleComparisons.java index 05fe8ad..c8651bc 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/DoubleComparisons.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/DoubleComparisons.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.utils; +package com.knecon.fforesight.service.layoutparser.processor.utils; import java.math.BigDecimal; import java.util.Comparator; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PdfVisualisationUtility.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PdfVisualisationUtility.java index 9fd0b75..0e82c1d 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PdfVisualisationUtility.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PdfVisualisationUtility.java @@ -1,6 +1,7 @@ package com.knecon.fforesight.service.layoutparser.processor.utils; import java.awt.Color; +import java.awt.geom.Line2D; import java.awt.geom.Point2D; import java.awt.geom.Rectangle2D; import java.io.IOException; @@ -150,6 +151,32 @@ public class PdfVisualisationUtility { } + @SneakyThrows + public static void drawLine2DList(PDDocument pdDocument, int pageNumber, List line2DS, Options options) { + + var pdPage = pdDocument.getPage(pageNumber - 1); + var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, true); + + contentStream.setStrokingColor(options.getStrokeColor()); + contentStream.setNonStrokingColor(options.getFillColor()); + contentStream.setLineWidth(options.getStrokeWidth()); + + for (var line2D : line2DS) { + contentStream.moveTo((float) line2D.getX1(), (float) line2D.getY1()); + contentStream.lineTo((float) line2D.getX2(), (float) line2D.getY2()); + + if 
(options.isStroke() && options.isFill()) { + contentStream.fillAndStroke(); + } else if (options.isStroke()) { + contentStream.stroke(); + } else if (options.isFill()) { + contentStream.fill(); + } + } + contentStream.close(); + } + + @Builder @Getter @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/PositionUtils.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PositionUtils.java similarity index 95% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/PositionUtils.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PositionUtils.java index 8b52b74..3aecb92 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/PositionUtils.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PositionUtils.java @@ -1,7 +1,7 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.utils; +package com.knecon.fforesight.service.layoutparser.processor.utils; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import lombok.experimental.UtilityClass; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/QuickSort.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/QuickSort.java similarity index 96% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/QuickSort.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/QuickSort.java index 5e65c49..32793b0 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/QuickSort.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/QuickSort.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.utils; +package com.knecon.fforesight.service.layoutparser.processor.utils; import java.util.ArrayDeque; import java.util.Comparator; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java index 8cd8931..8dc23ad 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java @@ -1,5 +1,7 @@ package com.knecon.fforesight.service.layoutparser.processor.utils; +import static java.lang.String.format; + import java.awt.geom.Rectangle2D; import java.awt.geom.RectangularShape; import java.util.Collections; @@ -23,6 +25,27 @@ import 
lombok.NoArgsConstructor; public class RectangleTransformations { + public static Rectangle2D pad(Rectangle2D rectangle2D, int deltaX, int deltaY) { + + return new Rectangle2D.Double(rectangle2D.getMinX() - deltaX, rectangle2D.getMinY() - deltaY, rectangle2D.getWidth() + 2 * deltaX, rectangle2D.getHeight() + 2 * deltaY); + } + + + public static Rectangle2D pad(Rectangle2D rectangle2D, double deltaX, double deltaY) { + + return new Rectangle2D.Double(rectangle2D.getMinX() - deltaX, rectangle2D.getMinY() - deltaY, rectangle2D.getWidth() + 2 * deltaX, rectangle2D.getHeight() + 2 * deltaY); + } + + + public static Rectangle2D bBoxUnionAtomicTextBlock(List atomicTextBlocks) { + + return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DBBoxCollector()); + } + public static Collector collectBBox() { + + return new Rectangle2DBBoxCollector(); + } + public static PDRectangle toPDRectangleBBox(List rectangles) { Rectangle2D rectangle2D = RectangleTransformations.rectangleBBox(rectangles); @@ -42,6 +65,11 @@ public class RectangleTransformations { } + public static String toString(Rectangle2D rectangle2D) { + + return format("%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight()); + } + public static Rectangle2D rectangleBBox(List rectangles) { return rectangles.stream().map(RectangleTransformations::toRectangle2D).collect(new Rectangle2DBBoxCollector()); @@ -56,6 +84,11 @@ public class RectangleTransformations { -redactionLogRectangle.getHeight()); } + public static Rectangle2D toRectangle2D(PDRectangle rectangle) { + + return new Rectangle2D.Double(rectangle.getLowerLeftX(), rectangle.getLowerLeftY(), rectangle.getWidth(), rectangle.getHeight()); + } + public static Rectangle toRedactionLogRectangle(Rectangle2D rectangle2D, int pageNumber) { diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/RulingTextDirAdjustUtil.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RulingTextDirAdjustUtil.java similarity index 91% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/RulingTextDirAdjustUtil.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RulingTextDirAdjustUtil.java index 7931d65..04ff106 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/RulingTextDirAdjustUtil.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RulingTextDirAdjustUtil.java @@ -1,9 +1,9 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.utils; +package com.knecon.fforesight.service.layoutparser.processor.utils; import java.awt.geom.Line2D; import java.awt.geom.Point2D; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import lombok.experimental.UtilityClass; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TableMergingUtility.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TableMergingUtility.java index fd59588..e6a7332 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TableMergingUtility.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TableMergingUtility.java @@ -5,15 +5,18 @@ import java.util.LinkedList; import java.util.List; import java.util.stream.Stream; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; import lombok.experimental.UtilityClass; @UtilityClass public class TableMergingUtility { + private static final double TABLE_ALIGNMENT_THRESHOLD = 2d; + + public List findConsecutiveTablesWithSameColCountAndSameHeaders(TablePageBlock originalTablePageBlock, List pageBlocks) { List consecutiveTables = pageBlocks.stream() @@ -24,7 +27,8 @@ public class TableMergingUtility { List consecutiveTablesWithSameColCountAndHeaders = new LinkedList<>(); for (TablePageBlock consecutiveTable : consecutiveTables) { - if (consecutiveTable.getColCount() == originalTablePageBlock.getColCount() && !hasTableHeader(consecutiveTable)) { + if (consecutiveTable.getColCount() == originalTablePageBlock.getColCount() && !hasTableHeader(consecutiveTable) && outerBoundaryAlignsX(originalTablePageBlock, + consecutiveTable)) { consecutiveTablesWithSameColCountAndHeaders.add(consecutiveTable); } else { break; @@ -34,6 +38,12 @@ public class TableMergingUtility { } + private static boolean outerBoundaryAlignsX(TablePageBlock originalTablePageBlock, 
TablePageBlock consecutiveTable) { + + return Math.abs(consecutiveTable.getMinX() - originalTablePageBlock.getMinX()) < TABLE_ALIGNMENT_THRESHOLD && Math.abs(consecutiveTable.getMaxX() - originalTablePageBlock.getMaxX()) < TABLE_ALIGNMENT_THRESHOLD; + } + + private boolean hasTableHeader(TablePageBlock table) { return table.getRows().stream().flatMap(Collection::stream).anyMatch(Cell::isHeaderCell); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/TextNormalizationUtilities.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextNormalizationUtilities.java similarity index 88% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/TextNormalizationUtilities.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextNormalizationUtilities.java index 9cec075..9f90bee 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/TextNormalizationUtilities.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextNormalizationUtilities.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.utils; +package com.knecon.fforesight.service.layoutparser.processor.utils; import lombok.experimental.UtilityClass; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java index c4c0eba..fbd57c4 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java @@ -3,8 +3,8 @@ package com.knecon.fforesight.service.layoutparser.processor.utils; import java.util.Comparator; import java.util.List; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; public class TextPositionOperations { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionSequenceComparator.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionSequenceComparator.java new file mode 100644 index 0000000..40dce07 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionSequenceComparator.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.knecon.fforesight.service.layoutparser.processor.utils; + +import java.util.Comparator; + +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; + +/** + * This class is a comparator for TextPosition operators. It handles + * pages with text in different directions by grouping the text based + * on direction and sorting in that direction. This allows continuous text + * in a given direction to be more easily grouped together. 
+ * + * @author Ben Litchfield + */ +public class TextPositionSequenceComparator implements Comparator +{ + @Override + public int compare(TextPositionSequence pos1, TextPositionSequence pos2) + { + // only compare text that is in the same direction + int cmp1 = Float.compare(pos1.getDir().getDegrees(), pos2.getDir().getDegrees()); + if (cmp1 != 0) + { + return cmp1; + } + + // get the text direction adjusted coordinates + float x1 = pos1.getMinXDirAdj(); + float x2 = pos2.getMinXDirAdj(); + + float pos1YBottom = pos1.getMaxYDirAdj(); + float pos2YBottom = pos2.getMaxYDirAdj(); + + // note that the coordinates have been adjusted so 0,0 is in upper left + float pos1YTop = pos1YBottom - pos1.getTextHeight(); + float pos2YTop = pos2YBottom - pos2.getTextHeight(); + + float yDifference = Math.abs(pos1YBottom - pos2YBottom); + + // we will do a simple tolerance comparison + if (yDifference < .1 || + pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom || + pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom) + { + return Float.compare(x1, x2); + } + else if (pos1YBottom < pos2YBottom) + { + return -1; + } + else + { + return 1; + } + } +} diff --git a/layoutparser-service/layoutparser-service-processor/src/test/resources/log4j2-test.xml b/layoutparser-service/layoutparser-service-processor/src/test/resources/log4j2-test.xml new file mode 100644 index 0000000..b4895cf --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/test/resources/log4j2-test.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java index 6220b46..86b2a66 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java 
+++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java @@ -20,7 +20,8 @@ import org.springframework.beans.factory.annotation.Autowired; import com.fasterxml.jackson.databind.ObjectMapper; import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData; -import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingService; +import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; +import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline; import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; @@ -37,7 +38,7 @@ public class BdrJsonBuildTest extends BaseTest { private ObjectMapper objectMapper; @Autowired - private LayoutParsingService layoutParsingService; + private LayoutParsingPipeline layoutParsingPipeline; @SneakyThrows @@ -45,7 +46,7 @@ public class BdrJsonBuildTest extends BaseTest { try (InputStream inputStream = new FileInputStream(filename)) { PDDocument pdDocument = Loader.loadPDF(inputStream); - return layoutParsingService.parseLayoutWithTimer(pdDocument, new ImageServiceResponse(), new TableServiceResponse()); + return layoutParsingPipeline.parseLayoutWithTimer(LayoutParsingType.REDACT_MANAGER, pdDocument, new ImageServiceResponse(), new TableServiceResponse()); } } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BuildDocumentGraphTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BuildDocumentGraphTest.java index a776ec4..094cb5c 100644 --- 
a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BuildDocumentGraphTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BuildDocumentGraphTest.java @@ -11,7 +11,8 @@ import org.junit.jupiter.api.Test; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.core.io.ClassPathResource; -import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingService; +import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; +import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline; import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.server.utils.BaseTest; @@ -21,7 +22,7 @@ import lombok.SneakyThrows; public class BuildDocumentGraphTest extends BaseTest { @Autowired - private LayoutParsingService layoutParsingService; + private LayoutParsingPipeline layoutParsingPipeline; @Test @Disabled @@ -50,7 +51,7 @@ public class BuildDocumentGraphTest extends BaseTest { try (InputStream inputStream = fileResource.getInputStream()) { PDDocument pdDocument = Loader.loadPDF(inputStream); - return layoutParsingService.parseLayout(pdDocument, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse()); + return layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, pdDocument, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse()); } } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphMappingTest.java 
b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphMappingTest.java index f7b8fbd..80e9ecb 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphMappingTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphMappingTest.java @@ -25,7 +25,8 @@ public class DocumentGraphMappingTest extends BuildDocumentGraphTest { DocumentData documentData = DocumentDataMapper.toDocumentData(document); var researchDocumentData = TaasDocumentDataMapper.fromDocument(document); - layoutParsingStorageService.storeDocumentData(layoutParsingRequest, researchDocumentData, documentData); + layoutParsingStorageService.storeDocumentData(layoutParsingRequest, documentData); + DocumentData documentData2 = layoutParsingStorageService.readDocumentData(layoutParsingRequest); Document newDocumentGraph = DocumentGraphMapper.toDocumentGraph(documentData2); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/model/SectionIdentifierTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/model/SectionIdentifierTest.java new file mode 100644 index 0000000..5ff1fe1 --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/model/SectionIdentifierTest.java @@ -0,0 +1,58 @@ +package com.knecon.fforesight.service.layoutparser.server.model; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import org.junit.jupiter.api.Test; + +import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentifier; + +class SectionIdentifierTest { + + @Test + public 
void testParentOf() { + + var headline = SectionIdentifier.fromSearchText("1 Did you ever hear the tragedy of Darth Plagueis The Wise?"); + var headline1 = SectionIdentifier.fromSearchText("1.0 I thought not. It’s not a story the Jedi would tell you."); + var headline2 = SectionIdentifier.fromSearchText("1.1 It’s a Sith legend. Darth Plagueis was a Dark Lord of the Sith, "); + var headline3 = SectionIdentifier.fromSearchText("1.2.3 so powerful and so wise he could use the Force to influence the midichlorians to create life…"); + var headline4 = SectionIdentifier.fromSearchText("1.2.3.4 He had such a knowledge of the dark side that he could even keep the ones he cared about from dying."); + var headline5 = SectionIdentifier.fromSearchText("1.2.3.4.5 The dark side of the Force is a pathway to many abilities some consider to be unnatural."); + var headline6 = SectionIdentifier.fromSearchText("2.0 He became so powerful…"); + var headline7 = SectionIdentifier.fromSearchText("10000.0 the only thing he was afraid of was losing his power,"); + var headline8 = SectionIdentifier.fromSearchText("A.0 which eventually, of course, he did."); + var headline9 = SectionIdentifier.fromSearchText("Unfortunately, he taught his apprentice everything he knew, then his apprentice killed him in his sleep."); + var headline10 = SectionIdentifier.fromSearchText("2.1.2 Ironic."); + var headline11 = SectionIdentifier.fromSearchText("2.He could save others from death,"); + var headline12 = SectionIdentifier.fromSearchText(" 2. 
but not himself."); + + var paragraph1 = SectionIdentifier.asChildOf(headline); + assertTrue(paragraph1.isChildOf(headline)); + assertTrue(headline.isParentOf(paragraph1)); + assertFalse(paragraph1.isParentOf(headline)); + + assertFalse(headline.isParentOf(headline1)); + assertTrue(headline.isParentOf(headline2)); + assertTrue(headline.isParentOf(headline3)); + assertTrue(headline.isParentOf(headline4)); + assertTrue(headline.isParentOf(headline5)); + assertTrue(headline1.isParentOf(headline2)); + assertFalse(headline1.isParentOf(headline1)); + assertTrue(headline3.isParentOf(headline4)); + assertFalse(headline4.isParentOf(headline5)); + assertFalse(headline2.isParentOf(headline3)); + assertFalse(headline2.isParentOf(headline4)); + assertTrue(headline1.isParentOf(headline3)); + assertTrue(headline1.isParentOf(headline4)); + assertFalse(headline1.isParentOf(headline6)); + assertFalse(headline1.isParentOf(headline7)); + assertFalse(headline8.isParentOf(headline1)); + assertFalse(headline8.isParentOf(headline2)); + assertFalse(headline8.isParentOf(headline3)); + assertFalse(headline8.isParentOf(headline4)); + assertFalse(headline9.isParentOf(headline9)); + assertTrue(headline10.isChildOf(headline11)); + assertTrue(headline10.isChildOf(headline12)); + } + +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/GapAcrossLinesDetectionServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/GapAcrossLinesDetectionServiceTest.java new file mode 100644 index 0000000..5d434e8 --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/GapAcrossLinesDetectionServiceTest.java @@ -0,0 +1,71 @@ +package com.knecon.fforesight.service.layoutparser.server.services; + +import java.awt.geom.Rectangle2D; +import 
java.util.LinkedList; +import java.util.List; + +import org.junit.jupiter.api.Test; + +import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation; +import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; +import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation; +import com.knecon.fforesight.service.layoutparser.processor.services.DividingColumnDetectionService; +import com.knecon.fforesight.service.layoutparser.processor.services.GapDetectionService; +import com.knecon.fforesight.service.layoutparser.processor.services.GapsAcrossLinesService; +import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService; +import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter; +import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; + +import lombok.SneakyThrows; + +class ColumnDetectionServiceTest { + + @Test + @SneakyThrows + public void testGapBasedColumnDetection() { + + String filename = "files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; + var tmpFileName = "/tmp/" + filename.split("/")[2] + "_COLUMNS.pdf"; + System.out.println("start TextPosition extraction"); + long start = System.currentTimeMillis(); + List pageInformations = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename).stream().map(PageInformationService::build).toList(); + List> columnsPerPage = new LinkedList<>(); + System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start); + System.out.println("start column detection"); + start = System.currentTimeMillis(); + for (PageInformation pageInformation : pageInformations) { + GapInformation gapInformation = GapDetectionService.findGapsInLines(pageInformation.getPageContents().getSortedTextPositionSequences(), pageInformation.getMainBodyTextFrame()); + 
columnsPerPage.add(GapsAcrossLinesService.detectXGapsAcrossLines(gapInformation, pageInformation.getMainBodyTextFrame())); + } + System.out.printf("Finished column detection in %d ms%n", System.currentTimeMillis() - start); + System.out.println("start draw rectangles"); + start = System.currentTimeMillis(); + PdfDraw.drawRectanglesPerPage(filename, columnsPerPage, tmpFileName); + System.out.printf("Finished drawing rectangles in %d ms%n", System.currentTimeMillis() - start); + } + + + @Test + @SneakyThrows + public void testColumnDetection() { + + String filename = "files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; + var tmpFileName = "/tmp/" + filename.split("/")[2] + "_COLUMNS.pdf"; + System.out.println("start TextPosition extraction"); + long start = System.currentTimeMillis(); + List sortedTextPositionSequencesPerPage = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename); + List> columnsPerPage = new LinkedList<>(); + System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start); + System.out.println("start column detection"); + start = System.currentTimeMillis(); + for (PageContents pageContents : sortedTextPositionSequencesPerPage) { + columnsPerPage.add(DividingColumnDetectionService.detectColumns(pageContents)); + } + System.out.printf("Finished column detection in %d ms%n", System.currentTimeMillis() - start); + System.out.println("start draw rectangles"); + start = System.currentTimeMillis(); + PdfDraw.drawRectanglesPerPage(filename, columnsPerPage, tmpFileName); + System.out.printf("Finished drawing rectangles in %d ms%n", System.currentTimeMillis() - start); + } + +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/InvisibleTableDetectionServiceTest.java 
b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/InvisibleTableDetectionServiceTest.java new file mode 100644 index 0000000..b44e948 --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/InvisibleTableDetectionServiceTest.java @@ -0,0 +1,23 @@ +package com.knecon.fforesight.service.layoutparser.processor.services; + +import java.util.List; + +import org.junit.jupiter.api.Test; + +import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; + +import lombok.SneakyThrows; + +class InvisibleTableDetectionServiceTest { + + + @Test + @SneakyThrows + public void detectInvisibleTableTest() { + + String fileName = "files/test-two-pages_ocred-2.pdf"; + + List pageContents = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName); + + } +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/MainBodyTextFrameExtractionServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/MainBodyTextFrameExtractionServiceTest.java new file mode 100644 index 0000000..c51c0e6 --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/MainBodyTextFrameExtractionServiceTest.java @@ -0,0 +1,7 @@ +package com.knecon.fforesight.service.layoutparser.processor.services; + +import static org.junit.jupiter.api.Assertions.*; + +class MainBodyTextFrameExtractionServiceTest { + +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageInformationServiceTest.java 
b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageInformationServiceTest.java new file mode 100644 index 0000000..3682fcc --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageInformationServiceTest.java @@ -0,0 +1,50 @@ +package com.knecon.fforesight.service.layoutparser.server.services; + +import java.awt.geom.Rectangle2D; +import java.util.Collection; +import java.util.LinkedList; +import java.util.List; + +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; + +import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation; +import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; +import com.knecon.fforesight.service.layoutparser.processor.services.GapDetectionService; +import com.knecon.fforesight.service.layoutparser.processor.services.MainBodyTextFrameExtractionService; +import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter; +import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; + +import lombok.SneakyThrows; + +class GapDetectionServiceTest { + + @Test + @Disabled + @SneakyThrows + public void testGapDetection() { + + String filename = "files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; + var tmpFileName = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf"; + System.out.println("start TextPosition extraction"); + long start = System.currentTimeMillis(); + List sortedTextPositionSequencesPerPage = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename); + List gapInformationInLinesPerPage = new LinkedList<>(); + System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start); + System.out.println("start gap detection"); + start = System.currentTimeMillis(); + for (PageContents pageContents : 
sortedTextPositionSequencesPerPage) { +// List> lines = LineDetectionService.orderByLine(textPositionsWithPage.getSortedTextPositionSequences()); + Rectangle2D mainBodyTextFrame = MainBodyTextFrameExtractionService.calculateMainBodyTextFrame(pageContents); + gapInformationInLinesPerPage.add(GapDetectionService.findGapsInLines(pageContents.getSortedTextPositionSequences(), mainBodyTextFrame)); + } + System.out.printf("Finished gap detection in %d ms%n", System.currentTimeMillis() - start); + System.out.println("start draw rectangles"); + start = System.currentTimeMillis(); + PdfDraw.drawRectanglesAndLinesPerPage(filename, + gapInformationInLinesPerPage.stream().map(gaps -> gaps.getYGaps().stream().flatMap(Collection::stream).toList()).toList(), + gapInformationInLinesPerPage.stream().map(gaps -> gaps.getXGaps().stream().flatMap(Collection::stream).toList()).toList(), tmpFileName); + System.out.printf("Finished drawing rectangles in %d ms%n", System.currentTimeMillis() - start); + } + +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/TextPositionSequenceSorterTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/TextPositionSequenceSorterTest.java new file mode 100644 index 0000000..8bbfdce --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/TextPositionSequenceSorterTest.java @@ -0,0 +1,39 @@ +package com.knecon.fforesight.service.layoutparser.server.services; + +import java.nio.file.Path; +import java.util.List; + +import org.junit.jupiter.api.Test; + +import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import 
com.knecon.fforesight.service.layoutparser.processor.services.RectangleTransformations; +import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter; +import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; + +import lombok.SneakyThrows; + +class TextPositionSequenceSorterTest { + + @Test + @SneakyThrows + public void testTextPositionSequenceExtraction() { + + String fileName = "files/new/test-two-pages_ocred-2.pdf"; + var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TEXT_POSITION_SEQUENCES.pdf").toString(); + + List textPositionPerPage = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName); + + PdfDraw.drawRectanglesPerPageNumberedByLine(fileName, + textPositionPerPage.stream() + .map(t -> t.getSortedTextPositionSequences() + .stream() + .map(TextPositionSequence::getRectangle) + .map(RectangleTransformations::toRectangle2D) + //.map(textPositionSequence -> (Rectangle2D) new Rectangle2D.Double(textPositionSequence.getMaxXDirAdj(), textPositionSequence.getMaxYDirAdj(), textPositionSequence.getWidth(), textPositionSequence.getHeight())) + .map(List::of) + .toList()) + .toList(), tmpFileName); + } + +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/visualizations/PdfDraw.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/visualizations/PdfDraw.java index 2b8eeb4..e3d7822 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/visualizations/PdfDraw.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/visualizations/PdfDraw.java @@ -3,16 +3,20 @@ package 
com.knecon.fforesight.service.layoutparser.server.utils.visualizations; import java.awt.Color; import java.awt.geom.Point2D; import java.awt.geom.Rectangle2D; +import java.io.FileOutputStream; import java.io.IOException; +import java.io.InputStream; import java.util.List; import java.util.Map; +import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPageContentStream; import org.apache.pdfbox.pdmodel.font.PDType1Font; import org.apache.pdfbox.pdmodel.font.Standard14Fonts; import org.apache.pdfbox.util.Matrix; +import org.springframework.core.io.ClassPathResource; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; @@ -20,7 +24,8 @@ import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.AtomicTextBlock; import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; -import com.knecon.fforesight.service.layoutparser.processor.services.RectangleTransformations; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; +import com.knecon.fforesight.service.layoutparser.processor.utils.PdfVisualisationUtility; import lombok.AccessLevel; import lombok.AllArgsConstructor; @@ -34,6 +39,68 @@ import lombok.experimental.UtilityClass; @UtilityClass public class PdfDraw { + public static void drawRectanglesPerPage(String filename, List> rectanglesPerPage, String tmpFileName) throws IOException { + + try (InputStream inputStream = new ClassPathResource(filename).getInputStream()) { + PDDocument pdDocument = Loader.loadPDF(inputStream); + + for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) { + 
PdfVisualisationUtility.drawRectangle2DList(pdDocument, + pageNumber, + rectanglesPerPage.get(pageNumber - 1), + PdfVisualisationUtility.Options.builder().stroke(true).build()); + } + try (var out = new FileOutputStream(tmpFileName)) { + pdDocument.save(out); + pdDocument.close(); + } + + } + + } + + + public static void drawRectanglesPerPageNumberedByLine(String filename, List>> rectanglesPerPage, String tmpFileName) throws IOException { + + try (InputStream inputStream = new ClassPathResource(filename).getInputStream()) { + PDDocument pdDocument = Loader.loadPDF(inputStream); + + for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) { + var rectanglesOnPage = rectanglesPerPage.get(pageNumber - 1); + for (int lineNumber = 0; lineNumber < rectanglesOnPage.size(); lineNumber++) { + var rectanglesInLine = rectanglesOnPage.get(lineNumber); + PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, rectanglesInLine, PdfVisualisationUtility.Options.builder().stroke(true).build()); + double y = Math.min(rectanglesInLine.get(0).getMinY(), rectanglesInLine.get(0).getMaxY()); + PdfVisualisationUtility.drawText(String.format("%d", lineNumber), + pdDocument, + new Point2D.Double(rectanglesInLine.get(0).getX() - (5 + (5 * countNumberOfDigits(lineNumber))), y + 2), + pageNumber, + PdfVisualisationUtility.Options.builder().stroke(true).build()); + + } + + } + try (var out = new FileOutputStream(tmpFileName)) { + pdDocument.save(out); + pdDocument.close(); + } + + } + + } + + + private static int countNumberOfDigits(int num) { + + if (num == 0) { + return 1; + } + int count = 0; + for (; num != 0; num /= 10, ++count) { + } + return count; + } + public static void drawDocumentGraph(PDDocument document, Document documentGraph) { documentGraph.getDocumentTree().allEntriesInOrder().forEach(entry -> drawNode(document, entry)); @@ -115,6 +182,35 @@ public class PdfDraw { } + @SneakyThrows + public static void 
drawRectanglesAndLinesPerPage(String filename, List> list, List> rectanglesPerPage, String tmpFileName) { + + try (InputStream inputStream = new ClassPathResource(filename).getInputStream()) { + PDDocument pdDocument = Loader.loadPDF(inputStream); + + for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) { +// PdfVisualisationUtility.drawLine2DList(pdDocument, +// pageNumber, +// list.get(pageNumber - 1), +// PdfVisualisationUtility.Options.builder().stroke(true).build()); + PdfVisualisationUtility.drawRectangle2DList(pdDocument, + pageNumber, + rectanglesPerPage.get(pageNumber - 1), + PdfVisualisationUtility.Options.builder().stroke(true).build()); + PdfVisualisationUtility.drawRectangle2DList(pdDocument, + pageNumber, + list.get(pageNumber - 1), + PdfVisualisationUtility.Options.builder().stroke(true).build()); + } + try (var out = new FileOutputStream(tmpFileName)) { + pdDocument.save(out); + pdDocument.close(); + } + + } + } + + @Builder @AllArgsConstructor @NoArgsConstructor diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/application.yml b/layoutparser-service/layoutparser-service-server/src/test/resources/application.yml new file mode 100644 index 0000000..83c7a1c --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/resources/application.yml @@ -0,0 +1,37 @@ +info: + description: Layout Parser Service Processor + +tenant-user-management-service.url: "http://tenant-user-management-service:8080/internal" +fforesight.tenants.remote: true + +server: + port: 8080 + +spring: + main: + allow-circular-references: true # FIXME + rabbitmq: + host: ${RABBITMQ_HOST:localhost} + port: ${RABBITMQ_PORT:5672} + username: ${RABBITMQ_USERNAME:user} + password: ${RABBITMQ_PASSWORD:rabbitmq} + listener: + simple: + acknowledge-mode: AUTO + concurrency: 2 + retry: + enabled: true + max-attempts: 3 + max-interval: 15000 + prefetch: 1 + +management: + endpoint: + metrics.enabled: 
${monitoring.enabled:false} + prometheus.enabled: ${monitoring.enabled:false} + health.enabled: true + endpoints.web.exposure.include: prometheus, health + + +storage: + backend: 's3' diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/log4j2-test.xml b/layoutparser-service/layoutparser-service-server/src/test/resources/log4j2-test.xml new file mode 100644 index 0000000..b4895cf --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/resources/log4j2-test.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/layoutparser-service/pom.xml b/layoutparser-service/pom.xml index 87d1ca1..7f61c7e 100644 --- a/layoutparser-service/pom.xml +++ b/layoutparser-service/pom.xml @@ -7,6 +7,7 @@ org.springframework.boot spring-boot-starter-parent 3.0.6 + com.knecon.fforesight