From 1f9e1510923333a2aefe25342bf0c419e50d0ed1 Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Tue, 6 Jun 2023 11:19:34 +0200 Subject: [PATCH] RED-6009: Document Tree Structure * squashed commits --- redaction-service-v1/pom.xml | 22 + .../redaction-service-api-v1/pom.xml | 1 - .../redaction-service-server-v1/pom.xml | 10 +- .../server/classification/model/Footer.java | 26 - .../server/classification/model/Header.java | 26 - .../classification/model/Orientation.java | 8 - .../classification/model/Paragraph.java | 65 - .../classification/model/SectionText.java | 64 - .../v1/server/classification/model/Text.java | 19 - .../model/UnclassifiedText.java | 26 - ...tity.java => EntityRecognitionEntity.java} | 2 +- ...NerEntities.java => NerEntitiesModel.java} | 5 +- .../controller/RuleBuilderController.java | 6 +- .../adapter/ImageServiceResponseAdapter.java} | 24 +- .../adapter/RedactionLogEntryAdapter.java | 164 + .../adapter/TableServiceResponseAdapter.java} | 10 +- .../adapter}/image/Classification.java | 3 +- .../adapter}/image/FilterGeometry.java | 3 +- .../adapter}/image/Filters.java | 3 +- .../adapter}/image/Geometry.java | 3 +- .../adapter}/image/ImageFormat.java | 3 +- .../adapter}/image/ImageMetadata.java | 3 +- .../adapter}/image/ImageServiceResponse.java | 8 +- .../adapter}/image/ImageSize.java | 2 +- .../adapter}/image/Position.java | 3 +- .../adapter/image/Probability.java | 10 + .../adapter}/table/PageInfo.java | 3 +- .../adapter/table}/PdfTableCell.java | 2 +- .../adapter}/table/TableCells.java | 3 +- .../adapter}/table/TableData.java | 3 +- .../adapter}/table/TableServiceResponse.java | 3 +- .../model/AbstractPageBlock.java} | 25 +- .../model/ClassificationDocument.java} | 16 +- .../model/ClassificationFooter.java | 16 + .../model/ClassificationHeader.java | 16 + .../model/ClassificationPage.java} | 18 +- .../model/ClassificationSection.java | 32 + .../model/FloatFrequencyCounter.java | 6 +- .../classification/model/Orientation.java | 8 + 
.../classification/model/PageBlockType.java | 38 + .../model/image/ClassifiedImage.java | 25 + .../classification/model/table}/Cell.java | 28 +- .../model/table}/CellPosition.java | 2 +- .../model/table}/CleanRulings.java | 6 +- .../model/table}/Rectangle.java | 2 +- .../classification/model/table}/Ruling.java | 20 +- .../model/table/TablePageBlock.java} | 43 +- .../model/text}/RedTextPosition.java | 2 +- .../model/text/SearchableText.java | 49 + .../model/text}/SimplifiedSectionText.java | 3 +- .../model/text}/SimplifiedText.java | 2 +- .../model/text}/StringFrequencyCounter.java | 6 +- .../model/text}/TextDirection.java | 18 +- .../model/text/TextPageBlock.java} | 28 +- .../model/text}/TextPositionSequence.java | 2 +- .../model/text/UnclassifiedText.java | 14 + .../parsing/LegacyPDFStreamEngine.java | 44 +- .../parsing/PDFLinesTextStripper.java | 51 +- .../parsing/PDFTextStripper.java | 2 +- .../service/BlockificationService.java | 111 +- .../service/BodyTextFrameService.java | 41 +- .../service/ClassificationService.java | 55 +- .../service}/PdfSegmentationService.java | 71 +- .../service/RulingCleaningService.java | 10 +- .../service}/SectionsBuilderService.java | 206 +- .../service/TableExtractionService.java | 44 +- .../utils/CohenSutherlandClipping.java | 2 +- .../classification}/utils/FileUtils.java | 4 +- .../classification/utils/PositionUtils.java | 18 +- .../classification}/utils/QuickSort.java | 2 +- .../utils/RulingTextDirAdjustUtil.java | 6 +- .../classification}/utils/Utils.java | 2 +- .../data/AtomicPositionBlockData.java | 50 + .../document/data/AtomicTextBlockData.java | 39 + .../document/data/DocumentData.java | 43 + .../document/data/DocumentTreeData.java | 128 + .../layoutparsing/document/data/PageData.java | 28 + .../data/mapper/DocumentGraphMapper.java | 198 + .../data/mapper/PropertiesMapper.java | 110 + .../factory/DocumentGraphFactory.java | 246 + .../SearchTextWithTextPositionDto.java | 33 + .../SearchTextWithTextPositionFactory.java | 
185 + .../document/factory/SectionNodeFactory.java | 183 + .../document/factory/TableNodeFactory.java | 136 + .../document/factory/TextBlockFactory.java | 53 + .../document/graph/Boundary.java | 163 + .../document/graph/DocumentTree.java | 217 + .../document/graph/entity}/EntityType.java | 2 +- .../graph/entity/RedactionEntity.java | 229 + .../graph/entity/RedactionPosition.java | 24 + .../document/graph/nodes/Document.java | 119 + .../document/graph/nodes/Footer.java | 64 + .../graph/nodes/GenericSemanticNode.java | 5 + .../document/graph/nodes/Header.java | 64 + .../document/graph/nodes/Headline.java | 71 + .../document/graph/nodes/Image.java | 94 + .../document/graph/nodes/ImageType.java | 21 + .../document/graph/nodes/NodeType.java | 21 + .../document/graph/nodes/Page.java | 87 + .../document/graph/nodes/Paragraph.java | 62 + .../document/graph/nodes/Section.java | 76 + .../document/graph/nodes/SemanticNode.java | 468 + .../document/graph/nodes/Table.java | 295 + .../document/graph/nodes/TableCell.java | 90 + .../graph/textblock/AtomicTextBlock.java | 215 + .../textblock/ConcatenatedTextBlock.java | 185 + .../document/graph/textblock/TextBlock.java | 136 + .../graph/textblock/TextBlockCollector.java | 49 + .../services/EntityCreationService.java | 372 + .../services/EntityEnrichmentService.java | 91 + .../ManualRedactionApplicationService.java | 55 + .../utils/PdfVisualisationUtility.java | 168 + .../utils/RectangleTransformations.java | 146 + .../utils/RedactionSearchUtility.java | 139 + .../document/utils/TableMergingUtility.java | 42 + .../utils/TextPositionOperations.java | 20 + .../server/parsing/PDFAreaTextStripper.java | 82 - .../queue/RedactionMessageReceiver.java | 2 +- .../server/redaction/adapter/NerEntities.java | 44 + .../redaction/adapter/NerEntitiesAdapter.java | 169 + .../v1/server/redaction/model/CellValue.java | 53 - .../v1/server/redaction/model/Dictionary.java | 78 - .../v1/server/redaction/model/Entities.java | 23 - 
.../v1/server/redaction/model/Entity.java | 124 - .../model/EntityPositionSequence.java | 26 - .../redaction/model/FindEntitiesResult.java | 20 - .../v1/server/redaction/model/Image.java | 27 - .../v1/server/redaction/model/ImageType.java | 9 - .../server/redaction/model/OffsetString.java | 27 - .../server/redaction/model/PageEntities.java | 28 - .../v1/server/redaction/model/PdfImage.java | 21 - .../v1/server/redaction/model/PdfTable.java | 17 - .../server/redaction/model/ReasonHolder.java | 16 - .../redaction/model/RedRectangle2D.java | 48 - .../redaction/model/SearchableText.java | 263 - .../v1/server/redaction/model/Section.java | 1684 - .../model/SectionSearchableTextPair.java | 16 - .../model/dictionary/Dictionary.java | 133 + .../{ => dictionary}/DictionaryEntries.java | 2 +- .../DictionaryEntryModel.java | 10 +- .../{ => dictionary}/DictionaryIncrement.java | 6 +- .../DictionaryIncrementValue.java | 2 +- .../{ => dictionary}/DictionaryModel.java | 13 +- .../DictionaryRepresentation.java | 6 +- .../{ => dictionary}/DictionaryVersion.java | 2 +- .../dictionary}/SearchImplementation.java | 46 +- .../{ => dictionary}/TenantDictionary.java | 3 +- .../redaction/model/image/Probability.java | 11 - .../rulebuilder/RuleBuilderModelService.java | 39 - .../service/{analyze => }/AnalyzeService.java | 299 +- .../redaction/service/DictionaryService.java | 32 +- .../service/DroolsExecutionService.java | 133 +- .../service/EntityRedactionService.java | 93 + ...ManualRedactionSurroundingTextService.java | 119 +- .../service/RedactionChangeLogService.java | 86 +- .../service/RedactionLogCreatorService.java | 231 +- .../service/RuleBuilderModelService.java | 23 + ...nFinder.java => SectionFinderService.java} | 50 +- .../service/SectionGridCreatorService.java | 188 +- .../service/SectionTextBuilderService.java | 206 - .../service/SurroundingWordsService.java | 140 - .../service/entityredaction/EntityFinder.java | 127 - .../EntityRedactionService.java | 313 - 
.../redaction/utils/EntitySearchUtils.java | 384 - .../redaction/utils/FindEntityDetails.java | 28 - .../v1/server/redaction/utils/IdBuilder.java | 37 +- .../redaction/utils/OffsetStringUtils.java | 56 - .../v1/server/redaction/utils/Patterns.java | 2 +- .../redaction/utils/SeparatorUtils.java | 38 + .../utils/TextNormalizationUtilities.java | 12 + .../storage/RedactionStorageService.java | 22 +- .../service/PdfVisualisationService.java | 184 - .../AbstractRedactionIntegrationTest.java | 38 +- .../v1/server/DictionaryServiceTest.java | 8 +- .../FileSystemBackedStorageService.java | 7 +- .../HeadlinesGoldStandardIntegrationTest.java | 250 +- .../v1/server/RedactionIntegrationTest.java | 225 +- .../v1/server/RedactionIntegrationV2Test.java | 3 +- .../redaction/v1/server/RulesTest.java | 23 +- .../v1/server/annotate/AnnotationService.java | 115 +- .../server/document/graph/BoundaryTest.java | 77 + .../graph/BuildDocumentIntegrationTest.java | 101 + ...ocumentEntityInsertionIntegrationTest.java | 283 + .../graph/DocumentMappingIntegrationTest.java | 86 + .../graph/DocumentTableIntegrationTest.java | 30 + .../DocumentVisualizationIntegrationTest.java | 93 + .../ManualResizeRedactionIntegrationTest.java | 266 + .../document/graph/MigrationPocTest.java | 162 + .../DocumentPerformanceIntegrationTest.java | 302 + .../model/TextPositionSequenceTest.java | 3 + .../AnalyseFileRealDataIntegrationTest.java | 15 +- .../realdata/LiveDataIntegrationTest.java | 9 +- .../adapter/NerEntitiesAdapterTest.java | 182 + .../RuleBuilderModelServiceTest.java | 20 - .../utils/EntitySearchUtilsTest.java | 198 - .../PdfSegmentationServiceTest.java | 143 +- .../src/test/resources/drools/allAuthors.drl | 293 - .../src/test/resources/drools/headlines.drl | 13 - .../drools/manual_redaction_rules.drl | 98 + .../resources/drools/prod_syngenta_new.drl | 725 + .../src/test/resources/drools/rules.drl | 843 +- .../test/resources/drools/rules_backup.drl | 697 + .../src/test/resources/drools/rules_v2.drl | 
402 +- .../src/test/resources/drools/testRules.drl | 431 - ...ithout_highlights.IMPORTED_REDACTIONS.json | 128 +- ...RAR_01_Volume_1_2018-09-06.IMAGE_INFO.json | 638 + ...ute Oral Toxicity in the Rat- Up and D.pdf | Bin 561583 -> 0 bytes .../resources/files/empty_image_response.json | 5 + .../files/migration/legacy_redactionlog.json | 1 + ...R_02_Volume_2_2018-09-06.NER_ENTITIES.json | 29806 ++++++++++++++++ .../crafted document.NER_ENTITIES.json | 792 + .../EFSA_sanitisation_GFL_v1/rules.drl | 827 +- 212 files changed, 43279 insertions(+), 7761 deletions(-) delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Footer.java delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Header.java delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Orientation.java delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Paragraph.java delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Text.java delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/UnclassifiedText.java rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/client/model/{EntityRecogintionEntity.java => EntityRecognitionEntity.java} (89%) rename 
redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/client/model/{NerEntities.java => NerEntitiesModel.java} (75%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{segmentation/ImageService.java => layoutparsing/classification/adapter/ImageServiceResponseAdapter.java} (74%) create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/RedactionLogEntryAdapter.java rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{segmentation/TableService.java => layoutparsing/classification/adapter/TableServiceResponseAdapter.java} (81%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{redaction/model => layoutparsing/classification/adapter}/image/Classification.java (68%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{redaction/model => layoutparsing/classification/adapter}/image/FilterGeometry.java (58%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{redaction/model => layoutparsing/classification/adapter}/image/Filters.java (62%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{redaction/model => layoutparsing/classification/adapter}/image/Geometry.java (53%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{redaction/model => layoutparsing/classification/adapter}/image/ImageFormat.java (60%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{redaction/model => layoutparsing/classification/adapter}/image/ImageMetadata.java 
(70%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{redaction/model => layoutparsing/classification/adapter}/image/ImageServiceResponse.java (86%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{redaction/model => layoutparsing/classification/adapter}/image/ImageSize.java (60%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{redaction/model => layoutparsing/classification/adapter}/image/Position.java (64%) create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/image/Probability.java rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{redaction/model => layoutparsing/classification/adapter}/table/PageInfo.java (62%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{redaction/model => layoutparsing/classification/adapter/table}/PdfTableCell.java (78%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{redaction/model => layoutparsing/classification/adapter}/table/TableCells.java (67%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{redaction/model => layoutparsing/classification/adapter}/table/TableData.java (68%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{redaction/model => layoutparsing/classification/adapter}/table/TableServiceResponse.java (78%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{tableextraction/model/AbstractTextContainer.java => 
layoutparsing/classification/model/AbstractPageBlock.java} (66%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{classification/model/Document.java => layoutparsing/classification/model/ClassificationDocument.java} (57%) create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/ClassificationFooter.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/ClassificationHeader.java rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{classification/model/Page.java => layoutparsing/classification/model/ClassificationPage.java} (63%) create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/ClassificationSection.java rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{ => layoutparsing}/classification/model/FloatFrequencyCounter.java (95%) create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/Orientation.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/PageBlockType.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/image/ClassifiedImage.java rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{tableextraction/model => layoutparsing/classification/model/table}/Cell.java (75%) rename 
redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{tableextraction/model => layoutparsing/classification/model/table}/CellPosition.java (79%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{tableextraction/model => layoutparsing/classification/model/table}/CleanRulings.java (65%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{tableextraction/model => layoutparsing/classification/model/table}/Rectangle.java (98%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{tableextraction/model => layoutparsing/classification/model/table}/Ruling.java (95%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{tableextraction/model/Table.java => layoutparsing/classification/model/table/TablePageBlock.java} (88%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{parsing/model => layoutparsing/classification/model/text}/RedTextPosition.java (95%) create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/SearchableText.java rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{classification/model => layoutparsing/classification/model/text}/SimplifiedSectionText.java (74%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{classification/model => layoutparsing/classification/model/text}/SimplifiedText.java (79%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{classification/model => 
layoutparsing/classification/model/text}/StringFrequencyCounter.java (93%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{parsing/model => layoutparsing/classification/model/text}/TextDirection.java (66%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{classification/model/TextBlock.java => layoutparsing/classification/model/text/TextPageBlock.java} (88%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{parsing/model => layoutparsing/classification/model/text}/TextPositionSequence.java (99%) create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/UnclassifiedText.java rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{ => layoutparsing/classification}/parsing/LegacyPDFStreamEngine.java (99%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{ => layoutparsing/classification}/parsing/PDFLinesTextStripper.java (86%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{ => layoutparsing/classification}/parsing/PDFTextStripper.java (99%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{ => layoutparsing}/classification/service/BlockificationService.java (66%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{ => layoutparsing}/classification/service/BodyTextFrameService.java (75%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{ => layoutparsing}/classification/service/ClassificationService.java (64%) 
rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{segmentation => layoutparsing/classification/service}/PdfSegmentationService.java (61%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{tableextraction => layoutparsing/classification}/service/RulingCleaningService.java (94%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{segmentation => layoutparsing/classification/service}/SectionsBuilderService.java (51%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{tableextraction => layoutparsing/classification}/service/TableExtractionService.java (85%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{tableextraction => layoutparsing/classification}/utils/CohenSutherlandClipping.java (97%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{tableextraction => layoutparsing/classification}/utils/FileUtils.java (94%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{ => layoutparsing}/classification/utils/PositionUtils.java (84%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{tableextraction => layoutparsing/classification}/utils/QuickSort.java (96%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{ => layoutparsing}/classification/utils/RulingTextDirAdjustUtil.java (90%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{tableextraction => layoutparsing/classification}/utils/Utils.java (91%) create mode 100644 
redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/data/AtomicPositionBlockData.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/data/AtomicTextBlockData.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/data/DocumentData.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/data/DocumentTreeData.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/data/PageData.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/data/mapper/DocumentGraphMapper.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/data/mapper/PropertiesMapper.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/factory/DocumentGraphFactory.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/factory/SearchTextWithTextPositionDto.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/factory/SearchTextWithTextPositionFactory.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/factory/SectionNodeFactory.java create mode 100644 
redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/factory/TableNodeFactory.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/factory/TextBlockFactory.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/Boundary.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/DocumentTree.java rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/{redaction/model => layoutparsing/document/graph/entity}/EntityType.java (54%) create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/entity/RedactionEntity.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/entity/RedactionPosition.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/nodes/Document.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/nodes/Footer.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/nodes/GenericSemanticNode.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/nodes/Header.java create mode 100644 
redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/nodes/Headline.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/nodes/Image.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/nodes/ImageType.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/nodes/NodeType.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/nodes/Page.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/nodes/Paragraph.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/nodes/Section.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/nodes/SemanticNode.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/nodes/Table.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/nodes/TableCell.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/textblock/AtomicTextBlock.java create mode 100644 
redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/textblock/ConcatenatedTextBlock.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/textblock/TextBlock.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/graph/textblock/TextBlockCollector.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/services/EntityCreationService.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/services/EntityEnrichmentService.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/services/ManualRedactionApplicationService.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/utils/PdfVisualisationUtility.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/utils/RectangleTransformations.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/utils/RedactionSearchUtility.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/utils/TableMergingUtility.java create mode 100644 
redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/utils/TextPositionOperations.java delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFAreaTextStripper.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/adapter/NerEntities.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/adapter/NerEntitiesAdapter.java delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/CellValue.java delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Dictionary.java delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Entities.java delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Entity.java delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/EntityPositionSequence.java delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/FindEntitiesResult.java delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Image.java delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/ImageType.java delete mode 100644 
redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/OffsetString.java delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/PageEntities.java delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/PdfImage.java delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/PdfTable.java delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/ReasonHolder.java delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/RedRectangle2D.java delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SearchableText.java delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/SectionSearchableTextPair.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/dictionary/Dictionary.java rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/{ => dictionary}/DictionaryEntries.java (97%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/{ => dictionary}/DictionaryEntryModel.java (94%) rename 
redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/{ => dictionary}/DictionaryIncrement.java (95%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/{ => dictionary}/DictionaryIncrementValue.java (94%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/{ => dictionary}/DictionaryModel.java (97%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/{ => dictionary}/DictionaryRepresentation.java (97%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/{ => dictionary}/DictionaryVersion.java (95%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/{utils => model/dictionary}/SearchImplementation.java (64%) rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/{ => dictionary}/TenantDictionary.java (96%) delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/image/Probability.java delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/rulebuilder/RuleBuilderModelService.java rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/{analyze => }/AnalyzeService.java (50%) create mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java create mode 100644 
redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RuleBuilderModelService.java rename redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/{analyze/SectionFinder.java => SectionFinderService.java} (66%) delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/SectionTextBuilderService.java delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/SurroundingWordsService.java delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/entityredaction/EntityFinder.java delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/entityredaction/EntityRedactionService.java delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtils.java delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/FindEntityDetails.java delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/utils/OffsetStringUtils.java delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/visualization/service/PdfVisualisationService.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/document/graph/BoundaryTest.java create mode 100644 
redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/document/graph/BuildDocumentIntegrationTest.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/document/graph/DocumentEntityInsertionIntegrationTest.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/document/graph/DocumentMappingIntegrationTest.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/document/graph/DocumentTableIntegrationTest.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/document/graph/DocumentVisualizationIntegrationTest.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/document/graph/ManualResizeRedactionIntegrationTest.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/document/graph/MigrationPocTest.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/document/services/DocumentPerformanceIntegrationTest.java create mode 100644 redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/adapter/NerEntitiesAdapterTest.java delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/rulebuilder/RuleBuilderModelServiceTest.java delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/redaction/utils/EntitySearchUtilsTest.java delete mode 100644 
redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/allAuthors.drl delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/headlines.drl create mode 100644 redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/manual_redaction_rules.drl create mode 100644 redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/prod_syngenta_new.drl create mode 100644 redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/rules_backup.drl delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/test/resources/drools/testRules.drl create mode 100644 redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.IMAGE_INFO.json delete mode 100644 redaction-service-v1/redaction-service-server-v1/src/test/resources/files/RSS/07 - Acute Oral Toxicity in the Rat- Up and D.pdf create mode 100644 redaction-service-v1/redaction-service-server-v1/src/test/resources/files/empty_image_response.json create mode 100644 redaction-service-v1/redaction-service-server-v1/src/test/resources/files/migration/legacy_redactionlog.json create mode 100644 redaction-service-v1/redaction-service-server-v1/src/test/resources/ner_entities/S-Metolachlor_RAR_02_Volume_2_2018-09-06.NER_ENTITIES.json create mode 100644 redaction-service-v1/redaction-service-server-v1/src/test/resources/ner_entities/crafted document.NER_ENTITIES.json diff --git a/redaction-service-v1/pom.xml b/redaction-service-v1/pom.xml index 3cdcd5b2..7983e6b4 100644 --- a/redaction-service-v1/pom.xml +++ b/redaction-service-v1/pom.xml @@ -23,6 +23,7 @@ 2.0.24 + 1.18.26 @@ -88,5 +89,26 @@ + + + org.projectlombok + lombok-maven-plugin + 1.18.20.0 + + + delombok + generate-sources + + delombok + + + false + src/main/java + ${delomboked.sources} + + + + + diff --git a/redaction-service-v1/redaction-service-api-v1/pom.xml 
b/redaction-service-v1/redaction-service-api-v1/pom.xml index 82969479..c739a0cc 100644 --- a/redaction-service-v1/redaction-service-api-v1/pom.xml +++ b/redaction-service-v1/redaction-service-api-v1/pom.xml @@ -39,7 +39,6 @@ - diff --git a/redaction-service-v1/redaction-service-server-v1/pom.xml b/redaction-service-v1/redaction-service-server-v1/pom.xml index 27bca733..2f090926 100644 --- a/redaction-service-v1/redaction-service-server-v1/pom.xml +++ b/redaction-service-v1/redaction-service-server-v1/pom.xml @@ -12,7 +12,7 @@ redaction-service-server-v1 - 7.73.0.Final + 8.37.0.Final 7.73.0.Final 1.19.0 3.29.2-GA @@ -64,7 +64,12 @@ org.drools - drools-core + drools-engine + ${drools.version} + + + org.drools + drools-mvel ${drools.version} @@ -198,5 +203,4 @@ - diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Footer.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Footer.java deleted file mode 100644 index d428bcfa..00000000 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Footer.java +++ /dev/null @@ -1,26 +0,0 @@ -package com.iqser.red.service.redaction.v1.server.classification.model; - -import com.fasterxml.jackson.annotation.JsonIgnore; -import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; - -import lombok.AllArgsConstructor; -import lombok.Data; - -import java.util.List; - -@Data -@AllArgsConstructor -public class Footer { - - private List textBlocks; - - - @JsonIgnore - public SearchableText getSearchableText() { - - SearchableText searchableText = new SearchableText(); - textBlocks.forEach(block -> searchableText.addAll(block.getSequences())); - return searchableText; - } - -} diff --git 
a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Header.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Header.java deleted file mode 100644 index 90244010..00000000 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Header.java +++ /dev/null @@ -1,26 +0,0 @@ -package com.iqser.red.service.redaction.v1.server.classification.model; - -import com.fasterxml.jackson.annotation.JsonIgnore; -import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; - -import lombok.AllArgsConstructor; -import lombok.Data; - -import java.util.List; - -@Data -@AllArgsConstructor -public class Header { - - private List textBlocks; - - - @JsonIgnore - public SearchableText getSearchableText() { - - SearchableText searchableText = new SearchableText(); - textBlocks.forEach(block -> searchableText.addAll(block.getSequences())); - return searchableText; - } - -} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Orientation.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Orientation.java deleted file mode 100644 index 1696b565..00000000 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Orientation.java +++ /dev/null @@ -1,8 +0,0 @@ -package com.iqser.red.service.redaction.v1.server.classification.model; - -public enum Orientation { - - NONE, - LEFT, - RIGHT -} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Paragraph.java 
b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Paragraph.java deleted file mode 100644 index 51bce3f0..00000000 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Paragraph.java +++ /dev/null @@ -1,65 +0,0 @@ -package com.iqser.red.service.redaction.v1.server.classification.model; - -import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage; -import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; - -import lombok.Data; -import lombok.NoArgsConstructor; - -import java.util.ArrayList; -import java.util.List; - -@Data -@NoArgsConstructor -public class Paragraph implements Comparable { - - private List pageBlocks = new ArrayList<>(); - private List images = new ArrayList<>(); - private String headline; - - - public SearchableText getSearchableText() { - - SearchableText searchableText = new SearchableText(); - pageBlocks.forEach(block -> { - if (block instanceof TextBlock) { - searchableText.addAll(((TextBlock) block).getSequences()); - } - }); - return searchableText; - } - - - public List getTables() { - - List
tables = new ArrayList<>(); - pageBlocks.forEach(block -> { - if (block instanceof Table) { - tables.add((Table) block); - } - }); - return tables; - } - - - public List getTextBlocks() { - - List textBlocks = new ArrayList<>(); - pageBlocks.forEach(block -> { - if (block instanceof TextBlock) { - textBlocks.add((TextBlock) block); - } - }); - return textBlocks; - } - - - @Override - public int compareTo(Object o) { - - return 0; - } - -} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java deleted file mode 100644 index e3073231..00000000 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SectionText.java +++ /dev/null @@ -1,64 +0,0 @@ -package com.iqser.red.service.redaction.v1.server.classification.model; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import com.fasterxml.jackson.annotation.JsonIgnore; -import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionArea; -import com.iqser.red.service.redaction.v1.server.redaction.model.CellValue; -import com.iqser.red.service.redaction.v1.server.redaction.model.Image; -import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; - -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.NoArgsConstructor; - -@Data -@Builder -@NoArgsConstructor -@AllArgsConstructor -public class SectionText { - - private int sectionNumber; - private String text; - - private boolean isTable; - private String headline; - - @Builder.Default - private List sectionAreas = new ArrayList<>(); - @Builder.Default - 
private Set images = new HashSet<>(); - @Builder.Default - private List textBlocks = new ArrayList<>(); - @Builder.Default - private Map tabularData = new HashMap<>(); - @Builder.Default - private List cellStarts = new ArrayList<>(); - - - public void setTabularData(Map tabularData) { - - tabularData.remove(null); - this.tabularData = tabularData; - } - - - @JsonIgnore - public SearchableText getSearchableText() { - - SearchableText searchableText = new SearchableText(); - textBlocks.forEach(block -> { - if (block != null) { - searchableText.addAll(block.getSequences()); - } - }); - return searchableText; - } - -} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Text.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Text.java deleted file mode 100644 index ce0f8824..00000000 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Text.java +++ /dev/null @@ -1,19 +0,0 @@ -package com.iqser.red.service.redaction.v1.server.classification.model; - - -import lombok.AllArgsConstructor; -import lombok.Data; -import lombok.NoArgsConstructor; - -import java.util.ArrayList; -import java.util.List; - -@Data -@NoArgsConstructor -@AllArgsConstructor -public class Text { - - private int numberOfPages; - private List sectionTexts = new ArrayList<>(); - -} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/UnclassifiedText.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/UnclassifiedText.java deleted file mode 100644 index ad56370f..00000000 --- 
a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/UnclassifiedText.java +++ /dev/null @@ -1,26 +0,0 @@ -package com.iqser.red.service.redaction.v1.server.classification.model; - -import com.fasterxml.jackson.annotation.JsonIgnore; -import com.iqser.red.service.redaction.v1.server.redaction.model.SearchableText; - -import lombok.AllArgsConstructor; -import lombok.Data; - -import java.util.List; - -@Data -@AllArgsConstructor -public class UnclassifiedText { - - private List textBlocks; - - - @JsonIgnore - public SearchableText getSearchableText() { - - SearchableText searchableText = new SearchableText(); - textBlocks.forEach(block -> searchableText.addAll(block.getSequences())); - return searchableText; - } - -} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/client/model/EntityRecogintionEntity.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/client/model/EntityRecognitionEntity.java similarity index 89% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/client/model/EntityRecogintionEntity.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/client/model/EntityRecognitionEntity.java index c7de7eda..80975e15 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/client/model/EntityRecogintionEntity.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/client/model/EntityRecognitionEntity.java @@ -10,7 +10,7 @@ import lombok.NoArgsConstructor; @Builder @AllArgsConstructor @NoArgsConstructor -public class EntityRecogintionEntity { +public class EntityRecognitionEntity { private String value; private int 
startOffset; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/client/model/NerEntities.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/client/model/NerEntitiesModel.java similarity index 75% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/client/model/NerEntities.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/client/model/NerEntitiesModel.java index 356c343a..8b7b9018 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/client/model/NerEntities.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/client/model/NerEntitiesModel.java @@ -4,7 +4,6 @@ import java.util.HashMap; import java.util.List; import java.util.Map; - import lombok.AllArgsConstructor; import lombok.Data; import lombok.NoArgsConstructor; @@ -12,8 +11,8 @@ import lombok.NoArgsConstructor; @Data @NoArgsConstructor @AllArgsConstructor -public class NerEntities { +public class NerEntitiesModel { - private Map> data = new HashMap<>(); + private Map> data = new HashMap<>(); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RuleBuilderController.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RuleBuilderController.java index 7e1d5894..1a800e0b 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RuleBuilderController.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/controller/RuleBuilderController.java @@ -1,13 +1,13 
@@ package com.iqser.red.service.redaction.v1.server.controller; +import org.springframework.web.bind.annotation.RestController; + import com.iqser.red.service.redaction.v1.model.RuleBuilderModel; import com.iqser.red.service.redaction.v1.resources.RuleBuilderResource; -import com.iqser.red.service.redaction.v1.server.redaction.rulebuilder.RuleBuilderModelService; +import com.iqser.red.service.redaction.v1.server.redaction.service.RuleBuilderModelService; import lombok.RequiredArgsConstructor; -import org.springframework.web.bind.annotation.RestController; - @RestController @RequiredArgsConstructor public class RuleBuilderController implements RuleBuilderResource { diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/ImageService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/ImageServiceResponseAdapter.java similarity index 74% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/ImageService.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/ImageServiceResponseAdapter.java index 813c17fb..f4eb8b32 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/ImageService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/ImageServiceResponseAdapter.java @@ -1,5 +1,6 @@ -package com.iqser.red.service.redaction.v1.server.segmentation; +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter; +import java.awt.geom.Rectangle2D; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ 
-10,11 +11,10 @@ import org.springframework.stereotype.Service; import com.fasterxml.jackson.databind.ObjectMapper; import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType; -import com.iqser.red.service.redaction.v1.server.classification.model.Page; -import com.iqser.red.service.redaction.v1.server.redaction.model.ImageType; -import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage; -import com.iqser.red.service.redaction.v1.server.redaction.model.RedRectangle2D; -import com.iqser.red.service.redaction.v1.server.redaction.model.image.ImageServiceResponse; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image.ImageServiceResponse; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.image.ClassifiedImage; +import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.ImageType; import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService; import lombok.RequiredArgsConstructor; @@ -22,26 +22,26 @@ import lombok.SneakyThrows; @Service @RequiredArgsConstructor -public class ImageService { +public class ImageServiceResponseAdapter { private final ObjectMapper objectMapper; private final RedactionStorageService redactionStorageService; @SneakyThrows - public Map> convertImages(String dossierId, String fileId) { + public Map> convertImages(String dossierId, String fileId) { var imageClassificationStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(dossierId, fileId, FileType.IMAGE_INFO)); ImageServiceResponse imageServiceResponse = objectMapper.readValue(imageClassificationStream, ImageServiceResponse.class); - Map> images = new HashMap<>(); + Map> images = new HashMap<>(); imageServiceResponse.getData().forEach(imageMetadata -> { var 
classification = imageMetadata.getFilters().isAllPassed() ? ImageType.valueOf(imageMetadata.getClassification() .getLabel() .toUpperCase(Locale.ROOT)) : ImageType.OTHER; images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>()) - .add(new PdfImage(new RedRectangle2D(imageMetadata.getPosition().getX1(), + .add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(), imageMetadata.getPosition().getY1(), imageMetadata.getGeometry().getWidth(), imageMetadata.getGeometry().getHeight()), classification, imageMetadata.isAlpha(), imageMetadata.getPosition().getPageNumber())); @@ -53,7 +53,7 @@ public class ImageService { .getLabel() .toUpperCase(Locale.ROOT)) : ImageType.OTHER; images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>()) - .add(new PdfImage(new RedRectangle2D(imageMetadata.getPosition().getX1(), + .add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(), imageMetadata.getPosition().getY1(), imageMetadata.getGeometry().getWidth(), imageMetadata.getGeometry().getHeight()), classification, imageMetadata.isAlpha(), imageMetadata.getPosition().getPageNumber())); @@ -63,7 +63,7 @@ public class ImageService { } - public void findOcr(Page page) { + public void findOcr(ClassificationPage page) { page.getImages().forEach(image -> { if (image.getImageType().equals(ImageType.OTHER)) { diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/RedactionLogEntryAdapter.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/RedactionLogEntryAdapter.java new file mode 100644 index 00000000..ac256585 --- /dev/null +++ 
b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/RedactionLogEntryAdapter.java @@ -0,0 +1,164 @@ +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter; + +import static java.lang.String.format; +import static java.util.stream.Collectors.groupingBy; + +import java.awt.geom.Rectangle2D; +import java.util.Collection; +import java.util.Comparator; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import org.springframework.stereotype.Service; + +import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; +import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.RedactionLog; +import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.RedactionLogEntry; +import com.iqser.red.service.redaction.v1.server.exception.NotFoundException; +import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.EntityType; +import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionEntity; +import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.entity.RedactionPosition; +import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.Page; +import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.SemanticNode; +import com.iqser.red.service.redaction.v1.server.layoutparsing.document.services.EntityCreationService; +import com.iqser.red.service.redaction.v1.server.redaction.model.dictionary.SearchImplementation; + +import lombok.RequiredArgsConstructor; + +@Service +@RequiredArgsConstructor +public class RedactionLogEntryAdapter { + + private static final double MATCH_THRESHOLD = 1; + private final EntityCreationService entityCreationService; 
+ + + public Stream toRedactionEntity(RedactionLog redactionLog, SemanticNode node) { + + List pageNumbers = redactionLog.getRedactionLogEntry().stream().flatMap(entry -> entry.getPositions().stream().map(Rectangle::getPage)).distinct().toList(); + if (!pageNumbers.stream().allMatch(node::isOnPage)) { + throw new IllegalArgumentException(format("SemanticNode %s does not contain these pages %s present in the redaction log", + node, + pageNumbers.stream().filter(pageNumber -> !node.isOnPage(pageNumber)).toList())); + } + Set entryValues = redactionLog.getRedactionLogEntry().stream().map(RedactionLogEntry::getValue).map(String::toLowerCase).collect(Collectors.toSet()); + SearchImplementation searchImplementation = new SearchImplementation(entryValues, true); + + Map> tempEntitiesByValue = findAllPossibleEntitiesAndGroupByValueIgnoringCase(node, searchImplementation); + + assert allValuesFound(tempEntitiesByValue, entryValues); + + List entities = redactionLog.getRedactionLogEntry() + .stream() + .map(entry -> findClosestRedactionEntity(entry, tempEntitiesByValue.get(entry.getValue().toLowerCase(Locale.ROOT)), node)) + .toList(); + tempEntitiesByValue.values().stream().flatMap(Collection::stream).forEach(RedactionEntity::removeFromGraph); + return entities.stream(); + } + + + private static boolean allValuesFound(Map> entitiesByValue, Set entryValues) { + + return entitiesByValue.keySet().equals(entryValues); + } + + + private Map> findAllPossibleEntitiesAndGroupByValueIgnoringCase(SemanticNode node, SearchImplementation searchImplementation) { + + return searchImplementation.getBoundaries(node.getTextBlock(), node.getBoundary()) + .stream() + .map(boundary -> entityCreationService.byBoundary(boundary, "temp", EntityType.ENTITY, node)) + .collect(groupingBy(entity -> entity.getValue().toLowerCase(Locale.ROOT))); + } + + + private RedactionEntity findClosestRedactionEntity(RedactionLogEntry redactionLogEntry, List entitiesWithSameValue, SemanticNode node) { + + 
RedactionEntity closestEntity = entitiesWithSameValue.stream() + .filter(entity -> pagesMatch(entity, redactionLogEntry)) + .min(Comparator.comparingDouble(entity -> calculateMinDistance(redactionLogEntry, entity))) + .orElseThrow(() -> new NotFoundException(format("No entity with similar position found for %s", redactionLogEntry))); + + double distance = calculateMinDistance(redactionLogEntry, closestEntity); + if (distance > MATCH_THRESHOLD) { + throw new NotFoundException(format("Distance to closest found entity is %.2f for \n%s \n%s", + distance, + redactionLogEntry.getPositions(), + closestEntity.getRedactionPositionsPerPage())); + } + + return createCorrectEntity(redactionLogEntry, node, closestEntity); + } + + + private RedactionEntity createCorrectEntity(RedactionLogEntry redactionLogEntry, SemanticNode node, RedactionEntity closestEntity) { + + RedactionEntity correctEntity = entityCreationService.byBoundary(closestEntity.getBoundary(), + redactionLogEntry.getType(), + redactionLogEntry.isRecommendation() ? 
EntityType.RECOMMENDATION : EntityType.ENTITY, + node); + correctEntity.setLegalBasis(redactionLogEntry.getLegalBasis()); + correctEntity.setRedactionReason(redactionLogEntry.getReason()); + correctEntity.addMatchedRule(redactionLogEntry.getMatchedRule()); + correctEntity.setRedaction(redactionLogEntry.isRedacted()); + correctEntity.setDictionaryEntry(redactionLogEntry.isDictionaryEntry()); + correctEntity.setDossierDictionaryEntry(redactionLogEntry.isDossierDictionaryEntry()); + return correctEntity; + } + + + private static boolean pagesMatch(RedactionEntity entity, RedactionLogEntry redactionLogEntry) { + + Set entityPageNumbers = entity.getRedactionPositionsPerPage().stream().map(RedactionPosition::getPage).map(Page::getNumber).collect(Collectors.toSet()); + Set redactionLogEntryPageNumbers = redactionLogEntry.getPositions().stream().map(Rectangle::getPage).collect(Collectors.toSet()); + return entityPageNumbers.equals(redactionLogEntryPageNumbers); + } + + + private double calculateMinDistance(RedactionLogEntry redactionLogEntry, RedactionEntity entity) { + + if (redactionLogEntry.getPositions().size() != countRectangles(entity)) { + return Double.MAX_VALUE; + } + return redactionLogEntry.getPositions().stream().mapToDouble(redactionLogEntryRectangle -> calculateMinDistancePerRectangle(entity, redactionLogEntryRectangle)).sum(); + } + + + private static long countRectangles(RedactionEntity entity) { + + return entity.getRedactionPositionsPerPage().stream().mapToLong(redactionPosition -> redactionPosition.getRectanglePerLine().size()).sum(); + } + + + private double calculateMinDistancePerRectangle(RedactionEntity entity, Rectangle redactionLogEntryRectangle) { + + return entity.getRedactionPositionsPerPage() + .stream() + .filter(redactionPosition -> redactionPosition.getPage().getNumber() == redactionLogEntryRectangle.getPage()) + .map(RedactionPosition::getRectanglePerLine) + .flatMap(Collection::stream) + .mapToDouble(rectangle -> 
calculateDistance(rectangle, toRectangle2D(redactionLogEntryRectangle))) + .min() + .orElse(Double.MAX_VALUE); + } + + + private double calculateDistance(Rectangle2D rectangle, Rectangle2D rectangle2D) { + + return Math.abs(rectangle.getMinX() - rectangle2D.getMinX()) // + + Math.abs(rectangle.getMinY() - rectangle2D.getMinY()) // + + Math.abs(rectangle.getMaxX() - rectangle2D.getMaxX()) // + + Math.abs(rectangle.getMaxY() - rectangle2D.getMaxY()); + } + + + private Rectangle2D toRectangle2D(Rectangle rectangle) { + + return new Rectangle2D.Float(rectangle.getTopLeft().getX(), rectangle.getTopLeft().getY() + rectangle.getHeight(), rectangle.getWidth(), -rectangle.getHeight()); + } + +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/TableService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/TableServiceResponseAdapter.java similarity index 81% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/TableService.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/TableServiceResponseAdapter.java index 9cd273e4..779414f5 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/TableService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/TableServiceResponseAdapter.java @@ -1,4 +1,4 @@ -package com.iqser.red.service.redaction.v1.server.segmentation; +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter; import java.util.ArrayList; import java.util.Collection; @@ -10,9 +10,9 @@ import 
org.springframework.stereotype.Service; import com.fasterxml.jackson.databind.ObjectMapper; import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType; -import com.iqser.red.service.redaction.v1.server.redaction.model.PdfTableCell; -import com.iqser.red.service.redaction.v1.server.redaction.model.table.TableCells; -import com.iqser.red.service.redaction.v1.server.redaction.model.table.TableServiceResponse; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table.PdfTableCell; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table.TableCells; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table.TableServiceResponse; import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService; import lombok.RequiredArgsConstructor; @@ -22,7 +22,7 @@ import lombok.extern.slf4j.Slf4j; @Slf4j @Service @RequiredArgsConstructor -public class TableService { +public class TableServiceResponseAdapter { private final ObjectMapper objectMapper; private final RedactionStorageService redactionStorageService; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/image/Classification.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/image/Classification.java similarity index 68% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/image/Classification.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/image/Classification.java index 3a63e8f5..9de8a10d 100644 --- 
a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/image/Classification.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/image/Classification.java @@ -1,9 +1,8 @@ -package com.iqser.red.service.redaction.v1.server.redaction.model.image; +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image; import java.util.HashMap; import java.util.Map; - import lombok.Data; @Data diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/image/FilterGeometry.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/image/FilterGeometry.java similarity index 58% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/image/FilterGeometry.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/image/FilterGeometry.java index da5221cc..ed8fead8 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/image/FilterGeometry.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/image/FilterGeometry.java @@ -1,5 +1,4 @@ -package com.iqser.red.service.redaction.v1.server.redaction.model.image; - +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image; import lombok.Data; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/image/Filters.java 
b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/image/Filters.java similarity index 62% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/image/Filters.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/image/Filters.java index c752528d..ee0e7723 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/image/Filters.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/image/Filters.java @@ -1,5 +1,4 @@ -package com.iqser.red.service.redaction.v1.server.redaction.model.image; - +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image; import lombok.Data; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/image/Geometry.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/image/Geometry.java similarity index 53% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/image/Geometry.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/image/Geometry.java index 63a09756..ddebcdf8 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/image/Geometry.java +++ 
b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/image/Geometry.java @@ -1,5 +1,4 @@ -package com.iqser.red.service.redaction.v1.server.redaction.model.image; - +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image; import lombok.Data; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/image/ImageFormat.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/image/ImageFormat.java similarity index 60% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/image/ImageFormat.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/image/ImageFormat.java index 4f638825..7cea1acb 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/image/ImageFormat.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/image/ImageFormat.java @@ -1,5 +1,4 @@ -package com.iqser.red.service.redaction.v1.server.redaction.model.image; - +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image; import lombok.Data; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/image/ImageMetadata.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/image/ImageMetadata.java similarity index 70% rename from 
redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/image/ImageMetadata.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/image/ImageMetadata.java index 0c52b549..1efe46dd 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/image/ImageMetadata.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/image/ImageMetadata.java @@ -1,5 +1,4 @@ -package com.iqser.red.service.redaction.v1.server.redaction.model.image; - +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image; import lombok.Data; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/image/ImageServiceResponse.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/image/ImageServiceResponse.java similarity index 86% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/image/ImageServiceResponse.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/image/ImageServiceResponse.java index 1cb161ec..8b699e24 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/image/ImageServiceResponse.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/image/ImageServiceResponse.java @@ -1,13 +1,13 @@ -package 
com.iqser.red.service.redaction.v1.server.redaction.model.image; +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image; + +import java.util.ArrayList; +import java.util.List; import com.fasterxml.jackson.annotation.JsonAlias; import com.fasterxml.jackson.annotation.JsonProperty; import lombok.Data; -import java.util.ArrayList; -import java.util.List; - @Data public class ImageServiceResponse { diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/image/ImageSize.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/image/ImageSize.java similarity index 60% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/image/ImageSize.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/image/ImageSize.java index 06c04440..226f6ca1 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/image/ImageSize.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/image/ImageSize.java @@ -1,4 +1,4 @@ -package com.iqser.red.service.redaction.v1.server.redaction.model.image; +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image; import lombok.Data; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/image/Position.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/image/Position.java similarity index 64% 
rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/image/Position.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/image/Position.java index da12f50f..911f5aed 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/image/Position.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/image/Position.java @@ -1,5 +1,4 @@ -package com.iqser.red.service.redaction.v1.server.redaction.model.image; - +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image; import lombok.Data; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/image/Probability.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/image/Probability.java new file mode 100644 index 00000000..db286e40 --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/image/Probability.java @@ -0,0 +1,10 @@ +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.image; + +import lombok.Data; + +/** Lombok data holder exposing a single {@code unconfident} flag. */ @Data +public class Probability { + + private boolean unconfident; + +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/table/PageInfo.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/table/PageInfo.java similarity index
62% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/table/PageInfo.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/table/PageInfo.java index 7145f846..f13a5a72 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/table/PageInfo.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/table/PageInfo.java @@ -1,5 +1,4 @@ -package com.iqser.red.service.redaction.v1.server.redaction.model.table; - +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table; import lombok.Data; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/PdfTableCell.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/table/PdfTableCell.java similarity index 78% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/PdfTableCell.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/table/PdfTableCell.java index df3ca322..313778bc 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/PdfTableCell.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/table/PdfTableCell.java @@ -1,4 +1,4 @@ -package com.iqser.red.service.redaction.v1.server.redaction.model; +package 
com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table; import lombok.AllArgsConstructor; import lombok.Builder; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/table/TableCells.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/table/TableCells.java similarity index 67% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/table/TableCells.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/table/TableCells.java index 7a67f0d0..1771b794 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/table/TableCells.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/table/TableCells.java @@ -1,5 +1,4 @@ -package com.iqser.red.service.redaction.v1.server.redaction.model.table; - +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table; import lombok.Data; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/table/TableData.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/table/TableData.java similarity index 68% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/table/TableData.java rename to 
redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/table/TableData.java index cff0887c..5426e952 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/table/TableData.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/table/TableData.java @@ -1,9 +1,8 @@ -package com.iqser.red.service.redaction.v1.server.redaction.model.table; +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table; import java.util.ArrayList; import java.util.List; - import lombok.Data; @Data diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/table/TableServiceResponse.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/table/TableServiceResponse.java similarity index 78% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/table/TableServiceResponse.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/table/TableServiceResponse.java index 2cb9af4e..2afc4a80 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/table/TableServiceResponse.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/adapter/table/TableServiceResponse.java @@ -1,9 +1,8 @@ -package com.iqser.red.service.redaction.v1.server.redaction.model.table; +package 
com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table; import java.util.ArrayList; import java.util.List; - import lombok.Data; @Data diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/AbstractTextContainer.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/AbstractPageBlock.java similarity index 66% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/AbstractTextContainer.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/AbstractPageBlock.java index 8c7e169d..a0d2caef 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/AbstractTextContainer.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/AbstractPageBlock.java @@ -1,9 +1,8 @@ -package com.iqser.red.service.redaction.v1.server.tableextraction.model; +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model; import com.fasterxml.jackson.annotation.JsonIgnore; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; -import com.iqser.red.service.redaction.v1.server.classification.model.Orientation; -import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock; import lombok.AllArgsConstructor; import lombok.Data; @@ -12,7 +11,7 @@ import lombok.NoArgsConstructor; @Data @AllArgsConstructor @NoArgsConstructor -public abstract class 
AbstractTextContainer { +public abstract class AbstractPageBlock { @JsonIgnore protected float minX; @@ -23,7 +22,7 @@ public abstract class AbstractTextContainer { @JsonIgnore protected float maxY; @JsonIgnore - protected String classification; + protected PageBlockType classification; @JsonIgnore protected int page; @@ -34,13 +33,19 @@ public abstract class AbstractTextContainer { public abstract String getText(); - public boolean containsBlock(TextBlock other) { + public boolean isHeadline() { + + return this instanceof TextPageBlock && this.getClassification() != null && this.getClassification().isHeadline(); + } + + + public boolean containsBlock(TextPageBlock other) { return this.minX <= other.getPdfMinX() && this.maxX >= other.getPdfMaxX() && this.minY >= other.getPdfMinY() && this.maxY <= other.getPdfMaxY(); } - public boolean contains(AbstractTextContainer other) { + public boolean contains(AbstractPageBlock other) { return this.minX <= other.minX && this.maxX >= other.maxX && this.minY >= other.minY && this.maxY <= other.maxY; } @@ -66,4 +71,10 @@ public abstract class AbstractTextContainer { return maxX - minX; } + + public boolean intersectsY(AbstractPageBlock atc) { + + return this.minY <= atc.getMaxY() && this.maxY >= atc.getMinY(); + } + } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Document.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/ClassificationDocument.java similarity index 57% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Document.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/ClassificationDocument.java index 7ae5f880..e27faf0e 100644 --- 
a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Document.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/ClassificationDocument.java @@ -1,22 +1,24 @@ -package com.iqser.red.service.redaction.v1.server.classification.model; +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model; import java.util.ArrayList; import java.util.List; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionGrid; -import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryVersion; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.StringFrequencyCounter; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.UnclassifiedText; +import com.iqser.red.service.redaction.v1.server.redaction.model.dictionary.DictionaryVersion; import lombok.Data; import lombok.NoArgsConstructor; @Data @NoArgsConstructor -public class Document { +public class ClassificationDocument { - private List pages = new ArrayList<>(); - private List paragraphs = new ArrayList<>(); - private List
headers = new ArrayList<>(); - private List
footers = new ArrayList<>(); + private List pages = new ArrayList<>(); + private List sections = new ArrayList<>(); + private List headers = new ArrayList<>(); + private List footers = new ArrayList<>(); private List unclassifiedTexts = new ArrayList<>(); private FloatFrequencyCounter textHeightCounter = new FloatFrequencyCounter(); private FloatFrequencyCounter fontSizeCounter = new FloatFrequencyCounter(); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/ClassificationFooter.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/ClassificationFooter.java new file mode 100644 index 00000000..221bda7f --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/ClassificationFooter.java @@ -0,0 +1,16 @@ +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model; + +import java.util.List; + +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock; + +import lombok.AllArgsConstructor; +import lombok.Data; + +/** Lombok data holder for the list of text blocks that make up a footer. */ @Data +@AllArgsConstructor +public class ClassificationFooter { + + private List textBlocks; + +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/ClassificationHeader.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/ClassificationHeader.java new file mode 100644 index 00000000..53f1972a --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/ClassificationHeader.java @@ -0,0 +1,16 @@ +package
com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model; + +import java.util.List; + +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock; + +import lombok.AllArgsConstructor; +import lombok.Data; + +/** Lombok data holder for the list of text blocks that make up a header. */ @Data +@AllArgsConstructor +public class ClassificationHeader { + + private List textBlocks; + +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Page.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/ClassificationPage.java similarity index 63% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Page.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/ClassificationPage.java index c6754dfc..d9b38623 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/Page.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/ClassificationPage.java @@ -1,11 +1,11 @@ -package com.iqser.red.service.redaction.v1.server.classification.model; +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model; import java.util.ArrayList; import java.util.List; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; -import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.image.ClassifiedImage; +import
com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.StringFrequencyCounter; import lombok.Data; import lombok.NonNull; @@ -13,12 +13,12 @@ import lombok.RequiredArgsConstructor; @Data @RequiredArgsConstructor -public class Page { +public class ClassificationPage { @NonNull - private List textBlocks; + private List textBlocks; - private List images = new ArrayList<>(); + private List images = new ArrayList<>(); private Rectangle bodyTextFrame; @@ -35,10 +35,4 @@ public class Page { private float pageWidth; private float pageHeight; - - public boolean isRotated() { - - return rotation != 0; - } - } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/ClassificationSection.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/ClassificationSection.java new file mode 100644 index 00000000..5ee7a0a4 --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/ClassificationSection.java @@ -0,0 +1,32 @@ +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model; + +import java.util.ArrayList; +import java.util.List; + +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.image.ClassifiedImage; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.TablePageBlock; + +import lombok.Data; +import lombok.NoArgsConstructor; + +@Data +@NoArgsConstructor +public class ClassificationSection { + + private List pageBlocks = new ArrayList<>(); + private List images = new ArrayList<>(); + private String headline; + + + public List getTables() { + + List tables = new ArrayList<>(); + pageBlocks.forEach(block -> { + if (block instanceof TablePageBlock) { + 
tables.add((TablePageBlock) block); + } + }); + return tables; + } + +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/FloatFrequencyCounter.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/FloatFrequencyCounter.java similarity index 95% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/FloatFrequencyCounter.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/FloatFrequencyCounter.java index f51792d3..f656a839 100755 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/FloatFrequencyCounter.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/FloatFrequencyCounter.java @@ -1,6 +1,4 @@ -package com.iqser.red.service.redaction.v1.server.classification.model; - -import lombok.Getter; +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model; import java.util.ArrayList; import java.util.Collections; @@ -9,6 +7,8 @@ import java.util.List; import java.util.Map; import java.util.stream.Collectors; +import lombok.Getter; + public class FloatFrequencyCounter { @Getter diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/Orientation.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/Orientation.java new file mode 100644 index 00000000..5cd6a10f --- /dev/null +++ 
b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/Orientation.java @@ -0,0 +1,8 @@ +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model; + +public enum Orientation { + + NONE, + LEFT, + RIGHT +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/PageBlockType.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/PageBlockType.java new file mode 100644 index 00000000..d2991efd --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/PageBlockType.java @@ -0,0 +1,38 @@ +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model; + +public enum PageBlockType { + H1, + H2, + H3, + H4, + H5, + H6, + HEADER, + FOOTER, + TITLE, + PARAGRAPH, + PARAGRAPH_BOLD, + PARAGRAPH_ITALIC, + PARAGRAPH_UNKNOWN, + OTHER, + TABLE; + + + public static PageBlockType getHeadlineType(int i) { + + return switch (i) { + case 1 -> PageBlockType.H1; + case 2 -> PageBlockType.H2; + case 3 -> PageBlockType.H3; + case 4 -> PageBlockType.H4; + case 5 -> PageBlockType.H5; + default -> PageBlockType.H6; + }; + } + + + public boolean isHeadline() { + + return this.equals(H1) || this.equals(H2) || this.equals(H3) || this.equals(H4) || this.equals(H5) || this.equals(H6); + } +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/image/ClassifiedImage.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/image/ClassifiedImage.java new file mode 100644 index 
00000000..c9129e08 --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/image/ClassifiedImage.java @@ -0,0 +1,25 @@ +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.image; + +import java.awt.geom.Rectangle2D; + +import com.iqser.red.service.redaction.v1.server.layoutparsing.document.graph.nodes.ImageType; + +import lombok.Data; +import lombok.NonNull; +import lombok.RequiredArgsConstructor; + +@Data +@RequiredArgsConstructor +public class ClassifiedImage { + + @NonNull + private Rectangle2D position; + @NonNull + private ImageType imageType; + private boolean isAppendedToSection; + @NonNull + private boolean hasTransparency; + @NonNull + private int page; + +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Cell.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/table/Cell.java similarity index 75% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Cell.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/table/Cell.java index a4e755fb..19b64a6a 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Cell.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/table/Cell.java @@ -1,25 +1,25 @@ -package com.iqser.red.service.redaction.v1.server.tableextraction.model; - -import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; -import 
com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; -import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities; - -import lombok.Data; -import lombok.EqualsAndHashCode; -import lombok.NoArgsConstructor; +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table; import java.awt.geom.Point2D; import java.util.ArrayList; import java.util.Iterator; import java.util.List; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence; +import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities; + +import lombok.Data; +import lombok.EqualsAndHashCode; +import lombok.NoArgsConstructor; + @SuppressWarnings("serial") @Data @EqualsAndHashCode(callSuper = true) @NoArgsConstructor public class Cell extends Rectangle { - private List textBlocks = new ArrayList<>(); + private List textBlocks = new ArrayList<>(); private List headerCells = new ArrayList<>(); @@ -27,6 +27,8 @@ public class Cell extends Rectangle { private static final int MIN_SIZE = 1; + private int pageNumber; + public Cell(Point2D topLeft, Point2D bottomRight) { @@ -34,7 +36,7 @@ public class Cell extends Rectangle { } - public void addTextBlock(TextBlock textBlock) { + public void addTextBlock(TextPageBlock textBlock) { textBlocks.add(textBlock); } @@ -45,11 +47,11 @@ public class Cell extends Rectangle { StringBuilder sb = new StringBuilder(); - Iterator itty = textBlocks.iterator(); + Iterator itty = textBlocks.iterator(); TextPositionSequence previous = null; while (itty.hasNext()) { - TextBlock textBlock = itty.next(); + TextPageBlock textBlock = itty.next(); for (TextPositionSequence word : textBlock.getSequences()) { if (previous != null) { diff --git 
a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/CellPosition.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/table/CellPosition.java similarity index 79% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/CellPosition.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/table/CellPosition.java index 70a9800c..bec2f274 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/CellPosition.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/table/CellPosition.java @@ -1,4 +1,4 @@ -package com.iqser.red.service.redaction.v1.server.tableextraction.model; +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table; import lombok.RequiredArgsConstructor; import lombok.Value; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/CleanRulings.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/table/CleanRulings.java similarity index 65% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/CleanRulings.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/table/CleanRulings.java index e539f4ba..8e41b20a 100644 --- 
a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/CleanRulings.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/table/CleanRulings.java @@ -1,10 +1,10 @@ -package com.iqser.red.service.redaction.v1.server.tableextraction.model; +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table; + +import java.util.List; import lombok.Builder; import lombok.Data; -import java.util.List; - @Data @Builder public class CleanRulings { diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Rectangle.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/table/Rectangle.java similarity index 98% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Rectangle.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/table/Rectangle.java index 2862e268..c20f2368 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Rectangle.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/table/Rectangle.java @@ -1,4 +1,4 @@ -package com.iqser.red.service.redaction.v1.server.tableextraction.model; +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table; import java.awt.geom.Point2D; import java.awt.geom.Rectangle2D; diff --git 
a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Ruling.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/table/Ruling.java similarity index 95% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Ruling.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/table/Ruling.java index 433e64ba..9988b61e 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Ruling.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/table/Ruling.java @@ -1,14 +1,20 @@ -package com.iqser.red.service.redaction.v1.server.tableextraction.model; - -import com.iqser.red.service.redaction.v1.server.tableextraction.utils.CohenSutherlandClipping; -import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils; - -import lombok.extern.slf4j.Slf4j; +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table; import java.awt.geom.Line2D; import java.awt.geom.Point2D; import java.awt.geom.Rectangle2D; -import java.util.*; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.Formatter; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; + +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.CohenSutherlandClipping; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.Utils; + +import lombok.extern.slf4j.Slf4j; @Slf4j @SuppressWarnings("all") diff --git 
a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/table/TablePageBlock.java similarity index 88% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/table/TablePageBlock.java index d5759bf6..8d8684ef 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/model/Table.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/table/TablePageBlock.java @@ -1,25 +1,25 @@ -package com.iqser.red.service.redaction.v1.server.tableextraction.model; +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table; import java.awt.geom.Point2D; import java.util.ArrayList; import java.util.Collections; -import java.util.Comparator; import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.TreeMap; -import java.util.stream.Collectors; import org.apache.commons.collections4.CollectionUtils; -import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.PageBlockType; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock; import lombok.Getter; import lombok.Setter; import lombok.extern.slf4j.Slf4j; @Slf4j -public class Table extends 
AbstractTextContainer { +public class TablePageBlock extends AbstractPageBlock { private final TreeMap cells = new TreeMap<>(); @@ -29,21 +29,18 @@ public class Table extends AbstractTextContainer { private String headline; private int unrotatedRowCount; private int unrotatedColCount; - private int rowCount = -1; - private int colCount = -1; private List> rows; - public Table(List cells, Rectangle area, int rotation) { + public TablePageBlock(List cells, Rectangle area, int rotation) { addCells(cells); minX = area.getLeft(); minY = area.getBottom(); maxX = area.getRight(); maxY = area.getTop(); - classification = "Table"; + classification = PageBlockType.TABLE; this.rotation = rotation; - } @@ -71,19 +68,13 @@ public class Table extends AbstractTextContainer { public int getRowCount() { - if (rowCount == -1) { - rowCount = getRows().size(); - } - return rowCount; + return getRows().size(); } public int getColCount() { - if (colCount == -1) { - colCount = getRows().stream().mapToInt(List::size).max().orElse(0); - } - return colCount; + return getRows().stream().mapToInt(List::size).max().orElse(0); } @@ -224,7 +215,7 @@ public class Table extends AbstractTextContainer { * Calculates the structure of the table. For spanning rows and columns multiple cells with the same values will be inserted. 
* * @param cells The found cells - * @return Table Structure + * @return TablePageBlock Structure */ private List> calculateStructure(List cells) { @@ -243,8 +234,8 @@ public class Table extends AbstractTextContainer { uniqueY.add(c.getTop()); }); - var sortedUniqueX = uniqueX.stream().sorted().collect(Collectors.toList()); - var sortedUniqueY = uniqueY.stream().sorted().collect(Collectors.toList()); + var sortedUniqueX = uniqueX.stream().sorted().toList(); + var sortedUniqueY = uniqueY.stream().sorted().toList(); Float prevY = null; for (Float y : sortedUniqueY) { @@ -258,9 +249,7 @@ public class Table extends AbstractTextContainer { var cell = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y)); var intersectionCell = cells.stream().filter(c -> cell.intersects(c) && cell.overlapRatio(c) > 0.1f).findFirst(); - if (intersectionCell.isPresent()) { - cell.getTextBlocks().addAll(intersectionCell.get().getTextBlocks()); - } + intersectionCell.ifPresent(value -> cell.getTextBlocks().addAll(value.getTextBlocks())); if (cell.hasMinimumSize()) { row.add(cell); } @@ -268,7 +257,7 @@ public class Table extends AbstractTextContainer { prevX = x; } - if (prevY != null && prevX != null) { + if (prevY != null && prevX != null && !row.isEmpty()) { matrix.add(row); } prevY = y; @@ -299,7 +288,7 @@ public class Table extends AbstractTextContainer { } if (column != null && column.getTextBlocks() != null) { boolean first = true; - for (TextBlock textBlock : column.getTextBlocks()) { + for (TextPageBlock textBlock : column.getTextBlocks()) { if (!first) { sb.append("\n"); } @@ -331,7 +320,7 @@ public class Table extends AbstractTextContainer { sb.append(i == 0 ? "\n
" : "\n"); if (column != null && column.getTextBlocks() != null) { boolean first = true; - for (TextBlock textBlock : column.getTextBlocks()) { + for (TextPageBlock textBlock : column.getTextBlocks()) { if (!first) { sb.append("
"); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedTextPosition.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/RedTextPosition.java similarity index 95% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedTextPosition.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/RedTextPosition.java index 241900a1..392b1eb0 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/RedTextPosition.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/RedTextPosition.java @@ -1,4 +1,4 @@ -package com.iqser.red.service.redaction.v1.server.parsing.model; +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text; import org.apache.pdfbox.text.TextPosition; import org.springframework.beans.BeanUtils; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/SearchableText.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/SearchableText.java new file mode 100644 index 00000000..36677e1f --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/SearchableText.java @@ -0,0 +1,49 @@ +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text; + +import 
java.util.ArrayList; +import java.util.List; + +import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities; + +import lombok.Getter; + +public class SearchableText { + + @Getter + private final List sequences = new ArrayList<>(); + + + public void add(TextPositionSequence textPositionSequence) { + + sequences.add(textPositionSequence); + } + + + public void addAll(List textPositionSequences) { + + sequences.addAll(textPositionSequences); + } + + + @Override + public String toString() { + + return buildString(sequences); + } + + + public static String buildString(List sequences) { + + StringBuilder sb = new StringBuilder(); + for (TextPositionSequence word : sequences) { + sb.append(word); + sb.append(' '); + } + String text = sb.toString(); + text = TextNormalizationUtilities.removeHyphenLineBreaks(text); + text = TextNormalizationUtilities.removeLineBreaks(text); + text = TextNormalizationUtilities.removeRepeatingWhitespaces(text); + return text; + } + +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SimplifiedSectionText.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/SimplifiedSectionText.java similarity index 74% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SimplifiedSectionText.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/SimplifiedSectionText.java index 58b11aae..5c1ba630 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SimplifiedSectionText.java +++ 
b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/SimplifiedSectionText.java @@ -1,5 +1,4 @@ -package com.iqser.red.service.redaction.v1.server.classification.model; - +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text; import lombok.AllArgsConstructor; import lombok.Builder; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SimplifiedText.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/SimplifiedText.java similarity index 79% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SimplifiedText.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/SimplifiedText.java index 496d64ae..2d4eabf4 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/SimplifiedText.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/SimplifiedText.java @@ -1,4 +1,4 @@ -package com.iqser.red.service.redaction.v1.server.classification.model; +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text; import java.util.ArrayList; import java.util.List; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/StringFrequencyCounter.java 
b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/StringFrequencyCounter.java similarity index 93% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/StringFrequencyCounter.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/StringFrequencyCounter.java index 757f6dc3..2186cc25 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/StringFrequencyCounter.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/StringFrequencyCounter.java @@ -1,10 +1,10 @@ -package com.iqser.red.service.redaction.v1.server.classification.model; - -import lombok.Getter; +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text; import java.util.HashMap; import java.util.Map; +import lombok.Getter; + public class StringFrequencyCounter { @Getter diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextDirection.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/TextDirection.java similarity index 66% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextDirection.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/TextDirection.java index 003eca70..aceb3751 100644 --- 
a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextDirection.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/TextDirection.java @@ -1,6 +1,4 @@ -package com.iqser.red.service.redaction.v1.server.parsing.model; - -import java.util.Objects; +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text; import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonValue; @@ -46,18 +44,4 @@ public enum TextDirection { throw new IllegalArgumentException(String.format("A value of %f is not supported by TextDirection", degrees)); } - - - public static TextDirection fromString(String degreesAsString) { - - Objects.requireNonNull(degreesAsString, "Cannot construct a text direction from a null value"); - - String value = degreesAsString.strip(); - - if (degreesAsString.endsWith(VALUE_STRING_SUFFIX)) { - value = degreesAsString.replace(VALUE_STRING_SUFFIX + "$", ""); - } - - return fromDegrees(Float.parseFloat(value)); - } } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/TextBlock.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/TextPageBlock.java similarity index 88% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/TextBlock.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/TextPageBlock.java index 3061541b..ca8cc8e2 100644 --- 
a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/model/TextBlock.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/TextPageBlock.java @@ -1,13 +1,12 @@ -package com.iqser.red.service.redaction.v1.server.classification.model; +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text; import java.util.ArrayList; import java.util.List; import com.fasterxml.jackson.annotation.JsonIgnore; -import com.iqser.red.service.redaction.v1.server.parsing.model.TextDirection; -import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.PageBlockType; import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; import lombok.AllArgsConstructor; import lombok.Builder; @@ -18,7 +17,7 @@ import lombok.NoArgsConstructor; @Builder @Data @NoArgsConstructor -public class TextBlock extends AbstractTextContainer { +public class TextPageBlock extends AbstractPageBlock { @Builder.Default private List sequences = new ArrayList<>(); @@ -45,7 +44,7 @@ public class TextBlock extends AbstractTextContainer { private float highestFontSize; @JsonIgnore - private String classification; + private PageBlockType classification; @JsonIgnore @@ -95,6 +94,7 @@ public class TextBlock extends AbstractTextContainer { } } + /** * Returns the maxX value in pdf coordinate system. * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation. 
@@ -174,7 +174,7 @@ public class TextBlock extends AbstractTextContainer { } - public TextBlock(float minX, float maxX, float minY, float maxY, List sequences, int rotation) { + public TextPageBlock(float minX, float maxX, float minY, float maxY, List sequences, int rotation) { this.minX = minX; this.maxX = maxX; @@ -185,23 +185,23 @@ public class TextBlock extends AbstractTextContainer { } - public TextBlock union(TextPositionSequence r) { + public TextPageBlock union(TextPositionSequence r) { - TextBlock union = this.copy(); + TextPageBlock union = this.copy(); union.add(r); return union; } - public TextBlock union(TextBlock r) { + public TextPageBlock union(TextPageBlock r) { - TextBlock union = this.copy(); + TextPageBlock union = this.copy(); union.add(r); return union; } - public void add(TextBlock r) { + public void add(TextPageBlock r) { if (r.getMinX() < minX) { minX = r.getMinX(); @@ -236,9 +236,9 @@ public class TextBlock extends AbstractTextContainer { } - public TextBlock copy() { + public TextPageBlock copy() { - return new TextBlock(minX, maxX, minY, maxY, sequences, rotation); + return new TextPageBlock(minX, maxX, minY, maxY, sequences, rotation); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/TextPositionSequence.java similarity index 99% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/TextPositionSequence.java index b1aecb99..48a33a6b 100644 --- 
a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/model/TextPositionSequence.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/TextPositionSequence.java @@ -1,4 +1,4 @@ -package com.iqser.red.service.redaction.v1.server.parsing.model; +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text; import java.awt.geom.AffineTransform; import java.awt.geom.Point2D; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/UnclassifiedText.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/UnclassifiedText.java new file mode 100644 index 00000000..50925713 --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/model/text/UnclassifiedText.java @@ -0,0 +1,14 @@ +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text; + +import java.util.List; + +import lombok.AllArgsConstructor; +import lombok.Data; + +@Data +@AllArgsConstructor +public class UnclassifiedText { + + private List textBlocks; + +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/LegacyPDFStreamEngine.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/parsing/LegacyPDFStreamEngine.java similarity index 99% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/LegacyPDFStreamEngine.java rename to 
redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/parsing/LegacyPDFStreamEngine.java index bb00562e..5d96ca09 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/LegacyPDFStreamEngine.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/parsing/LegacyPDFStreamEngine.java @@ -14,34 +14,18 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package com.iqser.red.service.redaction.v1.server.parsing; +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.parsing; -import java.io.InputStream; import java.io.IOException; +import java.io.InputStream; import java.util.Map; import java.util.WeakHashMap; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; - import org.apache.fontbox.ttf.TrueTypeFont; import org.apache.fontbox.util.BoundingBox; - import org.apache.pdfbox.contentstream.PDFStreamEngine; -import org.apache.pdfbox.pdmodel.PDPage; -import org.apache.pdfbox.pdmodel.font.encoding.GlyphList; -import org.apache.pdfbox.pdmodel.common.PDRectangle; -import org.apache.pdfbox.pdmodel.font.PDCIDFont; -import org.apache.pdfbox.pdmodel.font.PDCIDFontType2; -import org.apache.pdfbox.pdmodel.font.PDFont; -import org.apache.pdfbox.pdmodel.font.PDSimpleFont; -import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont; -import org.apache.pdfbox.pdmodel.font.PDType0Font; -import org.apache.pdfbox.pdmodel.font.PDType3Font; -import org.apache.pdfbox.pdmodel.graphics.state.PDGraphicsState; -import org.apache.pdfbox.text.TextPosition; -import org.apache.pdfbox.util.Matrix; -import org.apache.pdfbox.util.Vector; import org.apache.pdfbox.contentstream.operator.DrawObject; import 
org.apache.pdfbox.contentstream.operator.state.Concatenate; import org.apache.pdfbox.contentstream.operator.state.Restore; @@ -50,22 +34,36 @@ import org.apache.pdfbox.contentstream.operator.state.SetGraphicsStateParameters import org.apache.pdfbox.contentstream.operator.state.SetMatrix; import org.apache.pdfbox.contentstream.operator.text.BeginText; import org.apache.pdfbox.contentstream.operator.text.EndText; -import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize; -import org.apache.pdfbox.contentstream.operator.text.SetTextHorizontalScaling; -import org.apache.pdfbox.contentstream.operator.text.ShowTextAdjusted; -import org.apache.pdfbox.contentstream.operator.text.ShowTextLine; -import org.apache.pdfbox.contentstream.operator.text.ShowTextLineAndSpace; import org.apache.pdfbox.contentstream.operator.text.MoveText; import org.apache.pdfbox.contentstream.operator.text.MoveTextSetLeading; import org.apache.pdfbox.contentstream.operator.text.NextLine; import org.apache.pdfbox.contentstream.operator.text.SetCharSpacing; +import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize; +import org.apache.pdfbox.contentstream.operator.text.SetTextHorizontalScaling; import org.apache.pdfbox.contentstream.operator.text.SetTextLeading; import org.apache.pdfbox.contentstream.operator.text.SetTextRenderingMode; import org.apache.pdfbox.contentstream.operator.text.SetTextRise; import org.apache.pdfbox.contentstream.operator.text.SetWordSpacing; import org.apache.pdfbox.contentstream.operator.text.ShowText; +import org.apache.pdfbox.contentstream.operator.text.ShowTextAdjusted; +import org.apache.pdfbox.contentstream.operator.text.ShowTextLine; +import org.apache.pdfbox.contentstream.operator.text.ShowTextLineAndSpace; import org.apache.pdfbox.cos.COSDictionary; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.pdmodel.font.PDCIDFont; +import 
org.apache.pdfbox.pdmodel.font.PDCIDFontType2; +import org.apache.pdfbox.pdmodel.font.PDFont; import org.apache.pdfbox.pdmodel.font.PDFontDescriptor; +import org.apache.pdfbox.pdmodel.font.PDSimpleFont; +import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont; +import org.apache.pdfbox.pdmodel.font.PDType0Font; +import org.apache.pdfbox.pdmodel.font.PDType3Font; +import org.apache.pdfbox.pdmodel.font.encoding.GlyphList; +import org.apache.pdfbox.pdmodel.graphics.state.PDGraphicsState; +import org.apache.pdfbox.text.TextPosition; +import org.apache.pdfbox.util.Matrix; +import org.apache.pdfbox.util.Vector; /** * LEGACY text calculations which are known to be incorrect but are depended on by PDFTextStripper. diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/parsing/PDFLinesTextStripper.java similarity index 86% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/parsing/PDFLinesTextStripper.java index 560d5b2c..fda2c3cb 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFLinesTextStripper.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/parsing/PDFLinesTextStripper.java @@ -1,17 +1,32 @@ -package com.iqser.red.service.redaction.v1.server.parsing; +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.parsing; -import com.iqser.red.service.redaction.v1.server.parsing.model.RedTextPosition; -import 
com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling; - -import lombok.Getter; -import lombok.Setter; -import lombok.extern.slf4j.Slf4j; +import java.awt.geom.Point2D; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; import org.apache.pdfbox.contentstream.operator.Operator; import org.apache.pdfbox.contentstream.operator.OperatorName; -import org.apache.pdfbox.contentstream.operator.color.*; -import org.apache.pdfbox.contentstream.operator.state.*; +import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColor; +import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorN; +import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingColorSpace; +import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceCMYKColor; +import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceGrayColor; +import org.apache.pdfbox.contentstream.operator.color.SetNonStrokingDeviceRGBColor; +import org.apache.pdfbox.contentstream.operator.color.SetStrokingColor; +import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorN; +import org.apache.pdfbox.contentstream.operator.color.SetStrokingColorSpace; +import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceCMYKColor; +import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceGrayColor; +import org.apache.pdfbox.contentstream.operator.color.SetStrokingDeviceRGBColor; +import org.apache.pdfbox.contentstream.operator.state.SetFlatness; +import org.apache.pdfbox.contentstream.operator.state.SetLineCapStyle; +import org.apache.pdfbox.contentstream.operator.state.SetLineDashPattern; +import org.apache.pdfbox.contentstream.operator.state.SetLineJoinStyle; +import org.apache.pdfbox.contentstream.operator.state.SetLineMiterLimit; +import 
org.apache.pdfbox.contentstream.operator.state.SetLineWidth; +import org.apache.pdfbox.contentstream.operator.state.SetRenderingIntent; import org.apache.pdfbox.contentstream.operator.text.SetFontAndSize; import org.apache.pdfbox.cos.COSBase; import org.apache.pdfbox.cos.COSNumber; @@ -19,11 +34,13 @@ import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.text.TextPosition; -import java.awt.geom.Point2D; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Comparator; -import java.util.List; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Ruling; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.RedTextPosition; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence; + +import lombok.Getter; +import lombok.Setter; +import lombok.extern.slf4j.Slf4j; @Slf4j public class PDFLinesTextStripper extends PDFTextStripper { @@ -264,8 +281,8 @@ public class PDFLinesTextStripper extends PDFTextStripper { // Remove false sequence ends (whitespaces) if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0) .getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) { - for (TextPosition t : sublist) { - textPositionSequences.get(textPositionSequences.size() - 1).add(t); + for (TextPosition textPosition : sublist) { + textPositionSequences.get(textPositionSequences.size() - 1).add(textPosition); } } else { textPositionSequences.add(new TextPositionSequence(sublist, pageNumber)); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFTextStripper.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/parsing/PDFTextStripper.java similarity index 99% rename 
from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFTextStripper.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/parsing/PDFTextStripper.java index a0eeaaa2..18be3d0e 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/parsing/PDFTextStripper.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/parsing/PDFTextStripper.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package com.iqser.red.service.redaction.v1.server.parsing; +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.parsing; import java.io.BufferedInputStream; import java.io.IOException; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/BlockificationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/BlockificationService.java similarity index 66% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/BlockificationService.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/BlockificationService.java index 2cff711a..d67447a9 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/BlockificationService.java +++ 
b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/BlockificationService.java @@ -1,4 +1,4 @@ -package com.iqser.red.service.redaction.v1.server.classification.service; +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service; import static java.util.stream.Collectors.toSet; @@ -9,15 +9,15 @@ import java.util.List; import org.springframework.stereotype.Service; -import com.iqser.red.service.redaction.v1.server.classification.model.FloatFrequencyCounter; -import com.iqser.red.service.redaction.v1.server.classification.model.Orientation; -import com.iqser.red.service.redaction.v1.server.classification.model.Page; -import com.iqser.red.service.redaction.v1.server.classification.model.StringFrequencyCounter; -import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; -import com.iqser.red.service.redaction.v1.server.classification.utils.RulingTextDirAdjustUtil; -import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.FloatFrequencyCounter; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.Orientation; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Ruling; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.StringFrequencyCounter; +import 
com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.RulingTextDirAdjustUtil; @Service @SuppressWarnings("all") @@ -29,16 +29,18 @@ public class BlockificationService { /** * This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions. * This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this! - * Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling. - * @param textPositions The words of a page. + * Rulings (TablePageBlock lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling. + * + * @param textPositions The words of a page. * @param horizontalRulingLines Horizontal table lines. - * @param verticalRulingLines Vertical table lines. + * @param verticalRulingLines Vertical table lines. * @return Page object that contains the Textblock and text statistics. 
*/ - public Page blockify(List textPositions, List horizontalRulingLines, List verticalRulingLines) { + public ClassificationPage blockify(List textPositions, List horizontalRulingLines, List verticalRulingLines) { + int indexOnPage = 0; List chunkWords = new ArrayList<>(); - List chunkBlockList1 = new ArrayList<>(); + List chunkBlockList = new ArrayList<>(); float minX = 1000, maxX = 0, minY = 1000, maxY = 0; TextPositionSequence prev = null; @@ -58,12 +60,14 @@ public class BlockificationService { if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) { Orientation prevOrientation = null; - if (!chunkBlockList1.isEmpty()) { - prevOrientation = chunkBlockList1.get(chunkBlockList1.size() - 1).getOrientation(); + if (!chunkBlockList.isEmpty()) { + prevOrientation = chunkBlockList.get(chunkBlockList.size() - 1).getOrientation(); } - TextBlock cb1 = buildTextBlock(chunkWords); - chunkBlockList1.add(cb1); + TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage); + indexOnPage++; + + chunkBlockList.add(cb1); chunkWords = new ArrayList<>(); if (splitByX && !isSplitByRuling) { @@ -102,17 +106,17 @@ public class BlockificationService { } } - TextBlock cb1 = buildTextBlock(chunkWords); + TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage); if (cb1 != null) { - chunkBlockList1.add(cb1); + chunkBlockList.add(cb1); } - Iterator itty = chunkBlockList1.iterator(); + Iterator itty = chunkBlockList.iterator(); - TextBlock previousLeft = null; - TextBlock previousRight = null; + TextPageBlock previousLeft = null; + TextPageBlock previousRight = null; while (itty.hasNext()) { - TextBlock block = (TextBlock) itty.next(); + TextPageBlock block = (TextPageBlock) itty.next(); if (previousLeft != null && block.getOrientation().equals(Orientation.LEFT)) { if (previousLeft.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousLeft.getMinY()) { @@ -137,10 +141,10 @@ public class 
BlockificationService { } } - itty = chunkBlockList1.iterator(); - TextBlock previous = null; + itty = chunkBlockList.iterator(); + TextPageBlock previous = null; while (itty.hasNext()) { - TextBlock block = (TextBlock) itty.next(); + TextPageBlock block = (TextPageBlock) itty.next(); if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation() @@ -153,7 +157,7 @@ public class BlockificationService { previous = block; } - return new Page(chunkBlockList1); + return new ClassificationPage(chunkBlockList); } @@ -163,9 +167,9 @@ public class BlockificationService { } - private TextBlock buildTextBlock(List wordBlockList) { + private TextPageBlock buildTextBlock(List wordBlockList, int indexOnPage) { - TextBlock textBlock = null; + TextPageBlock textBlock = null; FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter(); FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter(); @@ -182,9 +186,14 @@ public class BlockificationService { styleFrequencyCounter.add(wordBlock.getFontStyle()); if (textBlock == null) { - textBlock = new TextBlock(wordBlock.getMinXDirAdj(), wordBlock.getMaxXDirAdj(), wordBlock.getMinYDirAdj(), wordBlock.getMaxYDirAdj(), wordBlockList, wordBlock.getRotation()); + textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(), + wordBlock.getMaxXDirAdj(), + wordBlock.getMinYDirAdj(), + wordBlock.getMaxYDirAdj(), + wordBlockList, + wordBlock.getRotation()); } else { - TextBlock spatialEntity = textBlock.union(wordBlock); + TextPageBlock spatialEntity = textBlock.union(wordBlock); textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight()); } } @@ -213,10 +222,38 @@ public class BlockificationService { List horizontalRulingLines, List 
verticalRulingLines) { - return isSplitByRuling(maxX, minY, word.getMinXDirAdj(), word.getMinYDirAdj(), verticalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight()) // - || isSplitByRuling(minX, minY, word.getMinXDirAdj(), word.getMaxYDirAdj(), horizontalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight()) // - || isSplitByRuling(maxX, minY, word.getMinXDirAdj(), word.getMinYDirAdj(), horizontalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight()) // - || isSplitByRuling(minX, minY, word.getMinXDirAdj(), word.getMaxYDirAdj(), verticalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight()); // + return isSplitByRuling(maxX, + minY, + word.getMinXDirAdj(), + word.getMinYDirAdj(), + verticalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()) // + || isSplitByRuling(minX, + minY, + word.getMinXDirAdj(), + word.getMaxYDirAdj(), + horizontalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()) // + || isSplitByRuling(maxX, + minY, + word.getMinXDirAdj(), + word.getMinYDirAdj(), + horizontalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()) // + || isSplitByRuling(minX, + minY, + word.getMinXDirAdj(), + word.getMaxYDirAdj(), + verticalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()); } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/BodyTextFrameService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/BodyTextFrameService.java similarity index 75% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/BodyTextFrameService.java 
rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/BodyTextFrameService.java index a8874081..c695c3c4 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/BodyTextFrameService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/BodyTextFrameService.java @@ -1,4 +1,4 @@ -package com.iqser.red.service.redaction.v1.server.classification.service; +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service; import java.util.List; @@ -6,17 +6,20 @@ import org.springframework.stereotype.Service; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; -import com.iqser.red.service.redaction.v1.server.classification.model.FloatFrequencyCounter; -import com.iqser.red.service.redaction.v1.server.classification.model.Page; -import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; -import com.iqser.red.service.redaction.v1.server.classification.utils.PositionUtils; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.FloatFrequencyCounter; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Cell; 
+import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.TablePageBlock; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.PositionUtils; @Service public class BodyTextFrameService { + private static final float APPROXIMATE_HEADER_LINE_COUNT = 2.9f; + + /** * Adjusts and sets the body text frame to a page. * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation. @@ -30,7 +33,7 @@ public class BodyTextFrameService { * @param bodyTextFrame frame that contains the main text on portrait pages * @param landscapeBodyTextFrame frame that contains the main text on landscape pages */ - public void setBodyTextFrameAdjustedToPage(Page page, Rectangle bodyTextFrame, Rectangle landscapeBodyTextFrame) { + public void setBodyTextFrameAdjustedToPage(ClassificationPage page, Rectangle bodyTextFrame, Rectangle landscapeBodyTextFrame) { Rectangle textFrame = page.isLandscape() ? 
landscapeBodyTextFrame : bodyTextFrame; @@ -65,26 +68,26 @@ public class BodyTextFrameService { * @param landscape Calculate for landscape or portrait * @return Rectangle of the text frame */ - public Rectangle calculateBodyTextFrame(List pages, FloatFrequencyCounter documentFontSizeCounter, boolean landscape) { + public Rectangle calculateBodyTextFrame(List pages, FloatFrequencyCounter documentFontSizeCounter, boolean landscape) { BodyTextFrameExpansionsRectangle expansionsRectangle = new BodyTextFrameExpansionsRectangle(); - for (Page page : pages) { + for (ClassificationPage page : pages) { if (page.getTextBlocks().isEmpty() || landscape != page.isLandscape()) { continue; } - for (AbstractTextContainer container : page.getTextBlocks()) { + for (AbstractPageBlock container : page.getTextBlocks()) { - if (container instanceof TextBlock) { - TextBlock textBlock = (TextBlock) container; + if (container instanceof TextPageBlock) { + TextPageBlock textBlock = (TextPageBlock) container; if (textBlock.getMostPopularWordFont() == null || textBlock.getMostPopularWordStyle() == null) { continue; } float approxLineCount = PositionUtils.getApproxLineCount(textBlock); - if (approxLineCount < 2.9f) { + if (approxLineCount < APPROXIMATE_HEADER_LINE_COUNT) { continue; } @@ -94,15 +97,15 @@ public class BodyTextFrameService { } } - if (container instanceof Table) { - Table table = (Table) container; + if (container instanceof TablePageBlock) { + TablePageBlock table = (TablePageBlock) container; for (List row : table.getRows()) { for (Cell cell : row) { if (cell == null || cell.getTextBlocks() == null) { continue; } - for (TextBlock textBlock : cell.getTextBlocks()) { + for (TextPageBlock textBlock : cell.getTextBlocks()) { expandRectangle(textBlock, page, expansionsRectangle); } } @@ -117,7 +120,7 @@ public class BodyTextFrameService { } - private void expandRectangle(TextBlock textBlock, Page page, BodyTextFrameExpansionsRectangle expansionsRectangle) { + private void 
expandRectangle(TextPageBlock textBlock, ClassificationPage page, BodyTextFrameExpansionsRectangle expansionsRectangle) { if (page.getPageWidth() > page.getPageHeight() && page.getRotation() != 0) { if (textBlock.getPdfMinY() < expansionsRectangle.minX) { diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/ClassificationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/ClassificationService.java similarity index 64% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/ClassificationService.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/ClassificationService.java index c5247fda..ea325637 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/classification/service/ClassificationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/ClassificationService.java @@ -1,4 +1,4 @@ -package com.iqser.red.service.redaction.v1.server.classification.service; +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service; import java.util.List; import java.util.regex.Pattern; @@ -6,11 +6,12 @@ import java.util.regex.Pattern; import org.springframework.stereotype.Service; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; -import com.iqser.red.service.redaction.v1.server.classification.model.Document; -import com.iqser.red.service.redaction.v1.server.classification.model.Page; -import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; 
-import com.iqser.red.service.redaction.v1.server.classification.utils.PositionUtils; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationDocument; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.PageBlockType; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.PositionUtils; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; @@ -23,7 +24,7 @@ public class ClassificationService { private final BodyTextFrameService bodyTextFrameService; - public void classifyDocument(Document document) { + public void classifyDocument(ClassificationDocument document) { Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false); Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true); @@ -31,43 +32,43 @@ public class ClassificationService { log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue()); - for (Page page : document.getPages()) { + for (ClassificationPage page : document.getPages()) { bodyTextFrameService.setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame); classifyPage(page, document, headlineFontSizes); } } - public void classifyPage(Page page, Document document, List headlineFontSizes) { + public void classifyPage(ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { - for 
(AbstractTextContainer textBlock : page.getTextBlocks()) { - if (textBlock instanceof TextBlock) { - classifyBlock((TextBlock) textBlock, page, document, headlineFontSizes); + for (AbstractPageBlock textBlock : page.getTextBlocks()) { + if (textBlock instanceof TextPageBlock) { + classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes); } } } - public void classifyBlock(TextBlock textBlock, Page page, Document document, List headlineFontSizes) { + public void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { var bodyTextFrame = page.getBodyTextFrame(); if (document.getFontSizeCounter().getMostPopular() == null) { - textBlock.setClassification("Other"); + textBlock.setClassification(PageBlockType.OTHER); return; } if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) { - textBlock.setClassification("Header"); + textBlock.setClassification(PageBlockType.HEADER); } else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) { - textBlock.setClassification("Footer"); + textBlock.setClassification(PageBlockType.FOOTER); } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks() .size() == 1)) { if (!Pattern.matches("[0-9]+", textBlock.toString())) { - textBlock.setClassification("Title"); + textBlock.setClassification(PageBlockType.TITLE); } } else if (textBlock.getMostPopularWordFontSize() > 
document.getFontSizeCounter() .getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && (textBlock.getMostPopularWordStyle().equals("bold") || !document.getFontStyleCounter() @@ -80,36 +81,34 @@ public class ClassificationService { for (int i = 1; i <= headlineFontSizes.size(); i++) { if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) { - textBlock.setClassification("H " + i); + textBlock.setClassification(PageBlockType.getHeadlineType(i)); document.setHeadlines(true); } } - } else if (!textBlock.getText().startsWith("Table ") && !textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, - textBlock) && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter() - .getMostPopular() - .equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences() + } else if (!textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle() + .equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences() .get(0) .getTextPositions() .get(0) .getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { - textBlock.setClassification("H " + (headlineFontSizes.size() + 1)); + textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1)); document.setHeadlines(true); } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter() .getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) { - textBlock.setClassification("TextBlock Bold"); + textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD); } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && 
textBlock.getMostPopularWordFont() .equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle() .equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) { - textBlock.setClassification("TextBlock"); + textBlock.setClassification(PageBlockType.PARAGRAPH); } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter() .getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter() .getMostPopular() .equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) { - textBlock.setClassification("TextBlock Italic"); + textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC); } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) { - textBlock.setClassification("TextBlock Unknown"); + textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN); } else { - textBlock.setClassification("Other"); + textBlock.setClassification(PageBlockType.OTHER); } } diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/PdfSegmentationService.java similarity index 61% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/PdfSegmentationService.java index 8223b17f..000c1343 100644 --- 
a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/PdfSegmentationService.java @@ -1,4 +1,4 @@ -package com.iqser.red.service.redaction.v1.server.segmentation; +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service; import java.io.File; import java.io.FileOutputStream; @@ -16,21 +16,19 @@ import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.springframework.stereotype.Service; -import com.iqser.red.service.redaction.v1.server.classification.model.Document; -import com.iqser.red.service.redaction.v1.server.classification.model.Page; -import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; -import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService; -import com.iqser.red.service.redaction.v1.server.classification.service.ClassificationService; -import com.iqser.red.service.redaction.v1.server.parsing.PDFLinesTextStripper; -import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence; -import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage; -import com.iqser.red.service.redaction.v1.server.redaction.model.PdfTableCell; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.ImageServiceResponseAdapter; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.TableServiceResponseAdapter; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table.PdfTableCell; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock; +import 
com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationDocument; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.image.ClassifiedImage; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.CleanRulings; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPositionSequence; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.parsing.PDFLinesTextStripper; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.FileUtils; import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings; -import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService; -import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService; -import com.iqser.red.service.redaction.v1.server.tableextraction.utils.FileUtils; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; @@ -46,18 +44,21 @@ public class PdfSegmentationService { private final BlockificationService blockificationService; private final ClassificationService classificationService; private final SectionsBuilderService sectionsBuilderService; - private final ImageService imageService; - private final TableService tableService; + private final ImageServiceResponseAdapter imageServiceResponseAdapter; + private final TableServiceResponseAdapter tableServiceResponseAdapter; - public Document parseDocument(String dossierId, String fileId, 
InputStream documentInputStream, Map> pdfImages) throws IOException { + public ClassificationDocument parseDocument(String dossierId, + String fileId, + InputStream documentInputStream, + Map> pdfImages) throws IOException { PDDocument pdDocument = null; File tempFile = null; try { Map> pdfTableCells = new HashMap<>(); if (redactionServiceSettings.isCvTableParsingEnabled()) { - pdfTableCells = tableService.convertTables(dossierId, fileId); + pdfTableCells = tableServiceResponseAdapter.convertTables(dossierId, fileId); } tempFile = FileUtils.createTempFile("document", ".pdf"); @@ -65,8 +66,8 @@ public class PdfSegmentationService { IOUtils.copy(documentInputStream, fos); // initialize required variables - Document document = new Document(); - List pages = new ArrayList<>(); + ClassificationDocument document = new ClassificationDocument(); + List pages = new ArrayList<>(); pdDocument = PDDocument.load(tempFile, MemoryUsageSetting.setupMixed(67108864L)); pdDocument.setAllSecurityToBeRemoved(true); @@ -94,12 +95,12 @@ public class PdfSegmentationService { } - private void processPage(Map> pdfImages, - PDDocument pdDocument, - Map> pdfTableCells, - Document document, - List pages, - int pageNumber) throws IOException { + private void processPage(Map> pdfImages, + PDDocument pdDocument, + Map> pdfTableCells, + ClassificationDocument document, + List pages, + int pageNumber) throws IOException { PDFLinesTextStripper stripper = new PDFLinesTextStripper(); PDPage pdPage = pdDocument.getPage(pageNumber - 1); @@ -119,7 +120,7 @@ public class PdfSegmentationService { stripper.getRulings(), stripper.getMinCharWidth(), stripper.getMaxCharHeight()); - Page page = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); + ClassificationPage page = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); page.setRotation(rotation); 
page.setLandscape(isLandscape); @@ -130,7 +131,7 @@ public class PdfSegmentationService { // If images is ocr needs to be calculated before textBlocks are moved into tables, otherwise findOcr algorithm needs to be adopted. if (pdfImages != null && pdfImages.containsKey(pageNumber)) { page.setImages(pdfImages.get(pageNumber)); - imageService.findOcr(page); + imageServiceResponseAdapter.findOcr(page); } tableExtractionService.extractTables(cleanRulings, page); @@ -141,7 +142,7 @@ public class PdfSegmentationService { } - private void increaseDocumentStatistics(Page page, Document document) { + private void increaseDocumentStatistics(ClassificationPage page, ClassificationDocument document) { if (!page.isLandscape()) { document.getFontSizeCounter().addAll(page.getFontSizeCounter().getCountPerValue()); @@ -152,15 +153,15 @@ public class PdfSegmentationService { } - private void buildPageStatistics(Page page) { + private void buildPageStatistics(ClassificationPage page) { // Collect all statistics for the page, except from blocks inside tables, as tables will always be added to BodyTextFrame. 
- for (AbstractTextContainer textBlock : page.getTextBlocks()) { - if (textBlock instanceof TextBlock) { - if (((TextBlock) textBlock).getSequences() == null) { + for (AbstractPageBlock textBlock : page.getTextBlocks()) { + if (textBlock instanceof TextPageBlock) { + if (((TextPageBlock) textBlock).getSequences() == null) { continue; } - for (TextPositionSequence word : ((TextBlock) textBlock).getSequences()) { + for (TextPositionSequence word : ((TextPageBlock) textBlock).getSequences()) { page.getTextHeightCounter().add(word.getTextHeight()); page.getFontCounter().add(word.getFont()); page.getFontSizeCounter().add(word.getFontSize()); diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/service/RulingCleaningService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/RulingCleaningService.java similarity index 94% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/service/RulingCleaningService.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/RulingCleaningService.java index dcc9c498..25f88849 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/tableextraction/service/RulingCleaningService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/RulingCleaningService.java @@ -1,4 +1,4 @@ -package com.iqser.red.service.redaction.v1.server.tableextraction.service; +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service; import java.awt.geom.Line2D; import java.awt.geom.Point2D; @@ -12,11 +12,11 @@ import 
java.util.Map; import org.springframework.stereotype.Service; -import com.iqser.red.service.redaction.v1.server.redaction.model.PdfTableCell; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.adapter.table.PdfTableCell; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.CleanRulings; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Ruling; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.Utils; import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.Ruling; -import com.iqser.red.service.redaction.v1.server.tableextraction.utils.Utils; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/SectionsBuilderService.java similarity index 51% rename from redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java rename to redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/SectionsBuilderService.java index fddcb26f..48b7ccbf 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/SectionsBuilderService.java @@ 
-1,9 +1,8 @@ -package com.iqser.red.service.redaction.v1.server.segmentation; +package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; -import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.stream.Collectors; @@ -11,17 +10,18 @@ import java.util.stream.Collectors; import org.apache.commons.collections4.CollectionUtils; import org.springframework.stereotype.Service; -import com.iqser.red.service.redaction.v1.server.classification.model.Document; -import com.iqser.red.service.redaction.v1.server.classification.model.Footer; -import com.iqser.red.service.redaction.v1.server.classification.model.Header; -import com.iqser.red.service.redaction.v1.server.classification.model.Page; -import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph; -import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; -import com.iqser.red.service.redaction.v1.server.classification.model.UnclassifiedText; -import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; -import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationDocument; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationFooter; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationHeader; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage; +import 
com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationSection; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.PageBlockType; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.image.ClassifiedImage; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.Cell; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.table.TablePageBlock; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock; +import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.UnclassifiedText; import lombok.extern.slf4j.Slf4j; @@ -29,23 +29,23 @@ import lombok.extern.slf4j.Slf4j; @Service public class SectionsBuilderService { - public void buildSections(Document document) { + public void buildSections(ClassificationDocument document) { - List chunkWords = new ArrayList<>(); - List chunkBlockList = new ArrayList<>(); - List
headers = new ArrayList<>(); - List