diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/DocumentStructure.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/DocumentStructure.java index 523308c..4b26f52 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/DocumentStructure.java +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/DocumentStructure.java @@ -55,6 +55,13 @@ public class DocumentStructure implements Serializable { } + @Schema(description = "Object containing the extra field names, a duplicate paragraph has in its properties field.") + public static class DuplicateParagraphProperties implements Serializable { + + public static final String UNSORTED_TEXTBLOCK_ID = "utbid"; + + } + public static final String RECTANGLE_DELIMITER = ";"; diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingType.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingType.java index 7598d29..9d066c1 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingType.java +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingType.java @@ -2,6 +2,9 @@ package com.knecon.fforesight.service.layoutparser.internal.api.queue; public enum LayoutParsingType { REDACT_MANAGER, - TAAS, - DOCUMINE + REDACT_MANAGER_OLD, + REDACT_MANAGER_PARAGRAPH_DEBUG, + 
DOCUMINE, + CLARIFYND, + CLARIFYND_PARAGRAPH_DEBUG } diff --git a/layoutparser-service/layoutparser-service-processor/build.gradle.kts b/layoutparser-service/layoutparser-service-processor/build.gradle.kts index df746ba..e56f8b5 100644 --- a/layoutparser-service/layoutparser-service-processor/build.gradle.kts +++ b/layoutparser-service/layoutparser-service-processor/build.gradle.kts @@ -24,4 +24,5 @@ dependencies { implementation("com.fasterxml.jackson.module:jackson-module-afterburner:${jacksonVersion}") implementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:${jacksonVersion}") implementation("org.springframework.boot:spring-boot-starter-web:3.1.3") + implementation("org.jgrapht:jgrapht-core:1.5.2") } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index b3cdb6b..c391497 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -28,6 +28,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import 
com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; @@ -43,12 +44,11 @@ import com.knecon.fforesight.service.layoutparser.processor.services.RulingClean import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService; import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService; import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService; +import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService; import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService; import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService; -import com.knecon.fforesight.service.layoutparser.processor.services.blockification.TaasBlockificationService; import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService; import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService; -import com.knecon.fforesight.service.layoutparser.processor.services.classification.TaasClassificationService; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper; @@ -76,16 +76,15 @@ public class LayoutParsingPipeline { CvTableParsingAdapter cvTableParsingAdapter; LayoutParsingStorageService layoutParsingStorageService; SectionsBuilderService sectionsBuilderService; - TaasClassificationService taasClassificationService; RedactManagerClassificationService redactManagerClassificationService; DocuMineClassificationService docuMineClassificationService; SimplifiedSectionTextService 
simplifiedSectionTextService; BodyTextFrameService bodyTextFrameService; RulingCleaningService rulingCleaningService; TableExtractionService tableExtractionService; - TaasBlockificationService taasBlockificationService; DocuMineBlockificationService docuMineBlockificationService; RedactManagerBlockificationService redactManagerBlockificationService; + DocstrumBlockificationService docstrumBlockificationService; LayoutGridService layoutGridService; ObservationRegistry observationRegistry; VisualLayoutParsingAdapter visualLayoutParsingAdapter; @@ -97,40 +96,33 @@ public class LayoutParsingPipeline { log.info("Starting layout parsing for {}", layoutParsingRequest.identifier()); File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId()); - File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()) - .orElse(originFile); + File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile); VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse(); - if (layoutParsingRequest.visualLayoutParsingFileId() - .isPresent()) { - visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId() - .get()); + if (layoutParsingRequest.visualLayoutParsingFileId().isPresent()) { + visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId().get()); } ImageServiceResponse imageServiceResponse = new ImageServiceResponse(); - if (layoutParsingRequest.imagesFileStorageId() - .isPresent()) { - imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId() - .get()); + if (layoutParsingRequest.imagesFileStorageId().isPresent()) { + imageServiceResponse = 
layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get()); } TableServiceResponse tableServiceResponse = new TableServiceResponse(); - if (layoutParsingRequest.tablesFileStorageId() - .isPresent()) { - tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId() - .get()); + if (layoutParsingRequest.tablesFileStorageId().isPresent()) { + tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get()); } ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(), - originFile, - imageServiceResponse, - tableServiceResponse, - visualLayoutParsingResponse, - layoutParsingRequest.identifier().toString()); + originFile, + imageServiceResponse, + tableServiceResponse, + visualLayoutParsingResponse, + layoutParsingRequest.identifier().toString()); log.info("Building document graph for {}", layoutParsingRequest.identifier()); - Document documentGraph = observeBuildDocumentGraph(classificationDocument); + Document documentGraph = observeBuildDocumentGraph(layoutParsingRequest.layoutParsingType(), classificationDocument); log.info("Creating viewer document for {}", layoutParsingRequest.identifier()); @@ -142,7 +134,7 @@ public class LayoutParsingPipeline { layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph)); layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile); - if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.TAAS)) { + if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.CLARIFYND)) { log.info("Building research document data for {}", layoutParsingRequest.identifier()); var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentGraph); layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData); @@ 
-158,37 +150,37 @@ public class LayoutParsingPipeline { .numberOfPages(documentGraph.getNumberOfPages()) .duration(System.currentTimeMillis() - start) .message(format(""" - Layout parsing has finished in %.02f s. - identifiers: %s - %s - Files have been saved with Ids: - Structure: %s - Text: %s - Positions: %s - PageData: %s - Simplified Text: %s - Viewer Doc: %s""", - ((float) (System.currentTimeMillis() - start)) / 1000, - layoutParsingRequest.identifier(), - buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()), - layoutParsingRequest.structureFileStorageId(), - layoutParsingRequest.textBlockFileStorageId(), - layoutParsingRequest.positionBlockFileStorageId(), - layoutParsingRequest.pageFileStorageId(), - layoutParsingRequest.simplifiedTextStorageId(), - layoutParsingRequest.viewerDocumentStorageId())) + Layout parsing has finished in %.02f s. + identifiers: %s + %s + Files have been saved with Ids: + Structure: %s + Text: %s + Positions: %s + PageData: %s + Simplified Text: %s + Viewer Doc: %s""", + ((float) (System.currentTimeMillis() - start)) / 1000, + layoutParsingRequest.identifier(), + buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()), + layoutParsingRequest.structureFileStorageId(), + layoutParsingRequest.textBlockFileStorageId(), + layoutParsingRequest.positionBlockFileStorageId(), + layoutParsingRequest.pageFileStorageId(), + layoutParsingRequest.simplifiedTextStorageId(), + layoutParsingRequest.viewerDocumentStorageId())) .build(); } - private Document observeBuildDocumentGraph(ClassificationDocument classificationDocument) { + private Document observeBuildDocumentGraph(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument) { AtomicReference documentReference = new AtomicReference<>(); Observation.createNotStarted("LayoutParsingPipeline", observationRegistry) .contextualName("build-document-graph") - .observe(() -> 
documentReference.set(DocumentGraphFactory.buildDocumentGraph(classificationDocument))); + .observe(() -> documentReference.set(DocumentGraphFactory.buildDocumentGraph(layoutParsingType, classificationDocument))); return documentReference.get(); } @@ -197,14 +189,14 @@ public class LayoutParsingPipeline { private String buildSemanticNodeCountMessage(int numberOfPages, Map semanticNodeCounts) { return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed", - numberOfPages, - semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION), - semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE), - semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH), - semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE), - semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL), - semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER), - semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER)); + numberOfPages, + semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION), + semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE), + semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH), + semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE), + semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL), + semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER), + semanticNodeCounts.get(NodeType.FOOTER) == null ? 
0 : semanticNodeCounts.get(NodeType.FOOTER)); } @@ -260,11 +252,16 @@ public class LayoutParsingPipeline { PDRectangle cropbox = pdPage.getCropBox(); CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings()); + List emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical()); + ClassificationPage classificationPage = switch (layoutParsingType) { - case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); - case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); + case REDACT_MANAGER_OLD -> + redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); + case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, true); + case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false); }; + classificationPage.setCleanRulings(cleanRulings); classificationPage.setRotation(rotation); classificationPage.setLandscape(isLandscape); @@ -289,7 +286,13 @@ public class LayoutParsingPipeline { } } - tableExtractionService.extractTables(cleanRulings, classificationPage); + tableExtractionService.extractTables(emptyTableCells, classificationPage); + + if (layoutParsingType == LayoutParsingType.REDACT_MANAGER) { + docstrumBlockificationService.combineBlocks(classificationPage); + } else if (layoutParsingType == LayoutParsingType.CLARIFYND) { + 
docstrumBlockificationService.mergeZones(classificationPage.getTextBlocks()); + } buildPageStatistics(classificationPage); increaseDocumentStatistics(classificationPage, classificationDocument); @@ -303,14 +306,21 @@ public class LayoutParsingPipeline { bodyTextFrameService.setBodyTextFrames(classificationDocument, layoutParsingType); log.info("Classify TextBlocks for {}", identifier); switch (layoutParsingType) { - case TAAS -> taasClassificationService.classifyDocument(classificationDocument); + case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> + redactManagerClassificationService.classifyDocument(classificationDocument); case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument); - case REDACT_MANAGER -> redactManagerClassificationService.classifyDocument(classificationDocument); } log.info("Building Sections for {}", identifier); - sectionsBuilderService.buildSections(classificationDocument); - sectionsBuilderService.addImagesToSections(classificationDocument); + + switch (layoutParsingType) { + case CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_PARAGRAPH_DEBUG -> sectionsBuilderService.buildParagraphDebugSections(classificationDocument); + default -> { + sectionsBuilderService.buildSections(classificationDocument); + sectionsBuilderService.addImagesToSections(classificationDocument); + } + } + return classificationDocument; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/AbstractPageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/AbstractPageBlock.java index d113dfa..1f01f2f 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/AbstractPageBlock.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/AbstractPageBlock.java @@ -96,7 +96,7 @@ public abstract class AbstractPageBlock extends Rectangle { return this.minX - threshold <= apb.getMaxX() && this.maxX + threshold >= apb.getMinX(); } - + public abstract boolean isEmpty(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java index 23a3cd0..e31a27c 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java @@ -15,7 +15,6 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.No import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector; import lombok.AccessLevel; import lombok.AllArgsConstructor; @@ -52,7 +51,7 @@ public class Document implements GenericSemanticNode { public TextBlock getTextBlock() { if (textBlock == null) { - textBlock = streamTerminalTextBlocksInOrder().collect(new TextBlockCollector()); + textBlock = GenericSemanticNode.super.getTextBlock(); } return textBlock; } @@ -67,8 +66,7 @@ public class Document implements GenericSemanticNode { public Stream streamTerminalTextBlocksInOrder() { - return 
streamAllNodes().filter(SemanticNode::isLeaf) - .map(SemanticNode::getLeafTextBlock); + return streamAllNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getTextBlock); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/DuplicatedParagraph.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/DuplicatedParagraph.java new file mode 100644 index 0000000..93c2427 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/DuplicatedParagraph.java @@ -0,0 +1,34 @@ +package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes; + +import java.util.stream.Stream; + +import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector; + +import lombok.Data; +import lombok.EqualsAndHashCode; +import lombok.experimental.SuperBuilder; + +@Data +@EqualsAndHashCode(callSuper = true) +@SuperBuilder +public class DuplicatedParagraph extends Paragraph { + + TextBlock unsortedLeafTextBlock; + + + @Override + public TextBlock getTextBlock() { + + return Stream.of(leafTextBlock, unsortedLeafTextBlock).collect(new TextBlockCollector()); + + } + + + @Override + public String toString() { + + return super.toString(); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Paragraph.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Paragraph.java index 71010b6..dfcb4f9 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Paragraph.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Paragraph.java @@ -18,11 +18,12 @@ import lombok.Builder; import lombok.Data; import lombok.EqualsAndHashCode; import lombok.experimental.FieldDefaults; +import lombok.experimental.SuperBuilder; @Data -@Builder +@SuperBuilder @AllArgsConstructor -@FieldDefaults(level = AccessLevel.PRIVATE) +@FieldDefaults(level = AccessLevel.PROTECTED) public class Paragraph implements GenericSemanticNode { @Builder.Default diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Section.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Section.java index 6680c01..3a59884 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Section.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Section.java @@ -11,7 +11,6 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.No import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector; import lombok.AccessLevel; import lombok.AllArgsConstructor; @@ -62,9 +61,7 @@ public class Section implements GenericSemanticNode { public TextBlock 
getTextBlock() { if (textBlock == null) { - textBlock = streamAllSubNodes().filter(SemanticNode::isLeaf) - .map(SemanticNode::getLeafTextBlock) - .collect(new TextBlockCollector()); + textBlock = GenericSemanticNode.super.getTextBlock(); } return textBlock; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SemanticNode.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SemanticNode.java index e35a83e..32369e6 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SemanticNode.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SemanticNode.java @@ -20,6 +20,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.E import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; public interface SemanticNode { @@ -39,7 +40,10 @@ public interface SemanticNode { * * @return TextBlock containing all AtomicTextBlocks that are located under this Node. 
*/ - TextBlock getTextBlock(); + default TextBlock getTextBlock() { + + return streamAllSubNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getTextBlock).collect(new TextBlockCollector()); + } /** diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java index 5bed37b..18f3ef5 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java @@ -48,7 +48,6 @@ public class Table implements SemanticNode { @EqualsAndHashCode.Exclude Map bBoxCache; - /** * Streams all entities in this table, that appear in a row, which contains any of the provided strings. 
* @@ -332,9 +331,7 @@ public class Table implements SemanticNode { public TextBlock getTextBlock() { if (textBlock == null) { - textBlock = streamAllSubNodes().filter(SemanticNode::isLeaf) - .map(SemanticNode::getLeafTextBlock) - .collect(new TextBlockCollector()); + textBlock = SemanticNode.super.getTextBlock(); } return textBlock; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java index 0442af6..6323205 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java @@ -53,6 +53,9 @@ public class TextPageBlock extends AbstractPageBlock { @JsonIgnore private PageBlockType classification; + @JsonIgnore + private boolean toDuplicate; + @JsonIgnore public TextDirection getDir() { @@ -73,7 +76,7 @@ public class TextPageBlock extends AbstractPageBlock { return sequences.get(0).getPageWidth(); } - + public static TextPageBlock merge(List textBlocksToMerge) { @@ -82,6 +85,7 @@ public class TextPageBlock extends AbstractPageBlock { return fromTextPositionSequences(sequences); } + public static TextPageBlock fromTextPositionSequences(List wordBlockList) { TextPageBlock textBlock = null; @@ -133,7 +137,6 @@ public class TextPageBlock extends AbstractPageBlock { } - /** * Returns the minX value in pdf coordinate system. * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation. 
@@ -362,7 +365,22 @@ public class TextPageBlock extends AbstractPageBlock { } return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString()); + } + + public int getNumberOfLines() { + + int numberOfLines = 1; + TextPositionSequence previous = null; + for (TextPositionSequence word : sequences) { + if (previous != null) { + if (word.getMaxYDirAdj() - previous.getMaxYDirAdj() > word.getTextHeight()) { + numberOfLines++; + } + } + previous = word; + } + return numberOfLines; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java index 82829c6..dc77c45 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java @@ -55,6 +55,17 @@ public class TextPositionSequence implements CharSequence { } + public TextPositionSequence(List textPositions, int page) { + + this.textPositions = textPositions; + this.page = page; + this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir()); + this.rotation = textPositions.get(0).getRotation(); + this.pageHeight = textPositions.get(0).getPageHeight(); + this.pageWidth = textPositions.get(0).getPageWidth(); + } + + @Override public int length() { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java index 5942443..3cd09a8 
100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java @@ -25,6 +25,7 @@ public class BodyTextFrameService { private static final float RULING_HEIGHT_THRESHOLD = 0.15f; // multiplied with page height. Header/Footer Rulings must be within that border of the page. private static final float RULING_WIDTH_THRESHOLD = 0.75f; // multiplied with page width. Header/Footer Rulings must be at least that wide. + public void setBodyTextFrames(ClassificationDocument classificationDocument, LayoutParsingType layoutParsingType) { Rectangle bodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), false, layoutParsingType); @@ -132,12 +133,7 @@ public class BodyTextFrameService { boolean landscape, LayoutParsingType layoutParsingType) { - float approximateHeaderLineCount; - if (layoutParsingType.equals(LayoutParsingType.TAAS)) { - approximateHeaderLineCount = 3.3f; - } else { - approximateHeaderLineCount = 2.9f; - } + float approximateHeaderLineCount = 2.9f; BodyTextFrameExpansionsRectangle expansionsRectangle = new BodyTextFrameExpansionsRectangle(); @@ -155,8 +151,9 @@ public class BodyTextFrameService { continue; } - if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) - || MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)) { + if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || MarkedContentUtils.intersects(textBlock, + page.getMarkedContentBboxPerType(), + MarkedContentUtils.FOOTER)) { continue; } diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java index 04cc930..c9f6e93 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java @@ -7,6 +7,7 @@ import java.util.List; import java.util.Map; import java.util.stream.Collectors; +import org.apache.logging.log4j.util.Strings; import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; @@ -110,6 +111,20 @@ public class SectionsBuilderService { } + public void buildParagraphDebugSections(ClassificationDocument document) { + + List sections = new ArrayList<>(); + for (var page : document.getPages()) { + page.getTextBlocks().forEach(block -> { + block.setPage(page.getPageNumber()); + var section = buildTextBlock(List.of(block), Strings.EMPTY); + sections.add(section); + }); + } + document.setSections(sections); + } + + public void addImagesToSections(ClassificationDocument document) { Map> sectionMap = new HashMap<>(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java index 2827153..4af2a04 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java @@ -14,7 +14,6 @@ import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; -import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; @@ -41,19 +40,18 @@ public class TableExtractionService { *

* DirAdj (Text direction adjusted) values can not be used here. * - * @param cleanRulings The lines used to build the table. - * @param page Page object that contains textblocks and statistics. + * @param emptyCells The cells used to build the table. + * @param page Page object that contains textblocks and statistics. */ - public void extractTables(CleanRulings cleanRulings, ClassificationPage page) { + public void extractTables(List emptyCells, ClassificationPage page) { - List cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical()); // sort cells by size (height * width) ascending so that textBlocks are always assigned to the smallest cells that contain them - cells.sort(CELL_SIZE_COMPARATOR); + emptyCells.sort(CELL_SIZE_COMPARATOR); for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) { TextPageBlock textBlock = (TextPageBlock) abstractPageBlock; - for (Cell cell : cells) { + for (Cell cell : emptyCells) { if (cell.hasMinimumSize() && doesCellContainTextBlock(cell, textBlock)) { cell.addTextBlock(textBlock); break; @@ -61,7 +59,7 @@ public class TableExtractionService { } } - cells = new ArrayList<>(new HashSet<>(cells)); + var cells = new ArrayList<>(new HashSet<>(emptyCells)); DoubleComparisons.sort(cells, Rectangle.ILL_DEFINED_ORDER); List spreadsheetAreas = SpreadsheetFinder.findSpreadsheetsFromCells(cells); @@ -79,9 +77,7 @@ public class TableExtractionService { } } - var containedCellsWithText = containedCells.stream() - .filter(cell -> !cell.getTextBlocks().isEmpty()) - .toList(); + var containedCellsWithText = containedCells.stream().filter(cell -> !cell.getTextBlocks().isEmpty()).toList(); // verify if table would contain fewer cells with text than the threshold allows if (containedCellsWithText.size() >= MAX_TABLE_CONTAINED_CELLS_WITH_TEXT && checkIfTableCellsAreUniform(containedCells)) { @@ -101,11 +97,7 @@ public class TableExtractionService { if (position != -1) { page.getTextBlocks().add(position, table); - var 
toBeRemoved = table.getCells() - .stream() - .map(Cell::getTextBlocks) - .flatMap(List::stream) - .toList(); + var toBeRemoved = table.getCells().stream().map(Cell::getTextBlocks).flatMap(List::stream).toList(); // remove text blocks from the page that were also added with the table (from its contained cells) page.getTextBlocks().removeAll(toBeRemoved); } @@ -115,7 +107,7 @@ public class TableExtractionService { private boolean checkIfTableCellsAreUniform(List containedCells) { - if(containedCells.size() <= 2) { + if (containedCells.size() <= 2) { return true; } @@ -139,19 +131,13 @@ public class TableExtractionService { } double x0 = cell.getX(); double y0 = cell.getY(); - return (x >= x0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE - && y >= y0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE - && (x + w) <= x0 + cell.getWidth() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE - && (y + h) <= y0 + cell.getHeight() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE); + return (x >= x0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE && y >= y0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE && (x + w) <= x0 + cell.getWidth() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE && (y + h) <= y0 + cell.getHeight() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE); } public static List findCells(List horizontalRulingLines, List verticalRulingLines) { - return RectangularIntersectionFinder.find(horizontalRulingLines, verticalRulingLines) - .stream() - .map(Cell::new) - .collect(Collectors.toList()); + return RectangularIntersectionFinder.find(horizontalRulingLines, verticalRulingLines).stream().map(Cell::new).collect(Collectors.toList()); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java new file mode 100644 index 0000000..82c36c9 --- /dev/null 
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java @@ -0,0 +1,408 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.blockification; + +import static java.util.stream.Collectors.toSet; + +import java.awt.geom.Point2D; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.HashSet; +import java.util.List; +import java.util.ListIterator; +import java.util.Set; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.DocstrumSegmentationService; +import com.knecon.fforesight.service.layoutparser.processor.utils.QuickSort; +import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil; +import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionSequenceComparator; + +import lombok.RequiredArgsConstructor; + +@SuppressWarnings("all") +@Service +@RequiredArgsConstructor +public class DocstrumBlockificationService { + + private final DocstrumSegmentationService docstrumSegmentationService; + + static 
final float THRESHOLD = 1f; + + + public ClassificationPage blockify(List textPositions, List cells, boolean xyOrder) { + + // Underlined or strikethrough are also in rulings but we dont want to split blocks with them so we use cells. + List usedHorizonalRulings = new ArrayList<>(); + List usedVerticalRulings = new ArrayList<>(); + + cells.forEach(cell -> { + usedHorizonalRulings.add(new Ruling(new Point2D.Float(cell.x, cell.y), new Point2D.Float(cell.x + cell.width, cell.y))); + usedHorizonalRulings.add(new Ruling(new Point2D.Float(cell.x, cell.y + cell.height), new Point2D.Float(cell.x + cell.width, cell.y + cell.height))); + usedVerticalRulings.add(new Ruling(new Point2D.Float(cell.x, cell.y), new Point2D.Float(cell.x, cell.y + cell.height))); + usedVerticalRulings.add(new Ruling(new Point2D.Float(cell.x + cell.width, cell.y), new Point2D.Float(cell.x + cell.width, cell.y + cell.height))); + }); + + List abstractPageBlocks = new ArrayList<>(); + var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder); + zones.forEach(zone -> { + + List textPositionSequences = new ArrayList<>(); + zone.getLines().forEach(line -> { + line.getWords().forEach(word -> { + textPositionSequences.add(new TextPositionSequence(word.getTextPositions(), word.getPage())); + }); + }); + + abstractPageBlocks.addAll(splitZonesAtRulings(textPositionSequences, usedHorizonalRulings, usedVerticalRulings)); + }); + + return new ClassificationPage(abstractPageBlocks); + } + + + public void combineBlocks(ClassificationPage page) { + + mergeZones(page.getTextBlocks()); + + TextPageBlock previous = new TextPageBlock(); + ListIterator itty = page.getTextBlocks().listIterator(); + while (itty.hasNext()) { + AbstractPageBlock block = itty.next(); + if (block instanceof TablePageBlock) { + continue; + } + TextPageBlock current = (TextPageBlock) block; + + if (previous != null && !previous.getSequences().isEmpty()) { + + if (current.getDir() == previous.getDir() // + && 
previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 // + && previous.intersectsY(current) // + && !hasBetween(current, previous, page.getTextBlocks()) // + && numberOfYIntersections(current, previous, page.getTextBlocks()) == 0) { + + previous.getSequences().addAll(current.getSequences()); + previous = buildTextBlock(previous.getSequences(), 0); + previous.setToDuplicate(true); + itty.remove(); + itty.previous(); + itty.set(previous); + itty.next(); + continue; + } + + if (current.getDir() == previous.getDir() && (previous.almostIntersects(current, 0, 0))) { + + previous.getSequences().addAll(current.getSequences()); + boolean toDuplicate = previous.isToDuplicate(); + previous = buildTextBlock(previous.getSequences(), 0); + previous.setToDuplicate(toDuplicate); + itty.remove(); + itty.previous(); + itty.set(previous); + itty.next(); + continue; + } + + if (current.getDir() == previous.getDir() // + && (Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) // + && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1) // + && !hasBetween(current, previous, page.getTextBlocks()) && numberOfYIntersections(current, previous, page.getTextBlocks()) <= 4) { + + previous.getSequences().addAll(current.getSequences()); + previous = buildTextBlock(previous.getSequences(), 0); + itty.remove(); + itty.previous(); + itty.set(previous); + itty.next(); + continue; + } + + if (current.getDir() == previous.getDir() // + && current.intersectsY(previous) // + && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1) // + && !hasBetween(current, previous, page.getTextBlocks()) // + && numberOfYIntersections(current, previous, page.getTextBlocks()) <= 0) { + previous.getSequences().addAll(current.getSequences()); + previous = 
buildTextBlock(previous.getSequences(), 0); + itty.remove(); + itty.previous(); + itty.set(previous); + itty.next(); + continue; + } + + } + previous = current; + } + + mergeZones(page.getTextBlocks()); + + } + + + private boolean hasBetween(TextPageBlock block, TextPageBlock other, List allBlocks) { + + for (AbstractPageBlock current : allBlocks) { + + if (current == other || current == block) { + continue; + } + + if (other.intersectsY(current) && other.getMaxX() <= current.getMinX() && current.getMaxX() <= block.getMinX()) { + return true; + } + } + + return false; + } + + + private int numberOfYIntersections(TextPageBlock block, TextPageBlock other, List allBlocks) { + + double minY = Math.min(block.getMinY(), other.getMinY()); + double maxY = Math.min(block.getMaxY(), other.getMaxY()); + + int numberOfYIntersections = 0; + for (AbstractPageBlock current : allBlocks) { + + if (current == other || current == block) { + continue; + } + + if (minY <= current.getMaxY() && maxY >= current.getMinY()) { + numberOfYIntersections++; + } + } + + return numberOfYIntersections; + } + + + public void mergeZones(List zones) { + + ListIterator itty = zones.listIterator(); + Set toRemove = new HashSet<>(); + while (itty.hasNext()) { + AbstractPageBlock block = itty.next(); + if (block instanceof TablePageBlock) { + continue; + } + + TextPageBlock current = (TextPageBlock) block; + + if (current.isToDuplicate()) { + continue; + } + + for (int i = 0; i < zones.size(); i++) { + + if (toRemove.contains(zones.get(i))) { + continue; + } + if (zones.get(i) == current) { + continue; + } + if (zones.get(i) instanceof TablePageBlock) { + continue; + } + + TextPageBlock inner = (TextPageBlock) zones.get(i); + + if (inner.isToDuplicate()) { + continue; + } + + if (current.getDir() == inner.getDir() && current.almostIntersects(inner, 0, 0)) { + + current.getSequences().addAll(inner.getSequences()); + QuickSort.sort(current.getSequences(), new TextPositionSequenceComparator()); + current = 
buildTextBlock(current.getSequences(), 0); + toRemove.add(inner); + itty.set(current); + } + } + } + zones.removeAll(toRemove); + } + + + public List splitZonesAtRulings(List textPositions, List horizontalRulingLines, List verticalRulingLines) { + + int indexOnPage = 0; + List chunkWords = new ArrayList<>(); + List chunkBlockList = new ArrayList<>(); + + float minX = 1000, maxX = 0, minY = 1000, maxY = 0; + TextPositionSequence prev = null; + + for (TextPositionSequence word : textPositions) { + + boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines); + boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir()); + + if (prev != null && (splitByDir || isSplitByRuling)) { + + TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage); + indexOnPage++; + + chunkBlockList.add(cb1); + chunkWords = new ArrayList<>(); + + minX = 1000; + maxX = 0; + minY = 1000; + maxY = 0; + prev = null; + } + + chunkWords.add(word); + + prev = word; + if (word.getMinXDirAdj() < minX) { + minX = word.getMinXDirAdj(); + } + if (word.getMaxXDirAdj() > maxX) { + maxX = word.getMaxXDirAdj(); + } + if (word.getMinYDirAdj() < minY) { + minY = word.getMinYDirAdj(); + } + if (word.getMaxYDirAdj() > maxY) { + maxY = word.getMaxYDirAdj(); + } + } + + TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage); + if (cb1 != null) { + chunkBlockList.add(cb1); + } + + return chunkBlockList; + } + + + private boolean equalsWithThreshold(float f1, float f2) { + + return Math.abs(f1 - f2) < THRESHOLD; + } + + + private TextPageBlock buildTextBlock(List wordBlockList, int indexOnPage) { + + TextPageBlock textBlock = null; + + FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter(); + FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter(); + FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter(); + StringFrequencyCounter fontFrequencyCounter = new 
StringFrequencyCounter(); + StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter(); + + for (TextPositionSequence wordBlock : wordBlockList) { + + lineHeightFrequencyCounter.add(wordBlock.getTextHeight()); + fontSizeFrequencyCounter.add(wordBlock.getFontSize()); + spaceFrequencyCounter.add(wordBlock.getSpaceWidth()); + fontFrequencyCounter.add(wordBlock.getFont()); + styleFrequencyCounter.add(wordBlock.getFontStyle()); + + if (textBlock == null) { + textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(), + wordBlock.getMaxXDirAdj(), + wordBlock.getMinYDirAdj(), + wordBlock.getMaxYDirAdj(), + wordBlockList, + wordBlock.getRotation()); + } else { + TextPageBlock spatialEntity = textBlock.union(wordBlock); + textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight()); + } + } + + if (textBlock != null) { + textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular()); + textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest()); + } + + if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) { + textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj)); + } + return textBlock; + } + + + private boolean isSplitByRuling(float minX, + float minY, + float maxX, + float maxY, + TextPositionSequence word, + List horizontalRulingLines, + List verticalRulingLines) { + + return isSplitByRuling(maxX, + minY, + word.getMinXDirAdj(), + word.getMinYDirAdj(), + verticalRulingLines, + word.getDir().getDegrees(), + 
word.getPageWidth(), + word.getPageHeight()) // + || isSplitByRuling(minX, + minY, + word.getMinXDirAdj(), + word.getMaxYDirAdj(), + horizontalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()) // + || isSplitByRuling(maxX, + minY, + word.getMinXDirAdj(), + word.getMinYDirAdj(), + horizontalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()) // + || isSplitByRuling(minX, + minY, + word.getMinXDirAdj(), + word.getMaxYDirAdj(), + verticalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()); + } + + + private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List rulingLines, float dir, float pageWidth, float pageHeight) { + + for (Ruling ruling : rulingLines) { + var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight); + if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) { + return true; + } + } + return false; + } + + + private double round(float value, int decimalPoints) { + + var d = Math.pow(10, decimalPoints); + return Math.round(value * d) / d; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/TaasBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/TaasBlockificationService.java deleted file mode 100644 index 287d2ba..0000000 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/TaasBlockificationService.java +++ /dev/null @@ -1,330 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.processor.services.blockification; - - -// TODO: figure out, why this fails the build -// import static 
com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory.HEIGHT_PADDING; - -import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; -import com.knecon.fforesight.service.layoutparser.processor.model.Orientation; -import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; -import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil; -import org.springframework.stereotype.Service; - -import java.util.*; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import java.util.stream.Stream; - -@Service -@SuppressWarnings("all") -public class TaasBlockificationService { - - private static final float THRESHOLD = 1f; - private static final float Y_GAP_SPLIT_HEIGHT_MODIFIER = 1.25f; // multiplied with text height - private static final float INTERSECTS_Y_THRESHOLD = 4;// 2 * HEIGHT_PADDING // This is exactly 2 times our position height padding. This is required to find boxes that are visually intersecting. - private static final int X_GAP_SPLIT_CONSTANT = 50; - public static final int X_ALIGNMENT_THRESHOLD = 1; - public static final int NEGATIVE_X_GAP_THRESHOLD = -5; - - private Pattern listIdentifier = Pattern.compile("^(?:(?:[1-9]|1\\d|20|[ivxlc]|[a-z])\\s*(?:[.)]))|\\uF0B7", Pattern.CASE_INSENSITIVE); - - - /** - * This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions. - * This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this! 
- * Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling. - * - * @param textPositions The words of a page. - * @param horizontalRulingLines Horizontal table lines. - * @param verticalRulingLines Vertical table lines. - * @return ClassificationPage object that contains the Textblock and text statistics. - */ - public ClassificationPage blockify(List textPositions, List horizontalRulingLines, List verticalRulingLines) { - - List classificationTextBlocks = constructFineGranularTextPageBlocks(textPositions, horizontalRulingLines, verticalRulingLines); - classificationTextBlocks = mergeTextPageBlocksAligningX(classificationTextBlocks); - classificationTextBlocks = mergeIntersectingTextBlocksUntilConvergence(classificationTextBlocks); - - return new ClassificationPage(new ArrayList<>(classificationTextBlocks.stream().map(classificationTextBlock -> (AbstractPageBlock) classificationTextBlock).toList())); - } - - - private List mergeIntersectingTextBlocksUntilConvergence(List classificationTextBlocks) { - - int currentSize = classificationTextBlocks.size(); - while (true) { - classificationTextBlocks = mergeTextPageBlocksAlmostIntersecting(classificationTextBlocks); - if (classificationTextBlocks.size() == currentSize) { - break; - } - currentSize = classificationTextBlocks.size(); - } - return classificationTextBlocks; - } - - - private List mergeTextPageBlocksAligningX(List classificationTextBlocks) { - - if (classificationTextBlocks.isEmpty()) { - return new ArrayList<>(); - } - List> textBlocksToMerge = new LinkedList<>(); - List currentTextBlocksToMerge = new LinkedList<>(); - textBlocksToMerge.add(currentTextBlocksToMerge); - TextPageBlock previousTextBlock = null; - Float lastLineGap = null; - for (TextPageBlock currentTextBlock : classificationTextBlocks) { - if (previousTextBlock == null) { - currentTextBlocksToMerge.add(currentTextBlock); - previousTextBlock = currentTextBlock; - continue; - } - - - 
Matcher listIdentifierPattern = listIdentifier.matcher(currentTextBlock.getText()); - boolean isListIdentifier = listIdentifierPattern.find(); - - boolean yGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < previousTextBlock.getMostPopularWordHeight() * Y_GAP_SPLIT_HEIGHT_MODIFIER; - - boolean sameFont = previousTextBlock.getMostPopularWordFont().equals(currentTextBlock.getMostPopularWordFont()) && previousTextBlock.getMostPopularWordFontSize() == currentTextBlock.getMostPopularWordFontSize(); -// boolean yGap = previousTextBlock != null && currentTextBlock.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * Y_GAP_SPLIT_HEIGHT_MODIFIER; - - boolean alignsXRight = Math.abs(currentTextBlock.getPdfMaxX() - previousTextBlock.getPdfMaxX()) < X_ALIGNMENT_THRESHOLD; - boolean alignsXLeft = Math.abs(currentTextBlock.getPdfMinX() - previousTextBlock.getPdfMinX()) < X_ALIGNMENT_THRESHOLD; -// boolean smallYGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < yGap; - if (yGap && sameFont && !isListIdentifier) { - currentTextBlocksToMerge.add(currentTextBlock); - - } else { - currentTextBlocksToMerge = new LinkedList<>(); - currentTextBlocksToMerge.add(currentTextBlock); - textBlocksToMerge.add(currentTextBlocksToMerge); - } - previousTextBlock = currentTextBlock; - } - return textBlocksToMerge.stream().map(TextPageBlock::merge).toList(); - } - - - private List mergeTextPageBlocksAlmostIntersecting(List textPageBlocks) { - - Set alreadyMerged = new HashSet<>(); - List> textBlocksToMerge = new LinkedList<>(); - for (TextPageBlock textPageBlock : textPageBlocks) { - if (alreadyMerged.contains(textPageBlock)) { - continue; - } - alreadyMerged.add(textPageBlock); - textBlocksToMerge.add(Stream.concat(Stream.of(textPageBlock), - textPageBlocks.stream().filter(textPageBlock2 -> textPageBlock.almostIntersects(textPageBlock2, INTERSECTS_Y_THRESHOLD, 0) && 
!alreadyMerged.contains(textPageBlock2)).peek(alreadyMerged::add)) - .toList()); - } - return textBlocksToMerge.stream().map(TextPageBlock::merge).toList(); - } - - - private void assignOrientations(List classificationTextBlocks) { - - Iterator itty = classificationTextBlocks.iterator(); - - TextPageBlock previousLeft = null; - TextPageBlock previousRight = null; - while (itty.hasNext()) { - TextPageBlock block = (TextPageBlock) itty.next(); - - if (previousLeft != null && block.getOrientation().equals(Orientation.LEFT)) { - if (previousLeft.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousLeft.getMinY()) { - previousLeft.add(block); - itty.remove(); - continue; - } - } - - if (previousRight != null && block.getOrientation().equals(Orientation.RIGHT)) { - if (previousRight.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousRight.getMinY()) { - previousRight.add(block); - itty.remove(); - continue; - } - } - - if (block.getOrientation().equals(Orientation.LEFT)) { - previousLeft = block; - } else if (block.getOrientation().equals(Orientation.RIGHT)) { - previousRight = block; - } - } - - itty = classificationTextBlocks.iterator(); - TextPageBlock previous = null; - while (itty.hasNext()) { - TextPageBlock block = (TextPageBlock) itty.next(); - - if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold( - block.getMaxY(), - previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation() - .equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) { - previous.add(block); - itty.remove(); - continue; - } - - previous = block; - } - } - - - private List constructFineGranularTextPageBlocks(List textPositions, - List horizontalRulingLines, - List verticalRulingLines) { - - int indexOnPage = 0; - List 
wordClusterToCombine = new ArrayList<>(); - List classificationTextBlocks = new ArrayList<>(); - - float minX = 1000, maxX = 0, minY = 1000, maxY = 0; - TextPositionSequence prev = null; - // TODO: make static final constant - - - boolean wasSplitted = false; - Float splitX1 = null; - for (TextPositionSequence word : textPositions) { - - Matcher listIdentifierPattern = listIdentifier.matcher(word.toString()); - - boolean yGap = prev != null && word.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * Y_GAP_SPLIT_HEIGHT_MODIFIER; - boolean sameLine = prev != null && equalsWithThreshold(prev.getMinYDirAdj(), word.getMinYDirAdj()); - boolean positiveXGapInline = prev != null && maxX + X_GAP_SPLIT_CONSTANT < word.getMinXDirAdj() && sameLine; - boolean negativeXGap = prev != null && word.getMinXDirAdj() - minX < NEGATIVE_X_GAP_THRESHOLD; - boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight(); - boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj(); - boolean splitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines); - boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir()); - boolean fontChange = prev != null && (!word.getFont().equals(prev.getFont()) || !word.getFontStyle() - .equals(prev.getFontStyle()) || word.getFontSize() != prev.getFontSize()); - boolean newline = prev != null && Math.abs(word.getMinYDirAdj() - prev.getMinYDirAdj()) > word.getHeight(); - boolean isListIdentifier = listIdentifierPattern.matches(); - - if (prev != null && (prev.isParagraphStart() || negativeXGap || positiveXGapInline || yGap || startFromTop || splitByRuling || (newline && (fontChange || isListIdentifier)))) { -// if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) { - - Orientation prevOrientation = null; - if 
(!classificationTextBlocks.isEmpty()) { - prevOrientation = classificationTextBlocks.get(classificationTextBlocks.size() - X_ALIGNMENT_THRESHOLD).getOrientation(); - } - - TextPageBlock classificationTextBlock = TextPageBlock.fromTextPositionSequences(wordClusterToCombine); - - classificationTextBlocks.add(classificationTextBlock); - wordClusterToCombine = new ArrayList<>(); - - if (positiveXGapInline && !splitByRuling) { - wasSplitted = true; - classificationTextBlock.setOrientation(Orientation.LEFT); - splitX1 = word.getMinXDirAdj(); - } else if (newLineAfterSplit && !splitByRuling) { - wasSplitted = false; - classificationTextBlock.setOrientation(Orientation.RIGHT); - splitX1 = null; - } else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (yGap || !startFromTop || !positiveXGapInline || !newLineAfterSplit || !splitByRuling)) { - classificationTextBlock.setOrientation(Orientation.LEFT); - } - - minX = 1000; - maxX = 0; - minY = 1000; - maxY = 0; - prev = null; - } - - wordClusterToCombine.add(word); - - prev = word; - if (word.getMinXDirAdj() < minX) { - minX = word.getMinXDirAdj(); - } - if (word.getMaxXDirAdj() > maxX) { - maxX = word.getMaxXDirAdj(); - } - if (word.getMinYDirAdj() < minY) { - minY = word.getMinYDirAdj(); - } - if (word.getMaxYDirAdj() > maxY) { - maxY = word.getMaxYDirAdj(); - } - } - - TextPageBlock classificationTextBlock = TextPageBlock.fromTextPositionSequences(wordClusterToCombine); - if (classificationTextBlock != null) { - classificationTextBlocks.add(classificationTextBlock); - } - return classificationTextBlocks; - } - - - private boolean equalsWithThreshold(float f1, float f2) { - - return Math.abs(f1 - f2) < THRESHOLD; - } - - - private boolean isSplitByRuling(float minX, - float minY, - float maxX, - float maxY, - TextPositionSequence word, - List horizontalRulingLines, - List verticalRulingLines) { - - return isSplitByRuling(maxX, - minY, - word.getMinXDirAdj(), - word.getMinYDirAdj(), - 
verticalRulingLines, - word.getDir().getDegrees(), - word.getPageWidth(), - word.getPageHeight()) // - || isSplitByRuling(minX, - minY, - word.getMinXDirAdj(), - word.getMaxYDirAdj(), - horizontalRulingLines, - word.getDir().getDegrees(), - word.getPageWidth(), - word.getPageHeight()) // - || isSplitByRuling(maxX, - minY, - word.getMinXDirAdj(), - word.getMinYDirAdj(), - horizontalRulingLines, - word.getDir().getDegrees(), - word.getPageWidth(), - word.getPageHeight()) // - || isSplitByRuling(minX, - minY, - word.getMinXDirAdj(), - word.getMaxYDirAdj(), - verticalRulingLines, - word.getDir().getDegrees(), - word.getPageWidth(), - word.getPageHeight()); // - } - - - private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List rulingLines, float dir, float pageWidth, float pageHeight) { - - for (Ruling ruling : rulingLines) { - var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight); - if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) { - return true; - } - } - return false; - } - -} - diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TaasClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TaasClassificationService.java deleted file mode 100644 index 7a91be1..0000000 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TaasClassificationService.java +++ /dev/null @@ -1,114 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.processor.services.classification; - -import java.util.List; -import java.util.regex.Pattern; - -import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; -import 
org.springframework.stereotype.Service; - -import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; -import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; -import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService; -import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; - -import lombok.RequiredArgsConstructor; -import lombok.extern.slf4j.Slf4j; - -@Slf4j -@Service -@RequiredArgsConstructor -public class TaasClassificationService { - - private final BodyTextFrameService bodyTextFrameService; - - - public void classifyDocument(ClassificationDocument document) { - - - List headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular(); - - log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue()); - - for (ClassificationPage page : document.getPages()) { - - classifyPage(page, document, headlineFontSizes); - } - } - - - public void classifyPage(ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { - - for (AbstractPageBlock textBlock : page.getTextBlocks()) { - if (textBlock instanceof TextPageBlock) { - classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes); - } - } - } - - - public void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { - - var bodyTextFrame = page.getBodyTextFrame(); - - if (document.getFontSizeCounter().getMostPopular() == null) { - textBlock.setClassification(PageBlockType.OTHER); - return; - } - if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), 
MarkedContentUtils.HEADER) - || PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) { - textBlock.setClassification(PageBlockType.HEADER); - } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) - || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) { - textBlock.setClassification(PageBlockType.FOOTER); - } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, - document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks() - .size() == 1)) { - if (!Pattern.matches("[0-9]+", textBlock.toString())) { - textBlock.setClassification(PageBlockType.TITLE); - } - } else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter() - .getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && (textBlock.getMostPopularWordStyle().equals("bold") || !document.getFontStyleCounter() - .getCountPerValue() - .containsKey("bold") && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1) && textBlock.getSequences() - .get(0) - .getTextPositions() - .get(0) - .getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { - - for (int i = 1; i <= headlineFontSizes.size(); i++) { - if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) { - textBlock.setClassification(PageBlockType.getHeadlineType(i)); - document.setHeadlines(true); - } - } - } else if (!textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle() - .equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences() - .get(0) - .getTextPositions() - .get(0) - .getFontSizeInPt() 
>= textBlock.getMostPopularWordFontSize()) { - textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1)); - document.setHeadlines(true); - } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter() - .getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) { - textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD); - } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont() - .equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle() - .equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) { - textBlock.setClassification(PageBlockType.PARAGRAPH); - } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter() - .getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter() - .getMostPopular() - .equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) { - textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC); - } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) { - textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN); - } else { - textBlock.setClassification(PageBlockType.OTHER); - } - } - -} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmentationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmentationService.java new file mode 100644 index 0000000..019b4a8 --- /dev/null +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmentationService.java @@ -0,0 +1,59 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.docstrum; + +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.LineBuilderService; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.NearestNeighbourService; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.ReadingOrderService; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.SpacingService; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.ZoneBuilderService; + +import lombok.RequiredArgsConstructor; + +@Service +@RequiredArgsConstructor +public class DocstrumSegmentationService { + + private final NearestNeighbourService nearestNeighbourService; + private final SpacingService spacingService; + private final LineBuilderService lineBuilderService; + private final ZoneBuilderService zoneBuilderService; + private final ReadingOrderService readingOrderService; + + + public List segmentPage(List textPositions, boolean xyOrder) { + + List zones = new ArrayList<>(); + zones.addAll(computeZones(textPositions, TextDirection.ZERO)); + zones.addAll(computeZones(textPositions, TextDirection.QUARTER_CIRCLE)); + 
zones.addAll(computeZones(textPositions, TextDirection.HALF_CIRCLE)); + zones.addAll(computeZones(textPositions, TextDirection.THREE_QUARTER_CIRCLE)); + + return readingOrderService.resolve(zones, xyOrder); + } + + + private List computeZones(List textPositions, TextDirection direction) { + + var positions = textPositions.stream().filter(t -> t.getDir() == direction).map(TextPositionSequence::getTextPositions).flatMap(List::stream).toList(); + + var characters = positions.stream().map(Character::new).collect(Collectors.toList()); + + nearestNeighbourService.findNearestNeighbors(characters); + + var characterSpacing = spacingService.computeCharacterSpacing(characters); + var lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20); + + var lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing); + return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/AngleFilter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/AngleFilter.java new file mode 100644 index 0000000..c7fd0a6 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/AngleFilter.java @@ -0,0 +1,25 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model; + +public class AngleFilter { + + protected double lowerAngle; + protected double upperAngle; + + + public AngleFilter(double lowerAngle, double upperAngle) { + + this.lowerAngle = lowerAngle < -Math.PI / 2 ? lowerAngle + Math.PI : lowerAngle; + this.upperAngle = upperAngle >= Math.PI / 2 ? 
upperAngle - Math.PI : upperAngle; + } + + + public boolean matches(Neighbor neighbor) { + + if (lowerAngle <= upperAngle) { + return lowerAngle <= neighbor.getAngle() && neighbor.getAngle() < upperAngle; + } else { + return lowerAngle <= neighbor.getAngle() || neighbor.getAngle() < upperAngle; + } + } + +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/BoundingBox.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/BoundingBox.java new file mode 100644 index 0000000..79647ed --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/BoundingBox.java @@ -0,0 +1,57 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model; + +import java.awt.geom.Rectangle2D; + +import lombok.Data; + +@Data +public abstract class BoundingBox { + + private Rectangle2D bBox; + + + public double getX() { + + return bBox.getX(); + } + + + public double getY() { + + return bBox.getY(); + } + + + public double getWidth() { + + return bBox.getWidth(); + } + + + public double getHeight() { + + return bBox.getHeight(); + } + + + public double getArea() { + + return (bBox.getHeight() * bBox.getWidth()); + } + + + public boolean contains(Rectangle2D contained, double tolerance) { + + return bBox.getX() <= contained.getX() + tolerance + && bBox.getY() <= contained.getY() + tolerance + && bBox.getX() + bBox.getWidth() >= contained.getX() + contained.getWidth() - tolerance + && bBox.getY() + bBox.getHeight() >= contained.getY() + contained.getHeight() - tolerance; + } + + + public boolean intersectsY(BoundingBox other) { + + return this.getBBox().getMinY() <= other.getBBox().getMaxY() && this.getBBox().getMaxY() >= 
other.getBBox().getMinY(); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Character.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Character.java new file mode 100644 index 0000000..3e768ed --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Character.java @@ -0,0 +1,85 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; + +import lombok.Data; + +@Data +public class Character { + + private final double x; + private final double y; + private final RedTextPosition textPosition; + + private List neighbors = new ArrayList<>(); + + + public Character(RedTextPosition chunk) { + + this.x = chunk.getXDirAdj() + chunk.getWidthDirAdj() / 2; + this.y = chunk.getYDirAdj() + chunk.getHeightDir() / 2; + this.textPosition = chunk; + } + + + public double getHeight() { + + return textPosition.getHeightDir(); + } + + + public double distance(Character character) { + + double dx = getX() - character.getX(); + double dy = getY() - character.getY(); + return Math.sqrt(dx * dx + dy * dy); + } + + + public double horizontalDistance(Character character) { + + return Math.abs(getX() - character.getX()); + } + + + public double verticalDistance(Character character) { + + return Math.abs(getY() - character.getY()); + } + + + public double overlappingDistance(Character other) { + + double[] xs = new double[4]; + double s = Math.sin(-0); + double c = Math.cos(-0); + xs[0] = c * x - s * y; + xs[1] = c * (x + textPosition.getWidthDirAdj()) - s * (y + 
/**
 * Histogram with equally sized bins over a value range, with optional kernel smoothing.
 *
 * <p>The range is widened by a small epsilon on both sides so that the configured minimum and
 * maximum fall strictly inside the first and last bin.
 */
public class Histogram {

    /** Padding added below the minimum and above the maximum of the range. */
    private static final double EPSILON = 1.0e-6;

    /** Frequencies that differ by less than this are treated as one plateau when locating the peak. */
    private static final double PEAK_PLATEAU_TOLERANCE = 0.0001;

    private final double min;
    private final double resolution;
    private double[] frequencies;


    /**
     * Creates an empty histogram.
     *
     * @param minValue   lower end of the value range
     * @param maxValue   upper end of the value range
     * @param resolution requested bin width; the effective width is adjusted so that a whole number
     *                   of bins (at least one) covers the padded range
     */
    public Histogram(double minValue, double maxValue, double resolution) {

        this.min = minValue - EPSILON;
        double delta = maxValue - minValue + 2 * EPSILON;
        int size = Math.max(1, (int) Math.round((maxValue - minValue) / resolution));
        this.resolution = delta / size;
        this.frequencies = new double[size];
    }


    /**
     * Replaces the frequencies by their correlation with the given kernel. Contributions that would
     * fall outside the histogram are dropped.
     *
     * @param kernel the weights; the centre is at index {@code (kernel.length - 1) / 2}
     */
    public void kernelSmooth(double[] kernel) {

        double[] newFrequencies = new double[frequencies.length];
        int shift = (kernel.length - 1) / 2;
        for (int i = 0; i < kernel.length; i++) {
            // Restrict j so that the target index j - i + shift stays inside the array.
            int jStart = Math.max(0, i - shift);
            int jEnd = Math.min(frequencies.length, frequencies.length + i - shift);
            for (int j = jStart; j < jEnd; j++) {
                newFrequencies[j - i + shift] += kernel[i] * frequencies[j];
            }
        }
        frequencies = newFrequencies;
    }


    /**
     * Creates a normalised Gaussian kernel whose size is derived from the bin width of this histogram.
     *
     * @param length       window length in value units
     * @param stdDeviation standard deviation in value units
     * @return kernel of odd length whose entries sum to one
     */
    public double[] createGaussianKernel(double length, double stdDeviation) {

        // Integer division on purpose: r is the number of bins on each side of the centre.
        int r = (int) Math.round(length / resolution) / 2;

        int size = 2 * r + 1;
        double[] kernel = new double[size];
        double sum = 0;
        double b = 2 * (stdDeviation / resolution) * (stdDeviation / resolution);
        double a = 1 / Math.sqrt(Math.PI * b);
        for (int i = 0; i < size; i++) {
            double offset = i - r;
            kernel[i] = a * Math.exp(-(offset * offset) / b);
            sum += kernel[i];
        }
        for (int i = 0; i < size; i++) {
            kernel[i] /= sum;
        }
        return kernel;
    }


    /**
     * Smooths the histogram with a Gaussian kernel.
     *
     * @param windowLength window length in value units
     * @param stdDeviation standard deviation in value units
     */
    public void gaussianSmooth(double windowLength, double stdDeviation) {

        kernelSmooth(createGaussianKernel(windowLength, stdDeviation));
    }


    /**
     * Counts one occurrence of the given value.
     *
     * <p>Values outside the configured range are counted in the first or last bin instead of
     * failing with an {@link ArrayIndexOutOfBoundsException}.
     *
     * @param value the value to count
     */
    public void add(double value) {

        int index = (int) ((value - min) / resolution);
        // Clamp so that rounding at the edges or out-of-range input cannot leave the array.
        index = Math.max(0, Math.min(frequencies.length - 1, index));
        frequencies[index] += 1.0;
    }


    /** @return the number of bins */
    public int getSize() {

        return frequencies.length;
    }


    /**
     * Returns the value at the centre of the highest bin. If the bins directly following the first
     * highest bin have (almost) the same frequency, the centre of that plateau is returned.
     *
     * @return the value in the middle of the peak
     */
    public double getPeakValue() {

        int peakIndex = 0;
        for (int i = 1; i < frequencies.length; i++) {
            if (frequencies[i] > frequencies[peakIndex]) {
                peakIndex = i;
            }
        }
        int peakEndIndex = peakIndex + 1;
        while (peakEndIndex < frequencies.length && Math.abs(frequencies[peakEndIndex] - frequencies[peakIndex]) < PEAK_PLATEAU_TOLERANCE) {
            peakEndIndex++;
        }
        return ((double) peakIndex + peakEndIndex) / 2 * resolution + min;
    }

}
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Line.java @@ -0,0 +1,164 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model; + +import java.awt.geom.Rectangle2D; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; + +import lombok.Data; + +@Data +public class Line extends BoundingBox { + + private static final double WORD_DISTANCE_MULTIPLIER = 0.18; + + private final double x0; + private final double y0; + + private final double x1; + private final double y1; + + private final double height; + + private final List characters; + private final List words = new ArrayList<>(); + + + public Line(List characters, double wordSpacing) { + + this.characters = characters; + + if (characters.size() >= 2) { + // linear regression + double sx = 0.0; + double sxx = 0.0; + double sxy = 0.0; + double sy = 0.0; + for (Character character : characters) { + sx += character.getX(); + sxx += character.getX() * character.getX(); + sxy += character.getX() * character.getY(); + sy += character.getY(); + } + double b = (characters.size() * sxy - sx * sy) / (characters.size() * sxx - sx * sx); + double a = (sy - b * sx) / characters.size(); + + this.x0 = characters.get(0).getX(); + this.y0 = a + b * this.x0; + this.x1 = characters.get(characters.size() - 1).getX(); + this.y1 = a + b * this.x1; + } else { + Character character = characters.get(0); + double dx = character.getTextPosition().getWidthDirAdj() / 3; + double dy = dx * Math.tan(0); + this.x0 = character.getX() - dx; + this.x1 = character.getX() + dx; + this.y0 = character.getY() - dy; + this.y1 = character.getY() + dy; + } + height = computeHeight(); + computeWords(wordSpacing * WORD_DISTANCE_MULTIPLIER); + buildBBox(); + } + + + public double getAngle() { + + return 
Math.atan2(y1 - y0, x1 - x0); + } + + + public double getLength() { + + return Math.sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1)); + } + + + private double computeHeight() { + + return characters.stream().map(Character::getHeight).reduce(0d, Double::sum) / characters.size(); + } + + + public double angularDifference(Line j) { + + double diff = Math.abs(getAngle() - j.getAngle()); + if (diff <= Math.PI / 2) { + return diff; + } else { + return Math.PI - diff; + } + } + + + public double horizontalDistance(Line other) { + + double[] xs = new double[4]; + xs[0] = x0; + xs[1] = x1; + xs[2] = other.x0; + xs[3] = other.x1; + boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0]; + Arrays.sort(xs); + return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1); + } + + + public double verticalDistance(Line other) { + + double ym = (y0 + y1) / 2; + double yn = (other.y0 + other.y1) / 2; + return Math.abs(ym - yn) / Math.sqrt(1); + } + + + private void computeWords(double wordSpacing) { + + TextPositionSequence word = new TextPositionSequence(); + Character previous = null; + for (Character current : characters) { + if (previous != null) { + double dist = current.getTextPosition().getXDirAdj() - previous.getTextPosition().getXDirAdj() - previous.getTextPosition().getWidthDirAdj(); + if (dist > wordSpacing) { + words.add(word); + word = new TextPositionSequence(); + } + } + word.getTextPositions().add(current.getTextPosition()); + previous = current; + } + words.add(word); + } + + + private void buildBBox() { + + double minX = Double.POSITIVE_INFINITY; + double minY = Double.POSITIVE_INFINITY; + double maxX = Double.NEGATIVE_INFINITY; + double maxY = Double.NEGATIVE_INFINITY; + + for (Character character : characters) { + + minX = Math.min(minX, character.getTextPosition().getXDirAdj()); + minY = Math.min(minY, character.getTextPosition().getYDirAdj()); + maxX = Math.max(maxX, character.getTextPosition().getXDirAdj() + character.getTextPosition().getWidthDirAdj()); + maxY 
= Math.max(maxY, character.getTextPosition().getYDirAdj() + character.getTextPosition().getHeightDir()); + + } + + this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY)); + } + + + public String toString() { + + StringBuilder sb = new StringBuilder(); + words.forEach(word -> sb.append(word.toString()).append(" ")); + return sb.toString().trim(); + } + +} + diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Neighbor.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Neighbor.java new file mode 100644 index 0000000..b2b4174 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Neighbor.java @@ -0,0 +1,36 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model; + +import lombok.Getter; + +public class Neighbor { + + @Getter + private final double distance; + @Getter + private final double angle; + private final Character originCharacter; + @Getter + private final Character character; + + + public Neighbor(Character neighbor, Character origin) { + + this.distance = neighbor.distance(origin); + this.angle = neighbor.angle(origin); + this.character = neighbor; + this.originCharacter = origin; + } + + + public double getHorizontalDistance() { + + return character.horizontalDistance(originCharacter); + } + + + public double getVerticalDistance() { + + return character.verticalDistance(originCharacter); + } + +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/UnionFind.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/UnionFind.java new file mode 100644 index 0000000..aaa2c37 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/UnionFind.java @@ -0,0 +1,31 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model; + +import java.util.Collection; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.Map; +import java.util.Set; + +public class UnionFind extends org.jgrapht.alg.util.UnionFind { + + public UnionFind(Set elements) { + + super(elements); + } + + + public Collection> getGroups() { + + Map> setRep = new LinkedHashMap<>(); + for (T t : getParentMap().keySet()) { + T representative = find(t); + if (!setRep.containsKey(representative)) { + setRep.put(representative, new LinkedHashSet<>()); + } + setRep.get(representative).add(t); + } + + return setRep.values(); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Zone.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Zone.java new file mode 100644 index 0000000..d5651d8 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Zone.java @@ -0,0 +1,51 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model; + +import java.awt.geom.Rectangle2D; +import java.util.Comparator; +import java.util.List; + +import lombok.Data; + +@Data +public class Zone extends BoundingBox { + + private List lines; + + + @SuppressWarnings("PMD.ConstructorCallsOverridableMethod") + public 
Zone(List lines) { + + lines.sort(Comparator.comparingDouble(Line::getY)); + this.lines = lines; + buildBBox(); + } + + + public void buildBBox() { + + double minX = Double.POSITIVE_INFINITY; + double minY = Double.POSITIVE_INFINITY; + double maxX = Double.NEGATIVE_INFINITY; + double maxY = Double.NEGATIVE_INFINITY; + + for (Line line : lines) { + + minX = Math.min(minX, line.getX()); + minY = Math.min(minY, line.getY()); + maxX = Math.max(maxX, line.getX() + line.getWidth()); + maxY = Math.max(maxY, line.getY() + line.getHeight()); + + } + + this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY)); + } + + + public String toString() { + + StringBuilder sb = new StringBuilder(); + lines.forEach(line -> sb.append(line.toString()).append("\n")); + return sb.toString().trim(); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/LineBuilderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/LineBuilderService.java new file mode 100644 index 0000000..06cd65e --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/LineBuilderService.java @@ -0,0 +1,53 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service; + +import java.util.ArrayList; +import java.util.Comparator; +import java.util.HashSet; +import java.util.List; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.AngleFilter; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Line; +import 
com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.UnionFind; + +/** Groups characters into lines with a union-find over each character's neighbors. */ @Service +public class LineBuilderService { + + private static final double CHARACTER_SPACING_DISTANCE_MULTIPLIER = 3.5; + private static final double MAX_VERTICAL_CHARACTER_DISTANCE = 0.67; + private static final double ANGLE_TOLERANCE = Math.PI / 6; + + + /** Unions a character with a neighbor when both text positions report the same dir, the neighbor passes the angle filter of +/- ANGLE_TOLERANCE, and its horizontal and vertical distances, each divided by the matching maximum, satisfy x^2 + y^2 <= 1. Every resulting group is sorted by x and wrapped in a Line. */ public List buildLines(List characters, double characterSpacing, double lineSpacing) { + + /* Maximum distances are the measured spacings scaled by the class constants. */ double maxHorizontalDistance = characterSpacing * CHARACTER_SPACING_DISTANCE_MULTIPLIER; + double maxVerticalDistance = lineSpacing * MAX_VERTICAL_CHARACTER_DISTANCE; + + UnionFind unionFind = new UnionFind<>(new HashSet<>(characters)); + + AngleFilter filter = new AngleFilter(-ANGLE_TOLERANCE, ANGLE_TOLERANCE); + + characters.forEach(character -> { + character.getNeighbors().forEach(neighbor -> { + double x = neighbor.getHorizontalDistance() / maxHorizontalDistance; + double y = neighbor.getVerticalDistance() / maxVerticalDistance; + if (character.getTextPosition().getDir() == neighbor.getCharacter().getTextPosition().getDir() && filter.matches(neighbor) && Math.pow(x, 2) + Math.pow(y, + 2) <= 1) { + unionFind.union(character, neighbor.getCharacter()); + } + }); + }); + + List lines = new ArrayList<>(); + unionFind.getGroups().forEach(group -> { + List lineCharacters = new ArrayList<>(group); + lineCharacters.sort(Comparator.comparingDouble(Character::getX)); + lines.add(new Line(lineCharacters, characterSpacing)); + }); + + return lines; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/NearestNeighbourService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/NearestNeighbourService.java new file mode 100644 index 0000000..925ba27 --- /dev/null +++
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/NearestNeighbourService.java @@ -0,0 +1,78 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service; + +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Neighbor; + +@Service +public class NearestNeighbourService { + + private static final int NUMBER_OF_NEIGHBOURS = 8; + private static final double STEP = 16.0; + + + public void findNearestNeighbors(List characters) { + + if (characters.isEmpty() || characters.size() == 1) { + return; + } + + characters.sort(Comparator.comparingDouble(Character::getX)); + + int maxNeighborCount = NUMBER_OF_NEIGHBOURS; + if (characters.size() <= NUMBER_OF_NEIGHBOURS) { + maxNeighborCount = characters.size() - 1; + } + + for (int i = 0; i < characters.size(); i++) { + + List candidates = new ArrayList<>(); + + int start = i; + int end = i + 1; + + double distance = Double.POSITIVE_INFINITY; + + for (double searchDistance = 0; searchDistance < distance; ) { + + searchDistance += STEP; + boolean newCandidatesFound = false; + + while (start > 0 && characters.get(i).getX() - characters.get(start - 1).getX() < searchDistance) { + start--; + candidates.add(new Neighbor(characters.get(start), characters.get(i))); + clearLeastDistant(candidates, maxNeighborCount); + newCandidatesFound = true; + } + + while (end < characters.size() && characters.get(end).getX() - characters.get(i).getX() < searchDistance) { + candidates.add(new Neighbor(characters.get(end), characters.get(i))); + clearLeastDistant(candidates, maxNeighborCount); + end++; + newCandidatesFound = true; + } + + if (newCandidatesFound && 
candidates.size() >= maxNeighborCount) { + distance = candidates.get(maxNeighborCount - 1).getDistance(); + } + } + clearLeastDistant(candidates, maxNeighborCount); + characters.get(i).setNeighbors(new ArrayList<>(candidates)); + } + } + + + /** Once the list holds more than maxNeighborCount entries, sorts it by ascending distance and drops the last (most distant) entry; otherwise leaves the list untouched. */ private void clearLeastDistant(List candidates, int maxNeighborCount) { + + if (candidates.size() > maxNeighborCount) { + candidates.sort(Comparator.comparingDouble(Neighbor::getDistance)); + /* remove(int) alone drops the farthest entry; the former extra remove(Object) on the returned element was redundant and could delete a second, equal entry */ candidates.remove(candidates.size() - 1); + } + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ReadingOrderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ReadingOrderService.java new file mode 100644 index 0000000..e084d88 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ReadingOrderService.java @@ -0,0 +1,165 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service; + +import java.util.ArrayList; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.ListIterator; +import java.util.Map; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.BoundingBox; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils.DoubleUtils; + +@Service +public class ReadingOrderService { + + private static final double THRESHOLD = 5; + public static final double MULTI_COLUMN_DETECTION_THRESHOLD = 1.5; + + + public List resolve(List zones, boolean xyReadingOrder) { + + if (zones.isEmpty() || zones.size() == 1) { + return
zones; + } + + if (xyReadingOrder) { + return resolveSingleColumnReadingOrder(zones); + } + + /* Counts, for every integer y between a zone's rounded minY and maxY, how many zones cover it; an average count below MULTI_COLUMN_DETECTION_THRESHOLD is handled as single-column. */ Map histogram = new HashMap<>(); + for (Zone zone : zones) { + long minY = Math.round(zone.getBBox().getMinY()); + long maxY = Math.round(zone.getBBox().getMaxY()); + for (long i = minY; i <= maxY; i++) { + histogram.put(i, histogram.getOrDefault(i, 0) + 1); + } + } + + if (histogram.values().stream().mapToInt(Integer::intValue).average().orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) { + return resolveSingleColumnReadingOrder(zones); + } else { + + return resolveMultiColumnReadingOrder(zones); + } + + } + + + /** Sorts the given list in place by y, then by x, comparing through DoubleUtils.compareDouble with THRESHOLD as precision, and returns that same list. */ private static List resolveSingleColumnReadingOrder(List zones) { + + zones.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) + .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); + return zones; + } + + + /** Multi-column ordering. It first takes the horizontal midpoint between the smallest x and the largest x + width of all zones, then splits the zones into those fully left of it, fully right of it, and the rest. */ private List resolveMultiColumnReadingOrder(List zones) { + + // Simple reading order resolver for multi column page layout as described here : https://pub.towardsai.net/advanced-rag-02-unveiling-pdf-parsing-b84ae866344e + // TODO implement a more fancy reading order resolver see https://github.com/BobLd/DocumentLayoutAnalysis/blob/master/README.md#reading-order + + double minX = Double.POSITIVE_INFINITY; + double maxX = Double.NEGATIVE_INFINITY; + + for (Zone zone : zones) { + if (zone.getX() < minX) { + minX = zone.getX(); + } + if (zone.getX() + zone.getWidth() > maxX) { + maxX = zone.getX() + zone.getWidth(); + } + } + + double midLineXCoordinate = (minX + maxX) / 2; + + List leftOf = new ArrayList<>(); + List rightOf = new ArrayList<>(); + List middle = new ArrayList<>(); + for (Zone zone : zones) { + if (zone.getX() < midLineXCoordinate && zone.getX() + zone.getWidth() < midLineXCoordinate) { + leftOf.add(zone); + } else if (zone.getX() > midLineXCoordinate && zone.getX() + zone.getWidth() > midLineXCoordinate) { + rightOf.add(zone); + } else { + middle.add(zone); + }
+ } + + leftOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) + .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); + + rightOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) + .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); + + middle.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) + .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); + + List leftNotIntersecting = new ArrayList<>(); + for (Zone leftZone : leftOf) { + boolean intersects = false; + for (Zone rightZone : rightOf) { + if (leftZone.intersectsY(rightZone)) { + intersects = true; + break; + } + // early stopping + if (rightZone.getBBox().getMinY() > leftZone.getBBox().getMaxY()) { + break; + } + } + if (!intersects) { + leftNotIntersecting.add(leftZone); + } + } + + List rightNotIntersecting = new ArrayList<>(); + for (Zone rightZone : rightOf) { + boolean intersects = false; + for (Zone leftZone : leftOf) { + if (rightZone.intersectsY(leftZone)) { + intersects = true; + break; + } + // early stopping + if (leftZone.getBBox().getMinY() > rightZone.getBBox().getMaxY()) { + break; + } + } + if (!intersects) { + rightNotIntersecting.add(rightZone); + } + } + + leftOf.removeAll(leftNotIntersecting); + rightOf.removeAll(rightNotIntersecting); + + middle.addAll(leftNotIntersecting); + middle.addAll(rightNotIntersecting); + + List sortedZones = new ArrayList<>(); + sortedZones.addAll(leftOf); + sortedZones.addAll(rightOf); + + ListIterator itty = middle.listIterator(); + + while (itty.hasNext()) { + Zone current = itty.next(); + for (int i = 0; i < sortedZones.size(); i++) { + if (current.getY() < sortedZones.get(i).getY()) { + sortedZones.add(i, current); + itty.remove(); + break; + } + } + } + + 
sortedZones.addAll(middle); + + return sortedZones; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/SpacingService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/SpacingService.java new file mode 100644 index 0000000..2aab22d --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/SpacingService.java @@ -0,0 +1,56 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service; + +import java.util.List; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.AngleFilter; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Histogram; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Neighbor; + +@Service +public class SpacingService { + + private static final double SPACING_HISTOGRAM_RESOLUTION = 0.5; + private static final double SPACING_HISTOGRAM_SMOOTHING_LENGTH = 2.5; + private static final double SPACING_HIST_SMOOTHING_STANDARD_DEVIATION = 0.5; + private static final double ANGLE_TOLERANCE = Math.PI / 6; + + + /** Character spacing: delegates to computeSpacing with a reference angle of 0. */ public double computeCharacterSpacing(List characters) { + + return computeSpacing(characters, 0); + } + + + /** Line spacing: delegates to computeSpacing with a reference angle of PI / 2. */ public double computeLineSpacing(List characters) { + + return computeSpacing(characters, Math.PI / 2); + } + + + private double computeSpacing(List characters, double angle) { + + double maxDistance = Double.NEGATIVE_INFINITY; + + for (Character character : characters) { + for (Neighbor neighbor : character.getNeighbors()) { + maxDistance =
Math.max(maxDistance, neighbor.getDistance()); + } + } + Histogram histogram = new Histogram(0, maxDistance, SPACING_HISTOGRAM_RESOLUTION); + AngleFilter angleFilter = new AngleFilter(angle - ANGLE_TOLERANCE, angle + ANGLE_TOLERANCE); + for (Character character : characters) { + for (Neighbor neighbor : character.getNeighbors()) { + if (angleFilter.matches(neighbor)) { + histogram.add(neighbor.getDistance()); + } + } + } + + histogram.gaussianSmooth(SPACING_HISTOGRAM_SMOOTHING_LENGTH, SPACING_HIST_SMOOTHING_STANDARD_DEVIATION); + return histogram.getPeakValue(); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ZoneBuilderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ZoneBuilderService.java new file mode 100644 index 0000000..c7bdaa8 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ZoneBuilderService.java @@ -0,0 +1,152 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service; + +import java.util.ArrayList; +import java.util.Comparator; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Line; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.UnionFind; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone; + +@Service +public class ZoneBuilderService { + + private static final double MIN_HORIZONTAL_DISTANCE_MULTIPLIER = -0.5; + private static final double 
MAX_VERTICAL_DISTANCE_MULTIPLIER = 1.2; + + private static final double MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER = -3.0; + + private static final double MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER = 0.5; + + private static final double MIN_LINE_SIZE_SCALE = 0.9; + + private static final double MAX_LINE_SIZE_SCALE = 2.5; + + private static final double ANGLE_TOLERANCE = Math.PI / 6; + + private static final int MAX_ZONES = 300; + + private static final double MAX_VERTICAL_MERGE_DISTANCE = 0.5; + + + public List buildZones(List lines, double characterSpacing, double lineSpacing) { + + double minHorizontalDistance = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER; + double maxVerticalDistance = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER; + double minHorizontalMergeDistance = characterSpacing * MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER; + double maxVerticalMergeDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER; + + UnionFind unionFind = new UnionFind<>(new HashSet<>(lines)); + + double meanHeight = calculateMeanHeight(lines); + + lines.forEach(outerLine -> // + lines.forEach(innerLine -> { + + double scale = Math.min(outerLine.getHeight(), innerLine.getHeight()) / meanHeight; + scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(scale, MAX_LINE_SIZE_SCALE)); + + if (!unionFind.inSameSet(outerLine, innerLine) && outerLine.angularDifference(innerLine) <= ANGLE_TOLERANCE) { + + double horizontalDistance = outerLine.horizontalDistance(innerLine) / scale; + double verticalDistance = outerLine.verticalDistance(innerLine) / scale; + + if (minHorizontalDistance <= horizontalDistance && verticalDistance <= maxVerticalDistance // + || minHorizontalMergeDistance <= horizontalDistance && verticalDistance <= maxVerticalMergeDistance) { + unionFind.union(outerLine, innerLine); + } + } + })); + + List zones = new ArrayList<>(); + unionFind.getGroups().forEach(group -> { + zones.add(new Zone(new ArrayList<>(group))); + }); + + if (zones.size() > MAX_ZONES) { + List 
oneZoneLines = new ArrayList<>(); + for (Zone zone : zones) { + oneZoneLines.addAll(zone.getLines()); + } + /* All collected lines are regrouped into one Zone, returned as an immutable single-element list. */ return List.of(mergeLinesInZone(oneZoneLines, characterSpacing, lineSpacing)); + } + + return zones; + } + + + /** Mean line height weighted by line.getLength(). NOTE(review): the sum of lengths is used as divisor without a zero check, so an empty list or all-zero lengths gives NaN (0.0 / 0.0) - confirm callers tolerate that. */ private double calculateMeanHeight(List lines) { + + double meanHeight = 0.0; + double weights = 0.0; + for (Line line : lines) { + double weight = line.getLength(); + meanHeight += line.getHeight() * weight; + weights += weight; + } + meanHeight /= weights; + return meanHeight; + } + + + /** Regroups the given lines with a union-find. Two distinct lines are unioned when their vertical distance lies in [0, lineSpacing * MAX_VERTICAL_MERGE_DISTANCE] and either their horizontal distance is at most 0, or it is within 0.1 of the shorter line's length while no character pair overlaps by more than 2 and at most 2 pairs overlap at all. */ private Zone mergeLinesInZone(List lines, double characterSpacing, double lineSpacing) { + + double maxHorizontalDistance = 0; + double minVerticalDistance = 0; + double maxVerticalDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE; + + UnionFind unionFind = new UnionFind<>(new HashSet<>(lines)); + + lines.forEach(outer -> { + + lines.forEach(inner -> { + if (inner != outer) { + + double horizontalDistance = outer.horizontalDistance(inner); + double verticalDistance = outer.verticalDistance(inner); + + if (horizontalDistance <= maxHorizontalDistance && minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance) { + unionFind.union(outer, inner); + } else if (minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance && Math.abs(horizontalDistance - Math.min(outer.getLength(), + inner.getLength())) < 0.1) { + boolean characterOverlap = false; + int overlappingCount = 0; + for (Character outerCharacter : outer.getCharacters()) { + for (Character innerCharacter : inner.getCharacters()) { + double characterOverlapDistance = outerCharacter.overlappingDistance(innerCharacter); + if (characterOverlapDistance > 2) { + characterOverlap = true; + } + if (characterOverlapDistance > 0) { + overlappingCount++; + } + } + } + if (!characterOverlap && overlappingCount <= 2) { + unionFind.union(outer, inner); + } + } + } + }); + }); + + List outputZone = new ArrayList<>(); + for (Set group : unionFind.getGroups()) { + List characters
= new ArrayList<>(); + for (Line line : group) { + characters.addAll(line.getCharacters()); + } + characters.sort(Comparator.comparingDouble(Character::getX)); + + outputZone.add(new Line(characters, characterSpacing)); + } + + return new Zone(outputZone); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/utils/DoubleUtils.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/utils/DoubleUtils.java new file mode 100644 index 0000000..d762cf0 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/utils/DoubleUtils.java @@ -0,0 +1,15 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils; + +public class DoubleUtils { + + public static int compareDouble(double d1, double d2, double precision) { + + if (Double.isNaN(d1) || Double.isNaN(d2)) { + return Double.compare(d1, d2); + } + long i1 = Math.round(d1 / (precision == 0 ? 1 : precision)); + long i2 = Math.round(d2 / (precision == 0 ? 
1 : precision)); + return Long.compare(i1, i2); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java index eaf5bf2..f402c8b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java @@ -13,8 +13,10 @@ import java.util.List; import java.util.Map; import java.util.NoSuchElementException; import java.util.Set; +import java.util.stream.Collectors; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; +import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter; @@ -22,6 +24,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer; import 
com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Header; @@ -46,14 +49,14 @@ import lombok.experimental.UtilityClass; @UtilityClass public class DocumentGraphFactory { - public Document buildDocumentGraph(ClassificationDocument document) { + public Document buildDocumentGraph(LayoutParsingType layoutParsingType, ClassificationDocument document) { Document documentGraph = new Document(); Context context = new Context(documentGraph); document.getPages().forEach(context::buildAndAddPageWithCounter); document.getSections().stream().flatMap(section -> section.getImages().stream()).forEach(image -> context.getImages().add(image)); - addSections(document, context); + addSections(layoutParsingType, document, context); addHeaderAndFooterToEachPage(document, context); documentGraph.setNumberOfPages(context.pages.size()); @@ -64,9 +67,9 @@ public class DocumentGraphFactory { } - private void addSections(ClassificationDocument document, Context context) { + private void addSections(LayoutParsingType layoutParsingType, ClassificationDocument document, Context context) { - document.getSections().forEach(section -> SectionNodeFactory.addSection(null, section.getNonEmptyPageBlocks(), section.getImages(), context)); + document.getSections().forEach(section -> SectionNodeFactory.addSection(layoutParsingType, null, section.getNonEmptyPageBlocks(), section.getImages(), context)); } @@ -77,6 +80,8 @@ public class DocumentGraphFactory { GenericSemanticNode node; if (originalTextBlock.isHeadline()) { node = Headline.builder().documentTree(context.getDocumentTree()).build(); + } else if (originalTextBlock.isToDuplicate()) { + node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree()).build(); } else { node = Paragraph.builder().documentTree(context.getDocumentTree()).build(); } @@ -86,7 +91,16 @@ public class DocumentGraphFactory { List textBlocks = 
new ArrayList<>(); textBlocks.add(originalTextBlock); textBlocks.addAll(textBlocksToMerge); + AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), node, context, page); + + if (node instanceof DuplicatedParagraph duplicatedParagraph) { + AtomicTextBlock unsortedTextBlock = context.textBlockFactory.buildAtomicTextBlock(textBlocks.stream() + .flatMap(tb -> tb.getSequences().stream()) + .collect(Collectors.toList()), node, context, page); + duplicatedParagraph.setUnsortedLeafTextBlock(unsortedTextBlock); + } + List treeId = context.documentTree.createNewChildEntryAndReturnId(parentNode, node); node.setLeafTextBlock(textBlock); node.setTreeId(treeId); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java index 7bd82e2..490f83c 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java @@ -4,19 +4,21 @@ import static java.lang.String.format; import static java.util.Collections.emptyList; import static java.util.stream.Collectors.groupingBy; +import java.util.ArrayList; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; +import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; -import 
com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; -import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section; +import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.utils.TableMergingUtility; import lombok.experimental.UtilityClass; @@ -24,7 +26,11 @@ import lombok.experimental.UtilityClass; @UtilityClass public class SectionNodeFactory { - public void addSection(GenericSemanticNode parentNode, List pageBlocks, List images, DocumentGraphFactory.Context context) { + public void addSection(LayoutParsingType layoutParsingType, + GenericSemanticNode parentNode, + List pageBlocks, + List images, + DocumentGraphFactory.Context context) { if (pageBlocks.isEmpty()) { return; @@ -37,11 +43,11 @@ public class SectionNodeFactory { section.setTreeId(getTreeId(parentNode, context, section)); - addFirstHeadlineDirectlyToSection(pageBlocks, context, section); + addFirstHeadlineDirectlyToSection(layoutParsingType, pageBlocks, context, section); if (containsTablesAndTextBlocks(pageBlocks)) { - splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(section, subSectionPageBlocks, emptyList(), context)); + splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(layoutParsingType, section, subSectionPageBlocks, emptyList(), context)); } else { - 
addTablesAndParagraphsAndHeadlinesToSection(pageBlocks, context, section); + addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, pageBlocks, context, section); } images.stream().distinct().forEach(image -> DocumentGraphFactory.addImage(section, image, context)); @@ -58,16 +64,19 @@ public class SectionNodeFactory { } - private void addFirstHeadlineDirectlyToSection(List pageBlocks, DocumentGraphFactory.Context context, Section section) { + private void addFirstHeadlineDirectlyToSection(LayoutParsingType layoutParsingType, List pageBlocks, DocumentGraphFactory.Context context, Section section) { if (pageBlocks.get(0).isHeadline()) { - addTablesAndParagraphsAndHeadlinesToSection(List.of(pageBlocks.get(0)), context, section); + addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, List.of(pageBlocks.get(0)), context, section); pageBlocks.remove(0); } } - private void addTablesAndParagraphsAndHeadlinesToSection(List pageBlocks, DocumentGraphFactory.Context context, Section section) { + private void addTablesAndParagraphsAndHeadlinesToSection(LayoutParsingType layoutParsingType, + List pageBlocks, + DocumentGraphFactory.Context context, + Section section) { Set alreadyMerged = new HashSet<>(); List remainingBlocks = new LinkedList<>(pageBlocks); @@ -80,13 +89,23 @@ public class SectionNodeFactory { remainingBlocks.removeAll(alreadyMerged); if (abstractPageBlock instanceof TextPageBlock) { - List textBlocks = findTextBlocksWithSameClassificationAndAlignsY((TextPageBlock) abstractPageBlock, remainingBlocks); - alreadyMerged.addAll(textBlocks); - DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocks); + + switch (layoutParsingType) { + case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> { + alreadyMerged.add(abstractPageBlock); + remainingBlocks.remove(abstractPageBlock); + DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, new ArrayList<>()); + } + default -> { 
+ List textBlocks = findTextBlocksWithSameClassificationAndAlignsY((TextPageBlock) abstractPageBlock, remainingBlocks); + alreadyMerged.addAll(textBlocks); + DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocks); + } + } } else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) { List tablesToMerge = TableMergingUtility.findConsecutiveTablesWithSameColCountAndSameHeaders(tablePageBlock, remainingBlocks); alreadyMerged.addAll(tablesToMerge); - TableNodeFactory.addTable(section, tablesToMerge, context); + TableNodeFactory.addTable(layoutParsingType, section, tablesToMerge, context); } else { throw new RuntimeException(format("Unhandled AbstractPageBlockType %s!", abstractPageBlock.getClass())); } @@ -171,6 +190,7 @@ public class SectionNodeFactory { .filter(abstractTextContainer -> abstractTextContainer.intersectsY(atc)) .map(abstractTextContainer -> (TextPageBlock) abstractTextContainer) .filter(abstractTextContainer -> abstractTextContainer.getDir() == atc.getDir()) + .filter(abstractTextContainer -> !abstractTextContainer.isToDuplicate()) .toList(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java index c00edd1..21d05fd 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/TableNodeFactory.java @@ -7,16 +7,17 @@ import java.util.List; import java.util.Set; import java.util.stream.Collectors; +import 
com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; -import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations; import lombok.experimental.UtilityClass; @@ -27,7 +28,7 @@ public class TableNodeFactory { public final double TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD = 0.05; - public void addTable(GenericSemanticNode parentNode, List tablesToMerge, DocumentGraphFactory.Context context) { + public void addTable(LayoutParsingType layoutParsingType, GenericSemanticNode parentNode, List tablesToMerge, DocumentGraphFactory.Context context) { setPageNumberInCells(tablesToMerge); Set pages = tablesToMerge.stream().map(AbstractPageBlock::getPage).map(context::getPage).collect(Collectors.toSet()); @@ -43,7 +44,7 @@ public class TableNodeFactory { List treeId = 
context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, table); table.setTreeId(treeId); - addTableCells(mergedRows, table, context); + addTableCells(layoutParsingType, mergedRows, table, context); ifTableHasNoHeadersSetFirstRowAsHeaders(table); } @@ -88,18 +89,18 @@ public class TableNodeFactory { } - private void addTableCells(List> rows, Table table, DocumentGraphFactory.Context context) { + private void addTableCells(LayoutParsingType layoutParsingType, List> rows, Table table, DocumentGraphFactory.Context context) { for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) { for (int colIndex = 0; colIndex < rows.get(rowIndex).size(); colIndex++) { - addTableCell(rows.get(rowIndex).get(colIndex), rowIndex, colIndex, table, context); + addTableCell(layoutParsingType, rows.get(rowIndex).get(colIndex), rowIndex, colIndex, table, context); } } } @SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong - private void addTableCell(Cell cell, int rowIndex, int colIndex, Table tableNode, DocumentGraphFactory.Context context) { + private void addTableCell(LayoutParsingType layoutParsingType, Cell cell, int rowIndex, int colIndex, Table tableNode, DocumentGraphFactory.Context context) { Page page = context.getPage(cell.getPageNumber()); @@ -116,7 +117,7 @@ public class TableNodeFactory { textBlock = context.getTextBlockFactory().buildAtomicTextBlock(cell.getTextBlocks().get(0).getSequences(), tableCell, context, page); tableCell.setLeafTextBlock(textBlock); } else if (firstTextBlockIsHeadline(cell)) { - SectionNodeFactory.addSection(tableCell, cell.getTextBlocks().stream().map(tb -> (AbstractPageBlock) tb).toList(), emptyList(), context); + SectionNodeFactory.addSection(layoutParsingType, tableCell, cell.getTextBlocks().stream().map(tb -> (AbstractPageBlock) tb).toList(), emptyList(), context); } else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) { List sequences = 
TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(cell.getTextBlocks()); textBlock = context.getTextBlockFactory().buildAtomicTextBlock(sequences, tableCell, context, page); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/DocumentDataMapper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/DocumentDataMapper.java index ac42f15..b4b20d8 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/DocumentDataMapper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/DocumentDataMapper.java @@ -8,8 +8,6 @@ import java.util.Map; import java.util.Set; import java.util.stream.Collectors; -import javax.xml.parsers.DocumentBuilder; - import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData; @@ -18,6 +16,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.Do import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import 
com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table; @@ -33,27 +32,20 @@ public class DocumentDataMapper { public DocumentData toDocumentData(Document document) { List documentTextData = document.streamTerminalTextBlocksInOrder() - .flatMap(textBlock -> textBlock.getAtomicTextBlocks() - .stream()) + .flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream()) .distinct() .map(DocumentDataMapper::toAtomicTextBlockData) .toList(); List atomicPositionBlockData = document.streamTerminalTextBlocksInOrder() - .flatMap(textBlock -> textBlock.getAtomicTextBlocks() - .stream()) + .flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream()) .distinct() .map(DocumentDataMapper::toAtomicPositionBlockData) .toList(); - Set nonEmptyTextBlocks = documentTextData.stream() - .mapToLong(DocumentTextData::getId).boxed() - .collect(Collectors.toSet()); + Set nonEmptyTextBlocks = documentTextData.stream().mapToLong(DocumentTextData::getId).boxed().collect(Collectors.toSet()); - List documentPageData = document.getPages() - .stream() - .map(DocumentDataMapper::toPageData) - .toList(); + List documentPageData = document.getPages().stream().map(DocumentDataMapper::toPageData).toList(); DocumentStructure tableOfContentsData = toDocumentTreeData(document.getDocumentTree()); return DocumentData.builder() .documentTextData(documentTextData.toArray(new DocumentTextData[0])) @@ -84,22 +76,17 @@ public class DocumentDataMapper { case TABLE -> PropertiesMapper.buildTableProperties((Table) entry.getNode()); case TABLE_CELL -> PropertiesMapper.buildTableCellProperties((TableCell) entry.getNode()); case IMAGE -> PropertiesMapper.buildImageProperties((Image) entry.getNode()); + case PARAGRAPH -> + entry.getNode() instanceof DuplicatedParagraph duplicatedParagraph ? 
PropertiesMapper.buildDuplicateParagraphProperties(duplicatedParagraph) : new HashMap<>(); default -> new HashMap<>(); }; DocumentStructure.EntryData.EntryDataBuilder documentBuilder = DocumentStructure.EntryData.builder() .treeId(toPrimitiveIntArray(entry.getTreeId())) - .children(entry.getChildren() - .stream() - .map(DocumentDataMapper::toEntryData) - .toList()) + .children(entry.getChildren().stream().map(DocumentDataMapper::toEntryData).toList()) .type(entry.getType()) .atomicBlockIds(atomicTextBlocks) - .pageNumbers(entry.getNode().getPages() - .stream() - .map(Page::getNumber) - .map(Integer::longValue) - .toArray(Long[]::new)) + .pageNumbers(entry.getNode().getPages().stream().map(Page::getNumber).map(Integer::longValue).toArray(Long[]::new)) .properties(properties); if (entry.getNode() != null) { documentBuilder.engines(entry.getNode().getEngines()); @@ -112,10 +99,7 @@ public class DocumentDataMapper { private Long[] toAtomicTextBlockIds(TextBlock textBlock) { - return textBlock.getAtomicTextBlocks() - .stream() - .map(AtomicTextBlock::getId) - .toArray(Long[]::new); + return textBlock.getAtomicTextBlocks().stream().map(AtomicTextBlock::getId).toArray(Long[]::new); } @@ -167,9 +151,7 @@ public class DocumentDataMapper { private int[] toPrimitiveIntArray(List list) { - return list.stream() - .mapToInt(Integer::intValue) - .toArray(); + return list.stream().mapToInt(Integer::intValue).toArray(); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/DocumentGraphMapper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/DocumentGraphMapper.java index c51f9ec..a53c6d8 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/DocumentGraphMapper.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/DocumentGraphMapper.java @@ -7,13 +7,14 @@ import java.util.List; import java.util.Map; import java.util.NoSuchElementException; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData; import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Header; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline; @@ -61,7 +62,7 @@ public class DocumentGraphMapper { SemanticNode node = switch (entryData.getType()) { case SECTION -> buildSection(context); - case PARAGRAPH -> buildParagraph(context); + case PARAGRAPH -> buildParagraph(context, entryData.getProperties()); case HEADLINE -> buildHeadline(context); case HEADER -> buildHeader(context); case FOOTER -> buildFooter(context); @@ -140,7 +141,17 @@ public class 
DocumentGraphMapper { } - private Paragraph buildParagraph(Context context) { + private Paragraph buildParagraph(Context context, Map properties) { + + if (PropertiesMapper.isDuplicateParagraph(properties)) { + + DuplicatedParagraph duplicatedParagraph = DuplicatedParagraph.builder().documentTree(context.documentTree).build(); + + Long[] unsortedTextblockIds = PropertiesMapper.getUnsortedTextblockIds(properties); + duplicatedParagraph.setUnsortedLeafTextBlock(toTextBlock(unsortedTextblockIds, context, duplicatedParagraph)); + return duplicatedParagraph; + + } return Paragraph.builder().documentTree(context.documentTree).build(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/PropertiesMapper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/PropertiesMapper.java index 329bd40..f4ebbd5 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/PropertiesMapper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/PropertiesMapper.java @@ -1,17 +1,19 @@ package com.knecon.fforesight.service.layoutparser.processor.services.mapper; import java.awt.geom.Rectangle2D; -import java.util.Collections; +import java.util.Arrays; import java.util.HashMap; import java.util.Locale; import java.util.Map; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType; -import 
com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; public class PropertiesMapper { @@ -76,6 +78,32 @@ public class PropertiesMapper { } + public static Map buildDuplicateParagraphProperties(DuplicatedParagraph duplicatedParagraph) { + + Map properties = new HashMap<>(); + properties.put(DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID, Arrays.toString(toAtomicTextBlockIds(duplicatedParagraph.getUnsortedLeafTextBlock()))); + return properties; + } + + + public static boolean isDuplicateParagraph(Map properties) { + + return properties.containsKey(DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID); + } + + + public static Long[] getUnsortedTextblockIds(Map properties) { + + return toLongArray(properties.get(DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID)); + } + + + public static Long[] toLongArray(String ids) { + + return Arrays.stream(ids.substring(1, ids.length() - 1).trim().split(",")).map(Long::valueOf).toArray(Long[]::new); + } + + private static ImageType parseImageType(String imageType) { return switch (imageType) { @@ -101,4 +129,10 @@ public class PropertiesMapper { rectangle2D.getHeight()); } + + private static Long[] toAtomicTextBlockIds(TextBlock textBlock) { + + return textBlock.getAtomicTextBlocks().stream().map(AtomicTextBlock::getId).toArray(Long[]::new); + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java index 09a8eb2..18e5a5a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java @@ -237,8 +237,13 @@ public class PDFLinesTextStripper extends PDFTextStripper { int startIndex = 0; RedTextPosition previous = null; + float direction = -1; for (int i = 0; i <= textPositions.size() - 1; i++) { + if (direction == -1) { + direction = textPositions.get(i).getDir(); + } + if (!textPositionSequences.isEmpty()) { previous = textPositionSequences.get(textPositionSequences.size() - 1) .getTextPositions() @@ -250,6 +255,13 @@ public class PDFLinesTextStripper extends PDFTextStripper { continue; } + if (textPositions.get(i).getDir() != direction && startIndex != i) { + List sublist = textPositions.subList(startIndex, i); + textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart)); + startIndex = i; + direction = textPositions.get(i).getDir(); + } + // Strange but sometimes this is happening, for example: Metolachlor2.pdf if (checkIfCurrentPositionIsToTheRightOfPreviousPosition(i, textPositions)) { List sublist = textPositions.subList(startIndex, i); @@ -329,6 +341,7 @@ public class PDFLinesTextStripper extends PDFTextStripper { .getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < maximumGapSize; } + @Override public String getText(PDDocument doc) throws IOException { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java index e17a8b1..d357614 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java @@ -20,6 +20,7 @@ import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table; @@ -53,6 +54,8 @@ public class LayoutGridService { static Color INNER_LINES_COLOR = new Color(255, 175, 175); static Color PARAGRAPH_COLOR = new Color(70, 130, 180); + + static Color DUPLICATE_PARAGRAPH_COLOR = new Color(70, 180, 101); static Color TABLE_COLOR = new Color(102, 205, 170); static Color SECTION_COLOR = new Color(50, 50, 50); static Color HEADLINE_COLOR = new Color(162, 56, 56); @@ -100,6 +103,11 @@ public class LayoutGridService { case IMAGE -> IMAGE_COLOR; default -> null; }; + + if (semanticNode instanceof DuplicatedParagraph) { + color = DUPLICATE_PARAGRAPH_COLOR; + } + if (isNotSectionOrTableCellOrDocument(semanticNode)) { addAsRectangle(semanticNode, layoutGrid, color); } diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java index 53e8c29..9927685 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java @@ -3,7 +3,6 @@ package com.knecon.fforesight.service.layoutparser.processor.utils; import java.util.List; import java.util.stream.Collectors; - import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; diff --git a/layoutparser-service/layoutparser-service-server/src/main/java/com/knecon/fforesight/service/layoutparser/server/queue/MessageHandler.java b/layoutparser-service/layoutparser-service-server/src/main/java/com/knecon/fforesight/service/layoutparser/server/queue/MessageHandler.java index 22f4899..4b6a6cd 100644 --- a/layoutparser-service/layoutparser-service-server/src/main/java/com/knecon/fforesight/service/layoutparser/server/queue/MessageHandler.java +++ b/layoutparser-service/layoutparser-service-server/src/main/java/com/knecon/fforesight/service/layoutparser/server/queue/MessageHandler.java @@ -37,7 +37,7 @@ public class MessageHandler { LayoutParsingRequest layoutParsingRequest = objectMapper.readValue(message.getBody(), LayoutParsingRequest.class); - if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.TAAS) && layoutParsingRequest.researchDocumentStorageId() == null) { + if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.CLARIFYND) && 
layoutParsingRequest.researchDocumentStorageId() == null) { throw new IllegalArgumentException("ResearchDocumentDataStorageId is null!"); } log.info("Layout parsing request received {}", layoutParsingRequest.identifier()); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java index 6dd4312..c80bfbd 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java @@ -48,12 +48,13 @@ public class BdrJsonBuildTest extends AbstractTest { @SneakyThrows protected Document buildGraph(File file) { - return DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.TAAS, - file, - new ImageServiceResponse(), - new TableServiceResponse(), - new VisualLayoutParsingResponse(), - file.toString())); + return DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.CLARIFYND, + layoutParsingPipeline.parseLayout(LayoutParsingType.CLARIFYND, + file, + new ImageServiceResponse(), + new TableServiceResponse(), + new VisualLayoutParsingResponse(), + file.toString())); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java index 4ea6204..34915e9 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java +++ 
b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java @@ -95,12 +95,13 @@ public class HeadlinesGoldStandardIntegrationTest { goldStandardLog.getRedactionLogEntry().removeIf(r -> !r.isRedacted() || r.getChanges().get(r.getChanges().size() - 1).getType().equals(ChangeType.REMOVED)); goldStandardLog.getRedactionLogEntry().forEach(e -> goldStandardHeadlines.add(new Headline(e.getPositions().get(0).getPage(), e.getValue()))); - Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, - pdfFileResource.getFile(), - new ImageServiceResponse(), - new TableServiceResponse(), - new VisualLayoutParsingResponse(), - filePath)); + Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD, + layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD, + pdfFileResource.getFile(), + new ImageServiceResponse(), + new TableServiceResponse(), + new VisualLayoutParsingResponse(), + filePath)); var foundHeadlines = documentGraph.streamAllSubNodes() .map(SemanticNode::getHeadline) diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java index 0751be3..3af1376 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java @@ -26,7 +26,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest { public void testLayoutParserEndToEnd() { prepareStorage("files/bdr/Wie weiter bei 
Kristeneinrichtungen.pdf"); - LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER); + LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD); LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest); Arrays.stream(finishedEvent.message().split("\n")).forEach(log::info); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java index f5bf3a2..51b12cb 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java @@ -55,12 +55,13 @@ public class DocumentGraphJsonWritingTest extends BuildDocumentTest { @SneakyThrows private void writeJsons(Path filename) { - Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, - filename.toFile(), - new ImageServiceResponse(), - new TableServiceResponse(), - new VisualLayoutParsingResponse(), - filename.toFile().toString())); + Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD, + layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD, + filename.toFile(), + new ImageServiceResponse(), + new TableServiceResponse(), + new VisualLayoutParsingResponse(), + filename.toFile().toString())); DocumentData documentData = DocumentDataMapper.toDocumentData(documentGraph); ObjectMapper mapper = 
ObjectMapperFactory.create(); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index 9511cdd..246d9aa 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -26,7 +26,7 @@ public class ViewerDocumentTest extends BuildDocumentTest { @SneakyThrows public void testViewerDocument() { - String fileName = "files/SinglePages/T5 VV-640252-Page16.pdf"; + String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; var documentFile = new ClassPathResource(fileName).getFile(); @@ -54,13 +54,14 @@ public class ViewerDocumentTest extends BuildDocumentTest { var documentFile = new ClassPathResource(fileName).getFile(); var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE, - documentFile, - new ImageServiceResponse(), - tableResponse, - new VisualLayoutParsingResponse(),Path.of(fileName).getFileName().toFile().toString()); + documentFile, + new ImageServiceResponse(), + tableResponse, + new VisualLayoutParsingResponse(), + Path.of(fileName).getFileName().toFile().toString()); ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null); LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService); - Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument); + Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE, classificationDocument); 
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java index f2a4b87..8db6fdc 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java @@ -56,12 +56,12 @@ public class PdfSegmentationServiceTest extends AbstractTest { @SneakyThrows public ClassificationDocument buildClassificationDocument(File originDocument, TableServiceResponse tableServiceResponse) { - ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, - originDocument, - new ImageServiceResponse(), - tableServiceResponse, - new VisualLayoutParsingResponse(), - "document"); + ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD, + originDocument, + new ImageServiceResponse(), + tableServiceResponse, + new VisualLayoutParsingResponse(), + "document"); redactManagerClassificationService.classifyDocument(classificationDocument); @@ -112,16 +112,8 @@ public class PdfSegmentationServiceTest extends AbstractTest { var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class); ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile(), tableServiceResponse); - assertThat(document.getSections() - .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - 
.collect(Collectors.toList())).isNotEmpty(); - var tables = document.getSections() - .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .toList(); + assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty(); + var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList(); // Quality of the table parsing is not good, because the file is rotated at scanning. // We only asset that the table border is not the page border. @@ -143,12 +135,12 @@ public class PdfSegmentationServiceTest extends AbstractTest { imageServiceResponse.getData() .forEach(imageMetadata -> images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>()) .add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(), - imageMetadata.getPosition().getY1(), - imageMetadata.getGeometry().getWidth(), - imageMetadata.getGeometry().getHeight()), - ImageType.valueOf(imageMetadata.getClassification().getLabel().toUpperCase(Locale.ROOT)), - imageMetadata.isAlpha(), - imageMetadata.getPosition().getPageNumber()))); + imageMetadata.getPosition().getY1(), + imageMetadata.getGeometry().getWidth(), + imageMetadata.getGeometry().getHeight()), + ImageType.valueOf(imageMetadata.getClassification().getLabel().toUpperCase(Locale.ROOT)), + imageMetadata.isAlpha(), + imageMetadata.getPosition().getPageNumber()))); System.out.println("object"); } @@ -160,22 +152,11 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Spanning Cells.pdf"); ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); - assertThat(document.getSections() - .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .collect(Collectors.toList())).isNotEmpty(); - TablePageBlock table = document.getSections() - .stream() 
- .flatMap(paragraph -> paragraph.getTables() - .stream()) - .toList() - .get(0); + assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty(); + TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0); assertThat(table.getColCount()).isEqualTo(6); assertThat(table.getRowCount()).isEqualTo(13); - assertThat(table.getRows() - .stream() - .mapToInt(List::size).sum()).isEqualTo(6 * 13); + assertThat(table.getRows().stream().mapToInt(List::size).sum()).isEqualTo(6 * 13); } @@ -185,37 +166,15 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Table.pdf"); ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); - assertThat(document.getSections() - .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .collect(Collectors.toList())).isNotEmpty(); - TablePageBlock firstTable = document.getSections() - .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .toList() - .get(0); + assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty(); + TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0); assertThat(firstTable.getColCount()).isEqualTo(8); assertThat(firstTable.getRowCount()).isEqualTo(1); - TablePageBlock secondTable = document.getSections() - .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .toList() - .get(1); + TablePageBlock secondTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(1); assertThat(secondTable.getColCount()).isEqualTo(8); assertThat(secondTable.getRowCount()).isEqualTo(2); - List> firstTableHeaderCells = 
firstTable.getRows() - .get(0) - .stream() - .map(Collections::singletonList) - .collect(Collectors.toList()); - assertThat(secondTable.getRows() - .stream() - .allMatch(row -> row.stream() - .map(Cell::getHeaderCells) - .toList().equals(firstTableHeaderCells))).isTrue(); + List> firstTableHeaderCells = firstTable.getRows().get(0).stream().map(Collections::singletonList).collect(Collectors.toList()); + assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue(); } @@ -225,37 +184,15 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Multi Page Table.pdf"); ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); - assertThat(document.getSections() - .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .collect(Collectors.toList())).isNotEmpty(); - TablePageBlock firstTable = document.getSections() - .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .toList() - .get(0); + assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty(); + TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0); assertThat(firstTable.getColCount()).isEqualTo(9); assertThat(firstTable.getRowCount()).isEqualTo(5); - TablePageBlock secondTable = document.getSections() - .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .toList() - .get(1); + TablePageBlock secondTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(1); assertThat(secondTable.getColCount()).isEqualTo(9); assertThat(secondTable.getRowCount()).isEqualTo(6); - List> firstTableHeaderCells = firstTable.getRows() - .get(firstTable.getRowCount() - 1) - 
.stream() - .map(Cell::getHeaderCells) - .collect(Collectors.toList()); - assertThat(secondTable.getRows() - .stream() - .allMatch(row -> row.stream() - .map(Cell::getHeaderCells) - .toList().equals(firstTableHeaderCells))).isTrue(); + List> firstTableHeaderCells = firstTable.getRows().get(firstTable.getRowCount() - 1).stream().map(Cell::getHeaderCells).collect(Collectors.toList()); + assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue(); } @@ -265,37 +202,15 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Rotated Table Headers.pdf"); ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); - assertThat(document.getSections() - .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .collect(Collectors.toList())).isNotEmpty(); - TablePageBlock firstTable = document.getSections() - .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .toList() - .get(0); + assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty(); + TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0); assertThat(firstTable.getColCount()).isEqualTo(8); assertThat(firstTable.getRowCount()).isEqualTo(1); - TablePageBlock secondTable = document.getSections() - .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .toList() - .get(1); + TablePageBlock secondTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(1); assertThat(secondTable.getColCount()).isEqualTo(8); assertThat(secondTable.getRowCount()).isEqualTo(6); - List> firstTableHeaderCells = firstTable.getRows() - .get(0) - .stream() - .map(Collections::singletonList) - 
.collect(Collectors.toList()); - assertThat(secondTable.getRows() - .stream() - .allMatch(row -> row.stream() - .map(Cell::getHeaderCells) - .toList().equals(firstTableHeaderCells))).isTrue(); + List> firstTableHeaderCells = firstTable.getRows().get(0).stream().map(Collections::singletonList).collect(Collectors.toList()); + assertThat(secondTable.getRows().stream().allMatch(row -> row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue(); } @@ -345,30 +260,29 @@ public class PdfSegmentationServiceTest extends AbstractTest { validateTable(document, 0, 8, 8, 0, 0); List> values = Arrays.asList(Arrays.asList("Annex point Reference within DAR/RAR", - "Author, date", - "Study title", - "Analytical method Author, date, No.", - "Technique, LOQ of the method, validated working range", - "Method meets analytical validation criteria", - "Remarks (in case validation criteria are not met)", - "Acceptability of the method"), - Arrays.asList( - "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", - "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", - "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", - "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", - "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", - "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", - "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", - "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"), - 
Arrays.asList("CA 7.1.2.1.1 DAR (2009)", - "Evans P.G. 2001 TMJ4569B, VV-323245", - "Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom", - "Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845", - "LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02 mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD", - "Y", - "N/A", - "Y")); + "Author, date", + "Study title", + "Analytical method Author, date, No.", + "Technique, LOQ of the method, validated working range", + "Method meets analytical validation criteria", + "Remarks (in case validation criteria are not met)", + "Acceptability of the method"), + Arrays.asList("Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental 
fate studies"), + Arrays.asList("CA 7.1.2.1.1 DAR (2009)", + "Evans P.G. 2001 TMJ4569B, VV-323245", + "Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom", + "Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845", + "LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02 mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD", + "Y", + "N/A", + "Y")); validateTable(document, 0, values); @@ -757,11 +671,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { @SneakyThrows private void toHtml(ClassificationDocument document, String filename) { - var tables = document.getSections() - .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .toList(); + var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList(); StringBuilder sb = new StringBuilder(); int currentPage = 1; @@ -782,19 +692,9 @@ public class PdfSegmentationServiceTest extends AbstractTest { private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) { - TablePageBlock table = document.getSections() - .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .toList() - .get(tableIndex); + TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex); List> rows = table.getRows(); - int emptyCellsFoundFound = rows.stream() - .flatMap(List::stream) - .toList() - .stream() - .filter(f -> f.toString().isEmpty()) - .toList().size(); + int emptyCellsFoundFound = 
rows.stream().flatMap(List::stream).toList().stream().filter(f -> f.toString().isEmpty()).toList().size(); for (List row : table.getRows()) { row.forEach(r -> System.out.println(r.toString())); @@ -809,20 +709,11 @@ public class PdfSegmentationServiceTest extends AbstractTest { private void validateTable(ClassificationDocument document, int tableIndex, List> values) { - TablePageBlock table = document.getSections() - .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .toList() - .get(tableIndex); + TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex); List> rows = table.getRows(); - List rowsFlattened = rows.stream() - .flatMap(List::stream) - .toList(); - List valuesFlattened = values.stream() - .flatMap(List::stream) - .toList(); + List rowsFlattened = rows.stream().flatMap(List::stream).toList(); + List valuesFlattened = values.stream().flatMap(List::stream).toList(); for (int i = 0; i < valuesFlattened.size(); i++) { Cell cell = rowsFlattened.get(i); @@ -835,11 +726,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { private void validateTableSize(ClassificationDocument document, int tableSize) { - assertThat(document.getSections() - .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .toList().size()).isEqualTo(tableSize); + assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().size()).isEqualTo(tableSize); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/BodyTextFrameServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/BodyTextFrameServiceTest.java index cd74bbe..8c362cd 100644 --- 
a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/BodyTextFrameServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/BodyTextFrameServiceTest.java @@ -21,7 +21,7 @@ class BodyTextFrameServiceTest extends BuildDocumentTest { String filename = "files/211.pdf"; String outputFilename = "/tmp/" + Path.of(filename).getFileName() + "_MAINBODY.pdf"; - ClassificationDocument document = parseLayout(filename, LayoutParsingType.TAAS); + ClassificationDocument document = parseLayout(filename, LayoutParsingType.CLARIFYND); PdfDraw.drawRectanglesPerPage(filename, document.getPages().stream().map(page -> List.of(RectangleTransformations.toRectangle2D(page.getBodyTextFrame()))).toList(), outputFilename); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java index 3aae43a..ca08fbd 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java @@ -74,7 +74,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest { cleanRulingsPerPage.add(rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings())); } var cleanRulings = cleanRulingsPerPage.stream().map(CleanRulings::getVertical).collect(Collectors.toList()); - PdfDraw.drawLinesPerPage(fileName, cleanRulings, lineFileName); + PdfDraw.drawLinesPerPage(fileName, cleanRulings, lineFileName); } @@ -99,18 +99,20 @@ 
public class RulingCleaningServiceTest extends BuildDocumentTest { @SneakyThrows private void writeJsons(Path filename) { - Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, - filename.toFile(), - new ImageServiceResponse(), - new TableServiceResponse(), - new VisualLayoutParsingResponse(), - filename.toFile().toString())); - Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, - filename.toFile(), - new ImageServiceResponse(), - new TableServiceResponse(), - new VisualLayoutParsingResponse(), - filename.toFile().toString())); + Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD, + layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD, + filename.toFile(), + new ImageServiceResponse(), + new TableServiceResponse(), + new VisualLayoutParsingResponse(), + filename.toFile().toString())); + Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD, + layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD, + filename.toFile(), + new ImageServiceResponse(), + new TableServiceResponse(), + new VisualLayoutParsingResponse(), + filename.toFile().toString())); DocumentData documentDataBefore = DocumentDataMapper.toDocumentData(documentGraphBefore); DocumentData documentDataAfter = DocumentDataMapper.toDocumentData(documentGraphAfter); if (!compareStructures(documentDataBefore.getDocumentStructure(), documentDataAfter.getDocumentStructure())) { diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/AbstractTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/AbstractTest.java index 759f0e8..c0e2809 100644 --- 
a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/AbstractTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/AbstractTest.java @@ -20,7 +20,6 @@ import org.springframework.context.annotation.Import; import org.springframework.context.annotation.Primary; import org.springframework.core.io.ClassPathResource; import org.springframework.test.context.junit.jupiter.SpringExtension; -import org.xmlunit.builder.Input; import com.iqser.red.commons.jackson.ObjectMapperFactory; import com.iqser.red.storage.commons.service.StorageService; @@ -68,7 +67,7 @@ public abstract class AbstractTest { protected LayoutParsingRequest buildStandardLayoutParsingRequest() { return LayoutParsingRequest.builder() - .layoutParsingType(LayoutParsingType.REDACT_MANAGER) + .layoutParsingType(LayoutParsingType.REDACT_MANAGER_OLD) .originFileStorageId(ORIGIN_FILE_ID) .tablesFileStorageId(Optional.of(TABLE_FILE_ID)) .imagesFileStorageId(Optional.of(IMAGE_FILE_ID)) @@ -99,7 +98,7 @@ public abstract class AbstractTest { @SneakyThrows protected LayoutParsingRequest prepareStorage(String file) { - return prepareStorage(file, "cv_table_parsing_response/empty.json", "image_service_response/empty.json","visual_layout_parsing_response/empty.json"); + return prepareStorage(file, "cv_table_parsing_response/empty.json", "image_service_response/empty.json", "visual_layout_parsing_response/empty.json"); } @@ -107,7 +106,7 @@ public abstract class AbstractTest { protected LayoutParsingRequest prepareStorage(InputStream fileInputStream) { storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileInputStream); - return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER); + return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD); } @@ -140,6 +139,7 @@ public abstract class AbstractTest { return 
prepareStorage(pdfFileResource.getInputStream(), cvServiceResponseFileResource.getInputStream(), imageInfoFileResource.getInputStream()); } + @SneakyThrows protected LayoutParsingRequest prepareStorage(String file, String cvServiceResponseFile, String imageInfoFile, String visualLayoutParsingResponseFile) { @@ -148,9 +148,13 @@ public abstract class AbstractTest { ClassPathResource imageInfoFileResource = new ClassPathResource(imageInfoFile); ClassPathResource visualLayoutParsingResponseResource = new ClassPathResource(visualLayoutParsingResponseFile); - return prepareStorage(pdfFileResource.getInputStream(), cvServiceResponseFileResource.getInputStream(), imageInfoFileResource.getInputStream(), visualLayoutParsingResponseResource.getInputStream()); + return prepareStorage(pdfFileResource.getInputStream(), + cvServiceResponseFileResource.getInputStream(), + imageInfoFileResource.getInputStream(), + visualLayoutParsingResponseResource.getInputStream()); } + @SneakyThrows protected LayoutParsingRequest prepareStorage(InputStream fileStream, InputStream cvServiceResponseFileStream, InputStream imageInfoStream) { @@ -158,18 +162,22 @@ public abstract class AbstractTest { storageService.storeObject(TenantContext.getTenantId(), TABLE_FILE_ID, cvServiceResponseFileStream); storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream); - return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER); + return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD); } + @SneakyThrows - protected LayoutParsingRequest prepareStorage(InputStream fileStream, InputStream cvServiceResponseFileStream, InputStream imageInfoStream, InputStream visualLayoutParsingResponseFileStream) { + protected LayoutParsingRequest prepareStorage(InputStream fileStream, + InputStream cvServiceResponseFileStream, + InputStream imageInfoStream, + InputStream visualLayoutParsingResponseFileStream) { storageService.storeObject(TenantContext.getTenantId(), 
IMAGE_FILE_ID, imageInfoStream); storageService.storeObject(TenantContext.getTenantId(), TABLE_FILE_ID, cvServiceResponseFileStream); storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream); - storageService.storeObject(TenantContext.getTenantId(),VISUAL_LAYOUT_FILE,visualLayoutParsingResponseFileStream ); + storageService.storeObject(TenantContext.getTenantId(), VISUAL_LAYOUT_FILE, visualLayoutParsingResponseFileStream); - return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER); + return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java index 79db6bf..b070582 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java @@ -26,14 +26,19 @@ public abstract class BuildDocumentTest extends AbstractTest { File fileResource = new ClassPathResource(filename).getFile(); prepareStorage(filename); - return layoutParsingPipeline.parseLayout(layoutParsingType, fileResource, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse(), new VisualLayoutParsingResponse(),filename); + return layoutParsingPipeline.parseLayout(layoutParsingType, + fileResource, + layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), + new TableServiceResponse(), + new VisualLayoutParsingResponse(), + filename); } @SneakyThrows protected Document buildGraph(String filename) { - return buildGraph(filename, LayoutParsingType.REDACT_MANAGER); + return buildGraph(filename, 
LayoutParsingType.REDACT_MANAGER_OLD); } @@ -46,7 +51,7 @@ public abstract class BuildDocumentTest extends AbstractTest { prepareStorage(filename); } - return DocumentGraphFactory.buildDocumentGraph(parseLayout(filename, layoutParsingType)); + return DocumentGraphFactory.buildDocumentGraph(layoutParsingType, parseLayout(filename, layoutParsingType)); } } diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/brokenTableOnOcr_ocred 1.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/brokenTableOnOcr_ocred 1.pdf new file mode 100644 index 0000000..1a00988 Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/brokenTableOnOcr_ocred 1.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/new/wrongOrder 2.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/new/wrongOrder 2.pdf new file mode 100644 index 0000000..4834562 Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/new/wrongOrder 2.pdf differ diff --git a/layoutparser-service/viewer-doc-processor/build.gradle b/layoutparser-service/viewer-doc-processor/build.gradle index c6c5dbc..4a2f2c4 100644 --- a/layoutparser-service/viewer-doc-processor/build.gradle +++ b/layoutparser-service/viewer-doc-processor/build.gradle @@ -1,6 +1,6 @@ plugins { id("com.knecon.fforesight.java-conventions") - id("io.freefair.lombok") version "8.2.2" + id("io.freefair.lombok") version "8.4" } description = "Library for adding/removing layers in the viewer document"