diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/DocumentStructure.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/DocumentStructure.java index 523308c..4b26f52 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/DocumentStructure.java +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/DocumentStructure.java @@ -55,6 +55,13 @@ public class DocumentStructure implements Serializable { } + @Schema(description = "Object containing the extra field names, a duplicate paragraph has in its properties field.") + public static class DuplicateParagraphProperties implements Serializable { + + public static final String UNSORTED_TEXTBLOCK_ID = "utbid"; + + } + public static final String RECTANGLE_DELIMITER = ";"; diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingType.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingType.java index 7598d29..2aad26b 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingType.java +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingType.java @@ -1,7 +1,10 @@ package com.knecon.fforesight.service.layoutparser.internal.api.queue; public enum LayoutParsingType { - REDACT_MANAGER, + REDACT_MANAGER_OLD, TAAS, - DOCUMINE + DOCUMINE, + + DOCSTRUM, + 
REDACT_MANAGER } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index b3cdb6b..895f932 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -28,6 +28,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; @@ -43,6 +44,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.RulingClean import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService; import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService; import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService; +import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService; import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService; import 
com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService; import com.knecon.fforesight.service.layoutparser.processor.services.blockification.TaasBlockificationService; @@ -86,6 +88,7 @@ public class LayoutParsingPipeline { TaasBlockificationService taasBlockificationService; DocuMineBlockificationService docuMineBlockificationService; RedactManagerBlockificationService redactManagerBlockificationService; + DocstrumBlockificationService docstrumBlockificationService; LayoutGridService layoutGridService; ObservationRegistry observationRegistry; VisualLayoutParsingAdapter visualLayoutParsingAdapter; @@ -97,36 +100,29 @@ public class LayoutParsingPipeline { log.info("Starting layout parsing for {}", layoutParsingRequest.identifier()); File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId()); - File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()) - .orElse(originFile); + File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile); VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse(); - if (layoutParsingRequest.visualLayoutParsingFileId() - .isPresent()) { - visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId() - .get()); + if (layoutParsingRequest.visualLayoutParsingFileId().isPresent()) { + visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId().get()); } ImageServiceResponse imageServiceResponse = new ImageServiceResponse(); - if (layoutParsingRequest.imagesFileStorageId() - .isPresent()) { - imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId() - .get()); + if 
(layoutParsingRequest.imagesFileStorageId().isPresent()) { + imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get()); } TableServiceResponse tableServiceResponse = new TableServiceResponse(); - if (layoutParsingRequest.tablesFileStorageId() - .isPresent()) { - tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId() - .get()); + if (layoutParsingRequest.tablesFileStorageId().isPresent()) { + tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get()); } ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(), - originFile, - imageServiceResponse, - tableServiceResponse, - visualLayoutParsingResponse, - layoutParsingRequest.identifier().toString()); + originFile, + imageServiceResponse, + tableServiceResponse, + visualLayoutParsingResponse, + layoutParsingRequest.identifier().toString()); log.info("Building document graph for {}", layoutParsingRequest.identifier()); @@ -158,25 +154,25 @@ public class LayoutParsingPipeline { .numberOfPages(documentGraph.getNumberOfPages()) .duration(System.currentTimeMillis() - start) .message(format(""" - Layout parsing has finished in %.02f s. 
- identifiers: %s - %s - Files have been saved with Ids: - Structure: %s - Text: %s - Positions: %s - PageData: %s - Simplified Text: %s - Viewer Doc: %s""", - ((float) (System.currentTimeMillis() - start)) / 1000, - layoutParsingRequest.identifier(), - buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()), - layoutParsingRequest.structureFileStorageId(), - layoutParsingRequest.textBlockFileStorageId(), - layoutParsingRequest.positionBlockFileStorageId(), - layoutParsingRequest.pageFileStorageId(), - layoutParsingRequest.simplifiedTextStorageId(), - layoutParsingRequest.viewerDocumentStorageId())) + Layout parsing has finished in %.02f s. + identifiers: %s + %s + Files have been saved with Ids: + Structure: %s + Text: %s + Positions: %s + PageData: %s + Simplified Text: %s + Viewer Doc: %s""", + ((float) (System.currentTimeMillis() - start)) / 1000, + layoutParsingRequest.identifier(), + buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()), + layoutParsingRequest.structureFileStorageId(), + layoutParsingRequest.textBlockFileStorageId(), + layoutParsingRequest.positionBlockFileStorageId(), + layoutParsingRequest.pageFileStorageId(), + layoutParsingRequest.simplifiedTextStorageId(), + layoutParsingRequest.viewerDocumentStorageId())) .build(); } @@ -197,14 +193,14 @@ public class LayoutParsingPipeline { private String buildSemanticNodeCountMessage(int numberOfPages, Map semanticNodeCounts) { return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed", - numberOfPages, - semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION), - semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE), - semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 
0 : semanticNodeCounts.get(NodeType.PARAGRAPH), - semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE), - semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL), - semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER), - semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER)); + numberOfPages, + semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION), + semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE), + semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH), + semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE), + semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL), + semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER), + semanticNodeCounts.get(NodeType.FOOTER) == null ? 
0 : semanticNodeCounts.get(NodeType.FOOTER)); } @@ -260,10 +256,15 @@ public class LayoutParsingPipeline { PDRectangle cropbox = pdPage.getCropBox(); CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings()); + List emptyTableCells = tableExtractionService.findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical()); + ClassificationPage classificationPage = switch (layoutParsingType) { - case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); + case REDACT_MANAGER_OLD -> + redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); + case DOCSTRUM -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false); + case REDACT_MANAGER -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, true); }; classificationPage.setCleanRulings(cleanRulings); classificationPage.setRotation(rotation); @@ -289,7 +290,11 @@ public class LayoutParsingPipeline { } } - tableExtractionService.extractTables(cleanRulings, classificationPage); + tableExtractionService.extractTables(emptyTableCells, classificationPage); + + if (layoutParsingType == LayoutParsingType.DOCSTRUM || layoutParsingType == LayoutParsingType.REDACT_MANAGER) { + docstrumBlockificationService.combineBlocks(classificationPage); + } buildPageStatistics(classificationPage); increaseDocumentStatistics(classificationPage, classificationDocument); @@ -305,12 +310,28 @@ public class 
LayoutParsingPipeline { switch (layoutParsingType) { case TAAS -> taasClassificationService.classifyDocument(classificationDocument); case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument); + case REDACT_MANAGER_OLD -> redactManagerClassificationService.classifyDocument(classificationDocument); case REDACT_MANAGER -> redactManagerClassificationService.classifyDocument(classificationDocument); + case DOCSTRUM -> redactManagerClassificationService.classifyDocument(classificationDocument); } log.info("Building Sections for {}", identifier); + +// if (layoutParsingType == DOCSTRUM || layoutParsingType == DOCSTRUM_XY) { +// // Currently for debugging return paragraphs as sections, because there is a merging logic in sectionBuilder +// List sections = new ArrayList<>(); +// for (var page : classificationPages) { +// page.getTextBlocks().forEach(block -> { +// block.setPage(page.getPageNumber()); +// var section = sectionsBuilderService.buildTextBlock(List.of(block), "a"); +// sections.add(section); +// }); +// } +// classificationDocument.setSections(sections); +// } else { sectionsBuilderService.buildSections(classificationDocument); sectionsBuilderService.addImagesToSections(classificationDocument); +// } return classificationDocument; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/AbstractPageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/AbstractPageBlock.java index d113dfa..1f01f2f 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/AbstractPageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/AbstractPageBlock.java @@ -96,7 +96,7 @@ public abstract class AbstractPageBlock 
extends Rectangle { return this.minX - threshold <= apb.getMaxX() && this.maxX + threshold >= apb.getMinX(); } - + public abstract boolean isEmpty(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java index 23a3cd0..e31a27c 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java @@ -15,7 +15,6 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.No import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector; import lombok.AccessLevel; import lombok.AllArgsConstructor; @@ -52,7 +51,7 @@ public class Document implements GenericSemanticNode { public TextBlock getTextBlock() { if (textBlock == null) { - textBlock = streamTerminalTextBlocksInOrder().collect(new TextBlockCollector()); + textBlock = GenericSemanticNode.super.getTextBlock(); } return textBlock; } @@ -67,8 +66,7 @@ public class Document implements GenericSemanticNode { public Stream streamTerminalTextBlocksInOrder() { - return streamAllNodes().filter(SemanticNode::isLeaf) - .map(SemanticNode::getLeafTextBlock); + return streamAllNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getTextBlock); } diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/DuplicatedParagraph.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/DuplicatedParagraph.java new file mode 100644 index 0000000..93c2427 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/DuplicatedParagraph.java @@ -0,0 +1,34 @@ +package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes; + +import java.util.stream.Stream; + +import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector; + +import lombok.Data; +import lombok.EqualsAndHashCode; +import lombok.experimental.SuperBuilder; + +@Data +@EqualsAndHashCode(callSuper = true) +@SuperBuilder +public class DuplicatedParagraph extends Paragraph { + + TextBlock unsortedLeafTextBlock; + + + @Override + public TextBlock getTextBlock() { + + return Stream.of(leafTextBlock, unsortedLeafTextBlock).collect(new TextBlockCollector()); + + } + + + @Override + public String toString() { + + return super.toString(); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Paragraph.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Paragraph.java index 71010b6..dfcb4f9 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Paragraph.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Paragraph.java @@ -18,11 +18,12 @@ import lombok.Builder; import lombok.Data; import lombok.EqualsAndHashCode; import lombok.experimental.FieldDefaults; +import lombok.experimental.SuperBuilder; @Data -@Builder +@SuperBuilder @AllArgsConstructor -@FieldDefaults(level = AccessLevel.PRIVATE) +@FieldDefaults(level = AccessLevel.PROTECTED) public class Paragraph implements GenericSemanticNode { @Builder.Default diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Section.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Section.java index 6680c01..3a59884 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Section.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Section.java @@ -11,7 +11,6 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.No import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector; import lombok.AccessLevel; import lombok.AllArgsConstructor; @@ -62,9 +61,7 @@ public class Section implements GenericSemanticNode { public TextBlock getTextBlock() { if (textBlock == null) { - textBlock = streamAllSubNodes().filter(SemanticNode::isLeaf) - .map(SemanticNode::getLeafTextBlock) - .collect(new 
TextBlockCollector()); + textBlock = GenericSemanticNode.super.getTextBlock(); } return textBlock; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SemanticNode.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SemanticNode.java index e35a83e..32369e6 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SemanticNode.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SemanticNode.java @@ -20,6 +20,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.E import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector; import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; public interface SemanticNode { @@ -39,7 +40,10 @@ public interface SemanticNode { * * @return TextBlock containing all AtomicTextBlocks that are located under this Node. 
*/ - TextBlock getTextBlock(); + default TextBlock getTextBlock() { + + return streamAllSubNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getTextBlock).collect(new TextBlockCollector()); + } /** diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java index 5bed37b..18f3ef5 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java @@ -48,7 +48,6 @@ public class Table implements SemanticNode { @EqualsAndHashCode.Exclude Map bBoxCache; - /** * Streams all entities in this table, that appear in a row, which contains any of the provided strings. 
* @@ -332,9 +331,7 @@ public class Table implements SemanticNode { public TextBlock getTextBlock() { if (textBlock == null) { - textBlock = streamAllSubNodes().filter(SemanticNode::isLeaf) - .map(SemanticNode::getLeafTextBlock) - .collect(new TextBlockCollector()); + textBlock = SemanticNode.super.getTextBlock(); } return textBlock; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java index 0442af6..6323205 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java @@ -53,6 +53,9 @@ public class TextPageBlock extends AbstractPageBlock { @JsonIgnore private PageBlockType classification; + @JsonIgnore + private boolean toDuplicate; + @JsonIgnore public TextDirection getDir() { @@ -73,7 +76,7 @@ public class TextPageBlock extends AbstractPageBlock { return sequences.get(0).getPageWidth(); } - + public static TextPageBlock merge(List textBlocksToMerge) { @@ -82,6 +85,7 @@ public class TextPageBlock extends AbstractPageBlock { return fromTextPositionSequences(sequences); } + public static TextPageBlock fromTextPositionSequences(List wordBlockList) { TextPageBlock textBlock = null; @@ -133,7 +137,6 @@ public class TextPageBlock extends AbstractPageBlock { } - /** * Returns the minX value in pdf coordinate system. * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation. 
@@ -362,7 +365,22 @@ public class TextPageBlock extends AbstractPageBlock { } return TextNormalizationUtilities.removeHyphenLineBreaks(sb.toString()); + } + + public int getNumberOfLines() { + + int numberOfLines = 1; + TextPositionSequence previous = null; + for (TextPositionSequence word : sequences) { + if (previous != null) { + if (word.getMaxYDirAdj() - previous.getMaxYDirAdj() > word.getTextHeight()) { + numberOfLines++; + } + } + previous = word; + } + return numberOfLines; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java index 82829c6..dc77c45 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java @@ -55,6 +55,17 @@ public class TextPositionSequence implements CharSequence { } + public TextPositionSequence(List textPositions, int page) { + + this.textPositions = textPositions; + this.page = page; + this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir()); + this.rotation = textPositions.get(0).getRotation(); + this.pageHeight = textPositions.get(0).getPageHeight(); + this.pageWidth = textPositions.get(0).getPageWidth(); + } + + @Override public int length() { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java index 
04cc930..ae8be2a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java @@ -240,7 +240,7 @@ public class SectionsBuilderService { } - private ClassificationSection buildTextBlock(List wordBlockList, String lastHeadline) { + public ClassificationSection buildTextBlock(List wordBlockList, String lastHeadline) { ClassificationSection section = new ClassificationSection(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java index 2827153..4af2a04 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java @@ -14,7 +14,6 @@ import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; -import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle; import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; @@ -41,19 +40,18 
@@ public class TableExtractionService { *

* DirAdj (Text direction adjusted) values can not be used here. * - * @param cleanRulings The lines used to build the table. - * @param page Page object that contains textblocks and statistics. + * @param emptyCells The cells used to build the table. + * @param page Page object that contains textblocks and statistics. */ - public void extractTables(CleanRulings cleanRulings, ClassificationPage page) { + public void extractTables(List emptyCells, ClassificationPage page) { - List cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical()); // sort cells by size (height * width) ascending so that textBlocks are always assigned to the smallest cells that contain them - cells.sort(CELL_SIZE_COMPARATOR); + emptyCells.sort(CELL_SIZE_COMPARATOR); for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) { TextPageBlock textBlock = (TextPageBlock) abstractPageBlock; - for (Cell cell : cells) { + for (Cell cell : emptyCells) { if (cell.hasMinimumSize() && doesCellContainTextBlock(cell, textBlock)) { cell.addTextBlock(textBlock); break; @@ -61,7 +59,7 @@ public class TableExtractionService { } } - cells = new ArrayList<>(new HashSet<>(cells)); + var cells = new ArrayList<>(new HashSet<>(emptyCells)); DoubleComparisons.sort(cells, Rectangle.ILL_DEFINED_ORDER); List spreadsheetAreas = SpreadsheetFinder.findSpreadsheetsFromCells(cells); @@ -79,9 +77,7 @@ public class TableExtractionService { } } - var containedCellsWithText = containedCells.stream() - .filter(cell -> !cell.getTextBlocks().isEmpty()) - .toList(); + var containedCellsWithText = containedCells.stream().filter(cell -> !cell.getTextBlocks().isEmpty()).toList(); // verify if table would contain fewer cells with text than the threshold allows if (containedCellsWithText.size() >= MAX_TABLE_CONTAINED_CELLS_WITH_TEXT && checkIfTableCellsAreUniform(containedCells)) { @@ -101,11 +97,7 @@ public class TableExtractionService { if (position != -1) { page.getTextBlocks().add(position, table); - var 
toBeRemoved = table.getCells() - .stream() - .map(Cell::getTextBlocks) - .flatMap(List::stream) - .toList(); + var toBeRemoved = table.getCells().stream().map(Cell::getTextBlocks).flatMap(List::stream).toList(); // remove text blocks from the page that were also added with the table (from its contained cells) page.getTextBlocks().removeAll(toBeRemoved); } @@ -115,7 +107,7 @@ public class TableExtractionService { private boolean checkIfTableCellsAreUniform(List containedCells) { - if(containedCells.size() <= 2) { + if (containedCells.size() <= 2) { return true; } @@ -139,19 +131,13 @@ public class TableExtractionService { } double x0 = cell.getX(); double y0 = cell.getY(); - return (x >= x0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE - && y >= y0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE - && (x + w) <= x0 + cell.getWidth() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE - && (y + h) <= y0 + cell.getHeight() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE); + return (x >= x0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE && y >= y0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE && (x + w) <= x0 + cell.getWidth() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE && (y + h) <= y0 + cell.getHeight() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE); } public static List findCells(List horizontalRulingLines, List verticalRulingLines) { - return RectangularIntersectionFinder.find(horizontalRulingLines, verticalRulingLines) - .stream() - .map(Cell::new) - .collect(Collectors.toList()); + return RectangularIntersectionFinder.find(horizontalRulingLines, verticalRulingLines).stream().map(Cell::new).collect(Collectors.toList()); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java new file mode 100644 index 0000000..ccd3f94 --- /dev/null 
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java @@ -0,0 +1,408 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.blockification; + +import static java.util.stream.Collectors.toSet; + +import java.awt.geom.Point2D; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.HashSet; +import java.util.List; +import java.util.ListIterator; +import java.util.Set; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.DocstrumSegmentationService; +import com.knecon.fforesight.service.layoutparser.processor.utils.QuickSort; +import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil; +import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionSequenceComparator; + +import lombok.RequiredArgsConstructor; + +@SuppressWarnings("all") +@Service +@RequiredArgsConstructor +public class DocstrumBlockificationService { + + private final DocstrumSegmentationService docstrumSegmentationService; + + static 
final float THRESHOLD = 1f; + + + public ClassificationPage blockify(List textPositions, List cells, boolean xyOder) { + + // Underlined or strikethrough are also in rulings but we dont want to split blocks with them so we use cells. + List usedHorizonalRulings = new ArrayList<>(); + List usedVerticalRulings = new ArrayList<>(); + + cells.forEach(cell -> { + usedHorizonalRulings.add(new Ruling(new Point2D.Float(cell.x, cell.y), new Point2D.Float(cell.x + cell.width, cell.y))); + usedHorizonalRulings.add(new Ruling(new Point2D.Float(cell.x, cell.y + cell.height), new Point2D.Float(cell.x + cell.width, cell.y + cell.height))); + usedVerticalRulings.add(new Ruling(new Point2D.Float(cell.x, cell.y), new Point2D.Float(cell.x, cell.y + cell.height))); + usedVerticalRulings.add(new Ruling(new Point2D.Float(cell.x + cell.width, cell.y), new Point2D.Float(cell.x + cell.width, cell.y + cell.height))); + }); + + List abstractPageBlocks = new ArrayList<>(); + var zones = docstrumSegmentationService.segmentPage(textPositions, xyOder); + zones.forEach(zone -> { + + List textPositionSequences = new ArrayList<>(); + zone.getLines().forEach(line -> { + line.getWords().forEach(word -> { + textPositionSequences.add(new TextPositionSequence(word.getTextPositions(), word.getPage())); + }); + }); + + abstractPageBlocks.addAll(splitZonesAtRulings(textPositionSequences, usedHorizonalRulings, usedVerticalRulings)); +// abstractPageBlocks.add(buildTextBlock(textPositionSequences, 0)); + }); + + return new ClassificationPage(abstractPageBlocks); + } + + + public void combineBlocks(ClassificationPage page) { + + mergeZones(page.getTextBlocks()); + + TextPageBlock previous = new TextPageBlock(); + ListIterator itty = page.getTextBlocks().listIterator(); + while (itty.hasNext()) { + AbstractPageBlock block = itty.next(); + if (block instanceof TablePageBlock) { + continue; + } + TextPageBlock current = (TextPageBlock) block; + + if (previous != null && !previous.getSequences().isEmpty()) { + + 
if (current.getDir() == previous.getDir() // + && previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 // + && previous.intersectsY(current) // + && !hasBetween(current, previous, page.getTextBlocks()) // + && numberOfYIntersections(current, previous, page.getTextBlocks()) == 0) { + + previous.getSequences().addAll(current.getSequences()); + previous = buildTextBlock(previous.getSequences(), 0); + previous.setToDuplicate(true); + itty.remove(); + itty.previous(); + itty.set(previous); + itty.next(); + continue; + } + + if (current.getDir() == previous.getDir() && (previous.almostIntersects(current, 0, 0))) { + + previous.getSequences().addAll(current.getSequences()); + boolean toDuplicate = previous.isToDuplicate(); + previous = buildTextBlock(previous.getSequences(), 0); + previous.setToDuplicate(toDuplicate); + itty.remove(); + itty.previous(); + itty.set(previous); + itty.next(); + continue; + } + + if (current.getDir() == previous.getDir() // + && (Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) // + && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1) // + && !hasBetween(current, previous, page.getTextBlocks()) && numberOfYIntersections(current, previous, page.getTextBlocks()) <= 4) { + + previous.getSequences().addAll(current.getSequences()); + previous = buildTextBlock(previous.getSequences(), 0); + itty.remove(); + itty.previous(); + itty.set(previous); + itty.next(); + continue; + } + + if (current.getDir() == previous.getDir() // + && current.intersectsY(previous) // + && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1) // + && !hasBetween(current, previous, page.getTextBlocks()) // + && numberOfYIntersections(current, previous, page.getTextBlocks()) <= 0) { + 
previous.getSequences().addAll(current.getSequences()); + previous = buildTextBlock(previous.getSequences(), 0); + itty.remove(); + itty.previous(); + itty.set(previous); + itty.next(); + continue; + } + + } + previous = current; + } + + mergeZones(page.getTextBlocks()); + } + + + private boolean hasBetween(TextPageBlock block, TextPageBlock other, List allBlocks) { + + for (AbstractPageBlock current : allBlocks) { + + if (current == other || current == block) { + continue; + } + + if (other.intersectsY(current) && other.getMaxX() <= current.getMinX() && current.getMaxX() <= block.getMinX()) { + return true; + } + } + + return false; + } + + + private int numberOfYIntersections(TextPageBlock block, TextPageBlock other, List allBlocks) { + + double minY = Math.min(block.getMinY(), other.getMinY()); + double maxY = Math.min(block.getMaxY(), other.getMaxY()); + + int numberOfYIntersections = 0; + for (AbstractPageBlock current : allBlocks) { + + if (current == other || current == block) { + continue; + } + + if (minY <= current.getMaxY() && maxY >= current.getMinY()) { + numberOfYIntersections++; + } + } + + return numberOfYIntersections; + } + + + private void mergeZones(List zones) { + + ListIterator itty = zones.listIterator(); + Set toRemove = new HashSet<>(); + while (itty.hasNext()) { + AbstractPageBlock block = itty.next(); + if (block instanceof TablePageBlock) { + continue; + } + + TextPageBlock current = (TextPageBlock) block; + + if (current.isToDuplicate()) { + continue; + } + + for (int i = 0; i < zones.size(); i++) { + + if (toRemove.contains(zones.get(i))) { + continue; + } + if (zones.get(i) == current) { + continue; + } + if (zones.get(i) instanceof TablePageBlock) { + continue; + } + + TextPageBlock inner = (TextPageBlock) zones.get(i); + + if (inner.isToDuplicate()) { + continue; + } + + if (current.getDir() == inner.getDir() && current.almostIntersects(inner, 0, 0)) { + + current.getSequences().addAll(inner.getSequences()); + 
QuickSort.sort(current.getSequences(), new TextPositionSequenceComparator()); + current = buildTextBlock(current.getSequences(), 0); + toRemove.add(inner); + itty.set(current); + } + } + } + zones.removeAll(toRemove); + } + + + public List splitZonesAtRulings(List textPositions, List horizontalRulingLines, List verticalRulingLines) { + + int indexOnPage = 0; + List chunkWords = new ArrayList<>(); + List chunkBlockList = new ArrayList<>(); + + float minX = 1000, maxX = 0, minY = 1000, maxY = 0; + TextPositionSequence prev = null; + + for (TextPositionSequence word : textPositions) { + + boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines); + boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir()); + + if (prev != null && (splitByDir || isSplitByRuling)) { + + TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage); + indexOnPage++; + + chunkBlockList.add(cb1); + chunkWords = new ArrayList<>(); + + minX = 1000; + maxX = 0; + minY = 1000; + maxY = 0; + prev = null; + } + + chunkWords.add(word); + + prev = word; + if (word.getMinXDirAdj() < minX) { + minX = word.getMinXDirAdj(); + } + if (word.getMaxXDirAdj() > maxX) { + maxX = word.getMaxXDirAdj(); + } + if (word.getMinYDirAdj() < minY) { + minY = word.getMinYDirAdj(); + } + if (word.getMaxYDirAdj() > maxY) { + maxY = word.getMaxYDirAdj(); + } + } + + TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage); + if (cb1 != null) { + chunkBlockList.add(cb1); + } + + return chunkBlockList; + } + + + private boolean equalsWithThreshold(float f1, float f2) { + + return Math.abs(f1 - f2) < THRESHOLD; + } + + + private TextPageBlock buildTextBlock(List wordBlockList, int indexOnPage) { + + TextPageBlock textBlock = null; + + FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter(); + FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter(); + FloatFrequencyCounter spaceFrequencyCounter = new 
FloatFrequencyCounter(); + StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter(); + StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter(); + + for (TextPositionSequence wordBlock : wordBlockList) { + + lineHeightFrequencyCounter.add(wordBlock.getTextHeight()); + fontSizeFrequencyCounter.add(wordBlock.getFontSize()); + spaceFrequencyCounter.add(wordBlock.getSpaceWidth()); + fontFrequencyCounter.add(wordBlock.getFont()); + styleFrequencyCounter.add(wordBlock.getFontStyle()); + + if (textBlock == null) { + textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(), + wordBlock.getMaxXDirAdj(), + wordBlock.getMinYDirAdj(), + wordBlock.getMaxYDirAdj(), + wordBlockList, + wordBlock.getRotation()); + } else { + TextPageBlock spatialEntity = textBlock.union(wordBlock); + textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight()); + } + } + + if (textBlock != null) { + textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular()); + textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest()); + } + + if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) { + textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj)); + } + return textBlock; + } + + + private boolean isSplitByRuling(float minX, + float minY, + float maxX, + float maxY, + TextPositionSequence word, + List horizontalRulingLines, + List verticalRulingLines) { + + return isSplitByRuling(maxX, + minY, + word.getMinXDirAdj(), + 
word.getMinYDirAdj(), + verticalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()) // + || isSplitByRuling(minX, + minY, + word.getMinXDirAdj(), + word.getMaxYDirAdj(), + horizontalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()) // + || isSplitByRuling(maxX, + minY, + word.getMinXDirAdj(), + word.getMinYDirAdj(), + horizontalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()) // + || isSplitByRuling(minX, + minY, + word.getMinXDirAdj(), + word.getMaxYDirAdj(), + verticalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()); + } + + + private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List rulingLines, float dir, float pageWidth, float pageHeight) { + + for (Ruling ruling : rulingLines) { + var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight); + if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) { + return true; + } + } + return false; + } + + + private double round(float value, int decimalPoints) { + + var d = Math.pow(10, decimalPoints); + return Math.round(value * d) / d; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmentationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmentationService.java new file mode 100644 index 0000000..5fa3e01 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmentationService.java @@ -0,0 +1,59 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.docstrum; + +import java.util.ArrayList; +import java.util.List; 
+import java.util.stream.Collectors; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.LineBuilderService; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.NearestNeighbourService; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.ReadingOrderService; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.SpacingService; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.ZoneBuilderService; + +import lombok.RequiredArgsConstructor; + +@Service +@RequiredArgsConstructor +public class DocstrumSegmentationService { + + private final NearestNeighbourService nearestNeighbourService; + private final SpacingService spacingService; + private final LineBuilderService lineBuilderService; + private final ZoneBuilderService zoneBuilderService; + private final ReadingOrderService readingOrderService; + + + public List segmentPage(List textPositions, boolean xyOder) { + + List zones = new ArrayList<>(); + zones.addAll(computeZones(textPositions, TextDirection.ZERO)); + zones.addAll(computeZones(textPositions, TextDirection.QUARTER_CIRCLE)); + zones.addAll(computeZones(textPositions, TextDirection.HALF_CIRCLE)); + zones.addAll(computeZones(textPositions, TextDirection.THREE_QUARTER_CIRCLE)); + + return readingOrderService.resolve(zones, xyOder); + } + + + private List computeZones(List textPositions, TextDirection direction) { + + var positions = textPositions.stream().filter(t -> t.getDir() 
== direction).map(TextPositionSequence::getTextPositions).flatMap(List::stream).toList(); + + var characters = positions.stream().map(Character::new).collect(Collectors.toList()); + + nearestNeighbourService.findNearestNeighbors(characters); + + var characterSpacing = spacingService.computeCharacterSpacing(characters); + var lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20); + + var lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing); + return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/AngleFilter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/AngleFilter.java new file mode 100644 index 0000000..c7fd0a6 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/AngleFilter.java @@ -0,0 +1,25 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model; + +public class AngleFilter { + + protected double lowerAngle; + protected double upperAngle; + + + public AngleFilter(double lowerAngle, double upperAngle) { + + this.lowerAngle = lowerAngle < -Math.PI / 2 ? lowerAngle + Math.PI : lowerAngle; + this.upperAngle = upperAngle >= Math.PI / 2 ? 
upperAngle - Math.PI : upperAngle; + } + + + public boolean matches(Neighbor neighbor) { + + if (lowerAngle <= upperAngle) { + return lowerAngle <= neighbor.getAngle() && neighbor.getAngle() < upperAngle; + } else { + return lowerAngle <= neighbor.getAngle() || neighbor.getAngle() < upperAngle; + } + } + +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/BoundingBox.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/BoundingBox.java new file mode 100644 index 0000000..5215d6f --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/BoundingBox.java @@ -0,0 +1,48 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model; + +import java.awt.geom.Rectangle2D; + +import lombok.Data; + +@Data +public abstract class BoundingBox { + + private Rectangle2D bBox; + + + public double getX() { + + return bBox.getX(); + } + + + public double getY() { + + return bBox.getY(); + } + + + public double getWidth() { + + return bBox.getWidth(); + } + + + public double getHeight() { + + return bBox.getHeight(); + } + + + public double getArea() { + + return (bBox.getHeight() * bBox.getWidth()); + } + + + public boolean contains(Rectangle2D contained, double tolerance) { + + return bBox.getX() <= contained.getX() + tolerance && bBox.getY() <= contained.getY() + tolerance && bBox.getX() + bBox.getWidth() >= contained.getX() + contained.getWidth() - tolerance && bBox.getY() + bBox.getHeight() >= contained.getY() + contained.getHeight() - tolerance; + } + +} diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Character.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Character.java new file mode 100644 index 0000000..3e768ed --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Character.java @@ -0,0 +1,85 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; + +import lombok.Data; + +@Data +public class Character { + + private final double x; + private final double y; + private final RedTextPosition textPosition; + + private List neighbors = new ArrayList<>(); + + + public Character(RedTextPosition chunk) { + + this.x = chunk.getXDirAdj() + chunk.getWidthDirAdj() / 2; + this.y = chunk.getYDirAdj() + chunk.getHeightDir() / 2; + this.textPosition = chunk; + } + + + public double getHeight() { + + return textPosition.getHeightDir(); + } + + + public double distance(Character character) { + + double dx = getX() - character.getX(); + double dy = getY() - character.getY(); + return Math.sqrt(dx * dx + dy * dy); + } + + + public double horizontalDistance(Character character) { + + return Math.abs(getX() - character.getX()); + } + + + public double verticalDistance(Character character) { + + return Math.abs(getY() - character.getY()); + } + + + public double overlappingDistance(Character other) { + + double[] xs = new double[4]; + double s = Math.sin(-0); + double c = Math.cos(-0); + xs[0] = c * x - s * y; + xs[1] = c * (x + textPosition.getWidthDirAdj()) - s * (y + textPosition.getHeightDir()); + xs[2] = c * other.x - s * 
/**
 * Disjoint-set (union-find) structure over a fixed collection of elements.
 * Every element starts in its own set; {@link #union} merges sets; iterating yields one
 * {@link Set} view per remaining set.
 *
 * @param <E> element type; elements are used as {@link HashMap} keys
 */
public class DisjointSets<E> implements Iterable<Set<E>> {

    private final Map<E, Entry<E>> map = new HashMap<>();


    /**
     * Creates one singleton set per element of {@code collection}.
     *
     * @param collection the elements to manage
     */
    public DisjointSets(Collection<? extends E> collection) {

        for (E element : collection) {
            map.put(element, new Entry<>(element));
        }
    }


    /**
     * @return {@code true} if both elements currently belong to the same set
     * @throws NullPointerException if one of the elements was not part of the initial collection
     */
    public boolean areTogether(E e1, E e2) {

        return entryOf(e1).findRepresentative() == entryOf(e2).findRepresentative();
    }


    /**
     * Merges the sets of both elements; the smaller set is attached to the larger one.
     *
     * @throws NullPointerException if one of the elements was not part of the initial collection
     */
    public void union(E e1, E e2) {

        Entry<E> r1 = entryOf(e1).findRepresentative();
        Entry<E> r2 = entryOf(e2).findRepresentative();
        if (r1 != r2) {
            if (r1.size <= r2.size) {
                r2.mergeWith(r1);
            } else {
                r1.mergeWith(r2);
            }
        }
    }


    /** Looks up the entry of an element, failing with a descriptive message for unknown elements. */
    private Entry<E> entryOf(E element) {

        Entry<E> entry = map.get(element);
        if (entry == null) {
            throw new NullPointerException("Element is not part of this DisjointSets: " + element);
        }
        return entry;
    }


    /**
     * @return an iterator over the current sets; the returned sets are read-only views and
     * {@code remove()} is not supported
     */
    @Override
    public Iterator<Set<E>> iterator() {

        return new Iterator<>() {

            private final Iterator<Entry<E>> iterator = map.values().iterator();
            private Entry<E> nextRepresentative;

            {
                findNextRepresentative();
            }

            @Override
            public boolean hasNext() {

                return nextRepresentative != null;
            }


            @Override
            public Set<E> next() {

                if (nextRepresentative == null) {
                    throw new NoSuchElementException();
                }
                Set<E> result = nextRepresentative.asSet();
                findNextRepresentative();
                return result;
            }


            /** Advances to the next entry that is the representative (root) of its set. */
            private void findNextRepresentative() {

                while (iterator.hasNext()) {
                    Entry<E> candidate = iterator.next();
                    if (candidate.isRepresentative()) {
                        nextRepresentative = candidate;
                        return;
                    }
                }
                nextRepresentative = null;
            }


            @Override
            public void remove() {

                throw new UnsupportedOperationException();
            }

        };
    }


    /**
     * Node of the forest. Besides the parent pointer each node is part of a singly linked list
     * ({@code next}) that starts at the representative and enumerates all members of the set;
     * {@code size} and {@code last} are only meaningful on the representative.
     */
    private static class Entry<E> {

        private int size = 1;
        private final E value;
        private Entry<E> parent = this;
        private Entry<E> next;
        private Entry<E> last = this;


        Entry(E value) {

            this.value = value;
        }


        /** Attaches the set of {@code otherRepresentative} to this representative. */
        void mergeWith(Entry<E> otherRepresentative) {

            size += otherRepresentative.size;
            last.next = otherRepresentative;
            last = otherRepresentative.last;
            otherRepresentative.parent = this;
        }


        /** Finds the root of this entry and re-points every entry on the path directly to it. */
        Entry<E> findRepresentative() {

            Entry<E> representative = parent;
            while (representative.parent != representative) {
                representative = representative.parent;
            }
            for (Entry<E> entry = this; entry != representative; ) {
                Entry<E> nextEntry = entry.parent;
                entry.parent = representative;
                entry = nextEntry;
            }
            return representative;
        }


        boolean isRepresentative() {

            return parent == this;
        }


        /** @return a read-only view of all members of the set this entry belongs to */
        Set<E> asSet() {

            return new AbstractSet<E>() {

                @Override
                public Iterator<E> iterator() {

                    return new Iterator<E>() {

                        private Entry<E> nextEntry = findRepresentative();


                        @Override
                        public boolean hasNext() {

                            return nextEntry != null;
                        }


                        @Override
                        public E next() {

                            if (nextEntry == null) {
                                throw new NoSuchElementException();
                            }
                            E result = nextEntry.value;
                            nextEntry = nextEntry.next;
                            return result;
                        }


                        @Override
                        public void remove() {

                            throw new UnsupportedOperationException();
                        }

                    };
                }


                @Override
                public int size() {

                    return findRepresentative().size;
                }
            };
        }

    }

}
/**
 * Fixed-range histogram with equally wide bins that can be smoothed with a convolution kernel.
 * The covered range is widened by a small epsilon on both sides so that the minimum and maximum value
 * themselves fall into the first and last bin.
 */
public class Histogram {

    private static final double EPSILON = 1.0e-6;
    /** Two frequencies closer than this are treated as equal when measuring the width of the peak. */
    private static final double PEAK_PLATEAU_TOLERANCE = 0.0001;

    private final double min;
    private final double resolution;
    private double[] frequencies;


    /**
     * @param minValue   smallest value that will be added
     * @param maxValue   largest value that will be added
     * @param resolution requested bin width; the effective width is adjusted so that a whole number of
     *                   bins (at least one) covers the range
     */
    public Histogram(double minValue, double maxValue, double resolution) {

        int binCount = Math.max(1, (int) Math.round((maxValue - minValue) / resolution));
        double coveredRange = maxValue - minValue + 2 * EPSILON;

        this.min = minValue - EPSILON;
        this.resolution = coveredRange / binCount;
        this.frequencies = new double[binCount];
    }


    /**
     * Replaces the frequencies by their convolution with {@code kernel}. The kernel is centred on each
     * bin; kernel entries that would read outside the histogram are ignored.
     *
     * @param kernel the weights to apply
     */
    public void kernelSmooth(double[] kernel) {

        int binCount = frequencies.length;
        int centre = (kernel.length - 1) / 2;
        double[] smoothed = new double[binCount];

        for (int target = 0; target < binCount; target++) {
            double accumulated = 0.0;
            for (int k = 0; k < kernel.length; k++) {
                int source = target + k - centre;
                if (source >= 0 && source < binCount) {
                    accumulated += kernel[k] * frequencies[source];
                }
            }
            smoothed[target] = accumulated;
        }
        frequencies = smoothed;
    }


    /**
     * Creates a normalised Gaussian kernel with an odd number of entries.
     *
     * @param length       window length in value units
     * @param stdDeviation standard deviation in value units
     * @return kernel whose entries sum up to 1
     */
    public double[] createGaussianKernel(double length, double stdDeviation) {

        int radius = (int) Math.round(length / resolution) / 2;
        double[] kernel = new double[2 * radius + 1];

        double scaledDeviation = stdDeviation / resolution;
        double b = 2 * scaledDeviation * scaledDeviation;
        double a = 1 / Math.sqrt(Math.PI * b);

        double total = 0;
        for (int i = 0; i < kernel.length; i++) {
            int offset = i - radius;
            kernel[i] = a * Math.exp(-(offset * offset) / b);
            total += kernel[i];
        }
        for (int i = 0; i < kernel.length; i++) {
            kernel[i] /= total;
        }
        return kernel;
    }


    /** Smooths the histogram with a Gaussian kernel, see {@link #createGaussianKernel}. */
    public void gaussianSmooth(double windowLength, double stdDeviation) {

        double[] kernel = createGaussianKernel(windowLength, stdDeviation);
        kernelSmooth(kernel);
    }


    /**
     * Increments the bin containing {@code value}.
     *
     * @throws ArrayIndexOutOfBoundsException if the value lies outside the range given at construction
     */
    public void add(double value) {

        int bin = (int) ((value - min) / resolution);
        frequencies[bin] += 1.0;
    }


    /** @return the number of bins */
    public int getSize() {

        return frequencies.length;
    }


    /**
     * @return the value at the centre of the highest bin; if the bins directly after it have (almost)
     * the same frequency, the centre of that whole plateau
     */
    public double getPeakValue() {

        int peakStart = 0;
        for (int i = 1; i < frequencies.length; i++) {
            if (frequencies[i] > frequencies[peakStart]) {
                peakStart = i;
            }
        }

        int peakEnd = peakStart + 1;
        while (peakEnd < frequencies.length && Math.abs(frequencies[peakEnd] - frequencies[peakStart]) < PEAK_PLATEAU_TOLERANCE) {
            peakEnd++;
        }
        return ((double) peakStart + peakEnd) / 2 * resolution + min;
    }

}
com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model; + +import java.awt.geom.Rectangle2D; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; + +import lombok.Data; + +@Data +public class Line extends BoundingBox { + + private static final double WORD_DISTANCE_MULTIPLIER = 0.2; + + private final double x0; + private final double y0; + + private final double x1; + private final double y1; + + private final double height; + + private final List characters; + private final List words = new ArrayList<>(); + + + public Line(List characters, double wordSpacing) { + + this.characters = characters; + + if (characters.size() >= 2) { + // linear regression + double sx = 0.0; + double sxx = 0.0; + double sxy = 0.0; + double sy = 0.0; + for (Character character : characters) { + sx += character.getX(); + sxx += character.getX() * character.getX(); + sxy += character.getX() * character.getY(); + sy += character.getY(); + } + double b = (characters.size() * sxy - sx * sy) / (characters.size() * sxx - sx * sx); + double a = (sy - b * sx) / characters.size(); + + this.x0 = characters.get(0).getX(); + this.y0 = a + b * this.x0; + this.x1 = characters.get(characters.size() - 1).getX(); + this.y1 = a + b * this.x1; + } else { + Character character = characters.get(0); + double dx = character.getTextPosition().getWidthDirAdj() / 3; + double dy = dx * Math.tan(0); + this.x0 = character.getX() - dx; + this.x1 = character.getX() + dx; + this.y0 = character.getY() - dy; + this.y1 = character.getY() + dy; + } + height = computeHeight(); + computeWords(wordSpacing * WORD_DISTANCE_MULTIPLIER); + buildBBox(); + } + + + public double getAngle() { + + return Math.atan2(y1 - y0, x1 - x0); + } + + + public double getLength() { + + return Math.sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1)); + } + + + private double computeHeight() { + + double sum 
= 0.0; + for (Character component : characters) { + sum += component.getHeight(); + } + return sum / characters.size(); + } + + + public double angularDifference(Line j) { + + double diff = Math.abs(getAngle() - j.getAngle()); + if (diff <= Math.PI / 2) { + return diff; + } else { + return Math.PI - diff; + } + } + + + public double horizontalDistance(Line other) { + + double[] xs = new double[4]; + xs[0] = x0; + xs[1] = x1; + xs[2] = other.x0; + xs[3] = other.x1; + boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0]; + Arrays.sort(xs); + return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1); + } + + + public double verticalDistance(Line other) { + + double ym = (y0 + y1) / 2; + double yn = (other.y0 + other.y1) / 2; + return Math.abs(ym - yn) / Math.sqrt(1); + } + + + private void computeWords(double wordSpacing) { + + TextPositionSequence word = new TextPositionSequence(); + Character previous = null; + for (Character current : characters) { + if (previous != null) { + double dist = current.getTextPosition().getXDirAdj() - previous.getTextPosition().getXDirAdj() - previous.getTextPosition().getWidthDirAdj(); + if (dist > wordSpacing) { + words.add(word); + word = new TextPositionSequence(); + } + } + word.getTextPositions().add(current.getTextPosition()); + previous = current; + } + words.add(word); + } + + + private void buildBBox() { + + double minX = Double.POSITIVE_INFINITY; + double minY = Double.POSITIVE_INFINITY; + double maxX = Double.NEGATIVE_INFINITY; + double maxY = Double.NEGATIVE_INFINITY; + + for (Character character : characters) { + + minX = Math.min(minX, character.getTextPosition().getXDirAdj()); + minY = Math.min(minY, character.getTextPosition().getYDirAdj()); + maxX = Math.max(maxX, character.getTextPosition().getXDirAdj() + character.getTextPosition().getWidthDirAdj()); + maxY = Math.max(maxY, character.getTextPosition().getYDirAdj() + character.getTextPosition().getHeightDir()); + + } + + this.setBBox(new Rectangle2D.Double(minX, minY, 
maxX - minX, maxY - minY)); + } + + + public String toString() { + + StringBuilder sb = new StringBuilder(); + words.forEach(word -> sb.append(word.toString()).append(" ")); + return sb.toString().trim(); + } + +} + diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Neighbor.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Neighbor.java new file mode 100644 index 0000000..b2b4174 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Neighbor.java @@ -0,0 +1,36 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model; + +import lombok.Getter; + +public class Neighbor { + + @Getter + private final double distance; + @Getter + private final double angle; + private final Character originCharacter; + @Getter + private final Character character; + + + public Neighbor(Character neighbor, Character origin) { + + this.distance = neighbor.distance(origin); + this.angle = neighbor.angle(origin); + this.character = neighbor; + this.originCharacter = origin; + } + + + public double getHorizontalDistance() { + + return character.horizontalDistance(originCharacter); + } + + + public double getVerticalDistance() { + + return character.verticalDistance(originCharacter); + } + +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Zone.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Zone.java new file mode 100644 index 0000000..d5651d8 --- /dev/null +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Zone.java @@ -0,0 +1,51 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model; + +import java.awt.geom.Rectangle2D; +import java.util.Comparator; +import java.util.List; + +import lombok.Data; + +@Data +public class Zone extends BoundingBox { + + private List lines; + + + @SuppressWarnings("PMD.ConstructorCallsOverridableMethod") + public Zone(List lines) { + + lines.sort(Comparator.comparingDouble(Line::getY)); + this.lines = lines; + buildBBox(); + } + + + public void buildBBox() { + + double minX = Double.POSITIVE_INFINITY; + double minY = Double.POSITIVE_INFINITY; + double maxX = Double.NEGATIVE_INFINITY; + double maxY = Double.NEGATIVE_INFINITY; + + for (Line line : lines) { + + minX = Math.min(minX, line.getX()); + minY = Math.min(minY, line.getY()); + maxX = Math.max(maxX, line.getX() + line.getWidth()); + maxY = Math.max(maxY, line.getY() + line.getHeight()); + + } + + this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY)); + } + + + public String toString() { + + StringBuilder sb = new StringBuilder(); + lines.forEach(line -> sb.append(line.toString()).append("\n")); + return sb.toString().trim(); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/LineBuilderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/LineBuilderService.java new file mode 100644 index 0000000..c3d9ad2 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/LineBuilderService.java @@ -0,0 +1,51 @@ +package 
com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service; + +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.AngleFilter; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.DisjointSets; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Line; + +@Service +public class LineBuilderService { + + private static final double CHARACTER_SPACING_DISTANCE_MULTIPLIER = 3.5; + private static final double MAX_VERTICAL_CHARACTER_DISTANCE = 0.67; + private static final double ANGLE_TOLERANCE = Math.PI / 6; + + + public List buildLines(List characters, double characterSpacing, double lineSpacing) { + + double maxHorizontalDistance = characterSpacing * CHARACTER_SPACING_DISTANCE_MULTIPLIER; + double maxVerticalDistance = lineSpacing * MAX_VERTICAL_CHARACTER_DISTANCE; + + DisjointSets sets = new DisjointSets<>(characters); + AngleFilter filter = new AngleFilter(-ANGLE_TOLERANCE, ANGLE_TOLERANCE); + + characters.forEach(character -> { + character.getNeighbors().forEach(neighbor -> { + double x = neighbor.getHorizontalDistance() / maxHorizontalDistance; + double y = neighbor.getVerticalDistance() / maxVerticalDistance; + if (character.getTextPosition().getDir() == neighbor.getCharacter().getTextPosition().getDir() && filter.matches(neighbor) && Math.pow(x, 2) + Math.pow(y, + 2) <= 1) { + sets.union(character, neighbor.getCharacter()); + } + }); + }); + + List lines = new ArrayList<>(); + sets.forEach(group -> { + List lineCharacters = new ArrayList<>(group); + lineCharacters.sort(Comparator.comparingDouble(Character::getX)); + lines.add(new Line(lineCharacters, characterSpacing)); + }); + + return lines; + } + +} diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/NearestNeighbourService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/NearestNeighbourService.java new file mode 100644 index 0000000..1a3f6e2 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/NearestNeighbourService.java @@ -0,0 +1,78 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service; + +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Neighbor; + +@Service +public class NearestNeighbourService { + + private static final int NUMBER_OF_NEIGHBOURS = 8; + private static final double STEP = 16.0; + + + public void findNearestNeighbors(List characters) { + + if (characters.isEmpty()) { + return; + } + + characters.sort(Comparator.comparingDouble(Character::getX)); + + int maxNeighborCount = NUMBER_OF_NEIGHBOURS; + if (characters.size() <= NUMBER_OF_NEIGHBOURS) { + maxNeighborCount = characters.size() - 1; + } + + for (int i = 0; i < characters.size(); i++) { + + List candidates = new ArrayList<>(); + + int start = i; + int end = i + 1; + + double distance = Double.POSITIVE_INFINITY; + + for (double searchDistance = 0; searchDistance < distance; ) { + + searchDistance += STEP; + boolean newCandidatesFound = false; + + while (start > 0 && characters.get(i).getX() - characters.get(start - 1).getX() < searchDistance) { + start--; + candidates.add(new Neighbor(characters.get(start), characters.get(i))); + 
clearLeastDistant(candidates, maxNeighborCount); + newCandidatesFound = true; + } + + while (end < characters.size() && characters.get(end).getX() - characters.get(i).getX() < searchDistance) { + candidates.add(new Neighbor(characters.get(end), characters.get(i))); + clearLeastDistant(candidates, maxNeighborCount); + end++; + newCandidatesFound = true; + } + + if (newCandidatesFound && candidates.size() >= maxNeighborCount) { + distance = candidates.get(maxNeighborCount - 1).getDistance(); + } + } + clearLeastDistant(candidates, maxNeighborCount); + characters.get(i).setNeighbors(new ArrayList<>(candidates)); + } + } + + + private void clearLeastDistant(List candidates, int maxNeighborCount) { + + if (candidates.size() > maxNeighborCount) { + candidates.sort(Comparator.comparingDouble(Neighbor::getDistance)); + candidates.remove(candidates.remove(candidates.size() - 1)); + } + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ReadingOrderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ReadingOrderService.java new file mode 100644 index 0000000..3e1ab25 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ReadingOrderService.java @@ -0,0 +1,100 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service; + +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import java.util.ListIterator; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.BoundingBox; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone; +import 
com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils.DoubleUtils; + +@Service +public class ReadingOrderService { + + private static final double THRESHOLD = 5; + + + public List resolve(List zones, boolean xyOrder) { + + if (zones.isEmpty() || zones.size() == 1) { + return zones; + } + + if (xyOrder) { +// QuickSort.sort(zones, new ZoneComparator()); + zones.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) + .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, 0))); + return zones; + } + + return resolveMultiColumnReadingOder(zones); + } + + + private List resolveMultiColumnReadingOder(List zones) { + + // Simple reading order resolver for multi column page layout as described here : https://pub.towardsai.net/advanced-rag-02-unveiling-pdf-parsing-b84ae866344e + // TODO implement a more fancy reading order resolver see https://github.com/BobLd/DocumentLayoutAnalysis/blob/master/README.md#reading-order + + double minX = Double.POSITIVE_INFINITY; + double maxX = Double.NEGATIVE_INFINITY; + + for (Zone zone : zones) { + if (zone.getX() < minX) { + minX = zone.getX(); + } + if (zone.getX() + zone.getWidth() > maxX) { + maxX = zone.getX() + zone.getWidth(); + } + } + + double midLineXCoordinate = (minX + maxX) / 2; + + List leftOf = new ArrayList<>(); + List rightOf = new ArrayList<>(); + List middle = new ArrayList<>(); + for (Zone zone : zones) { + if (zone.getX() < midLineXCoordinate && zone.getX() + zone.getWidth() < midLineXCoordinate) { + leftOf.add(zone); + } else if (zone.getX() > midLineXCoordinate && zone.getX() + zone.getWidth() > midLineXCoordinate) { + rightOf.add(zone); + } else { + middle.add(zone); + } + } + + leftOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) + .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); + + 
rightOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) + .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); + + middle.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) + .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); + + List sortedZones = new ArrayList<>(); + sortedZones.addAll(leftOf); + sortedZones.addAll(rightOf); + + ListIterator itty = middle.listIterator(); + + while (itty.hasNext()) { + Zone current = itty.next(); + for (int i = 0; i < sortedZones.size(); i++) { + if (current.getY() < sortedZones.get(i).getY()) { + sortedZones.add(i, current); + itty.remove(); + break; + } + } + } + + sortedZones.addAll(middle); + + return sortedZones; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/SpacingService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/SpacingService.java new file mode 100644 index 0000000..2aab22d --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/SpacingService.java @@ -0,0 +1,56 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service; + +import java.util.List; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.AngleFilter; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Histogram; +import 
com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Neighbor; + +@Service +public class SpacingService { + + private static final double SPACING_HISTOGRAM_RESOLUTION = 0.5; + private static final double SPACING_HISTOGRAM_SMOOTHING_LENGTH = 2.5; + private static final double SPACING_HIST_SMOOTHING_STANDARD_DEVIATION = 0.5; + private static final double ANGLE_TOLERANCE = Math.PI / 6; + + + public double computeCharacterSpacing(List characters) { + + return computeSpacing(characters, 0); + } + + + public double computeLineSpacing(List characters) { + + return computeSpacing(characters, Math.PI / 2); + } + + + private double computeSpacing(List characters, double angle) { + + double maxDistance = Double.NEGATIVE_INFINITY; + + for (Character character : characters) { + for (Neighbor neighbor : character.getNeighbors()) { + maxDistance = Math.max(maxDistance, neighbor.getDistance()); + } + } + Histogram histogram = new Histogram(0, maxDistance, SPACING_HISTOGRAM_RESOLUTION); + AngleFilter angleFilter = new AngleFilter(angle - ANGLE_TOLERANCE, angle + ANGLE_TOLERANCE); + for (Character character : characters) { + for (Neighbor neighbor : character.getNeighbors()) { + if (angleFilter.matches(neighbor)) { + histogram.add(neighbor.getDistance()); + } + } + } + + histogram.gaussianSmooth(SPACING_HISTOGRAM_SMOOTHING_LENGTH, SPACING_HIST_SMOOTHING_STANDARD_DEVIATION); + return histogram.getPeakValue(); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ZoneBuilderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ZoneBuilderService.java new file mode 100644 index 0000000..e38eb0e --- /dev/null +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ZoneBuilderService.java @@ -0,0 +1,150 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service; + +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import java.util.Set; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.DisjointSets; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Line; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone; + +@Service +public class ZoneBuilderService { + + private static final double MIN_HORIZONTAL_DISTANCE_MULTIPLIER = -0.5; + private static final double MAX_VERTICAL_DISTANCE_MULTIPLIER = 1.2; + + private static final double MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER = -3.0; + + private static final double MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER = 0.5; + + private static final double MIN_LINE_SIZE_SCALE = 0.9; + + private static final double MAX_LINE_SIZE_SCALE = 2.5; + + private static final double ANGLE_TOLERANCE = Math.PI / 6; + + private static final int MAX_ZONES = 300; + + private static final double MAX_VERTICAL_MERGE_DISTANCE = 0.5; + + + public List buildZones(List lines, double characterSpacing, double lineSpacing) { + + double minHorizontalDistance = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER; + double maxVerticalDistance = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER; + double minHorizontalMergeDistance = characterSpacing * MIN_HORIZONTAL_MERGE_DISTANCE_MULTIPLIER; + double maxVerticalMergeDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE_MULTIPLIER; + + DisjointSets sets = new DisjointSets<>(lines); + + double meanHeight = 
calculateMeanHeight(lines); + + lines.forEach(outerLine -> // + lines.forEach(innerLine -> { + + double scale = Math.min(outerLine.getHeight(), innerLine.getHeight()) / meanHeight; + scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(scale, MAX_LINE_SIZE_SCALE)); + + if (!sets.areTogether(outerLine, innerLine) && outerLine.angularDifference(innerLine) <= ANGLE_TOLERANCE) { + + double horizontalDistance = outerLine.horizontalDistance(innerLine) / scale; + double verticalDistance = outerLine.verticalDistance(innerLine) / scale; + + if (minHorizontalDistance <= horizontalDistance && verticalDistance <= maxVerticalDistance // + || minHorizontalMergeDistance <= horizontalDistance && verticalDistance <= maxVerticalMergeDistance) { + sets.union(outerLine, innerLine); + } + } + })); + + List zones = new ArrayList<>(); + sets.forEach(group -> { + zones.add(new Zone(new ArrayList<>(group))); + }); + + if (zones.size() > MAX_ZONES) { + List oneZoneLines = new ArrayList<>(); + for (Zone zone : zones) { + oneZoneLines.addAll(zone.getLines()); + } + return List.of(mergeLinesInZone(oneZoneLines, characterSpacing, lineSpacing)); + } + + return zones; + } + + + private double calculateMeanHeight(List lines) { + + double meanHeight = 0.0; + double weights = 0.0; + for (Line line : lines) { + double weight = line.getLength(); + meanHeight += line.getHeight() * weight; + weights += weight; + } + meanHeight /= weights; + return meanHeight; + } + + + private Zone mergeLinesInZone(List lines, double characterSpacing, double lineSpacing) { + + double maxHorizontalDistance = 0; + double minVerticalDistance = 0; + double maxVerticalDistance = lineSpacing * MAX_VERTICAL_MERGE_DISTANCE; + + DisjointSets sets = new DisjointSets<>(lines); + + lines.forEach(outer -> { + + lines.forEach(inner -> { + if (inner != outer) { + + double horizontalDistance = outer.horizontalDistance(inner); + double verticalDistance = outer.verticalDistance(inner); + + if (horizontalDistance <= maxHorizontalDistance && 
minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance) { + sets.union(outer, inner); + } else if (minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance && Math.abs(horizontalDistance - Math.min(outer.getLength(), + inner.getLength())) < 0.1) { + boolean characterOverlap = false; + int overlappingCount = 0; + for (Character outerCharacter : outer.getCharacters()) { + for (Character innerCharacter : inner.getCharacters()) { + double characterOverlapDistance = outerCharacter.overlappingDistance(innerCharacter); + if (characterOverlapDistance > 2) { + characterOverlap = true; + } + if (characterOverlapDistance > 0) { + overlappingCount++; + } + } + } + if (!characterOverlap && overlappingCount <= 2) { + sets.union(outer, inner); + } + } + } + }); + }); + + List outputZone = new ArrayList<>(); + for (Set group : sets) { + List components = new ArrayList<>(); + for (Line line : group) { + components.addAll(line.getCharacters()); + } + components.sort(Comparator.comparingDouble(Character::getX)); + + outputZone.add(new Line(components, characterSpacing)); + } + return new Zone(outputZone); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/utils/DoubleUtils.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/utils/DoubleUtils.java new file mode 100644 index 0000000..d762cf0 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/utils/DoubleUtils.java @@ -0,0 +1,15 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils; + +public class DoubleUtils { + + public static int compareDouble(double d1, double d2, double precision) { + + if (Double.isNaN(d1) || Double.isNaN(d2)) { + return 
Double.compare(d1, d2); + } + long i1 = Math.round(d1 / (precision == 0 ? 1 : precision)); + long i2 = Math.round(d2 / (precision == 0 ? 1 : precision)); + return Long.compare(i1, i2); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java index eaf5bf2..8859d93 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java @@ -13,6 +13,7 @@ import java.util.List; import java.util.Map; import java.util.NoSuchElementException; import java.util.Set; +import java.util.stream.Collectors; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; @@ -22,6 +23,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Header; @@ -77,6 +79,8 @@ public class 
DocumentGraphFactory { GenericSemanticNode node; if (originalTextBlock.isHeadline()) { node = Headline.builder().documentTree(context.getDocumentTree()).build(); + } else if (originalTextBlock.isToDuplicate()) { + node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree()).build(); } else { node = Paragraph.builder().documentTree(context.getDocumentTree()).build(); } @@ -87,6 +91,14 @@ public class DocumentGraphFactory { textBlocks.add(originalTextBlock); textBlocks.addAll(textBlocksToMerge); AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), node, context, page); + + if (node instanceof DuplicatedParagraph duplicatedParagraph) { + AtomicTextBlock unsortedTextBlock = context.textBlockFactory.buildAtomicTextBlock(textBlocks.stream() + .flatMap(tb -> tb.getSequences().stream()) + .collect(Collectors.toList()), node, context, page); + duplicatedParagraph.setUnsortedLeafTextBlock(unsortedTextBlock); + } + List treeId = context.documentTree.createNewChildEntryAndReturnId(parentNode, node); node.setLeafTextBlock(textBlock); node.setTreeId(treeId); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java index 7bd82e2..93e1e57 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java @@ -11,12 +11,12 @@ import java.util.Map; import java.util.Set; import 
com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; -import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section; +import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.utils.TableMergingUtility; import lombok.experimental.UtilityClass; @@ -171,6 +171,7 @@ public class SectionNodeFactory { .filter(abstractTextContainer -> abstractTextContainer.intersectsY(atc)) .map(abstractTextContainer -> (TextPageBlock) abstractTextContainer) .filter(abstractTextContainer -> abstractTextContainer.getDir() == atc.getDir()) + .filter(abstractTextContainer -> !abstractTextContainer.isToDuplicate()) .toList(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/DocumentDataMapper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/DocumentDataMapper.java index ac42f15..b4b20d8 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/DocumentDataMapper.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/DocumentDataMapper.java @@ -8,8 +8,6 @@ import java.util.Map; import java.util.Set; import java.util.stream.Collectors; -import javax.xml.parsers.DocumentBuilder; - import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData; @@ -18,6 +16,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.Do import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table; @@ -33,27 +32,20 @@ public class DocumentDataMapper { public DocumentData toDocumentData(Document document) { List documentTextData = document.streamTerminalTextBlocksInOrder() - .flatMap(textBlock -> textBlock.getAtomicTextBlocks() - .stream()) + .flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream()) .distinct() .map(DocumentDataMapper::toAtomicTextBlockData) .toList(); List atomicPositionBlockData = document.streamTerminalTextBlocksInOrder() - .flatMap(textBlock -> textBlock.getAtomicTextBlocks() - .stream()) + .flatMap(textBlock -> textBlock.getAtomicTextBlocks().stream()) .distinct() .map(DocumentDataMapper::toAtomicPositionBlockData) .toList(); - Set nonEmptyTextBlocks = 
documentTextData.stream() - .mapToLong(DocumentTextData::getId).boxed() - .collect(Collectors.toSet()); + Set nonEmptyTextBlocks = documentTextData.stream().mapToLong(DocumentTextData::getId).boxed().collect(Collectors.toSet()); - List documentPageData = document.getPages() - .stream() - .map(DocumentDataMapper::toPageData) - .toList(); + List documentPageData = document.getPages().stream().map(DocumentDataMapper::toPageData).toList(); DocumentStructure tableOfContentsData = toDocumentTreeData(document.getDocumentTree()); return DocumentData.builder() .documentTextData(documentTextData.toArray(new DocumentTextData[0])) @@ -84,22 +76,17 @@ public class DocumentDataMapper { case TABLE -> PropertiesMapper.buildTableProperties((Table) entry.getNode()); case TABLE_CELL -> PropertiesMapper.buildTableCellProperties((TableCell) entry.getNode()); case IMAGE -> PropertiesMapper.buildImageProperties((Image) entry.getNode()); + case PARAGRAPH -> + entry.getNode() instanceof DuplicatedParagraph duplicatedParagraph ? 
PropertiesMapper.buildDuplicateParagraphProperties(duplicatedParagraph) : new HashMap<>(); default -> new HashMap<>(); }; DocumentStructure.EntryData.EntryDataBuilder documentBuilder = DocumentStructure.EntryData.builder() .treeId(toPrimitiveIntArray(entry.getTreeId())) - .children(entry.getChildren() - .stream() - .map(DocumentDataMapper::toEntryData) - .toList()) + .children(entry.getChildren().stream().map(DocumentDataMapper::toEntryData).toList()) .type(entry.getType()) .atomicBlockIds(atomicTextBlocks) - .pageNumbers(entry.getNode().getPages() - .stream() - .map(Page::getNumber) - .map(Integer::longValue) - .toArray(Long[]::new)) + .pageNumbers(entry.getNode().getPages().stream().map(Page::getNumber).map(Integer::longValue).toArray(Long[]::new)) .properties(properties); if (entry.getNode() != null) { documentBuilder.engines(entry.getNode().getEngines()); @@ -112,10 +99,7 @@ public class DocumentDataMapper { private Long[] toAtomicTextBlockIds(TextBlock textBlock) { - return textBlock.getAtomicTextBlocks() - .stream() - .map(AtomicTextBlock::getId) - .toArray(Long[]::new); + return textBlock.getAtomicTextBlocks().stream().map(AtomicTextBlock::getId).toArray(Long[]::new); } @@ -167,9 +151,7 @@ public class DocumentDataMapper { private int[] toPrimitiveIntArray(List list) { - return list.stream() - .mapToInt(Integer::intValue) - .toArray(); + return list.stream().mapToInt(Integer::intValue).toArray(); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/DocumentGraphMapper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/DocumentGraphMapper.java index c51f9ec..a53c6d8 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/DocumentGraphMapper.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/DocumentGraphMapper.java @@ -7,13 +7,14 @@ import java.util.List; import java.util.Map; import java.util.NoSuchElementException; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData; import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Header; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline; @@ -61,7 +62,7 @@ public class DocumentGraphMapper { SemanticNode node = switch (entryData.getType()) { case SECTION -> buildSection(context); - case PARAGRAPH -> buildParagraph(context); + case PARAGRAPH -> buildParagraph(context, entryData.getProperties()); case HEADLINE -> buildHeadline(context); case HEADER -> buildHeader(context); case FOOTER -> buildFooter(context); @@ -140,7 +141,17 @@ public class 
DocumentGraphMapper { } - private Paragraph buildParagraph(Context context) { + private Paragraph buildParagraph(Context context, Map properties) { + + if (PropertiesMapper.isDuplicateParagraph(properties)) { + + DuplicatedParagraph duplicatedParagraph = DuplicatedParagraph.builder().documentTree(context.documentTree).build(); + + Long[] unsortedTextblockIds = PropertiesMapper.getUnsortedTextblockIds(properties); + duplicatedParagraph.setUnsortedLeafTextBlock(toTextBlock(unsortedTextblockIds, context, duplicatedParagraph)); + return duplicatedParagraph; + + } return Paragraph.builder().documentTree(context.documentTree).build(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/PropertiesMapper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/PropertiesMapper.java index 329bd40..f4ebbd5 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/PropertiesMapper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/mapper/PropertiesMapper.java @@ -1,17 +1,19 @@ package com.knecon.fforesight.service.layoutparser.processor.services.mapper; import java.awt.geom.Rectangle2D; -import java.util.Collections; +import java.util.Arrays; import java.util.HashMap; import java.util.Locale; import java.util.Map; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType; -import 
com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; public class PropertiesMapper { @@ -76,6 +78,32 @@ public class PropertiesMapper { } + public static Map buildDuplicateParagraphProperties(DuplicatedParagraph duplicatedParagraph) { + + Map properties = new HashMap<>(); + properties.put(DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID, Arrays.toString(toAtomicTextBlockIds(duplicatedParagraph.getUnsortedLeafTextBlock()))); + return properties; + } + + + public static boolean isDuplicateParagraph(Map properties) { + + return properties.containsKey(DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID); + } + + + public static Long[] getUnsortedTextblockIds(Map properties) { + + return toLongArray(properties.get(DocumentStructure.DuplicateParagraphProperties.UNSORTED_TEXTBLOCK_ID)); + } + + + public static Long[] toLongArray(String ids) { + + return Arrays.stream(ids.substring(1, ids.length() - 1).trim().split(",")).map(Long::valueOf).toArray(Long[]::new); + } + + private static ImageType parseImageType(String imageType) { return switch (imageType) { @@ -101,4 +129,10 @@ public class PropertiesMapper { rectangle2D.getHeight()); } + + private static Long[] toAtomicTextBlockIds(TextBlock textBlock) { + + return textBlock.getAtomicTextBlocks().stream().map(AtomicTextBlock::getId).toArray(Long[]::new); + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java index 09a8eb2..18e5a5a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java @@ -237,8 +237,13 @@ public class PDFLinesTextStripper extends PDFTextStripper { int startIndex = 0; RedTextPosition previous = null; + float direction = -1; for (int i = 0; i <= textPositions.size() - 1; i++) { + if (direction == -1) { + direction = textPositions.get(i).getDir(); + } + if (!textPositionSequences.isEmpty()) { previous = textPositionSequences.get(textPositionSequences.size() - 1) .getTextPositions() @@ -250,6 +255,13 @@ public class PDFLinesTextStripper extends PDFTextStripper { continue; } + if (textPositions.get(i).getDir() != direction && startIndex != i) { + List sublist = textPositions.subList(startIndex, i); + textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart)); + startIndex = i; + direction = textPositions.get(i).getDir(); + } + // Strange but sometimes this is happening, for example: Metolachlor2.pdf if (checkIfCurrentPositionIsToTheRightOfPreviousPosition(i, textPositions)) { List sublist = textPositions.subList(startIndex, i); @@ -329,6 +341,7 @@ public class PDFLinesTextStripper extends PDFTextStripper { .getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < maximumGapSize; } + @Override public String getText(PDDocument doc) throws IOException { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java index e17a8b1..d357614 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/LayoutGridService.java @@ -20,6 +20,7 @@ import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; +import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table; @@ -53,6 +54,8 @@ public class LayoutGridService { static Color INNER_LINES_COLOR = new Color(255, 175, 175); static Color PARAGRAPH_COLOR = new Color(70, 130, 180); + + static Color DUPLICATE_PARAGRAPH_COLOR = new Color(70, 180, 101); static Color TABLE_COLOR = new Color(102, 205, 170); static Color SECTION_COLOR = new Color(50, 50, 50); static Color HEADLINE_COLOR = new Color(162, 56, 56); @@ -100,6 +103,11 @@ public class LayoutGridService { case IMAGE -> IMAGE_COLOR; default -> null; }; + + if (semanticNode instanceof DuplicatedParagraph) { + color = DUPLICATE_PARAGRAPH_COLOR; + } + if (isNotSectionOrTableCellOrDocument(semanticNode)) { addAsRectangle(semanticNode, layoutGrid, color); } diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java index 53e8c29..9927685 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java @@ -3,7 +3,6 @@ package com.knecon.fforesight.service.layoutparser.processor.utils; import java.util.List; import java.util.stream.Collectors; - import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java index 4ea6204..64d1c98 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java @@ -95,7 +95,7 @@ public class HeadlinesGoldStandardIntegrationTest { goldStandardLog.getRedactionLogEntry().removeIf(r -> !r.isRedacted() || r.getChanges().get(r.getChanges().size() - 1).getType().equals(ChangeType.REMOVED)); goldStandardLog.getRedactionLogEntry().forEach(e -> goldStandardHeadlines.add(new 
Headline(e.getPositions().get(0).getPage(), e.getValue()))); - Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, + Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD, pdfFileResource.getFile(), new ImageServiceResponse(), new TableServiceResponse(), diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java index 0751be3..3af1376 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java @@ -26,7 +26,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest { public void testLayoutParserEndToEnd() { prepareStorage("files/bdr/Wie weiter bei Kristeneinrichtungen.pdf"); - LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER); + LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD); LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest); Arrays.stream(finishedEvent.message().split("\n")).forEach(log::info); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java index 
f5bf3a2..6f8de95 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java @@ -55,7 +55,7 @@ public class DocumentGraphJsonWritingTest extends BuildDocumentTest { @SneakyThrows private void writeJsons(Path filename) { - Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, + Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD, filename.toFile(), new ImageServiceResponse(), new TableServiceResponse(), diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index 9511cdd..7f85f01 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -26,7 +26,7 @@ public class ViewerDocumentTest extends BuildDocumentTest { @SneakyThrows public void testViewerDocument() { - String fileName = "files/SinglePages/T5 VV-640252-Page16.pdf"; + String fileName = "files/new/270 rotated text on non rotated pages.pdf"; String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; var documentFile = new ClassPathResource(fileName).getFile(); @@ -54,10 +54,11 @@ public class ViewerDocumentTest extends BuildDocumentTest { var documentFile = new 
ClassPathResource(fileName).getFile(); var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE, - documentFile, - new ImageServiceResponse(), - tableResponse, - new VisualLayoutParsingResponse(),Path.of(fileName).getFileName().toFile().toString()); + documentFile, + new ImageServiceResponse(), + tableResponse, + new VisualLayoutParsingResponse(), + Path.of(fileName).getFileName().toFile().toString()); ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null); LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService); Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java index f2a4b87..8db6fdc 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java @@ -56,12 +56,12 @@ public class PdfSegmentationServiceTest extends AbstractTest { @SneakyThrows public ClassificationDocument buildClassificationDocument(File originDocument, TableServiceResponse tableServiceResponse) { - ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, - originDocument, - new ImageServiceResponse(), - tableServiceResponse, - new VisualLayoutParsingResponse(), - "document"); + ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD, + originDocument, + new 
ImageServiceResponse(), + tableServiceResponse, + new VisualLayoutParsingResponse(), + "document"); redactManagerClassificationService.classifyDocument(classificationDocument); @@ -112,16 +112,8 @@ public class PdfSegmentationServiceTest extends AbstractTest { var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class); ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile(), tableServiceResponse); - assertThat(document.getSections() - .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .collect(Collectors.toList())).isNotEmpty(); - var tables = document.getSections() - .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .toList(); + assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty(); + var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList(); // Quality of the table parsing is not good, because the file is rotated at scanning. // We only asset that the table border is not the page border. 
@@ -143,12 +135,12 @@ public class PdfSegmentationServiceTest extends AbstractTest { imageServiceResponse.getData() .forEach(imageMetadata -> images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>()) .add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(), - imageMetadata.getPosition().getY1(), - imageMetadata.getGeometry().getWidth(), - imageMetadata.getGeometry().getHeight()), - ImageType.valueOf(imageMetadata.getClassification().getLabel().toUpperCase(Locale.ROOT)), - imageMetadata.isAlpha(), - imageMetadata.getPosition().getPageNumber()))); + imageMetadata.getPosition().getY1(), + imageMetadata.getGeometry().getWidth(), + imageMetadata.getGeometry().getHeight()), + ImageType.valueOf(imageMetadata.getClassification().getLabel().toUpperCase(Locale.ROOT)), + imageMetadata.isAlpha(), + imageMetadata.getPosition().getPageNumber()))); System.out.println("object"); } @@ -160,22 +152,11 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Spanning Cells.pdf"); ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); - assertThat(document.getSections() - .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .collect(Collectors.toList())).isNotEmpty(); - TablePageBlock table = document.getSections() - .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .toList() - .get(0); + assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty(); + TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0); assertThat(table.getColCount()).isEqualTo(6); assertThat(table.getRowCount()).isEqualTo(13); - assertThat(table.getRows() - .stream() - .mapToInt(List::size).sum()).isEqualTo(6 * 13); + 
assertThat(table.getRows().stream().mapToInt(List::size).sum()).isEqualTo(6 * 13); } @@ -185,37 +166,15 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Table.pdf"); ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); - assertThat(document.getSections() - .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .collect(Collectors.toList())).isNotEmpty(); - TablePageBlock firstTable = document.getSections() - .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .toList() - .get(0); + assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty(); + TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0); assertThat(firstTable.getColCount()).isEqualTo(8); assertThat(firstTable.getRowCount()).isEqualTo(1); - TablePageBlock secondTable = document.getSections() - .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .toList() - .get(1); + TablePageBlock secondTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(1); assertThat(secondTable.getColCount()).isEqualTo(8); assertThat(secondTable.getRowCount()).isEqualTo(2); - List> firstTableHeaderCells = firstTable.getRows() - .get(0) - .stream() - .map(Collections::singletonList) - .collect(Collectors.toList()); - assertThat(secondTable.getRows() - .stream() - .allMatch(row -> row.stream() - .map(Cell::getHeaderCells) - .toList().equals(firstTableHeaderCells))).isTrue(); + List> firstTableHeaderCells = firstTable.getRows().get(0).stream().map(Collections::singletonList).collect(Collectors.toList()); + assertThat(secondTable.getRows().stream().allMatch(row -> 
row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue(); } @@ -225,37 +184,15 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Multi Page Table.pdf"); ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); - assertThat(document.getSections() - .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .collect(Collectors.toList())).isNotEmpty(); - TablePageBlock firstTable = document.getSections() - .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .toList() - .get(0); + assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty(); + TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0); assertThat(firstTable.getColCount()).isEqualTo(9); assertThat(firstTable.getRowCount()).isEqualTo(5); - TablePageBlock secondTable = document.getSections() - .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .toList() - .get(1); + TablePageBlock secondTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(1); assertThat(secondTable.getColCount()).isEqualTo(9); assertThat(secondTable.getRowCount()).isEqualTo(6); - List> firstTableHeaderCells = firstTable.getRows() - .get(firstTable.getRowCount() - 1) - .stream() - .map(Cell::getHeaderCells) - .collect(Collectors.toList()); - assertThat(secondTable.getRows() - .stream() - .allMatch(row -> row.stream() - .map(Cell::getHeaderCells) - .toList().equals(firstTableHeaderCells))).isTrue(); + List> firstTableHeaderCells = firstTable.getRows().get(firstTable.getRowCount() - 1).stream().map(Cell::getHeaderCells).collect(Collectors.toList()); + assertThat(secondTable.getRows().stream().allMatch(row -> 
row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue(); } @@ -265,37 +202,15 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Rotated Table Headers.pdf"); ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); - assertThat(document.getSections() - .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .collect(Collectors.toList())).isNotEmpty(); - TablePageBlock firstTable = document.getSections() - .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .toList() - .get(0); + assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty(); + TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0); assertThat(firstTable.getColCount()).isEqualTo(8); assertThat(firstTable.getRowCount()).isEqualTo(1); - TablePageBlock secondTable = document.getSections() - .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .toList() - .get(1); + TablePageBlock secondTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(1); assertThat(secondTable.getColCount()).isEqualTo(8); assertThat(secondTable.getRowCount()).isEqualTo(6); - List> firstTableHeaderCells = firstTable.getRows() - .get(0) - .stream() - .map(Collections::singletonList) - .collect(Collectors.toList()); - assertThat(secondTable.getRows() - .stream() - .allMatch(row -> row.stream() - .map(Cell::getHeaderCells) - .toList().equals(firstTableHeaderCells))).isTrue(); + List> firstTableHeaderCells = firstTable.getRows().get(0).stream().map(Collections::singletonList).collect(Collectors.toList()); + assertThat(secondTable.getRows().stream().allMatch(row -> 
row.stream().map(Cell::getHeaderCells).toList().equals(firstTableHeaderCells))).isTrue(); } @@ -345,30 +260,29 @@ public class PdfSegmentationServiceTest extends AbstractTest { validateTable(document, 0, 8, 8, 0, 0); List> values = Arrays.asList(Arrays.asList("Annex point Reference within DAR/RAR", - "Author, date", - "Study title", - "Analytical method Author, date, No.", - "Technique, LOQ of the method, validated working range", - "Method meets analytical validation criteria", - "Remarks (in case validation criteria are not met)", - "Acceptability of the method"), - Arrays.asList( - "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", - "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", - "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", - "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", - "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", - "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", - "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", - "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"), - Arrays.asList("CA 7.1.2.1.1 DAR (2009)", - "Evans P.G. 2001 TMJ4569B, VV-323245", - "Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom", - "Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 
2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845", - "LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02 mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD", - "Y", - "N/A", - "Y")); + "Author, date", + "Study title", + "Analytical method Author, date, No.", + "Technique, LOQ of the method, validated working range", + "Method meets analytical validation criteria", + "Remarks (in case validation criteria are not met)", + "Acceptability of the method"), + Arrays.asList("Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", + "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"), + Arrays.asList("CA 7.1.2.1.1 DAR (2009)", + "Evans P.G. 2001 TMJ4569B, VV-323245", + "Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom", + "Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 
1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845", + "LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02 mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD", + "Y", + "N/A", + "Y")); validateTable(document, 0, values); @@ -757,11 +671,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { @SneakyThrows private void toHtml(ClassificationDocument document, String filename) { - var tables = document.getSections() - .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .toList(); + var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList(); StringBuilder sb = new StringBuilder(); int currentPage = 1; @@ -782,19 +692,9 @@ public class PdfSegmentationServiceTest extends AbstractTest { private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) { - TablePageBlock table = document.getSections() - .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .toList() - .get(tableIndex); + TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex); List> rows = table.getRows(); - int emptyCellsFoundFound = rows.stream() - .flatMap(List::stream) - .toList() - .stream() - .filter(f -> f.toString().isEmpty()) - .toList().size(); + int emptyCellsFoundFound = rows.stream().flatMap(List::stream).toList().stream().filter(f -> f.toString().isEmpty()).toList().size(); for (List row : table.getRows()) { row.forEach(r -> System.out.println(r.toString())); @@ -809,20 +709,11 @@ public class PdfSegmentationServiceTest extends AbstractTest { private void validateTable(ClassificationDocument document, int 
tableIndex, List> values) { - TablePageBlock table = document.getSections() - .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .toList() - .get(tableIndex); + TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex); List> rows = table.getRows(); - List rowsFlattened = rows.stream() - .flatMap(List::stream) - .toList(); - List valuesFlattened = values.stream() - .flatMap(List::stream) - .toList(); + List rowsFlattened = rows.stream().flatMap(List::stream).toList(); + List valuesFlattened = values.stream().flatMap(List::stream).toList(); for (int i = 0; i < valuesFlattened.size(); i++) { Cell cell = rowsFlattened.get(i); @@ -835,11 +726,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { private void validateTableSize(ClassificationDocument document, int tableSize) { - assertThat(document.getSections() - .stream() - .flatMap(paragraph -> paragraph.getTables() - .stream()) - .toList().size()).isEqualTo(tableSize); + assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().size()).isEqualTo(tableSize); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java index 3aae43a..e8f0f3f 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java @@ -74,7 +74,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest { 
cleanRulingsPerPage.add(rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings())); } var cleanRulings = cleanRulingsPerPage.stream().map(CleanRulings::getVertical).collect(Collectors.toList()); - PdfDraw.drawLinesPerPage(fileName, cleanRulings, lineFileName); + PdfDraw.drawLinesPerPage(fileName, cleanRulings, lineFileName); } @@ -99,13 +99,13 @@ public class RulingCleaningServiceTest extends BuildDocumentTest { @SneakyThrows private void writeJsons(Path filename) { - Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, + Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD, filename.toFile(), new ImageServiceResponse(), new TableServiceResponse(), new VisualLayoutParsingResponse(), filename.toFile().toString())); - Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, + Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD, filename.toFile(), new ImageServiceResponse(), new TableServiceResponse(), diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/AbstractTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/AbstractTest.java index 759f0e8..c0e2809 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/AbstractTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/AbstractTest.java @@ -20,7 +20,6 @@ import org.springframework.context.annotation.Import; import 
org.springframework.context.annotation.Primary; import org.springframework.core.io.ClassPathResource; import org.springframework.test.context.junit.jupiter.SpringExtension; -import org.xmlunit.builder.Input; import com.iqser.red.commons.jackson.ObjectMapperFactory; import com.iqser.red.storage.commons.service.StorageService; @@ -68,7 +67,7 @@ public abstract class AbstractTest { protected LayoutParsingRequest buildStandardLayoutParsingRequest() { return LayoutParsingRequest.builder() - .layoutParsingType(LayoutParsingType.REDACT_MANAGER) + .layoutParsingType(LayoutParsingType.REDACT_MANAGER_OLD) .originFileStorageId(ORIGIN_FILE_ID) .tablesFileStorageId(Optional.of(TABLE_FILE_ID)) .imagesFileStorageId(Optional.of(IMAGE_FILE_ID)) @@ -99,7 +98,7 @@ public abstract class AbstractTest { @SneakyThrows protected LayoutParsingRequest prepareStorage(String file) { - return prepareStorage(file, "cv_table_parsing_response/empty.json", "image_service_response/empty.json","visual_layout_parsing_response/empty.json"); + return prepareStorage(file, "cv_table_parsing_response/empty.json", "image_service_response/empty.json", "visual_layout_parsing_response/empty.json"); } @@ -107,7 +106,7 @@ public abstract class AbstractTest { protected LayoutParsingRequest prepareStorage(InputStream fileInputStream) { storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileInputStream); - return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER); + return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD); } @@ -140,6 +139,7 @@ public abstract class AbstractTest { return prepareStorage(pdfFileResource.getInputStream(), cvServiceResponseFileResource.getInputStream(), imageInfoFileResource.getInputStream()); } + @SneakyThrows protected LayoutParsingRequest prepareStorage(String file, String cvServiceResponseFile, String imageInfoFile, String visualLayoutParsingResponseFile) { @@ -148,9 +148,13 @@ public abstract class AbstractTest { 
ClassPathResource imageInfoFileResource = new ClassPathResource(imageInfoFile); ClassPathResource visualLayoutParsingResponseResource = new ClassPathResource(visualLayoutParsingResponseFile); - return prepareStorage(pdfFileResource.getInputStream(), cvServiceResponseFileResource.getInputStream(), imageInfoFileResource.getInputStream(), visualLayoutParsingResponseResource.getInputStream()); + return prepareStorage(pdfFileResource.getInputStream(), + cvServiceResponseFileResource.getInputStream(), + imageInfoFileResource.getInputStream(), + visualLayoutParsingResponseResource.getInputStream()); } + @SneakyThrows protected LayoutParsingRequest prepareStorage(InputStream fileStream, InputStream cvServiceResponseFileStream, InputStream imageInfoStream) { @@ -158,18 +162,22 @@ public abstract class AbstractTest { storageService.storeObject(TenantContext.getTenantId(), TABLE_FILE_ID, cvServiceResponseFileStream); storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream); - return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER); + return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD); } + @SneakyThrows - protected LayoutParsingRequest prepareStorage(InputStream fileStream, InputStream cvServiceResponseFileStream, InputStream imageInfoStream, InputStream visualLayoutParsingResponseFileStream) { + protected LayoutParsingRequest prepareStorage(InputStream fileStream, + InputStream cvServiceResponseFileStream, + InputStream imageInfoStream, + InputStream visualLayoutParsingResponseFileStream) { storageService.storeObject(TenantContext.getTenantId(), IMAGE_FILE_ID, imageInfoStream); storageService.storeObject(TenantContext.getTenantId(), TABLE_FILE_ID, cvServiceResponseFileStream); storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream); - storageService.storeObject(TenantContext.getTenantId(),VISUAL_LAYOUT_FILE,visualLayoutParsingResponseFileStream ); + 
storageService.storeObject(TenantContext.getTenantId(), VISUAL_LAYOUT_FILE, visualLayoutParsingResponseFileStream); - return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER); + return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java index 79db6bf..0ff0cef 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java @@ -26,14 +26,19 @@ public abstract class BuildDocumentTest extends AbstractTest { File fileResource = new ClassPathResource(filename).getFile(); prepareStorage(filename); - return layoutParsingPipeline.parseLayout(layoutParsingType, fileResource, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse(), new VisualLayoutParsingResponse(),filename); + return layoutParsingPipeline.parseLayout(layoutParsingType, + fileResource, + layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), + new TableServiceResponse(), + new VisualLayoutParsingResponse(), + filename); } @SneakyThrows protected Document buildGraph(String filename) { - return buildGraph(filename, LayoutParsingType.REDACT_MANAGER); + return buildGraph(filename, LayoutParsingType.REDACT_MANAGER_OLD); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/brokenTableOnOcr_ocred 1.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/brokenTableOnOcr_ocred 1.pdf new file mode 100644 index 0000000..1a00988 Binary files /dev/null and 
b/layoutparser-service/layoutparser-service-server/src/test/resources/files/brokenTableOnOcr_ocred 1.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/new/wrongOrder 2.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/new/wrongOrder 2.pdf new file mode 100644 index 0000000..4834562 Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/new/wrongOrder 2.pdf differ