From f08c4ced43356ba4cbc28e1b1d9ba9f66a8fb955 Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Thu, 15 Jun 2023 14:37:37 +0200 Subject: [PATCH] TAAS-41: TAAS Document Structure * changed TextPageBlock splitting * changed Header and Footer Classification * added TAAS Document Structure Prototype --- .../AtomicPositionBlockData.java | 2 +- .../{ => redaction}/AtomicTextBlockData.java | 2 +- .../data/{ => redaction}/DocumentData.java | 2 +- .../{ => redaction}/DocumentTreeData.java | 2 +- .../api/data/{ => redaction}/NodeType.java | 2 +- .../api/data/{ => redaction}/PageData.java | 2 +- .../internal/api/data/taas/ParagraphData.java | 20 ++ .../internal/api/data/taas/Range.java | 5 + .../api/data/taas/ResearchDocumentData.java | 16 ++ .../internal/api/data/taas/RowData.java | 15 + .../api/data/taas/StructureObject.java | 19 ++ .../internal/api/data/taas/TableData.java | 15 + .../processor/LayoutParsingService.java | 25 +- .../LayoutParsingStorageService.java | 10 +- .../model/text/TextPageBlock.java | 67 ++++- .../model/text/TextPositionSequence.java | 20 +- .../parsing/LegacyPDFStreamEngine.java | 9 +- .../parsing/PDFLinesTextStripper.java | 18 +- .../parsing/PDFTextStripper.java | 62 +++- .../service/BlockificationService.java | 266 +++++++++--------- .../service/BodyTextFrameService.java | 2 +- .../service/ClassificationService.java | 7 +- .../service/TableExtractionService.java | 8 + .../classification/utils/FileUtils.java | 56 ---- .../SearchTextWithTextPositionDto.java | 8 +- .../SearchTextWithTextPositionFactory.java | 39 ++- .../processor/factory/TextBlockFactory.java | 25 +- .../processor/graph/Boundary.java | 24 -- .../processor/graph/DocumentTree.java | 2 +- .../processor/graph/nodes/Document.java | 2 +- .../processor/graph/nodes/Footer.java | 2 +- .../processor/graph/nodes/Header.java | 2 +- .../processor/graph/nodes/Headline.java | 2 +- .../processor/graph/nodes/Image.java | 2 +- .../processor/graph/nodes/Paragraph.java | 2 +- 
.../processor/graph/nodes/Section.java | 2 +- .../processor/graph/nodes/SemanticNode.java | 10 +- .../processor/graph/nodes/Table.java | 2 +- .../processor/graph/nodes/TableCell.java | 2 +- .../graph/textblock/AtomicTextBlock.java | 59 ++-- .../textblock/ConcatenatedTextBlock.java | 35 +++ .../processor/graph/textblock/TextBlock.java | 12 + .../{ => redaction}/DocumentDataMapper.java | 12 +- .../{ => redaction}/DocumentGraphMapper.java | 12 +- .../{ => redaction}/PropertiesMapper.java | 2 +- .../mapper/taas/TaasDocumentDataMapper.java | 108 +++++++ .../utils/PdfVisualisationUtility.java | 2 +- .../layoutparser/server/BdrJsonBuildTest.java | 103 +++++++ .../graph/DocumentGraphJsonWritingTest.java | 4 +- .../graph/DocumentGraphMappingTest.java | 6 +- .../server/utils/visualizations/PdfDraw.java | 2 +- 51 files changed, 818 insertions(+), 317 deletions(-) rename layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/{ => redaction}/AtomicPositionBlockData.java (96%) rename layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/{ => redaction}/AtomicTextBlockData.java (96%) rename layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/{ => redaction}/DocumentData.java (96%) rename layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/{ => redaction}/DocumentTreeData.java (99%) rename layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/{ => redaction}/NodeType.java (95%) rename layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/{ => redaction}/PageData.java (96%) create mode 100644 
layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/ParagraphData.java create mode 100644 layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/Range.java create mode 100644 layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/ResearchDocumentData.java create mode 100644 layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/RowData.java create mode 100644 layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/StructureObject.java create mode 100644 layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/TableData.java delete mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/FileUtils.java rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/{ => redaction}/DocumentDataMapper.java (97%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/{ => redaction}/DocumentGraphMapper.java (97%) rename layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/{ => redaction}/PropertiesMapper.java (99%) create mode 100644 layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/taas/TaasDocumentDataMapper.java create mode 100644 
layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/AtomicPositionBlockData.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/AtomicPositionBlockData.java similarity index 96% rename from layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/AtomicPositionBlockData.java rename to layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/AtomicPositionBlockData.java index f61d380..42daa7b 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/AtomicPositionBlockData.java +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/AtomicPositionBlockData.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.internal.api.data; +package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction; import lombok.AccessLevel; import lombok.AllArgsConstructor; diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/AtomicTextBlockData.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/AtomicTextBlockData.java similarity index 96% rename from layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/AtomicTextBlockData.java rename to 
layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/AtomicTextBlockData.java index 04349e4..52c1f72 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/AtomicTextBlockData.java +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/AtomicTextBlockData.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.internal.api.data; +package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction; diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/DocumentData.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/DocumentData.java similarity index 96% rename from layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/DocumentData.java rename to layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/DocumentData.java index 9e85750..1f38471 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/DocumentData.java +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/DocumentData.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.internal.api.data; +package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction; import lombok.AccessLevel; import lombok.AllArgsConstructor; diff --git 
a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/DocumentTreeData.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/DocumentTreeData.java similarity index 99% rename from layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/DocumentTreeData.java rename to layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/DocumentTreeData.java index 3a14a37..b0d5433 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/DocumentTreeData.java +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/DocumentTreeData.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.internal.api.data; +package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction; import java.util.List; import java.util.Map; diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/NodeType.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/NodeType.java similarity index 95% rename from layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/NodeType.java rename to layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/NodeType.java index 91104f2..7b92adb 100644 --- 
a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/NodeType.java +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/NodeType.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.internal.api.data; +package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction; import java.util.Locale; diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/PageData.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/PageData.java similarity index 96% rename from layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/PageData.java rename to layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/PageData.java index 20c92a3..4ea8069 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/PageData.java +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/PageData.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.internal.api.data; +package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction; import lombok.AccessLevel; import lombok.AllArgsConstructor; diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/ParagraphData.java 
b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/ParagraphData.java new file mode 100644 index 0000000..8995ca8 --- /dev/null +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/ParagraphData.java @@ -0,0 +1,20 @@ +package com.knecon.fforesight.service.layoutparser.internal.api.data.taas; + +import java.util.List; + +import lombok.Builder; +import lombok.Data; + +@Data +@Builder +public class ParagraphData { + + private String text; + List boldTextBoundaries; + List italicTextBoundaries; + private String classification; + + private String orientation; + private int textDirection; + +} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/Range.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/Range.java new file mode 100644 index 0000000..a978cc0 --- /dev/null +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/Range.java @@ -0,0 +1,5 @@ +package com.knecon.fforesight.service.layoutparser.internal.api.data.taas; + +public record Range(int start, int end) { + +} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/ResearchDocumentData.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/ResearchDocumentData.java new file mode 100644 index 0000000..667fdee --- /dev/null +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/ResearchDocumentData.java @@ -0,0 
+1,16 @@ +package com.knecon.fforesight.service.layoutparser.internal.api.data.taas; + +import java.util.List; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; + +@Builder +@Data +@AllArgsConstructor +public class ResearchDocumentData { + + String originalFile; + List structureObjects; +} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/RowData.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/RowData.java new file mode 100644 index 0000000..6d021d4 --- /dev/null +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/RowData.java @@ -0,0 +1,15 @@ +package com.knecon.fforesight.service.layoutparser.internal.api.data.taas; + +import java.util.List; + +import lombok.AllArgsConstructor; +import lombok.Data; + +@Data +@AllArgsConstructor +public class RowData { + + boolean header; + List cellText; + float[] bBox; +} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/StructureObject.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/StructureObject.java new file mode 100644 index 0000000..fca1eff --- /dev/null +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/StructureObject.java @@ -0,0 +1,19 @@ +package com.knecon.fforesight.service.layoutparser.internal.api.data.taas; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; + +@Data +@Builder +@AllArgsConstructor +public class StructureObject { + + Integer structureObjectNumber; + int page; + int stringOffset; + 
float[] boundingBox; + ParagraphData paragraph; + TableData table; + +} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/TableData.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/TableData.java new file mode 100644 index 0000000..e5153dd --- /dev/null +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/TableData.java @@ -0,0 +1,15 @@ +package com.knecon.fforesight.service.layoutparser.internal.api.data.taas; + +import java.util.List; + +import lombok.AllArgsConstructor; +import lombok.Data; + +@Data +@AllArgsConstructor +public class TableData { + + List rowData; + Integer numberOfCols; + Integer numberOfRows; +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingService.java index 33f309b..da80daf 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingService.java @@ -19,7 +19,7 @@ import com.knecon.fforesight.service.layoutparser.processor.classification.servi import com.knecon.fforesight.service.layoutparser.processor.classification.service.SectionsBuilderService; import com.knecon.fforesight.service.layoutparser.processor.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; -import 
com.knecon.fforesight.service.layoutparser.processor.mapper.DocumentDataMapper; +import com.knecon.fforesight.service.layoutparser.processor.mapper.redaction.DocumentDataMapper; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; @@ -84,4 +84,27 @@ public class LayoutParsingService { return DocumentGraphFactory.buildDocumentGraph(classificationDocument); } + + public Document parseLayoutWithTimer(PDDocument originDocument, ImageServiceResponse imageServiceResponse, TableServiceResponse tableServiceResponse) { + + long start = System.currentTimeMillis(); + ClassificationDocument classificationDocument = pdfParsingService.parseDocument(originDocument, + cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse), + imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse)); + System.out.printf("parsed %d ms", System.currentTimeMillis() - start); + + start = System.currentTimeMillis(); + classificationService.classifyDocument(classificationDocument); + System.out.printf(", classified %d ms", System.currentTimeMillis() - start); + + start = System.currentTimeMillis(); + sectionsBuilderService.buildSections(classificationDocument); + System.out.printf(", sections built %d ms", System.currentTimeMillis() - start); + + start = System.currentTimeMillis(); + Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument); + System.out.printf(", graph constructed %d ms", System.currentTimeMillis() - start); + return document; + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java index 8a1fac9..a2441a4 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java @@ -14,11 +14,11 @@ import org.springframework.stereotype.Service; import com.fasterxml.jackson.databind.ObjectMapper; import com.iqser.red.storage.commons.service.StorageService; -import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentTreeData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.PageData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicPositionBlockData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicTextBlockData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTreeData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.PageData; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/TextPageBlock.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/TextPageBlock.java index cbf6214..892d13a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/TextPageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/TextPageBlock.java @@ -1,21 +1,28 @@ package com.knecon.fforesight.service.layoutparser.processor.classification.model.text; +import static java.util.stream.Collectors.toSet; + import java.util.ArrayList; +import java.util.Comparator; import java.util.List; import com.fasterxml.jackson.annotation.JsonIgnore; import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.classification.model.FloatFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType; +import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons; import com.knecon.fforesight.service.layoutparser.processor.classification.utils.TextNormalizationUtilities; import lombok.AllArgsConstructor; import lombok.Builder; import lombok.Data; +import lombok.EqualsAndHashCode; import lombok.NoArgsConstructor; +@EqualsAndHashCode(callSuper = true) +@Data @AllArgsConstructor @Builder -@Data @NoArgsConstructor public class TextPageBlock extends AbstractPageBlock { @@ -67,6 +74,64 @@ public class TextPageBlock extends AbstractPageBlock { return sequences.get(0).getPageWidth(); } + public static TextPageBlock merge(List textBlocksToMerge) { + + List sequences = textBlocksToMerge.stream().map(TextPageBlock::getSequences).flatMap(java.util.Collection::stream).toList(); + sequences = new ArrayList<>(sequences); + return 
fromTextPositionSequences(sequences); + } + + public static TextPageBlock fromTextPositionSequences(List wordBlockList) { + + TextPageBlock textBlock = null; + + FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter(); + FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter(); + FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter(); + StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter(); + StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter(); + + for (TextPositionSequence wordBlock : wordBlockList) { + + lineHeightFrequencyCounter.add(wordBlock.getTextHeight()); + fontSizeFrequencyCounter.add(wordBlock.getFontSize()); + spaceFrequencyCounter.add(wordBlock.getSpaceWidth()); + fontFrequencyCounter.add(wordBlock.getFont()); + styleFrequencyCounter.add(wordBlock.getFontStyle()); + + if (textBlock == null) { + textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(), + wordBlock.getMaxXDirAdj(), + wordBlock.getMinYDirAdj(), + wordBlock.getMaxYDirAdj(), + wordBlockList, + wordBlock.getRotation()); + } else { + TextPageBlock spatialEntity = textBlock.union(wordBlock); + textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight()); + } + } + + if (textBlock != null) { + textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular()); + textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest()); + } + + if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences() + .stream() + .map(t -> DoubleComparisons.round(t.getMinYDirAdj(), 3)) + 
.collect(toSet()) + .size() == 1) { + textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj)); + } + return textBlock; + } + + /** * Returns the minX value in pdf coordinate system. diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/TextPositionSequence.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/TextPositionSequence.java index fa1b243..8b73a42 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/TextPositionSequence.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/TextPositionSequence.java @@ -8,8 +8,8 @@ import java.util.stream.Collectors; import org.apache.pdfbox.text.TextPosition; +import com.dslplatform.json.JsonAttribute; import com.fasterxml.jackson.annotation.JsonIgnore; -import com.fasterxml.jackson.annotation.JsonIgnoreProperties; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; @@ -25,7 +25,6 @@ import lombok.extern.slf4j.Slf4j; @Builder @NoArgsConstructor @AllArgsConstructor -@JsonIgnoreProperties({"empty"}) public class TextPositionSequence implements CharSequence { public static final int HEIGHT_PADDING = 2; @@ -36,6 +35,7 @@ public class TextPositionSequence implements CharSequence { private int rotation; private float pageHeight; private float pageWidth; + private boolean isParagraphStart; public TextPositionSequence(int page) { @@ -44,7 +44,7 @@ public class TextPositionSequence implements CharSequence { } - public TextPositionSequence(List textPositions, int page) { 
+ public TextPositionSequence(List textPositions, int page, boolean isParagraphStart) { this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList()); this.page = page; @@ -52,6 +52,7 @@ public class TextPositionSequence implements CharSequence { this.rotation = textPositions.get(0).getRotation(); this.pageHeight = textPositions.get(0).getPageHeight(); this.pageWidth = textPositions.get(0).getPageWidth(); + this.isParagraphStart = isParagraphStart; } @@ -141,6 +142,7 @@ public class TextPositionSequence implements CharSequence { * @return the text direction adjusted minX value */ @JsonIgnore + @JsonAttribute(ignore = true) public float getMinXDirAdj() { return textPositions.get(0).getXDirAdj(); @@ -155,6 +157,7 @@ public class TextPositionSequence implements CharSequence { * @return the text direction adjusted maxX value */ @JsonIgnore + @JsonAttribute(ignore = true) public float getMaxXDirAdj() { return textPositions.get(textPositions.size() - 1).getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidthDirAdj() + HEIGHT_PADDING; @@ -169,6 +172,7 @@ public class TextPositionSequence implements CharSequence { * @return the text direction adjusted minY value. The upper border of the bounding box of the word. */ @JsonIgnore + @JsonAttribute(ignore = true) public float getMinYDirAdj() { return textPositions.get(0).getYDirAdj() - getTextHeight(); @@ -183,6 +187,7 @@ public class TextPositionSequence implements CharSequence { * @return the text direction adjusted maxY value. The lower border of the bounding box of the word. 
*/ @JsonIgnore + @JsonAttribute(ignore = true) public float getMaxYDirAdj() { return textPositions.get(0).getYDirAdj(); @@ -191,6 +196,7 @@ public class TextPositionSequence implements CharSequence { @JsonIgnore + @JsonAttribute(ignore = true) public float getTextHeight() { return textPositions.get(0).getHeightDir() + HEIGHT_PADDING; @@ -198,6 +204,7 @@ public class TextPositionSequence implements CharSequence { @JsonIgnore + @JsonAttribute(ignore = true) public float getHeight() { return getMaxYDirAdj() - getMinYDirAdj(); @@ -205,6 +212,7 @@ public class TextPositionSequence implements CharSequence { @JsonIgnore + @JsonAttribute(ignore = true) public float getWidth() { return getMaxXDirAdj() - getMinXDirAdj(); @@ -212,6 +220,7 @@ public class TextPositionSequence implements CharSequence { @JsonIgnore + @JsonAttribute(ignore = true) public String getFont() { return textPositions.get(0).getFontName().toLowerCase().replaceAll(",bold", "").replaceAll(",italic", ""); @@ -219,6 +228,7 @@ public class TextPositionSequence implements CharSequence { @JsonIgnore + @JsonAttribute(ignore = true) public String getFontStyle() { String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase(); @@ -237,6 +247,7 @@ public class TextPositionSequence implements CharSequence { @JsonIgnore + @JsonAttribute(ignore = true) public float getFontSize() { return textPositions.get(0).getFontSizeInPt(); @@ -244,6 +255,7 @@ public class TextPositionSequence implements CharSequence { @JsonIgnore + @JsonAttribute(ignore = true) public float getSpaceWidth() { return textPositions.get(0).getWidthOfSpace(); @@ -260,6 +272,7 @@ public class TextPositionSequence implements CharSequence { * @return bounding box of the word in Pdf Coordinate System */ @JsonIgnore + @JsonAttribute(ignore = true) @SneakyThrows public Rectangle getRectangle() { @@ -299,3 +312,4 @@ public class TextPositionSequence implements CharSequence { } } + diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/LegacyPDFStreamEngine.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/LegacyPDFStreamEngine.java index 5aa1439..307d442 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/LegacyPDFStreamEngine.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/LegacyPDFStreamEngine.java @@ -76,7 +76,7 @@ import org.apache.pdfbox.util.Vector; * THIS CODE IS DELIBERATELY INCORRECT, USE PDFStreamEngine INSTEAD. */ @SuppressWarnings({"PMD", "checkstyle:all"}) -class LegacyPDFStreamEngine extends PDFStreamEngine { +public class LegacyPDFStreamEngine extends PDFStreamEngine { private static final Log LOG = LogFactory.getLog(LegacyPDFStreamEngine.class); @@ -126,7 +126,7 @@ class LegacyPDFStreamEngine extends PDFStreamEngine { * This will initialize and process the contents of the stream. * * @param page the page to process - * @throws IOException if there is an error accessing the stream. + * @throws java.io.IOException if there is an error accessing the stream. */ @Override public void processPage(PDPage page) throws IOException { @@ -149,7 +149,7 @@ class LegacyPDFStreamEngine extends PDFStreamEngine { * written by Ben Litchfield for PDFStreamEngine. 
*/ @Override - protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code,Vector displacement) throws IOException { + protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code, Vector displacement) throws IOException { // // legacy calculations which were previously in PDFStreamEngine // @@ -165,7 +165,7 @@ class LegacyPDFStreamEngine extends PDFStreamEngine { float displacementX = displacement.getX(); // the sorting algorithm is based on the width of the character. As the displacement - // for vertical characters doesn't provide any suitable value for it, we have to + // for vertical characters doesn't provide any suitable value for it, we have to // calculate our own if (font.isVertical()) { displacementX = font.getWidth(code) / 1000; @@ -382,3 +382,4 @@ class LegacyPDFStreamEngine extends PDFStreamEngine { } } + diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFLinesTextStripper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFLinesTextStripper.java index 223b0ba..07d648c 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFLinesTextStripper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFLinesTextStripper.java @@ -3,7 +3,6 @@ package com.knecon.fforesight.service.layoutparser.processor.classification.pars import java.awt.geom.Point2D; import java.io.IOException; import java.util.ArrayList; -import java.util.Comparator; import java.util.List; import org.apache.pdfbox.contentstream.operator.Operator; @@ -208,13 +207,11 @@ public class PDFLinesTextStripper extends PDFTextStripper { @Override - public void 
writeString(String text, List textPositions) throws IOException { + public void writeString(String text, List textPositions, boolean isParagraphStart) throws IOException { int startIndex = 0; RedTextPosition previous = null; - textPositions.sort(Comparator.comparing(TextPosition::getXDirAdj)); - for (int i = 0; i <= textPositions.size() - 1; i++) { if (!textPositionSequences.isEmpty()) { @@ -250,7 +247,7 @@ public class PDFLinesTextStripper extends PDFTextStripper { if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0) .getUnicode() .equals("\t")))) { - textPositionSequences.add(new TextPositionSequence(sublist, pageNumber)); + textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() -1 && isParagraphStart)); } startIndex = i; } @@ -260,7 +257,7 @@ public class PDFLinesTextStripper extends PDFTextStripper { if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0) .getUnicode() .equals("\t")))) { - textPositionSequences.add(new TextPositionSequence(sublist, pageNumber)); + textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() -1 && isParagraphStart)); } startIndex = i; } @@ -276,11 +273,11 @@ public class PDFLinesTextStripper extends PDFTextStripper { // Remove false sequence ends (whitespaces) if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0) .getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) { - for (TextPosition textPosition : sublist) { - textPositionSequences.get(textPositionSequences.size() - 1).add(textPosition); + for (TextPosition t : sublist) { + textPositionSequences.get(textPositionSequences.size() - 1).add(t); } } else { - textPositionSequences.add(new TextPositionSequence(sublist, pageNumber)); + 
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() -1 && isParagraphStart)); } } startIndex = i + 1; @@ -303,7 +300,7 @@ public class PDFLinesTextStripper extends PDFTextStripper { textPositionSequences.get(textPositionSequences.size() - 1).add(t); } } else { - textPositionSequences.add(new TextPositionSequence(sublist, pageNumber)); + textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, isParagraphStart)); } } super.writeString(text); @@ -328,3 +325,4 @@ public class PDFLinesTextStripper extends PDFTextStripper { } } + diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFTextStripper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFTextStripper.java index de0490b..49e6a78 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFTextStripper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFTextStripper.java @@ -27,6 +27,7 @@ import java.text.Bidi; import java.text.Normalizer; import java.util.ArrayList; import java.util.Collections; +import java.util.Comparator; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedList; @@ -240,10 +241,10 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { document = doc; output = outputStream; if (getAddMoreFormatting()) { - paragraphEnd = lineSeparator; + paragraphEnd = "\n----ParagraphEnd----\n\n"; pageStart = lineSeparator; - articleStart = lineSeparator; - articleEnd = lineSeparator; + articleStart = "\n----ArticelStart----\n\n"; + articleEnd = "\n----ArticelEnd----\n\n"; } startDocument(document); 
processPages(document.getPages()); @@ -594,9 +595,14 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { // but this caused a lot of regression test failures. So, I'm leaving it be for // now if (!overlap(positionY, positionHeight, maxYForLine, maxHeightForLine)) { - writeLine(normalize(line)); - line.clear(); + var normalized = normalize(line); +// normalized.stream().filter(l -> System.out.println(l.getText().contains("Plenarprotokoll 20/24")).findFirst().isPresent() + + lastLineStartPosition = handleLineSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine); + writeLine(normalized, current.isParagraphStart); + line.clear(); + expectedStartOfNextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE; maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE; maxHeightForLine = MAX_HEIGHT_FOR_LINE_RESET_VALUE; @@ -630,7 +636,24 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { if (startOfPage && lastPosition == null) { writeParagraphStart();// not sure this is correct for RTL? 
} + line.add(new LineItem(position)); + +// Collections.sort(line, new Comparator() { +// +// @Override +// public int compare(LineItem str1, LineItem str2) { +// if(null == str1.getTextPosition()) { +// return 0; +// } +// else if(null == str2.getTextPosition()) { +// return 0; +// } +// return Float.compare(str1.getTextPosition().getX(), str2.getTextPosition().getX()); +// } +// }); + +// line.sort(Comparator.comparing(a -> a.getTextPosition() != null && a.getTextPosition().getX())); } maxHeightForLine = Math.max(maxHeightForLine, positionHeight); minYTopForLine = Math.min(minYTopForLine, positionY - positionHeight); @@ -646,7 +669,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { } // print the final line if (line.size() > 0) { - writeLine(normalize(line)); + writeLine(normalize(line), false); writeParagraphEnd(); } endArticle(); @@ -703,7 +726,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { * @param textPositions The TextPositions belonging to the text. * @throws IOException If there is an error when writing the text. */ - protected void writeString(String text, List textPositions) throws IOException { + protected void writeString(String text, List textPositions, boolean isParagraphEnd) throws IOException { writeString(text); } @@ -998,7 +1021,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { /** - * By default the text stripper will attempt to remove text that overlapps each other. Word paints the same + * By default, the text stripper will attempt to remove text that overlaps each other. Word paints the same * character several times in order to make it look bold. By setting this to false all text will be extracted, which * means that certain sections will be duplicated, but better performance will be noticed. 
* @@ -1385,6 +1408,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { } else { writeLineSeparator(); writeParagraphSeparator(); + lastLineStartPosition.setEndParagraphWritten(); } } else { writeLineSeparator(); @@ -1428,6 +1452,10 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { float newXVal = multiplyFloat(getIndentThreshold(), position.getTextPosition().getWidthOfSpace()); float positionWidth = multiplyFloat(0.25f, position.getTextPosition().getWidth()); +// if(xGap < 0){ +// result = true; +// } +// else if (yGap > newYVal) { result = true; } else if (xGap > newXVal) { @@ -1636,12 +1664,13 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { * @param line a list with the words of the given line * @throws IOException if something went wrong */ - private void writeLine(List line) throws IOException { + private void writeLine(List line, boolean isParagraphEnd) throws IOException { int numberOfStrings = line.size(); for (int i = 0; i < numberOfStrings; i++) { WordWithTextPositions word = line.get(i); - writeString(word.getText(), word.getTextPositions()); + word.getTextPositions().sort(Comparator.comparing(TextPosition::getX)); + writeString(word.getText(), word.getTextPositions(), isParagraphEnd && i == numberOfStrings - 1); if (i < numberOfStrings - 1) { writeWordSeparator(); } @@ -1963,6 +1992,8 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { private boolean isHangingIndent = false; private boolean isArticleStart = false; + private boolean endParagraphWritten = false; + private TextPosition position = null; @@ -2024,6 +2055,16 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { } + public boolean isEndParagraphWritten() { + + return endParagraphWritten; + } + + public void setEndParagraphWritten(){ + endParagraphWritten = true; + } + + /** * Sets the isArticleStart() flag to true. 
*/ @@ -2065,3 +2106,4 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { } } + diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/BlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/BlockificationService.java index c657ffc..9281e2b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/BlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/BlockificationService.java @@ -1,20 +1,18 @@ package com.knecon.fforesight.service.layoutparser.processor.classification.service; -import static java.util.stream.Collectors.toSet; - import java.util.ArrayList; -import java.util.Comparator; import java.util.Iterator; +import java.util.LinkedList; import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.FloatFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.classification.model.Orientation; import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.StringFrequencyCounter; import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; import 
com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.classification.utils.RulingTextDirAdjustUtil; @@ -23,95 +21,64 @@ import com.knecon.fforesight.service.layoutparser.processor.classification.utils @SuppressWarnings("all") public class BlockificationService { - static final float THRESHOLD = 1f; + private static final float THRESHOLD = 1f; + private static final float Y_GAP_SPLIT_HEIGHT_MODIFIER = 1.25f; + private static final int X_GAP_SPLIT_CONSTANT = 50; /** * This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions. * This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this! - * Rulings (TablePageBlock lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling. + * Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling. * * @param textPositions The words of a page. * @param horizontalRulingLines Horizontal table lines. * @param verticalRulingLines Vertical table lines. - * @return Page object that contains the Textblock and text statistics. + * @return ClassificationPage object that contains the Textblock and text statistics. 
*/ public ClassificationPage blockify(List textPositions, List horizontalRulingLines, List verticalRulingLines) { - int indexOnPage = 0; - List chunkWords = new ArrayList<>(); - List chunkBlockList = new ArrayList<>(); + List classificationTextBlocks = constructFineGranularTextPageBlocks(textPositions, horizontalRulingLines, verticalRulingLines); - float minX = 1000, maxX = 0, minY = 1000, maxY = 0; - TextPositionSequence prev = null; + classificationTextBlocks = mergeFineGranularTextPageBlocks(classificationTextBlocks); - boolean wasSplitted = false; - Float splitX1 = null; - for (TextPositionSequence word : textPositions) { + return new ClassificationPage(new ArrayList<>(classificationTextBlocks.stream().map(classificationTextBlock -> (AbstractPageBlock) classificationTextBlock).toList())); + } - boolean lineSeparation = word.getMinYDirAdj() - maxY > word.getHeight() * 1.25; - boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight(); - boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj(); - boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX; - boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj(); - boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines); - boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir()); - if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) { + private List mergeFineGranularTextPageBlocks(List classificationTextBlocks) { - Orientation prevOrientation = null; - if (!chunkBlockList.isEmpty()) { - prevOrientation = chunkBlockList.get(chunkBlockList.size() - 1).getOrientation(); - } - - TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage); - indexOnPage++; - - chunkBlockList.add(cb1); - chunkWords = 
new ArrayList<>(); - - if (splitByX && !isSplitByRuling) { - wasSplitted = true; - cb1.setOrientation(Orientation.LEFT); - splitX1 = word.getMinXDirAdj(); - } else if (newLineAfterSplit && !isSplitByRuling) { - wasSplitted = false; - cb1.setOrientation(Orientation.RIGHT); - splitX1 = null; - } else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) { - cb1.setOrientation(Orientation.LEFT); - } - - minX = 1000; - maxX = 0; - minY = 1000; - maxY = 0; - prev = null; - } - - chunkWords.add(word); - - prev = word; - if (word.getMinXDirAdj() < minX) { - minX = word.getMinXDirAdj(); - } - if (word.getMaxXDirAdj() > maxX) { - maxX = word.getMaxXDirAdj(); - } - if (word.getMinYDirAdj() < minY) { - minY = word.getMinYDirAdj(); - } - if (word.getMaxYDirAdj() > maxY) { - maxY = word.getMaxYDirAdj(); - } + if (classificationTextBlocks.isEmpty()) { + return new ArrayList<>(); } - - TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage); - if (cb1 != null) { - chunkBlockList.add(cb1); + List> textBlocksToMerge = new LinkedList<>(); + List currentTextBlocksToMerge = new LinkedList<>(); + textBlocksToMerge.add(currentTextBlocksToMerge); + TextPageBlock previousTextBlock = null; + for (TextPageBlock currentTextBlock : classificationTextBlocks) { + if (previousTextBlock == null) { + currentTextBlocksToMerge.add(currentTextBlock); + previousTextBlock = currentTextBlock; + continue; + } + boolean alignsXRight = Math.abs(currentTextBlock.getPdfMaxX() - previousTextBlock.getPdfMaxX()) < 1; + boolean smallYGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < 5; + if (alignsXRight && smallYGap) { + currentTextBlocksToMerge.add(currentTextBlock); + } else { + currentTextBlocksToMerge = new LinkedList<>(); + currentTextBlocksToMerge.add(currentTextBlock); + textBlocksToMerge.add(currentTextBlocksToMerge); + } + previousTextBlock = 
currentTextBlock; } + return textBlocksToMerge.stream().map(TextPageBlock::merge).toList(); + } - Iterator itty = chunkBlockList.iterator(); + + private void assignOrientations(List classificationTextBlocks) { + + Iterator itty = classificationTextBlocks.iterator(); TextPageBlock previousLeft = null; TextPageBlock previousRight = null; @@ -141,12 +108,13 @@ public class BlockificationService { } } - itty = chunkBlockList.iterator(); + itty = classificationTextBlocks.iterator(); TextPageBlock previous = null; while (itty.hasNext()) { TextPageBlock block = (TextPageBlock) itty.next(); - if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(), + if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold( + block.getMaxY(), previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation() .equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) { previous.add(block); @@ -156,8 +124,95 @@ public class BlockificationService { previous = block; } + } - return new ClassificationPage(chunkBlockList); + + private List constructFineGranularTextPageBlocks(List textPositions, + List horizontalRulingLines, + List verticalRulingLines) { + + int indexOnPage = 0; + List wordClusterToCombine = new ArrayList<>(); + List classificationTextBlocks = new ArrayList<>(); + + float minX = 1000, maxX = 0, minY = 1000, maxY = 0; + TextPositionSequence prev = null; + + var listIdentitifier = Pattern.compile("\\b(?:[1-9]|1\\d|20|[ivxlc]|[a-z])\\s*(?:[.)])", Pattern.CASE_INSENSITIVE); + + boolean wasSplitted = false; + Float splitX1 = null; + for (TextPositionSequence word : textPositions) { + + Matcher listIdentifierPattern = listIdentitifier.matcher(word.toString()); + + boolean yGap = word.getMinYDirAdj() - maxY > 
word.getHeight() * Y_GAP_SPLIT_HEIGHT_MODIFIER; + boolean sameLine = prev != null && equalsWithThreshold(prev.getMinYDirAdj(), word.getMinYDirAdj()); + boolean positiveXGapInline = prev != null && maxX + X_GAP_SPLIT_CONSTANT < word.getMinXDirAdj() && sameLine; + boolean negativeXGap = prev != null && word.getMinXDirAdj() - minX < -5; + boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight(); + boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj(); + boolean splitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines); + boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir()); + boolean fontChange = prev != null && (!word.getFont().equals(prev.getFont()) || !word.getFontStyle() + .equals(prev.getFontStyle()) || word.getFontSize() != prev.getFontSize()); + boolean newline = prev != null && Math.abs(word.getMinYDirAdj() - prev.getMinYDirAdj()) > word.getHeight(); + boolean isListIdentifier = listIdentifierPattern.matches(); + + if (prev != null && (prev.isParagraphStart() || negativeXGap || positiveXGapInline || yGap || startFromTop || splitByRuling || (newline && (fontChange || isListIdentifier)))) { +// if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) { + + Orientation prevOrientation = null; + if (!classificationTextBlocks.isEmpty()) { + prevOrientation = classificationTextBlocks.get(classificationTextBlocks.size() - 1).getOrientation(); + } + + TextPageBlock classificationTextBlock = TextPageBlock.fromTextPositionSequences(wordClusterToCombine); + + classificationTextBlocks.add(classificationTextBlock); + wordClusterToCombine = new ArrayList<>(); + + if (positiveXGapInline && !splitByRuling) { + wasSplitted = true; + classificationTextBlock.setOrientation(Orientation.LEFT); + splitX1 = word.getMinXDirAdj(); + } else if 
(newLineAfterSplit && !splitByRuling) { + wasSplitted = false; + classificationTextBlock.setOrientation(Orientation.RIGHT); + splitX1 = null; + } else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (yGap || !startFromTop || !positiveXGapInline || !newLineAfterSplit || !splitByRuling)) { + classificationTextBlock.setOrientation(Orientation.LEFT); + } + + minX = 1000; + maxX = 0; + minY = 1000; + maxY = 0; + prev = null; + } + + wordClusterToCombine.add(word); + + prev = word; + if (word.getMinXDirAdj() < minX) { + minX = word.getMinXDirAdj(); + } + if (word.getMaxXDirAdj() > maxX) { + maxX = word.getMaxXDirAdj(); + } + if (word.getMinYDirAdj() < minY) { + minY = word.getMinYDirAdj(); + } + if (word.getMaxYDirAdj() > maxY) { + maxY = word.getMaxYDirAdj(); + } + } + + TextPageBlock classificationTextBlock = TextPageBlock.fromTextPositionSequences(wordClusterToCombine); + if (classificationTextBlock != null) { + classificationTextBlocks.add(classificationTextBlock); + } + return classificationTextBlocks; } @@ -167,53 +222,6 @@ public class BlockificationService { } - private TextPageBlock buildTextBlock(List wordBlockList, int indexOnPage) { - - TextPageBlock textBlock = null; - - FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter(); - FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter(); - FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter(); - StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter(); - StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter(); - - for (TextPositionSequence wordBlock : wordBlockList) { - - lineHeightFrequencyCounter.add(wordBlock.getTextHeight()); - fontSizeFrequencyCounter.add(wordBlock.getFontSize()); - spaceFrequencyCounter.add(wordBlock.getSpaceWidth()); - fontFrequencyCounter.add(wordBlock.getFont()); - styleFrequencyCounter.add(wordBlock.getFontStyle()); - - if (textBlock == null) { - textBlock 
= new TextPageBlock(wordBlock.getMinXDirAdj(), - wordBlock.getMaxXDirAdj(), - wordBlock.getMinYDirAdj(), - wordBlock.getMaxYDirAdj(), - wordBlockList, - wordBlock.getRotation()); - } else { - TextPageBlock spatialEntity = textBlock.union(wordBlock); - textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight()); - } - } - - if (textBlock != null) { - textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular()); - textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular()); - textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular()); - textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular()); - textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular()); - textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest()); - } - - if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) { - textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj)); - } - return textBlock; - } - - private boolean isSplitByRuling(float minX, float minY, float maxX, @@ -253,7 +261,7 @@ public class BlockificationService { verticalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), - word.getPageHeight()); + word.getPageHeight()); // } @@ -268,11 +276,5 @@ public class BlockificationService { return false; } - - private double round(float value, int decimalPoints) { - - var d = Math.pow(10, decimalPoints); - return Math.round(value * d) / d; - } - } + diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/BodyTextFrameService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/BodyTextFrameService.java 
index ded5d93..bf8d597 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/BodyTextFrameService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/BodyTextFrameService.java @@ -17,7 +17,7 @@ import com.knecon.fforesight.service.layoutparser.processor.classification.utils @Service public class BodyTextFrameService { - private static final float APPROXIMATE_HEADER_LINE_COUNT = 2.9f; + private static final float APPROXIMATE_HEADER_LINE_COUNT = 2.0f; /** diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/ClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/ClassificationService.java index 263b7eb..2060ace 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/ClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/ClassificationService.java @@ -57,12 +57,9 @@ public class ClassificationService { textBlock.setClassification(PageBlockType.OTHER); return; } - if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() - .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) { + if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) { textBlock.setClassification(PageBlockType.HEADER); - - } else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && 
(document.getFontSizeCounter() - .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) { + } else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) { textBlock.setClassification(PageBlockType.FOOTER); } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks() diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/TableExtractionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/TableExtractionService.java index c11cca0..4983220 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/TableExtractionService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/TableExtractionService.java @@ -136,6 +136,14 @@ public class TableExtractionService { public List findCells(List horizontalRulingLines, List verticalRulingLines) { + for (Ruling r : horizontalRulingLines) { + if (r.getX2() < r.getX1()) { + double a = r.getX2(); + r.x2 = (float) r.getX1(); + r.x1 = (float) a; + } + } + List cellsFound = new ArrayList<>(); Map intersectionPoints = Ruling.findIntersections(horizontalRulingLines, verticalRulingLines); List intersectionPointsList = new ArrayList<>(intersectionPoints.keySet()); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/FileUtils.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/FileUtils.java deleted file mode 100644 index 8196f3b..0000000 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/FileUtils.java +++ /dev/null @@ -1,56 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.utils; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; - -import lombok.experimental.UtilityClass; -import lombok.extern.slf4j.Slf4j; - -@Slf4j -@UtilityClass -public class FileUtils { - - public File createTempFile(String filenamePrefix, String filenameSuffix) throws IOException { - - File tempFile = Files.createTempFile(filenamePrefix, filenameSuffix).toFile(); - setRWPermissionsOnlyForOwner(tempFile); - - return tempFile; - } - - - /** - * Deletes a file; logs a message with the reason if the deletion fails. - * This method is null-safe. - * - * @param file The file to delete. Can be null. - */ - public void deleteFile(File file) { - - if (file != null) { - try { - Files.deleteIfExists(file.toPath()); - } catch (IOException ex) { - log.warn("Could not delete file!", ex); - } - } - } - - - // We don't need to check the results of the permission setters below, - // since we're manipulating a file we created ourselves. - @SuppressWarnings({"ResultOfMethodCallIgnored", "squid:S899"}) - private void setRWPermissionsOnlyForOwner(File tempFile) { - - try { - tempFile.setReadable(true, true); - tempFile.setWritable(true, true); - tempFile.setExecutable(false); - } catch (SecurityException ex) { - // This should never happen since we're creating a temp file ourselves. - log.warn("Caught an exception during temp file creation. This should not happend. 
Check the code.", ex); - } - } - -} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SearchTextWithTextPositionDto.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SearchTextWithTextPositionDto.java index 223492c..231148e 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SearchTextWithTextPositionDto.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SearchTextWithTextPositionDto.java @@ -4,6 +4,8 @@ import java.awt.geom.Rectangle2D; import java.util.Collections; import java.util.List; +import com.knecon.fforesight.service.layoutparser.processor.graph.Boundary; + import lombok.AccessLevel; import lombok.Builder; import lombok.Getter; @@ -16,7 +18,9 @@ public class SearchTextWithTextPositionDto { String searchText; List lineBreaks; - List stringCoordsToPositionCoords; + List stringIdxToPositionIdx; + List boldTextBoundaries; + List italicTextBoundaries; List positions; @@ -26,7 +30,7 @@ public class SearchTextWithTextPositionDto { .searchText("") .lineBreaks(Collections.emptyList()) .positions(Collections.emptyList()) - .stringCoordsToPositionCoords(Collections.emptyList()) + .stringIdxToPositionIdx(Collections.emptyList()) .build(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SearchTextWithTextPositionFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SearchTextWithTextPositionFactory.java index 98033f1..1d005f1 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SearchTextWithTextPositionFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SearchTextWithTextPositionFactory.java @@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.factory; import java.awt.geom.AffineTransform; import java.awt.geom.Rectangle2D; +import java.util.Collections; import java.util.LinkedList; import java.util.List; import java.util.Objects; @@ -9,6 +10,7 @@ import java.util.Objects; import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.RedTextPosition; import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextDirection; import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.graph.Boundary; import lombok.experimental.UtilityClass; @@ -24,7 +26,7 @@ public class SearchTextWithTextPositionFactory { public final int MAX_HYPHEN_LINEBREAK_DISTANCE = 3; - public SearchTextWithTextPositionDto buildSearchTextToTextPositionModel(List sequences) { + public SearchTextWithTextPositionDto buildSearchTextToTextPositionDto(List sequences) { if (sequences.isEmpty() || sequences.stream().allMatch(sequence -> sequence.getTextPositions().isEmpty())) { return SearchTextWithTextPositionDto.empty(); @@ -69,8 +71,10 @@ public class SearchTextWithTextPositionFactory { return SearchTextWithTextPositionDto.builder() .searchText(context.stringBuilder.toString()) .lineBreaks(context.lineBreaksStringIdx) - .stringCoordsToPositionCoords(context.stringIdxToPositionIdx) + .stringIdxToPositionIdx(context.stringIdxToPositionIdx) .positions(positions) + .boldTextBoundaries(mergeToBoundaries(context.boldTextsStringIdx)) + 
.italicTextBoundaries(mergeToBoundaries(context.italicTextStringIdx)) .build(); } @@ -82,6 +86,8 @@ public class SearchTextWithTextPositionFactory { // unicode characters with more than 16-bit encoding have a length > 1 in java strings for (int j = 0; j < currentTextPosition.getUnicode().length(); j++) { context.stringIdxToPositionIdx.add(context.positionIdx); + addTextPositionWithFontType(currentTextPosition, "bold", context.boldTextsStringIdx, context.stringIdx); + addTextPositionWithFontType(currentTextPosition, "italic", context.italicTextStringIdx, context.stringIdx); } context.stringIdx += currentTextPosition.getUnicode().length(); } @@ -103,6 +109,33 @@ public class SearchTextWithTextPositionFactory { return context.stringIdx - context.lastHyphenIdx < MAX_HYPHEN_LINEBREAK_DISTANCE; } + private static List mergeToBoundaries(List integers) { + + if (integers.isEmpty()) { + return Collections.emptyList(); + } + List boundaries = new LinkedList<>(); + int start = integers.get(0); + int end = integers.get(0) + 1; + for (int current : integers) { + if (current > end + 1) { + boundaries.add(new Boundary(start, end)); + start = current; + } + end = current + 1; + } + if (boundaries.isEmpty()) + boundaries.add(new Boundary(start, end)); + return boundaries; + } + + + private static void addTextPositionWithFontType(RedTextPosition currentTextPosition, String fontType, List fontTypePositions, int stringIdx) { + + if (currentTextPosition.getFontName().toLowerCase().contains(fontType)) { + fontTypePositions.add(stringIdx); + } + } private boolean isLineBreak(RedTextPosition currentTextPosition, RedTextPosition previousTextPosition) { @@ -173,6 +206,8 @@ public class SearchTextWithTextPositionFactory { List stringIdxToPositionIdx = new LinkedList<>(); List lineBreaksStringIdx = new LinkedList<>(); + List boldTextsStringIdx = new LinkedList<>(); + List italicTextStringIdx = new LinkedList<>(); StringBuilder stringBuilder = new StringBuilder(); int stringIdx; diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/TextBlockFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/TextBlockFactory.java index caf01f9..69c71e4 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/TextBlockFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/TextBlockFactory.java @@ -26,12 +26,33 @@ public class TextBlockFactory { public AtomicTextBlock buildAtomicTextBlock(List sequences, SemanticNode parent, Integer numberOnPage, Page page) { - SearchTextWithTextPositionDto searchTextWithTextPositionDto = SearchTextWithTextPositionFactory.buildSearchTextToTextPositionModel(sequences); + SearchTextWithTextPositionDto searchTextWithTextPositionDto = SearchTextWithTextPositionFactory.buildSearchTextToTextPositionDto(sequences); int offset = stringOffset; stringOffset += searchTextWithTextPositionDto.getSearchText().length(); long idx = textBlockIdx; textBlockIdx++; - return AtomicTextBlock.fromSearchTextWithTextPositionDto(searchTextWithTextPositionDto, parent, offset, idx, numberOnPage, page); + String orientation; + int textDirection; + if (sequences.isEmpty()) { + orientation = null; + textDirection = 0; + } else { + orientation = sequences.get(0).getDir().toString(); + textDirection = sequences.get(0).getRotation(); + } + return AtomicTextBlock.fromSearchTextWithTextPosition(searchTextWithTextPositionDto.getSearchText(), + searchTextWithTextPositionDto.getLineBreaks(), + searchTextWithTextPositionDto.getBoldTextBoundaries(), + searchTextWithTextPositionDto.getItalicTextBoundaries(), + searchTextWithTextPositionDto.getPositions(), + searchTextWithTextPositionDto.getStringIdxToPositionIdx(), + idx, + parent, + 
numberOnPage, + page, + offset, + orientation, + textDirection); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/Boundary.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/Boundary.java index 463b7a6..ff7366d 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/Boundary.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/Boundary.java @@ -6,8 +6,6 @@ import java.util.Collection; import java.util.LinkedList; import java.util.List; -import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; - import lombok.EqualsAndHashCode; import lombok.Setter; @@ -138,26 +136,4 @@ public class Boundary implements Comparable { return 0; } - - /** - * shrinks the boundary, such that textBlock.subSequence(boundary) returns a string without whitespaces. 
- * - * @param textBlock TextBlock to check whitespaces against - * @return boundary - */ - public Boundary trim(TextBlock textBlock) { - - int trimmedStart = this.start; - while (Character.isWhitespace(textBlock.charAt(trimmedStart))) { - trimmedStart++; - } - - int trimmedEnd = this.end; - while (Character.isWhitespace(textBlock.charAt(trimmedEnd - 1))) { - trimmedEnd--; - } - - return new Boundary(trimmedStart, Math.max(trimmedEnd, trimmedStart)); - } - } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/DocumentTree.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/DocumentTree.java index cea2557..2f5f0c7 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/DocumentTree.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/DocumentTree.java @@ -7,7 +7,7 @@ import java.util.LinkedList; import java.util.List; import java.util.stream.Stream; -import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.GenericSemanticNode; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Document.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Document.java index 9ebcce6..d286c65 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Document.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Document.java @@ -11,7 +11,7 @@ import java.util.stream.Collectors; import java.util.stream.Stream; import com.amazonaws.services.kms.model.NotFoundException; -import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Footer.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Footer.java index 59813b4..a8bef65 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Footer.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Footer.java @@ -4,7 +4,7 @@ import java.util.HashSet; import java.util.List; import java.util.Set; -import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity; import 
com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Header.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Header.java index a9dfce2..b405395 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Header.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Header.java @@ -4,7 +4,7 @@ import java.util.HashSet; import java.util.List; import java.util.Set; -import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Headline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Headline.java index 99e1adc..4856683 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Headline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Headline.java @@ -4,7 +4,7 @@ import java.util.HashSet; import java.util.List; import java.util.Set; -import 
com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Image.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Image.java index 7ec9926..058f322 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Image.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Image.java @@ -8,7 +8,7 @@ import java.util.List; import java.util.Map; import java.util.Set; -import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Paragraph.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Paragraph.java index 8943d56..2f471fa 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Paragraph.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Paragraph.java @@ -4,7 +4,7 @@ import java.util.HashSet; import java.util.List; import java.util.Set; -import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Section.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Section.java index 76e6f08..2a3f360 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Section.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Section.java @@ -4,7 +4,7 @@ import java.util.HashSet; import java.util.List; import java.util.Set; -import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/SemanticNode.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/SemanticNode.java index a9e753f..139464f 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/SemanticNode.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/SemanticNode.java @@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.graph.nodes; import static java.lang.String.format; import java.awt.geom.Rectangle2D; +import java.util.Comparator; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -10,7 +11,7 @@ import java.util.Set; import java.util.stream.Collectors; import java.util.stream.Stream; -import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.processor.graph.Boundary; import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.graph.entity.EntityType; @@ -59,6 +60,12 @@ public interface SemanticNode { } + default Page getFirstPage() { + + return getTextBlock().getPages().stream().min(Comparator.comparingInt(Page::getNumber)).orElseThrow(() -> new IllegalStateException("SemanticNode has no Page!")); + } + + /** * Each AtomicTextBlock is assigned a page, so to get the pages for this boundary, it collects the PageNodes from each AtomicTextBlock belonging to this node's TextBlock. 
* @@ -306,7 +313,6 @@ public interface SemanticNode { } - /** * This function is used during insertion of EntityNodes into the graph, it checks if the boundary of the RedactionEntity intersects or even contains the RedactionEntity. * It sets the fields accordingly and recursively calls this function on all its children. diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Table.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Table.java index 37c55bd..18118b5 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Table.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Table.java @@ -9,7 +9,7 @@ import java.util.Set; import java.util.stream.IntStream; import java.util.stream.Stream; -import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/TableCell.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/TableCell.java index 1a4f8a3..74a34e7 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/TableCell.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/TableCell.java @@ -7,7 +7,7 @@ import java.util.List; import java.util.Map; import java.util.Set; -import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/AtomicTextBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/AtomicTextBlock.java index 37eaf19..9db1955 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/AtomicTextBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/AtomicTextBlock.java @@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.graph.textblock; import static java.lang.String.format; import java.awt.geom.Rectangle2D; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Collections; @@ -10,9 +11,8 @@ import java.util.HashMap; import java.util.List; import java.util.Map; -import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData; -import com.knecon.fforesight.service.layoutparser.processor.factory.SearchTextWithTextPositionDto; +import 
com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicPositionBlockData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicTextBlockData; import com.knecon.fforesight.service.layoutparser.processor.graph.Boundary; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode; @@ -38,11 +38,20 @@ public class AtomicTextBlock implements TextBlock { //string coordinates Boundary boundary; String searchText; - List lineBreaks; + @Builder.Default + List lineBreaks = new ArrayList<>(); + @Builder.Default + List boldTextBoundaries = new ArrayList<>(); + @Builder.Default + List italicTextBoundaries = new ArrayList<>(); + String orientation; + int textDirection; //position coordinates - List stringIdxToPositionIdx; - List positions; + @Builder.Default + List stringIdxToPositionIdx = new ArrayList<>(); + @Builder.Default + List positions = new ArrayList<>(); @EqualsAndHashCode.Exclude SemanticNode parent; @@ -55,23 +64,34 @@ public class AtomicTextBlock implements TextBlock { } - public static AtomicTextBlock fromSearchTextWithTextPositionDto(SearchTextWithTextPositionDto searchTextWithTextPositionDto, - SemanticNode parent, - int stringOffset, - Long textBlockIdx, - Integer numberOnPage, - Page page) { + public static AtomicTextBlock fromSearchTextWithTextPosition(String searchText, + List lineBreaks, + List boldTextBoundaries, + List italicTextBoundaries, + List positions, + List stringIdxToPositionIdx, + long idx, + SemanticNode parent, + int numberOnPage, + Page page, + int offset, + String orientation, + int textDirection) { return AtomicTextBlock.builder() - .id(textBlockIdx) + .id(idx) .parent(parent) - .searchText(searchTextWithTextPositionDto.getSearchText()) + .searchText(searchText) .numberOnPage(numberOnPage) .page(page) - .lineBreaks(searchTextWithTextPositionDto.getLineBreaks()) - 
.positions(searchTextWithTextPositionDto.getPositions()) - .stringIdxToPositionIdx(searchTextWithTextPositionDto.getStringCoordsToPositionCoords()) - .boundary(new Boundary(stringOffset, stringOffset + searchTextWithTextPositionDto.getSearchText().length())) + .lineBreaks(lineBreaks) + .boldTextBoundaries(boldTextBoundaries) + .italicTextBoundaries(italicTextBoundaries) + .positions(positions) + .stringIdxToPositionIdx(stringIdxToPositionIdx) + .boundary(new Boundary(offset, offset + searchText.length())) + .textDirection(textDirection) + .orientation(orientation) .build(); } @@ -82,11 +102,8 @@ public class AtomicTextBlock implements TextBlock { .id(textBlockIdx) .boundary(new Boundary(stringOffset, stringOffset)) .searchText("") - .lineBreaks(Collections.emptyList()) .page(page) .numberOnPage(numberOnPage) - .stringIdxToPositionIdx(Collections.emptyList()) - .positions(Collections.emptyList()) .parent(parent) .build(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/ConcatenatedTextBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/ConcatenatedTextBlock.java index 69e0473..4e1a5fb 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/ConcatenatedTextBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/ConcatenatedTextBlock.java @@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.graph.textblock; import static java.lang.String.format; import java.awt.geom.Rectangle2D; +import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.LinkedList; @@ -182,4 +183,38 @@ public class ConcatenatedTextBlock implements 
TextBlock { return getSearchText(); } + + @Override + public List getBoldTextBoundaries() { + + return getAtomicTextBlocks().stream().map(AtomicTextBlock::getBoldTextBoundaries).flatMap(Collection::stream).toList(); + } + + + @Override + public List getItalicTextBoundaries() { + + return getAtomicTextBlocks().stream().map(AtomicTextBlock::getItalicTextBoundaries).flatMap(Collection::stream).toList(); + } + + + @Override + public String getOrientation() { + + if (atomicTextBlocks.isEmpty()) { + return ""; + } + return atomicTextBlocks.get(0).getOrientation(); + } + + + @Override + public int getTextDirection() { + + if (atomicTextBlocks.isEmpty()) { + return 0; + } + return atomicTextBlocks.get(0).getTextDirection(); + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/TextBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/TextBlock.java index 34a0f7a..69d1640 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/TextBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/TextBlock.java @@ -21,6 +21,18 @@ public interface TextBlock extends CharSequence { List getAtomicTextBlocks(); + List getBoldTextBoundaries(); + + + List getItalicTextBoundaries(); + + + String getOrientation(); + + + int getTextDirection(); + + Boundary getBoundary(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/DocumentDataMapper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/redaction/DocumentDataMapper.java similarity index 97% rename 
from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/DocumentDataMapper.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/redaction/DocumentDataMapper.java index 08f182d..c901d36 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/DocumentDataMapper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/redaction/DocumentDataMapper.java @@ -1,15 +1,15 @@ -package com.knecon.fforesight.service.layoutparser.processor.mapper; +package com.knecon.fforesight.service.layoutparser.processor.mapper.redaction; import java.awt.geom.Rectangle2D; import java.util.HashMap; import java.util.List; import java.util.Map; -import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.PageData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentTreeData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicPositionBlockData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicTextBlockData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.PageData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTreeData; import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; import 
com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Image; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/DocumentGraphMapper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/redaction/DocumentGraphMapper.java similarity index 97% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/DocumentGraphMapper.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/redaction/DocumentGraphMapper.java index 80973e1..633878c 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/DocumentGraphMapper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/redaction/DocumentGraphMapper.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.mapper; +package com.knecon.fforesight.service.layoutparser.processor.mapper.redaction; import java.util.Arrays; import java.util.HashSet; @@ -7,11 +7,11 @@ import java.util.List; import java.util.Map; import java.util.NoSuchElementException; -import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentTreeData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.PageData; +import 
com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicPositionBlockData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicTextBlockData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTreeData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.PageData; import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Footer; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/PropertiesMapper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/redaction/PropertiesMapper.java similarity index 99% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/PropertiesMapper.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/redaction/PropertiesMapper.java index cbb6d49..a7a5aeb 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/PropertiesMapper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/redaction/PropertiesMapper.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.mapper; +package com.knecon.fforesight.service.layoutparser.processor.mapper.redaction; import java.awt.geom.Rectangle2D; import java.util.Arrays; diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/taas/TaasDocumentDataMapper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/taas/TaasDocumentDataMapper.java
new file mode 100644
index 0000000..c0f40ca
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/taas/TaasDocumentDataMapper.java
@@ -0,0 +1,172 @@
+package com.knecon.fforesight.service.layoutparser.processor.mapper.taas;
+
+import java.awt.geom.Rectangle2D;
+import java.util.List;
+import java.util.Locale;
+import java.util.Objects;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.stream.IntStream;
+
+import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
+import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ParagraphData;
+import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.Range;
+import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData;
+import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.RowData;
+import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.StructureObject;
+import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.TableData;
+import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
+import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
+import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode;
+import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Table;
+import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.TableCell;
+import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
+
+/**
+ * Maps the document graph onto the TAAS data transfer objects ({@link ResearchDocumentData} and its parts).
+ */
+public class TaasDocumentDataMapper {
+
+    /**
+     * Maps every sub node of the document, except sections and table cells, to a consecutively numbered
+     * {@link StructureObject}. Tables carry {@link TableData}, all other nodes carry {@link ParagraphData}.
+     *
+     * @param document the document graph to map
+     * @return the research document data containing the structure objects in stream order
+     */
+    public static ResearchDocumentData fromDocument(Document document) {
+
+        AtomicInteger structureObjectNumber = new AtomicInteger();
+        List<StructureObject> structureObjects = document.streamAllSubNodes()
+                .filter(node -> !node.getType().equals(NodeType.TABLE_CELL))
+                .filter(node -> !node.getType().equals(NodeType.SECTION))
+                .map(node -> {
+                    if (node.getType().equals(NodeType.TABLE)) {
+                        return TaasDocumentDataMapper.fromTableWithTableData((Table) node, structureObjectNumber.getAndIncrement());
+                    } else {
+                        return TaasDocumentDataMapper.fromSemanticNodeWithParagraphData(node, structureObjectNumber.getAndIncrement());
+                    }
+                })
+                .toList();
+        return ResearchDocumentData.builder().structureObjects(structureObjects).build();
+    }
+
+
+    /**
+     * Builds the paragraph data (text, bold and italic ranges, orientation, text direction) of a text block.
+     *
+     * @param classification the classification label stored alongside the text
+     * @param textBlock      the text block to read from
+     * @return the paragraph data of the text block
+     */
+    public static ParagraphData fromTextBlock(String classification, TextBlock textBlock) {
+
+        return ParagraphData.builder()
+                .boldTextBoundaries(textBlock.getBoldTextBoundaries().stream().map(b -> new Range(b.start(), b.end())).toList())
+                .italicTextBoundaries(textBlock.getItalicTextBoundaries().stream().map(b -> new Range(b.start(), b.end())).toList())
+                .text(textBlock.getSearchText())
+                .classification(classification)
+                .orientation(textBlock.getOrientation())
+                .textDirection(textBlock.getTextDirection())
+                .build();
+    }
+
+
+    /**
+     * Maps each row of the table to {@link RowData}.
+     *
+     * @param table the table to map
+     * @return the table data holding the rows together with the number of columns and rows
+     * @throws IllegalArgumentException if a row contains no cells
+     */
+    public static TableData fromTable(Table table) {
+
+        List<RowData> rowData = IntStream.range(0, table.getNumberOfRows())
+                .mapToObj(rowIdx -> table.streamRow(rowIdx).toList())
+                .map(TaasDocumentDataMapper::fromTableCells)
+                .toList();
+        return new TableData(rowData, table.getNumberOfCols(), table.getNumberOfRows());
+    }
+
+
+    /**
+     * Maps the cells of one table row to {@link RowData}. The row counts as a header row only if every cell is a
+     * header cell; its bounding box is the union of all cell bounding boxes on the first page of the first cell.
+     *
+     * @param tableCells the cells of a single row, must not be empty
+     * @return the row data with header flag, cell texts and bounding box
+     * @throws IllegalArgumentException if no cells are provided
+     * @throws IllegalStateException    if none of the cells has a bounding box on the first page
+     */
+    public static RowData fromTableCells(List<TableCell> tableCells) {
+
+        if (tableCells.isEmpty()) {
+            throw new IllegalArgumentException("no table cells provided");
+        }
+        boolean header = tableCells.stream().allMatch(TableCell::isHeader);
+        Page firstPage = tableCells.get(0).getFirstPage();
+        // Union the rectangles directly instead of merging the per-cell maps, so the maps returned by getBBox() are never modified.
+        Rectangle2D bBox = tableCells.stream()
+                .map(tableCell -> tableCell.getBBox().get(firstPage))
+                .filter(Objects::nonNull)
+                .reduce((rect1, rect2) -> rect1.createUnion(rect2).getBounds2D())
+                .orElseThrow(() -> new IllegalStateException("no table cell has a bounding box on page " + firstPage.getNumber()));
+        List<String> cellText = tableCells.stream().map(TableCell::getTextBlock).map(TextBlock::getSearchText).toList();
+        return new RowData(header, cellText, toFloatArray(bBox));
+    }
+
+
+    /**
+     * Wraps a non-table node into a {@link StructureObject} carrying paragraph data. The lower-cased node type
+     * is used as classification; page and bounding box are taken from the first page of the node.
+     *
+     * @param semanticNode          the node to map
+     * @param structureObjectNumber the number assigned to the structure object
+     * @return the structure object with paragraph data and no table data
+     */
+    public static StructureObject fromSemanticNodeWithParagraphData(SemanticNode semanticNode, Integer structureObjectNumber) {
+
+        Page page = semanticNode.getFirstPage();
+        Rectangle2D bBox = semanticNode.getBBox().get(page);
+        return StructureObject.builder()
+                .structureObjectNumber(structureObjectNumber)
+                .boundingBox(toFloatArray(bBox))
+                .stringOffset(semanticNode.getBoundary().start())
+                .page(page.getNumber())
+                .paragraph(TaasDocumentDataMapper.fromTextBlock(semanticNode.getType().toString().toLowerCase(Locale.ROOT), semanticNode.getTextBlock()))
+                .table(null)
+                .build();
+    }
+
+
+    /**
+     * Wraps a table into a {@link StructureObject} carrying table data. Page and bounding box are taken from the
+     * first page of the table.
+     *
+     * @param table                 the table to map
+     * @param structureObjectNumber the number assigned to the structure object
+     * @return the structure object with table data and no paragraph data
+     */
+    public static StructureObject fromTableWithTableData(Table table, int structureObjectNumber) {
+
+        Page page = table.getFirstPage();
+        Rectangle2D bBox = table.getBBox().get(page);
+        return StructureObject.builder()
+                .structureObjectNumber(structureObjectNumber)
+                .boundingBox(toFloatArray(bBox))
+                .stringOffset(table.getBoundary().start())
+                .page(page.getNumber())
+                .paragraph(null)
+                .table(TaasDocumentDataMapper.fromTable(table))
+                .build();
+    }
+
+
+    /**
+     * Converts a rectangle to {x, y, width, height} with float precision.
+     */
+    private static float[] toFloatArray(Rectangle2D bBox) {
+
+        Objects.requireNonNull(bBox, "bounding box must not be null");
+        return new float[]{(float) bBox.getX(), (float) bBox.getY(), (float) bBox.getWidth(), (float) bBox.getHeight()};
+    }
+}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PdfVisualisationUtility.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PdfVisualisationUtility.java
index 2616560..9fd0b75 100644
---
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PdfVisualisationUtility.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PdfVisualisationUtility.java @@ -13,7 +13,7 @@ import org.apache.pdfbox.pdmodel.PDPageContentStream; import org.apache.pdfbox.pdmodel.font.PDType1Font; import org.apache.pdfbox.pdmodel.font.Standard14Fonts; -import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java
new file mode 100644
index 0000000..fa2152e
--- /dev/null
+++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java
@@ -0,0 +1,123 @@
+package com.knecon.fforesight.service.layoutparser.server;
+
+import java.awt.Color;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.time.Duration;
+import java.util.List;
+import java.util.stream.Stream;
+
+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.junit.jupiter.api.Test;
+import org.springframework.beans.factory.annotation.Autowired;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData;
+import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingService;
+import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse;
+import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
+import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
+import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
+import com.knecon.fforesight.service.layoutparser.processor.mapper.taas.TaasDocumentDataMapper;
+import com.knecon.fforesight.service.layoutparser.server.utils.BaseTest;
+import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
+
+import lombok.SneakyThrows;
+
+/**
+ * Runs the layout parser on every PDF below {@code /tmp/bdr_files} and writes the TAAS JSON plus a bounding box
+ * visualization for each of them to {@code /tmp/result}.
+ */
+public class BdrJsonBuildTest extends BaseTest {
+
+    @Autowired
+    private ObjectMapper objectMapper;
+
+    @Autowired
+    private LayoutParsingService layoutParsingService;
+
+
+    /**
+     * Loads the PDF and parses it into a document graph, using empty image and table service responses.
+     */
+    @SneakyThrows
+    protected Document buildGraph(File filename) {
+
+        try (InputStream inputStream = new FileInputStream(filename)) {
+            // NOTE(review): the PDDocument is never closed here - confirm the returned graph does not depend on it before closing it.
+            PDDocument pdDocument = Loader.loadPDF(inputStream);
+            return layoutParsingService.parseLayoutWithTimer(pdDocument, new ImageServiceResponse(), new TableServiceResponse());
+        }
+    }
+
+
+    /**
+     * Maps each PDF found below the source directory to research document data and writes the JSON and the
+     * visualization PDF next to each other into the target directory.
+     */
+    @Test
+    public void writeBDRDocumentData() throws IOException {
+
+        String sourcePath = "/tmp/bdr_files";
+        String targetPath = "/tmp/result";
+        // createDirectories reports a failure as IOException, File.mkdirs() would only return false
+        Files.createDirectories(Paths.get(targetPath));
+
+        List<File> files;
+        // Files.walk keeps directory handles open until the stream is closed
+        try (Stream<Path> paths = Files.walk(Paths.get(sourcePath))) {
+            files = paths.filter(currentPath -> currentPath.toString().endsWith(".pdf")).map(Path::toFile).toList();
+        }
+
+        System.out.printf("Found %d files \n", files.size());
+        for (int i = 0; i < files.size(); i++) {
+            System.out.println(i + ": " + files.get(i));
+        }
+
+        System.out.println();
+
+        for (var file : files) {
+            long start = System.currentTimeMillis();
+            System.out.println("Starting Structure Analysis for: " + file);
+            Document document = buildGraph(file);
+
+            long start2 = System.currentTimeMillis();
+            ResearchDocumentData researchDocumentData = TaasDocumentDataMapper.fromDocument(document);
+            researchDocumentData.setOriginalFile(file.toString());
+            System.out.printf(", mapped to research data %d ms \n", System.currentTimeMillis() - start2);
+
+            File jsonFile = Paths.get(targetPath, file.getName().replace(".pdf", ".json")).toFile();
+            try (FileOutputStream fileOutputStream = new FileOutputStream(jsonFile)) {
+                System.out.println("json written to: " + jsonFile);
+                fileOutputStream.write(objectMapper.writeValueAsBytes(researchDocumentData));
+            }
+            File visualizationFile = Paths.get(targetPath, file.getName().replace(".pdf", "_BBOX.pdf")).toFile();
+            visualizeSemanticNodes(file, visualizationFile, document, document.getTextBlock());
+            System.out.println("visualization pdf written to: " + visualizationFile);
+            System.out.printf("Full analysis and file creation took %s\n\n", Duration.ofMillis(System.currentTimeMillis() - start));
+        }
+
+    }
+
+
+    /**
+     * Loads the PDF again, draws the document graph and the given text block onto it and saves the result.
+     */
+    private static void visualizeSemanticNodes(File file, File resultingFileName, Document document, TextBlock textBlock) throws IOException {
+
+        // the PDDocument is part of the try-with-resources so it is closed even if drawing or saving fails
+        try (var fileStream = new FileInputStream(file); var outputStream = new FileOutputStream(resultingFileName); PDDocument pdDocument = Loader.loadPDF(fileStream)) {
+            PdfDraw.drawDocumentGraph(pdDocument, document);
+            PdfDraw.drawTextBlock(pdDocument, textBlock, PdfDraw.Options.builder().stroke(true).strokeWidth(0.1f).strokeColor(Color.YELLOW).build());
+            pdDocument.save(outputStream);
+        }
+    }
+
+}
diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java
index 0874336..194aa53 100644
--- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java @@ -8,9 +8,9 @@ import org.junit.jupiter.api.Test; import com.fasterxml.jackson.databind.ObjectMapper; import com.iqser.red.commons.jackson.ObjectMapperFactory; -import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; -import com.knecon.fforesight.service.layoutparser.processor.mapper.DocumentDataMapper; +import com.knecon.fforesight.service.layoutparser.processor.mapper.redaction.DocumentDataMapper; import lombok.SneakyThrows; diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphMappingTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphMappingTest.java index ccf6f82..d0207d9 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphMappingTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphMappingTest.java @@ -2,11 +2,11 @@ package com.knecon.fforesight.service.layoutparser.server.graph; import org.junit.jupiter.api.Test; -import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; import 
com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; -import com.knecon.fforesight.service.layoutparser.processor.mapper.DocumentDataMapper; -import com.knecon.fforesight.service.layoutparser.processor.mapper.DocumentGraphMapper; +import com.knecon.fforesight.service.layoutparser.processor.mapper.redaction.DocumentDataMapper; +import com.knecon.fforesight.service.layoutparser.processor.mapper.redaction.DocumentGraphMapper; import lombok.SneakyThrows; diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/visualizations/PdfDraw.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/visualizations/PdfDraw.java index e8fd284..2b8eeb4 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/visualizations/PdfDraw.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/visualizations/PdfDraw.java @@ -14,7 +14,7 @@ import org.apache.pdfbox.pdmodel.font.PDType1Font; import org.apache.pdfbox.pdmodel.font.Standard14Fonts; import org.apache.pdfbox.util.Matrix; -import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;