diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/AtomicPositionBlockData.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/AtomicPositionBlockData.java similarity index 96% rename from layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/AtomicPositionBlockData.java rename to layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/AtomicPositionBlockData.java index f61d380..42daa7b 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/AtomicPositionBlockData.java +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/AtomicPositionBlockData.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.internal.api.data; +package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction; import lombok.AccessLevel; import lombok.AllArgsConstructor; diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/AtomicTextBlockData.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/AtomicTextBlockData.java similarity index 96% rename from layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/AtomicTextBlockData.java rename to 
layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/AtomicTextBlockData.java index 04349e4..52c1f72 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/AtomicTextBlockData.java +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/AtomicTextBlockData.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.internal.api.data; +package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction; diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/DocumentData.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/DocumentData.java similarity index 96% rename from layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/DocumentData.java rename to layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/DocumentData.java index 9e85750..1f38471 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/DocumentData.java +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/DocumentData.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.internal.api.data; +package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction; import lombok.AccessLevel; import lombok.AllArgsConstructor; diff --git 
a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/DocumentTreeData.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/DocumentTreeData.java similarity index 99% rename from layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/DocumentTreeData.java rename to layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/DocumentTreeData.java index 3a14a37..b0d5433 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/DocumentTreeData.java +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/DocumentTreeData.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.internal.api.data; +package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction; import java.util.List; import java.util.Map; diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/NodeType.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/NodeType.java similarity index 95% rename from layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/NodeType.java rename to layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/NodeType.java index 91104f2..7b92adb 100644 --- 
a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/NodeType.java +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/NodeType.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.internal.api.data; +package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction; import java.util.Locale; diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/PageData.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/PageData.java similarity index 96% rename from layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/PageData.java rename to layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/PageData.java index 20c92a3..4ea8069 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/PageData.java +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/redaction/PageData.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.internal.api.data; +package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction; import lombok.AccessLevel; import lombok.AllArgsConstructor; diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/ParagraphData.java 
b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/ParagraphData.java new file mode 100644 index 0000000..d4c6251 --- /dev/null +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/ParagraphData.java @@ -0,0 +1,21 @@ +package com.knecon.fforesight.service.layoutparser.internal.api.data.taas; + +import java.util.List; + +import lombok.Builder; +import lombok.Data; + +@Data +@Builder +public class ParagraphData { + + private String text; + List boldTextBoundaries; + List italicTextBoundaries; + List linebreaks; + private String classification; + + private String orientation; + private int textDirection; + +} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/Range.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/Range.java new file mode 100644 index 0000000..a978cc0 --- /dev/null +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/Range.java @@ -0,0 +1,5 @@ +package com.knecon.fforesight.service.layoutparser.internal.api.data.taas; + +public record Range(int start, int end) { + +} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/ResearchDocumentData.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/ResearchDocumentData.java new file mode 100644 index 0000000..667fdee --- /dev/null +++ 
b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/ResearchDocumentData.java @@ -0,0 +1,16 @@ +package com.knecon.fforesight.service.layoutparser.internal.api.data.taas; + +import java.util.List; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; + +@Builder +@Data +@AllArgsConstructor +public class ResearchDocumentData { + + String originalFile; + List structureObjects; +} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/RowData.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/RowData.java new file mode 100644 index 0000000..388275b --- /dev/null +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/RowData.java @@ -0,0 +1,15 @@ +package com.knecon.fforesight.service.layoutparser.internal.api.data.taas; + +import java.util.List; + +import lombok.AllArgsConstructor; +import lombok.Data; + +@Data +@AllArgsConstructor +public class RowData { + + boolean header; + List cellText; + float[] bBox; +} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/StructureObject.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/StructureObject.java new file mode 100644 index 0000000..fca1eff --- /dev/null +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/StructureObject.java @@ -0,0 +1,19 @@ +package com.knecon.fforesight.service.layoutparser.internal.api.data.taas; + +import lombok.AllArgsConstructor; +import 
lombok.Builder; +import lombok.Data; + +@Data +@Builder +@AllArgsConstructor +public class StructureObject { + + Integer structureObjectNumber; + int page; + int stringOffset; + float[] boundingBox; + ParagraphData paragraph; + TableData table; + +} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/TableData.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/TableData.java new file mode 100644 index 0000000..e5153dd --- /dev/null +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/data/taas/TableData.java @@ -0,0 +1,15 @@ +package com.knecon.fforesight.service.layoutparser.internal.api.data.taas; + +import java.util.List; + +import lombok.AllArgsConstructor; +import lombok.Data; + +@Data +@AllArgsConstructor +public class TableData { + + List rowData; + Integer numberOfCols; + Integer numberOfRows; +} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingQueueNames.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingQueueNames.java index 812290b..4aabd62 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingQueueNames.java +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingQueueNames.java @@ -2,7 +2,7 @@ package com.knecon.fforesight.service.layoutparser.internal.api.queue; public class LayoutParsingQueueNames { - public static final String LAYOUT_PARSING_REQUEST_QUEUE = "LAYOUTPARSING_REQUEST_QUEUE"; - 
public static final String LAYOUT_PARSING_DLQ = "LAYOUTPARSING_DLQ"; - public static final String LAYOUT_PARSING_FINISHED_EVENT_QUEUE = "LAYOUTPARSING_FINISHED_EVENT_QUEUE"; + public static final String LAYOUT_PARSING_REQUEST_QUEUE = "layout_parsing_request_queue"; + public static final String LAYOUT_PARSING_DLQ = "layout_parsing_dead_letter_queue"; + public static final String LAYOUT_PARSING_FINISHED_EVENT_QUEUE = "layout_parsing_response_queue"; } diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingRequest.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingRequest.java index 721b9be..f4177c5 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingRequest.java +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingRequest.java @@ -8,13 +8,16 @@ import lombok.Builder; @Builder public record LayoutParsingRequest( + LayoutParsingType layoutParsingType, Map identifier, String originFileStorageId, Optional tablesFileStorageId, Optional imagesFileStorageId, String structureFileStorageId, + String researchDocumentStorageId, String textBlockFileStorageId, String positionBlockFileStorageId, - String pageFileStorageId) { + String pageFileStorageId, + String sectionGridStorageId) { } diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingType.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingType.java new file mode 100644 index 0000000..7598d29 --- /dev/null +++ 
b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingType.java @@ -0,0 +1,7 @@ +package com.knecon.fforesight.service.layoutparser.internal.api.queue; + +public enum LayoutParsingType { + REDACT_MANAGER, + TAAS, + DOCUMINE +} diff --git a/layoutparser-service/layoutparser-service-internal-api/src/test/resources/log4j2-test.xml b/layoutparser-service/layoutparser-service-internal-api/src/test/resources/log4j2-test.xml new file mode 100644 index 0000000..b4895cf --- /dev/null +++ b/layoutparser-service/layoutparser-service-internal-api/src/test/resources/log4j2-test.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/layoutparser-service/layoutparser-service-processor/pom.xml b/layoutparser-service/layoutparser-service-processor/pom.xml index f976cd6..8bcaeb5 100644 --- a/layoutparser-service/layoutparser-service-processor/pom.xml +++ b/layoutparser-service/layoutparser-service-processor/pom.xml @@ -60,6 +60,12 @@ org.springframework.boot spring-boot-starter-amqp + + org.junit.jupiter + junit-jupiter + RELEASE + test + diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java new file mode 100644 index 0000000..f945b6e --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -0,0 +1,141 @@ +package com.knecon.fforesight.service.layoutparser.processor; + +import static java.lang.String.format; + +import java.io.IOException; + +import org.apache.pdfbox.pdmodel.PDDocument; +import org.springframework.stereotype.Service; + +import 
com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent; +import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; +import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; +import com.knecon.fforesight.service.layoutparser.processor.adapter.CvTableParsingAdapter; +import com.knecon.fforesight.service.layoutparser.processor.adapter.ImageServiceResponseAdapter; +import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse; +import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse; +import com.knecon.fforesight.service.layoutparser.processor.factory.DocumentGraphFactory; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; +import com.knecon.fforesight.service.layoutparser.processor.mapper.redaction.DocumentDataMapper; +import com.knecon.fforesight.service.layoutparser.processor.mapper.taas.TaasDocumentDataMapper; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.services.PdfParsingService; +import com.knecon.fforesight.service.layoutparser.processor.services.SectionGridCreatorService; +import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService; +import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService; +import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService; +import com.knecon.fforesight.service.layoutparser.processor.services.classification.TaasClassificationService; + +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@Service +@RequiredArgsConstructor +public class LayoutParsingPipeline { + + private final ImageServiceResponseAdapter imageServiceResponseAdapter; + 
private final CvTableParsingAdapter cvTableParsingAdapter; + private final LayoutParsingStorageService layoutParsingStorageService; + private final PdfParsingService pdfParsingService; + private final SectionsBuilderService sectionsBuilderService; + private final SectionGridCreatorService sectionGridCreatorService; + private final TaasClassificationService taasClassificationService; + private final RedactManagerClassificationService redactManagerClassificationService; + private final DocuMineClassificationService docuMineClassificationService; + + + public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException { + + long start = System.currentTimeMillis(); + PDDocument originDocument = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId()); + + ImageServiceResponse imageServiceResponse = new ImageServiceResponse(); + if (layoutParsingRequest.imagesFileStorageId().isPresent()) { + imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.pageFileStorageId()); + } + + TableServiceResponse tableServiceResponse = new TableServiceResponse(); + if (layoutParsingRequest.tablesFileStorageId().isPresent()) { + tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.pageFileStorageId()); + } + + Document documentGraph = parseLayout(layoutParsingRequest.layoutParsingType(), originDocument, imageServiceResponse, tableServiceResponse); + int numberOfPages = originDocument.getNumberOfPages(); + originDocument.close(); + + layoutParsingStorageService.storeSectionGrid(layoutParsingRequest, sectionGridCreatorService.createSectionGrid(documentGraph)); + layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph)); + + if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.TAAS)) { + var researchDocumentData = 
TaasDocumentDataMapper.fromDocument(documentGraph); + layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData); + } + + return LayoutParsingFinishedEvent.builder() + .identifier(layoutParsingRequest.identifier()) + .numberOfPages(numberOfPages) + .duration(System.currentTimeMillis() - start) + .message(format("Layout parsing is finished and files have been saved with Ids:\n Structure: %s\nText: %s\nPositions: %s\nPageData: %s", + layoutParsingRequest.structureFileStorageId(), + layoutParsingRequest.textBlockFileStorageId(), + layoutParsingRequest.positionBlockFileStorageId(), + layoutParsingRequest.pageFileStorageId())) + .build(); + } + + + public Document parseLayout(LayoutParsingType layoutParsingType, + PDDocument originDocument, + ImageServiceResponse imageServiceResponse, + TableServiceResponse tableServiceResponse) { + + ClassificationDocument classificationDocument = pdfParsingService.parseDocument(layoutParsingType, + originDocument, + cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse), + imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse)); + + switch (layoutParsingType) { + case TAAS -> taasClassificationService.classifyDocument(classificationDocument); + case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument); + case REDACT_MANAGER -> redactManagerClassificationService.classifyDocument(classificationDocument); + } + + sectionsBuilderService.buildSections(classificationDocument); + + return DocumentGraphFactory.buildDocumentGraph(classificationDocument); + } + + + public Document parseLayoutWithTimer(LayoutParsingType layoutParsingType, + PDDocument originDocument, + ImageServiceResponse imageServiceResponse, + TableServiceResponse tableServiceResponse) { + + long start = System.currentTimeMillis(); + + ClassificationDocument classificationDocument = pdfParsingService.parseDocument(layoutParsingType, originDocument, + 
cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse), + imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse)); + + System.out.printf("parsed %d ms", System.currentTimeMillis() - start); + + start = System.currentTimeMillis(); + switch (layoutParsingType) { + case TAAS -> taasClassificationService.classifyDocument(classificationDocument); + case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument); + case REDACT_MANAGER -> redactManagerClassificationService.classifyDocument(classificationDocument); + } + System.out.printf(", classified %d ms", System.currentTimeMillis() - start); + + start = System.currentTimeMillis(); + sectionsBuilderService.buildSections(classificationDocument); + System.out.printf(", sections built %d ms", System.currentTimeMillis() - start); + + start = System.currentTimeMillis(); + Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument); + System.out.printf(", graph constructed %d ms", System.currentTimeMillis() - start); + return document; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingService.java deleted file mode 100644 index 33f309b..0000000 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingService.java +++ /dev/null @@ -1,87 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.processor; - -import static java.lang.String.format; - -import java.io.IOException; - -import org.apache.pdfbox.pdmodel.PDDocument; -import org.springframework.stereotype.Service; - -import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent; -import 
com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; -import com.knecon.fforesight.service.layoutparser.processor.adapter.CvTableParsingAdapter; -import com.knecon.fforesight.service.layoutparser.processor.adapter.ImageServiceResponseAdapter; -import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse; -import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument; -import com.knecon.fforesight.service.layoutparser.processor.classification.service.ClassificationService; -import com.knecon.fforesight.service.layoutparser.processor.classification.service.PdfParsingService; -import com.knecon.fforesight.service.layoutparser.processor.classification.service.SectionsBuilderService; -import com.knecon.fforesight.service.layoutparser.processor.factory.DocumentGraphFactory; -import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; -import com.knecon.fforesight.service.layoutparser.processor.mapper.DocumentDataMapper; - -import lombok.RequiredArgsConstructor; -import lombok.extern.slf4j.Slf4j; - -@Slf4j -@Service -@RequiredArgsConstructor -public class LayoutParsingService { - - private final ImageServiceResponseAdapter imageServiceResponseAdapter; - private final CvTableParsingAdapter cvTableParsingAdapter; - private final LayoutParsingStorageService layoutParsingStorageService; - private final PdfParsingService pdfParsingService; - private final ClassificationService classificationService; - private final SectionsBuilderService sectionsBuilderService; - - - public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException { - - long start = System.currentTimeMillis(); - PDDocument originDocument = 
layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId()); - - ImageServiceResponse imageServiceResponse = new ImageServiceResponse(); - if (layoutParsingRequest.imagesFileStorageId().isPresent()) { - imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.pageFileStorageId()); - } - - TableServiceResponse tableServiceResponse = new TableServiceResponse(); - if (layoutParsingRequest.tablesFileStorageId().isPresent()) { - tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.pageFileStorageId()); - } - - Document documentGraph = parseLayout(originDocument, imageServiceResponse, tableServiceResponse); - int numberOfPages = originDocument.getNumberOfPages(); - originDocument.close(); - - layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph)); - - return LayoutParsingFinishedEvent.builder() - .identifier(layoutParsingRequest.identifier()) - .numberOfPages(numberOfPages) - .duration(System.currentTimeMillis() - start) - .message(format("Layout parsing is finished and files have been saved with Ids:\n Structure: %s\nText: %s\nPositions: %s\nPageData: %s", - layoutParsingRequest.structureFileStorageId(), - layoutParsingRequest.textBlockFileStorageId(), - layoutParsingRequest.positionBlockFileStorageId(), - layoutParsingRequest.pageFileStorageId())) - .build(); - } - - - public Document parseLayout(PDDocument originDocument, ImageServiceResponse imageServiceResponse, TableServiceResponse tableServiceResponse) { - - ClassificationDocument classificationDocument = pdfParsingService.parseDocument(originDocument, - cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse), - imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse)); - - classificationService.classifyDocument(classificationDocument); - - sectionsBuilderService.buildSections(classificationDocument); - - return 
DocumentGraphFactory.buildDocumentGraph(classificationDocument); - } - -} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java index 8a1fac9..4a82ff8 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java @@ -13,12 +13,14 @@ import org.apache.pdfbox.pdmodel.PDDocument; import org.springframework.stereotype.Service; import com.fasterxml.jackson.databind.ObjectMapper; +import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionGrid; import com.iqser.red.storage.commons.service.StorageService; -import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentTreeData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.PageData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicPositionBlockData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicTextBlockData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTreeData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.PageData; +import 
com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse; @@ -67,13 +69,24 @@ public class LayoutParsingStorageService { } - public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, DocumentData documentData) throws IOException { + public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, DocumentData documentData) { storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), documentData.getDocumentTreeData()); storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.textBlockFileStorageId(), documentData.getAtomicTextBlocks()); storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.positionBlockFileStorageId(), documentData.getAtomicPositionBlocks()); storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.pageFileStorageId(), documentData.getPages()); + } + + public void storeSectionGrid(LayoutParsingRequest layoutParsingRequest, SectionGrid sectionGrid) { + + storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.sectionGridStorageId(), sectionGrid); + } + + + public void storeResearchDocumentData(LayoutParsingRequest layoutParsingRequest, ResearchDocumentData researchDocumentData) { + + storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.researchDocumentStorageId(), researchDocumentData); } @@ -86,9 +99,7 @@ public class LayoutParsingStorageService { AtomicPositionBlockData[] atomicPositionBlockData = storageService.readJSONObject(TenantContext.getTenantId(), layoutParsingRequest.positionBlockFileStorageId(), 
AtomicPositionBlockData[].class); - DocumentTreeData tableOfContentsData = storageService.readJSONObject(TenantContext.getTenantId(), - layoutParsingRequest.structureFileStorageId(), - DocumentTreeData.class); + DocumentTreeData tableOfContentsData = storageService.readJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), DocumentTreeData.class); return DocumentData.builder() .documentTreeData(tableOfContentsData) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/ImageServiceResponseAdapter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/ImageServiceResponseAdapter.java index 29898c9..4b64f1c 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/ImageServiceResponseAdapter.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/adapter/ImageServiceResponseAdapter.java @@ -10,8 +10,8 @@ import java.util.Map; import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.ImageType; import lombok.RequiredArgsConstructor; diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/Orientation.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/Orientation.java deleted file mode 100644 index 75ae7bd..0000000 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/Orientation.java +++ /dev/null @@ -1,8 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model; - -public enum Orientation { - - NONE, - LEFT, - RIGHT -} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/FileUtils.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/FileUtils.java deleted file mode 100644 index 8196f3b..0000000 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/FileUtils.java +++ /dev/null @@ -1,56 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.utils; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; - -import lombok.experimental.UtilityClass; -import lombok.extern.slf4j.Slf4j; - -@Slf4j -@UtilityClass -public class FileUtils { - - public File createTempFile(String filenamePrefix, String filenameSuffix) throws IOException { - - File tempFile = Files.createTempFile(filenamePrefix, filenameSuffix).toFile(); - setRWPermissionsOnlyForOwner(tempFile); - - return tempFile; - } - - - /** - * Deletes a file; logs a message with the reason if the deletion fails. - * This method is null-safe. - * - * @param file The file to delete. Can be null. 
- */ - public void deleteFile(File file) { - - if (file != null) { - try { - Files.deleteIfExists(file.toPath()); - } catch (IOException ex) { - log.warn("Could not delete file!", ex); - } - } - } - - - // We don't need to check the results of the permission setters below, - // since we're manipulating a file we created ourselves. - @SuppressWarnings({"ResultOfMethodCallIgnored", "squid:S899"}) - private void setRWPermissionsOnlyForOwner(File tempFile) { - - try { - tempFile.setReadable(true, true); - tempFile.setWritable(true, true); - tempFile.setExecutable(false); - } catch (SecurityException ex) { - // This should never happen since we're creating a temp file ourselves. - log.warn("Caught an exception during temp file creation. This should not happend. Check the code.", ex); - } - } - -} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/DocumentGraphFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/DocumentGraphFactory.java index 4897aa2..8e56f49 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/DocumentGraphFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/DocumentGraphFactory.java @@ -13,13 +13,13 @@ import java.util.Map; import java.util.NoSuchElementException; import java.util.Set; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationFooter; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationHeader; -import 
com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Footer; @@ -81,8 +81,9 @@ public class DocumentGraphFactory { page.getMainBody().add(node); - List textBlocks = new ArrayList<>(textBlocksToMerge); + List textBlocks = new ArrayList<>(); textBlocks.add(originalTextBlock); + textBlocks.addAll(textBlocksToMerge); AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), node, context, page); List treeId = context.documentTree.createNewChildEntryAndReturnId(parentNode, node); node.setLeafTextBlock(textBlock); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SearchTextWithTextPositionDto.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SearchTextWithTextPositionDto.java index 223492c..231148e 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SearchTextWithTextPositionDto.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SearchTextWithTextPositionDto.java @@ -4,6 +4,8 @@ import java.awt.geom.Rectangle2D; import java.util.Collections; import java.util.List; +import com.knecon.fforesight.service.layoutparser.processor.graph.Boundary; + import lombok.AccessLevel; import lombok.Builder; import lombok.Getter; @@ -16,7 +18,9 @@ public class SearchTextWithTextPositionDto { String searchText; List lineBreaks; - List stringCoordsToPositionCoords; + List stringIdxToPositionIdx; + List boldTextBoundaries; + List italicTextBoundaries; List positions; @@ -26,7 +30,7 @@ public class SearchTextWithTextPositionDto { .searchText("") .lineBreaks(Collections.emptyList()) .positions(Collections.emptyList()) - .stringCoordsToPositionCoords(Collections.emptyList()) + .stringIdxToPositionIdx(Collections.emptyList()) .build(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SearchTextWithTextPositionFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SearchTextWithTextPositionFactory.java index 98033f1..afc179a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SearchTextWithTextPositionFactory.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SearchTextWithTextPositionFactory.java @@ -2,13 +2,15 @@ package com.knecon.fforesight.service.layoutparser.processor.factory; import java.awt.geom.AffineTransform; import java.awt.geom.Rectangle2D; +import java.util.Collections; import java.util.LinkedList; import java.util.List; import java.util.Objects; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.RedTextPosition; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextDirection; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.graph.Boundary; import lombok.experimental.UtilityClass; @@ -24,7 +26,7 @@ public class SearchTextWithTextPositionFactory { public final int MAX_HYPHEN_LINEBREAK_DISTANCE = 3; - public SearchTextWithTextPositionDto buildSearchTextToTextPositionModel(List sequences) { + public SearchTextWithTextPositionDto buildSearchTextToTextPositionDto(List sequences) { if (sequences.isEmpty() || sequences.stream().allMatch(sequence -> sequence.getTextPositions().isEmpty())) { return SearchTextWithTextPositionDto.empty(); @@ -69,8 +71,10 @@ public class SearchTextWithTextPositionFactory { return SearchTextWithTextPositionDto.builder() .searchText(context.stringBuilder.toString()) .lineBreaks(context.lineBreaksStringIdx) - .stringCoordsToPositionCoords(context.stringIdxToPositionIdx) + .stringIdxToPositionIdx(context.stringIdxToPositionIdx) .positions(positions) + 
.boldTextBoundaries(mergeToBoundaries(context.boldTextsStringIdx)) + .italicTextBoundaries(mergeToBoundaries(context.italicTextStringIdx)) .build(); } @@ -82,6 +86,8 @@ public class SearchTextWithTextPositionFactory { // unicode characters with more than 16-bit encoding have a length > 1 in java strings for (int j = 0; j < currentTextPosition.getUnicode().length(); j++) { context.stringIdxToPositionIdx.add(context.positionIdx); + addTextPositionWithFontType(currentTextPosition, "bold", context.boldTextsStringIdx, context.stringIdx); + addTextPositionWithFontType(currentTextPosition, "italic", context.italicTextStringIdx, context.stringIdx); } context.stringIdx += currentTextPosition.getUnicode().length(); } @@ -103,6 +109,33 @@ public class SearchTextWithTextPositionFactory { return context.stringIdx - context.lastHyphenIdx < MAX_HYPHEN_LINEBREAK_DISTANCE; } + private static List mergeToBoundaries(List integers) { + + if (integers.isEmpty()) { + return Collections.emptyList(); + } + List boundaries = new LinkedList<>(); + int start = integers.get(0); + int end = integers.get(0) + 1; + for (int current : integers) { + if (current > end + 1) { + boundaries.add(new Boundary(start, end)); + start = current; + } + end = current + 1; + } + if (boundaries.isEmpty()) + boundaries.add(new Boundary(start, end)); + return boundaries; + } + + + private static void addTextPositionWithFontType(RedTextPosition currentTextPosition, String fontType, List fontTypePositions, int stringIdx) { + + if (currentTextPosition.getFontName().toLowerCase().contains(fontType)) { + fontTypePositions.add(stringIdx); + } + } private boolean isLineBreak(RedTextPosition currentTextPosition, RedTextPosition previousTextPosition) { @@ -173,6 +206,8 @@ public class SearchTextWithTextPositionFactory { List stringIdxToPositionIdx = new LinkedList<>(); List lineBreaksStringIdx = new LinkedList<>(); + List boldTextsStringIdx = new LinkedList<>(); + List italicTextStringIdx = new LinkedList<>(); 
StringBuilder stringBuilder = new StringBuilder(); int stringIdx; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SectionNodeFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SectionNodeFactory.java index d3942fa..01eea15 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SectionNodeFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/SectionNodeFactory.java @@ -10,10 +10,10 @@ import java.util.List; import java.util.Map; import java.util.Set; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.GenericSemanticNode; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Section; @@ -80,7 +80,7 @@ public class SectionNodeFactory { remainingBlocks.removeAll(alreadyMerged); if (abstractPageBlock instanceof TextPageBlock) { - List textBlocks = 
findTextBlocksWithSameClassificationAndAlignsY(abstractPageBlock, remainingBlocks); + List textBlocks = findTextBlocksWithSameClassificationAndAlignsY((TextPageBlock) abstractPageBlock, remainingBlocks); alreadyMerged.addAll(textBlocks); DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocks); } else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) { @@ -123,7 +123,7 @@ public class SectionNodeFactory { List previousList = splitList.get(i - 1); AbstractPageBlock lastPageBlockInPreviousList = previousList.get(previousList.size() - 1); if (lastPageBlockInPreviousList.isHeadline()) { - previousList.remove(i - 1); + previousList.remove(previousList.size() - 1); splitList.get(i).add(0, lastPageBlockInPreviousList); } } @@ -162,7 +162,7 @@ public class SectionNodeFactory { } - private List findTextBlocksWithSameClassificationAndAlignsY(AbstractPageBlock atc, List pageBlocks) { + private List findTextBlocksWithSameClassificationAndAlignsY(TextPageBlock atc, List pageBlocks) { return pageBlocks.stream() .filter(abstractTextContainer -> !abstractTextContainer.equals(atc)) @@ -170,6 +170,7 @@ public class SectionNodeFactory { .filter(abstractTextContainer -> abstractTextContainer instanceof TextPageBlock) .filter(abstractTextContainer -> abstractTextContainer.intersectsY(atc)) .map(abstractTextContainer -> (TextPageBlock) abstractTextContainer) + .filter(abstractTextContainer -> abstractTextContainer.getDir() == atc.getDir()) .toList(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/TableNodeFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/TableNodeFactory.java index 0124ec1..13977a1 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/TableNodeFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/TableNodeFactory.java @@ -7,10 +7,10 @@ import java.util.List; import java.util.Set; import java.util.stream.Collectors; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.GenericSemanticNode; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/TextBlockFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/TextBlockFactory.java index caf01f9..399c9d5 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/TextBlockFactory.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/factory/TextBlockFactory.java @@ -2,7 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.factory; import java.util.List; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode; import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.AtomicTextBlock; @@ -26,12 +26,33 @@ public class TextBlockFactory { public AtomicTextBlock buildAtomicTextBlock(List sequences, SemanticNode parent, Integer numberOnPage, Page page) { - SearchTextWithTextPositionDto searchTextWithTextPositionDto = SearchTextWithTextPositionFactory.buildSearchTextToTextPositionModel(sequences); + SearchTextWithTextPositionDto searchTextWithTextPositionDto = SearchTextWithTextPositionFactory.buildSearchTextToTextPositionDto(sequences); int offset = stringOffset; stringOffset += searchTextWithTextPositionDto.getSearchText().length(); long idx = textBlockIdx; textBlockIdx++; - return AtomicTextBlock.fromSearchTextWithTextPositionDto(searchTextWithTextPositionDto, parent, offset, idx, numberOnPage, page); + String orientation; + int textDirection; + if (sequences.isEmpty()) { + orientation = null; + textDirection = 0; + } else { + orientation = sequences.get(0).getDir().toString(); + textDirection = sequences.get(0).getRotation(); + } + return AtomicTextBlock.fromSearchTextWithTextPosition(searchTextWithTextPositionDto.getSearchText(), + searchTextWithTextPositionDto.getLineBreaks(), + searchTextWithTextPositionDto.getBoldTextBoundaries(), + searchTextWithTextPositionDto.getItalicTextBoundaries(), + searchTextWithTextPositionDto.getPositions(), + 
searchTextWithTextPositionDto.getStringIdxToPositionIdx(), + idx, + parent, + numberOnPage, + page, + offset, + orientation, + textDirection); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/Boundary.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/Boundary.java index 463b7a6..5808e20 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/Boundary.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/Boundary.java @@ -5,8 +5,7 @@ import static java.lang.String.format; import java.util.Collection; import java.util.LinkedList; import java.util.List; - -import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; +import java.util.stream.IntStream; import lombok.EqualsAndHashCode; import lombok.Setter; @@ -109,6 +108,10 @@ public class Boundary implements Comparable { return splitBoundaries; } + public IntStream intStream() { + + return IntStream.range(start, end); + } public static Boundary merge(Collection boundaries) { @@ -138,26 +141,4 @@ public class Boundary implements Comparable { return 0; } - - /** - * shrinks the boundary, such that textBlock.subSequence(boundary) returns a string without whitespaces. 
- * - * @param textBlock TextBlock to check whitespaces against - * @return boundary - */ - public Boundary trim(TextBlock textBlock) { - - int trimmedStart = this.start; - while (Character.isWhitespace(textBlock.charAt(trimmedStart))) { - trimmedStart++; - } - - int trimmedEnd = this.end; - while (Character.isWhitespace(textBlock.charAt(trimmedEnd - 1))) { - trimmedEnd--; - } - - return new Boundary(trimmedStart, Math.max(trimmedEnd, trimmedStart)); - } - } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/DocumentTree.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/DocumentTree.java index cea2557..2f5f0c7 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/DocumentTree.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/DocumentTree.java @@ -7,7 +7,7 @@ import java.util.LinkedList; import java.util.List; import java.util.stream.Stream; -import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.GenericSemanticNode; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Document.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Document.java index 9ebcce6..d286c65 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Document.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Document.java @@ -11,7 +11,7 @@ import java.util.stream.Collectors; import java.util.stream.Stream; import com.amazonaws.services.kms.model.NotFoundException; -import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Footer.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Footer.java index 59813b4..a8bef65 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Footer.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Footer.java @@ -4,7 +4,7 @@ import java.util.HashSet; import java.util.List; import java.util.Set; -import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity; import 
com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Header.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Header.java index a9dfce2..b405395 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Header.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Header.java @@ -4,7 +4,7 @@ import java.util.HashSet; import java.util.List; import java.util.Set; -import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Headline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Headline.java index 99e1adc..4856683 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Headline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Headline.java @@ -4,7 +4,7 @@ import java.util.HashSet; import java.util.List; import java.util.Set; -import 
com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Image.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Image.java index 7ec9926..058f322 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Image.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Image.java @@ -8,7 +8,7 @@ import java.util.List; import java.util.Map; import java.util.Set; -import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Page.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Page.java index f01cc38..5fcfa51 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Page.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Page.java @@ -5,7 +5,7 @@ import java.util.LinkedList; import java.util.List; import java.util.Set; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlockCollector; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Paragraph.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Paragraph.java index 8943d56..2f471fa 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Paragraph.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Paragraph.java @@ -4,7 +4,7 @@ import java.util.HashSet; import java.util.List; import java.util.Set; -import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; diff 
--git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Section.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Section.java index 76e6f08..2a3f360 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Section.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Section.java @@ -4,7 +4,7 @@ import java.util.HashSet; import java.util.List; import java.util.Set; -import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/SemanticNode.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/SemanticNode.java index a9e753f..d76b7a2 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/SemanticNode.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/SemanticNode.java @@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.graph.nodes; import static java.lang.String.format; import java.awt.geom.Rectangle2D; +import java.util.Comparator; import 
java.util.HashMap; import java.util.List; import java.util.Map; @@ -10,14 +11,14 @@ import java.util.Set; import java.util.stream.Collectors; import java.util.stream.Stream; -import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.processor.graph.Boundary; import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.graph.entity.EntityType; import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.AtomicTextBlock; import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; -import com.knecon.fforesight.service.layoutparser.processor.services.RectangleTransformations; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; public interface SemanticNode { @@ -59,6 +60,12 @@ public interface SemanticNode { } + default Page getFirstPage() { + + return getTextBlock().getPages().stream().min(Comparator.comparingInt(Page::getNumber)).orElseThrow(() -> new IllegalStateException("SemanticNode has no Page!")); + } + + /** * Each AtomicTextBlock is assigned a page, so to get the pages for this boundary, it collects the PageNodes from each AtomicTextBlock belonging to this node's TextBlock. * @@ -306,7 +313,6 @@ public interface SemanticNode { } - /** * This function is used during insertion of EntityNodes into the graph, it checks if the boundary of the RedactionEntity intersects or even contains the RedactionEntity. * It sets the fields accordingly and recursively calls this function on all its children. 
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Table.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Table.java index 37c55bd..18118b5 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Table.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/Table.java @@ -9,7 +9,7 @@ import java.util.Set; import java.util.stream.IntStream; import java.util.stream.Stream; -import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/TableCell.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/TableCell.java index 1a4f8a3..74a34e7 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/TableCell.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/nodes/TableCell.java @@ -7,7 +7,7 @@ import java.util.List; import java.util.Map; import java.util.Set; -import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType; +import 
com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity; import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/AtomicTextBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/AtomicTextBlock.java index 37eaf19..c1bde81 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/AtomicTextBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/AtomicTextBlock.java @@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.graph.textblock; import static java.lang.String.format; import java.awt.geom.Rectangle2D; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Collections; @@ -10,9 +11,8 @@ import java.util.HashMap; import java.util.List; import java.util.Map; -import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData; -import com.knecon.fforesight.service.layoutparser.processor.factory.SearchTextWithTextPositionDto; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicPositionBlockData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicTextBlockData; import com.knecon.fforesight.service.layoutparser.processor.graph.Boundary; import 
com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode; @@ -38,11 +38,20 @@ public class AtomicTextBlock implements TextBlock { //string coordinates Boundary boundary; String searchText; - List lineBreaks; + @Builder.Default + List lineBreaks = new ArrayList<>(); + @Builder.Default + List boldTextBoundaries = new ArrayList<>(); + @Builder.Default + List italicTextBoundaries = new ArrayList<>(); + String orientation; + int textDirection; //position coordinates - List stringIdxToPositionIdx; - List positions; + @Builder.Default + List stringIdxToPositionIdx = new ArrayList<>(); + @Builder.Default + List positions = new ArrayList<>(); @EqualsAndHashCode.Exclude SemanticNode parent; @@ -55,23 +64,34 @@ public class AtomicTextBlock implements TextBlock { } - public static AtomicTextBlock fromSearchTextWithTextPositionDto(SearchTextWithTextPositionDto searchTextWithTextPositionDto, - SemanticNode parent, - int stringOffset, - Long textBlockIdx, - Integer numberOnPage, - Page page) { + public static AtomicTextBlock fromSearchTextWithTextPosition(String searchText, + List lineBreaks, + List boldTextBoundaries, + List italicTextBoundaries, + List positions, + List stringIdxToPositionIdx, + long idx, + SemanticNode parent, + int numberOnPage, + Page page, + int offset, + String orientation, + int textDirection) { return AtomicTextBlock.builder() - .id(textBlockIdx) + .id(idx) .parent(parent) - .searchText(searchTextWithTextPositionDto.getSearchText()) + .searchText(searchText) .numberOnPage(numberOnPage) .page(page) - .lineBreaks(searchTextWithTextPositionDto.getLineBreaks()) - .positions(searchTextWithTextPositionDto.getPositions()) - .stringIdxToPositionIdx(searchTextWithTextPositionDto.getStringCoordsToPositionCoords()) - .boundary(new Boundary(stringOffset, stringOffset + searchTextWithTextPositionDto.getSearchText().length())) + .lineBreaks(lineBreaks) + 
.boldTextBoundaries(boldTextBoundaries) + .italicTextBoundaries(italicTextBoundaries) + .positions(positions) + .stringIdxToPositionIdx(stringIdxToPositionIdx) + .boundary(new Boundary(offset, offset + searchText.length())) + .textDirection(textDirection) + .orientation(orientation) .build(); } @@ -82,11 +102,8 @@ public class AtomicTextBlock implements TextBlock { .id(textBlockIdx) .boundary(new Boundary(stringOffset, stringOffset)) .searchText("") - .lineBreaks(Collections.emptyList()) .page(page) .numberOnPage(numberOnPage) - .stringIdxToPositionIdx(Collections.emptyList()) - .positions(Collections.emptyList()) .parent(parent) .build(); } @@ -191,7 +208,7 @@ public class AtomicTextBlock implements TextBlock { List rectanglesPerLine = stringBoundary.split(getAllLineBreaksInBoundary(stringBoundary)) .stream() .map(this::getPositions) - .map(RectangleTransformations::rectangleUnionWithGaps) + .map(RectangleTransformations::rectangleBBoxWithGaps) .flatMap(Collection::stream) .toList(); Map> rectanglePerLinePerPage = new HashMap<>(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/ConcatenatedTextBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/ConcatenatedTextBlock.java index 69e0473..4e1a5fb 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/ConcatenatedTextBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/ConcatenatedTextBlock.java @@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.graph.textblock; import static java.lang.String.format; import java.awt.geom.Rectangle2D; +import java.util.Collection; import java.util.Collections; import 
java.util.HashMap; import java.util.LinkedList; @@ -182,4 +183,38 @@ public class ConcatenatedTextBlock implements TextBlock { return getSearchText(); } + + @Override + public List getBoldTextBoundaries() { + + return getAtomicTextBlocks().stream().map(AtomicTextBlock::getBoldTextBoundaries).flatMap(Collection::stream).toList(); + } + + + @Override + public List getItalicTextBoundaries() { + + return getAtomicTextBlocks().stream().map(AtomicTextBlock::getItalicTextBoundaries).flatMap(Collection::stream).toList(); + } + + + @Override + public String getOrientation() { + + if (atomicTextBlocks.isEmpty()) { + return ""; + } + return atomicTextBlocks.get(0).getOrientation(); + } + + + @Override + public int getTextDirection() { + + if (atomicTextBlocks.isEmpty()) { + return 0; + } + return atomicTextBlocks.get(0).getTextDirection(); + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/TextBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/TextBlock.java index 34a0f7a..69d1640 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/TextBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/graph/textblock/TextBlock.java @@ -21,6 +21,18 @@ public interface TextBlock extends CharSequence { List getAtomicTextBlocks(); + List getBoldTextBoundaries(); + + + List getItalicTextBoundaries(); + + + String getOrientation(); + + + int getTextDirection(); + + Boundary getBoundary(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/DocumentDataMapper.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/redaction/DocumentDataMapper.java similarity index 97% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/DocumentDataMapper.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/redaction/DocumentDataMapper.java index 08f182d..c901d36 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/DocumentDataMapper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/redaction/DocumentDataMapper.java @@ -1,15 +1,15 @@ -package com.knecon.fforesight.service.layoutparser.processor.mapper; +package com.knecon.fforesight.service.layoutparser.processor.mapper.redaction; import java.awt.geom.Rectangle2D; import java.util.HashMap; import java.util.List; import java.util.Map; -import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.PageData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentTreeData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicPositionBlockData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicTextBlockData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.PageData; +import 
com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTreeData; import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Image; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/DocumentGraphMapper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/redaction/DocumentGraphMapper.java similarity index 97% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/DocumentGraphMapper.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/redaction/DocumentGraphMapper.java index 80973e1..633878c 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/DocumentGraphMapper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/redaction/DocumentGraphMapper.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.mapper; +package com.knecon.fforesight.service.layoutparser.processor.mapper.redaction; import java.util.Arrays; import java.util.HashSet; @@ -7,11 +7,11 @@ import java.util.List; import java.util.Map; import java.util.NoSuchElementException; -import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData; -import 
com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentTreeData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.PageData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicPositionBlockData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicTextBlockData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTreeData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.PageData; import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Footer; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/PropertiesMapper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/redaction/PropertiesMapper.java similarity index 97% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/PropertiesMapper.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/redaction/PropertiesMapper.java index cbb6d49..fdff5c1 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/PropertiesMapper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/redaction/PropertiesMapper.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.mapper; +package 
com.knecon.fforesight.service.layoutparser.processor.mapper.redaction; import java.awt.geom.Rectangle2D; import java.util.Arrays; @@ -10,7 +10,7 @@ import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Image; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.ImageType; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.TableCell; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Table; -import com.knecon.fforesight.service.layoutparser.processor.services.RectangleTransformations; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; public class PropertiesMapper { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/taas/TaasDocumentDataMapper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/taas/TaasDocumentDataMapper.java new file mode 100644 index 0000000..de18ba2 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/mapper/taas/TaasDocumentDataMapper.java @@ -0,0 +1,132 @@ +package com.knecon.fforesight.service.layoutparser.processor.mapper.taas; + +import java.awt.geom.Rectangle2D; +import java.util.List; +import java.util.Locale; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.IntStream; + +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; +import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ParagraphData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.Range; +import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.RowData; +import 
com.knecon.fforesight.service.layoutparser.internal.api.data.taas.StructureObject; +import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.TableData; +import com.knecon.fforesight.service.layoutparser.processor.graph.Boundary; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Table; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.TableCell; +import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; + +public class TaasDocumentDataMapper { + + public static ResearchDocumentData fromDocument(Document document) { + + AtomicInteger structureObjectNumber = new AtomicInteger(); + List structureObjects = document.streamAllSubNodes() + .filter(node -> !node.getType().equals(NodeType.TABLE_CELL)) + .filter(node -> !node.getType().equals(NodeType.SECTION)) + .map(node -> { + if (node.getType().equals(NodeType.TABLE)) { + return TaasDocumentDataMapper.fromTableWithTableData((Table) node, structureObjectNumber.getAndIncrement()); + } else { + return TaasDocumentDataMapper.fromSemanticNodeWithParagraphData(node, structureObjectNumber.getAndIncrement()); + } + }) + .toList(); + return ResearchDocumentData.builder().structureObjects(structureObjects).build(); + } + + + public static ParagraphData fromTextBlock(String classification, TextBlock textBlock) { + + return ParagraphData.builder() + .boldTextBoundaries(textBlock.getBoldTextBoundaries().stream().map(b -> new Range(b.start(), b.end())).toList()) + .italicTextBoundaries(textBlock.getItalicTextBoundaries().stream().map(b -> new Range(b.start(), b.end())).toList()) + .text(textBlock.getSearchText()) + .linebreaks(textBlock.getLineBreaks()) + .classification(classification) + 
.orientation(textBlock.getOrientation()) + .textDirection(textBlock.getTextDirection()) + .build(); + } + + + public static TableData fromTable(Table table) { + + List rowData = IntStream.range(0, table.getNumberOfRows()) + .boxed() + .map(rowIdx -> table.streamRow(rowIdx).toList()) + .map(TaasDocumentDataMapper::fromTableCells) + .toList(); + return new TableData(rowData, table.getNumberOfCols(), table.getNumberOfRows()); + } + + + public static RowData fromTableCells(List tableCells) { + + if (tableCells.isEmpty()) { + throw new IllegalArgumentException("no table cells provided"); + } + boolean header = tableCells.stream().allMatch(TableCell::isHeader); + Page firstPage = tableCells.get(0).getFirstPage(); + Rectangle2D bBox = tableCells.stream().map(TableCell::getBBox).reduce((map1, map2) -> { + map1.forEach((page, rectangle) -> map2.merge(page, rectangle, (rect1, rect2) -> rect1.createUnion(rect2).getBounds2D())); + return map2; + }).orElseThrow().get(firstPage); + List textBlocks = tableCells.stream().map(TableCell::getTextBlock).toList(); + return new RowData(header, textBlocks.stream().map(textBlock -> TaasDocumentDataMapper.fromTextBlock("table_cell", textBlock)).toList(), toFloatArray(bBox)); + } + + + private static Range toRange(Boundary boundary) { + + return new Range(boundary.start(), boundary.end()); + } + + + private static List toRange(List boundary) { + + return boundary.stream().map(TaasDocumentDataMapper::toRange).toList(); + } + + + public static StructureObject fromSemanticNodeWithParagraphData(SemanticNode semanticNode, Integer structureObjectNumber) { + + Page page = semanticNode.getFirstPage(); + Rectangle2D bBox = semanticNode.getBBox().get(page); + return StructureObject.builder() + .structureObjectNumber(structureObjectNumber) + .boundingBox(toFloatArray(bBox)) + .stringOffset(semanticNode.getBoundary().start()) + .page(page.getNumber()) + 
.paragraph(TaasDocumentDataMapper.fromTextBlock(semanticNode.getType().toString().toLowerCase(Locale.ROOT), semanticNode.getTextBlock())) + .table(null) + .build(); + } + + + public static StructureObject fromTableWithTableData(Table table, int structureObjectNumber) { + + Page page = table.getFirstPage(); + Rectangle2D bBox = table.getBBox().get(page); + return StructureObject.builder() + .structureObjectNumber(structureObjectNumber) + .boundingBox(toFloatArray(bBox)) + .stringOffset(table.getBoundary().start()) + .page(page.getNumber()) + .paragraph(null) + .table(TaasDocumentDataMapper.fromTable(table)) + .build(); + } + + + private static float[] toFloatArray(Rectangle2D bBox) { + + return new float[]{(float) bBox.getX(), (float) bBox.getY(), (float) bBox.getWidth(), (float) bBox.getHeight()}; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/AbstractPageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/AbstractPageBlock.java similarity index 91% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/AbstractPageBlock.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/AbstractPageBlock.java index 821a3f6..42ef081 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/AbstractPageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/AbstractPageBlock.java @@ -1,8 +1,8 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model; +package 
com.knecon.fforesight.service.layoutparser.processor.model; import com.fasterxml.jackson.annotation.JsonIgnore; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import lombok.AllArgsConstructor; import lombok.Data; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationDocument.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java similarity index 78% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationDocument.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java index 1ce5a1c..5062790 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationDocument.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationDocument.java @@ -1,11 +1,11 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model; +package com.knecon.fforesight.service.layoutparser.processor.model; import java.util.ArrayList; import java.util.List; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionGrid; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.StringFrequencyCounter; -import 
com.knecon.fforesight.service.layoutparser.processor.classification.model.text.UnclassifiedText; +import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; +import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText; import lombok.Data; import lombok.NoArgsConstructor; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationFooter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationFooter.java similarity index 50% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationFooter.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationFooter.java index 2aad008..c910293 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationFooter.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationFooter.java @@ -1,8 +1,8 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model; +package com.knecon.fforesight.service.layoutparser.processor.model; import java.util.List; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import lombok.AllArgsConstructor; import lombok.Data; diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationHeader.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationHeader.java similarity index 50% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationHeader.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationHeader.java index be4447d..e161801 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationHeader.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationHeader.java @@ -1,8 +1,8 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model; +package com.knecon.fforesight.service.layoutparser.processor.model; import java.util.List; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import lombok.AllArgsConstructor; import lombok.Data; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationPage.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java similarity index 75% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationPage.java 
rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java index 91dfd79..21796c8 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationPage.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java @@ -1,11 +1,11 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model; +package com.knecon.fforesight.service.layoutparser.processor.model; import java.util.ArrayList; import java.util.List; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.StringFrequencyCounter; +import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; import lombok.Data; import lombok.NonNull; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationSection.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationSection.java similarity index 68% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationSection.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationSection.java index 8de2007..7074282 100644 
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/ClassificationSection.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationSection.java @@ -1,10 +1,10 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model; +package com.knecon.fforesight.service.layoutparser.processor.model; import java.util.ArrayList; import java.util.List; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; import lombok.Data; import lombok.NoArgsConstructor; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/FloatFrequencyCounter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/FloatFrequencyCounter.java similarity index 96% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/FloatFrequencyCounter.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/FloatFrequencyCounter.java index 80bcbf6..a3d7917 100755 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/FloatFrequencyCounter.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/FloatFrequencyCounter.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model; +package com.knecon.fforesight.service.layoutparser.processor.model; import java.util.ArrayList; import java.util.Collections; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/GapInformation.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/GapInformation.java new file mode 100644 index 0000000..f445b63 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/GapInformation.java @@ -0,0 +1,27 @@ +package com.knecon.fforesight.service.layoutparser.processor.model; + +import java.awt.geom.Rectangle2D; +import java.util.LinkedList; +import java.util.List; + +import lombok.AccessLevel; +import lombok.AllArgsConstructor; +import lombok.Getter; +import lombok.experimental.FieldDefaults; + +@Getter +@AllArgsConstructor +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) +public class GapInformation { + + List> xGaps; + List> yGaps; + + + public GapInformation() { + + xGaps = new LinkedList<>(); + yGaps = new LinkedList<>(); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/LineInformation.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/LineInformation.java new file mode 100644 index 0000000..b09df49 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/LineInformation.java @@ -0,0 +1,23 @@ 
+package com.knecon.fforesight.service.layoutparser.processor.model; + +import java.awt.geom.Rectangle2D; +import java.util.List; + +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; + +import lombok.AccessLevel; +import lombok.AllArgsConstructor; +import lombok.Getter; +import lombok.experimental.FieldDefaults; + +@Getter +@AllArgsConstructor +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) +public class LineInformation { + + List lineBBox; + List> sequencesByLines; + List> bBoxWithGapsByLines; + List>> sequencesWithGapsByLines; + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/Orientation.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/Orientation.java new file mode 100644 index 0000000..eddfa8e --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/Orientation.java @@ -0,0 +1,8 @@ +package com.knecon.fforesight.service.layoutparser.processor.model; + +public enum Orientation { + + NONE, + LEFT, + RIGHT +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/PageBlockType.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageBlockType.java similarity index 89% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/PageBlockType.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageBlockType.java index 9740979..1292138 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/PageBlockType.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageBlockType.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model; +package com.knecon.fforesight.service.layoutparser.processor.model; public enum PageBlockType { H1, diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageContents.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageContents.java new file mode 100644 index 0000000..21d1e67 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageContents.java @@ -0,0 +1,21 @@ +package com.knecon.fforesight.service.layoutparser.processor.model; + +import java.awt.geom.Rectangle2D; +import java.util.List; + +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Getter; + +@Getter +@Builder +@AllArgsConstructor +public class PageContents { + + List sortedTextPositionSequences; + Rectangle2D cropBox; + Rectangle2D mediaBox; + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageInformation.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageInformation.java new file mode 100644 index 0000000..9080937 --- /dev/null +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/PageInformation.java @@ -0,0 +1,17 @@ +package com.knecon.fforesight.service.layoutparser.processor.model; + +import java.awt.geom.Rectangle2D; + +import lombok.AllArgsConstructor; +import lombok.Getter; + +@Getter +@AllArgsConstructor +public class PageInformation { + + PageContents pageContents; + LineInformation lineInformation; + Rectangle2D mainBodyTextFrame; + GapInformation gapInformation; + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/SectionIdentifier.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/SectionIdentifier.java new file mode 100644 index 0000000..7b6f8c4 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/SectionIdentifier.java @@ -0,0 +1,123 @@ +package com.knecon.fforesight.service.layoutparser.processor.model; + +import java.util.Collections; +import java.util.LinkedList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import lombok.AccessLevel; +import lombok.AllArgsConstructor; +import lombok.experimental.FieldDefaults; + +@AllArgsConstructor +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) +public class SectionIdentifier { + + static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?"); + + private enum Format { + EMPTY, + NUMERICAL, + DOCUMENT + } + + Format format; + String identifierString; + List identifiers; + boolean asChild; + + + public static SectionIdentifier fromSearchText(String headline) { + + if (headline == null || headline.isEmpty() || headline.isBlank()) { + return SectionIdentifier.empty(); + } + + Matcher 
numericalIdentifierMatcher = numericalIdentifierPattern.matcher(headline); + if (numericalIdentifierMatcher.find()) { + return buildNumericalSectionIdentifier(headline, numericalIdentifierMatcher); + } + // more formats here + return SectionIdentifier.empty(); + } + + + public static SectionIdentifier asChildOf(SectionIdentifier sectionIdentifier) { + + return new SectionIdentifier(sectionIdentifier.format, sectionIdentifier.toString(), sectionIdentifier.identifiers, true); + } + + + public static SectionIdentifier document() { + + return new SectionIdentifier(Format.DOCUMENT, "document", Collections.emptyList(), false); + } + + + public static SectionIdentifier empty() { + + return new SectionIdentifier(Format.EMPTY, "empty", Collections.emptyList(), false); + } + + + private static SectionIdentifier buildNumericalSectionIdentifier(String headline, Matcher numericalIdentifierMatcher) { + + String identifierString = headline.substring(numericalIdentifierMatcher.start(), numericalIdentifierMatcher.end()); + List identifiers = new LinkedList<>(); + for (int i = 1; i <= 4; i++) { + String numericalIdentifier = numericalIdentifierMatcher.group(i); + if (numericalIdentifier == null || numericalIdentifier.equals("0") || numericalIdentifier.isEmpty() || numericalIdentifier.isBlank()) { + break; + } + identifiers.add(Integer.parseInt(numericalIdentifier.trim())); + } + return new SectionIdentifier(Format.NUMERICAL, identifierString, identifiers.stream().toList(), false); + } + + + /** + * Determines if the current section is the parent of the given section. + * + * @param sectionIdentifier The section identifier to compare against. + * @return true if the current section is the parent of the given section, false otherwise. 
+ */ + public boolean isParentOf(SectionIdentifier sectionIdentifier) { + + if (this.format.equals(Format.EMPTY)) { + return false; + } + if (this.format.equals(Format.DOCUMENT)) { + return true; + } + if (!this.format.equals(sectionIdentifier.format)) { + return false; + } + if (this.identifiers.size() >= sectionIdentifier.identifiers.size() && !(this.identifiers.size() == sectionIdentifier.identifiers.size() && sectionIdentifier.asChild)) { + return false; + } + for (int i = 0; i < this.identifiers.size(); i++) { + if (!this.identifiers.get(i).equals(sectionIdentifier.identifiers.get(i))) { + return false; + } + } + return true; + } + + + public boolean isChildOf(SectionIdentifier sectionIdentifier) { + + if (this.format.equals(Format.DOCUMENT) || this.format.equals(Format.EMPTY)) { + return false; + } + return sectionIdentifier.isParentOf(this); + } + + + @Override + public String toString() { + + return identifierString; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/image/ClassifiedImage.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/image/ClassifiedImage.java similarity index 84% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/image/ClassifiedImage.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/image/ClassifiedImage.java index 3670100..b0da3b9 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/image/ClassifiedImage.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/image/ClassifiedImage.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model.image; +package com.knecon.fforesight.service.layoutparser.processor.model.image; import java.awt.geom.Rectangle2D; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/Cell.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Cell.java similarity index 82% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/Cell.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Cell.java index 42ac4be..2f324a4 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/Cell.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Cell.java @@ -1,13 +1,13 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model.table; +package com.knecon.fforesight.service.layoutparser.processor.model.table; import java.awt.geom.Point2D; import java.util.ArrayList; import java.util.Iterator; import java.util.List; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence; -import com.knecon.fforesight.service.layoutparser.processor.classification.utils.TextNormalizationUtilities; +import 
com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities; import lombok.Data; import lombok.EqualsAndHashCode; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/CellPosition.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/CellPosition.java similarity index 79% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/CellPosition.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/CellPosition.java index 2b5ef89..a47e928 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/CellPosition.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/CellPosition.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model.table; +package com.knecon.fforesight.service.layoutparser.processor.model.table; import lombok.RequiredArgsConstructor; import lombok.Value; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/CleanRulings.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/CleanRulings.java similarity index 65% rename from 
layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/CleanRulings.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/CleanRulings.java index daa1055..735d7a5 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/CleanRulings.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/CleanRulings.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model.table; +package com.knecon.fforesight.service.layoutparser.processor.model.table; import java.util.List; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/Rectangle.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Rectangle.java similarity index 98% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/Rectangle.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Rectangle.java index 4ce30df..c357ab7 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/Rectangle.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Rectangle.java @@ -1,4 +1,4 @@ -package 
com.knecon.fforesight.service.layoutparser.processor.classification.model.table; +package com.knecon.fforesight.service.layoutparser.processor.model.table; import java.awt.geom.Point2D; import java.awt.geom.Rectangle2D; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/Ruling.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Ruling.java similarity index 97% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/Ruling.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Ruling.java index 9759960..f2deee6 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/Ruling.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/Ruling.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model.table; +package com.knecon.fforesight.service.layoutparser.processor.model.table; import java.awt.geom.Line2D; import java.awt.geom.Point2D; @@ -11,8 +11,8 @@ import java.util.List; import java.util.Map; import java.util.TreeMap; -import com.knecon.fforesight.service.layoutparser.processor.classification.utils.CohenSutherlandClipping; -import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons; +import com.knecon.fforesight.service.layoutparser.processor.utils.CohenSutherlandClipping; +import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons; import lombok.extern.slf4j.Slf4j; diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/TablePageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java similarity index 96% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/TablePageBlock.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java index 23e5631..10331fe 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/table/TablePageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model.table; +package com.knecon.fforesight.service.layoutparser.processor.model.table; import java.awt.geom.Point2D; import java.util.ArrayList; @@ -8,9 +8,9 @@ import java.util.List; import java.util.Set; import java.util.TreeMap; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import lombok.Getter; import lombok.Setter; diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/RedTextPosition.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java similarity index 95% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/RedTextPosition.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java index 2a8de35..92059ae 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/RedTextPosition.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model.text; +package com.knecon.fforesight.service.layoutparser.processor.model.text; import org.apache.pdfbox.text.TextPosition; import org.springframework.beans.BeanUtils; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/SearchableText.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/SearchableText.java similarity index 84% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/SearchableText.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/SearchableText.java 
index b8081be..c0ef4e3 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/SearchableText.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/SearchableText.java @@ -1,8 +1,8 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model.text; +package com.knecon.fforesight.service.layoutparser.processor.model.text; import java.util.ArrayList; import java.util.List; -import com.knecon.fforesight.service.layoutparser.processor.classification.utils.TextNormalizationUtilities; +import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities; import lombok.Getter; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/SimplifiedSectionText.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/SimplifiedSectionText.java similarity index 74% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/SimplifiedSectionText.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/SimplifiedSectionText.java index beb8d8f..21d71be 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/SimplifiedSectionText.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/SimplifiedSectionText.java @@ -1,4 +1,4 @@ -package 
com.knecon.fforesight.service.layoutparser.processor.classification.model.text; +package com.knecon.fforesight.service.layoutparser.processor.model.text; import lombok.AllArgsConstructor; import lombok.Builder; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/SimplifiedText.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/SimplifiedText.java similarity index 80% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/SimplifiedText.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/SimplifiedText.java index ea9b7ca..d021c52 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/SimplifiedText.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/SimplifiedText.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model.text; +package com.knecon.fforesight.service.layoutparser.processor.model.text; import java.util.ArrayList; import java.util.List; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/StringFrequencyCounter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/StringFrequencyCounter.java similarity index 93% rename from 
layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/StringFrequencyCounter.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/StringFrequencyCounter.java index a210116..934b1b3 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/StringFrequencyCounter.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/StringFrequencyCounter.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model.text; +package com.knecon.fforesight.service.layoutparser.processor.model.text; import java.util.HashMap; import java.util.Map; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/TextDirection.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextDirection.java similarity index 92% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/TextDirection.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextDirection.java index e555301..8d1fa97 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/TextDirection.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextDirection.java @@ -1,4 +1,4 @@ 
-package com.knecon.fforesight.service.layoutparser.processor.classification.model.text; +package com.knecon.fforesight.service.layoutparser.processor.model.text; import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonValue; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/TextPageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java similarity index 66% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/TextPageBlock.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java index cbf6214..b9c816a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/TextPageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java @@ -1,21 +1,28 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model.text; +package com.knecon.fforesight.service.layoutparser.processor.model.text; + +import static java.util.stream.Collectors.toSet; import java.util.ArrayList; +import java.util.Comparator; import java.util.List; import com.fasterxml.jackson.annotation.JsonIgnore; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType; -import com.knecon.fforesight.service.layoutparser.processor.classification.utils.TextNormalizationUtilities; +import 
com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter; +import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; +import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons; +import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities; import lombok.AllArgsConstructor; import lombok.Builder; import lombok.Data; +import lombok.EqualsAndHashCode; import lombok.NoArgsConstructor; +@EqualsAndHashCode(callSuper = true) +@Data @AllArgsConstructor @Builder -@Data @NoArgsConstructor public class TextPageBlock extends AbstractPageBlock { @@ -67,6 +74,64 @@ public class TextPageBlock extends AbstractPageBlock { return sequences.get(0).getPageWidth(); } + public static TextPageBlock merge(List textBlocksToMerge) { + + List sequences = textBlocksToMerge.stream().map(TextPageBlock::getSequences).flatMap(java.util.Collection::stream).toList(); + sequences = new ArrayList<>(sequences); + return fromTextPositionSequences(sequences); + } + + public static TextPageBlock fromTextPositionSequences(List wordBlockList) { + + TextPageBlock textBlock = null; + + FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter(); + FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter(); + FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter(); + StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter(); + StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter(); + + for (TextPositionSequence wordBlock : wordBlockList) { + + lineHeightFrequencyCounter.add(wordBlock.getTextHeight()); + fontSizeFrequencyCounter.add(wordBlock.getFontSize()); + spaceFrequencyCounter.add(wordBlock.getSpaceWidth()); + fontFrequencyCounter.add(wordBlock.getFont()); + styleFrequencyCounter.add(wordBlock.getFontStyle()); + + if 
(textBlock == null) { + textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(), + wordBlock.getMaxXDirAdj(), + wordBlock.getMinYDirAdj(), + wordBlock.getMaxYDirAdj(), + wordBlockList, + wordBlock.getRotation()); + } else { + TextPageBlock spatialEntity = textBlock.union(wordBlock); + textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight()); + } + } + + if (textBlock != null) { + textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular()); + textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest()); + } + + if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences() + .stream() + .map(t -> DoubleComparisons.round(t.getMinYDirAdj(), 3)) + .collect(toSet()) + .size() == 1) { + textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj)); + } + return textBlock; + } + + /** * Returns the minX value in pdf coordinate system. 
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/TextPositionSequence.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java similarity index 93% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/TextPositionSequence.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java index fa1b243..d4e58e8 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/TextPositionSequence.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model.text; +package com.knecon.fforesight.service.layoutparser.processor.model.text; import java.awt.geom.AffineTransform; import java.awt.geom.Point2D; @@ -8,8 +8,8 @@ import java.util.stream.Collectors; import org.apache.pdfbox.text.TextPosition; +import com.dslplatform.json.JsonAttribute; import com.fasterxml.jackson.annotation.JsonIgnore; -import com.fasterxml.jackson.annotation.JsonIgnoreProperties; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; @@ -25,7 +25,6 @@ import lombok.extern.slf4j.Slf4j; @Builder @NoArgsConstructor @AllArgsConstructor -@JsonIgnoreProperties({"empty"}) public class TextPositionSequence implements CharSequence { public static final int HEIGHT_PADDING = 
2; @@ -36,6 +35,7 @@ public class TextPositionSequence implements CharSequence { private int rotation; private float pageHeight; private float pageWidth; + private boolean isParagraphStart; public TextPositionSequence(int page) { @@ -44,7 +44,7 @@ public class TextPositionSequence implements CharSequence { } - public TextPositionSequence(List textPositions, int page) { + public TextPositionSequence(List textPositions, int page, boolean isParagraphStart) { this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList()); this.page = page; @@ -52,6 +52,7 @@ public class TextPositionSequence implements CharSequence { this.rotation = textPositions.get(0).getRotation(); this.pageHeight = textPositions.get(0).getPageHeight(); this.pageWidth = textPositions.get(0).getPageWidth(); + this.isParagraphStart = isParagraphStart; } @@ -141,6 +142,7 @@ public class TextPositionSequence implements CharSequence { * @return the text direction adjusted minX value */ @JsonIgnore + @JsonAttribute(ignore = true) public float getMinXDirAdj() { return textPositions.get(0).getXDirAdj(); @@ -155,6 +157,7 @@ public class TextPositionSequence implements CharSequence { * @return the text direction adjusted maxX value */ @JsonIgnore + @JsonAttribute(ignore = true) public float getMaxXDirAdj() { return textPositions.get(textPositions.size() - 1).getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidthDirAdj() + HEIGHT_PADDING; @@ -169,6 +172,7 @@ public class TextPositionSequence implements CharSequence { * @return the text direction adjusted minY value. The upper border of the bounding box of the word. */ @JsonIgnore + @JsonAttribute(ignore = true) public float getMinYDirAdj() { return textPositions.get(0).getYDirAdj() - getTextHeight(); @@ -183,6 +187,7 @@ public class TextPositionSequence implements CharSequence { * @return the text direction adjusted maxY value. The lower border of the bounding box of the word. 
*/ @JsonIgnore + @JsonAttribute(ignore = true) public float getMaxYDirAdj() { return textPositions.get(0).getYDirAdj(); @@ -191,6 +196,7 @@ public class TextPositionSequence implements CharSequence { @JsonIgnore + @JsonAttribute(ignore = true) public float getTextHeight() { return textPositions.get(0).getHeightDir() + HEIGHT_PADDING; @@ -198,6 +204,7 @@ public class TextPositionSequence implements CharSequence { @JsonIgnore + @JsonAttribute(ignore = true) public float getHeight() { return getMaxYDirAdj() - getMinYDirAdj(); @@ -205,6 +212,7 @@ public class TextPositionSequence implements CharSequence { @JsonIgnore + @JsonAttribute(ignore = true) public float getWidth() { return getMaxXDirAdj() - getMinXDirAdj(); @@ -212,6 +220,7 @@ public class TextPositionSequence implements CharSequence { @JsonIgnore + @JsonAttribute(ignore = true) public String getFont() { return textPositions.get(0).getFontName().toLowerCase().replaceAll(",bold", "").replaceAll(",italic", ""); @@ -219,6 +228,7 @@ public class TextPositionSequence implements CharSequence { @JsonIgnore + @JsonAttribute(ignore = true) public String getFontStyle() { String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase(); @@ -237,6 +247,7 @@ public class TextPositionSequence implements CharSequence { @JsonIgnore + @JsonAttribute(ignore = true) public float getFontSize() { return textPositions.get(0).getFontSizeInPt(); @@ -244,6 +255,7 @@ public class TextPositionSequence implements CharSequence { @JsonIgnore + @JsonAttribute(ignore = true) public float getSpaceWidth() { return textPositions.get(0).getWidthOfSpace(); @@ -260,6 +272,7 @@ public class TextPositionSequence implements CharSequence { * @return bounding box of the word in Pdf Coordinate System */ @JsonIgnore + @JsonAttribute(ignore = true) @SneakyThrows public Rectangle getRectangle() { @@ -299,3 +312,4 @@ public class TextPositionSequence implements CharSequence { } } + diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/UnclassifiedText.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/UnclassifiedText.java similarity index 67% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/UnclassifiedText.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/UnclassifiedText.java index 0d9bfb4..7da98c5 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/model/text/UnclassifiedText.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/UnclassifiedText.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.model.text; +package com.knecon.fforesight.service.layoutparser.processor.model.text; import java.util.List; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/LegacyPDFStreamEngine.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/parsing/LegacyPDFStreamEngine.java similarity index 98% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/LegacyPDFStreamEngine.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/parsing/LegacyPDFStreamEngine.java index 5aa1439..f2ece49 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/LegacyPDFStreamEngine.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/parsing/LegacyPDFStreamEngine.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package com.knecon.fforesight.service.layoutparser.processor.classification.parsing; +package com.knecon.fforesight.service.layoutparser.processor.parsing; import java.io.IOException; import java.io.InputStream; @@ -76,7 +76,7 @@ import org.apache.pdfbox.util.Vector; * THIS CODE IS DELIBERATELY INCORRECT, USE PDFStreamEngine INSTEAD. */ @SuppressWarnings({"PMD", "checkstyle:all"}) -class LegacyPDFStreamEngine extends PDFStreamEngine { +public class LegacyPDFStreamEngine extends PDFStreamEngine { private static final Log LOG = LogFactory.getLog(LegacyPDFStreamEngine.class); @@ -126,7 +126,7 @@ class LegacyPDFStreamEngine extends PDFStreamEngine { * This will initialize and process the contents of the stream. * * @param page the page to process - * @throws IOException if there is an error accessing the stream. + * @throws java.io.IOException if there is an error accessing the stream. */ @Override public void processPage(PDPage page) throws IOException { @@ -149,7 +149,7 @@ class LegacyPDFStreamEngine extends PDFStreamEngine { * written by Ben Litchfield for PDFStreamEngine. 
*/ @Override - protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code,Vector displacement) throws IOException { + protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code, Vector displacement) throws IOException { // // legacy calculations which were previously in PDFStreamEngine // @@ -165,7 +165,7 @@ class LegacyPDFStreamEngine extends PDFStreamEngine { float displacementX = displacement.getX(); // the sorting algorithm is based on the width of the character. As the displacement - // for vertical characters doesn't provide any suitable value for it, we have to + // for vertical characters doesn't provide any suitable value for it, we have to // calculate our own if (font.isVertical()) { displacementX = font.getWidth(code) / 1000; @@ -382,3 +382,4 @@ class LegacyPDFStreamEngine extends PDFStreamEngine { } } + diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFLinesTextStripper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/parsing/PDFLinesTextStripper.java similarity index 89% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFLinesTextStripper.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/parsing/PDFLinesTextStripper.java index 223b0ba..549f726 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFLinesTextStripper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/parsing/PDFLinesTextStripper.java @@ -1,9 +1,9 @@ -package 
com.knecon.fforesight.service.layoutparser.processor.classification.parsing; +package com.knecon.fforesight.service.layoutparser.processor.parsing; +import java.awt.color.CMMException; import java.awt.geom.Point2D; import java.io.IOException; import java.util.ArrayList; -import java.util.Comparator; import java.util.List; import org.apache.pdfbox.contentstream.operator.Operator; @@ -32,14 +32,16 @@ import org.apache.pdfbox.cos.COSBase; import org.apache.pdfbox.cos.COSNumber; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.graphics.color.PDColor; import org.apache.pdfbox.text.TextPosition; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.RedTextPosition; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import lombok.Getter; import lombok.Setter; +import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; @Getter @@ -195,8 +197,8 @@ public class PDFLinesTextStripper extends PDFTextStripper { private void addVisibleRulings(List path, boolean stroke) throws IOException { try { - if (stroke && !getGraphicsState().getStrokingColor().isPattern() && getGraphicsState().getStrokingColor() - .toRGB() == 0 || !stroke && !getGraphicsState().getNonStrokingColor().isPattern() && getGraphicsState().getNonStrokingColor().toRGB() == 0) { + if (stroke && !getGraphicsState().getStrokingColor().isPattern() && isBlack(getGraphicsState().getStrokingColor()) || // + !stroke && !getGraphicsState().getNonStrokingColor().isPattern() && 
isBlack(getGraphicsState().getNonStrokingColor())) { rulings.addAll(path); } } catch (UnsupportedOperationException e) { @@ -207,14 +209,27 @@ public class PDFLinesTextStripper extends PDFTextStripper { } + @SneakyThrows + private boolean isBlack(PDColor color) { + + try { + return color.toRGB() == 0; + } catch (CMMException e) { + // see https://github.com/haraldk/TwelveMonkeys/issues/124 or https://issues.apache.org/jira/browse/PDFBOX-3531 + // This is a quick and dirt hack + // Happens for file 216.pdf + log.debug(e.getMessage()); + return color.getComponents()[0] == 0 && color.getComponents()[1] == 0 && color.getComponents()[2] == 0 && color.getComponents()[1] == 1; + } + } + + @Override - public void writeString(String text, List textPositions) throws IOException { + public void writeString(String text, List textPositions, boolean isParagraphStart) throws IOException { int startIndex = 0; RedTextPosition previous = null; - textPositions.sort(Comparator.comparing(TextPosition::getXDirAdj)); - for (int i = 0; i <= textPositions.size() - 1; i++) { if (!textPositionSequences.isEmpty()) { @@ -250,7 +265,7 @@ public class PDFLinesTextStripper extends PDFTextStripper { if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0) .getUnicode() .equals("\t")))) { - textPositionSequences.add(new TextPositionSequence(sublist, pageNumber)); + textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart)); } startIndex = i; } @@ -260,7 +275,7 @@ public class PDFLinesTextStripper extends PDFTextStripper { if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0) .getUnicode() .equals("\t")))) { - textPositionSequences.add(new TextPositionSequence(sublist, pageNumber)); + textPositionSequences.add(new 
TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart)); } startIndex = i; } @@ -276,11 +291,11 @@ public class PDFLinesTextStripper extends PDFTextStripper { // Remove false sequence ends (whitespaces) if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0) .getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) { - for (TextPosition textPosition : sublist) { - textPositionSequences.get(textPositionSequences.size() - 1).add(textPosition); + for (TextPosition t : sublist) { + textPositionSequences.get(textPositionSequences.size() - 1).add(t); } } else { - textPositionSequences.add(new TextPositionSequence(sublist, pageNumber)); + textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart)); } } startIndex = i + 1; @@ -303,7 +318,7 @@ public class PDFLinesTextStripper extends PDFTextStripper { textPositionSequences.get(textPositionSequences.size() - 1).add(t); } } else { - textPositionSequences.add(new TextPositionSequence(sublist, pageNumber)); + textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, isParagraphStart)); } } super.writeString(text); @@ -328,3 +343,4 @@ public class PDFLinesTextStripper extends PDFTextStripper { } } + diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFTextStripper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/parsing/PDFTextStripper.java similarity index 96% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFTextStripper.java rename to 
layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/parsing/PDFTextStripper.java index de0490b..b7fe3e3 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/parsing/PDFTextStripper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/parsing/PDFTextStripper.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package com.knecon.fforesight.service.layoutparser.processor.classification.parsing; +package com.knecon.fforesight.service.layoutparser.processor.parsing; import java.io.BufferedInputStream; import java.io.IOException; @@ -27,6 +27,7 @@ import java.text.Bidi; import java.text.Normalizer; import java.util.ArrayList; import java.util.Collections; +import java.util.Comparator; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedList; @@ -240,10 +241,10 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { document = doc; output = outputStream; if (getAddMoreFormatting()) { - paragraphEnd = lineSeparator; + paragraphEnd = "\n----ParagraphEnd----\n\n"; pageStart = lineSeparator; - articleStart = lineSeparator; - articleEnd = lineSeparator; + articleStart = "\n----ArticelStart----\n\n"; + articleEnd = "\n----ArticelEnd----\n\n"; } startDocument(document); processPages(document.getPages()); @@ -594,9 +595,14 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { // but this caused a lot of regression test failures. 
So, I'm leaving it be for // now if (!overlap(positionY, positionHeight, maxYForLine, maxHeightForLine)) { - writeLine(normalize(line)); - line.clear(); + var normalized = normalize(line); +// normalized.stream().filter(l -> System.out.println(l.getText().contains("Plenarprotokoll 20/24")).findFirst().isPresent() + + lastLineStartPosition = handleLineSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine); + writeLine(normalized, current.isParagraphStart); + line.clear(); + expectedStartOfNextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE; maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE; maxHeightForLine = MAX_HEIGHT_FOR_LINE_RESET_VALUE; @@ -630,7 +636,24 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { if (startOfPage && lastPosition == null) { writeParagraphStart();// not sure this is correct for RTL? } + line.add(new LineItem(position)); + +// Collections.sort(line, new Comparator() { +// +// @Override +// public int compare(LineItem str1, LineItem str2) { +// if(null == str1.getTextPosition()) { +// return 0; +// } +// else if(null == str2.getTextPosition()) { +// return 0; +// } +// return Float.compare(str1.getTextPosition().getX(), str2.getTextPosition().getX()); +// } +// }); + +// line.sort(Comparator.comparing(a -> a.getTextPosition() != null && a.getTextPosition().getX())); } maxHeightForLine = Math.max(maxHeightForLine, positionHeight); minYTopForLine = Math.min(minYTopForLine, positionY - positionHeight); @@ -646,7 +669,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { } // print the final line if (line.size() > 0) { - writeLine(normalize(line)); + writeLine(normalize(line), false); writeParagraphEnd(); } endArticle(); @@ -703,7 +726,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { * @param textPositions The TextPositions belonging to the text. * @throws IOException If there is an error when writing the text. 
*/ - protected void writeString(String text, List textPositions) throws IOException { + protected void writeString(String text, List textPositions, boolean isParagraphEnd) throws IOException { writeString(text); } @@ -998,7 +1021,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { /** - * By default the text stripper will attempt to remove text that overlapps each other. Word paints the same + * By default, the text stripper will attempt to remove text that overlapps each other. Word paints the same * character several times in order to make it look bold. By setting this to false all text will be extracted, which * means that certain sections will be duplicated, but better performance will be noticed. * @@ -1385,6 +1408,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { } else { writeLineSeparator(); writeParagraphSeparator(); + lastLineStartPosition.setEndParagraphWritten(); } } else { writeLineSeparator(); @@ -1428,6 +1452,10 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { float newXVal = multiplyFloat(getIndentThreshold(), position.getTextPosition().getWidthOfSpace()); float positionWidth = multiplyFloat(0.25f, position.getTextPosition().getWidth()); +// if(xGap < 0){ +// result = true; +// } +// else if (yGap > newYVal) { result = true; } else if (xGap > newXVal) { @@ -1636,12 +1664,13 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { * @param line a list with the words of the given line * @throws IOException if something went wrong */ - private void writeLine(List line) throws IOException { + private void writeLine(List line, boolean isParagraphEnd) throws IOException { int numberOfStrings = line.size(); for (int i = 0; i < numberOfStrings; i++) { WordWithTextPositions word = line.get(i); - writeString(word.getText(), word.getTextPositions()); + word.getTextPositions().sort(Comparator.comparing(TextPosition::getX)); + writeString(word.getText(), word.getTextPositions(), isParagraphEnd && i == 
numberOfStrings - 1); if (i < numberOfStrings - 1) { writeWordSeparator(); } @@ -1963,6 +1992,8 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { private boolean isHangingIndent = false; private boolean isArticleStart = false; + private boolean endParagraphWritten = false; + private TextPosition position = null; @@ -2024,6 +2055,16 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { } + public boolean isEndParagraphWritten() { + + return endParagraphWritten; + } + + public void setEndParagraphWritten(){ + endParagraphWritten = true; + } + + /** * Sets the isArticleStart() flag to true. */ @@ -2065,3 +2106,4 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { } } + diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/queue/MessageHandler.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/queue/MessageHandler.java index 856f415..a834a25 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/queue/MessageHandler.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/queue/MessageHandler.java @@ -10,11 +10,10 @@ import org.springframework.amqp.rabbit.annotation.RabbitListener; import org.springframework.amqp.rabbit.core.RabbitTemplate; import org.springframework.stereotype.Service; -import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; -import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingService; +import 
com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline; import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; @@ -25,7 +24,7 @@ import lombok.extern.slf4j.Slf4j; @RequiredArgsConstructor public class MessageHandler { - private final LayoutParsingService layoutParsingService; + private final LayoutParsingPipeline layoutParsingPipeline; private final ObjectMapper objectMapper; private final RabbitTemplate rabbitTemplate; @@ -42,7 +41,7 @@ public class MessageHandler { throw new AmqpRejectAndDontRequeueException(String.format("Error during last layout parsing of request with identifier: %s, do not retry.", layoutParsingRequest.identifier())); } - LayoutParsingFinishedEvent layoutParsingFinishedEvent = layoutParsingService.parseLayoutAndSaveFilesToStorage(layoutParsingRequest); + LayoutParsingFinishedEvent layoutParsingFinishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest); sendLayoutParsingFinishedEvent(layoutParsingFinishedEvent); log.info("Layout parsing finished {} in {} ms", layoutParsingRequest.identifier(), layoutParsingFinishedEvent.duration()); } @@ -50,11 +49,7 @@ public class MessageHandler { public void sendLayoutParsingFinishedEvent(LayoutParsingFinishedEvent layoutParsingFinishedEvent) { - try { - rabbitTemplate.convertAndSend(LAYOUT_PARSING_FINISHED_EVENT_QUEUE, objectMapper.writeValueAsString(layoutParsingFinishedEvent)); - } catch (JsonProcessingException e) { - throw new RuntimeException(e); - } + rabbitTemplate.convertAndSend(LAYOUT_PARSING_FINISHED_EVENT_QUEUE, layoutParsingFinishedEvent); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/BodyTextFrameService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java similarity index 89% rename from 
layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/BodyTextFrameService.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java index ded5d93..bcb7ef4 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/BodyTextFrameService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/BodyTextFrameService.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.service; +package com.knecon.fforesight.service.layoutparser.processor.services; import java.util.List; @@ -6,18 +6,18 @@ import org.springframework.stereotype.Service; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.FloatFrequencyCounter; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.utils.PositionUtils; +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import 
com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; @Service public class BodyTextFrameService { - private static final float APPROXIMATE_HEADER_LINE_COUNT = 2.9f; + private static final float APPROXIMATE_HEADER_LINE_COUNT = 2.0f; /** diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/DividingColumnDetectionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/DividingColumnDetectionService.java new file mode 100644 index 0000000..7c062bc --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/DividingColumnDetectionService.java @@ -0,0 +1,87 @@ +package com.knecon.fforesight.service.layoutparser.processor.services; + +import java.awt.geom.Line2D; +import java.awt.geom.Rectangle2D; +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.List; +import java.util.stream.Stream; + +import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation; +import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; + +import lombok.experimental.UtilityClass; + +@UtilityClass +public class DividingColumnDetectionService { + + private static final int MAX_NUMBER_OF_COLUMNS = 200; + + private static final int LINE_COUNT_THRESHOLD = 5; + + + public List detectColumns(PageContents pageContents) { + + + if 
(pageContents.getSortedTextPositionSequences().size() < 2) { + return List.of(pageContents.getCropBox()); + } + + GapInformation linesWithGapInformation = GapDetectionService.findGapsInLines(pageContents.getSortedTextPositionSequences(), pageContents.getCropBox()); + + return detectColumnsFromLines(linesWithGapInformation.getXGaps(), pageContents.getCropBox()); + } + + + public List detectColumnsFromLines(List> gaps, Rectangle2D mainBodyTextFrame) { + + List> allColumnParts = new ArrayList<>(MAX_NUMBER_OF_COLUMNS); + for (int columnIndex = 1; columnIndex < MAX_NUMBER_OF_COLUMNS; columnIndex++) { + double x = calculateGapLocation(columnIndex, MAX_NUMBER_OF_COLUMNS, mainBodyTextFrame.getWidth()) + mainBodyTextFrame.getMinX(); + double currentMinY = mainBodyTextFrame.getMaxY(); + double currentMaxY = 0; + int currentLineCount = 0; + List columnParts = new LinkedList<>(); + allColumnParts.add(columnParts); + for (int lineNumber = 0; lineNumber < gaps.size(); lineNumber++) { + List textBlocksInLine = gaps.get(lineNumber); + if (anyBlockIntersectX(textBlocksInLine, x)) { + if (lineNumber == gaps.size() - 1) { + currentMaxY = mainBodyTextFrame.getMinY(); + } else { + currentMaxY = gaps.get(lineNumber + 1).get(0).getMinY(); + } + currentLineCount++; + } else { + if (currentLineCount >= LINE_COUNT_THRESHOLD) { + columnParts.add(new Line2D.Double(x, currentMinY, x, currentMaxY)); + } + currentMinY = gaps.get(lineNumber).get(0).getMaxY(); + currentMaxY = currentMinY; + currentLineCount = 0; + } + + } + if (currentLineCount >= LINE_COUNT_THRESHOLD) { + columnParts.add(new Line2D.Double(x, currentMinY, x, currentMaxY)); + } + } + return Stream.concat(Stream.of(mainBodyTextFrame), + allColumnParts.stream() + .flatMap(columnParts -> columnParts.stream().map(line -> new Rectangle2D.Double(line.getX2(), line.getY2(), 1, Math.abs(line.getY2() - line.getY1())))) + .map(r -> (Rectangle2D) r)).toList(); + } + + + private static boolean anyBlockIntersectX(List textBlocksInLine, double 
x) { + + return textBlocksInLine.stream().anyMatch(gap -> gap.getMinX() < x && x < gap.getMaxX()); + } + + + private double calculateGapLocation(int columnIndex, int numberOfColumns, double pageWidth) { + + return (pageWidth / numberOfColumns) * columnIndex; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapDetectionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapDetectionService.java new file mode 100644 index 0000000..ac7db1d --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapDetectionService.java @@ -0,0 +1,163 @@ +package com.knecon.fforesight.service.layoutparser.processor.services; + +import java.awt.geom.Rectangle2D; +import java.util.LinkedList; +import java.util.List; + +import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; + +import lombok.AllArgsConstructor; +import lombok.experimental.UtilityClass; + +@UtilityClass +public class GapDetectionService { + + private static final double X_GAP_FACTOR = 1; // multiplied with average text height, determines the minimum distance of gaps in lines + private static final double Y_GAP_FACTOR = 1; + private static final double NEW_LINE_FACTOR = 0.2; + + + public static GapInformation findGapsInLines(List sortedTextPositionSequences, Rectangle2D mainBodyTextFrame) { + + if (sortedTextPositionSequences.isEmpty()) { + return new GapInformation(); + } + + final double avgTextPositionHeight = getAvgTextPositionHeight(sortedTextPositionSequences); + + XGapsContext xGapContext = XGapsContext.init(mainBodyTextFrame); + 
YGapsContext yGapContext = YGapsContext.init(mainBodyTextFrame); + + var previousTextPosition = sortedTextPositionSequences.get(0); + Rectangle2D rectangle = toRectangle2D(previousTextPosition); + + xGapContext.addGapFromLeftEdgeOfMainBody(rectangle); + + for (TextPositionSequence currentTextPosition : sortedTextPositionSequences.subList(1, sortedTextPositionSequences.size())) { + + double yDifference = Math.abs(currentTextPosition.getMaxYDirAdj() - previousTextPosition.getMaxYDirAdj()); + double xGap = Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getMinXDirAdj()); + Rectangle2D previousTextPositionBBox = toRectangle2D(previousTextPosition); + Rectangle2D currentTextPositionBBox = toRectangle2D(currentTextPosition); + + if (yDifference > avgTextPositionHeight * Y_GAP_FACTOR) { + yGapContext.addGap(mainBodyTextFrame.getMinX(), + previousTextPositionBBox.getMaxY(), + mainBodyTextFrame.getWidth(), + -(previousTextPositionBBox.getMaxY() - currentTextPositionBBox.getMinY())); + } + if (yDifference > avgTextPositionHeight * NEW_LINE_FACTOR) { + + xGapContext.addGapToRightEdgeOfMainBody(previousTextPositionBBox); + xGapContext.gapsInCurrentLine = new LinkedList<>(); + xGapContext.gapsPerLine.add(xGapContext.gapsInCurrentLine); + xGapContext.addGapFromLeftEdgeOfMainBody(currentTextPositionBBox); + + } else if (xGap > avgTextPositionHeight * X_GAP_FACTOR) { + addGapToLine(currentTextPositionBBox, previousTextPositionBBox, xGapContext); + } + previousTextPosition = currentTextPosition; + } + xGapContext.addGapToRightEdgeOfMainBody(toRectangle2D(sortedTextPositionSequences.get(sortedTextPositionSequences.size() - 1))); + xGapContext.gapsPerLine.add(xGapContext.gapsInCurrentLine); + + return new GapInformation(xGapContext.gapsPerLine, yGapContext.gapsPerLine); + } + + + private static Rectangle2D toRectangle2D(TextPositionSequence textPosition) { + + return mirrorY(RectangleTransformations.toRectangle2D(textPosition.getRectangle())); + } + + private 
static Rectangle2D mirrorY(Rectangle2D rectangle2D) { + + return new Rectangle2D.Double(rectangle2D.getX(), Math.min(rectangle2D.getMinY(), rectangle2D.getMaxY()), rectangle2D.getWidth(), Math.abs(rectangle2D.getHeight())); + } + + private static void addGapToLine(Rectangle2D currentTextPosition, Rectangle2D previousTextPosition, XGapsContext context) { + + context.gapsInCurrentLine.add(new Rectangle2D.Double(previousTextPosition.getMaxX(), + previousTextPosition.getMinY(), + currentTextPosition.getMinX() - previousTextPosition.getMaxX(), + (previousTextPosition.getHeight() + currentTextPosition.getHeight()) / 2)); + } + + + private static void assertAllTextPositionsHaveSameDir(List textPositionSequences) { + + assert textPositionSequences.stream().map(TextPositionSequence::getDir).allMatch(a -> a.equals(textPositionSequences.get(0).getDir())); + } + + + private static double getAvgTextPositionHeight(List textPositionSequences) { + + return textPositionSequences.stream().mapToDouble(TextPositionSequence::getHeight).average().orElseThrow(); + } + + + @AllArgsConstructor + private static class YGapsContext { + + List> gapsPerLine; + List gapsInCurrentLine; + Rectangle2D mainBodyTextFrame; + + + public static YGapsContext init(Rectangle2D mainBodyTextFrame) { + + List> initialLinesWithGaps = new LinkedList<>(); + List initialBlocksInLine = new LinkedList<>(); + initialLinesWithGaps.add(initialBlocksInLine); + return new YGapsContext(initialLinesWithGaps, initialBlocksInLine, mainBodyTextFrame); + } + + + public void addGap(double x1, double y1, double w, double h) { + + gapsInCurrentLine.add(new Rectangle2D.Double(x1, y1, w, h)); + } + + } + + @AllArgsConstructor + private static class XGapsContext { + + List> gapsPerLine; + List gapsInCurrentLine; + Rectangle2D mainBodyTextFrame; + + + public static XGapsContext init(Rectangle2D mainBodyTextFrame) { + + List> initialLinesWithGaps = new LinkedList<>(); + List initialBlocksInLine = new LinkedList<>(); + 
initialLinesWithGaps.add(initialBlocksInLine); + return new XGapsContext(initialLinesWithGaps, initialBlocksInLine, mainBodyTextFrame); + } + + + public void addGapToRightEdgeOfMainBody(Rectangle2D textPosition) { + + Rectangle2D leftGap = new Rectangle2D.Double(textPosition.getMaxX(), + textPosition.getMinY(), + mainBodyTextFrame.getMaxX() - textPosition.getMaxX(), + textPosition.getHeight()); + gapsInCurrentLine.add(leftGap); + } + + + public void addGapFromLeftEdgeOfMainBody(Rectangle2D textPosition) { + + Rectangle2D leftGap = new Rectangle2D.Double(mainBodyTextFrame.getMinX(), + textPosition.getMinY(), + textPosition.getMinX() - mainBodyTextFrame.getMinX(), + textPosition.getHeight()); + gapsInCurrentLine.add(leftGap); + } + + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapsAcrossLinesService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapsAcrossLinesService.java new file mode 100644 index 0000000..fd1b7f4 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/GapsAcrossLinesService.java @@ -0,0 +1,201 @@ +package com.knecon.fforesight.service.layoutparser.processor.services; + +import java.awt.geom.Rectangle2D; +import java.awt.geom.RectangularShape; +import java.util.LinkedList; +import java.util.List; +import java.util.Queue; +import java.util.stream.Stream; + +import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation; + +import lombok.AllArgsConstructor; +import lombok.Getter; +import lombok.RequiredArgsConstructor; +import lombok.experimental.UtilityClass; + +@UtilityClass +public class GapsAcrossLinesService { + + private static final double GAP_WIDTH_THRESHOLD_FACTOR = 0.01; // multiplied with avg text height + private static final double 
LINE_COUNT_THRESHOLD_FACTOR = 0.3; // multiplied with average line count per page + private static final double DISTANCE_TO_BORDER_THRESHOLD = 1; + + + public List detectXGapsAcrossLines(GapInformation gapInformation, Rectangle2D mainBodyTextFrame) { + + if (gapInformation.getXGaps().size() < 2) { + return List.of(mainBodyTextFrame); + } + double avgHeight = gapInformation.getXGaps() + .stream() + .filter(gaps -> !gaps.isEmpty()) + .map(gaps -> gaps.get(0)) + .mapToDouble(RectangularShape::getHeight) + .average() + .orElseThrow(); + + ColumnFactory columnFactory = ColumnFactory.init(avgHeight, gapInformation.getXGaps().size()); + gapInformation.getXGaps().get(0).stream().map(GapAcrossLines::new).forEach(columnFactory::addToQueue); + List> xGaps = gapInformation.getXGaps(); + for (var gaps : xGaps.subList(1, xGaps.size())) { + + while (columnFactory.hasGapsToProcess()) { + GapAcrossLines gapAcrossLines = columnFactory.getNext(); + rememberColumnIfValid(columnFactory, gapAcrossLines); + elongateColumnsAndFilterForWidth(gapAcrossLines, gaps, columnFactory).forEach(columnFactory::setToStillInProgress); + } + columnFactory.addStillInProgressToQueue(); + columnFactory.addGapsToQueue(gaps); + } + + return columnFactory.outputGaps.stream() + .filter(gapAcrossLines -> columnFactory.outputGaps.stream().filter(gapAcrossLines::intersectsX).noneMatch(gapAcrossLines1 -> gapAcrossLines1.lineCount > gapAcrossLines.lineCount)) + .filter(gapAcrossLines -> Math.abs(gapAcrossLines.rectangle2D.getMinX() - mainBodyTextFrame.getMinX()) > DISTANCE_TO_BORDER_THRESHOLD) + .filter(gapAcrossLines -> Math.abs(gapAcrossLines.rectangle2D.getMaxX() - mainBodyTextFrame.getMaxX()) > DISTANCE_TO_BORDER_THRESHOLD) + .map(GapAcrossLines::getRectangle2D) + .toList(); + } + + + private static void rememberColumnIfValid(ColumnFactory columnFactory, GapAcrossLines gapAcrossLines) { + + if (gapAcrossLines.lineCount >= LINE_COUNT_THRESHOLD_FACTOR * (double) columnFactory.lineCount) { + 
columnFactory.outputGaps.add(gapAcrossLines); + } + } + + + private static Stream elongateColumnsAndFilterForWidth(GapAcrossLines gapAcrossLines, List gaps, ColumnFactory columnFactory) { + + return gaps.stream()// + .filter(gap -> gapAcrossLines.getIntersectionWidth(gap) > GAP_WIDTH_THRESHOLD_FACTOR * columnFactory.avgHeight)// + .map(gapAcrossLines::addNewLineAndShrink); + + } + + + private static Rectangle2D correctRectangle(Rectangle2D rectangle2D) { + + double minX = Math.min(rectangle2D.getMinX(), rectangle2D.getMaxX()); + double minY = Math.min(rectangle2D.getMinY(), rectangle2D.getMaxY()); + double maxX = Math.max(rectangle2D.getMinX(), rectangle2D.getMaxX()); + double maxY = Math.max(rectangle2D.getMinY(), rectangle2D.getMaxY()); + return new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY); + } + + + @Getter + @AllArgsConstructor + private class GapAcrossLines { + + Rectangle2D rectangle2D; + int lineCount = 1; + + + public GapAcrossLines(Rectangle2D rectangle2D) { + + this.rectangle2D = correctRectangle(rectangle2D); + } + + + public boolean intersectsX(Rectangle2D rectangle2D) { + + return rectangle2D.getMinX() < this.rectangle2D.getMaxX() && this.rectangle2D.getMinX() < rectangle2D.getMaxX(); + } + + + public boolean intersectsX(GapAcrossLines gapAcrossLines) { + + return this.intersectsX(gapAcrossLines.getRectangle2D()); + } + + + public double getIntersectionWidth(Rectangle2D rectangle2D) { + + if (!intersectsX(rectangle2D)) { + return -1; + } + double min_x = Math.max(rectangle2D.getMinX(), this.rectangle2D.getMinX()); + double max_x = Math.min(rectangle2D.getMaxX(), this.rectangle2D.getMaxX()); + return max_x - min_x; + } + + + public GapAcrossLines addNewLineAndShrink(Rectangle2D rectangle2D) { + + var correctedRectangle = correctRectangle(rectangle2D); + double min_x = Math.max(correctedRectangle.getMinX(), this.rectangle2D.getMinX()); + double max_x = Math.min(correctedRectangle.getMaxX(), this.rectangle2D.getMaxX()); + double min_y = 
correctedRectangle.getMinY(); + double max_y = this.rectangle2D.getMaxY(); + double width = max_x - min_x; + double height = max_y - min_y; + return new GapAcrossLines(new Rectangle2D.Double(min_x, min_y, width, height), lineCount + 1); + } + + } + + @RequiredArgsConstructor + private class ColumnFactory { + + final double avgHeight; + final int lineCount; + + List outputGaps = new LinkedList<>(); + Queue gapsQueue = new LinkedList<>(); + List gapsToQueue = new LinkedList<>(); + + + public static ColumnFactory init(double avgHeight, int lineCount) { + + return new ColumnFactory(Math.abs(avgHeight), lineCount); + } + + + public GapAcrossLines getNext() { + + return gapsQueue.remove(); + } + + + public void addToQueue(GapAcrossLines gapAcrossLines) { + + gapsQueue.add(gapAcrossLines); + } + + + public void addToQueue(Rectangle2D gap) { + + gapsQueue.add(new GapAcrossLines(gap)); + } + + + private boolean hasGapsToProcess() { + + return gapsQueue.peek() != null; + } + + + public void setToStillInProgress(GapAcrossLines gapAcrossLines) { + + gapsToQueue.add(gapAcrossLines); + } + + + private void addStillInProgressToQueue() { + + for (int i = gapsToQueue.size() - 1; i >= 0; i--) { + gapsQueue.add(gapsToQueue.remove(i)); + } + } + + + public void addGapsToQueue(List gaps) { + + gaps.forEach(this::addToQueue); + } + + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/InvisibleTableDetectionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/InvisibleTableDetectionService.java new file mode 100644 index 0000000..809fd80 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/InvisibleTableDetectionService.java @@ -0,0 +1,63 @@ +package 
com.knecon.fforesight.service.layoutparser.processor.services; + +import java.awt.geom.Rectangle2D; +import java.awt.geom.RectangularShape; +import java.util.LinkedList; +import java.util.List; + +import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation; +import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; + +import lombok.experimental.UtilityClass; + +@UtilityClass +public class InvisibleTableDetectionService { + + public List> detectTable(List textPositionSequences, Rectangle2D tableBBox) { + + LineInformation lineInformation = LineDetectionService.calculateLineInformation(textPositionSequences); + GapInformation gaps = GapDetectionService.findGapsInLines(textPositionSequences, tableBBox); + List gapsAcrossLines = GapsAcrossLinesService.detectXGapsAcrossLines(gaps, tableBBox); + List columnXCoords = gapsAcrossLines.stream().map(RectangularShape::getCenterX).toList(); + int colCount = gapsAcrossLines.size(); + int rowCount = lineInformation.getLineBBox().size(); + List> cells = new LinkedList<>(); + List cellsInLine = new LinkedList<>(); + cells.add(cellsInLine); + double x1; + double y1; + double x2; + double y2; + for (int col = 0; col < colCount + 1; col++) { + for (int row = 0; row < rowCount + 1; row++) { + if (col == 0) { + x1 = tableBBox.getX(); + } else { + x1 = columnXCoords.get(col - 1); + } + if (row == 0) { + y2 = tableBBox.getMaxY(); + } else { + y2 = lineInformation.getLineBBox().get(row - 1).getY(); + } + if (col == colCount) { + x2 = tableBBox.getMaxX(); + } else { + x2 = columnXCoords.get(col); + } + if (row == rowCount) { + y1 = tableBBox.getY(); + } else { + y1 = lineInformation.getLineBBox().get(row).getY(); + } + cellsInLine.add(new Rectangle2D.Double(x1, y1, x2 - x1, y2 - y1)); + } + cellsInLine = new LinkedList<>(); + cells.add(cellsInLine); + } + + return cells; + } + +} diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/LineDetectionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/LineDetectionService.java new file mode 100644 index 0000000..8b14767 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/LineDetectionService.java @@ -0,0 +1,202 @@ +package com.knecon.fforesight.service.layoutparser.processor.services; + +import java.awt.geom.Rectangle2D; +import java.util.LinkedList; +import java.util.List; + +import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation; +import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; + +import lombok.AllArgsConstructor; +import lombok.Getter; +import lombok.experimental.UtilityClass; + +@UtilityClass +public class LineDetectionService { + + private static final double X_GAP_FACTOR = 1; // multiplied with average text height, determines the minimum distance of gaps in lines + + + public LineInformation calculateLineInformation(List sortedTextPositionSequences) { + + if (sortedTextPositionSequences.isEmpty()) { + return LineFactory.init().build(); + } + + return buildLineInformation(sortedTextPositionSequences); + } + + + public List> findLinesWithGaps(List sortedTextPositionSequences, Rectangle2D mainBodyTextFrame) { + + return calculateLineInformation(sortedTextPositionSequences).getBBoxWithGapsByLines(); + } + + + public List> orderByLines(List sortedTextPositionSequences, Rectangle2D mainBodyTextFrame) { + + return calculateLineInformation(sortedTextPositionSequences).getSequencesByLines(); + } + 
+ + private static LineInformation buildLineInformation(List sortedTextPositionSequences) { + + final double avgTextPositionHeight = getAvgTextPositionHeight(sortedTextPositionSequences); + + LineFactory lineFactory = LineFactory.init(); + + var previousTextPosition = sortedTextPositionSequences.get(0); + lineFactory.addToCurrentLine(previousTextPosition); + for (TextPositionSequence currentTextPosition : sortedTextPositionSequences.subList(1, sortedTextPositionSequences.size())) { + if (isNewLine(currentTextPosition, previousTextPosition, avgTextPositionHeight) || isSplitByOrientation(currentTextPosition, previousTextPosition)) { + lineFactory.startNewLine(); + } else if (isXGap(currentTextPosition, previousTextPosition, avgTextPositionHeight)) { + lineFactory.startNewBlock(); + } + lineFactory.addToCurrentLine(currentTextPosition); + previousTextPosition = currentTextPosition; + } + lineFactory.addFinalLine(); + return lineFactory.build(); + } + + + private static double getAvgTextPositionHeight(List textPositionSequences) { + + return textPositionSequences.stream().mapToDouble(TextPositionSequence::getHeight).average().orElseThrow(); + } + + + private static boolean isXGap(TextPositionSequence currentTextPosition, TextPositionSequence previousTextPosition, double avgTextPositionHeight) { + + return Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getMinXDirAdj()) > (avgTextPositionHeight * X_GAP_FACTOR); + } + + + private static boolean isSplitByOrientation(TextPositionSequence currentTextPosition, TextPositionSequence previousTextPosition) { + + return !previousTextPosition.getDir().equals(currentTextPosition.getDir()); + } + + + private static boolean isNewLine(TextPositionSequence currentTextPosition, TextPositionSequence previousTextPosition, double avgTextPositionHeight) { + + return Math.abs(previousTextPosition.getMinYDirAdj() - currentTextPosition.getMinYDirAdj()) > avgTextPositionHeight; + } + + + @Getter + @AllArgsConstructor + 
private class LineFactory { + + List lineBBox; + + List> bBoxWithGapsByLines; + List bBoxWithGapsInCurrentLine; + + List>> sequencesWithGapsByLines; + List> sequencesWithGapsInCurrentLine; + + List currentSequencesWithoutGaps; + + List> sequencesByLines; + List sequencesInCurrentLine; + + List> xGaps; + List> yGaps; + + + public static LineFactory init() { + + List lineBBox = new LinkedList<>(); + + List> bBoxWithGapsByLines = new LinkedList<>(); + List bBoxWithGapsInCurrentLine = new LinkedList<>(); + bBoxWithGapsByLines.add(bBoxWithGapsInCurrentLine); + + List>> sequencesWithGapsByLines = new LinkedList<>(); + List> sequencesWithGapsInCurrentLine = new LinkedList<>(); + sequencesWithGapsByLines.add(sequencesWithGapsInCurrentLine); + List currentSequencesWithoutGaps = new LinkedList<>(); + sequencesWithGapsInCurrentLine.add(currentSequencesWithoutGaps); + + List> sequencesByLines = new LinkedList<>(); + List sequencesInCurrentLine = new LinkedList<>(); + sequencesByLines.add(sequencesInCurrentLine); + + return new LineFactory(lineBBox, + bBoxWithGapsByLines, + bBoxWithGapsInCurrentLine, + sequencesWithGapsByLines, + sequencesWithGapsInCurrentLine, + currentSequencesWithoutGaps, + sequencesByLines, + sequencesInCurrentLine, + null, + null); + } + + + public void addGaps(GapInformation gapInformation) { + + this.xGaps = gapInformation.getXGaps(); + this.yGaps = gapInformation.getYGaps(); + } + + + public LineInformation build() { + + return new LineInformation(lineBBox, sequencesByLines, bBoxWithGapsByLines, sequencesWithGapsByLines); + } + + + public void startNewBlock() { + + bBoxWithGapsInCurrentLine.add(textPositionBBox(currentSequencesWithoutGaps)); + currentSequencesWithoutGaps = new LinkedList<>(); + sequencesWithGapsInCurrentLine.add(currentSequencesWithoutGaps); + } + + + public void startNewLine() { + + lineBBox.add(textPositionBBox(sequencesInCurrentLine)); + + bBoxWithGapsInCurrentLine.add(textPositionBBox(currentSequencesWithoutGaps)); + 
bBoxWithGapsInCurrentLine = new LinkedList<>(); + bBoxWithGapsByLines.add(bBoxWithGapsInCurrentLine); + + sequencesWithGapsInCurrentLine = new LinkedList<>(); + sequencesWithGapsByLines.add(sequencesWithGapsInCurrentLine); + currentSequencesWithoutGaps = new LinkedList<>(); + sequencesWithGapsInCurrentLine.add(currentSequencesWithoutGaps); + + sequencesInCurrentLine = new LinkedList<>(); + sequencesByLines.add(sequencesInCurrentLine); + } + + + private Rectangle2D textPositionBBox(List textPositionSequences) { + + return RectangleTransformations.rectangleBBox(textPositionSequences.stream().map(TextPositionSequence::getRectangle).toList()); + } + + + public void addToCurrentLine(TextPositionSequence current) { + + sequencesInCurrentLine.add(current); + currentSequencesWithoutGaps.add(current); + } + + + public void addFinalLine() { + + lineBBox.add(textPositionBBox(sequencesInCurrentLine)); + bBoxWithGapsInCurrentLine.add(textPositionBBox(currentSequencesWithoutGaps)); + } + + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/MainBodyTextFrameExtractionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/MainBodyTextFrameExtractionService.java new file mode 100644 index 0000000..0cac3ee --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/MainBodyTextFrameExtractionService.java @@ -0,0 +1,25 @@ +package com.knecon.fforesight.service.layoutparser.processor.services; + +import java.awt.geom.Rectangle2D; + +import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; + +import lombok.experimental.UtilityClass; + +@UtilityClass +public class MainBodyTextFrameExtractionService { + 
+ private static final double TEXT_FRAME_PAD_WIDTH = 0.0; + private static final double TEXT_FRAME_PAD_HEIGHT = 0.02; + + + public Rectangle2D calculateMainBodyTextFrame(LineInformation lineInformation) { + + Rectangle2D mainBodyTextFrame = lineInformation.getLineBBox().stream() + .collect(RectangleTransformations.collectBBox()); + + return RectangleTransformations.pad(mainBodyTextFrame, mainBodyTextFrame.getWidth() * TEXT_FRAME_PAD_WIDTH, mainBodyTextFrame.getHeight() * TEXT_FRAME_PAD_HEIGHT); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageInformationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageInformationService.java new file mode 100644 index 0000000..03353a1 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PageInformationService.java @@ -0,0 +1,24 @@ +package com.knecon.fforesight.service.layoutparser.processor.services; + +import java.awt.geom.Rectangle2D; + +import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation; +import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation; +import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; +import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation; + +import lombok.experimental.UtilityClass; + +@UtilityClass +public class PageInformationService { + + public PageInformation build(PageContents pageContents) { + + LineInformation lineInformation = LineDetectionService.calculateLineInformation(pageContents.getSortedTextPositionSequences()); + Rectangle2D mainBodyTextFrame = MainBodyTextFrameExtractionService.calculateMainBodyTextFrame(lineInformation); + GapInformation gapInformation = 
GapDetectionService.findGapsInLines(pageContents.getSortedTextPositionSequences(), mainBodyTextFrame); + + return new PageInformation(pageContents, lineInformation, mainBodyTextFrame, gapInformation); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/PdfParsingService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PdfParsingService.java similarity index 66% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/PdfParsingService.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PdfParsingService.java index e6c22e2..5b7fb23 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/PdfParsingService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PdfParsingService.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.service; +package com.knecon.fforesight.service.layoutparser.processor.services; import java.util.ArrayList; import java.util.List; @@ -9,16 +9,20 @@ import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.springframework.stereotype.Service; +import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.processor.adapter.ImageServiceResponseAdapter; import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableCells; -import 
com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.CleanRulings; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence; -import com.knecon.fforesight.service.layoutparser.processor.classification.parsing.PDFLinesTextStripper; +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.parsing.PDFLinesTextStripper; +import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService; +import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService; +import com.knecon.fforesight.service.layoutparser.processor.services.blockification.TaasBlockificationService; import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; @@ -31,11 +35,16 @@ public class 
PdfParsingService { private final RulingCleaningService rulingCleaningService; private final TableExtractionService tableExtractionService; - private final BlockificationService blockificationService; private final ImageServiceResponseAdapter imageServiceResponseAdapter; + private final TaasBlockificationService taasBlockificationService; + private final DocuMineBlockificationService docuMineBlockificationService; + private final RedactManagerBlockificationService redactManagerBlockificationService; - public ClassificationDocument parseDocument(PDDocument originDocument, Map> pdfTableCells, Map> pdfImages) { + public ClassificationDocument parseDocument(LayoutParsingType layoutParsingType, + PDDocument originDocument, + Map> pdfTableCells, + Map> pdfImages) { ClassificationDocument document = new ClassificationDocument(); List classificationPages = new ArrayList<>(); @@ -44,7 +53,7 @@ public class PdfParsingService { long pageCount = originDocument.getNumberOfPages(); for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) { - parsePage(pdfImages, originDocument, pdfTableCells, document, classificationPages, pageNumber); + parsePage(layoutParsingType, pdfImages, originDocument, pdfTableCells, document, classificationPages, pageNumber); } document.setPages(classificationPages); @@ -54,7 +63,8 @@ public class PdfParsingService { @SneakyThrows - private void parsePage(Map> pdfImages, + private void parsePage(LayoutParsingType layoutParsingType, + Map> pdfImages, PDDocument pdDocument, Map> pdfTableCells, ClassificationDocument document, @@ -79,7 +89,12 @@ public class PdfParsingService { stripper.getRulings(), stripper.getMinCharWidth(), stripper.getMaxCharHeight()); - ClassificationPage classificationPage = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); + + ClassificationPage classificationPage = switch (layoutParsingType) { + case REDACT_MANAGER -> 
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); + case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); + case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); + }; classificationPage.setRotation(rotation); classificationPage.setLandscape(isLandscape); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RectangleTransformations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RectangleTransformations.java deleted file mode 100644 index 3618dbd..0000000 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RectangleTransformations.java +++ /dev/null @@ -1,95 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.processor.services; - -import static java.lang.String.format; - -import java.awt.geom.Area; -import java.awt.geom.Rectangle2D; -import java.util.Arrays; -import java.util.List; -import java.util.Set; -import java.util.function.BiConsumer; -import java.util.function.BinaryOperator; -import java.util.function.Function; -import java.util.function.Supplier; -import java.util.stream.Collector; - -import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.AtomicTextBlock; - -import lombok.experimental.UtilityClass; - -@UtilityClass -public class RectangleTransformations { - - public static Rectangle2D pad(Rectangle2D rectangle2D, int deltaX, int deltaY) { - - return new Rectangle2D.Double(rectangle2D.getMinX() - deltaX, rectangle2D.getMinY() - deltaY, rectangle2D.getWidth() + 2 * deltaX, rectangle2D.getHeight() + 2 * 
deltaY); - } - - - public static Rectangle2D bBoxUnionAtomicTextBlock(List atomicTextBlocks) { - - return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DUnion()); - } - - - public static Rectangle2D rectangleUnion(List rectangle2DList) { - - return rectangle2DList.stream().collect(new Rectangle2DUnion()); - } - - - public static String toString(Rectangle2D rectangle2D) { - - return format("%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight()); - } - - - public static Rectangle2D parseRectangle2D(String bBox) { - - List floats = Arrays.stream(bBox.split(",")).map(Float::parseFloat).toList(); - return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3)); - } - - - private static class Rectangle2DUnion implements Collector { - - @Override - public Supplier supplier() { - - return Area::new; - } - - - @Override - public BiConsumer accumulator() { - - return (area, rectangle2D) -> area.add(new Area(rectangle2D)); - } - - - @Override - public BinaryOperator combiner() { - - return (area1, area2) -> { - area1.add(area2); - return area1; - }; - } - - - @Override - public Function finisher() { - - return Area::getBounds2D; - } - - - @Override - public Set characteristics() { - - return Set.of(Characteristics.CONCURRENT, Characteristics.UNORDERED); - } - - } - -} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/RulingCleaningService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java similarity index 95% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/RulingCleaningService.java rename to 
layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java index 8e8de6f..bb102c9 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/RulingCleaningService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/RulingCleaningService.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.service; +package com.knecon.fforesight.service.layoutparser.processor.services; import java.awt.geom.Line2D; import java.awt.geom.Point2D; @@ -13,9 +13,9 @@ import java.util.Map; import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableCells; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.CleanRulings; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling; -import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons; +import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionGridCreatorService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionGridCreatorService.java new file mode 100644 index 0000000..5a4a40f --- /dev/null +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionGridCreatorService.java @@ -0,0 +1,146 @@ +package com.knecon.fforesight.service.layoutparser.processor.services; + +import java.awt.geom.Rectangle2D; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.function.BiConsumer; +import java.util.function.BinaryOperator; +import java.util.function.Function; +import java.util.function.Supplier; +import java.util.stream.Collector; +import java.util.stream.Stream; + +import org.springframework.stereotype.Service; + +import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point; +import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.CellRectangle; +import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionGrid; +import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionRectangle; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Table; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.TableCell; + +import lombok.RequiredArgsConstructor; + +@Service +@RequiredArgsConstructor +public class SectionGridCreatorService { + + public SectionGrid createSectionGrid(Document document) { + + Map> sectionBBox = document.streamAllSubNodesOfType(NodeType.SECTION).map(SemanticNode::getBBox).collect(new SectionGridCollector()); + Map> paragraphBBox = 
document.streamAllSubNodesOfType(NodeType.PARAGRAPH).map(SemanticNode::getBBox).collect(new SectionGridCollector()); + Map> headlineBBox = document.streamAllSubNodesOfType(NodeType.HEADLINE).map(SemanticNode::getBBox).collect(new SectionGridCollector()); + Map> tableBBox = document.streamAllSubNodesOfType(NodeType.TABLE).map(node -> (Table) node).collect(new TableGridCollector()); + var sectionGrid = new SectionGrid(); + + sectionGrid.setRectanglesPerPage(mergeMapsByConcatenatingLists(// + mergeMapsByConcatenatingLists(paragraphBBox, headlineBBox), // + mergeMapsByConcatenatingLists(sectionBBox, tableBBox))); + + return sectionGrid; + } + + + private static abstract class GridCollector implements Collector>, Map>> { + + @Override + public Supplier>> supplier() { + + return HashMap::new; + } + + + @Override + public Function>, Map>> finisher() { + + return Function.identity(); + } + + + @Override + public BinaryOperator>> combiner() { + + return SectionGridCreatorService::mergeMapsByConcatenatingLists; + } + + + @Override + public Set characteristics() { + + return Set.of(Characteristics.IDENTITY_FINISH, Characteristics.CONCURRENT, Characteristics.UNORDERED); + } + + } + + private static class TableGridCollector extends GridCollector { + + @Override + public BiConsumer>, Table> accumulator() { + + return (map, table) -> table.getPages() + .forEach(page -> map.merge(page.getNumber(), List.of(toSectionRectangle(table, page, table.getPages().size())), SectionGridCreatorService::concatLists)); + } + + + private static SectionRectangle toSectionRectangle(Table table, Page page, int numberOfParts) { + + Rectangle2D rect = table.getBBox().get(page); + List tableCellRectangles = table.streamTableCells() + .map(TableCell::getBBox) + .map(map -> map.get(page)) + .filter(Objects::nonNull) + .map(rectangle2D -> new CellRectangle(new Point((float) rectangle2D.getX(), (float) rectangle2D.getY()), + (float) rectangle2D.getWidth(), + (float) rectangle2D.getHeight())) + .toList(); + 
return new SectionRectangle(new Point((float) rect.getX(), (float) rect.getY()), + (float) rect.getWidth(), + (float) rect.getHeight(), + 1, + numberOfParts, + tableCellRectangles); + } + + } + + private static class SectionGridCollector extends GridCollector> { + + @Override + public BiConsumer>, Map> accumulator() { + + return (mapToKeep, mapToMerge) -> mapToMerge.forEach((page, rectangle) -> mapToKeep.merge(page.getNumber(), + List.of(toSectionRectangle(rectangle, mapToMerge.values().size())), + SectionGridCreatorService::concatLists)); + + } + + + private static SectionRectangle toSectionRectangle(Rectangle2D rect, int numberOfParts) { + + return new SectionRectangle(new Point((float) rect.getX(), (float) rect.getY()), (float) rect.getWidth(), (float) rect.getHeight(), 1, numberOfParts, null); + } + + } + + + private static Map> mergeMapsByConcatenatingLists(Map> mapToKeep, + Map> mapToMerge) { + + mapToMerge.forEach((page, rectangle) -> mapToKeep.merge(page, rectangle, SectionGridCreatorService::concatLists)); + return mapToKeep; + } + + + private static List concatLists(List l1, List l2) { + + return Stream.concat(l1.stream(), l2.stream()).toList(); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/SectionsBuilderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java similarity index 90% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/SectionsBuilderService.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java index 044e98b..04cc930 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/SectionsBuilderService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.service; +package com.knecon.fforesight.service.layoutparser.processor.services; import java.util.ArrayList; import java.util.Collections; @@ -9,18 +9,18 @@ import java.util.stream.Collectors; import org.springframework.stereotype.Service; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationFooter; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationHeader; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationSection; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.UnclassifiedText; +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import 
com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationSection; +import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; +import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText; import lombok.extern.slf4j.Slf4j; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/TableExtractionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java similarity index 91% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/TableExtractionService.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java index c11cca0..c89db54 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/TableExtractionService.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.service; +package com.knecon.fforesight.service.layoutparser.processor.services; import java.awt.geom.Point2D; import java.util.ArrayList; @@ -12,15 +12,15 @@ import java.util.Set; import org.springframework.stereotype.Service; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.CleanRulings; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Rectangle; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons; +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; +import 
com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons; @Service public class TableExtractionService { @@ -136,6 +136,14 @@ public class TableExtractionService { public List findCells(List horizontalRulingLines, List verticalRulingLines) { + for (Ruling r : horizontalRulingLines) { + if (r.getX2() < r.getX1()) { + double a = r.getX2(); + r.x2 = (float) r.getX1(); + r.x1 = (float) a; + } + } + List cellsFound = new ArrayList<>(); Map intersectionPoints = Ruling.findIntersections(horizontalRulingLines, verticalRulingLines); List intersectionPointsList = new ArrayList<>(intersectionPoints.keySet()); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TextPositionSequenceSorter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TextPositionSequenceSorter.java new file mode 100644 index 0000000..29e2634 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TextPositionSequenceSorter.java @@ -0,0 +1,75 @@ +package com.knecon.fforesight.service.layoutparser.processor.services; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Collection; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.springframework.core.io.ClassPathResource; + +import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import 
com.knecon.fforesight.service.layoutparser.processor.parsing.PDFLinesTextStripper; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; + +import lombok.experimental.UtilityClass; + +@UtilityClass +public class TextPositionSequenceSorter { + + public List getSortedTextPositionsWithPages(String filename) throws IOException { + + List textPositionSequencesPerPage = new LinkedList<>(); + try (InputStream inputStream = new ClassPathResource(filename).getInputStream()) { + + PDDocument pdDocument = Loader.loadPDF(inputStream); + + for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) { + + PDFLinesTextStripper stripper = new PDFLinesTextStripper(); + PDPage pdPage = pdDocument.getPage(pageNumber - 1); + stripper.setPageNumber(pageNumber); + stripper.setSortByPosition(true); + stripper.setStartPage(pageNumber); + stripper.setEndPage(pageNumber); + stripper.setPdpage(pdPage); + stripper.getText(pdDocument); + + Map> sortedTextPositionSequencesPerDir = stripper.getTextPositionSequences() + .stream() + .collect(Collectors.groupingBy(textPositionSequence -> textPositionSequence.getDir().getDegrees())); + + var sortedTextPositionSequences = sortByDirAccordingToPageRotation(sortedTextPositionSequencesPerDir, pdPage.getRotation()); + + textPositionSequencesPerPage.add(new PageContents(sortedTextPositionSequences, + RectangleTransformations.toRectangle2D(pdPage.getCropBox()), + RectangleTransformations.toRectangle2D(pdPage.getMediaBox()))); + } + + pdDocument.close(); + } + + return textPositionSequencesPerPage; + } + + + public List sortByDirAccordingToPageRotation(Map> sortedTextPositionSequencesPerDir, int rotation) { + + LinkedList sortedKeys = new LinkedList<>(sortedTextPositionSequencesPerDir.keySet().stream().sorted().toList()); + + for (int i = 0; i < sortedKeys.size(); i++) { + if (sortedKeys.get(i) < rotation) { + Float keyToSwap = sortedKeys.remove(i); + sortedKeys.addLast(keyToSwap); + } + } + 
return sortedKeys.stream().map(sortedTextPositionSequencesPerDir::get).flatMap(Collection::stream).toList(); + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java new file mode 100644 index 0000000..d5bd90d --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java @@ -0,0 +1,229 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.blockification; + +import static java.util.stream.Collectors.toSet; + +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter; +import com.knecon.fforesight.service.layoutparser.processor.model.Orientation; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil; + +@Service +public class DocuMineBlockificationService { + + static final float THRESHOLD = 1f; + + + /** + * This method is building blocks by expanding the minX/maxX and 
minY/maxY value on each word that is not split by the conditions. + * This method must use text direction adjusted positions (DirAdj). Where {0,0} is on the upper left. Never try to change this! + * Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling. + * + * @param textPositions The words of a page. + * @param horizontalRulingLines Horizontal table lines. + * @param verticalRulingLines Vertical table lines. + * @return Page object that contains the Textblock and text statistics. + */ + public ClassificationPage blockify(List textPositions, List horizontalRulingLines, List verticalRulingLines) { + + List chunkWords = new ArrayList<>(); + List chunkBlockList1 = new ArrayList<>(); + + float minX = 1000, maxX = 0, minY = 1000, maxY = 0; + TextPositionSequence prev = null; + + boolean wasSplitted = false; + Float splitX1 = null; + for (TextPositionSequence word : textPositions) { + + boolean lineSeparation = prev != null && word.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * 1.25; + boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight(); + boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj(); + boolean negativeXGap = prev != null && word.getMinXDirAdj() - minX < -5; + boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj(); + boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines); + boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir()); + boolean splitByOtherFontAndOtherY = prev != null && prev.getMaxYDirAdj() != word.getMaxYDirAdj() && (word.getFontStyle().contains("bold") && !prev.getFontStyle() + .contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold")); + + if
(prev != null && (lineSeparation || startFromTop || splitByDir || isSplitByRuling || splitByOtherFontAndOtherY || negativeXGap)) { + + Orientation prevOrientation = null; + if (!chunkBlockList1.isEmpty()) { + prevOrientation = chunkBlockList1.get(chunkBlockList1.size() - 1).getOrientation(); + } + + TextPageBlock cb1 = buildTextBlock(chunkWords); + chunkBlockList1.add(cb1); + chunkWords = new ArrayList<>(); + + if (splitByX && !isSplitByRuling) { + wasSplitted = true; + cb1.setOrientation(Orientation.LEFT); + splitX1 = word.getMinXDirAdj(); + } else if (newLineAfterSplit && !isSplitByRuling) { + wasSplitted = false; + cb1.setOrientation(Orientation.RIGHT); + splitX1 = null; + } else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) { + cb1.setOrientation(Orientation.LEFT); + } + + minX = 1000; + maxX = 0; + minY = 1000; + maxY = 0; + prev = null; + } + + chunkWords.add(word); + + prev = word; + if (word.getMinXDirAdj() < minX) { + minX = word.getMinXDirAdj(); + } + if (word.getMaxXDirAdj() > maxX) { + maxX = word.getMaxXDirAdj(); + } + if (word.getMinYDirAdj() < minY) { + minY = word.getMinYDirAdj(); + } + if (word.getMaxYDirAdj() > maxY) { + maxY = word.getMaxYDirAdj(); + } + } + + TextPageBlock cb1 = buildTextBlock(chunkWords); + if (cb1 != null) { + chunkBlockList1.add(cb1); + } + + return new ClassificationPage(chunkBlockList1); + } + + + private boolean equalsWithThreshold(float f1, float f2) { + + return Math.abs(f1 - f2) < THRESHOLD; + } + + + private TextPageBlock buildTextBlock(List wordBlockList) { + + TextPageBlock textBlock = null; + + FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter(); + FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter(); + FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter(); + StringFrequencyCounter fontFrequencyCounter = new 
StringFrequencyCounter(); + StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter(); + + for (TextPositionSequence wordBlock : wordBlockList) { + + lineHeightFrequencyCounter.add(wordBlock.getTextHeight()); + fontSizeFrequencyCounter.add(wordBlock.getFontSize()); + spaceFrequencyCounter.add(wordBlock.getSpaceWidth()); + fontFrequencyCounter.add(wordBlock.getFont()); + styleFrequencyCounter.add(wordBlock.getFontStyle()); + + if (textBlock == null) { + textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(), + wordBlock.getMaxXDirAdj(), + wordBlock.getMinYDirAdj(), + wordBlock.getMaxYDirAdj(), + wordBlockList, + wordBlock.getRotation()); + } else { + TextPageBlock spatialEntity = textBlock.union(wordBlock); + textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight()); + } + } + + if (textBlock != null) { + textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular()); + textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular()); + textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest()); + } + + if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) { + textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj)); + } + return textBlock; + } + + + private boolean isSplitByRuling(float minX, + float minY, + float maxX, + float maxY, + TextPositionSequence word, + List horizontalRulingLines, + List verticalRulingLines) { + + return isSplitByRuling(maxX, + minY, + word.getMinXDirAdj(), + word.getMinYDirAdj(), + verticalRulingLines, + word.getDir().getDegrees(), + 
word.getPageWidth(), + word.getPageHeight()) // + || isSplitByRuling(minX, + minY, + word.getMinXDirAdj(), + word.getMaxYDirAdj(), + horizontalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()) // + || isSplitByRuling(maxX, + minY, + word.getMinXDirAdj(), + word.getMinYDirAdj(), + horizontalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()) // + || isSplitByRuling(minX, + minY, + word.getMinXDirAdj(), + word.getMaxYDirAdj(), + verticalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()); // + } + + + private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List rulingLines, float dir, float pageWidth, float pageHeight) { + + for (Ruling ruling : rulingLines) { + var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight); + if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) { + return true; + } + } + return false; + } + + + private double round(float value, int decimalPoints) { + + var d = Math.pow(10, decimalPoints); + return Math.round(value * d) / d; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/BlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java similarity index 91% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/BlockificationService.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java index c657ffc..3062c78 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/BlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/RedactManagerBlockificationService.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.service; +package com.knecon.fforesight.service.layoutparser.processor.services.blockification; import static java.util.stream.Collectors.toSet; @@ -9,19 +9,19 @@ import java.util.List; import org.springframework.stereotype.Service; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.FloatFrequencyCounter; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.Orientation; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.StringFrequencyCounter; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence; -import com.knecon.fforesight.service.layoutparser.processor.classification.utils.RulingTextDirAdjustUtil; +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter; +import com.knecon.fforesight.service.layoutparser.processor.model.Orientation; +import 
com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil; -@Service @SuppressWarnings("all") -public class BlockificationService { +@Service +public class RedactManagerBlockificationService { static final float THRESHOLD = 1f; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/TaasBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/TaasBlockificationService.java new file mode 100644 index 0000000..abcbcac --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/TaasBlockificationService.java @@ -0,0 +1,280 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.blockification; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.springframework.stereotype.Service; + +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.model.Orientation; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import 
com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil; + +@Service +@SuppressWarnings("all") +public class TaasBlockificationService { + + private static final float THRESHOLD = 1f; + private static final float Y_GAP_SPLIT_HEIGHT_MODIFIER = 1.25f; + private static final int X_GAP_SPLIT_CONSTANT = 50; + + + /** + * This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions. + * This method must use text direction adjusted positions (DirAdj), where {0,0} is on the upper left. Never try to change this! + * Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling. + * + * @param textPositions The words of a page. + * @param horizontalRulingLines Horizontal table lines. + * @param verticalRulingLines Vertical table lines. + * @return ClassificationPage object that contains the Textblock and text statistics. 
+ */ + public ClassificationPage blockify(List textPositions, List horizontalRulingLines, List verticalRulingLines) { + + List classificationTextBlocks = constructFineGranularTextPageBlocks(textPositions, horizontalRulingLines, verticalRulingLines); + + classificationTextBlocks = mergeFineGranularTextPageBlocks(classificationTextBlocks); + + return new ClassificationPage(new ArrayList<>(classificationTextBlocks.stream().map(classificationTextBlock -> (AbstractPageBlock) classificationTextBlock).toList())); + } + + + private List mergeFineGranularTextPageBlocks(List classificationTextBlocks) { + + if (classificationTextBlocks.isEmpty()) { + return new ArrayList<>(); + } + List> textBlocksToMerge = new LinkedList<>(); + List currentTextBlocksToMerge = new LinkedList<>(); + textBlocksToMerge.add(currentTextBlocksToMerge); + TextPageBlock previousTextBlock = null; + for (TextPageBlock currentTextBlock : classificationTextBlocks) { + if (previousTextBlock == null) { + currentTextBlocksToMerge.add(currentTextBlock); + previousTextBlock = currentTextBlock; + continue; + } + boolean alignsXRight = Math.abs(currentTextBlock.getPdfMaxX() - previousTextBlock.getPdfMaxX()) < 1; + boolean smallYGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < 5; + if (alignsXRight && smallYGap) { + currentTextBlocksToMerge.add(currentTextBlock); + } else { + currentTextBlocksToMerge = new LinkedList<>(); + currentTextBlocksToMerge.add(currentTextBlock); + textBlocksToMerge.add(currentTextBlocksToMerge); + } + previousTextBlock = currentTextBlock; + } + return textBlocksToMerge.stream().map(TextPageBlock::merge).toList(); + } + + + private void assignOrientations(List classificationTextBlocks) { + + Iterator itty = classificationTextBlocks.iterator(); + + TextPageBlock previousLeft = null; + TextPageBlock previousRight = null; + while (itty.hasNext()) { + TextPageBlock block = (TextPageBlock) itty.next(); + + if (previousLeft != null && 
block.getOrientation().equals(Orientation.LEFT)) { + if (previousLeft.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousLeft.getMinY()) { + previousLeft.add(block); + itty.remove(); + continue; + } + } + + if (previousRight != null && block.getOrientation().equals(Orientation.RIGHT)) { + if (previousRight.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousRight.getMinY()) { + previousRight.add(block); + itty.remove(); + continue; + } + } + + if (block.getOrientation().equals(Orientation.LEFT)) { + previousLeft = block; + } else if (block.getOrientation().equals(Orientation.RIGHT)) { + previousRight = block; + } + } + + itty = classificationTextBlocks.iterator(); + TextPageBlock previous = null; + while (itty.hasNext()) { + TextPageBlock block = (TextPageBlock) itty.next(); + + if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold( + block.getMaxY(), + previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation() + .equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) { + previous.add(block); + itty.remove(); + continue; + } + + previous = block; + } + } + + + private List constructFineGranularTextPageBlocks(List textPositions, + List horizontalRulingLines, + List verticalRulingLines) { + + int indexOnPage = 0; + List wordClusterToCombine = new ArrayList<>(); + List classificationTextBlocks = new ArrayList<>(); + + float minX = 1000, maxX = 0, minY = 1000, maxY = 0; + TextPositionSequence prev = null; + // TODO: make static final constant + var listIdentitifier = Pattern.compile("\\b(?:[1-9]|1\\d|20|[ivxlc]|[a-z])\\s*(?:[.)])", Pattern.CASE_INSENSITIVE); + + boolean wasSplitted = false; + Float splitX1 = null; + for (TextPositionSequence word : textPositions) { + + Matcher listIdentifierPattern = 
listIdentitifier.matcher(word.toString()); + + boolean yGap = prev != null && word.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * Y_GAP_SPLIT_HEIGHT_MODIFIER; + boolean sameLine = prev != null && equalsWithThreshold(prev.getMinYDirAdj(), word.getMinYDirAdj()); + boolean positiveXGapInline = prev != null && maxX + X_GAP_SPLIT_CONSTANT < word.getMinXDirAdj() && sameLine; + boolean negativeXGap = prev != null && word.getMinXDirAdj() - minX < -5; + boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight(); + boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj(); + boolean splitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines); + boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir()); + boolean fontChange = prev != null && (!word.getFont().equals(prev.getFont()) || !word.getFontStyle() + .equals(prev.getFontStyle()) || word.getFontSize() != prev.getFontSize()); + boolean newline = prev != null && Math.abs(word.getMinYDirAdj() - prev.getMinYDirAdj()) > word.getHeight(); + boolean isListIdentifier = listIdentifierPattern.matches(); + + if (prev != null && (prev.isParagraphStart() || negativeXGap || positiveXGapInline || yGap || startFromTop || splitByRuling || (newline && (fontChange || isListIdentifier)))) { +// if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) { + + Orientation prevOrientation = null; + if (!classificationTextBlocks.isEmpty()) { + prevOrientation = classificationTextBlocks.get(classificationTextBlocks.size() - 1).getOrientation(); + } + + TextPageBlock classificationTextBlock = TextPageBlock.fromTextPositionSequences(wordClusterToCombine); + + classificationTextBlocks.add(classificationTextBlock); + wordClusterToCombine = new ArrayList<>(); + + if (positiveXGapInline && 
!splitByRuling) { + wasSplitted = true; + classificationTextBlock.setOrientation(Orientation.LEFT); + splitX1 = word.getMinXDirAdj(); + } else if (newLineAfterSplit && !splitByRuling) { + wasSplitted = false; + classificationTextBlock.setOrientation(Orientation.RIGHT); + splitX1 = null; + } else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (yGap || !startFromTop || !positiveXGapInline || !newLineAfterSplit || !splitByRuling)) { + classificationTextBlock.setOrientation(Orientation.LEFT); + } + + minX = 1000; + maxX = 0; + minY = 1000; + maxY = 0; + prev = null; + } + + wordClusterToCombine.add(word); + + prev = word; + if (word.getMinXDirAdj() < minX) { + minX = word.getMinXDirAdj(); + } + if (word.getMaxXDirAdj() > maxX) { + maxX = word.getMaxXDirAdj(); + } + if (word.getMinYDirAdj() < minY) { + minY = word.getMinYDirAdj(); + } + if (word.getMaxYDirAdj() > maxY) { + maxY = word.getMaxYDirAdj(); + } + } + + TextPageBlock classificationTextBlock = TextPageBlock.fromTextPositionSequences(wordClusterToCombine); + if (classificationTextBlock != null) { + classificationTextBlocks.add(classificationTextBlock); + } + return classificationTextBlocks; + } + + + private boolean equalsWithThreshold(float f1, float f2) { + + return Math.abs(f1 - f2) < THRESHOLD; + } + + + private boolean isSplitByRuling(float minX, + float minY, + float maxX, + float maxY, + TextPositionSequence word, + List horizontalRulingLines, + List verticalRulingLines) { + + return isSplitByRuling(maxX, + minY, + word.getMinXDirAdj(), + word.getMinYDirAdj(), + verticalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()) // + || isSplitByRuling(minX, + minY, + word.getMinXDirAdj(), + word.getMaxYDirAdj(), + horizontalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()) // + || isSplitByRuling(maxX, + minY, + word.getMinXDirAdj(), + word.getMinYDirAdj(), + horizontalRulingLines, + 
word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()) // + || isSplitByRuling(minX, + minY, + word.getMinXDirAdj(), + word.getMaxYDirAdj(), + verticalRulingLines, + word.getDir().getDegrees(), + word.getPageWidth(), + word.getPageHeight()); // + } + + + private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List rulingLines, float dir, float pageWidth, float pageHeight) { + + for (Ruling ruling : rulingLines) { + var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight); + if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) { + return true; + } + } + return false; + } + +} + diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java new file mode 100644 index 0000000..3cedb20 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java @@ -0,0 +1,117 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.classification; + +import java.util.List; +import java.util.Locale; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.springframework.stereotype.Service; + +import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; +import 
com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService; +import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; + +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; + + +@Slf4j +@Service +@RequiredArgsConstructor +public class DocuMineClassificationService { + + private final BodyTextFrameService bodyTextFrameService; + private static final Pattern pattern = Pattern.compile("^(\\d{1,1}\\.?){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z\\[\\]\\-]{2,50}", Pattern.CASE_INSENSITIVE); + private static final Pattern pattern2 = Pattern.compile(".*\\d{4}$", Pattern.CASE_INSENSITIVE); + private static final Pattern pattern3 = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*"); + + + public void classifyDocument(ClassificationDocument document) { + + Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false); + Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true); + List headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular(); + + log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue()); + + for (ClassificationPage page : document.getPages()) { + bodyTextFrameService.setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame); + classifyPage(page, document, headlineFontSizes); + } + } + + + private void classifyPage(ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { + + for (AbstractPageBlock textBlock : page.getTextBlocks()) { + if (textBlock instanceof TextPageBlock) { + classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes); + } + } + } 
+ + + private void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { + + log.debug("headlineFontSizes: {}", headlineFontSizes); + var bodyTextFrame = page.getBodyTextFrame(); + + Matcher matcher = pattern.matcher(textBlock.toString()); + Matcher matcher2 = pattern2.matcher(textBlock.toString()); + Matcher matcher3 = pattern3.matcher(textBlock.toString()); + + if (document.getFontSizeCounter().getMostPopular() == null) { + textBlock.setClassification(PageBlockType.OTHER); + return; + } + if (textBlock.getText().length() > 6 && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter() + .getMostPopular() || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) && PositionUtils.getApproxLineCount(textBlock) < 5.9 + + && (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && !matcher2.matches() && !textBlock.toString() + .contains(":") || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && !matcher2.matches() && !textBlock.toString() + .contains(":") || textBlock.toString().startsWith("APPENDIX") || textBlock.toString().startsWith("FIGURE") || textBlock.toString() + .startsWith("TABLE")) && !textBlock.toString().endsWith(":")) { + textBlock.setClassification(PageBlockType.getHeadlineType(1)); + document.setHeadlines(true); + + } else if (matcher.find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && !matcher3.matches() && !matcher2.matches()) { + textBlock.setClassification(PageBlockType.getHeadlineType(2)); + document.setHeadlines(true); + } else if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() + .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) { + textBlock.setClassification(PageBlockType.HEADER); + + } else if 
(PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() + .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) { + textBlock.setClassification(PageBlockType.FOOTER); + } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, + document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks() + .size() == 1)) { + if (!Pattern.matches("[0-9]+", textBlock.toString())) { + textBlock.setClassification(PageBlockType.TITLE); + } + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter() + .getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) { + textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD); + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont() + .equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle() + .equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) { + textBlock.setClassification(PageBlockType.PARAGRAPH); + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter() + .getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter() + .getMostPopular() + .equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) { + textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC); + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) { + 
textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN); + } else { + textBlock.setClassification(PageBlockType.OTHER); + } + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/ClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java similarity index 85% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/ClassificationService.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java index 263b7eb..6150cba 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/service/ClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/RedactManagerClassificationService.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.service; +package com.knecon.fforesight.service.layoutparser.processor.services.classification; import java.util.List; import java.util.regex.Pattern; @@ -6,12 +6,13 @@ import java.util.regex.Pattern; import org.springframework.stereotype.Service; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument; -import 
com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.utils.PositionUtils; +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService; +import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; @@ -19,7 +20,7 @@ import lombok.extern.slf4j.Slf4j; @Slf4j @Service @RequiredArgsConstructor -public class ClassificationService { +public class RedactManagerClassificationService { private final BodyTextFrameService bodyTextFrameService; @@ -39,7 +40,7 @@ public class ClassificationService { } - public void classifyPage(ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { + private void classifyPage(ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { for (AbstractPageBlock textBlock : page.getTextBlocks()) { if (textBlock instanceof TextPageBlock) { @@ -49,7 +50,7 @@ public class ClassificationService { } - public void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { + private void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, 
List headlineFontSizes) { var bodyTextFrame = page.getBodyTextFrame(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TaasClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TaasClassificationService.java new file mode 100644 index 0000000..76f2e63 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/TaasClassificationService.java @@ -0,0 +1,113 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.classification; + +import java.util.List; +import java.util.regex.Pattern; + +import org.springframework.stereotype.Service; + +import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; +import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; +import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService; +import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; + +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@Service +@RequiredArgsConstructor +public class TaasClassificationService { + + private final BodyTextFrameService bodyTextFrameService; + + + public void classifyDocument(ClassificationDocument document) { + + Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), 
document.getFontSizeCounter(), false); + Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true); + List headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular(); + + log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue()); + + for (ClassificationPage page : document.getPages()) { + bodyTextFrameService.setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame); + classifyPage(page, document, headlineFontSizes); + } + } + + + public void classifyPage(ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { + + for (AbstractPageBlock textBlock : page.getTextBlocks()) { + if (textBlock instanceof TextPageBlock) { + classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes); + } + } + } + + + public void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { + + var bodyTextFrame = page.getBodyTextFrame(); + + if (document.getFontSizeCounter().getMostPopular() == null) { + textBlock.setClassification(PageBlockType.OTHER); + return; + } + if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) { + textBlock.setClassification(PageBlockType.HEADER); + } else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) { + textBlock.setClassification(PageBlockType.FOOTER); + } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, + document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks() + .size() == 1)) { + if (!Pattern.matches("[0-9]+", textBlock.toString())) { + textBlock.setClassification(PageBlockType.TITLE); + } + } else if (textBlock.getMostPopularWordFontSize() > 
document.getFontSizeCounter() + .getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && (textBlock.getMostPopularWordStyle().equals("bold") || !document.getFontStyleCounter() + .getCountPerValue() + .containsKey("bold") && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1) && textBlock.getSequences() + .get(0) + .getTextPositions() + .get(0) + .getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { + + for (int i = 1; i <= headlineFontSizes.size(); i++) { + if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) { + textBlock.setClassification(PageBlockType.getHeadlineType(i)); + document.setHeadlines(true); + } + } + } else if (!textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle() + .equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences() + .get(0) + .getTextPositions() + .get(0) + .getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) { + textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1)); + document.setHeadlines(true); + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter() + .getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) { + textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD); + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont() + .equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle() + .equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) { + 
textBlock.setClassification(PageBlockType.PARAGRAPH); + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter() + .getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter() + .getMostPopular() + .equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) { + textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC); + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) { + textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN); + } else { + textBlock.setClassification(PageBlockType.OTHER); + } + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/CohenSutherlandClipping.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/CohenSutherlandClipping.java similarity index 97% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/CohenSutherlandClipping.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/CohenSutherlandClipping.java index 6c424c9..b1a409a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/CohenSutherlandClipping.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/CohenSutherlandClipping.java @@ -9,7 +9,7 @@ * This program is free software under the LGPL (>=v2.1) * Read the file LICENSE.txt coming with the sources for details. 
*/ -package com.knecon.fforesight.service.layoutparser.processor.classification.utils; +package com.knecon.fforesight.service.layoutparser.processor.utils; import java.awt.geom.Line2D; import java.awt.geom.Rectangle2D; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/DoubleComparisons.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/DoubleComparisons.java similarity index 91% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/DoubleComparisons.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/DoubleComparisons.java index 05fe8ad..c8651bc 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/DoubleComparisons.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/DoubleComparisons.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.utils; +package com.knecon.fforesight.service.layoutparser.processor.utils; import java.math.BigDecimal; import java.util.Comparator; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PdfVisualisationUtility.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PdfVisualisationUtility.java index 2616560..0e82c1d 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PdfVisualisationUtility.java +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PdfVisualisationUtility.java @@ -1,6 +1,7 @@ package com.knecon.fforesight.service.layoutparser.processor.utils; import java.awt.Color; +import java.awt.geom.Line2D; import java.awt.geom.Point2D; import java.awt.geom.Rectangle2D; import java.io.IOException; @@ -13,7 +14,7 @@ import org.apache.pdfbox.pdmodel.PDPageContentStream; import org.apache.pdfbox.pdmodel.font.PDType1Font; import org.apache.pdfbox.pdmodel.font.Standard14Fonts; -import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page; @@ -150,6 +151,32 @@ public class PdfVisualisationUtility { } + @SneakyThrows + public static void drawLine2DList(PDDocument pdDocument, int pageNumber, List line2DS, Options options) { + + var pdPage = pdDocument.getPage(pageNumber - 1); + var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, true); + + contentStream.setStrokingColor(options.getStrokeColor()); + contentStream.setNonStrokingColor(options.getFillColor()); + contentStream.setLineWidth(options.getStrokeWidth()); + + for (var line2D : line2DS) { + contentStream.moveTo((float) line2D.getX1(), (float) line2D.getY1()); + contentStream.lineTo((float) line2D.getX2(), (float) line2D.getY2()); + + if (options.isStroke() && options.isFill()) { + contentStream.fillAndStroke(); + } else if (options.isStroke()) { + contentStream.stroke(); + } else if (options.isFill()) { + contentStream.fill(); + } + } + contentStream.close(); + } + + @Builder @Getter @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) 
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/PositionUtils.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PositionUtils.java similarity index 95% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/PositionUtils.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PositionUtils.java index 8b52b74..3aecb92 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/PositionUtils.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PositionUtils.java @@ -1,7 +1,7 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.utils; +package com.knecon.fforesight.service.layoutparser.processor.utils; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import lombok.experimental.UtilityClass; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/QuickSort.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/QuickSort.java similarity index 96% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/QuickSort.java 
rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/QuickSort.java index 5e65c49..32793b0 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/QuickSort.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/QuickSort.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.utils; +package com.knecon.fforesight.service.layoutparser.processor.utils; import java.util.ArrayDeque; import java.util.Comparator; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java index d5617a3..8dc23ad 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RectangleTransformations.java @@ -1,6 +1,7 @@ package com.knecon.fforesight.service.layoutparser.processor.utils; -import java.awt.geom.Area; +import static java.lang.String.format; + import java.awt.geom.Rectangle2D; import java.awt.geom.RectangularShape; import java.util.Collections; @@ -19,11 +20,35 @@ import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlo import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.AtomicTextBlock; +import lombok.AllArgsConstructor; +import 
lombok.NoArgsConstructor; + public class RectangleTransformations { - public static PDRectangle toPDRectangleUnion(List rectangles) { + public static Rectangle2D pad(Rectangle2D rectangle2D, int deltaX, int deltaY) { - Rectangle2D rectangle2D = RectangleTransformations.bBoxUnionRectangle(rectangles); + return new Rectangle2D.Double(rectangle2D.getMinX() - deltaX, rectangle2D.getMinY() - deltaY, rectangle2D.getWidth() + 2 * deltaX, rectangle2D.getHeight() + 2 * deltaY); + } + + + public static Rectangle2D pad(Rectangle2D rectangle2D, double deltaX, double deltaY) { + + return new Rectangle2D.Double(rectangle2D.getMinX() - deltaX, rectangle2D.getMinY() - deltaY, rectangle2D.getWidth() + 2 * deltaX, rectangle2D.getHeight() + 2 * deltaY); + } + + + public static Rectangle2D bBoxUnionAtomicTextBlock(List atomicTextBlocks) { + + return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DBBoxCollector()); + } + public static Collector collectBBox() { + + return new Rectangle2DBBoxCollector(); + } + + public static PDRectangle toPDRectangleBBox(List rectangles) { + + Rectangle2D rectangle2D = RectangleTransformations.rectangleBBox(rectangles); PDRectangle annotationPosition = new PDRectangle(); annotationPosition.setLowerLeftX((float) rectangle2D.getMinX()); @@ -34,15 +59,20 @@ public class RectangleTransformations { } - public static Rectangle2D bBoxUnionAtomicTextBlock(List atomicTextBlocks) { + public static Rectangle2D atomicTextBlockBBox(List atomicTextBlocks) { - return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DUnion()); + return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DBBoxCollector()); } - public static Rectangle2D bBoxUnionRectangle(List rectangles) { + public static String toString(Rectangle2D rectangle2D) { - return 
rectangles.stream().map(RectangleTransformations::toRectangle2D).collect(new Rectangle2DUnion()); + return format("%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight()); + } + + public static Rectangle2D rectangleBBox(List rectangles) { + + return rectangles.stream().map(RectangleTransformations::toRectangle2D).collect(new Rectangle2DBBoxCollector()); } @@ -54,6 +84,11 @@ public class RectangleTransformations { -redactionLogRectangle.getHeight()); } + public static Rectangle2D toRectangle2D(PDRectangle rectangle) { + + return new Rectangle2D.Double(rectangle.getLowerLeftX(), rectangle.getLowerLeftY(), rectangle.getWidth(), rectangle.getHeight()); + } + public static Rectangle toRedactionLogRectangle(Rectangle2D rectangle2D, int pageNumber) { @@ -64,9 +99,9 @@ public class RectangleTransformations { } - public static Rectangle2D rectangleUnion(List rectangle2DList) { + public static Rectangle2D rectangle2DBBox(List rectangle2DList) { - return rectangle2DList.stream().collect(new Rectangle2DUnion()); + return rectangle2DList.stream().collect(new Rectangle2DBBoxCollector()); } @@ -76,7 +111,7 @@ public class RectangleTransformations { * @param rectangle2DList A list of rectangles to combine * @return A list of rectangles which are combined if they are closer than the split threshold */ - public static List rectangleUnionWithGaps(List rectangle2DList) { + public static List rectangleBBoxWithGaps(List rectangle2DList) { if (rectangle2DList.isEmpty()) { return Collections.emptyList(); @@ -98,49 +133,87 @@ public class RectangleTransformations { previousRectangle = currentRectangle; } } - return rectangleListsWithGaps.stream().map(RectangleTransformations::rectangleUnion).toList(); + return rectangleListsWithGaps.stream().map(RectangleTransformations::rectangle2DBBox).toList(); } - private static class Rectangle2DUnion implements Collector { + private static class Rectangle2DBBoxCollector implements Collector { @Override - 
public Supplier supplier() { + public Supplier supplier() { - return Area::new; + return BBox::new; } @Override - public BiConsumer accumulator() { + public BiConsumer accumulator() { - return (area, rectangle2D) -> area.add(new Area(rectangle2D)); + return (bb, rect) -> bb.addRectangle(rect.getMinX(), rect.getMinY(), rect.getMaxX(), rect.getMaxY()); } @Override - public BinaryOperator combiner() { + public BinaryOperator combiner() { - return (area1, area2) -> { - area1.add(area2); - return area1; - }; + return (b1, b2) -> new BBox(Math.min(b1.lowerLeftX, b2.lowerLeftX), + Math.min(b1.lowerLeftY, b2.lowerLeftY), + Math.max(b1.upperRightX, b2.upperRightX), + Math.max(b1.upperRightY, b2.upperRightY)); } @Override - public Function finisher() { + public Function finisher() { - return Area::getBounds2D; + return bb -> new Rectangle2D.Double(bb.lowerLeftX, bb.lowerLeftY, bb.upperRightX - bb.lowerLeftX, bb.upperRightY - bb.lowerLeftY); } @Override public Set characteristics() { - return Set.of(Characteristics.CONCURRENT, Characteristics.UNORDERED); + return Set.of(Characteristics.UNORDERED); + } + + + @AllArgsConstructor + @NoArgsConstructor + private static class BBox { + + Double lowerLeftX; + Double lowerLeftY; + Double upperRightX; + Double upperRightY; + + + public void addRectangle(double lowerLeftX, double lowerLeftY, double upperRightX, double upperRightY) { + + if (this.lowerLeftX == null) { + this.lowerLeftX = lowerLeftX; + } else if (this.lowerLeftX > lowerLeftX) { + this.lowerLeftX = lowerLeftX; + } + if (this.lowerLeftY == null) { + this.lowerLeftY = lowerLeftY; + } else if (this.lowerLeftY > lowerLeftY) { + this.lowerLeftY = lowerLeftY; + } + if (this.upperRightX == null) { + this.upperRightX = upperRightX; + } else if (this.upperRightX < upperRightX) { + this.upperRightX = upperRightX; + } + if (this.upperRightY == null) { + this.upperRightY = upperRightY; + } else if (this.upperRightY < upperRightY) { + this.upperRightY = upperRightY; + } + + } + } } -} 
+} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/RulingTextDirAdjustUtil.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RulingTextDirAdjustUtil.java similarity index 91% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/RulingTextDirAdjustUtil.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RulingTextDirAdjustUtil.java index 7931d65..04ff106 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/RulingTextDirAdjustUtil.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/RulingTextDirAdjustUtil.java @@ -1,9 +1,9 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.utils; +package com.knecon.fforesight.service.layoutparser.processor.utils; import java.awt.geom.Line2D; import java.awt.geom.Point2D; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; import lombok.experimental.UtilityClass; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TableMergingUtility.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TableMergingUtility.java index fd59588..e6a7332 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TableMergingUtility.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TableMergingUtility.java @@ -5,15 +5,18 @@ import java.util.LinkedList; import java.util.List; import java.util.stream.Stream; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; +import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; import lombok.experimental.UtilityClass; @UtilityClass public class TableMergingUtility { + private static final double TABLE_ALIGNMENT_THRESHOLD = 2d; + + public List findConsecutiveTablesWithSameColCountAndSameHeaders(TablePageBlock originalTablePageBlock, List pageBlocks) { List consecutiveTables = pageBlocks.stream() @@ -24,7 +27,8 @@ public class TableMergingUtility { List consecutiveTablesWithSameColCountAndHeaders = new LinkedList<>(); for (TablePageBlock consecutiveTable : consecutiveTables) { - if (consecutiveTable.getColCount() == originalTablePageBlock.getColCount() && !hasTableHeader(consecutiveTable)) { + if (consecutiveTable.getColCount() == originalTablePageBlock.getColCount() && !hasTableHeader(consecutiveTable) && outerBoundaryAlignsX(originalTablePageBlock, + consecutiveTable)) { consecutiveTablesWithSameColCountAndHeaders.add(consecutiveTable); } else { break; @@ -34,6 +38,12 @@ public class TableMergingUtility { } + private static boolean outerBoundaryAlignsX(TablePageBlock originalTablePageBlock, 
TablePageBlock consecutiveTable) { + + return Math.abs(consecutiveTable.getMinX() - originalTablePageBlock.getMinX()) < TABLE_ALIGNMENT_THRESHOLD && Math.abs(consecutiveTable.getMaxX() - originalTablePageBlock.getMaxX()) < TABLE_ALIGNMENT_THRESHOLD; + } + + private boolean hasTableHeader(TablePageBlock table) { return table.getRows().stream().flatMap(Collection::stream).anyMatch(Cell::isHeaderCell); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/TextNormalizationUtilities.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextNormalizationUtilities.java similarity index 88% rename from layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/TextNormalizationUtilities.java rename to layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextNormalizationUtilities.java index 9cec075..9f90bee 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/classification/utils/TextNormalizationUtilities.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextNormalizationUtilities.java @@ -1,4 +1,4 @@ -package com.knecon.fforesight.service.layoutparser.processor.classification.utils; +package com.knecon.fforesight.service.layoutparser.processor.utils; import lombok.experimental.UtilityClass; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java index c4c0eba..fbd57c4 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java @@ -3,8 +3,8 @@ package com.knecon.fforesight.service.layoutparser.processor.utils; import java.util.Comparator; import java.util.List; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock; -import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; public class TextPositionOperations { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionSequenceComparator.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionSequenceComparator.java new file mode 100644 index 0000000..40dce07 --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionSequenceComparator.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.knecon.fforesight.service.layoutparser.processor.utils; + +import java.util.Comparator; + +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; + +/** + * This class is a comparator for TextPosition operators. It handles + * pages with text in different directions by grouping the text based + * on direction and sorting in that direction. This allows continuous text + * in a given direction to be more easily grouped together. 
+ * + * @author Ben Litchfield + */ +public class TextPositionSequenceComparator implements Comparator +{ + @Override + public int compare(TextPositionSequence pos1, TextPositionSequence pos2) + { + // only compare text that is in the same direction + int cmp1 = Float.compare(pos1.getDir().getDegrees(), pos2.getDir().getDegrees()); + if (cmp1 != 0) + { + return cmp1; + } + + // get the text direction adjusted coordinates + float x1 = pos1.getMinXDirAdj(); + float x2 = pos2.getMinXDirAdj(); + + float pos1YBottom = pos1.getMaxYDirAdj(); + float pos2YBottom = pos2.getMaxYDirAdj(); + + // note that the coordinates have been adjusted so 0,0 is in upper left + float pos1YTop = pos1YBottom - pos1.getTextHeight(); + float pos2YTop = pos2YBottom - pos2.getTextHeight(); + + float yDifference = Math.abs(pos1YBottom - pos2YBottom); + + // we will do a simple tolerance comparison + if (yDifference < .1 || + pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom || + pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom) + { + return Float.compare(x1, x2); + } + else if (pos1YBottom < pos2YBottom) + { + return -1; + } + else + { + return 1; + } + } +} diff --git a/layoutparser-service/layoutparser-service-processor/src/test/resources/log4j2-test.xml b/layoutparser-service/layoutparser-service-processor/src/test/resources/log4j2-test.xml new file mode 100644 index 0000000..b4895cf --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/test/resources/log4j2-test.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java new file mode 100644 index 0000000..86b2a66 --- /dev/null +++ 
b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java @@ -0,0 +1,106 @@ +package com.knecon.fforesight.service.layoutparser.server; + +import java.awt.Color; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.time.Duration; +import java.util.List; + +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; +import org.springframework.beans.factory.annotation.Autowired; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData; +import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; +import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline; +import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse; +import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse; +import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; +import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; +import com.knecon.fforesight.service.layoutparser.processor.mapper.taas.TaasDocumentDataMapper; +import com.knecon.fforesight.service.layoutparser.server.utils.BaseTest; +import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; + +import lombok.SneakyThrows; + +public class BdrJsonBuildTest extends BaseTest { + + @Autowired + private ObjectMapper objectMapper; + + @Autowired + private LayoutParsingPipeline layoutParsingPipeline; + + + @SneakyThrows + protected Document buildGraph(File filename) { + + try 
(InputStream inputStream = new FileInputStream(filename)) { + PDDocument pdDocument = Loader.loadPDF(inputStream); + return layoutParsingPipeline.parseLayoutWithTimer(LayoutParsingType.REDACT_MANAGER, pdDocument, new ImageServiceResponse(), new TableServiceResponse()); + } + } + + + @Test + @Disabled + public void writeBDRDocumentData() throws IOException { + + String sourcePath = "/tmp/bdr_files"; + String targetPath = "/tmp/result"; + Paths.get(targetPath).toFile().mkdirs(); + + List files = Files.walk(Paths.get(sourcePath)).filter(currentPath -> currentPath.toString().endsWith(".pdf")).map(Path::toFile).toList(); + + System.out.printf("Found %d files \n", files.size()); + for (int i = 0; i < files.size(); i++) { + System.out.println(i + ": " + files.get(i)); + } + + System.out.println(); + + for (var file : files) { + long start = System.currentTimeMillis(); + System.out.println("Starting Structure Analysis for: " + file); + Document document = buildGraph(file); + + long start2 = System.currentTimeMillis(); + ResearchDocumentData researchDocumentData = TaasDocumentDataMapper.fromDocument(document); + researchDocumentData.setOriginalFile(file.toString()); + System.out.printf(", mapped to research data %d ms \n", System.currentTimeMillis() - start2); + + File jsonFile = Paths.get(targetPath, file.getName().replace(".pdf", ".json")).toFile(); + try (FileOutputStream fileOutputStream = new FileOutputStream(jsonFile)) { + System.out.println("json written to: " + jsonFile); + fileOutputStream.write(objectMapper.writeValueAsBytes(researchDocumentData)); + } + File visualizationFile = Paths.get(targetPath, file.getName().replace(".pdf", "_BBOX.pdf")).toFile(); + visualizeSemanticNodes(file, visualizationFile, document, document.getTextBlock()); + System.out.println("visualization pdf written to: " + visualizationFile); + System.out.printf("Full analysis and file creation took %s\n\n", Duration.ofMillis(System.currentTimeMillis() - start)); + } + + } + + + private static 
void visualizeSemanticNodes(File file, File resultingFileName, Document document, TextBlock textBlock) throws IOException { + + try (var fileStream = new FileInputStream(file); var outputStream = new FileOutputStream(resultingFileName)) { + PDDocument pdDocument = Loader.loadPDF(fileStream); + PdfDraw.drawDocumentGraph(pdDocument, document); + PdfDraw.drawTextBlock(pdDocument, textBlock, PdfDraw.Options.builder().stroke(true).strokeWidth(0.1f).strokeColor(Color.YELLOW).build()); + pdDocument.save(outputStream); + pdDocument.close(); + } + } + +} diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BuildDocumentGraphTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BuildDocumentGraphTest.java index d74d2e0..094cb5c 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BuildDocumentGraphTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/BuildDocumentGraphTest.java @@ -6,11 +6,13 @@ import java.io.InputStream; import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.core.io.ClassPathResource; -import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingService; +import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; +import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline; import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; import 
com.knecon.fforesight.service.layoutparser.server.utils.BaseTest; @@ -20,9 +22,10 @@ import lombok.SneakyThrows; public class BuildDocumentGraphTest extends BaseTest { @Autowired - private LayoutParsingService layoutParsingService; + private LayoutParsingPipeline layoutParsingPipeline; @Test + @Disabled public void buildMetolachlor() { Document documentGraph = buildGraph("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06"); @@ -35,16 +38,20 @@ public class BuildDocumentGraphTest extends BaseTest { @SneakyThrows protected Document buildGraph(String filename) { - if (filename.equals("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06")) { - prepareStorage(filename + ".pdf", "cv_table_parsing_response/empty.json", "image_service_response/S-Metolachlor_RAR_01_Volume_1_2018-09-06.IMAGE_INFO.json"); - } else { - prepareStorage(filename + ".pdf"); + if (!filename.endsWith(".pdf")) { + filename = filename + ".pdf"; } - ClassPathResource fileResource = new ClassPathResource(filename + ".pdf"); + + if (filename.equals("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06")) { + prepareStorage(filename, "cv_table_parsing_response/empty.json", "image_service_response/S-Metolachlor_RAR_01_Volume_1_2018-09-06.IMAGE_INFO.json"); + } else { + prepareStorage(filename); + } + ClassPathResource fileResource = new ClassPathResource(filename); try (InputStream inputStream = fileResource.getInputStream()) { PDDocument pdDocument = Loader.loadPDF(inputStream); - return layoutParsingService.parseLayout(pdDocument, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse()); + return layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, pdDocument, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse()); } } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentDataTests.java 
b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentDataTests.java new file mode 100644 index 0000000..179dcfc --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentDataTests.java @@ -0,0 +1,45 @@ +package com.knecon.fforesight.service.layoutparser.server.graph; + +import java.io.File; +import java.io.FileOutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; + +import org.junit.jupiter.api.Test; +import org.springframework.core.io.ClassPathResource; + +import com.iqser.red.commons.jackson.ObjectMapperFactory; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; +import com.knecon.fforesight.service.layoutparser.processor.mapper.redaction.DocumentDataMapper; + +import lombok.SneakyThrows; + +public class DocumentDataTests extends BuildDocumentGraphTest{ + @Test + @SneakyThrows + public void createDocumentDataForAllFiles() { + + String outPath = "/tmp/document_data_output_layoutparser"; + + ClassPathResource resource = new ClassPathResource("files"); + List pdfFileNames = Files.walk(resource.getFile().toPath()) + .filter(path -> path.getFileName().toString().endsWith(".pdf")) + .map(Path::toAbsolutePath) + .map(Path::toString) + .toList(); + System.out.printf("%d Files found%n", pdfFileNames.size()); + for (int i = 0; i < pdfFileNames.size(); i++) { + System.out.printf("%d/%d: %s%n", i, pdfFileNames.size(), pdfFileNames.get(i)); + } + for (String pdfFileName : pdfFileNames) { + System.out.println(pdfFileName); + DocumentData documentData = DocumentDataMapper.toDocumentData(buildGraph(resource.getFile().toPath().getParent().relativize(Path.of(pdfFileName)).toString())); + File outputFile = Path.of(outPath).resolve(resource.getFile().toPath().relativize(Path.of(pdfFileName))).toFile(); + 
outputFile.toPath().getParent().toFile().mkdirs(); + try (var out = new FileOutputStream(outputFile.toString().replace(".pdf", ".json"))) { + ObjectMapperFactory.create().writeValue(out, documentData); + } + } + } +} diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java index 0874336..85ef429 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java @@ -8,9 +8,9 @@ import org.junit.jupiter.api.Test; import com.fasterxml.jackson.databind.ObjectMapper; import com.iqser.red.commons.jackson.ObjectMapperFactory; -import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; -import com.knecon.fforesight.service.layoutparser.processor.mapper.DocumentDataMapper; +import com.knecon.fforesight.service.layoutparser.processor.mapper.redaction.DocumentDataMapper; import lombok.SneakyThrows; @@ -20,7 +20,7 @@ public class DocumentGraphJsonWritingTest extends BuildDocumentGraphTest { @Disabled public void writeJsonForFileTest() { - writeJsons("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06"); + writeJsons("files/216"); } @SneakyThrows diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphMappingTest.java 
b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphMappingTest.java index ccf6f82..80e9ecb 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphMappingTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphMappingTest.java @@ -1,12 +1,14 @@ package com.knecon.fforesight.service.layoutparser.server.graph; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; -import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; -import com.knecon.fforesight.service.layoutparser.processor.mapper.DocumentDataMapper; -import com.knecon.fforesight.service.layoutparser.processor.mapper.DocumentGraphMapper; +import com.knecon.fforesight.service.layoutparser.processor.mapper.redaction.DocumentDataMapper; +import com.knecon.fforesight.service.layoutparser.processor.mapper.redaction.DocumentGraphMapper; +import com.knecon.fforesight.service.layoutparser.processor.mapper.taas.TaasDocumentDataMapper; import lombok.SneakyThrows; @@ -14,13 +16,17 @@ public class DocumentGraphMappingTest extends BuildDocumentGraphTest { @Test @SneakyThrows + @Disabled public void testGraphMapping() { Document document = buildGraph("files/crafted document"); LayoutParsingRequest layoutParsingRequest = buildStandardLayoutParsingRequest(); DocumentData documentData = DocumentDataMapper.toDocumentData(document); + var researchDocumentData = TaasDocumentDataMapper.fromDocument(document); + 
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, documentData); + DocumentData documentData2 = layoutParsingStorageService.readDocumentData(layoutParsingRequest); Document newDocumentGraph = DocumentGraphMapper.toDocumentGraph(documentData2); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/model/SectionIdentifierTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/model/SectionIdentifierTest.java new file mode 100644 index 0000000..5ff1fe1 --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/model/SectionIdentifierTest.java @@ -0,0 +1,58 @@ +package com.knecon.fforesight.service.layoutparser.server.model; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import org.junit.jupiter.api.Test; + +import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentifier; + +class SectionIdentifierTest { + + @Test + public void testParentOf() { + + var headline = SectionIdentifier.fromSearchText("1 Did you ever hear the tragedy of Darth Plagueis The Wise?"); + var headline1 = SectionIdentifier.fromSearchText("1.0 I thought not. It’s not a story the Jedi would tell you."); + var headline2 = SectionIdentifier.fromSearchText("1.1 It’s a Sith legend. 
Darth Plagueis was a Dark Lord of the Sith, "); + var headline3 = SectionIdentifier.fromSearchText("1.2.3 so powerful and so wise he could use the Force to influence the midichlorians to create life…"); + var headline4 = SectionIdentifier.fromSearchText("1.2.3.4 He had such a knowledge of the dark side that he could even keep the ones he cared about from dying."); + var headline5 = SectionIdentifier.fromSearchText("1.2.3.4.5 The dark side of the Force is a pathway to many abilities some consider to be unnatural."); + var headline6 = SectionIdentifier.fromSearchText("2.0 He became so powerful…"); + var headline7 = SectionIdentifier.fromSearchText("10000.0 the only thing he was afraid of was losing his power,"); + var headline8 = SectionIdentifier.fromSearchText("A.0 which eventually, of course, he did."); + var headline9 = SectionIdentifier.fromSearchText("Unfortunately, he taught his apprentice everything he knew, then his apprentice killed him in his sleep."); + var headline10 = SectionIdentifier.fromSearchText("2.1.2 Ironic."); + var headline11 = SectionIdentifier.fromSearchText("2.He could save others from death,"); + var headline12 = SectionIdentifier.fromSearchText(" 2. 
but not himself."); + + var paragraph1 = SectionIdentifier.asChildOf(headline); + assertTrue(paragraph1.isChildOf(headline)); + assertTrue(headline.isParentOf(paragraph1)); + assertFalse(paragraph1.isParentOf(headline)); + + assertFalse(headline.isParentOf(headline1)); + assertTrue(headline.isParentOf(headline2)); + assertTrue(headline.isParentOf(headline3)); + assertTrue(headline.isParentOf(headline4)); + assertTrue(headline.isParentOf(headline5)); + assertTrue(headline1.isParentOf(headline2)); + assertFalse(headline1.isParentOf(headline1)); + assertTrue(headline3.isParentOf(headline4)); + assertFalse(headline4.isParentOf(headline5)); + assertFalse(headline2.isParentOf(headline3)); + assertFalse(headline2.isParentOf(headline4)); + assertTrue(headline1.isParentOf(headline3)); + assertTrue(headline1.isParentOf(headline4)); + assertFalse(headline1.isParentOf(headline6)); + assertFalse(headline1.isParentOf(headline7)); + assertFalse(headline8.isParentOf(headline1)); + assertFalse(headline8.isParentOf(headline2)); + assertFalse(headline8.isParentOf(headline3)); + assertFalse(headline8.isParentOf(headline4)); + assertFalse(headline9.isParentOf(headline9)); + assertTrue(headline10.isChildOf(headline11)); + assertTrue(headline10.isChildOf(headline12)); + } + +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/GapAcrossLinesDetectionServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/GapAcrossLinesDetectionServiceTest.java new file mode 100644 index 0000000..56d0126 --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/GapAcrossLinesDetectionServiceTest.java @@ -0,0 +1,74 @@ +package com.knecon.fforesight.service.layoutparser.server.services; + +import java.awt.geom.Rectangle2D; +import 
java.util.LinkedList; +import java.util.List; + +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; + +import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation; +import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; +import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation; +import com.knecon.fforesight.service.layoutparser.processor.services.DividingColumnDetectionService; +import com.knecon.fforesight.service.layoutparser.processor.services.GapDetectionService; +import com.knecon.fforesight.service.layoutparser.processor.services.GapsAcrossLinesService; +import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService; +import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter; +import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; + +import lombok.SneakyThrows; + +class GapAcrossLinesDetectionServiceTest { + + @Test + @Disabled + @SneakyThrows + public void testGapBasedColumnDetection() { + + String filename = "files/211.pdf"; + var tmpFileName = "/tmp/" + filename.split("/")[2] + "_COLUMNS.pdf"; + System.out.println("start TextPosition extraction"); + long start = System.currentTimeMillis(); + List pageInformations = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename).stream().map(PageInformationService::build).toList(); + List> columnsPerPage = new LinkedList<>(); + System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start); + System.out.println("start column detection"); + start = System.currentTimeMillis(); + for (PageInformation pageInformation : pageInformations) { + GapInformation gapInformation = GapDetectionService.findGapsInLines(pageInformation.getPageContents().getSortedTextPositionSequences(), pageInformation.getMainBodyTextFrame()); + 
columnsPerPage.add(GapsAcrossLinesService.detectXGapsAcrossLines(gapInformation, pageInformation.getMainBodyTextFrame())); + } + System.out.printf("Finished column detection in %d ms%n", System.currentTimeMillis() - start); + System.out.println("start draw rectangles"); + start = System.currentTimeMillis(); + PdfDraw.drawRectanglesPerPage(filename, columnsPerPage, tmpFileName); + System.out.printf("Finished drawing rectangles in %d ms%n", System.currentTimeMillis() - start); + } + + + @Test + @Disabled + @SneakyThrows + public void testColumnDetection() { + + String filename = "files/211.pdf"; + var tmpFileName = "/tmp/" + filename.split("/")[2] + "_COLUMNS.pdf"; + System.out.println("start TextPosition extraction"); + long start = System.currentTimeMillis(); + List sortedTextPositionSequencesPerPage = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename); + List> columnsPerPage = new LinkedList<>(); + System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start); + System.out.println("start column detection"); + start = System.currentTimeMillis(); + for (PageContents pageContents : sortedTextPositionSequencesPerPage) { + columnsPerPage.add(DividingColumnDetectionService.detectColumns(pageContents)); + } + System.out.printf("Finished column detection in %d ms%n", System.currentTimeMillis() - start); + System.out.println("start draw rectangles"); + start = System.currentTimeMillis(); + PdfDraw.drawRectanglesPerPage(filename, columnsPerPage, tmpFileName); + System.out.printf("Finished drawing rectangles in %d ms%n", System.currentTimeMillis() - start); + } + +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/InvisibleTableDetectionServiceTest.java 
b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/InvisibleTableDetectionServiceTest.java new file mode 100644 index 0000000..628a66f --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/InvisibleTableDetectionServiceTest.java @@ -0,0 +1,66 @@ +package com.knecon.fforesight.service.layoutparser.server.services; + +import java.awt.geom.Rectangle2D; +import java.nio.file.Path; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; + +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; + +import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.services.InvisibleTableDetectionService; +import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; +import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter; +import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; + +import lombok.SneakyThrows; + +class InvisibleTableDetectionServiceTest { + + @Test + @Disabled + @SneakyThrows + public void detectInvisibleTableTest() { + + String fileName = "files/211.pdf"; + var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TABLE.pdf").toString(); + List pageContents = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName).stream().map(PageInformationService::build).collect(Collectors.toList()); + + int pageNumber = 1; + Rectangle2D tableBBox = pageContents.get(0) + .getPageContents() + .getSortedTextPositionSequences() + 
.subList(45, 152) + .stream() + .map(TextPositionSequence::getRectangle) + .map(RectangleTransformations::toRectangle2D) + .map(this::mirrorY) + .collect(RectangleTransformations.collectBBox()); + + List textPositionSequences = pageContents.get(0) + .getPageContents() + .getSortedTextPositionSequences() + .stream() + .filter(textPositionSequence -> tableBBox.contains(mirrorY(RectangleTransformations.toRectangle2D(textPositionSequence.getRectangle())))) + .toList(); + + var table = InvisibleTableDetectionService.detectTable(textPositionSequences, tableBBox); + + PdfDraw.drawRectanglesPerPage(fileName, List.of(table.stream().flatMap(Collection::stream).toList(), Collections.emptyList()), tmpFileName); + } + + + private Rectangle2D mirrorY(Rectangle2D rectangle2D) { + + if (rectangle2D.getHeight() >= 0) { + return rectangle2D; + } + return new Rectangle2D.Double(rectangle2D.getX(), rectangle2D.getY() + rectangle2D.getHeight(), rectangle2D.getWidth(), -rectangle2D.getHeight()); + } + +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/MainBodyTextFrameExtractionServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/MainBodyTextFrameExtractionServiceTest.java new file mode 100644 index 0000000..cb50c0a --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/MainBodyTextFrameExtractionServiceTest.java @@ -0,0 +1,28 @@ +package com.knecon.fforesight.service.layoutparser.server.services; + +import java.nio.file.Path; +import java.util.List; + +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; + +import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; +import 
com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter; + +import lombok.SneakyThrows; + +class MainBodyTextFrameExtractionServiceTest { + + @Test + @Disabled + @SneakyThrows + public void testMainBodyDetection() { + + String fileName = "files/211.pdf"; + String tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_MAIN_BODY.pdf").toString(); + List sortedTextPositionSequence = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName); + + + } + +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageInformationServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageInformationServiceTest.java new file mode 100644 index 0000000..f256e66 --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/PageInformationServiceTest.java @@ -0,0 +1,63 @@ +package com.knecon.fforesight.service.layoutparser.server.services; + +import java.util.Collection; +import java.util.List; + +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; + +import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation; +import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService; +import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter; +import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; + +import lombok.SneakyThrows; + +class PageInformationServiceTest { + + @Test + @Disabled + @SneakyThrows + public void testGapDetection() { + + String filename = "files/211.pdf"; + var tmpFileName = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf"; + System.out.println("start TextPosition extraction"); + long start = 
System.currentTimeMillis(); + List pageInformations = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename).stream().map(PageInformationService::build).toList(); + System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start); + System.out.println("start gap detection"); + start = System.currentTimeMillis(); + System.out.printf("Finished gap detection in %d ms%n", System.currentTimeMillis() - start); + System.out.println("start draw rectangles"); + start = System.currentTimeMillis(); + PdfDraw.drawRectanglesAndLinesPerPage(filename, + pageInformations.stream().map(PageInformation::getGapInformation).map(gaps -> gaps.getYGaps().stream().flatMap(Collection::stream).toList()).toList(), + pageInformations.stream().map(PageInformation::getGapInformation).map(gaps -> gaps.getXGaps().stream().flatMap(Collection::stream).toList()).toList(), + tmpFileName); + System.out.printf("Finished drawing rectangles in %d ms%n", System.currentTimeMillis() - start); + } + + @Test + @Disabled + @SneakyThrows + public void testLineDetection() { + + String filename = "files/211.pdf"; + var tmpFileName = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf"; + System.out.println("start TextPosition extraction"); + long start = System.currentTimeMillis(); + List pageInformations = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename).stream().map(PageInformationService::build).toList(); + System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start); + System.out.println("start gap detection"); + start = System.currentTimeMillis(); + System.out.printf("Finished gap detection in %d ms%n", System.currentTimeMillis() - start); + System.out.println("start draw rectangles"); + start = System.currentTimeMillis(); + PdfDraw.drawRectanglesPerPageNumberedByLine(filename, + pageInformations.stream().map(PageInformation::getLineInformation).map(gaps -> 
gaps.getBBoxWithGapsByLines().stream().toList()).toList(), + tmpFileName); + System.out.printf("Finished drawing rectangles in %d ms%n", System.currentTimeMillis() - start); + } + +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/TextPositionSequenceSorterTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/TextPositionSequenceSorterTest.java new file mode 100644 index 0000000..5f3858c --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/TextPositionSequenceSorterTest.java @@ -0,0 +1,41 @@ +package com.knecon.fforesight.service.layoutparser.server.services; + +import java.nio.file.Path; +import java.util.List; + +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; + +import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; +import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter; +import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; + +import lombok.SneakyThrows; + +class TextPositionSequenceSorterTest { + + @Test + @Disabled + @SneakyThrows + public void testTextPositionSequenceExtraction() { + + String fileName = "files/211.pdf"; + var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TEXT_POSITION_SEQUENCES.pdf").toString(); + + List textPositionPerPage = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName); + + PdfDraw.drawRectanglesPerPageNumberedByLine(fileName, + textPositionPerPage.stream() + .map(t -> 
t.getSortedTextPositionSequences() + .stream() + .map(TextPositionSequence::getRectangle) + .map(RectangleTransformations::toRectangle2D) + //.map(textPositionSequence -> (Rectangle2D) new Rectangle2D.Double(textPositionSequence.getMaxXDirAdj(), textPositionSequence.getMaxYDirAdj(), textPositionSequence.getWidth(), textPositionSequence.getHeight())) + .map(List::of) + .toList()) + .toList(), tmpFileName); + } + +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/visualizations/PdfDraw.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/visualizations/PdfDraw.java index e8fd284..e3d7822 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/visualizations/PdfDraw.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/visualizations/PdfDraw.java @@ -3,24 +3,29 @@ package com.knecon.fforesight.service.layoutparser.server.utils.visualizations; import java.awt.Color; import java.awt.geom.Point2D; import java.awt.geom.Rectangle2D; +import java.io.FileOutputStream; import java.io.IOException; +import java.io.InputStream; import java.util.List; import java.util.Map; +import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPageContentStream; import org.apache.pdfbox.pdmodel.font.PDType1Font; import org.apache.pdfbox.pdmodel.font.Standard14Fonts; import org.apache.pdfbox.util.Matrix; +import org.springframework.core.io.ClassPathResource; -import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import 
com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.AtomicTextBlock; import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock; -import com.knecon.fforesight.service.layoutparser.processor.services.RectangleTransformations; +import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; +import com.knecon.fforesight.service.layoutparser.processor.utils.PdfVisualisationUtility; import lombok.AccessLevel; import lombok.AllArgsConstructor; @@ -34,6 +39,68 @@ import lombok.experimental.UtilityClass; @UtilityClass public class PdfDraw { + public static void drawRectanglesPerPage(String filename, List> rectanglesPerPage, String tmpFileName) throws IOException { + + try (InputStream inputStream = new ClassPathResource(filename).getInputStream()) { + PDDocument pdDocument = Loader.loadPDF(inputStream); + + for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) { + PdfVisualisationUtility.drawRectangle2DList(pdDocument, + pageNumber, + rectanglesPerPage.get(pageNumber - 1), + PdfVisualisationUtility.Options.builder().stroke(true).build()); + } + try (var out = new FileOutputStream(tmpFileName)) { + pdDocument.save(out); + pdDocument.close(); + } + + } + + } + + + public static void drawRectanglesPerPageNumberedByLine(String filename, List>> rectanglesPerPage, String tmpFileName) throws IOException { + + try (InputStream inputStream = new ClassPathResource(filename).getInputStream()) { + PDDocument pdDocument = Loader.loadPDF(inputStream); + + for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) { + var rectanglesOnPage = rectanglesPerPage.get(pageNumber - 1); + for (int lineNumber = 0; lineNumber < 
rectanglesOnPage.size(); lineNumber++) { + var rectanglesInLine = rectanglesOnPage.get(lineNumber); + PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, rectanglesInLine, PdfVisualisationUtility.Options.builder().stroke(true).build()); + double y = Math.min(rectanglesInLine.get(0).getMinY(), rectanglesInLine.get(0).getMaxY()); + PdfVisualisationUtility.drawText(String.format("%d", lineNumber), + pdDocument, + new Point2D.Double(rectanglesInLine.get(0).getX() - (5 + (5 * countNumberOfDigits(lineNumber))), y + 2), + pageNumber, + PdfVisualisationUtility.Options.builder().stroke(true).build()); + + } + + } + try (var out = new FileOutputStream(tmpFileName)) { + pdDocument.save(out); + pdDocument.close(); + } + + } + + } + + + private static int countNumberOfDigits(int num) { + + if (num == 0) { + return 1; + } + int count = 0; + for (; num != 0; num /= 10, ++count) { + } + return count; + } + public static void drawDocumentGraph(PDDocument document, Document documentGraph) { documentGraph.getDocumentTree().allEntriesInOrder().forEach(entry -> drawNode(document, entry)); @@ -115,6 +182,35 @@ public class PdfDraw { } + @SneakyThrows + public static void drawRectanglesAndLinesPerPage(String filename, List> list, List> rectanglesPerPage, String tmpFileName) { + + try (InputStream inputStream = new ClassPathResource(filename).getInputStream()) { + PDDocument pdDocument = Loader.loadPDF(inputStream); + + for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) { +// PdfVisualisationUtility.drawLine2DList(pdDocument, +// pageNumber, +// list.get(pageNumber - 1), +// PdfVisualisationUtility.Options.builder().stroke(true).build()); + PdfVisualisationUtility.drawRectangle2DList(pdDocument, + pageNumber, + rectanglesPerPage.get(pageNumber - 1), + PdfVisualisationUtility.Options.builder().stroke(true).build()); + PdfVisualisationUtility.drawRectangle2DList(pdDocument, + pageNumber, + list.get(pageNumber - 1), + 
PdfVisualisationUtility.Options.builder().stroke(true).build()); + } + try (var out = new FileOutputStream(tmpFileName)) { + pdDocument.save(out); + pdDocument.close(); + } + + } + } + + @Builder @AllArgsConstructor @NoArgsConstructor diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/application.yml b/layoutparser-service/layoutparser-service-server/src/test/resources/application.yml new file mode 100644 index 0000000..83c7a1c --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/resources/application.yml @@ -0,0 +1,37 @@ +info: + description: Layout Parser Service Processor + +tenant-user-management-service.url: "http://tenant-user-management-service:8080/internal" +fforesight.tenants.remote: true + +server: + port: 8080 + +spring: + main: + allow-circular-references: true # FIXME + rabbitmq: + host: ${RABBITMQ_HOST:localhost} + port: ${RABBITMQ_PORT:5672} + username: ${RABBITMQ_USERNAME:user} + password: ${RABBITMQ_PASSWORD:rabbitmq} + listener: + simple: + acknowledge-mode: AUTO + concurrency: 2 + retry: + enabled: true + max-attempts: 3 + max-interval: 15000 + prefetch: 1 + +management: + endpoint: + metrics.enabled: ${monitoring.enabled:false} + prometheus.enabled: ${monitoring.enabled:false} + health.enabled: true + endpoints.web.exposure.include: prometheus, health + + +storage: + backend: 's3' diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/211.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/211.pdf new file mode 100644 index 0000000..2f84f92 Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/211.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/216.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/216.pdf new file mode 100644 index 0000000..f3e4a5c Binary files /dev/null and 
b/layoutparser-service/layoutparser-service-server/src/test/resources/files/216.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/Plenarprotokoll 1 (keine Druchsache!) (1).pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/Plenarprotokoll 1 (keine Druchsache!) (1).pdf new file mode 100644 index 0000000..564320d Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/Plenarprotokoll 1 (keine Druchsache!) (1).pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/Plenarprotokoll 1 Page 6.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/Plenarprotokoll 1 Page 6.pdf new file mode 100644 index 0000000..530d2c7 Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/Plenarprotokoll 1 Page 6.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/Plenarprotokoll 1 page 12.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/Plenarprotokoll 1 page 12.pdf new file mode 100644 index 0000000..d44ad6f Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/Plenarprotokoll 1 page 12.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/RotateTestFileWithImages.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/RotateTestFileWithImages.pdf deleted file mode 100644 index 2b009d1..0000000 Binary files a/layoutparser-service/layoutparser-service-server/src/test/resources/files/RotateTestFileWithImages.pdf and /dev/null differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/SYNGENTA_EFSA_sanitisation_GFL_v1_moreSections.pdf 
b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SYNGENTA_EFSA_sanitisation_GFL_v1_moreSections.pdf new file mode 100644 index 0000000..40218a6 Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/SYNGENTA_EFSA_sanitisation_GFL_v1_moreSections.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/crafted document.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/crafted document.pdf deleted file mode 100644 index be18a14..0000000 Binary files a/layoutparser-service/layoutparser-service-server/src/test/resources/files/crafted document.pdf and /dev/null differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/log4j2-test.xml b/layoutparser-service/layoutparser-service-server/src/test/resources/log4j2-test.xml new file mode 100644 index 0000000..b4895cf --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/resources/log4j2-test.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/layoutparser-service/pom.xml b/layoutparser-service/pom.xml index 87d1ca1..7f61c7e 100644 --- a/layoutparser-service/pom.xml +++ b/layoutparser-service/pom.xml @@ -7,6 +7,7 @@ org.springframework.boot spring-boot-starter-parent 3.0.6 + com.knecon.fforesight