From ba1c7c07abcf1a4d74a01d4a07b9074ab97e227a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kilian=20Sch=C3=BCttler?= Date: Wed, 20 Dec 2023 12:40:00 +0100 Subject: [PATCH] RED-7384: fixes for migration --- ...con.fforesight.java-conventions.gradle.kts | 2 + .../build.gradle.kts | 4 +- .../processor/LayoutParsingPipeline.java | 223 ++++++++++++------ .../LayoutParsingStorageService.java | 34 +-- .../processor/model/graph/nodes/Document.java | 4 +- .../processor/model/graph/nodes/Footer.java | 15 ++ .../processor/model/graph/nodes/Header.java | 15 ++ .../processor/model/graph/nodes/Headline.java | 15 ++ .../model/graph/nodes/Paragraph.java | 15 ++ .../processor/model/graph/nodes/Section.java | 15 ++ .../model/graph/nodes/SemanticNode.java | 29 ++- .../processor/model/graph/nodes/Table.java | 11 + .../model/text/TextPositionSequence.java | 32 +-- .../visualization/ViewerDocumentService.java | 78 ++++-- .../layoutparser/server/Application.java | 13 +- .../layoutparser/server/BdrJsonBuildTest.java | 11 +- .../HeadlinesGoldStandardIntegrationTest.java | 6 +- .../server/LayoutparserEnd2EndTest.java | 4 +- .../server/graph/DocumentDataTests.java | 6 +- .../graph/DocumentGraphJsonWritingTest.java | 6 +- .../server/graph/ViewerDocumentTest.java | 52 +--- .../PdfSegmentationServiceTest.java | 68 +++--- .../services/RulingCleaningServiceTest.java | 12 +- .../server/utils/AbstractTest.java | 41 ++-- .../server/utils/BuildDocumentTest.java | 10 +- .../src/test/resources/application.yml | 3 + .../src/test/resources/log4j2-test.xml | 16 -- .../src/test/resources/logback-spring.xml | 17 ++ publish-custom-image.sh | 13 +- 29 files changed, 472 insertions(+), 298 deletions(-) delete mode 100644 layoutparser-service/layoutparser-service-server/src/test/resources/log4j2-test.xml create mode 100644 layoutparser-service/layoutparser-service-server/src/test/resources/logback-spring.xml diff --git a/buildSrc/src/main/kotlin/com.knecon.fforesight.java-conventions.gradle.kts b/buildSrc/src/main/kotlin/com.knecon.fforesight.java-conventions.gradle.kts index 3cc3e04..61951b8 100644 --- a/buildSrc/src/main/kotlin/com.knecon.fforesight.java-conventions.gradle.kts +++ b/buildSrc/src/main/kotlin/com.knecon.fforesight.java-conventions.gradle.kts @@ -24,6 +24,8 @@ tasks.named("test") { reports { junitXml.outputLocation.set(layout.buildDirectory.dir("reports/junit")) } + minHeapSize = "512m" + maxHeapSize = "2048m" } tasks.test { diff --git a/layoutparser-service/layoutparser-service-processor/build.gradle.kts b/layoutparser-service/layoutparser-service-processor/build.gradle.kts index 5c79274..210972d 100644 --- a/layoutparser-service/layoutparser-service-processor/build.gradle.kts +++ b/layoutparser-service/layoutparser-service-processor/build.gradle.kts @@ -15,8 +15,8 @@ dependencies { exclude("org.springframework.boot", "spring-boot-starter-security") exclude("org.springframework.boot", "spring-boot-starter-validation") } - implementation("com.knecon.fforesight:tenant-commons:0.10.0") - implementation("com.iqser.red.commons:storage-commons:2.40.0") + implementation("com.knecon.fforesight:tenant-commons:0.19.0") + implementation("com.iqser.red.commons:storage-commons:2.45.0") implementation("org.apache.pdfbox:pdfbox:${pdfBoxVersion}") implementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}") diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index a02f627..ba1b1c2 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -3,13 +3,16 @@ package com.knecon.fforesight.service.layoutparser.processor; import static java.lang.String.format; import java.awt.geom.Rectangle2D; -import java.io.ByteArrayOutputStream; +import java.io.File; import java.io.IOException; +import java.nio.file.Files; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.concurrent.atomic.AtomicReference; +import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.common.PDRectangle; @@ -51,91 +54,121 @@ import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDF import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService; import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; +import io.micrometer.observation.Observation; +import io.micrometer.observation.ObservationRegistry; +import io.micrometer.observation.annotation.Observed; +import lombok.AccessLevel; import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; +import lombok.experimental.FieldDefaults; import lombok.extern.slf4j.Slf4j; +@SuppressWarnings("PMD.CloseResource") @Slf4j @Service @RequiredArgsConstructor +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) public class LayoutParsingPipeline { - private final ImageServiceResponseAdapter imageServiceResponseAdapter; - private final CvTableParsingAdapter cvTableParsingAdapter; - private final LayoutParsingStorageService layoutParsingStorageService; - private final SectionsBuilderService sectionsBuilderService; - private final TaasClassificationService taasClassificationService; - private final RedactManagerClassificationService redactManagerClassificationService; - private final DocuMineClassificationService docuMineClassificationService; - private final SimplifiedSectionTextService simplifiedSectionTextService; - private final BodyTextFrameService bodyTextFrameService; - private final RulingCleaningService rulingCleaningService; - private final TableExtractionService tableExtractionService; - private final TaasBlockificationService taasBlockificationService; - private final DocuMineBlockificationService docuMineBlockificationService; - private final RedactManagerBlockificationService redactManagerBlockificationService; - private final ViewerDocumentService viewerDocumentService; + ImageServiceResponseAdapter imageServiceResponseAdapter; + CvTableParsingAdapter cvTableParsingAdapter; + LayoutParsingStorageService layoutParsingStorageService; + SectionsBuilderService sectionsBuilderService; + TaasClassificationService taasClassificationService; + RedactManagerClassificationService redactManagerClassificationService; + DocuMineClassificationService docuMineClassificationService; + SimplifiedSectionTextService simplifiedSectionTextService; + BodyTextFrameService bodyTextFrameService; + RulingCleaningService rulingCleaningService; + TableExtractionService tableExtractionService; + TaasBlockificationService taasBlockificationService; + DocuMineBlockificationService docuMineBlockificationService; + RedactManagerBlockificationService redactManagerBlockificationService; + ViewerDocumentService viewerDocumentService; + ObservationRegistry observationRegistry; public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException { long start = System.currentTimeMillis(); + log.info("Starting layout parsing for {}", layoutParsingRequest.identifier()); - try (PDDocument originDocument = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId())) { - ImageServiceResponse imageServiceResponse = new ImageServiceResponse(); - if (layoutParsingRequest.imagesFileStorageId().isPresent()) { - imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get()); - } + File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId()); + File viewerDocumentFile = File.createTempFile("viewer_document", ".pdf"); - TableServiceResponse tableServiceResponse = new TableServiceResponse(); - if (layoutParsingRequest.tablesFileStorageId().isPresent()) { - tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get()); - } - - ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(), originDocument, imageServiceResponse, tableServiceResponse); - Document documentGraph = DocumentGraphFactory.buildDocumentGraph(classificationDocument); - - int numberOfPages = originDocument.getNumberOfPages(); - - layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph)); - layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph)); - - try (var out = new ByteArrayOutputStream()) { - viewerDocumentService.createViewerDocument(originDocument, documentGraph, out, false); - layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, out); - } - - if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.TAAS)) { - var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentGraph); - layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData); - } - - return LayoutParsingFinishedEvent.builder() - .identifier(layoutParsingRequest.identifier()) - .numberOfPages(numberOfPages) - .duration(System.currentTimeMillis() - start) - .message(format(""" - Layout parsing has finished in %.02f s. - identifiers: %s - %s - Files have been saved with Ids: - Structure: %s - Text: %s - Positions: %s - PageData: %s - Simplified Text: %s - Viewer Doc: %s""", - ((float) (System.currentTimeMillis() - start)) / 1000, - layoutParsingRequest.identifier(), - buildSemanticNodeCountMessage(numberOfPages, documentGraph.buildSemanticNodeCounts()), - layoutParsingRequest.structureFileStorageId(), - layoutParsingRequest.textBlockFileStorageId(), - layoutParsingRequest.positionBlockFileStorageId(), - layoutParsingRequest.pageFileStorageId(), - layoutParsingRequest.simplifiedTextStorageId(), - layoutParsingRequest.viewerDocumentStorageId())) - .build(); + ImageServiceResponse imageServiceResponse = new ImageServiceResponse(); + if (layoutParsingRequest.imagesFileStorageId().isPresent()) { + imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get()); } + + TableServiceResponse tableServiceResponse = new TableServiceResponse(); + if (layoutParsingRequest.tablesFileStorageId().isPresent()) { + tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get()); + } + + ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(), + originFile, + imageServiceResponse, + tableServiceResponse, + layoutParsingRequest.identifier().toString()); + log.info("Building document graph for {}", layoutParsingRequest.identifier()); + + Document documentGraph = observeBuildDocumentGraph(classificationDocument); + + log.info("Storing resulting files for {}", layoutParsingRequest.identifier()); + layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph)); + layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph)); + + log.info("Creating viewer document for {}", layoutParsingRequest.identifier()); + viewerDocumentService.createViewerDocument(originFile, documentGraph, viewerDocumentFile, false); + layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile); + if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.TAAS)) { + log.info("Building research document data for {}", layoutParsingRequest.identifier()); + var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentGraph); + layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData); + } + + originFile.delete(); + viewerDocumentFile.delete(); + + return LayoutParsingFinishedEvent.builder() + .identifier(layoutParsingRequest.identifier()) + .numberOfPages(documentGraph.getNumberOfPages()) + .duration(System.currentTimeMillis() - start) + .message(format(""" + Layout parsing has finished in %.02f s. + identifiers: %s + %s + Files have been saved with Ids: + Structure: %s + Text: %s + Positions: %s + PageData: %s + Simplified Text: %s + Viewer Doc: %s""", + ((float) (System.currentTimeMillis() - start)) / 1000, + layoutParsingRequest.identifier(), + buildSemanticNodeCountMessage(documentGraph.getNumberOfPages(), documentGraph.buildSemanticNodeCounts()), + layoutParsingRequest.structureFileStorageId(), + layoutParsingRequest.textBlockFileStorageId(), + layoutParsingRequest.positionBlockFileStorageId(), + layoutParsingRequest.pageFileStorageId(), + layoutParsingRequest.simplifiedTextStorageId(), + layoutParsingRequest.viewerDocumentStorageId())) + .build(); + + } + + + private Document observeBuildDocumentGraph(ClassificationDocument classificationDocument) { + + AtomicReference documentReference = new AtomicReference<>(); + + Observation.createNotStarted("LayoutParsingPipeline", observationRegistry).contextualName("build-document-graph").observe(() -> { + documentReference.set(DocumentGraphFactory.buildDocumentGraph(classificationDocument)); + }); + + return documentReference.get(); } @@ -154,21 +187,36 @@ public class LayoutParsingPipeline { @SneakyThrows + @Observed(name = "LayoutParsingPipeline", contextualName = "parse-layout") public ClassificationDocument parseLayout(LayoutParsingType layoutParsingType, - PDDocument originDocument, + File originFile, ImageServiceResponse imageServiceResponse, - TableServiceResponse tableServiceResponse) { + TableServiceResponse tableServiceResponse, + String identifier) { + PDDocument originDocument = openDocument(originFile); + addNumberOfPagesToTrace(originDocument.getNumberOfPages(), Files.size(originFile.toPath())); Map> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse); Map> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse); ClassificationDocument classificationDocument = new ClassificationDocument(); List classificationPages = new ArrayList<>(); - originDocument.setAllSecurityToBeRemoved(true); long pageCount = originDocument.getNumberOfPages(); for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) { + + if (pageNumber % 100 == 0) { + // re-open document every once in a while to save on RAM. This has no significant performance impact. + // This is due to PDFBox caching all images and some other stuff with Soft References. This dereferences them and forces the freeing of memory. + originDocument.close(); + originDocument = openDocument(originFile); + } + + if (pageNumber % 100 == 0 || pageNumber == pageCount || pageNumber == 1) { + log.info("Extracting text on Page {} for {}", pageNumber, identifier); + } + classificationDocument.setPages(classificationPages); PDFLinesTextStripper stripper = new PDFLinesTextStripper(); PDPage pdPage = originDocument.getPage(pageNumber - 1); @@ -218,21 +266,42 @@ public class LayoutParsingPipeline { classificationPages.add(classificationPage); } - bodyTextFrameService.setBodyTextFrames(classificationDocument, layoutParsingType); + originDocument.close(); + log.info("Calculating BodyTextFrame for {}", identifier); + bodyTextFrameService.setBodyTextFrames(classificationDocument, layoutParsingType); + log.info("Classify TextBlocks for {}", identifier); switch (layoutParsingType) { case TAAS -> taasClassificationService.classifyDocument(classificationDocument); case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument); case REDACT_MANAGER -> redactManagerClassificationService.classifyDocument(classificationDocument); } + log.info("Building Sections for {}", identifier); sectionsBuilderService.buildSections(classificationDocument); sectionsBuilderService.addImagesToSections(classificationDocument); - return classificationDocument; } + private void addNumberOfPagesToTrace(int numberOfPages, long size) { + + if (observationRegistry.getCurrentObservation() != null) { + observationRegistry.getCurrentObservation().highCardinalityKeyValue("numberOfPages", String.valueOf(numberOfPages)); + observationRegistry.getCurrentObservation().highCardinalityKeyValue("fileSize", String.valueOf(size)); + } + } + + + @SneakyThrows + private PDDocument openDocument(File originFile) { + + PDDocument document = Loader.loadPDF(originFile); + document.setAllSecurityToBeRemoved(true); + return document; + } + + private Map> convertMarkedContents(List pdMarkedContents) { Map> markedContentBboxes = new HashMap<>(); @@ -244,9 +313,9 @@ public class LayoutParsingPipeline { private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) { - if (!classificationPage.isLandscape()) { - document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue()); - } + if (!classificationPage.isLandscape()) { + document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue()); + } document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue()); document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue()); document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue()); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java index 6e35cbe..3082d54 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java @@ -1,9 +1,7 @@ package com.knecon.fforesight.service.layoutparser.processor; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; import java.io.File; -import java.io.FileOutputStream; +import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.nio.file.Files; @@ -11,7 +9,6 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.StandardOpenOption; -import org.apache.commons.io.IOUtils; import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; import org.springframework.stereotype.Service; @@ -26,6 +23,7 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.ima import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.tenantcommons.TenantContext; +import io.micrometer.observation.annotation.Observed; import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; @@ -39,16 +37,18 @@ public class LayoutParsingStorageService { private final ObjectMapper objectMapper; - public PDDocument getOriginFile(String storageId) throws IOException { + public PDDocument getOriginDocument(String storageId) throws IOException { - try (var originDocumentInputStream = getObject(storageId)) { - File tempFile = createTempFile("document", ".pdf"); - try (var tempFileOutputStream = new FileOutputStream(tempFile)) { - IOUtils.copy(originDocumentInputStream, tempFileOutputStream); - originDocumentInputStream.close(); - } - return Loader.loadPDF(tempFile); - } + return Loader.loadPDF(getOriginFile(storageId)); + } + + + @Observed(name = "LayoutParsingStorageService", contextualName = "get-origin-file") + public File getOriginFile(String storageId) throws IOException { + + File tempFile = createTempFile("document", ".pdf"); + storageService.downloadTo(TenantContext.getTenantId(), storageId, tempFile); + return tempFile; } @@ -74,6 +74,7 @@ public class LayoutParsingStorageService { } + @Observed(name = "LayoutParsingStorageService", contextualName = "store-document-data") public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, DocumentData documentData) { storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), documentData.getDocumentStructure()); @@ -83,7 +84,6 @@ public class LayoutParsingStorageService { } - public void storeResearchDocumentData(LayoutParsingRequest layoutParsingRequest, ResearchDocumentData researchDocumentData) { storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.researchDocumentStorageId(), researchDocumentData); @@ -115,6 +115,7 @@ public class LayoutParsingStorageService { } + @Observed(name = "LayoutParsingStorageService", contextualName = "store-simplified-text") public void storeSimplifiedText(LayoutParsingRequest layoutParsingRequest, SimplifiedText simplifiedText) { storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.simplifiedTextStorageId(), simplifiedText); @@ -132,9 +133,10 @@ public class LayoutParsingStorageService { @SneakyThrows - public void storeViewerDocument(LayoutParsingRequest layoutParsingRequest, ByteArrayOutputStream out) { + @Observed(name = "LayoutParsingStorageService", contextualName = "store-viewer-document") + public void storeViewerDocument(LayoutParsingRequest layoutParsingRequest, File out) { - try (var in = new ByteArrayInputStream(out.toByteArray())) { + try (var in = new FileInputStream(out)) { storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.viewerDocumentStorageId(), in); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java index 0df92c2..c07bffa 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java @@ -10,7 +10,6 @@ import java.util.Set; import java.util.stream.Collectors; import java.util.stream.Stream; -import com.amazonaws.services.kms.model.NotFoundException; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity; @@ -84,7 +83,7 @@ public class Document implements GenericSemanticNode { @Override public Headline getHeadline() { - return streamAllSubNodesOfType(NodeType.HEADLINE).map(node -> (Headline) node).findFirst().orElseThrow(() -> new NotFoundException("No Headlines found in this document!")); + return streamAllSubNodesOfType(NodeType.HEADLINE).map(node -> (Headline) node).findFirst().orElse(Headline.builder().build()); } @@ -105,6 +104,7 @@ public class Document implements GenericSemanticNode { return streamAllSubNodes().collect(Collectors.groupingBy(SemanticNode::getType, Collectors.counting())); } + @Override public String toString() { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Footer.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Footer.java index ee21d4e..e8e43d1f 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Footer.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Footer.java @@ -1,7 +1,9 @@ package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes; +import java.awt.geom.Rectangle2D; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Set; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; @@ -34,6 +36,9 @@ public class Footer implements GenericSemanticNode { @EqualsAndHashCode.Exclude Set entities = new HashSet<>(); + @EqualsAndHashCode.Exclude + Map bBoxCache; + @Override public NodeType getType() { @@ -62,4 +67,14 @@ public class Footer implements GenericSemanticNode { return treeId + ": " + NodeType.FOOTER + ": " + leafTextBlock.buildSummary(); } + + @Override + public Map getBBox() { + + if (bBoxCache == null) { + bBoxCache = GenericSemanticNode.super.getBBox(); + } + return bBoxCache; + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Header.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Header.java index 1a06a18..2092c32 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Header.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Header.java @@ -1,7 +1,9 @@ package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes; +import java.awt.geom.Rectangle2D; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Set; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; @@ -34,6 +36,9 @@ public class Header implements GenericSemanticNode { @EqualsAndHashCode.Exclude Set entities = new HashSet<>(); + @EqualsAndHashCode.Exclude + Map bBoxCache; + @Override public boolean isLeaf() { @@ -62,4 +67,14 @@ public class Header implements GenericSemanticNode { return treeId + ": " + NodeType.HEADER + ": " + leafTextBlock.buildSummary(); } + + @Override + public Map getBBox() { + + if (bBoxCache == null) { + bBoxCache = GenericSemanticNode.super.getBBox(); + } + return bBoxCache; + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Headline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Headline.java index c7d8a4f..95be162 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Headline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Headline.java @@ -1,7 +1,9 @@ package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes; +import java.awt.geom.Rectangle2D; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Set; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; @@ -34,6 +36,9 @@ public class Headline implements GenericSemanticNode { @EqualsAndHashCode.Exclude Set entities = new HashSet<>(); + @EqualsAndHashCode.Exclude + Map bBoxCache; + @Override public NodeType getType() { @@ -69,4 +74,14 @@ public class Headline implements GenericSemanticNode { return this; } + + @Override + public Map getBBox() { + + if (bBoxCache == null) { + bBoxCache = GenericSemanticNode.super.getBBox(); + } + return bBoxCache; + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Paragraph.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Paragraph.java index f092253..224b537 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Paragraph.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Paragraph.java @@ -1,7 +1,9 @@ package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes; +import java.awt.geom.Rectangle2D; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Set; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; @@ -32,6 +34,9 @@ public class Paragraph implements GenericSemanticNode { @EqualsAndHashCode.Exclude Set entities = new HashSet<>(); + @EqualsAndHashCode.Exclude + Map bBoxCache; + @Override public NodeType getType() { @@ -60,4 +65,14 @@ public class Paragraph implements GenericSemanticNode { return treeId + ": " + NodeType.PARAGRAPH + ": " + leafTextBlock.buildSummary(); } + + @Override + public Map getBBox() { + + if (bBoxCache == null) { + bBoxCache = GenericSemanticNode.super.getBBox(); + } + return bBoxCache; + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Section.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Section.java index 51655a2..60cc243 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Section.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Section.java @@ -1,7 +1,9 @@ package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes; +import java.awt.geom.Rectangle2D; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Set; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; @@ -35,6 +37,9 @@ public class Section implements GenericSemanticNode { @EqualsAndHashCode.Exclude Set entities = new HashSet<>(); + @EqualsAndHashCode.Exclude + Map bBoxCache; + @Override public NodeType getType() { @@ -74,4 +79,14 @@ public class Section implements GenericSemanticNode { .orElseGet(() -> getParent().getHeadline()); } + + @Override + public Map getBBox() { + + if (bBoxCache == null) { + bBoxCache = GenericSemanticNode.super.getBBox(); + } + return bBoxCache; + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SemanticNode.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SemanticNode.java index 8799b44..811ca45 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SemanticNode.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/SemanticNode.java @@ -398,12 +398,11 @@ public interface SemanticNode { */ default Map getBBox() { - Map bBoxPerPage = new HashMap<>(); if (isLeaf()) { - return getBBoxFromLeafTextBlock(bBoxPerPage); + return getBBoxFromLeafTextBlock(); } - return getBBoxFromChildren(bBoxPerPage); + return getBBoxFromChildren(); } @@ -426,25 +425,31 @@ public interface SemanticNode { /** * TODO: this produces unwanted results for sections spanning multiple columns. - * - * @param bBoxPerPage initial empty BoundingBox + * Computes the Union of the bounding boxes of all children recursively. * @return The union of the BoundingBoxes of all children */ - private Map getBBoxFromChildren(Map bBoxPerPage) { + private Map getBBoxFromChildren() { - return streamChildren().map(SemanticNode::getBBox).reduce((map1, map2) -> { - map1.forEach((page, rectangle) -> map2.merge(page, rectangle, (rect1, rect2) -> rect1.createUnion(rect2).getBounds2D())); - return map2; - }).orElse(bBoxPerPage); + Map bBoxPerPage = new HashMap<>(); + List> childrenBBoxes = streamChildren().map(SemanticNode::getBBox).toList(); + Set pages = childrenBBoxes.stream().flatMap(map -> map.keySet().stream()).collect(Collectors.toSet()); + for (Page page : pages) { + Rectangle2D bBoxOnPage = childrenBBoxes.stream() + .filter(childBboxPerPage -> childBboxPerPage.containsKey(page)) + .map(childBboxPerPage -> childBboxPerPage.get(page)) + .collect(RectangleTransformations.collectBBox()); + bBoxPerPage.put(page, bBoxOnPage); + } + return bBoxPerPage; } /** - * @param bBoxPerPage initial empty BoundingBox * @return The union of all BoundingBoxes of the TextBlock of this node */ - private Map getBBoxFromLeafTextBlock(Map bBoxPerPage) { + private Map getBBoxFromLeafTextBlock() { + Map bBoxPerPage = new HashMap<>(); Map> atomicTextBlockPerPage = getTextBlock().getAtomicTextBlocks().stream().collect(Collectors.groupingBy(AtomicTextBlock::getPage)); atomicTextBlockPerPage.forEach((page, atbs) -> bBoxPerPage.put(page, RectangleTransformations.bBoxUnionAtomicTextBlock(atbs))); return bBoxPerPage; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java index d62b4cf..8f77162 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java @@ -2,10 +2,12 @@ package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes; import static java.lang.String.format; +import java.awt.geom.Rectangle2D; import java.util.Collection; import java.util.HashSet; import java.util.List; import java.util.Locale; +import java.util.Map; import java.util.Set; import java.util.stream.IntStream; import java.util.stream.Stream; @@ -40,6 +42,8 @@ public class Table implements SemanticNode { @EqualsAndHashCode.Exclude Set entities = new HashSet<>(); + @EqualsAndHashCode.Exclude + Map bBoxCache; /** * Streams all entities in this table, that appear in a row, which contains any of the provided strings. @@ -311,5 +315,12 @@ public class Table implements SemanticNode { return treeId.toString() + ": " + NodeType.TABLE + ": #cols: " + numberOfCols + ", #rows: " + numberOfRows + ", " + this.getTextBlock().buildSummary(); } + @Override + public Map getBBox() { + if (bBoxCache == null) { + bBoxCache = SemanticNode.super.getBBox(); + } + return bBoxCache; + } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java index 7b776dc..82829c6 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPositionSequence.java @@ -9,8 +9,6 @@ import java.util.stream.Collectors; import org.apache.pdfbox.text.TextPosition; -import com.dslplatform.json.JsonAttribute; -import com.fasterxml.jackson.annotation.JsonIgnore; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point; import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle; @@ -142,8 +140,7 @@ public class TextPositionSequence implements CharSequence { * * @return the text direction adjusted minX value */ - @JsonIgnore - @JsonAttribute(ignore = true) + public float getMinXDirAdj() { return textPositions.get(0).getXDirAdj(); @@ -157,8 +154,7 @@ public class TextPositionSequence implements CharSequence { * * @return the text direction adjusted maxX value */ - @JsonIgnore - @JsonAttribute(ignore = true) + public float getMaxXDirAdj() { return textPositions.get(textPositions.size() - 1).getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidthDirAdj() + HEIGHT_PADDING; @@ -172,8 +168,7 @@ public class TextPositionSequence implements CharSequence { * * @return the text direction adjusted minY value. The upper border of the bounding box of the word. */ - @JsonIgnore - @JsonAttribute(ignore = true) + public float getMinYDirAdj() { return textPositions.get(0).getYDirAdj() - getTextHeight(); @@ -187,8 +182,7 @@ public class TextPositionSequence implements CharSequence { * * @return the text direction adjusted maxY value. The lower border of the bounding box of the word. */ - @JsonIgnore - @JsonAttribute(ignore = true) + public float getMaxYDirAdj() { return textPositions.get(0).getYDirAdj(); @@ -196,32 +190,24 @@ public class TextPositionSequence implements CharSequence { } - @JsonIgnore - @JsonAttribute(ignore = true) public float getTextHeight() { return textPositions.get(0).getHeightDir() + HEIGHT_PADDING; } - @JsonIgnore - @JsonAttribute(ignore = true) public float getHeight() { return getMaxYDirAdj() - getMinYDirAdj(); } - @JsonIgnore - @JsonAttribute(ignore = true) public float getWidth() { return getMaxXDirAdj() - getMinXDirAdj(); } - @JsonIgnore - @JsonAttribute(ignore = true) public String getFont() { if (textPositions.get(0).getFontName() == null) { @@ -231,9 +217,8 @@ public class TextPositionSequence implements CharSequence { } - @JsonIgnore - @JsonAttribute(ignore = true) public String getFontStyle() { + if (textPositions.get(0).getFontName() == null) { return "standard"; } @@ -251,16 +236,12 @@ public class TextPositionSequence implements CharSequence { } - @JsonIgnore - @JsonAttribute(ignore = true) public float getFontSize() { return textPositions.get(0).getFontSizeInPt(); } - @JsonIgnore - @JsonAttribute(ignore = true) public float getSpaceWidth() { return textPositions.get(0).getWidthOfSpace(); @@ -276,8 +257,7 @@ public class TextPositionSequence implements CharSequence { * * @return bounding box of the word in Pdf Coordinate System */ - @JsonIgnore - @JsonAttribute(ignore = true) + @SneakyThrows public Rectangle getRectangle() { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/ViewerDocumentService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/ViewerDocumentService.java index 2290eed..2335517 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/ViewerDocumentService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/ViewerDocumentService.java @@ -2,12 +2,13 @@ package com.knecon.fforesight.service.layoutparser.processor.services.visualizat import java.awt.geom.AffineTransform; import java.awt.geom.Rectangle2D; +import java.io.File; import java.io.IOException; -import java.io.OutputStream; -import java.util.HashSet; -import java.util.Set; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardCopyOption; -import org.apache.pdfbox.cos.COSDictionary; +import org.apache.pdfbox.Loader; import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocumentCatalog; @@ -31,6 +32,9 @@ import com.knecon.fforesight.service.layoutparser.processor.model.visualization. import com.knecon.fforesight.service.layoutparser.processor.model.visualization.PlacedText; import com.knecon.fforesight.service.layoutparser.processor.model.visualization.VisualizationsOnPage; +import io.micrometer.observation.Observation; +import io.micrometer.observation.ObservationRegistry; +import io.micrometer.observation.annotation.Observed; import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; @@ -40,29 +44,31 @@ import lombok.extern.slf4j.Slf4j; @RequiredArgsConstructor public class ViewerDocumentService { - private static final String LAYER_NAME = "Layout grid"; private static final int FONT_SIZE = 10; public static final float LINE_WIDTH = 1f; private final LayoutGridService layoutGridService; + private final ObservationRegistry observationRegistry; @SneakyThrows - public void createViewerDocument(PDDocument pdDocument, Document document, OutputStream outputStream, boolean layerVisibilityDefaultValue) { + @Observed(name = "ViewerDocumentService", contextualName = "create-viewer-document") + public void createViewerDocument(File originFile, Document document, File destinationFile, boolean layerVisibilityDefaultValue) { + Path tmpFile = Files.createTempFile("tmpViewerDocument", ".pdf"); + PDDocument pdDocument = openPDDocument(originFile); LayoutGrid layoutGrid = layoutGridService.createLayoutGrid(document); - // PDDocument.save() is very slow, since it actually traverses the entire pdf and writes a new one. - // If we collect all COSDictionaries we changed and tell it explicitly to only add the changed ones by using saveIncremental it's very fast. - Set dictionariesToUpdate = new HashSet<>(); - PDOptionalContentGroup layer = addLayerToDocument(pdDocument, dictionariesToUpdate, layerVisibilityDefaultValue); + + PDOptionalContentGroup layer = addLayerToDocument(pdDocument, layerVisibilityDefaultValue); PDFont font = new PDType1Font(Standard14Fonts.FontName.HELVETICA); for (int pageNumber = 0; pageNumber < pdDocument.getNumberOfPages(); pageNumber++) { - PDPage pdPage = pdDocument.getPage(pageNumber); + PDPage pdPage = pdDocument.getPage(pageNumber); +// AffineTransform textDeRotationMatrix = getTextDeRotationTransform(pdPage); - addLayerToPageRessources(pdPage); + addLayerToPageResources(pdPage); // We need to save the graphics state before, such that our appended content cannot be affected by previous content streams with side effects, // e.g. not escaped matrix transformations. @@ -115,16 +121,48 @@ public class ViewerDocumentService { contentStream.restoreGraphicsState(); contentStream.endMarkedContent(); } - dictionariesToUpdate.add(pdPage.getCOSObject()); - dictionariesToUpdate.add(pdPage.getResources().getCOSObject()); + + if (pageNumber % 500 == 0 && pageNumber != 0) { // re-open document every once in a while to save on RAM + log.info("Incremental save after {} pages", pageNumber); + observedIncrementalSave(pdDocument, destinationFile); + pdDocument.close(); + Files.copy(destinationFile.toPath(), tmpFile, StandardCopyOption.REPLACE_EXISTING); + pdDocument = openPDDocument(tmpFile.toFile()); + layer = addLayerToDocument(pdDocument, layerVisibilityDefaultValue); + } + } - dictionariesToUpdate.add(pdDocument.getDocumentInformation().getCOSObject()); -// dictionariesToUpdate.add(pdDocument.getDocument().getTrailer()); - pdDocument.saveIncremental(outputStream, dictionariesToUpdate); + observedIncrementalSave(pdDocument, destinationFile); + + tmpFile.toFile().delete(); + pdDocument.close(); } - private static void addLayerToPageRessources(PDPage pdPage) { + private static PDDocument openPDDocument(File tmpFile) throws IOException { + + PDDocument pdDocument; + pdDocument = Loader.loadPDF(tmpFile); + pdDocument.setAllSecurityToBeRemoved(true); + return pdDocument; + } + + + @SneakyThrows + private void observedIncrementalSave(PDDocument pdDocument, File outputFile) { + + Observation.createNotStarted("ViewerDocumentService", observationRegistry).contextualName("incremental-save").observe(() -> { + try { + pdDocument.save(outputFile); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + + } + + + private static void addLayerToPageResources(PDPage pdPage) { PDResources resources = pdPage.getResources(); if (resources == null) { @@ -145,7 +183,7 @@ public class ViewerDocumentService { } - private static PDOptionalContentGroup addLayerToDocument(PDDocument pdDocument, Set dictionariesToUpdate, boolean layerVisibilityDefaultValue) { + private static PDOptionalContentGroup addLayerToDocument(PDDocument pdDocument, boolean layerVisibilityDefaultValue) { PDDocumentCatalog catalog = pdDocument.getDocumentCatalog(); PDOptionalContentProperties ocprops = catalog.getOCProperties(); @@ -161,7 +199,7 @@ public class ViewerDocumentService { ocprops.addGroup(layer); } ocprops.setGroupEnabled(layer, layerVisibilityDefaultValue); - dictionariesToUpdate.add(catalog.getCOSObject()); +// dictionariesToUpdate.add(catalog.getCOSObject()); return layer; } diff --git a/layoutparser-service/layoutparser-service-server/src/main/java/com/knecon/fforesight/service/layoutparser/server/Application.java b/layoutparser-service/layoutparser-service-server/src/main/java/com/knecon/fforesight/service/layoutparser/server/Application.java index 75cfabd..2a89c48 100644 --- a/layoutparser-service/layoutparser-service-server/src/main/java/com/knecon/fforesight/service/layoutparser/server/Application.java +++ b/layoutparser-service/layoutparser-service-server/src/main/java/com/knecon/fforesight/service/layoutparser/server/Application.java @@ -5,6 +5,7 @@ import org.springframework.boot.actuate.autoconfigure.security.servlet.Managemen import org.springframework.boot.autoconfigure.ImportAutoConfiguration; import org.springframework.boot.autoconfigure.SpringBootApplication; import org.springframework.boot.autoconfigure.security.servlet.SecurityAutoConfiguration; +import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Import; import com.amazonaws.services.s3.model.metrics.MetricsConfiguration; @@ -13,8 +14,11 @@ import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingService import com.knecon.fforesight.service.layoutparser.server.queue.MessagingConfiguration; import com.knecon.fforesight.tenantcommons.MultiTenancyAutoConfiguration; +import io.micrometer.observation.ObservationRegistry; +import io.micrometer.observation.aop.ObservedAspect; + @ImportAutoConfiguration({MultiTenancyAutoConfiguration.class}) -@Import({MetricsConfiguration.class, StorageAutoConfiguration.class, LayoutParsingServiceProcessorConfiguration.class, MessagingConfiguration.class}) +@Import({MetricsConfiguration.class, StorageAutoConfiguration.class, LayoutParsingServiceProcessorConfiguration.class, MessagingConfiguration.class}) @SpringBootApplication(exclude = {SecurityAutoConfiguration.class, ManagementWebSecurityAutoConfiguration.class}) public class Application { @@ -23,4 +27,11 @@ public class Application { SpringApplication.run(Application.class, args); } + + @Bean + public ObservedAspect observedAspect(ObservationRegistry observationRegistry) { + + return new ObservedAspect(observationRegistry); + } + } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java index 8e6255d..4b2358e 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/BdrJsonBuildTest.java @@ -46,12 +46,11 @@ public class BdrJsonBuildTest extends AbstractTest { @SneakyThrows protected Document buildGraph(File file) { - try (PDDocument pdDocument = Loader.loadPDF(file)) { - return DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.TAAS, - pdDocument, - new ImageServiceResponse(), - new TableServiceResponse())); - } + return DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.TAAS, + file, + new ImageServiceResponse(), + new TableServiceResponse(), + file.toString())); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java index b2e35d6..a667c22 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java @@ -6,7 +6,6 @@ import java.util.List; import java.util.Set; import java.util.stream.Collectors; -import org.apache.pdfbox.Loader; import org.assertj.core.api.Assertions; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; @@ -96,9 +95,10 @@ public class HeadlinesGoldStandardIntegrationTest { goldStandardLog.getRedactionLogEntry().forEach(e -> goldStandardHeadlines.add(new Headline(e.getPositions().get(0).getPage(), e.getValue()))); Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, - Loader.loadPDF(pdfFileResource.getFile()), + pdfFileResource.getFile(), new ImageServiceResponse(), - new TableServiceResponse())); + new TableServiceResponse(), + filePath)); var foundHeadlines = documentGraph.streamAllSubNodes() .map(SemanticNode::getHeadline) diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java index 59771aa..0751be3 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java @@ -25,8 +25,8 @@ public class LayoutparserEnd2EndTest extends AbstractTest { @SneakyThrows public void testLayoutParserEndToEnd() { - prepareStorage("files/bdr/btd_19_053_1905391.pdf"); - LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.TAAS); + prepareStorage("files/bdr/Wie weiter bei Kristeneinrichtungen.pdf"); + LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER); LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest); Arrays.stream(finishedEvent.message().split("\n")).forEach(log::info); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentDataTests.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentDataTests.java index 003b94b..3c8bea4 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentDataTests.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentDataTests.java @@ -6,6 +6,7 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.List; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.springframework.core.io.ClassPathResource; @@ -17,7 +18,9 @@ import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest import lombok.SneakyThrows; public class DocumentDataTests extends BuildDocumentTest { + @Test + @Disabled // This test takes waaaaaay too long, it's ridiculous @SneakyThrows public void createDocumentDataForAllFiles() { @@ -36,11 +39,12 @@ public class DocumentDataTests extends BuildDocumentTest { for (String pdfFileName : pdfFileNames) { System.out.println(pdfFileName); DocumentData documentData = DocumentDataMapper.toDocumentData(buildGraph(resource.getFile().toPath().getParent().relativize(Path.of(pdfFileName)).toString())); - File outputFile = Path.of(outPath).resolve(resource.getFile().toPath().relativize(Path.of(pdfFileName))).toFile(); + File outputFile = Path.of(outPath).resolve(resource.getFile().toPath().relativize(Path.of(pdfFileName))).toFile(); outputFile.toPath().getParent().toFile().mkdirs(); try (var out = new FileOutputStream(outputFile.toString().replace(".pdf", ".json"))) { ObjectMapperFactory.create().writeValue(out, documentData); } } } + } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java index 973e0b9..ab829c0 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/DocumentGraphJsonWritingTest.java @@ -5,7 +5,6 @@ import java.io.FileOutputStream; import java.nio.file.Files; import java.nio.file.Path; -import org.apache.pdfbox.Loader; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.springframework.core.io.ClassPathResource; @@ -56,9 +55,10 @@ public class DocumentGraphJsonWritingTest extends BuildDocumentTest { private void writeJsons(Path filename) { Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, - Loader.loadPDF(filename.toFile()), + filename.toFile(), new ImageServiceResponse(), - new TableServiceResponse())); + new TableServiceResponse(), + filename.toFile().toString())); DocumentData documentData = DocumentDataMapper.toDocumentData(documentGraph); ObjectMapper mapper = ObjectMapperFactory.create(); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index e8dd8d6..c8f5207 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -1,32 +1,13 @@ package com.knecon.fforesight.service.layoutparser.server.graph; -import java.io.FileOutputStream; +import java.io.File; import java.nio.file.Path; -import java.util.List; -import org.apache.pdfbox.Loader; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; -import org.springframework.beans.factory.annotation.Autowired; import org.springframework.core.io.ClassPathResource; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure; -import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; -import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; -import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table; -import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; -import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; -import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; -import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; -import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService; -import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService; -import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; -import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper; -import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper; import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService; import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService; import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest; @@ -35,38 +16,17 @@ import lombok.SneakyThrows; public class ViewerDocumentTest extends BuildDocumentTest { - @Autowired - private SectionsBuilderService sectionsBuilderService; - - @Autowired - private RedactManagerClassificationService redactManagerClassificationService; - @Test @SneakyThrows public void testViewerDocument() { - String fileName = "files/bdr/notMergedParagraphs.pdf"; + String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; LayoutGridService layoutGridService = new LayoutGridService(); - ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService); - Document document = buildGraph(fileName, LayoutParsingType.TAAS); - try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getFile()); var out = new FileOutputStream(tmpFileName)) { - viewerDocumentService.createViewerDocument(pdDocument, document, out, true); - } - } - - public ClassificationDocument buildClassificationDocument(PDDocument originDocument) { - - ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, - originDocument, - new ImageServiceResponse(), - new TableServiceResponse()); - - redactManagerClassificationService.classifyDocument(classificationDocument); - - sectionsBuilderService.buildSections(classificationDocument); - - return classificationDocument; + ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService, null); + Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER); + var documentFile = new ClassPathResource(fileName).getFile(); + viewerDocumentService.createViewerDocument(documentFile, document, new File(tmpFileName), true); } } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java index f2e9896..c3a7058 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java @@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.server.segmentation; import static org.assertj.core.api.Assertions.assertThat; import java.awt.geom.Rectangle2D; +import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.nio.file.Path; @@ -15,8 +16,6 @@ import java.util.Locale; import java.util.Map; import java.util.stream.Collectors; -import org.apache.pdfbox.Loader; -import org.apache.pdfbox.pdmodel.PDDocument; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.springframework.beans.factory.annotation.Autowired; @@ -62,12 +61,13 @@ public class PdfSegmentationServiceTest extends AbstractTest { @SneakyThrows - public ClassificationDocument buildClassificationDocument(PDDocument originDocument, TableServiceResponse tableServiceResponse) { + public ClassificationDocument buildClassificationDocument(File originDocument, TableServiceResponse tableServiceResponse) { ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, originDocument, new ImageServiceResponse(), - tableServiceResponse); + tableServiceResponse, + "document"); redactManagerClassificationService.classifyDocument(classificationDocument); @@ -78,7 +78,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { @SneakyThrows - public ClassificationDocument buildClassificationDocument(PDDocument originDocument) { + public ClassificationDocument buildClassificationDocument(File originDocument) { return buildClassificationDocument(originDocument, new TableServiceResponse()); } @@ -89,7 +89,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izRMS (CZ) fRR Part B9_Page185.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); + ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); toHtml(document, "/tmp/A20622A izRMS (CZ) fRR Part B9_Page185.html"); @@ -103,7 +103,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource cvTablesResource = new ClassPathResource("files/cv_tables/ScanRotationBorder.TABLES.json"); var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()), tableServiceResponse); + ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile(), tableServiceResponse); toHtml(document, "/tmp/ScanRotationBorder.html"); } @@ -116,7 +116,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource cvTablesResource = new ClassPathResource("files/cv_tables/ScanRotationBorder.TABLES.json"); var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()), tableServiceResponse); + ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile(), tableServiceResponse); assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty(); var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList(); @@ -156,7 +156,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Spanning Cells.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); + ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty(); TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0); assertThat(table.getColCount()).isEqualTo(6); @@ -170,7 +170,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Table.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); + ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty(); TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0); assertThat(firstTable.getColCount()).isEqualTo(8); @@ -188,7 +188,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Multi Page Table.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); + ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty(); TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0); assertThat(firstTable.getColCount()).isEqualTo(9); @@ -206,7 +206,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Rotated Table Headers.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); + ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty(); TablePageBlock firstTable = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList()).get(0); assertThat(firstTable.getColCount()).isEqualTo(8); @@ -224,7 +224,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/56 Fludioxonil_RAR_12_Volume_3CA_B-7_2018-02-21_Page170.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); + ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); validateTableSize(document, 4); @@ -241,7 +241,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/211.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); + ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); validateTableSize(document, 4); @@ -258,7 +258,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/VV-931175_Page1.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); + ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); validateTableSize(document, 1); @@ -299,7 +299,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/27 A8637C - EU AIR3 - MCP Section 1 - Identity of the plant protection product_Page6.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); + ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); toHtml(document, "/tmp/html.html"); @@ -319,7 +319,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izRMS (CZ) fRR Part B9_Page185.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); + ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); validateTableSize(document, 1); @@ -332,7 +332,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izRMS (CZ) fRR Part B9_Page185_fixed.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); + ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); validateTableSize(document, 1); @@ -345,7 +345,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/A20622A izZRMS (CZ) fRR Part B7_Page123.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); + ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); validateTableSize(document, 6); @@ -364,7 +364,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/77 Pirimicarb_RAR_08_Volume_3CA_B-6_2017-12-04_Page11.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); + ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); validateTableSize(document, 3); @@ -380,7 +380,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/95 Trinexapac-ethyl_RAR_08_Volume_3CA_B-6_2018-01-10_Page532.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); + ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); validateTableSize(document, 1); validateTable(document, 0, 9, 9, 0, 0); @@ -393,7 +393,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/52 Fludioxonil_RAR_07_Volume_3CA_B-5_2018-02-21_Page175.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); + ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); validateTableSize(document, 1); @@ -407,7 +407,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/52 Fludioxonil_RAR_07_Volume_3CA_B-5_2018-02-21_Page174.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); + ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); validateTableSize(document, 1); validateTable(document, 0, 9, 6, 7, 0); @@ -420,7 +420,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/19 Chlorothalonil RAR 08 Volume 3CA B 6b metabolites Oct 2017_Page35.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); + ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); validateTableSize(document, 1); validateTable(document, 0, 10, 6, 0, 0); @@ -433,7 +433,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/19 Chlorothalonil RAR 08 Volume 3CA B 6b metabolites Oct 2017_Page161.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); + ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); validateTableSize(document, 2); validateTable(document, 0, 2, 2, 0, 0); @@ -448,7 +448,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource( "files/SinglePages/47 Cyprodinil - EU AIR3 - MCA Section 5 Supplement - Toxicological and metabolism studies on the active substance_Page30.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); + ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); validateTableSize(document, 2); @@ -464,7 +464,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource( "files/SinglePages/49 Cyprodinil - EU AIR3 - MCA Section 8 Supplement - Ecotoxicological studies on the active substance_Page61.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); + ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); validateTableSize(document, 2); @@ -479,7 +479,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/81 Pirimicarb_RAR_20_Volume_3CP_A10788A (_Pirimor_)_B-9_2017-12-04_Page54.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); + ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); validateTableSize(document, 2); @@ -494,7 +494,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/85 Pydiflumetofen_DAR_08_Volume_3CA_B-6_2017-07-26_Page134.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); + ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); validateTableSize(document, 2); @@ -509,7 +509,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/Thiabendazole DAR Addendum for ED_April_2020_Page18.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); + ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); validateTableSize(document, 4); @@ -526,7 +526,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/15 - Pretilachlor - Acute Oral Toxicity (Up and Down Procedure) - Rat_Page18.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); + ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); validateTableSize(document, 1); @@ -541,7 +541,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource( "files/SinglePages/28 A8637C - EU AIR3 - MCP Section 10 - Ecotoxicological studies on the plant protection product_Page23.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); + ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); validateTableSize(document, 2); @@ -556,7 +556,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/24 - SYN549522 - Acute Oral Toxicity - Rats_Page17.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); + ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); validateTableSize(document, 1); @@ -570,7 +570,7 @@ public class PdfSegmentationServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/SinglePages/30 - Dicamba - Acute Oral Toxicity - Rats_Page5.pdf"); - ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); + ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile()); validateTableSize(document, 1); diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java index ae7e418..d712814 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java @@ -58,7 +58,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest { public void testTableExtraction() { LayoutGridService layoutGridService = new LayoutGridService(); - ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService); + ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService, null); ClassPathResource resource = new ClassPathResource("files"); List pdfFileNames = Files.walk(resource.getFile().toPath()) @@ -77,13 +77,15 @@ public class RulingCleaningServiceTest extends BuildDocumentTest { private void writeJsons(Path filename) { Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, - Loader.loadPDF(filename.toFile()), + filename.toFile(), new ImageServiceResponse(), - new TableServiceResponse())); + new TableServiceResponse(), + filename.toFile().toString())); Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, - Loader.loadPDF(filename.toFile()), + filename.toFile(), new ImageServiceResponse(), - new TableServiceResponse())); + new TableServiceResponse(), + filename.toFile().toString())); DocumentData documentDataBefore = DocumentDataMapper.toDocumentData(documentGraphBefore); DocumentData documentDataAfter = DocumentDataMapper.toDocumentData(documentGraphAfter); if (!compareStructures(documentDataBefore.getDocumentStructure(), documentDataAfter.getDocumentStructure(), filename.getFileName().toString())) { diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/AbstractTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/AbstractTest.java index 4f56c76..a4ebaca 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/AbstractTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/AbstractTest.java @@ -1,5 +1,26 @@ package com.knecon.fforesight.service.layoutparser.server.utils; +import java.io.InputStream; +import java.util.Map; +import java.util.Optional; + +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.extension.ExtendWith; +import org.springframework.amqp.rabbit.core.RabbitTemplate; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.autoconfigure.EnableAutoConfiguration; +import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.boot.test.mock.mockito.MockBean; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.ComponentScan; +import org.springframework.context.annotation.Configuration; +import org.springframework.context.annotation.Import; +import org.springframework.context.annotation.Primary; +import org.springframework.core.io.ClassPathResource; +import org.springframework.test.context.junit.jupiter.SpringExtension; + import com.iqser.red.commons.jackson.ObjectMapperFactory; import com.iqser.red.storage.commons.service.StorageService; import com.iqser.red.storage.commons.utils.FileSystemBackedStorageService; @@ -9,22 +30,8 @@ import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingStorage import com.knecon.fforesight.service.layoutparser.server.Application; import com.knecon.fforesight.tenantcommons.TenantContext; import com.knecon.fforesight.tenantcommons.TenantsClient; -import lombok.SneakyThrows; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.extension.ExtendWith; -import org.springframework.amqp.rabbit.core.RabbitTemplate; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.boot.autoconfigure.EnableAutoConfiguration; -import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration; -import org.springframework.boot.test.context.SpringBootTest; -import org.springframework.boot.test.mock.mockito.MockBean; -import org.springframework.context.annotation.*; -import org.springframework.core.io.ClassPathResource; -import org.springframework.test.context.junit.jupiter.SpringExtension; -import java.io.InputStream; -import java.util.Optional; +import lombok.SneakyThrows; @ExtendWith(SpringExtension.class) @SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT) @@ -100,9 +107,11 @@ public abstract class AbstractTest { return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER); } + protected LayoutParsingRequest buildDefaultLayoutParsingRequest(LayoutParsingType layoutParsingType) { return LayoutParsingRequest.builder() + .identifier(Map.of("fileId", "1337")) .layoutParsingType(layoutParsingType) .originFileStorageId(ORIGIN_FILE_ID) .tablesFileStorageId(Optional.of(TABLE_FILE_ID)) @@ -116,6 +125,7 @@ public abstract class AbstractTest { .build(); } + @SneakyThrows protected LayoutParsingRequest prepareStorage(String file, String cvServiceResponseFile, String imageInfoFile) { @@ -152,7 +162,6 @@ public abstract class AbstractTest { @ComponentScan("com.knecon.fforesight.service.layoutparser") public static class TestConfiguration { - @Bean @Primary public StorageService inmemoryStorage() { diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java index dc9d0d6..69ed656 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/BuildDocumentTest.java @@ -1,9 +1,7 @@ package com.knecon.fforesight.service.layoutparser.server.utils; -import java.io.InputStream; +import java.io.File; -import org.apache.pdfbox.Loader; -import org.apache.pdfbox.pdmodel.PDDocument; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.core.io.ClassPathResource; @@ -25,11 +23,9 @@ public abstract class BuildDocumentTest extends AbstractTest { @SneakyThrows protected ClassificationDocument parseLayout(String filename, LayoutParsingType layoutParsingType) { - ClassPathResource fileResource = new ClassPathResource(filename); + File fileResource = new ClassPathResource(filename).getFile(); prepareStorage(filename); - try (InputStream inputStream = fileResource.getInputStream(); PDDocument pdDocument = Loader.loadPDF(inputStream.readAllBytes())) { - return layoutParsingPipeline.parseLayout(layoutParsingType, pdDocument, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse()); - } + return layoutParsingPipeline.parseLayout(layoutParsingType, fileResource, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse(), filename); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/application.yml b/layoutparser-service/layoutparser-service-server/src/test/resources/application.yml index 18e4690..8cee17b 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/resources/application.yml +++ b/layoutparser-service/layoutparser-service-server/src/test/resources/application.yml @@ -7,6 +7,9 @@ fforesight.tenants.remote: true server: port: 8080 +logging.pattern.level: "%5p [${spring.application.name},%X{traceId:-},%X{spanId:-}]" +logging.type: ${LOGGING_TYPE:CONSOLE} + spring: main: allow-circular-references: true # FIXME diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/log4j2-test.xml b/layoutparser-service/layoutparser-service-server/src/test/resources/log4j2-test.xml deleted file mode 100644 index b4895cf..0000000 --- a/layoutparser-service/layoutparser-service-server/src/test/resources/log4j2-test.xml +++ /dev/null @@ -1,16 +0,0 @@ - - - - - - - - - - - - - - - - diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/logback-spring.xml b/layoutparser-service/layoutparser-service-server/src/test/resources/logback-spring.xml new file mode 100644 index 0000000..33b2cef --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/resources/logback-spring.xml @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/publish-custom-image.sh b/publish-custom-image.sh index 2610fc3..c8c81d1 100755 --- a/publish-custom-image.sh +++ b/publish-custom-image.sh @@ -2,7 +2,14 @@ dir=${PWD##*/} gradle assemble -buildNumber=${1:-1} +# Get the current Git branch +branch=$(git rev-parse --abbrev-ref HEAD) -gradle bootBuildImage --cleanCache --publishImage -PbuildbootDockerHostNetwork=true -Pversion=$USER-$buildNumber -echo "nexus.knecon.com:5001/red/${dir}-server-v1:$USER-$buildNumber" +# Get the short commit hash (first 5 characters) +commit_hash=$(git rev-parse --short=5 HEAD) + +# Combine branch and commit hash +buildName="${USER}-${branch}-${commit_hash}" + +gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=$buildName --no-build-cache +echo "nexus.knecon.com:5001/ff/${dir}-service-server:$buildName"