diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index fa8d7d7..e62c55a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -10,13 +10,13 @@ import java.util.HashMap; import java.util.List; import java.util.Map; -import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent; import org.springframework.stereotype.Service; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; @@ -50,6 +50,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.mapper.Docu import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper; import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper; import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService; +import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import lombok.RequiredArgsConstructor; import 
lombok.SneakyThrows; @@ -116,16 +117,45 @@ public class LayoutParsingPipeline { .identifier(layoutParsingRequest.identifier()) .numberOfPages(numberOfPages) .duration(System.currentTimeMillis() - start) - .message(format("Layout parsing is finished and files have been saved with Ids:\n Structure: %s\nText: %s\nPositions: %s\nPageData: %s", + .message(format(""" + Layout parsing has finished in %.02f s. + identifiers: %s + %s + Files have been saved with Ids: + Structure: %s + Text: %s + Positions: %s + PageData: %s + Simplified Text: %s + Viewer Doc: %s""", + ((float) (System.currentTimeMillis() - start)) / 1000, + layoutParsingRequest.identifier(), + buildSemanticNodeCountMessage(numberOfPages, documentGraph.buildSemanticNodeCounts()), layoutParsingRequest.structureFileStorageId(), layoutParsingRequest.textBlockFileStorageId(), layoutParsingRequest.positionBlockFileStorageId(), - layoutParsingRequest.pageFileStorageId())) + layoutParsingRequest.pageFileStorageId(), + layoutParsingRequest.simplifiedTextStorageId(), + layoutParsingRequest.viewerDocumentStorageId())) .build(); } } + private String buildSemanticNodeCountMessage(int numberOfPages, Map semanticNodeCounts) { + /* A NodeType that never occurs in the document has no entry in the grouped counts; default to 0 so that %d does not render it as null. */ + return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed", + numberOfPages, + semanticNodeCounts.getOrDefault(NodeType.SECTION, 0L), + semanticNodeCounts.getOrDefault(NodeType.HEADLINE, 0L), + semanticNodeCounts.getOrDefault(NodeType.PARAGRAPH, 0L), + semanticNodeCounts.getOrDefault(NodeType.TABLE, 0L), + semanticNodeCounts.getOrDefault(NodeType.TABLE_CELL, 0L), + semanticNodeCounts.getOrDefault(NodeType.HEADER, 0L), + semanticNodeCounts.getOrDefault(NodeType.FOOTER, 0L)); + } + + @SneakyThrows public ClassificationDocument parseLayout(LayoutParsingType layoutParsingType, PDDocument originDocument, @@ -166,12 +196,9 @@ public class LayoutParsingPipeline { stripper.getMaxCharHeight()); ClassificationPage classificationPage = switch (layoutParsingType) { - case REDACT_MANAGER -> - 
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); - case TAAS -> - taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); - case DOCUMINE -> - docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); + case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); + case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); + case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); }; classificationPage.setCleanRulings(cleanRulings); classificationPage.setRotation(rotation); @@ -212,12 +239,14 @@ public class LayoutParsingPipeline { private Map convertMarkedContents(List pdMarkedContents) { + Map markedContentBboxes = new HashMap<>(); markedContentBboxes.put(MarkedContentUtils.HEADER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.HEADER)); markedContentBboxes.put(MarkedContentUtils.FOOTER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.FOOTER)); return markedContentBboxes; } + private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) { if (!classificationPage.isLandscape()) { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java index e3cc3e2..0df92c2 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Document.java @@ -100,6 +100,11 @@ public class Document implements GenericSemanticNode { } + public Map buildSemanticNodeCounts() { + + return streamAllSubNodes().collect(Collectors.groupingBy(SemanticNode::getType, Collectors.counting())); + } + @Override public String toString() { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/ViewerDocumentService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/ViewerDocumentService.java index 39bba26..2290eed 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/ViewerDocumentService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/visualization/ViewerDocumentService.java @@ -51,7 +51,6 @@ public class ViewerDocumentService { @SneakyThrows public void createViewerDocument(PDDocument pdDocument, Document document, OutputStream outputStream, boolean layerVisibilityDefaultValue) { - log.info("Start Viewer Document Creation"); LayoutGrid layoutGrid = layoutGridService.createLayoutGrid(document); // PDDocument.save() is very slow, since it actually traverses the entire pdf and writes a new one. // If we collect all COSDictionaries we changed and tell it explicitly to only add the changed ones by using saveIncremental it's very fast. 
@@ -122,7 +121,6 @@ public class ViewerDocumentService { dictionariesToUpdate.add(pdDocument.getDocumentInformation().getCOSObject()); // dictionariesToUpdate.add(pdDocument.getDocument().getTrailer()); pdDocument.saveIncremental(outputStream, dictionariesToUpdate); - log.info("Saved Viewer Document"); } diff --git a/layoutparser-service/layoutparser-service-server/src/main/java/com/knecon/fforesight/service/layoutparser/server/queue/MessageHandler.java b/layoutparser-service/layoutparser-service-server/src/main/java/com/knecon/fforesight/service/layoutparser/server/queue/MessageHandler.java index 2e58f2f..22f4899 100644 --- a/layoutparser-service/layoutparser-service-server/src/main/java/com/knecon/fforesight/service/layoutparser/server/queue/MessageHandler.java +++ b/layoutparser-service/layoutparser-service-server/src/main/java/com/knecon/fforesight/service/layoutparser/server/queue/MessageHandler.java @@ -1,5 +1,7 @@ package com.knecon.fforesight.service.layoutparser.server.queue; +import java.util.Arrays; + import org.springframework.amqp.AmqpRejectAndDontRequeueException; import org.springframework.amqp.core.Message; import org.springframework.amqp.rabbit.annotation.RabbitHandler; @@ -45,12 +47,12 @@ public class MessageHandler { } LayoutParsingFinishedEvent layoutParsingFinishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest); sendLayoutParsingFinishedEvent(layoutParsingFinishedEvent); - log.info("Layout parsing finished {} in {} ms", layoutParsingRequest.identifier(), layoutParsingFinishedEvent.duration()); } public void sendLayoutParsingFinishedEvent(LayoutParsingFinishedEvent layoutParsingFinishedEvent) { + Arrays.stream(layoutParsingFinishedEvent.message().split("\n")).forEach(log::info); rabbitTemplate.convertAndSend(LayoutParsingQueueNames.LAYOUT_PARSING_FINISHED_EVENT_QUEUE, layoutParsingFinishedEvent); } diff --git 
a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java new file mode 100644 index 0000000..59771aa --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java @@ -0,0 +1,37 @@ +package com.knecon.fforesight.service.layoutparser.server; + +import java.util.Arrays; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; +import org.springframework.beans.factory.annotation.Autowired; + +import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent; +import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; +import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; +import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline; +import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest; + +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; + +/** Runs the layout parsing pipeline on a sample PDF, logs the finished-event message and checks that pages were parsed. */ +@Slf4j +public class LayoutparserEnd2EndTest extends AbstractTest { + + @Autowired + private LayoutParsingPipeline layoutParsingPipeline; + + + @Test + @SneakyThrows + public void testLayoutParserEndToEnd() { + + prepareStorage("files/bdr/btd_19_053_1905391.pdf"); + LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.TAAS); + LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest); + Arrays.stream(finishedEvent.message().split("\n")).forEach(log::info); + Assertions.assertTrue(finishedEvent.numberOfPages() > 0, "expected at least one parsed page"); + } + +} diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/AbstractTest.java 
b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/AbstractTest.java index b9b58b3..0f02ad7 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/AbstractTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/AbstractTest.java @@ -101,8 +101,13 @@ public abstract class AbstractTest { protected LayoutParsingRequest prepareStorage(InputStream fileInputStream) { storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileInputStream); + return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER); + } + + protected LayoutParsingRequest buildDefaultLayoutParsingRequest(LayoutParsingType layoutParsingType) { + return LayoutParsingRequest.builder() - .layoutParsingType(LayoutParsingType.REDACT_MANAGER) + .layoutParsingType(layoutParsingType) .originFileStorageId(ORIGIN_FILE_ID) .tablesFileStorageId(Optional.of(TABLE_FILE_ID)) .imagesFileStorageId(Optional.of(IMAGE_FILE_ID)) @@ -116,7 +121,6 @@ public abstract class AbstractTest { .build(); } - @SneakyThrows protected LayoutParsingRequest prepareStorage(String file, String cvServiceResponseFile, String imageInfoFile) { @@ -135,19 +139,7 @@ public abstract class AbstractTest { storageService.storeObject(TenantContext.getTenantId(), TABLE_FILE_ID, cvServiceResponseFileStream); storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream); - return LayoutParsingRequest.builder() - .layoutParsingType(LayoutParsingType.REDACT_MANAGER) - .originFileStorageId(ORIGIN_FILE_ID) - .tablesFileStorageId(Optional.of(TABLE_FILE_ID)) - .imagesFileStorageId(Optional.of(IMAGE_FILE_ID)) - .structureFileStorageId(STRUCTURE_FILE_ID) - .textBlockFileStorageId(TEXT_FILE_ID) - .positionBlockFileStorageId(POSITION_FILE_ID) - .pageFileStorageId(PAGES_FILE_ID) - 
.simplifiedTextStorageId(SIMPLIFIED_ID) - .sectionGridStorageId(SECTION_GRID_ID) - .viewerDocumentStorageId(VIEWER_DOCUMENT_ID) - .build(); + return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/bdr/btd_19_053_1905391.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/bdr/btd_19_053_1905391.pdf new file mode 100644 index 0000000..45d8ffd Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/bdr/btd_19_053_1905391.pdf differ