Merge branch 'TAAS-89' into 'main'

TAAS-89: added log entry and an end2end test

See merge request fforesight/layout-parser!71
This commit is contained in:
Kilian Schüttler 2023-08-31 14:40:48 +02:00
commit 754fd8f933
7 changed files with 87 additions and 27 deletions

View File

@ -10,13 +10,13 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
@ -50,6 +50,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.mapper.Docu
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
@ -116,16 +117,45 @@ public class LayoutParsingPipeline {
.identifier(layoutParsingRequest.identifier())
.numberOfPages(numberOfPages)
.duration(System.currentTimeMillis() - start)
.message(format("Layout parsing is finished and files have been saved with Ids:\n Structure: %s\nText: %s\nPositions: %s\nPageData: %s",
.message(format("""
Layout parsing has finished in %.02f s.
identifiers: %s
%s
Files have been saved with Ids:
Structure: %s
Text: %s
Positions: %s
PageData: %s
Simplified Text: %s
Viewer Doc: %s""",
((float) (System.currentTimeMillis() - start)) / 1000,
layoutParsingRequest.identifier(),
buildSemanticNodeCountMessage(numberOfPages, documentGraph.buildSemanticNodeCounts()),
layoutParsingRequest.structureFileStorageId(),
layoutParsingRequest.textBlockFileStorageId(),
layoutParsingRequest.positionBlockFileStorageId(),
layoutParsingRequest.pageFileStorageId()))
layoutParsingRequest.pageFileStorageId(),
layoutParsingRequest.simplifiedTextStorageId(),
layoutParsingRequest.viewerDocumentStorageId()))
.build();
}
}
/**
 * Builds the one-line summary of how many semantic nodes of each kind were parsed.
 * <p>
 * The counts map may have no entry for a node type (for example when the document contains no tables).
 * Such types are reported as 0 rather than being formatted as "null".
 *
 * @param numberOfPages      number of pages to report
 * @param semanticNodeCounts number of nodes per node type; entries may be missing
 * @return the formatted summary sentence
 */
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {

    // Map.get would yield null for an absent type and "%d" renders null as the text "null".
    return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
            numberOfPages,
            semanticNodeCounts.getOrDefault(NodeType.SECTION, 0L),
            semanticNodeCounts.getOrDefault(NodeType.HEADLINE, 0L),
            semanticNodeCounts.getOrDefault(NodeType.PARAGRAPH, 0L),
            semanticNodeCounts.getOrDefault(NodeType.TABLE, 0L),
            semanticNodeCounts.getOrDefault(NodeType.TABLE_CELL, 0L),
            semanticNodeCounts.getOrDefault(NodeType.HEADER, 0L),
            semanticNodeCounts.getOrDefault(NodeType.FOOTER, 0L));
}
@SneakyThrows
public ClassificationDocument parseLayout(LayoutParsingType layoutParsingType,
PDDocument originDocument,
@ -166,12 +196,9 @@ public class LayoutParsingPipeline {
stripper.getMaxCharHeight());
ClassificationPage classificationPage = switch (layoutParsingType) {
case REDACT_MANAGER ->
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case TAAS ->
taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case DOCUMINE ->
docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
};
classificationPage.setCleanRulings(cleanRulings);
classificationPage.setRotation(rotation);
@ -212,12 +239,14 @@ public class LayoutParsingPipeline {
/**
 * Collects one bounding box per marked-content kind (header and footer).
 *
 * @param pdMarkedContents the marked contents handed to {@code MarkedContentUtils.getMarkedContentBboxPerLine}
 * @return a mutable map keyed by {@code MarkedContentUtils.HEADER} and {@code MarkedContentUtils.FOOTER}
 */
private Map<String, Rectangle2D> convertMarkedContents(List<PDMarkedContent> pdMarkedContents) {

    Map<String, Rectangle2D> bboxByContentKind = new HashMap<>();
    // Same lookup for both kinds, header first and footer second.
    for (String contentKind : new String[]{MarkedContentUtils.HEADER, MarkedContentUtils.FOOTER}) {
        Rectangle2D bbox = MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, contentKind);
        bboxByContentKind.put(contentKind, bbox);
    }
    return bboxByContentKind;
}
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
if (!classificationPage.isLandscape()) {

View File

@ -100,6 +100,11 @@ public class Document implements GenericSemanticNode {
}
/**
 * Counts the nodes returned by {@code streamAllSubNodes()}, grouped by their node type.
 *
 * @return number of nodes per node type; a type that does not occur has no entry in the map
 */
public Map<NodeType, Long> buildSemanticNodeCounts() {

    Map<NodeType, Long> nodeCountByType = streamAllSubNodes()
            .collect(Collectors.groupingBy(SemanticNode::getType, Collectors.counting()));
    return nodeCountByType;
}
@Override
public String toString() {

View File

@ -51,7 +51,6 @@ public class ViewerDocumentService {
@SneakyThrows
public void createViewerDocument(PDDocument pdDocument, Document document, OutputStream outputStream, boolean layerVisibilityDefaultValue) {
log.info("Start Viewer Document Creation");
LayoutGrid layoutGrid = layoutGridService.createLayoutGrid(document);
// PDDocument.save() is very slow, since it actually traverses the entire pdf and writes a new one.
// If we collect all COSDictionaries we changed and tell it explicitly to only add the changed ones by using saveIncremental it's very fast.
@ -122,7 +121,6 @@ public class ViewerDocumentService {
dictionariesToUpdate.add(pdDocument.getDocumentInformation().getCOSObject());
// dictionariesToUpdate.add(pdDocument.getDocument().getTrailer());
pdDocument.saveIncremental(outputStream, dictionariesToUpdate);
log.info("Saved Viewer Document");
}

View File

@ -1,5 +1,7 @@
package com.knecon.fforesight.service.layoutparser.server.queue;
import java.util.Arrays;
import org.springframework.amqp.AmqpRejectAndDontRequeueException;
import org.springframework.amqp.core.Message;
import org.springframework.amqp.rabbit.annotation.RabbitHandler;
@ -45,12 +47,12 @@ public class MessageHandler {
}
LayoutParsingFinishedEvent layoutParsingFinishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
sendLayoutParsingFinishedEvent(layoutParsingFinishedEvent);
log.info("Layout parsing finished {} in {} ms", layoutParsingRequest.identifier(), layoutParsingFinishedEvent.duration());
}
/**
 * Logs the event's message, one log entry per line, and then sends the event to the
 * layout-parsing-finished queue.
 *
 * @param layoutParsingFinishedEvent the event to log and send
 */
public void sendLayoutParsingFinishedEvent(LayoutParsingFinishedEvent layoutParsingFinishedEvent) {

    // The message is multi-line; each line is logged as its own entry.
    String[] messageLines = layoutParsingFinishedEvent.message().split("\n");
    Arrays.asList(messageLines).forEach(messageLine -> log.info(messageLine));

    rabbitTemplate.convertAndSend(LayoutParsingQueueNames.LAYOUT_PARSING_FINISHED_EVENT_QUEUE, layoutParsingFinishedEvent);
}

View File

@ -0,0 +1,34 @@
package com.knecon.fforesight.service.layoutparser.server;
import java.util.Arrays;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
/**
 * End-to-end smoke test: runs the layout parsing pipeline on a stored test PDF with the TAAS
 * parsing type and logs the resulting finished-event message line by line.
 */
@Slf4j
public class LayoutparserEnd2EndTest extends AbstractTest {

    @Autowired
    private LayoutParsingPipeline layoutParsingPipeline;


    @Test
    @SneakyThrows
    public void testLayoutParserEndToEnd() {

        // Put the input file into storage before the pipeline is run.
        prepareStorage("files/bdr/btd_19_053_1905391.pdf");

        LayoutParsingRequest taasRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.TAAS);
        LayoutParsingFinishedEvent result = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(taasRequest);

        // Log the multi-line summary one line per log entry.
        String[] messageLines = result.message().split("\n");
        Arrays.asList(messageLines).forEach(messageLine -> log.info(messageLine));
    }

}

View File

@ -101,8 +101,13 @@ public abstract class AbstractTest {
/**
 * Stores the given stream under the origin file id for the current tenant and returns the
 * default layout parsing request of type {@code REDACT_MANAGER}.
 *
 * @param fileInputStream content to store as the origin file
 * @return the default request built for {@code LayoutParsingType.REDACT_MANAGER}
 */
protected LayoutParsingRequest prepareStorage(InputStream fileInputStream) {

    storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileInputStream);

    LayoutParsingRequest redactManagerRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
    return redactManagerRequest;
}
protected LayoutParsingRequest buildDefaultLayoutParsingRequest(LayoutParsingType layoutParsingType) {
return LayoutParsingRequest.builder()
.layoutParsingType(LayoutParsingType.REDACT_MANAGER)
.layoutParsingType(layoutParsingType)
.originFileStorageId(ORIGIN_FILE_ID)
.tablesFileStorageId(Optional.of(TABLE_FILE_ID))
.imagesFileStorageId(Optional.of(IMAGE_FILE_ID))
@ -116,7 +121,6 @@ public abstract class AbstractTest {
.build();
}
@SneakyThrows
protected LayoutParsingRequest prepareStorage(String file, String cvServiceResponseFile, String imageInfoFile) {
@ -135,19 +139,7 @@ public abstract class AbstractTest {
storageService.storeObject(TenantContext.getTenantId(), TABLE_FILE_ID, cvServiceResponseFileStream);
storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream);
return LayoutParsingRequest.builder()
.layoutParsingType(LayoutParsingType.REDACT_MANAGER)
.originFileStorageId(ORIGIN_FILE_ID)
.tablesFileStorageId(Optional.of(TABLE_FILE_ID))
.imagesFileStorageId(Optional.of(IMAGE_FILE_ID))
.structureFileStorageId(STRUCTURE_FILE_ID)
.textBlockFileStorageId(TEXT_FILE_ID)
.positionBlockFileStorageId(POSITION_FILE_ID)
.pageFileStorageId(PAGES_FILE_ID)
.simplifiedTextStorageId(SIMPLIFIED_ID)
.sectionGridStorageId(SECTION_GRID_ID)
.viewerDocumentStorageId(VIEWER_DOCUMENT_ID)
.build();
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
}