TAAS-89: added log entry and an end2end test
This commit is contained in:
parent
aed4a55787
commit
28ec4c9ccb
@ -10,13 +10,13 @@ import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
@ -50,6 +50,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.mapper.Docu
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
@ -116,16 +117,45 @@ public class LayoutParsingPipeline {
|
||||
.identifier(layoutParsingRequest.identifier())
|
||||
.numberOfPages(numberOfPages)
|
||||
.duration(System.currentTimeMillis() - start)
|
||||
.message(format("Layout parsing is finished and files have been saved with Ids:\n Structure: %s\nText: %s\nPositions: %s\nPageData: %s",
|
||||
.message(format("""
|
||||
Layout parsing has finished in %.02f s.
|
||||
identifiers: %s
|
||||
%s
|
||||
Files have been saved with Ids:
|
||||
Structure: %s
|
||||
Text: %s
|
||||
Positions: %s
|
||||
PageData: %s
|
||||
Simplified Text: %s
|
||||
Viewer Doc: %s""",
|
||||
((float) (System.currentTimeMillis() - start)) / 1000,
|
||||
layoutParsingRequest.identifier(),
|
||||
buildSemanticNodeCountMessage(numberOfPages, documentGraph.buildSemanticNodeCounts()),
|
||||
layoutParsingRequest.structureFileStorageId(),
|
||||
layoutParsingRequest.textBlockFileStorageId(),
|
||||
layoutParsingRequest.positionBlockFileStorageId(),
|
||||
layoutParsingRequest.pageFileStorageId()))
|
||||
layoutParsingRequest.pageFileStorageId(),
|
||||
layoutParsingRequest.simplifiedTextStorageId(),
|
||||
layoutParsingRequest.viewerDocumentStorageId()))
|
||||
.build();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
|
||||
|
||||
return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
|
||||
numberOfPages,
|
||||
semanticNodeCounts.get(NodeType.SECTION),
|
||||
semanticNodeCounts.get(NodeType.HEADLINE),
|
||||
semanticNodeCounts.get(NodeType.PARAGRAPH),
|
||||
semanticNodeCounts.get(NodeType.TABLE),
|
||||
semanticNodeCounts.get(NodeType.TABLE_CELL),
|
||||
semanticNodeCounts.get(NodeType.HEADER),
|
||||
semanticNodeCounts.get(NodeType.FOOTER));
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public ClassificationDocument parseLayout(LayoutParsingType layoutParsingType,
|
||||
PDDocument originDocument,
|
||||
@ -166,12 +196,9 @@ public class LayoutParsingPipeline {
|
||||
stripper.getMaxCharHeight());
|
||||
|
||||
ClassificationPage classificationPage = switch (layoutParsingType) {
|
||||
case REDACT_MANAGER ->
|
||||
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case TAAS ->
|
||||
taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case DOCUMINE ->
|
||||
docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
};
|
||||
classificationPage.setCleanRulings(cleanRulings);
|
||||
classificationPage.setRotation(rotation);
|
||||
@ -212,12 +239,14 @@ public class LayoutParsingPipeline {
|
||||
|
||||
|
||||
private Map<String, Rectangle2D> convertMarkedContents(List<PDMarkedContent> pdMarkedContents) {
|
||||
|
||||
Map<String, Rectangle2D> markedContentBboxes = new HashMap<>();
|
||||
markedContentBboxes.put(MarkedContentUtils.HEADER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.HEADER));
|
||||
markedContentBboxes.put(MarkedContentUtils.FOOTER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.FOOTER));
|
||||
return markedContentBboxes;
|
||||
}
|
||||
|
||||
|
||||
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
|
||||
|
||||
if (!classificationPage.isLandscape()) {
|
||||
|
||||
@ -100,6 +100,11 @@ public class Document implements GenericSemanticNode {
|
||||
}
|
||||
|
||||
|
||||
public Map<NodeType, Long> buildSemanticNodeCounts() {
|
||||
|
||||
return streamAllSubNodes().collect(Collectors.groupingBy(SemanticNode::getType, Collectors.counting()));
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
|
||||
@ -51,7 +51,6 @@ public class ViewerDocumentService {
|
||||
@SneakyThrows
|
||||
public void createViewerDocument(PDDocument pdDocument, Document document, OutputStream outputStream, boolean layerVisibilityDefaultValue) {
|
||||
|
||||
log.info("Start Viewer Document Creation");
|
||||
LayoutGrid layoutGrid = layoutGridService.createLayoutGrid(document);
|
||||
// PDDocument.save() is very slow, since it actually traverses the entire pdf and writes a new one.
|
||||
// If we collect all COSDictionaries we changed and tell it explicitly to only add the changed ones by using saveIncremental it's very fast.
|
||||
@ -122,7 +121,6 @@ public class ViewerDocumentService {
|
||||
dictionariesToUpdate.add(pdDocument.getDocumentInformation().getCOSObject());
|
||||
// dictionariesToUpdate.add(pdDocument.getDocument().getTrailer());
|
||||
pdDocument.saveIncremental(outputStream, dictionariesToUpdate);
|
||||
log.info("Saved Viewer Document");
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -1,5 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.queue;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.springframework.amqp.AmqpRejectAndDontRequeueException;
|
||||
import org.springframework.amqp.core.Message;
|
||||
import org.springframework.amqp.rabbit.annotation.RabbitHandler;
|
||||
@ -45,12 +47,12 @@ public class MessageHandler {
|
||||
}
|
||||
LayoutParsingFinishedEvent layoutParsingFinishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
||||
sendLayoutParsingFinishedEvent(layoutParsingFinishedEvent);
|
||||
log.info("Layout parsing finished {} in {} ms", layoutParsingRequest.identifier(), layoutParsingFinishedEvent.duration());
|
||||
}
|
||||
|
||||
|
||||
public void sendLayoutParsingFinishedEvent(LayoutParsingFinishedEvent layoutParsingFinishedEvent) {
|
||||
|
||||
Arrays.stream(layoutParsingFinishedEvent.message().split("\n")).forEach(log::info);
|
||||
rabbitTemplate.convertAndSend(LayoutParsingQueueNames.LAYOUT_PARSING_FINISHED_EVENT_QUEUE, layoutParsingFinishedEvent);
|
||||
}
|
||||
|
||||
|
||||
@ -0,0 +1,34 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||
|
||||
@Autowired
|
||||
private LayoutParsingPipeline layoutParsingPipeline;
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testLayoutParserEndToEnd() {
|
||||
|
||||
prepareStorage("files/bdr/btd_19_053_1905391.pdf");
|
||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.TAAS);
|
||||
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
||||
Arrays.stream(finishedEvent.message().split("\n")).forEach(log::info);
|
||||
}
|
||||
|
||||
}
|
||||
@ -101,8 +101,13 @@ public abstract class AbstractTest {
|
||||
protected LayoutParsingRequest prepareStorage(InputStream fileInputStream) {
|
||||
|
||||
storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileInputStream);
|
||||
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
|
||||
}
|
||||
|
||||
protected LayoutParsingRequest buildDefaultLayoutParsingRequest(LayoutParsingType layoutParsingType) {
|
||||
|
||||
return LayoutParsingRequest.builder()
|
||||
.layoutParsingType(LayoutParsingType.REDACT_MANAGER)
|
||||
.layoutParsingType(layoutParsingType)
|
||||
.originFileStorageId(ORIGIN_FILE_ID)
|
||||
.tablesFileStorageId(Optional.of(TABLE_FILE_ID))
|
||||
.imagesFileStorageId(Optional.of(IMAGE_FILE_ID))
|
||||
@ -116,7 +121,6 @@ public abstract class AbstractTest {
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
protected LayoutParsingRequest prepareStorage(String file, String cvServiceResponseFile, String imageInfoFile) {
|
||||
|
||||
@ -135,19 +139,7 @@ public abstract class AbstractTest {
|
||||
storageService.storeObject(TenantContext.getTenantId(), TABLE_FILE_ID, cvServiceResponseFileStream);
|
||||
storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream);
|
||||
|
||||
return LayoutParsingRequest.builder()
|
||||
.layoutParsingType(LayoutParsingType.REDACT_MANAGER)
|
||||
.originFileStorageId(ORIGIN_FILE_ID)
|
||||
.tablesFileStorageId(Optional.of(TABLE_FILE_ID))
|
||||
.imagesFileStorageId(Optional.of(IMAGE_FILE_ID))
|
||||
.structureFileStorageId(STRUCTURE_FILE_ID)
|
||||
.textBlockFileStorageId(TEXT_FILE_ID)
|
||||
.positionBlockFileStorageId(POSITION_FILE_ID)
|
||||
.pageFileStorageId(PAGES_FILE_ID)
|
||||
.simplifiedTextStorageId(SIMPLIFIED_ID)
|
||||
.sectionGridStorageId(SECTION_GRID_ID)
|
||||
.viewerDocumentStorageId(VIEWER_DOCUMENT_ID)
|
||||
.build();
|
||||
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
|
||||
}
|
||||
|
||||
|
||||
|
||||
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user