TAAS-89: added log entry and an end2end test
This commit is contained in:
parent
aed4a55787
commit
28ec4c9ccb
@ -10,13 +10,13 @@ import java.util.HashMap;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
import org.apache.pdfbox.pdmodel.PDPage;
|
import org.apache.pdfbox.pdmodel.PDPage;
|
||||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||||
@ -50,6 +50,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.mapper.Docu
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
|
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
|
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
@ -116,16 +117,45 @@ public class LayoutParsingPipeline {
|
|||||||
.identifier(layoutParsingRequest.identifier())
|
.identifier(layoutParsingRequest.identifier())
|
||||||
.numberOfPages(numberOfPages)
|
.numberOfPages(numberOfPages)
|
||||||
.duration(System.currentTimeMillis() - start)
|
.duration(System.currentTimeMillis() - start)
|
||||||
.message(format("Layout parsing is finished and files have been saved with Ids:\n Structure: %s\nText: %s\nPositions: %s\nPageData: %s",
|
.message(format("""
|
||||||
|
Layout parsing has finished in %.02f s.
|
||||||
|
identifiers: %s
|
||||||
|
%s
|
||||||
|
Files have been saved with Ids:
|
||||||
|
Structure: %s
|
||||||
|
Text: %s
|
||||||
|
Positions: %s
|
||||||
|
PageData: %s
|
||||||
|
Simplified Text: %s
|
||||||
|
Viewer Doc: %s""",
|
||||||
|
((float) (System.currentTimeMillis() - start)) / 1000,
|
||||||
|
layoutParsingRequest.identifier(),
|
||||||
|
buildSemanticNodeCountMessage(numberOfPages, documentGraph.buildSemanticNodeCounts()),
|
||||||
layoutParsingRequest.structureFileStorageId(),
|
layoutParsingRequest.structureFileStorageId(),
|
||||||
layoutParsingRequest.textBlockFileStorageId(),
|
layoutParsingRequest.textBlockFileStorageId(),
|
||||||
layoutParsingRequest.positionBlockFileStorageId(),
|
layoutParsingRequest.positionBlockFileStorageId(),
|
||||||
layoutParsingRequest.pageFileStorageId()))
|
layoutParsingRequest.pageFileStorageId(),
|
||||||
|
layoutParsingRequest.simplifiedTextStorageId(),
|
||||||
|
layoutParsingRequest.viewerDocumentStorageId()))
|
||||||
.build();
|
.build();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
|
||||||
|
|
||||||
|
return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
|
||||||
|
numberOfPages,
|
||||||
|
semanticNodeCounts.get(NodeType.SECTION),
|
||||||
|
semanticNodeCounts.get(NodeType.HEADLINE),
|
||||||
|
semanticNodeCounts.get(NodeType.PARAGRAPH),
|
||||||
|
semanticNodeCounts.get(NodeType.TABLE),
|
||||||
|
semanticNodeCounts.get(NodeType.TABLE_CELL),
|
||||||
|
semanticNodeCounts.get(NodeType.HEADER),
|
||||||
|
semanticNodeCounts.get(NodeType.FOOTER));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public ClassificationDocument parseLayout(LayoutParsingType layoutParsingType,
|
public ClassificationDocument parseLayout(LayoutParsingType layoutParsingType,
|
||||||
PDDocument originDocument,
|
PDDocument originDocument,
|
||||||
@ -166,12 +196,9 @@ public class LayoutParsingPipeline {
|
|||||||
stripper.getMaxCharHeight());
|
stripper.getMaxCharHeight());
|
||||||
|
|
||||||
ClassificationPage classificationPage = switch (layoutParsingType) {
|
ClassificationPage classificationPage = switch (layoutParsingType) {
|
||||||
case REDACT_MANAGER ->
|
case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||||
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||||
case TAAS ->
|
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||||
taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
|
||||||
case DOCUMINE ->
|
|
||||||
docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
|
||||||
};
|
};
|
||||||
classificationPage.setCleanRulings(cleanRulings);
|
classificationPage.setCleanRulings(cleanRulings);
|
||||||
classificationPage.setRotation(rotation);
|
classificationPage.setRotation(rotation);
|
||||||
@ -212,12 +239,14 @@ public class LayoutParsingPipeline {
|
|||||||
|
|
||||||
|
|
||||||
private Map<String, Rectangle2D> convertMarkedContents(List<PDMarkedContent> pdMarkedContents) {
|
private Map<String, Rectangle2D> convertMarkedContents(List<PDMarkedContent> pdMarkedContents) {
|
||||||
|
|
||||||
Map<String, Rectangle2D> markedContentBboxes = new HashMap<>();
|
Map<String, Rectangle2D> markedContentBboxes = new HashMap<>();
|
||||||
markedContentBboxes.put(MarkedContentUtils.HEADER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.HEADER));
|
markedContentBboxes.put(MarkedContentUtils.HEADER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.HEADER));
|
||||||
markedContentBboxes.put(MarkedContentUtils.FOOTER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.FOOTER));
|
markedContentBboxes.put(MarkedContentUtils.FOOTER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.FOOTER));
|
||||||
return markedContentBboxes;
|
return markedContentBboxes;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
|
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
|
||||||
|
|
||||||
if (!classificationPage.isLandscape()) {
|
if (!classificationPage.isLandscape()) {
|
||||||
|
|||||||
@ -100,6 +100,11 @@ public class Document implements GenericSemanticNode {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Map<NodeType, Long> buildSemanticNodeCounts() {
|
||||||
|
|
||||||
|
return streamAllSubNodes().collect(Collectors.groupingBy(SemanticNode::getType, Collectors.counting()));
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
|
|
||||||
|
|||||||
@ -51,7 +51,6 @@ public class ViewerDocumentService {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void createViewerDocument(PDDocument pdDocument, Document document, OutputStream outputStream, boolean layerVisibilityDefaultValue) {
|
public void createViewerDocument(PDDocument pdDocument, Document document, OutputStream outputStream, boolean layerVisibilityDefaultValue) {
|
||||||
|
|
||||||
log.info("Start Viewer Document Creation");
|
|
||||||
LayoutGrid layoutGrid = layoutGridService.createLayoutGrid(document);
|
LayoutGrid layoutGrid = layoutGridService.createLayoutGrid(document);
|
||||||
// PDDocument.save() is very slow, since it actually traverses the entire pdf and writes a new one.
|
// PDDocument.save() is very slow, since it actually traverses the entire pdf and writes a new one.
|
||||||
// If we collect all COSDictionaries we changed and tell it explicitly to only add the changed ones by using saveIncremental it's very fast.
|
// If we collect all COSDictionaries we changed and tell it explicitly to only add the changed ones by using saveIncremental it's very fast.
|
||||||
@ -122,7 +121,6 @@ public class ViewerDocumentService {
|
|||||||
dictionariesToUpdate.add(pdDocument.getDocumentInformation().getCOSObject());
|
dictionariesToUpdate.add(pdDocument.getDocumentInformation().getCOSObject());
|
||||||
// dictionariesToUpdate.add(pdDocument.getDocument().getTrailer());
|
// dictionariesToUpdate.add(pdDocument.getDocument().getTrailer());
|
||||||
pdDocument.saveIncremental(outputStream, dictionariesToUpdate);
|
pdDocument.saveIncremental(outputStream, dictionariesToUpdate);
|
||||||
log.info("Saved Viewer Document");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -1,5 +1,7 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.server.queue;
|
package com.knecon.fforesight.service.layoutparser.server.queue;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
|
||||||
import org.springframework.amqp.AmqpRejectAndDontRequeueException;
|
import org.springframework.amqp.AmqpRejectAndDontRequeueException;
|
||||||
import org.springframework.amqp.core.Message;
|
import org.springframework.amqp.core.Message;
|
||||||
import org.springframework.amqp.rabbit.annotation.RabbitHandler;
|
import org.springframework.amqp.rabbit.annotation.RabbitHandler;
|
||||||
@ -45,12 +47,12 @@ public class MessageHandler {
|
|||||||
}
|
}
|
||||||
LayoutParsingFinishedEvent layoutParsingFinishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
LayoutParsingFinishedEvent layoutParsingFinishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
||||||
sendLayoutParsingFinishedEvent(layoutParsingFinishedEvent);
|
sendLayoutParsingFinishedEvent(layoutParsingFinishedEvent);
|
||||||
log.info("Layout parsing finished {} in {} ms", layoutParsingRequest.identifier(), layoutParsingFinishedEvent.duration());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void sendLayoutParsingFinishedEvent(LayoutParsingFinishedEvent layoutParsingFinishedEvent) {
|
public void sendLayoutParsingFinishedEvent(LayoutParsingFinishedEvent layoutParsingFinishedEvent) {
|
||||||
|
|
||||||
|
Arrays.stream(layoutParsingFinishedEvent.message().split("\n")).forEach(log::info);
|
||||||
rabbitTemplate.convertAndSend(LayoutParsingQueueNames.LAYOUT_PARSING_FINISHED_EVENT_QUEUE, layoutParsingFinishedEvent);
|
rabbitTemplate.convertAndSend(LayoutParsingQueueNames.LAYOUT_PARSING_FINISHED_EVENT_QUEUE, layoutParsingFinishedEvent);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -0,0 +1,34 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.server;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
|
||||||
|
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
|
@Slf4j
|
||||||
|
public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||||
|
|
||||||
|
@Autowired
|
||||||
|
private LayoutParsingPipeline layoutParsingPipeline;
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@SneakyThrows
|
||||||
|
public void testLayoutParserEndToEnd() {
|
||||||
|
|
||||||
|
prepareStorage("files/bdr/btd_19_053_1905391.pdf");
|
||||||
|
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.TAAS);
|
||||||
|
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
||||||
|
Arrays.stream(finishedEvent.message().split("\n")).forEach(log::info);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -101,8 +101,13 @@ public abstract class AbstractTest {
|
|||||||
protected LayoutParsingRequest prepareStorage(InputStream fileInputStream) {
|
protected LayoutParsingRequest prepareStorage(InputStream fileInputStream) {
|
||||||
|
|
||||||
storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileInputStream);
|
storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileInputStream);
|
||||||
|
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected LayoutParsingRequest buildDefaultLayoutParsingRequest(LayoutParsingType layoutParsingType) {
|
||||||
|
|
||||||
return LayoutParsingRequest.builder()
|
return LayoutParsingRequest.builder()
|
||||||
.layoutParsingType(LayoutParsingType.REDACT_MANAGER)
|
.layoutParsingType(layoutParsingType)
|
||||||
.originFileStorageId(ORIGIN_FILE_ID)
|
.originFileStorageId(ORIGIN_FILE_ID)
|
||||||
.tablesFileStorageId(Optional.of(TABLE_FILE_ID))
|
.tablesFileStorageId(Optional.of(TABLE_FILE_ID))
|
||||||
.imagesFileStorageId(Optional.of(IMAGE_FILE_ID))
|
.imagesFileStorageId(Optional.of(IMAGE_FILE_ID))
|
||||||
@ -116,7 +121,6 @@ public abstract class AbstractTest {
|
|||||||
.build();
|
.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
protected LayoutParsingRequest prepareStorage(String file, String cvServiceResponseFile, String imageInfoFile) {
|
protected LayoutParsingRequest prepareStorage(String file, String cvServiceResponseFile, String imageInfoFile) {
|
||||||
|
|
||||||
@ -135,19 +139,7 @@ public abstract class AbstractTest {
|
|||||||
storageService.storeObject(TenantContext.getTenantId(), TABLE_FILE_ID, cvServiceResponseFileStream);
|
storageService.storeObject(TenantContext.getTenantId(), TABLE_FILE_ID, cvServiceResponseFileStream);
|
||||||
storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream);
|
storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream);
|
||||||
|
|
||||||
return LayoutParsingRequest.builder()
|
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
|
||||||
.layoutParsingType(LayoutParsingType.REDACT_MANAGER)
|
|
||||||
.originFileStorageId(ORIGIN_FILE_ID)
|
|
||||||
.tablesFileStorageId(Optional.of(TABLE_FILE_ID))
|
|
||||||
.imagesFileStorageId(Optional.of(IMAGE_FILE_ID))
|
|
||||||
.structureFileStorageId(STRUCTURE_FILE_ID)
|
|
||||||
.textBlockFileStorageId(TEXT_FILE_ID)
|
|
||||||
.positionBlockFileStorageId(POSITION_FILE_ID)
|
|
||||||
.pageFileStorageId(PAGES_FILE_ID)
|
|
||||||
.simplifiedTextStorageId(SIMPLIFIED_ID)
|
|
||||||
.sectionGridStorageId(SECTION_GRID_ID)
|
|
||||||
.viewerDocumentStorageId(VIEWER_DOCUMENT_ID)
|
|
||||||
.build();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user