RED-9129: Track pagestatistics also for landscape pages in documine

This commit is contained in:
Dominique Eifländer 2024-05-08 10:50:36 +02:00
parent ae6bad830e
commit 71025f7f16

View File

@ -83,13 +83,17 @@ public class LayoutParsingPipeline {
try (PDDocument originDocument = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId())) { try (PDDocument originDocument = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId())) {
ImageServiceResponse imageServiceResponse = new ImageServiceResponse(); ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
if (layoutParsingRequest.imagesFileStorageId().isPresent()) { if (layoutParsingRequest.imagesFileStorageId()
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get()); .isPresent()) {
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
.get());
} }
TableServiceResponse tableServiceResponse = new TableServiceResponse(); TableServiceResponse tableServiceResponse = new TableServiceResponse();
if (layoutParsingRequest.tablesFileStorageId().isPresent()) { if (layoutParsingRequest.tablesFileStorageId()
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get()); .isPresent()) {
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId()
.get());
} }
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(), originDocument, imageServiceResponse, tableServiceResponse); ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(), originDocument, imageServiceResponse, tableServiceResponse);
@ -115,25 +119,25 @@ public class LayoutParsingPipeline {
.numberOfPages(numberOfPages) .numberOfPages(numberOfPages)
.duration(System.currentTimeMillis() - start) .duration(System.currentTimeMillis() - start)
.message(format(""" .message(format("""
Layout parsing has finished in %.02f s. Layout parsing has finished in %.02f s.
identifiers: %s identifiers: %s
%s %s
Files have been saved with Ids: Files have been saved with Ids:
Structure: %s Structure: %s
Text: %s Text: %s
Positions: %s Positions: %s
PageData: %s PageData: %s
Simplified Text: %s Simplified Text: %s
Viewer Doc: %s""", Viewer Doc: %s""",
((float) (System.currentTimeMillis() - start)) / 1000, ((float) (System.currentTimeMillis() - start)) / 1000,
layoutParsingRequest.identifier(), layoutParsingRequest.identifier(),
buildSemanticNodeCountMessage(numberOfPages, documentGraph.buildSemanticNodeCounts()), buildSemanticNodeCountMessage(numberOfPages, documentGraph.buildSemanticNodeCounts()),
layoutParsingRequest.structureFileStorageId(), layoutParsingRequest.structureFileStorageId(),
layoutParsingRequest.textBlockFileStorageId(), layoutParsingRequest.textBlockFileStorageId(),
layoutParsingRequest.positionBlockFileStorageId(), layoutParsingRequest.positionBlockFileStorageId(),
layoutParsingRequest.pageFileStorageId(), layoutParsingRequest.pageFileStorageId(),
layoutParsingRequest.simplifiedTextStorageId(), layoutParsingRequest.simplifiedTextStorageId(),
layoutParsingRequest.viewerDocumentStorageId())) layoutParsingRequest.viewerDocumentStorageId()))
.build(); .build();
} }
} }
@ -142,14 +146,14 @@ public class LayoutParsingPipeline {
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) { private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed", return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
numberOfPages, numberOfPages,
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION), semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE), semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH), semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE), semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL), semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER), semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER)); semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
} }
@ -213,7 +217,7 @@ public class LayoutParsingPipeline {
tableExtractionService.extractTables(cleanRulings, classificationPage); tableExtractionService.extractTables(cleanRulings, classificationPage);
buildPageStatistics(classificationPage); buildPageStatistics(classificationPage);
increaseDocumentStatistics(classificationPage, classificationDocument); increaseDocumentStatistics(layoutParsingType, classificationPage, classificationDocument);
classificationPages.add(classificationPage); classificationPages.add(classificationPage);
} }
@ -242,11 +246,11 @@ public class LayoutParsingPipeline {
} }
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) { private void increaseDocumentStatistics(LayoutParsingType layoutParsingType, ClassificationPage classificationPage, ClassificationDocument document) {
if (!classificationPage.isLandscape()) { if (layoutParsingType.equals(LayoutParsingType.DOCUMINE) || !classificationPage.isLandscape()) {
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue()); document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
} }
document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue()); document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue());
document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue()); document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue());
document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue()); document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue());