From ae6bad830ecd77a927e5ef7a8628344e4d78dca0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dominique=20Eifl=C3=A4nder?= Date: Tue, 30 Apr 2024 10:44:48 +0200 Subject: [PATCH 1/2] RED-8933: Fixed bugs in DocumineClassificationService --- .../DocuMineClassificationService.java | 64 +++++++++++-------- 1 file changed, 36 insertions(+), 28 deletions(-) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java index d622fc8..4c881c6 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java @@ -5,7 +5,6 @@ import java.util.Locale; import java.util.regex.Matcher; import java.util.regex.Pattern; -import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; @@ -13,6 +12,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; +import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils; import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; import lombok.RequiredArgsConstructor; @@ -23,7 +23,7 @@ import lombok.extern.slf4j.Slf4j; @RequiredArgsConstructor public class DocuMineClassificationService { - private static final Pattern pattern = Pattern.compile("^(\\d{1,2}\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE); + private static final Pattern pattern = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE); private static final Pattern pattern2 = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE); private static final Pattern pattern3 = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*"); @@ -64,46 +64,54 @@ public class DocuMineClassificationService { return; } if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) - || PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() - .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular()) - ) { + || PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null + || textBlock.getHighestFontSize() <= document.getFontSizeCounter() + .getMostPopular())) { textBlock.setClassification(PageBlockType.HEADER); } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) - || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() - .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular()) - ) { + || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null + || textBlock.getHighestFontSize() <= document.getFontSizeCounter() + .getMostPopular())) { textBlock.setClassification(PageBlockType.FOOTER); - } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, - document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks() - .size() == 1)) { + } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5 + && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) { if (!Pattern.matches("[0-9]+", textBlock.toString())) { textBlock.setClassification(PageBlockType.TITLE); } - } else if (textBlock.getText().length() > 5 && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter() - .getMostPopular() || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) && PositionUtils.getApproxLineCount(textBlock) < 5.9 - - && (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && !matcher2.matches() && !textBlock.toString() - .contains(":") || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && !matcher2.matches() && !textBlock.toString() - .contains(":") || textBlock.toString().startsWith("APPENDIX") || textBlock.toString().startsWith("FIGURE") || textBlock.toString() - .startsWith("TABLE")) && !textBlock.toString().endsWith(":") && matcher2.find()) { + } else if (textBlock.getText().length() > 5 + && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter().getMostPopular() + || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) + && PositionUtils.getApproxLineCount(textBlock) < 5.9 + && (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && matcher2.reset().find() && !textBlock.toString() + .contains(":") + || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && matcher2.reset().find() && !textBlock.toString().contains(":") + || textBlock.toString().startsWith("APPENDIX") + || textBlock.toString().startsWith("FIGURE") + || textBlock.toString().startsWith("TABLE")) + && !textBlock.toString().endsWith(":") + && matcher2.reset().find()) { textBlock.setClassification(PageBlockType.getHeadlineType(1)); document.setHeadlines(true); - } else if (matcher.find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.find() && !matcher3.matches()) { + } else if (matcher.reset().find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.reset().find() && !matcher3.reset().matches()) { textBlock.setClassification(PageBlockType.getHeadlineType(2)); document.setHeadlines(true); - } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter() - .getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) { + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) + && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular() + && textBlock.getMostPopularWordStyle().equals("bold") + && !document.getFontStyleCounter().getMostPopular().equals("bold")) { textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD); - } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont() - .equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle() - .equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) { + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) + && textBlock.getMostPopularWordFont().equals(document.getFontCounter().getMostPopular()) + && textBlock.getMostPopularWordStyle().equals(document.getFontStyleCounter().getMostPopular()) + && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) { textBlock.setClassification(PageBlockType.PARAGRAPH); - } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter() - .getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter() - .getMostPopular() - .equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) { + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) + && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular() + && textBlock.getMostPopularWordStyle().equals("italic") + && !document.getFontStyleCounter().getMostPopular().equals("italic") + && PositionUtils.getApproxLineCount(textBlock) < 2.9) { textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC); } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) { textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN); From 71025f7f161b1fbdd48be084183f71c0255903ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dominique=20Eifl=C3=A4nder?= Date: Wed, 8 May 2024 10:50:36 +0200 Subject: [PATCH 2/2] RED-9129: Track pagestatistics also for landscape pages in documine --- .../processor/LayoutParsingPipeline.java | 76 ++++++++++--------- 1 file changed, 40 insertions(+), 36 deletions(-) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index a02f627..9a5547e 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -83,13 +83,17 @@ public class LayoutParsingPipeline { try (PDDocument originDocument = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId())) { ImageServiceResponse imageServiceResponse = new ImageServiceResponse(); - if (layoutParsingRequest.imagesFileStorageId().isPresent()) { - imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get()); + if (layoutParsingRequest.imagesFileStorageId() + .isPresent()) { + imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId() + .get()); } TableServiceResponse tableServiceResponse = new TableServiceResponse(); - if (layoutParsingRequest.tablesFileStorageId().isPresent()) { - tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get()); + if (layoutParsingRequest.tablesFileStorageId() + .isPresent()) { + tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId() + .get()); } ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(), originDocument, imageServiceResponse, tableServiceResponse); @@ -115,25 +119,25 @@ public class LayoutParsingPipeline { .numberOfPages(numberOfPages) .duration(System.currentTimeMillis() - start) .message(format(""" - Layout parsing has finished in %.02f s. - identifiers: %s - %s - Files have been saved with Ids: - Structure: %s - Text: %s - Positions: %s - PageData: %s - Simplified Text: %s - Viewer Doc: %s""", - ((float) (System.currentTimeMillis() - start)) / 1000, - layoutParsingRequest.identifier(), - buildSemanticNodeCountMessage(numberOfPages, documentGraph.buildSemanticNodeCounts()), - layoutParsingRequest.structureFileStorageId(), - layoutParsingRequest.textBlockFileStorageId(), - layoutParsingRequest.positionBlockFileStorageId(), - layoutParsingRequest.pageFileStorageId(), - layoutParsingRequest.simplifiedTextStorageId(), - layoutParsingRequest.viewerDocumentStorageId())) + Layout parsing has finished in %.02f s. + identifiers: %s + %s + Files have been saved with Ids: + Structure: %s + Text: %s + Positions: %s + PageData: %s + Simplified Text: %s + Viewer Doc: %s""", + ((float) (System.currentTimeMillis() - start)) / 1000, + layoutParsingRequest.identifier(), + buildSemanticNodeCountMessage(numberOfPages, documentGraph.buildSemanticNodeCounts()), + layoutParsingRequest.structureFileStorageId(), + layoutParsingRequest.textBlockFileStorageId(), + layoutParsingRequest.positionBlockFileStorageId(), + layoutParsingRequest.pageFileStorageId(), + layoutParsingRequest.simplifiedTextStorageId(), + layoutParsingRequest.viewerDocumentStorageId())) .build(); } } @@ -142,14 +146,14 @@ public class LayoutParsingPipeline { private String buildSemanticNodeCountMessage(int numberOfPages, Map semanticNodeCounts) { return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed", - numberOfPages, - semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION), - semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE), - semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH), - semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE), - semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL), - semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER), - semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER)); + numberOfPages, + semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION), + semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE), + semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH), + semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE), + semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL), + semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER), + semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER)); } @@ -213,7 +217,7 @@ public class LayoutParsingPipeline { tableExtractionService.extractTables(cleanRulings, classificationPage); buildPageStatistics(classificationPage); - increaseDocumentStatistics(classificationPage, classificationDocument); + increaseDocumentStatistics(layoutParsingType, classificationPage, classificationDocument); classificationPages.add(classificationPage); } @@ -242,11 +246,11 @@ public class LayoutParsingPipeline { } - private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) { + private void increaseDocumentStatistics(LayoutParsingType layoutParsingType, ClassificationPage classificationPage, ClassificationDocument document) { - if (!classificationPage.isLandscape()) { - document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue()); - } + if (layoutParsingType.equals(LayoutParsingType.DOCUMINE) || !classificationPage.isLandscape()) { + document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue()); + } document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue()); document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue()); document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue());