From 570a348a77effb137ffdd239d5794dea3b1a44e8 Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Fri, 21 Jun 2024 14:39:27 +0200 Subject: [PATCH 1/3] RED-9194: roll back single digit headline change --- .../DocuMineClassificationService.java | 50 +++++++++++-------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java index 3da3cfe..4e9459f 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java @@ -24,9 +24,9 @@ import lombok.extern.slf4j.Slf4j; @RequiredArgsConstructor public class DocuMineClassificationService { - private static final Pattern HEADLINE_WITH_IDENTIFER_PATTERN = Pattern.compile("^([1-9]\\d?\\.){0,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE); + private static final Pattern HEADLINE_WITH_IDENTIFER_PATTERN = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE); private static final Pattern AT_LEAST_3_PATTERN = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE); - private static final Pattern HEADLINE_PATTTERN_WITH_SLASHES = Pattern.compile("^(\\d{1,1}\\.){0,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*"); + private static final Pattern HEADLINE_PATTTERN_WITH_SLASHES = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*"); public void classifyDocument(ClassificationDocument document) { @@ -43,7 +43,10 @@ public class DocuMineClassificationService { } - private void classifyPage(HeadlineClassificationService headlineClassificationService, ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { + private void classifyPage(HeadlineClassificationService headlineClassificationService, + ClassificationPage page, + ClassificationDocument document, + List headlineFontSizes) { for (AbstractPageBlock textBlock : page.getTextBlocks()) { if (textBlock instanceof TextPageBlock) { @@ -53,7 +56,11 @@ public class DocuMineClassificationService { } - private void classifyBlock(HeadlineClassificationService headlineClassificationService, TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { + private void classifyBlock(HeadlineClassificationService headlineClassificationService, + TextPageBlock textBlock, + ClassificationPage page, + ClassificationDocument document, + List headlineFontSizes) { log.debug("headlineFontSizes: {}", headlineFontSizes); var bodyTextFrame = page.getBodyTextFrame(); @@ -70,14 +77,14 @@ public class DocuMineClassificationService { textBlock.setClassification(PageBlockType.OTHER); return; } - if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) - || (PositionUtils.isOverBodyTextFrame(bodyTextFrame, - textBlock, - page.getRotation()) && (document.getFontSizeCounter().getMostPopular() - == null - || textBlock.getHighestFontSize() - <= document.getFontSizeCounter() - .getMostPopular()))) { + if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || (PositionUtils.isOverBodyTextFrame(bodyTextFrame, + textBlock, + page.getRotation()) && ( + document.getFontSizeCounter().getMostPopular() + == null + || textBlock.getHighestFontSize() + <= document.getFontSizeCounter() + .getMostPopular()))) { textBlock.setClassification(PageBlockType.HEADER); } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) @@ -101,20 +108,23 @@ public class DocuMineClassificationService { || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) && PositionUtils.getApproxLineCount(textBlock) < 5.9 - && (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && atLeast3Matcher.reset().find() && !textBlock.toString() - .contains(":") - || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && atLeast3Matcher.reset().find() && !textBlock.toString().contains(":") - || textBlock.toString().startsWith("APPENDIX") - || textBlock.toString().startsWith("FIGURE") - || textBlock.toString().startsWith("Continued TABLE") - || textBlock.toString().startsWith("TABLE")) + && (textBlock.getMostPopularWordStyle().contains("bold") + && Character.isDigit(textBlock.toString().charAt(0)) + && atLeast3Matcher.reset().find() + && !textBlock.toString().contains(":") || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) + && atLeast3Matcher.reset().find() + && !textBlock.toString().contains(":") || textBlock.toString().startsWith("APPENDIX") || textBlock.toString() + .startsWith("FIGURE") || textBlock.toString().startsWith("Continued TABLE") || textBlock.toString().startsWith("TABLE")) && !textBlock.toString().endsWith(":") && atLeast3Matcher.reset().find()) { PageBlockType headlineType = PageBlockType.getHeadlineType(1); headlineClassificationService.classifyHeadline(textBlock, headlineType); document.setHeadlines(true); - } else if (headlineWithIdentifierMatcher.reset().find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && atLeast3Matcher.reset().find() && !headlineWithSlashesMatcher.reset().matches()) { + } else if (headlineWithIdentifierMatcher.reset().find() + && PositionUtils.getApproxLineCount(textBlock) < 2.9 + && atLeast3Matcher.reset().find() + && !headlineWithSlashesMatcher.reset().matches()) { PageBlockType headlineType = PageBlockType.getHeadlineType(2); headlineClassificationService.classifyHeadline(textBlock, headlineType); document.setHeadlines(true); From 9f7ed974ec3b0b9ed7989d4bffca06903e815952 Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Fri, 21 Jun 2024 14:41:30 +0200 Subject: [PATCH 2/3] RED-9194: roll back single digit headline change --- .../classification/DocuMineClassificationService.java | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java index 4e9459f..8a73452 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java @@ -111,10 +111,12 @@ public class DocuMineClassificationService { && (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && atLeast3Matcher.reset().find() - && !textBlock.toString().contains(":") || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) - && atLeast3Matcher.reset().find() - && !textBlock.toString().contains(":") || textBlock.toString().startsWith("APPENDIX") || textBlock.toString() - .startsWith("FIGURE") || textBlock.toString().startsWith("Continued TABLE") || textBlock.toString().startsWith("TABLE")) + && !textBlock.toString().contains(":") // + || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && atLeast3Matcher.reset().find() && !textBlock.toString().contains(":") // + || textBlock.toString().startsWith("APPENDIX") // + || textBlock.toString().startsWith("FIGURE") // + || textBlock.toString().startsWith("Continued TABLE") // + || textBlock.toString().startsWith("TABLE")) && !textBlock.toString().endsWith(":") && atLeast3Matcher.reset().find()) { PageBlockType headlineType = PageBlockType.getHeadlineType(1); From 2e2f30ba35b1b2dc2772fb8520c12636de320987 Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Fri, 21 Jun 2024 14:42:30 +0200 Subject: [PATCH 3/3] RED-9194: roll back single digit headline change --- .../DocuMineClassificationService.java | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java index 8a73452..ea019bd 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java @@ -77,14 +77,10 @@ public class DocuMineClassificationService { textBlock.setClassification(PageBlockType.OTHER); return; } - if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || (PositionUtils.isOverBodyTextFrame(bodyTextFrame, - textBlock, - page.getRotation()) && ( - document.getFontSizeCounter().getMostPopular() - == null - || textBlock.getHighestFontSize() - <= document.getFontSizeCounter() - .getMostPopular()))) { + if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) // + || (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) // + && (document.getFontSizeCounter().getMostPopular() == null // + || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular()))) { textBlock.setClassification(PageBlockType.HEADER); } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)