diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java index 3da3cfe..ea019bd 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java @@ -24,9 +24,9 @@ import lombok.extern.slf4j.Slf4j; @RequiredArgsConstructor public class DocuMineClassificationService { - private static final Pattern HEADLINE_WITH_IDENTIFER_PATTERN = Pattern.compile("^([1-9]\\d?\\.){0,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE); + private static final Pattern HEADLINE_WITH_IDENTIFER_PATTERN = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE); private static final Pattern AT_LEAST_3_PATTERN = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE); - private static final Pattern HEADLINE_PATTTERN_WITH_SLASHES = Pattern.compile("^(\\d{1,1}\\.){0,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*"); + private static final Pattern HEADLINE_PATTTERN_WITH_SLASHES = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*"); public void classifyDocument(ClassificationDocument document) { @@ -43,7 +43,10 @@ public class DocuMineClassificationService { } - private void classifyPage(HeadlineClassificationService headlineClassificationService, ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { + private void classifyPage(HeadlineClassificationService headlineClassificationService, + ClassificationPage page, + ClassificationDocument document, + List headlineFontSizes) { for (AbstractPageBlock textBlock : page.getTextBlocks()) { if (textBlock instanceof TextPageBlock) { @@ -53,7 +56,11 @@ public class DocuMineClassificationService { } - private void classifyBlock(HeadlineClassificationService headlineClassificationService, TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List headlineFontSizes) { + private void classifyBlock(HeadlineClassificationService headlineClassificationService, + TextPageBlock textBlock, + ClassificationPage page, + ClassificationDocument document, + List headlineFontSizes) { log.debug("headlineFontSizes: {}", headlineFontSizes); var bodyTextFrame = page.getBodyTextFrame(); @@ -70,14 +77,10 @@ public class DocuMineClassificationService { textBlock.setClassification(PageBlockType.OTHER); return; } - if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) - || (PositionUtils.isOverBodyTextFrame(bodyTextFrame, - textBlock, - page.getRotation()) && (document.getFontSizeCounter().getMostPopular() - == null - || textBlock.getHighestFontSize() - <= document.getFontSizeCounter() - .getMostPopular()))) { + if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) // + || (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) // + && (document.getFontSizeCounter().getMostPopular() == null // + || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular()))) { textBlock.setClassification(PageBlockType.HEADER); } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) @@ -101,12 +104,14 @@ public class DocuMineClassificationService { || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) && PositionUtils.getApproxLineCount(textBlock) < 5.9 - && (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && atLeast3Matcher.reset().find() && !textBlock.toString() - .contains(":") - || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && atLeast3Matcher.reset().find() && !textBlock.toString().contains(":") - || textBlock.toString().startsWith("APPENDIX") - || textBlock.toString().startsWith("FIGURE") - || textBlock.toString().startsWith("Continued TABLE") + && (textBlock.getMostPopularWordStyle().contains("bold") + && Character.isDigit(textBlock.toString().charAt(0)) + && atLeast3Matcher.reset().find() + && !textBlock.toString().contains(":") // + || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && atLeast3Matcher.reset().find() && !textBlock.toString().contains(":") // + || textBlock.toString().startsWith("APPENDIX") // + || textBlock.toString().startsWith("FIGURE") // + || textBlock.toString().startsWith("Continued TABLE") // || textBlock.toString().startsWith("TABLE")) && !textBlock.toString().endsWith(":") && atLeast3Matcher.reset().find()) { @@ -114,7 +119,10 @@ public class DocuMineClassificationService { headlineClassificationService.classifyHeadline(textBlock, headlineType); document.setHeadlines(true); - } else if (headlineWithIdentifierMatcher.reset().find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && atLeast3Matcher.reset().find() && !headlineWithSlashesMatcher.reset().matches()) { + } else if (headlineWithIdentifierMatcher.reset().find() + && PositionUtils.getApproxLineCount(textBlock) < 2.9 + && atLeast3Matcher.reset().find() + && !headlineWithSlashesMatcher.reset().matches()) { PageBlockType headlineType = PageBlockType.getHeadlineType(2); headlineClassificationService.classifyHeadline(textBlock, headlineType); document.setHeadlines(true);