diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java index 54c57d8..3da3cfe 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java @@ -24,9 +24,9 @@ import lombok.extern.slf4j.Slf4j; @RequiredArgsConstructor public class DocuMineClassificationService { - private static final Pattern pattern = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE); - private static final Pattern pattern2 = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE); - private static final Pattern pattern3 = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*"); + private static final Pattern HEADLINE_WITH_IDENTIFER_PATTERN = Pattern.compile("^([1-9]\\d?\\.){0,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE); + private static final Pattern AT_LEAST_3_PATTERN = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE); + private static final Pattern HEADLINE_PATTTERN_WITH_SLASHES = Pattern.compile("^(\\d{1,1}\\.){0,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*"); public void classifyDocument(ClassificationDocument document) { @@ -58,9 +58,9 @@ public class DocuMineClassificationService { log.debug("headlineFontSizes: {}", headlineFontSizes); var bodyTextFrame = page.getBodyTextFrame(); - Matcher matcher = pattern.matcher(textBlock.toString()); - Matcher matcher2 = pattern2.matcher(textBlock.toString()); - Matcher matcher3 = pattern3.matcher(textBlock.toString()); + Matcher headlineWithIdentifierMatcher = HEADLINE_WITH_IDENTIFER_PATTERN.matcher(textBlock.toString()); + Matcher atLeast3Matcher = AT_LEAST_3_PATTERN.matcher(textBlock.toString()); + Matcher headlineWithSlashesMatcher = HEADLINE_PATTTERN_WITH_SLASHES.matcher(textBlock.toString()); if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) { headlineClassificationService.setLastHeadlineFromOutline(textBlock); @@ -101,19 +101,20 @@ public class DocuMineClassificationService { || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) && PositionUtils.getApproxLineCount(textBlock) < 5.9 - && (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && matcher2.reset().find() && !textBlock.toString() + && (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && atLeast3Matcher.reset().find() && !textBlock.toString() .contains(":") - || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && matcher2.reset().find() && !textBlock.toString().contains(":") + || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && atLeast3Matcher.reset().find() && !textBlock.toString().contains(":") || textBlock.toString().startsWith("APPENDIX") || textBlock.toString().startsWith("FIGURE") + || textBlock.toString().startsWith("Continued TABLE") || textBlock.toString().startsWith("TABLE")) && !textBlock.toString().endsWith(":") - && matcher2.reset().find()) { + && atLeast3Matcher.reset().find()) { PageBlockType headlineType = PageBlockType.getHeadlineType(1); headlineClassificationService.classifyHeadline(textBlock, headlineType); document.setHeadlines(true); - } else if (matcher.reset().find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.reset().find() && !matcher3.reset().matches()) { + } else if (headlineWithIdentifierMatcher.reset().find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && atLeast3Matcher.reset().find() && !headlineWithSlashesMatcher.reset().matches()) { PageBlockType headlineType = PageBlockType.getHeadlineType(2); headlineClassificationService.classifyHeadline(textBlock, headlineType); document.setHeadlines(true);