From 89b5be8d67045ea5ef858b10f0fffd92615271bb Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Fri, 6 Dec 2024 13:41:44 +0100 Subject: [PATCH] RED-10127: Paragraphs with multiple table, appendix, figure can't be headlines --- .../services/classification/ClassificationPatterns.java | 6 +++++- .../classification/DocuMineClassificationService.java | 8 ++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClassificationPatterns.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClassificationPatterns.java index 2ca8432..eea868e 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClassificationPatterns.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/ClassificationPatterns.java @@ -18,10 +18,14 @@ public class ClassificationPatterns { - public static final Pattern TABLE_OR_FIGURE_PATTERN = Pattern.compile( + public static final Pattern TABLE_OR_FIGURE_HEADLINE_PATTERN = Pattern.compile( "^\\s*(?:table|continued\\s+table|appendix|figure)\\s+(?:[xvi]+|[a-z0-9]{1,3}(?:\\.[0-9]{1,3})*(?:-[0-9]{1,3})?)\\b", Pattern.CASE_INSENSITIVE); + public static final Pattern TABLE_MID_SENTENCE_PATTERN = Pattern.compile( + "(?:table|continued\\s+table|appendix|figure)\\s+(?:[xvi]+|[a-z0-9]{1,3}(?:\\.[0-9]{1,3})*(?:-[0-9]{1,3})?)\\b", + Pattern.CASE_INSENSITIVE); + public static final Pattern ALPHANUMERIC = Pattern.compile("[a-zA-Z0-9]"); public static final Pattern NUMERIC = Pattern.compile("[0-9]+"); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java index 38b1097..a451efc 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java @@ -6,7 +6,8 @@ import static com.knecon.fforesight.service.layoutparser.processor.services.clas import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_PATTERN_WITH_SLASHES; import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_WITH_2_IDENTIFER_PATTERN; import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN; -import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.TABLE_OR_FIGURE_PATTERN; +import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.TABLE_MID_SENTENCE_PATTERN; +import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.TABLE_OR_FIGURE_HEADLINE_PATTERN; import java.util.ArrayList; import java.util.Comparator; @@ -83,7 +84,8 @@ public class DocuMineClassificationService { Matcher atLeast3Matcher = AT_LEAST_3_CHARS_PATTERN.matcher(textBlock.toString()); Matcher headlineWithSlashesMatcher = HEADLINE_PATTERN_WITH_SLASHES.matcher(textBlock.toString()); Matcher amountMatcher = AMOUNT_PATTERN.matcher(textBlock.toString()); - Matcher tableOrFigureMatcher = TABLE_OR_FIGURE_PATTERN.matcher(textBlock.toString()); + Matcher tableOrFigureMatcher = TABLE_OR_FIGURE_HEADLINE_PATTERN.matcher(textBlock.toString()); + Matcher tableMidSentenceMatcher = TABLE_MID_SENTENCE_PATTERN.matcher(textBlock.toString()); Matcher headlineWithSingleIdentifierMatcher = HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN.matcher(textBlock.toString()); boolean isAtLeast3Characters = atLeast3Matcher.reset().find(); boolean headlineWithSlashesMatches = headlineWithSlashesMatcher.reset().matches(); @@ -148,6 +150,8 @@ public class DocuMineClassificationService { && greaterOrEqualFontThanPageAverage(textBlock, page)// && PositionUtils.getApproxLineCount(textBlock) < 2.9// && (tableOrFigureMatcher.reset().find() || (headlineWithSingleIdentifierMatcher.reset().find() && listIdentifiers.isEmpty())) // + && tableMidSentenceMatcher.reset().results() + .count() <= 1 // && !isAmount// && !headlineWithSlashesMatches) {