RED-10127: Paragraphs with multiple table, appendix, or figure references can't be headlines
This commit is contained in:
parent
077ce60c9d
commit
89b5be8d67
@ -18,10 +18,14 @@ public class ClassificationPatterns {
|
||||
|
||||
|
||||
|
||||
public static final Pattern TABLE_OR_FIGURE_PATTERN = Pattern.compile(
|
||||
/**
 * Matches text that begins (after optional leading whitespace) with one of the
 * keywords "table", "continued table", "appendix" or "figure", followed by
 * whitespace and an identifier. The identifier is either a run of the letters
 * x, v and i, or 1-3 alphanumeric characters optionally followed by dotted
 * groups of 1-3 digits and an optional "-" plus 1-3 digits. The identifier must
 * end at a word boundary. Matching is case-insensitive.
 *
 * <p>The pattern is anchored with {@code ^}, so with {@code find()} it only
 * matches at the start of the input.
 */
public static final Pattern TABLE_OR_FIGURE_HEADLINE_PATTERN = Pattern.compile(
|
||||
"^\\s*(?:table|continued\\s+table|appendix|figure)\\s+(?:[xvi]+|[a-z0-9]{1,3}(?:\\.[0-9]{1,3})*(?:-[0-9]{1,3})?)\\b",
|
||||
Pattern.CASE_INSENSITIVE);
|
||||
|
||||
/**
 * Matches a "table", "continued table", "appendix" or "figure" keyword followed
 * by whitespace and an identifier anywhere in the input. The keyword and
 * identifier grammar is the same as the headline variant, but without the
 * {@code ^\s*} anchor, so every occurrence in a text can be found and counted.
 * Matching is case-insensitive.
 *
 * <p>NOTE(review): there is no leading {@code \b}, so the keyword also matches
 * as the tail of a longer word (e.g. "stable 1" or "configure 2" would count as
 * a match). Confirm whether this is intended (e.g. to tolerate glued words from
 * text extraction) before tightening it.
 */
public static final Pattern TABLE_MID_SENTENCE_PATTERN = Pattern.compile(
|
||||
"(?:table|continued\\s+table|appendix|figure)\\s+(?:[xvi]+|[a-z0-9]{1,3}(?:\\.[0-9]{1,3})*(?:-[0-9]{1,3})?)\\b",
|
||||
Pattern.CASE_INSENSITIVE);
|
||||
|
||||
public static final Pattern ALPHANUMERIC = Pattern.compile("[a-zA-Z0-9]");
|
||||
|
||||
public static final Pattern NUMERIC = Pattern.compile("[0-9]+");
|
||||
|
||||
@ -6,7 +6,8 @@ import static com.knecon.fforesight.service.layoutparser.processor.services.clas
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_PATTERN_WITH_SLASHES;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_WITH_2_IDENTIFER_PATTERN;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.TABLE_OR_FIGURE_PATTERN;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.TABLE_MID_SENTENCE_PATTERN;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.TABLE_OR_FIGURE_HEADLINE_PATTERN;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
@ -83,7 +84,8 @@ public class DocuMineClassificationService {
|
||||
Matcher atLeast3Matcher = AT_LEAST_3_CHARS_PATTERN.matcher(textBlock.toString());
|
||||
Matcher headlineWithSlashesMatcher = HEADLINE_PATTERN_WITH_SLASHES.matcher(textBlock.toString());
|
||||
Matcher amountMatcher = AMOUNT_PATTERN.matcher(textBlock.toString());
|
||||
Matcher tableOrFigureMatcher = TABLE_OR_FIGURE_PATTERN.matcher(textBlock.toString());
|
||||
Matcher tableOrFigureMatcher = TABLE_OR_FIGURE_HEADLINE_PATTERN.matcher(textBlock.toString());
|
||||
Matcher tableMidSentenceMatcher = TABLE_MID_SENTENCE_PATTERN.matcher(textBlock.toString());
|
||||
Matcher headlineWithSingleIdentifierMatcher = HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN.matcher(textBlock.toString());
|
||||
boolean isAtLeast3Characters = atLeast3Matcher.reset().find();
|
||||
boolean headlineWithSlashesMatches = headlineWithSlashesMatcher.reset().matches();
|
||||
@ -148,6 +150,8 @@ public class DocuMineClassificationService {
|
||||
&& greaterOrEqualFontThanPageAverage(textBlock, page)//
|
||||
&& PositionUtils.getApproxLineCount(textBlock) < 2.9//
|
||||
&& (tableOrFigureMatcher.reset().find() || (headlineWithSingleIdentifierMatcher.reset().find() && listIdentifiers.isEmpty())) //
|
||||
&& tableMidSentenceMatcher.reset().results()
|
||||
.count() <= 1 //
|
||||
&& !isAmount//
|
||||
&& !headlineWithSlashesMatches) {
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user