RED-10127: Paragraphs with multiple table, appendix, figure can't be headlines
This commit is contained in:
parent
2fdc53429c
commit
fcbc005102
@ -18,10 +18,14 @@ public class ClassificationPatterns {
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
public static final Pattern TABLE_OR_FIGURE_PATTERN = Pattern.compile(
|
public static final Pattern TABLE_OR_FIGURE_HEADLINE_PATTERN = Pattern.compile(
|
||||||
"^\\s*(?:table|continued\\s+table|appendix|figure)\\s+(?:[xvi]+|[a-z0-9]{1,3}(?:\\.[0-9]{1,3})*(?:-[0-9]{1,3})?)\\b",
|
"^\\s*(?:table|continued\\s+table|appendix|figure)\\s+(?:[xvi]+|[a-z0-9]{1,3}(?:\\.[0-9]{1,3})*(?:-[0-9]{1,3})?)\\b",
|
||||||
Pattern.CASE_INSENSITIVE);
|
Pattern.CASE_INSENSITIVE);
|
||||||
|
|
||||||
|
public static final Pattern TABLE_MID_SENTENCE_PATTERN = Pattern.compile(
|
||||||
|
"(?:table|continued\\s+table|appendix|figure)\\s+(?:[xvi]+|[a-z0-9]{1,3}(?:\\.[0-9]{1,3})*(?:-[0-9]{1,3})?)\\b",
|
||||||
|
Pattern.CASE_INSENSITIVE);
|
||||||
|
|
||||||
public static final Pattern ALPHANUMERIC = Pattern.compile("[a-zA-Z0-9]");
|
public static final Pattern ALPHANUMERIC = Pattern.compile("[a-zA-Z0-9]");
|
||||||
|
|
||||||
public static final Pattern NUMERIC = Pattern.compile("[0-9]+");
|
public static final Pattern NUMERIC = Pattern.compile("[0-9]+");
|
||||||
|
|||||||
@ -6,7 +6,8 @@ import static com.knecon.fforesight.service.layoutparser.processor.services.clas
|
|||||||
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_PATTERN_WITH_SLASHES;
|
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_PATTERN_WITH_SLASHES;
|
||||||
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_WITH_2_IDENTIFER_PATTERN;
|
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_WITH_2_IDENTIFER_PATTERN;
|
||||||
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN;
|
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN;
|
||||||
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.TABLE_OR_FIGURE_PATTERN;
|
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.TABLE_MID_SENTENCE_PATTERN;
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationPatterns.TABLE_OR_FIGURE_HEADLINE_PATTERN;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
@ -83,7 +84,8 @@ public class DocuMineClassificationService {
|
|||||||
Matcher atLeast3Matcher = AT_LEAST_3_CHARS_PATTERN.matcher(textBlock.toString());
|
Matcher atLeast3Matcher = AT_LEAST_3_CHARS_PATTERN.matcher(textBlock.toString());
|
||||||
Matcher headlineWithSlashesMatcher = HEADLINE_PATTERN_WITH_SLASHES.matcher(textBlock.toString());
|
Matcher headlineWithSlashesMatcher = HEADLINE_PATTERN_WITH_SLASHES.matcher(textBlock.toString());
|
||||||
Matcher amountMatcher = AMOUNT_PATTERN.matcher(textBlock.toString());
|
Matcher amountMatcher = AMOUNT_PATTERN.matcher(textBlock.toString());
|
||||||
Matcher tableOrFigureMatcher = TABLE_OR_FIGURE_PATTERN.matcher(textBlock.toString());
|
Matcher tableOrFigureMatcher = TABLE_OR_FIGURE_HEADLINE_PATTERN.matcher(textBlock.toString());
|
||||||
|
Matcher tableMidSentenceMatcher = TABLE_MID_SENTENCE_PATTERN.matcher(textBlock.toString());
|
||||||
Matcher headlineWithSingleIdentifierMatcher = HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN.matcher(textBlock.toString());
|
Matcher headlineWithSingleIdentifierMatcher = HEADLINE_WITH_SINGLE_IDENTIFER_PATTERN.matcher(textBlock.toString());
|
||||||
boolean isAtLeast3Characters = atLeast3Matcher.reset().find();
|
boolean isAtLeast3Characters = atLeast3Matcher.reset().find();
|
||||||
boolean headlineWithSlashesMatches = headlineWithSlashesMatcher.reset().matches();
|
boolean headlineWithSlashesMatches = headlineWithSlashesMatcher.reset().matches();
|
||||||
@ -148,6 +150,8 @@ public class DocuMineClassificationService {
|
|||||||
&& greaterOrEqualFontThanPageAverage(textBlock, page)//
|
&& greaterOrEqualFontThanPageAverage(textBlock, page)//
|
||||||
&& PositionUtils.getApproxLineCount(textBlock) < 2.9//
|
&& PositionUtils.getApproxLineCount(textBlock) < 2.9//
|
||||||
&& (tableOrFigureMatcher.reset().find() || (headlineWithSingleIdentifierMatcher.reset().find() && listIdentifiers.isEmpty())) //
|
&& (tableOrFigureMatcher.reset().find() || (headlineWithSingleIdentifierMatcher.reset().find() && listIdentifiers.isEmpty())) //
|
||||||
|
&& tableMidSentenceMatcher.reset().results()
|
||||||
|
.count() <= 1 //
|
||||||
&& !isAmount//
|
&& !isAmount//
|
||||||
&& !headlineWithSlashesMatches) {
|
&& !headlineWithSlashesMatches) {
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user