Merge branch 'RED-9194' into 'main'
RED-9194: allow single-digit headline identifiers. See merge request fforesight/layout-parser!168
This commit is contained in:
commit
79795e408a
@ -24,9 +24,9 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@RequiredArgsConstructor
|
||||
public class DocuMineClassificationService {
|
||||
|
||||
private static final Pattern pattern = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
|
||||
private static final Pattern pattern2 = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
|
||||
private static final Pattern pattern3 = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
|
||||
private static final Pattern HEADLINE_WITH_IDENTIFER_PATTERN = Pattern.compile("^([1-9]\\d?\\.){0,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
|
||||
private static final Pattern AT_LEAST_3_PATTERN = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
|
||||
private static final Pattern HEADLINE_PATTTERN_WITH_SLASHES = Pattern.compile("^(\\d{1,1}\\.){0,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
|
||||
|
||||
|
||||
public void classifyDocument(ClassificationDocument document) {
|
||||
@ -58,9 +58,9 @@ public class DocuMineClassificationService {
|
||||
log.debug("headlineFontSizes: {}", headlineFontSizes);
|
||||
var bodyTextFrame = page.getBodyTextFrame();
|
||||
|
||||
Matcher matcher = pattern.matcher(textBlock.toString());
|
||||
Matcher matcher2 = pattern2.matcher(textBlock.toString());
|
||||
Matcher matcher3 = pattern3.matcher(textBlock.toString());
|
||||
Matcher headlineWithIdentifierMatcher = HEADLINE_WITH_IDENTIFER_PATTERN.matcher(textBlock.toString());
|
||||
Matcher atLeast3Matcher = AT_LEAST_3_PATTERN.matcher(textBlock.toString());
|
||||
Matcher headlineWithSlashesMatcher = HEADLINE_PATTTERN_WITH_SLASHES.matcher(textBlock.toString());
|
||||
|
||||
if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
|
||||
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
|
||||
@ -101,19 +101,20 @@ public class DocuMineClassificationService {
|
||||
|| textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular())
|
||||
&& PositionUtils.getApproxLineCount(textBlock) < 5.9
|
||||
|
||||
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && matcher2.reset().find() && !textBlock.toString()
|
||||
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && atLeast3Matcher.reset().find() && !textBlock.toString()
|
||||
.contains(":")
|
||||
|| textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && matcher2.reset().find() && !textBlock.toString().contains(":")
|
||||
|| textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && atLeast3Matcher.reset().find() && !textBlock.toString().contains(":")
|
||||
|| textBlock.toString().startsWith("APPENDIX")
|
||||
|| textBlock.toString().startsWith("FIGURE")
|
||||
|| textBlock.toString().startsWith("Continued TABLE")
|
||||
|| textBlock.toString().startsWith("TABLE"))
|
||||
&& !textBlock.toString().endsWith(":")
|
||||
&& matcher2.reset().find()) {
|
||||
&& atLeast3Matcher.reset().find()) {
|
||||
PageBlockType headlineType = PageBlockType.getHeadlineType(1);
|
||||
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
||||
document.setHeadlines(true);
|
||||
|
||||
} else if (matcher.reset().find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.reset().find() && !matcher3.reset().matches()) {
|
||||
} else if (headlineWithIdentifierMatcher.reset().find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && atLeast3Matcher.reset().find() && !headlineWithSlashesMatcher.reset().matches()) {
|
||||
PageBlockType headlineType = PageBlockType.getHeadlineType(2);
|
||||
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
||||
document.setHeadlines(true);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user