Merge branch 'RED-9194' into 'main'

RED-9194: roll back single-digit headline change

See merge request fforesight/layout-parser!171
This commit is contained in:
Kilian Schüttler 2024-06-21 15:13:40 +02:00
commit b216f02e15

View File

@ -24,9 +24,9 @@ import lombok.extern.slf4j.Slf4j;
@RequiredArgsConstructor
public class DocuMineClassificationService {
private static final Pattern HEADLINE_WITH_IDENTIFER_PATTERN = Pattern.compile("^([1-9]\\d?\\.){0,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
private static final Pattern HEADLINE_WITH_IDENTIFER_PATTERN = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
private static final Pattern AT_LEAST_3_PATTERN = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
private static final Pattern HEADLINE_PATTTERN_WITH_SLASHES = Pattern.compile("^(\\d{1,1}\\.){0,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
private static final Pattern HEADLINE_PATTTERN_WITH_SLASHES = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
public void classifyDocument(ClassificationDocument document) {
@ -43,7 +43,10 @@ public class DocuMineClassificationService {
}
private void classifyPage(HeadlineClassificationService headlineClassificationService, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
private void classifyPage(HeadlineClassificationService headlineClassificationService,
ClassificationPage page,
ClassificationDocument document,
List<Float> headlineFontSizes) {
for (AbstractPageBlock textBlock : page.getTextBlocks()) {
if (textBlock instanceof TextPageBlock) {
@ -53,7 +56,11 @@ public class DocuMineClassificationService {
}
private void classifyBlock(HeadlineClassificationService headlineClassificationService, TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
private void classifyBlock(HeadlineClassificationService headlineClassificationService,
TextPageBlock textBlock,
ClassificationPage page,
ClassificationDocument document,
List<Float> headlineFontSizes) {
log.debug("headlineFontSizes: {}", headlineFontSizes);
var bodyTextFrame = page.getBodyTextFrame();
@ -70,14 +77,10 @@ public class DocuMineClassificationService {
textBlock.setClassification(PageBlockType.OTHER);
return;
}
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|| (PositionUtils.isOverBodyTextFrame(bodyTextFrame,
textBlock,
page.getRotation()) && (document.getFontSizeCounter().getMostPopular()
== null
|| textBlock.getHighestFontSize()
<= document.getFontSizeCounter()
.getMostPopular()))) {
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) //
|| (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) //
&& (document.getFontSizeCounter().getMostPopular() == null //
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular()))) {
textBlock.setClassification(PageBlockType.HEADER);
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
@ -101,12 +104,14 @@ public class DocuMineClassificationService {
|| textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular())
&& PositionUtils.getApproxLineCount(textBlock) < 5.9
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && atLeast3Matcher.reset().find() && !textBlock.toString()
.contains(":")
|| textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && atLeast3Matcher.reset().find() && !textBlock.toString().contains(":")
|| textBlock.toString().startsWith("APPENDIX")
|| textBlock.toString().startsWith("FIGURE")
|| textBlock.toString().startsWith("Continued TABLE")
&& (textBlock.getMostPopularWordStyle().contains("bold")
&& Character.isDigit(textBlock.toString().charAt(0))
&& atLeast3Matcher.reset().find()
&& !textBlock.toString().contains(":") //
|| textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && atLeast3Matcher.reset().find() && !textBlock.toString().contains(":") //
|| textBlock.toString().startsWith("APPENDIX") //
|| textBlock.toString().startsWith("FIGURE") //
|| textBlock.toString().startsWith("Continued TABLE") //
|| textBlock.toString().startsWith("TABLE"))
&& !textBlock.toString().endsWith(":")
&& atLeast3Matcher.reset().find()) {
@ -114,7 +119,10 @@ public class DocuMineClassificationService {
headlineClassificationService.classifyHeadline(textBlock, headlineType);
document.setHeadlines(true);
} else if (headlineWithIdentifierMatcher.reset().find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && atLeast3Matcher.reset().find() && !headlineWithSlashesMatcher.reset().matches()) {
} else if (headlineWithIdentifierMatcher.reset().find()
&& PositionUtils.getApproxLineCount(textBlock) < 2.9
&& atLeast3Matcher.reset().find()
&& !headlineWithSlashesMatcher.reset().matches()) {
PageBlockType headlineType = PageBlockType.getHeadlineType(2);
headlineClassificationService.classifyHeadline(textBlock, headlineType);
document.setHeadlines(true);