RED-9194: roll back single digit headline change
This commit is contained in:
parent
859dba2ecf
commit
570a348a77
@ -24,9 +24,9 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
public class DocuMineClassificationService {
|
public class DocuMineClassificationService {
|
||||||
|
|
||||||
private static final Pattern HEADLINE_WITH_IDENTIFER_PATTERN = Pattern.compile("^([1-9]\\d?\\.){0,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
|
private static final Pattern HEADLINE_WITH_IDENTIFER_PATTERN = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
|
||||||
private static final Pattern AT_LEAST_3_PATTERN = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
|
private static final Pattern AT_LEAST_3_PATTERN = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
|
||||||
private static final Pattern HEADLINE_PATTTERN_WITH_SLASHES = Pattern.compile("^(\\d{1,1}\\.){0,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
|
private static final Pattern HEADLINE_PATTTERN_WITH_SLASHES = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
|
||||||
|
|
||||||
|
|
||||||
public void classifyDocument(ClassificationDocument document) {
|
public void classifyDocument(ClassificationDocument document) {
|
||||||
@ -43,7 +43,10 @@ public class DocuMineClassificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void classifyPage(HeadlineClassificationService headlineClassificationService, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
|
private void classifyPage(HeadlineClassificationService headlineClassificationService,
|
||||||
|
ClassificationPage page,
|
||||||
|
ClassificationDocument document,
|
||||||
|
List<Float> headlineFontSizes) {
|
||||||
|
|
||||||
for (AbstractPageBlock textBlock : page.getTextBlocks()) {
|
for (AbstractPageBlock textBlock : page.getTextBlocks()) {
|
||||||
if (textBlock instanceof TextPageBlock) {
|
if (textBlock instanceof TextPageBlock) {
|
||||||
@ -53,7 +56,11 @@ public class DocuMineClassificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void classifyBlock(HeadlineClassificationService headlineClassificationService, TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
|
private void classifyBlock(HeadlineClassificationService headlineClassificationService,
|
||||||
|
TextPageBlock textBlock,
|
||||||
|
ClassificationPage page,
|
||||||
|
ClassificationDocument document,
|
||||||
|
List<Float> headlineFontSizes) {
|
||||||
|
|
||||||
log.debug("headlineFontSizes: {}", headlineFontSizes);
|
log.debug("headlineFontSizes: {}", headlineFontSizes);
|
||||||
var bodyTextFrame = page.getBodyTextFrame();
|
var bodyTextFrame = page.getBodyTextFrame();
|
||||||
@ -70,10 +77,10 @@ public class DocuMineClassificationService {
|
|||||||
textBlock.setClassification(PageBlockType.OTHER);
|
textBlock.setClassification(PageBlockType.OTHER);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || (PositionUtils.isOverBodyTextFrame(bodyTextFrame,
|
||||||
|| (PositionUtils.isOverBodyTextFrame(bodyTextFrame,
|
|
||||||
textBlock,
|
textBlock,
|
||||||
page.getRotation()) && (document.getFontSizeCounter().getMostPopular()
|
page.getRotation()) && (
|
||||||
|
document.getFontSizeCounter().getMostPopular()
|
||||||
== null
|
== null
|
||||||
|| textBlock.getHighestFontSize()
|
|| textBlock.getHighestFontSize()
|
||||||
<= document.getFontSizeCounter()
|
<= document.getFontSizeCounter()
|
||||||
@ -101,20 +108,23 @@ public class DocuMineClassificationService {
|
|||||||
|| textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular())
|
|| textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular())
|
||||||
&& PositionUtils.getApproxLineCount(textBlock) < 5.9
|
&& PositionUtils.getApproxLineCount(textBlock) < 5.9
|
||||||
|
|
||||||
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && atLeast3Matcher.reset().find() && !textBlock.toString()
|
&& (textBlock.getMostPopularWordStyle().contains("bold")
|
||||||
.contains(":")
|
&& Character.isDigit(textBlock.toString().charAt(0))
|
||||||
|| textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && atLeast3Matcher.reset().find() && !textBlock.toString().contains(":")
|
&& atLeast3Matcher.reset().find()
|
||||||
|| textBlock.toString().startsWith("APPENDIX")
|
&& !textBlock.toString().contains(":") || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT))
|
||||||
|| textBlock.toString().startsWith("FIGURE")
|
&& atLeast3Matcher.reset().find()
|
||||||
|| textBlock.toString().startsWith("Continued TABLE")
|
&& !textBlock.toString().contains(":") || textBlock.toString().startsWith("APPENDIX") || textBlock.toString()
|
||||||
|| textBlock.toString().startsWith("TABLE"))
|
.startsWith("FIGURE") || textBlock.toString().startsWith("Continued TABLE") || textBlock.toString().startsWith("TABLE"))
|
||||||
&& !textBlock.toString().endsWith(":")
|
&& !textBlock.toString().endsWith(":")
|
||||||
&& atLeast3Matcher.reset().find()) {
|
&& atLeast3Matcher.reset().find()) {
|
||||||
PageBlockType headlineType = PageBlockType.getHeadlineType(1);
|
PageBlockType headlineType = PageBlockType.getHeadlineType(1);
|
||||||
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
||||||
document.setHeadlines(true);
|
document.setHeadlines(true);
|
||||||
|
|
||||||
} else if (headlineWithIdentifierMatcher.reset().find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && atLeast3Matcher.reset().find() && !headlineWithSlashesMatcher.reset().matches()) {
|
} else if (headlineWithIdentifierMatcher.reset().find()
|
||||||
|
&& PositionUtils.getApproxLineCount(textBlock) < 2.9
|
||||||
|
&& atLeast3Matcher.reset().find()
|
||||||
|
&& !headlineWithSlashesMatcher.reset().matches()) {
|
||||||
PageBlockType headlineType = PageBlockType.getHeadlineType(2);
|
PageBlockType headlineType = PageBlockType.getHeadlineType(2);
|
||||||
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
||||||
document.setHeadlines(true);
|
document.setHeadlines(true);
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user