Merge branch 'DM-307' into 'master'
DM-307: Improved headline detection. Closes DM-307. See merge request redactmanager/redaction-service!56.
This commit is contained in:
commit
3bc11b7d33
@ -59,8 +59,8 @@ public class DocuMineClassificationService implements ClassificationService {
|
|||||||
log.debug("headlineFontSizes: {}", headlineFontSizes);
|
log.debug("headlineFontSizes: {}", headlineFontSizes);
|
||||||
var bodyTextFrame = page.getBodyTextFrame();
|
var bodyTextFrame = page.getBodyTextFrame();
|
||||||
|
|
||||||
var pattern = Patterns.getCompiledPattern("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z]{2,50}", true);
|
var pattern = Patterns.getCompiledPattern("^(\\d{1,1}\\.?){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z\\[\\]\\-]{2,50}", true);
|
||||||
var pattern2 = Patterns.getCompiledPattern(".*\\d$", true);
|
var pattern2 = Patterns.getCompiledPattern(".*\\d{4}$", true);
|
||||||
var pattern3 = Patterns.getCompiledPattern("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*", false);
|
var pattern3 = Patterns.getCompiledPattern("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*", false);
|
||||||
|
|
||||||
Matcher matcher = pattern.matcher(textBlock.toString());
|
Matcher matcher = pattern.matcher(textBlock.toString());
|
||||||
@ -71,16 +71,16 @@ public class DocuMineClassificationService implements ClassificationService {
|
|||||||
textBlock.setClassification(PageBlockType.OTHER);
|
textBlock.setClassification(PageBlockType.OTHER);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (textBlock.getText().length() > 5 && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter()
|
if (textBlock.getText().length() > 6 && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter()
|
||||||
.getMostPopular() || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) && PositionUtils.getApproxLineCount(textBlock) < 5.9
|
.getMostPopular() || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) && PositionUtils.getApproxLineCount(textBlock) < 5.9
|
||||||
|
|
||||||
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && !matcher2.matches() && !textBlock.toString()
|
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && !matcher2.matches() && !textBlock.toString()
|
||||||
.contains(":") || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && !matcher2.matches() && !textBlock.toString()
|
.contains(":") || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && !matcher2.matches() && !textBlock.toString()
|
||||||
.contains(":") || textBlock.toString().startsWith("APPENDIX") || textBlock.toString().startsWith("TABLE")) && !textBlock.toString().endsWith(":")) {
|
.contains(":") || textBlock.toString().startsWith("APPENDIX") || textBlock.toString().startsWith("FIGURE") || textBlock.toString().startsWith("TABLE")) && !textBlock.toString().endsWith(":")) {
|
||||||
textBlock.setClassification(PageBlockType.getHeadlineType(1));
|
textBlock.setClassification(PageBlockType.getHeadlineType(1));
|
||||||
document.setHeadlines(true);
|
document.setHeadlines(true);
|
||||||
|
|
||||||
} else if (matcher.find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && !matcher2.matches() && !matcher3.matches()) {
|
} else if (matcher.find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && !matcher3.matches() && !matcher2.matches()) {
|
||||||
textBlock.setClassification(PageBlockType.getHeadlineType(2));
|
textBlock.setClassification(PageBlockType.getHeadlineType(2));
|
||||||
document.setHeadlines(true);
|
document.setHeadlines(true);
|
||||||
} else if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
} else if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
||||||
|
|||||||
@ -82,7 +82,8 @@ public class SectionNodeFactory {
|
|||||||
|
|
||||||
if (abstractPageBlock instanceof TextPageBlock) {
|
if (abstractPageBlock instanceof TextPageBlock) {
|
||||||
// List<TextPageBlock> textBlocksToMerge = findTextBlocksWithSameClassificationAndAlignsYAndSameOrientationUntilConvergence((TextPageBlock) abstractPageBlock, remainingBlocks);
|
// List<TextPageBlock> textBlocksToMerge = findTextBlocksWithSameClassificationAndAlignsYAndSameOrientationUntilConvergence((TextPageBlock) abstractPageBlock, remainingBlocks);
|
||||||
List<TextPageBlock> textBlocksToMerge = findTextBlocksWithSameClassificationAndAlignsYAndSameOrientation(List.of((TextPageBlock) abstractPageBlock), remainingBlocks);
|
List<TextPageBlock> textBlocksToMerge = findTextBlocksWithSameClassificationAndAlignsYAndSameOrientation(List.of((TextPageBlock) abstractPageBlock),
|
||||||
|
remainingBlocks);
|
||||||
alreadyMerged.addAll(textBlocksToMerge);
|
alreadyMerged.addAll(textBlocksToMerge);
|
||||||
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocksToMerge);
|
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocksToMerge);
|
||||||
} else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) {
|
} else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) {
|
||||||
@ -125,7 +126,7 @@ public class SectionNodeFactory {
|
|||||||
List<AbstractPageBlock> previousList = splitList.get(i - 1);
|
List<AbstractPageBlock> previousList = splitList.get(i - 1);
|
||||||
AbstractPageBlock lastPageBlockInPreviousList = previousList.get(previousList.size() - 1);
|
AbstractPageBlock lastPageBlockInPreviousList = previousList.get(previousList.size() - 1);
|
||||||
if (lastPageBlockInPreviousList.isHeadline()) {
|
if (lastPageBlockInPreviousList.isHeadline()) {
|
||||||
previousList.remove(i - 1);
|
previousList.remove(previousList.size() - 1);
|
||||||
splitList.get(i).add(0, lastPageBlockInPreviousList);
|
splitList.get(i).add(0, lastPageBlockInPreviousList);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user