From 33a4562938a3e7468c9611de78264d9c430da17a Mon Sep 17 00:00:00 2001 From: deiflaender Date: Tue, 18 Jul 2023 15:24:01 +0200 Subject: [PATCH] DM-307 Improved headline detection --- .../service/DocuMineClassificationService.java | 10 +++++----- .../document/factory/SectionNodeFactory.java | 5 +++-- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/DocuMineClassificationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/DocuMineClassificationService.java index 35ca3899..86214dea 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/DocuMineClassificationService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/classification/service/DocuMineClassificationService.java @@ -59,8 +59,8 @@ public class DocuMineClassificationService implements ClassificationService { log.debug("headlineFontSizes: {}", headlineFontSizes); var bodyTextFrame = page.getBodyTextFrame(); - var pattern = Patterns.getCompiledPattern("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z]{2,50}", true); - var pattern2 = Patterns.getCompiledPattern(".*\\d$", true); + var pattern = Patterns.getCompiledPattern("^(\\d{1,1}\\.?){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z\\[\\]\\-]{2,50}", true); + var pattern2 = Patterns.getCompiledPattern(".*\\d{4}$", true); var pattern3 = Patterns.getCompiledPattern("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*", false); Matcher matcher = pattern.matcher(textBlock.toString()); @@ -71,16 +71,16 @@ public class DocuMineClassificationService implements ClassificationService { textBlock.setClassification(PageBlockType.OTHER); return; } - if (textBlock.getText().length() > 5 && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter() + if (textBlock.getText().length() > 6 && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter() .getMostPopular() || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) && PositionUtils.getApproxLineCount(textBlock) < 5.9 && (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && !matcher2.matches() && !textBlock.toString() .contains(":") || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && !matcher2.matches() && !textBlock.toString() - .contains(":") || textBlock.toString().startsWith("APPENDIX") || textBlock.toString().startsWith("TABLE")) && !textBlock.toString().endsWith(":")) { + .contains(":") || textBlock.toString().startsWith("APPENDIX") || textBlock.toString().startsWith("FIGURE") || textBlock.toString().startsWith("TABLE")) && !textBlock.toString().endsWith(":")) { textBlock.setClassification(PageBlockType.getHeadlineType(1)); document.setHeadlines(true); - } else if (matcher.find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && !matcher2.matches() && !matcher3.matches()) { + } else if (matcher.find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && !matcher3.matches() && !matcher2.matches()) { textBlock.setClassification(PageBlockType.getHeadlineType(2)); document.setHeadlines(true); } else if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/factory/SectionNodeFactory.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/factory/SectionNodeFactory.java index 6e760b9b..1360e089 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/factory/SectionNodeFactory.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/layoutparsing/document/factory/SectionNodeFactory.java @@ -82,7 +82,8 @@ public class SectionNodeFactory { if (abstractPageBlock instanceof TextPageBlock) { // List textBlocksToMerge = findTextBlocksWithSameClassificationAndAlignsYAndSameOrientationUntilConvergence((TextPageBlock) abstractPageBlock, remainingBlocks); - List textBlocksToMerge = findTextBlocksWithSameClassificationAndAlignsYAndSameOrientation(List.of((TextPageBlock) abstractPageBlock), remainingBlocks); + List textBlocksToMerge = findTextBlocksWithSameClassificationAndAlignsYAndSameOrientation(List.of((TextPageBlock) abstractPageBlock), + remainingBlocks); alreadyMerged.addAll(textBlocksToMerge); DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocksToMerge); } else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) { @@ -125,7 +126,7 @@ public class SectionNodeFactory { List previousList = splitList.get(i - 1); AbstractPageBlock lastPageBlockInPreviousList = previousList.get(previousList.size() - 1); if (lastPageBlockInPreviousList.isHeadline()) { - previousList.remove(i - 1); + previousList.remove(previousList.size() - 1); splitList.get(i).add(0, lastPageBlockInPreviousList); } } -- 2.47.2