diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java index 28202f7..e73dc0c 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java @@ -104,6 +104,9 @@ public class DocuMineClassificationService { boolean isTocItem = textBlock.getText().contains(".............."); boolean headlineWithSlashesMatches = headlineWithSlashesMatcher.reset().matches(); boolean isAmount = amountMatcher.reset().find(); + int charCount = countChars(textBlock); + + boolean enoughChars = charCount > textBlock.getText().length() * 0.5; if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) { headlineClassificationService.setLastHeadlineFromOutline(textBlock); @@ -150,15 +153,17 @@ public class DocuMineClassificationService { && !textBlock.toString().endsWith(":") && isAtLeast3Characters && !isTocItem - && !isAmount) { + && !isAmount + && enoughChars) { setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes); - } else if (isAllCaps(textBlock) // - && textBlock.getText().length() > 5 // - && isAtLeast3Characters // - && !isAmount// - && !textBlock.toString().contains(":") // - && !textBlock.toString().startsWith("(") // + } else if (isAllCaps(textBlock) + && textBlock.getText().length() > 5 + && isAtLeast3Characters + && !isAmount + && enoughChars + && !textBlock.toString().contains(":") + && !textBlock.toString().startsWith("(") && PositionUtils.getApproxLineCount(textBlock) < 2.9) { setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes); @@ -167,17 +172,18 @@ public class DocuMineClassificationService { && isAtLeast3Characters && !headlineWithSlashesMatches && !isAmount - && !isTocItem) { + && !isTocItem + && enoughChars) { setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes); - } else if (!isTocItem // - && hasSeparation(textBlock, surroundingBlocks) // - && (textBlock.getMostPopularWordHeight() >= page.getTextHeightCounter().getMostPopular() // - || textBlock.getMostPopularWordFontSize() >= page.getFontSizeCounter().getMostPopular()) // - && PositionUtils.getApproxLineCount(textBlock) < 2.9 // - && (tableOrFigureMatcher.reset().find() || headlineWithSingleIdentifierMatcher.reset().find()) // - && !isAmount // - && !headlineWithSlashesMatches) { + } else if (!isTocItem + && hasSeparation(textBlock, surroundingBlocks) + && greaterOrEqualThanFontPageAverage(textBlock, page) + && PositionUtils.getApproxLineCount(textBlock) < 2.9 + && (tableOrFigureMatcher.reset().find() || headlineWithSingleIdentifierMatcher.reset().find()) + && !isAmount + && !headlineWithSlashesMatches + && enoughChars) { setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes); } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) @@ -205,6 +211,26 @@ public class DocuMineClassificationService { } + private int countChars(TextPageBlock textBlock) { + + int count = 0; + + for (int i = 0; i < textBlock.getText().length(); i++) { + if (Character.isAlphabetic(textBlock.getText().charAt(i))) { + count++; + } + } + return count; + } + + + private static boolean greaterOrEqualThanFontPageAverage(TextPageBlock textBlock, ClassificationPage page) { + + return textBlock.getMostPopularWordHeight() >= page.getTextHeightCounter().getMostPopular() // + || textBlock.getMostPopularWordFontSize() >= page.getFontSizeCounter().getMostPopular(); + } + + private static boolean isAllCaps(TextPageBlock textBlock) { return textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/LegacyPDFStreamEngine.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/LegacyPDFStreamEngine.java index 560ea93..e77b9e0 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/LegacyPDFStreamEngine.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/LegacyPDFStreamEngine.java @@ -289,7 +289,7 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine { new int[]{code}, font, fontSize, - (int) (fontSize * textMatrix.getScalingFactorX()))); + (int) (fontSize * textMatrix.getScalingFactorX() * textMatrix.getScalingFactorY()))); processTextPosition(new TextPosition(pageRotation, pageSize.getWidth(), pageSize.getHeight(), @@ -303,7 +303,7 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine { new int[]{code}, font, fontSize, - (int) (fontSize * textMatrix.getScalingFactorX()))); + (int) (fontSize * textMatrix.getScalingFactorX() * textMatrix.getScalingFactorY()))); } else { processTextPosition(new TextPosition(pageRotation, @@ -319,7 +319,7 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine { new int[]{code}, font, fontSize, - (int) (fontSize * textMatrix.getScalingFactorX()))); + (int) (fontSize * textMatrix.getScalingFactorX() * textMatrix.getScalingFactorY()))); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java index bcd9f21..7d27ee5 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionOperations.java @@ -14,8 +14,6 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextB import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind; import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils; import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2; -import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; -import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; @@ -102,11 +100,16 @@ public class TextPositionOperations { double normalizedVerticalDistance = Math.abs(sequence.getBBoxDirAdj().getCenterY() - sequence2.getBBoxDirAdj().getCenterY()) / maxLineDistance; double normalizedHorizontalDistance = Math.abs(sequence.getBBoxDirAdj().getCenterX() - sequence2.getBBoxDirAdj().getCenterX()) / maxXGap; - if (sequence.getDir() != sequence2.getDir() - || Math.abs(sequence.getFontSize() - sequence2.getFontSize()) > 0.5 * Math.min(sequence.getFontSize(), - sequence2.getFontSize()) - || Math.pow(normalizedVerticalDistance, 2) + Math.pow(normalizedHorizontalDistance, 2) > 1 - || !ANGLE_FILTER.matches(angle)) { + if (sequence.getDir() != sequence2.getDir()) { + continue; + } + if (Math.abs(sequence.getFontSize() - sequence2.getFontSize()) > 0.5 * Math.max(sequence.getFontSize(), sequence2.getFontSize())) { + continue; + } + if (Math.pow(normalizedVerticalDistance, 2) + Math.pow(normalizedHorizontalDistance, 2) > 1) { + continue; + } + if (!ANGLE_FILTER.matches(angle)) { continue; }