RED-9974: wip

This commit is contained in:
Kilian Schuettler 2024-09-13 14:03:13 +02:00 committed by Dominique Eifländer
parent 1337c56591
commit 95e6fdecd7
3 changed files with 55 additions and 26 deletions

View File

@ -104,6 +104,9 @@ public class DocuMineClassificationService {
boolean isTocItem = textBlock.getText().contains("..............");
boolean headlineWithSlashesMatches = headlineWithSlashesMatcher.reset().matches();
boolean isAmount = amountMatcher.reset().find();
int charCount = countChars(textBlock);
boolean enoughChars = charCount > textBlock.getText().length() * 0.5;
if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
@ -150,15 +153,17 @@ public class DocuMineClassificationService {
&& !textBlock.toString().endsWith(":")
&& isAtLeast3Characters
&& !isTocItem
&& !isAmount) {
&& !isAmount
&& enoughChars) {
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
} else if (isAllCaps(textBlock) //
&& textBlock.getText().length() > 5 //
&& isAtLeast3Characters //
&& !isAmount//
&& !textBlock.toString().contains(":") //
&& !textBlock.toString().startsWith("(") //
} else if (isAllCaps(textBlock)
&& textBlock.getText().length() > 5
&& isAtLeast3Characters
&& !isAmount
&& enoughChars
&& !textBlock.toString().contains(":")
&& !textBlock.toString().startsWith("(")
&& PositionUtils.getApproxLineCount(textBlock) < 2.9) {
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
@ -167,17 +172,18 @@ public class DocuMineClassificationService {
&& isAtLeast3Characters
&& !headlineWithSlashesMatches
&& !isAmount
&& !isTocItem) {
&& !isTocItem
&& enoughChars) {
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
} else if (!isTocItem //
&& hasSeparation(textBlock, surroundingBlocks) //
&& (textBlock.getMostPopularWordHeight() >= page.getTextHeightCounter().getMostPopular() //
|| textBlock.getMostPopularWordFontSize() >= page.getFontSizeCounter().getMostPopular()) //
&& PositionUtils.getApproxLineCount(textBlock) < 2.9 //
&& (tableOrFigureMatcher.reset().find() || headlineWithSingleIdentifierMatcher.reset().find()) //
&& !isAmount //
&& !headlineWithSlashesMatches) {
} else if (!isTocItem
&& hasSeparation(textBlock, surroundingBlocks)
&& greaterOrEqualThanFontPageAverage(textBlock, page)
&& PositionUtils.getApproxLineCount(textBlock) < 2.9
&& (tableOrFigureMatcher.reset().find() || headlineWithSingleIdentifierMatcher.reset().find())
&& !isAmount
&& !headlineWithSlashesMatches
&& enoughChars) {
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
@ -205,6 +211,26 @@ public class DocuMineClassificationService {
}
/**
 * Counts the alphabetic characters in the text of the given block.
 *
 * <p>Every UTF-16 {@code char} of {@code textBlock.getText()} is checked with
 * {@link Character#isAlphabetic(int)}; digits, punctuation and whitespace are not counted.
 *
 * @param textBlock the block whose text is inspected
 * @return the number of chars in the block's text that are alphabetic
 */
private int countChars(TextPageBlock textBlock) {

    // Read the text once; the loop previously re-fetched it for every bound check and every charAt.
    String text = textBlock.getText();
    int count = 0;
    for (int i = 0; i < text.length(); i++) {
        if (Character.isAlphabetic(text.charAt(i))) {
            count++;
        }
    }
    return count;
}
/**
 * Checks whether the block's dominant text size reaches the page's dominant text size.
 *
 * <p>The comparison is made against the page's most popular values (not an arithmetic mean):
 * first the word height, and only if that check fails, the word font size.
 *
 * @param textBlock the block under classification
 * @param page      the page supplying the text-height and font-size counters
 * @return {@code true} if the block's most popular word height is at least the page's most popular
 *         text height, or its most popular word font size is at least the page's most popular font size
 */
private static boolean greaterOrEqualThanFontPageAverage(TextPageBlock textBlock, ClassificationPage page) {

    // Height check first; the font-size getters are only evaluated when this one fails.
    if (textBlock.getMostPopularWordHeight() >= page.getTextHeightCounter().getMostPopular()) {
        return true;
    }
    return textBlock.getMostPopularWordFontSize() >= page.getFontSizeCounter().getMostPopular();
}
private static boolean isAllCaps(TextPageBlock textBlock) {
return textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT));

View File

@ -289,7 +289,7 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
new int[]{code},
font,
fontSize,
(int) (fontSize * textMatrix.getScalingFactorX())));
(int) (fontSize * textMatrix.getScalingFactorX() * textMatrix.getScalingFactorY())));
processTextPosition(new TextPosition(pageRotation,
pageSize.getWidth(),
pageSize.getHeight(),
@ -303,7 +303,7 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
new int[]{code},
font,
fontSize,
(int) (fontSize * textMatrix.getScalingFactorX())));
(int) (fontSize * textMatrix.getScalingFactorX() * textMatrix.getScalingFactorY())));
} else {
processTextPosition(new TextPosition(pageRotation,
@ -319,7 +319,7 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
new int[]{code},
font,
fontSize,
(int) (fontSize * textMatrix.getScalingFactorX())));
(int) (fontSize * textMatrix.getScalingFactorX() * textMatrix.getScalingFactorY())));
}
}

View File

@ -14,8 +14,6 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextB
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
@ -102,11 +100,16 @@ public class TextPositionOperations {
double normalizedVerticalDistance = Math.abs(sequence.getBBoxDirAdj().getCenterY() - sequence2.getBBoxDirAdj().getCenterY()) / maxLineDistance;
double normalizedHorizontalDistance = Math.abs(sequence.getBBoxDirAdj().getCenterX() - sequence2.getBBoxDirAdj().getCenterX()) / maxXGap;
if (sequence.getDir() != sequence2.getDir()
|| Math.abs(sequence.getFontSize() - sequence2.getFontSize()) > 0.5 * Math.min(sequence.getFontSize(),
sequence2.getFontSize())
|| Math.pow(normalizedVerticalDistance, 2) + Math.pow(normalizedHorizontalDistance, 2) > 1
|| !ANGLE_FILTER.matches(angle)) {
if (sequence.getDir() != sequence2.getDir()) {
continue;
}
if (Math.abs(sequence.getFontSize() - sequence2.getFontSize()) > 0.5 * Math.max(sequence.getFontSize(), sequence2.getFontSize())) {
continue;
}
if (Math.pow(normalizedVerticalDistance, 2) + Math.pow(normalizedHorizontalDistance, 2) > 1) {
continue;
}
if (!ANGLE_FILTER.matches(angle)) {
continue;
}