RED-9974: wip

This commit is contained in:
Kilian Schuettler 2024-09-13 14:03:13 +02:00 committed by Dominique Eifländer
parent 1337c56591
commit 95e6fdecd7
3 changed files with 55 additions and 26 deletions

View File

@ -104,6 +104,9 @@ public class DocuMineClassificationService {
boolean isTocItem = textBlock.getText().contains(".............."); boolean isTocItem = textBlock.getText().contains("..............");
boolean headlineWithSlashesMatches = headlineWithSlashesMatcher.reset().matches(); boolean headlineWithSlashesMatches = headlineWithSlashesMatcher.reset().matches();
boolean isAmount = amountMatcher.reset().find(); boolean isAmount = amountMatcher.reset().find();
int charCount = countChars(textBlock);
boolean enoughChars = charCount > textBlock.getText().length() * 0.5;
if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) { if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
headlineClassificationService.setLastHeadlineFromOutline(textBlock); headlineClassificationService.setLastHeadlineFromOutline(textBlock);
@ -150,15 +153,17 @@ public class DocuMineClassificationService {
&& !textBlock.toString().endsWith(":") && !textBlock.toString().endsWith(":")
&& isAtLeast3Characters && isAtLeast3Characters
&& !isTocItem && !isTocItem
&& !isAmount) { && !isAmount
&& enoughChars) {
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes); setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
} else if (isAllCaps(textBlock) // } else if (isAllCaps(textBlock)
&& textBlock.getText().length() > 5 // && textBlock.getText().length() > 5
&& isAtLeast3Characters // && isAtLeast3Characters
&& !isAmount// && !isAmount
&& !textBlock.toString().contains(":") // && enoughChars
&& !textBlock.toString().startsWith("(") // && !textBlock.toString().contains(":")
&& !textBlock.toString().startsWith("(")
&& PositionUtils.getApproxLineCount(textBlock) < 2.9) { && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes); setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
@ -167,17 +172,18 @@ public class DocuMineClassificationService {
&& isAtLeast3Characters && isAtLeast3Characters
&& !headlineWithSlashesMatches && !headlineWithSlashesMatches
&& !isAmount && !isAmount
&& !isTocItem) { && !isTocItem
&& enoughChars) {
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes); setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
} else if (!isTocItem // } else if (!isTocItem
&& hasSeparation(textBlock, surroundingBlocks) // && hasSeparation(textBlock, surroundingBlocks)
&& (textBlock.getMostPopularWordHeight() >= page.getTextHeightCounter().getMostPopular() // && greaterOrEqualThanFontPageAverage(textBlock, page)
|| textBlock.getMostPopularWordFontSize() >= page.getFontSizeCounter().getMostPopular()) // && PositionUtils.getApproxLineCount(textBlock) < 2.9
&& PositionUtils.getApproxLineCount(textBlock) < 2.9 // && (tableOrFigureMatcher.reset().find() || headlineWithSingleIdentifierMatcher.reset().find())
&& (tableOrFigureMatcher.reset().find() || headlineWithSingleIdentifierMatcher.reset().find()) // && !isAmount
&& !isAmount // && !headlineWithSlashesMatches
&& !headlineWithSlashesMatches) { && enoughChars) {
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes); setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
@ -205,6 +211,26 @@ public class DocuMineClassificationService {
} }
/**
 * Counts the alphabetic characters in the text of the given block.
 *
 * <p>Every UTF-16 {@code char} of {@code textBlock.getText()} is tested with
 * {@link Character#isAlphabetic(int)}; chars for which that check is false
 * (for example ASCII digits, dots or spaces) do not contribute to the result.
 *
 * @param textBlock the block whose text is inspected
 * @return the number of chars in the block's text that are alphabetic
 */
private int countChars(TextPageBlock textBlock) {

    // Read the text once instead of re-fetching it on every loop iteration.
    String text = textBlock.getText();
    int length = text.length();

    int count = 0;
    for (int i = 0; i < length; i++) {
        if (Character.isAlphabetic(text.charAt(i))) {
            count++;
        }
    }
    return count;
}
/**
 * Checks whether the block's dominant text size reaches the page's dominant text size.
 *
 * <p>The word-height comparison is made first; the font-size comparison is only
 * evaluated when the word-height comparison does not already succeed.
 *
 * @param textBlock the block whose most popular word height and word font size are read
 * @param page      the page whose most popular text height and font size are the reference
 * @return {@code true} if the block's most popular word height is at least the page's most
 *         popular text height, or its most popular word font size is at least the page's
 *         most popular font size
 */
private static boolean greaterOrEqualThanFontPageAverage(TextPageBlock textBlock, ClassificationPage page) {

    if (textBlock.getMostPopularWordHeight() >= page.getTextHeightCounter().getMostPopular()) {
        return true;
    }
    return textBlock.getMostPopularWordFontSize() >= page.getFontSizeCounter().getMostPopular();
}
private static boolean isAllCaps(TextPageBlock textBlock) { private static boolean isAllCaps(TextPageBlock textBlock) {
return textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)); return textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT));

View File

@ -289,7 +289,7 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
new int[]{code}, new int[]{code},
font, font,
fontSize, fontSize,
(int) (fontSize * textMatrix.getScalingFactorX()))); (int) (fontSize * textMatrix.getScalingFactorX() * textMatrix.getScalingFactorY())));
processTextPosition(new TextPosition(pageRotation, processTextPosition(new TextPosition(pageRotation,
pageSize.getWidth(), pageSize.getWidth(),
pageSize.getHeight(), pageSize.getHeight(),
@ -303,7 +303,7 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
new int[]{code}, new int[]{code},
font, font,
fontSize, fontSize,
(int) (fontSize * textMatrix.getScalingFactorX()))); (int) (fontSize * textMatrix.getScalingFactorX() * textMatrix.getScalingFactorY())));
} else { } else {
processTextPosition(new TextPosition(pageRotation, processTextPosition(new TextPosition(pageRotation,
@ -319,7 +319,7 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
new int[]{code}, new int[]{code},
font, font,
fontSize, fontSize,
(int) (fontSize * textMatrix.getScalingFactorX()))); (int) (fontSize * textMatrix.getScalingFactorX() * textMatrix.getScalingFactorY())));
} }
} }

View File

@ -14,8 +14,6 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextB
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils; import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2; import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
@ -102,11 +100,16 @@ public class TextPositionOperations {
double normalizedVerticalDistance = Math.abs(sequence.getBBoxDirAdj().getCenterY() - sequence2.getBBoxDirAdj().getCenterY()) / maxLineDistance; double normalizedVerticalDistance = Math.abs(sequence.getBBoxDirAdj().getCenterY() - sequence2.getBBoxDirAdj().getCenterY()) / maxLineDistance;
double normalizedHorizontalDistance = Math.abs(sequence.getBBoxDirAdj().getCenterX() - sequence2.getBBoxDirAdj().getCenterX()) / maxXGap; double normalizedHorizontalDistance = Math.abs(sequence.getBBoxDirAdj().getCenterX() - sequence2.getBBoxDirAdj().getCenterX()) / maxXGap;
if (sequence.getDir() != sequence2.getDir() if (sequence.getDir() != sequence2.getDir()) {
|| Math.abs(sequence.getFontSize() - sequence2.getFontSize()) > 0.5 * Math.min(sequence.getFontSize(), continue;
sequence2.getFontSize()) }
|| Math.pow(normalizedVerticalDistance, 2) + Math.pow(normalizedHorizontalDistance, 2) > 1 if (Math.abs(sequence.getFontSize() - sequence2.getFontSize()) > 0.5 * Math.max(sequence.getFontSize(), sequence2.getFontSize())) {
|| !ANGLE_FILTER.matches(angle)) { continue;
}
if (Math.pow(normalizedVerticalDistance, 2) + Math.pow(normalizedHorizontalDistance, 2) > 1) {
continue;
}
if (!ANGLE_FILTER.matches(angle)) {
continue; continue;
} }