RED-9974: wip
This commit is contained in:
parent
1337c56591
commit
95e6fdecd7
@ -104,6 +104,9 @@ public class DocuMineClassificationService {
|
||||
boolean isTocItem = textBlock.getText().contains("..............");
|
||||
boolean headlineWithSlashesMatches = headlineWithSlashesMatcher.reset().matches();
|
||||
boolean isAmount = amountMatcher.reset().find();
|
||||
int charCount = countChars(textBlock);
|
||||
|
||||
boolean enoughChars = charCount > textBlock.getText().length() * 0.5;
|
||||
|
||||
if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
|
||||
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
|
||||
@ -150,15 +153,17 @@ public class DocuMineClassificationService {
|
||||
&& !textBlock.toString().endsWith(":")
|
||||
&& isAtLeast3Characters
|
||||
&& !isTocItem
|
||||
&& !isAmount) {
|
||||
&& !isAmount
|
||||
&& enoughChars) {
|
||||
|
||||
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
|
||||
} else if (isAllCaps(textBlock) //
|
||||
&& textBlock.getText().length() > 5 //
|
||||
&& isAtLeast3Characters //
|
||||
&& !isAmount//
|
||||
&& !textBlock.toString().contains(":") //
|
||||
&& !textBlock.toString().startsWith("(") //
|
||||
} else if (isAllCaps(textBlock)
|
||||
&& textBlock.getText().length() > 5
|
||||
&& isAtLeast3Characters
|
||||
&& !isAmount
|
||||
&& enoughChars
|
||||
&& !textBlock.toString().contains(":")
|
||||
&& !textBlock.toString().startsWith("(")
|
||||
&& PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
||||
|
||||
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
|
||||
@ -167,17 +172,18 @@ public class DocuMineClassificationService {
|
||||
&& isAtLeast3Characters
|
||||
&& !headlineWithSlashesMatches
|
||||
&& !isAmount
|
||||
&& !isTocItem) {
|
||||
&& !isTocItem
|
||||
&& enoughChars) {
|
||||
|
||||
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
|
||||
} else if (!isTocItem //
|
||||
&& hasSeparation(textBlock, surroundingBlocks) //
|
||||
&& (textBlock.getMostPopularWordHeight() >= page.getTextHeightCounter().getMostPopular() //
|
||||
|| textBlock.getMostPopularWordFontSize() >= page.getFontSizeCounter().getMostPopular()) //
|
||||
&& PositionUtils.getApproxLineCount(textBlock) < 2.9 //
|
||||
&& (tableOrFigureMatcher.reset().find() || headlineWithSingleIdentifierMatcher.reset().find()) //
|
||||
&& !isAmount //
|
||||
&& !headlineWithSlashesMatches) {
|
||||
} else if (!isTocItem
|
||||
&& hasSeparation(textBlock, surroundingBlocks)
|
||||
&& greaterOrEqualThanFontPageAverage(textBlock, page)
|
||||
&& PositionUtils.getApproxLineCount(textBlock) < 2.9
|
||||
&& (tableOrFigureMatcher.reset().find() || headlineWithSingleIdentifierMatcher.reset().find())
|
||||
&& !isAmount
|
||||
&& !headlineWithSlashesMatches
|
||||
&& enoughChars) {
|
||||
|
||||
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
||||
@ -205,6 +211,26 @@ public class DocuMineClassificationService {
|
||||
}
|
||||
|
||||
|
||||
private int countChars(TextPageBlock textBlock) {
|
||||
|
||||
int count = 0;
|
||||
|
||||
for (int i = 0; i < textBlock.getText().length(); i++) {
|
||||
if (Character.isAlphabetic(textBlock.getText().charAt(i))) {
|
||||
count++;
|
||||
}
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
|
||||
private static boolean greaterOrEqualThanFontPageAverage(TextPageBlock textBlock, ClassificationPage page) {
|
||||
|
||||
return textBlock.getMostPopularWordHeight() >= page.getTextHeightCounter().getMostPopular() //
|
||||
|| textBlock.getMostPopularWordFontSize() >= page.getFontSizeCounter().getMostPopular();
|
||||
}
|
||||
|
||||
|
||||
private static boolean isAllCaps(TextPageBlock textBlock) {
|
||||
|
||||
return textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT));
|
||||
|
||||
@ -289,7 +289,7 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
|
||||
new int[]{code},
|
||||
font,
|
||||
fontSize,
|
||||
(int) (fontSize * textMatrix.getScalingFactorX())));
|
||||
(int) (fontSize * textMatrix.getScalingFactorX() * textMatrix.getScalingFactorY())));
|
||||
processTextPosition(new TextPosition(pageRotation,
|
||||
pageSize.getWidth(),
|
||||
pageSize.getHeight(),
|
||||
@ -303,7 +303,7 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
|
||||
new int[]{code},
|
||||
font,
|
||||
fontSize,
|
||||
(int) (fontSize * textMatrix.getScalingFactorX())));
|
||||
(int) (fontSize * textMatrix.getScalingFactorX() * textMatrix.getScalingFactorY())));
|
||||
} else {
|
||||
|
||||
processTextPosition(new TextPosition(pageRotation,
|
||||
@ -319,7 +319,7 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
|
||||
new int[]{code},
|
||||
font,
|
||||
fontSize,
|
||||
(int) (fontSize * textMatrix.getScalingFactorX())));
|
||||
(int) (fontSize * textMatrix.getScalingFactorX() * textMatrix.getScalingFactorY())));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -14,8 +14,6 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextB
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
|
||||
@ -102,11 +100,16 @@ public class TextPositionOperations {
|
||||
double normalizedVerticalDistance = Math.abs(sequence.getBBoxDirAdj().getCenterY() - sequence2.getBBoxDirAdj().getCenterY()) / maxLineDistance;
|
||||
double normalizedHorizontalDistance = Math.abs(sequence.getBBoxDirAdj().getCenterX() - sequence2.getBBoxDirAdj().getCenterX()) / maxXGap;
|
||||
|
||||
if (sequence.getDir() != sequence2.getDir()
|
||||
|| Math.abs(sequence.getFontSize() - sequence2.getFontSize()) > 0.5 * Math.min(sequence.getFontSize(),
|
||||
sequence2.getFontSize())
|
||||
|| Math.pow(normalizedVerticalDistance, 2) + Math.pow(normalizedHorizontalDistance, 2) > 1
|
||||
|| !ANGLE_FILTER.matches(angle)) {
|
||||
if (sequence.getDir() != sequence2.getDir()) {
|
||||
continue;
|
||||
}
|
||||
if (Math.abs(sequence.getFontSize() - sequence2.getFontSize()) > 0.5 * Math.max(sequence.getFontSize(), sequence2.getFontSize())) {
|
||||
continue;
|
||||
}
|
||||
if (Math.pow(normalizedVerticalDistance, 2) + Math.pow(normalizedHorizontalDistance, 2) > 1) {
|
||||
continue;
|
||||
}
|
||||
if (!ANGLE_FILTER.matches(angle)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user