RED-9974: wip
This commit is contained in:
parent
1337c56591
commit
95e6fdecd7
@ -104,6 +104,9 @@ public class DocuMineClassificationService {
|
|||||||
boolean isTocItem = textBlock.getText().contains("..............");
|
boolean isTocItem = textBlock.getText().contains("..............");
|
||||||
boolean headlineWithSlashesMatches = headlineWithSlashesMatcher.reset().matches();
|
boolean headlineWithSlashesMatches = headlineWithSlashesMatcher.reset().matches();
|
||||||
boolean isAmount = amountMatcher.reset().find();
|
boolean isAmount = amountMatcher.reset().find();
|
||||||
|
int charCount = countChars(textBlock);
|
||||||
|
|
||||||
|
boolean enoughChars = charCount > textBlock.getText().length() * 0.5;
|
||||||
|
|
||||||
if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
|
if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
|
||||||
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
|
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
|
||||||
@ -150,15 +153,17 @@ public class DocuMineClassificationService {
|
|||||||
&& !textBlock.toString().endsWith(":")
|
&& !textBlock.toString().endsWith(":")
|
||||||
&& isAtLeast3Characters
|
&& isAtLeast3Characters
|
||||||
&& !isTocItem
|
&& !isTocItem
|
||||||
&& !isAmount) {
|
&& !isAmount
|
||||||
|
&& enoughChars) {
|
||||||
|
|
||||||
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
|
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
|
||||||
} else if (isAllCaps(textBlock) //
|
} else if (isAllCaps(textBlock)
|
||||||
&& textBlock.getText().length() > 5 //
|
&& textBlock.getText().length() > 5
|
||||||
&& isAtLeast3Characters //
|
&& isAtLeast3Characters
|
||||||
&& !isAmount//
|
&& !isAmount
|
||||||
&& !textBlock.toString().contains(":") //
|
&& enoughChars
|
||||||
&& !textBlock.toString().startsWith("(") //
|
&& !textBlock.toString().contains(":")
|
||||||
|
&& !textBlock.toString().startsWith("(")
|
||||||
&& PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
&& PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
||||||
|
|
||||||
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
|
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
|
||||||
@ -167,17 +172,18 @@ public class DocuMineClassificationService {
|
|||||||
&& isAtLeast3Characters
|
&& isAtLeast3Characters
|
||||||
&& !headlineWithSlashesMatches
|
&& !headlineWithSlashesMatches
|
||||||
&& !isAmount
|
&& !isAmount
|
||||||
&& !isTocItem) {
|
&& !isTocItem
|
||||||
|
&& enoughChars) {
|
||||||
|
|
||||||
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
|
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
|
||||||
} else if (!isTocItem //
|
} else if (!isTocItem
|
||||||
&& hasSeparation(textBlock, surroundingBlocks) //
|
&& hasSeparation(textBlock, surroundingBlocks)
|
||||||
&& (textBlock.getMostPopularWordHeight() >= page.getTextHeightCounter().getMostPopular() //
|
&& greaterOrEqualThanFontPageAverage(textBlock, page)
|
||||||
|| textBlock.getMostPopularWordFontSize() >= page.getFontSizeCounter().getMostPopular()) //
|
&& PositionUtils.getApproxLineCount(textBlock) < 2.9
|
||||||
&& PositionUtils.getApproxLineCount(textBlock) < 2.9 //
|
&& (tableOrFigureMatcher.reset().find() || headlineWithSingleIdentifierMatcher.reset().find())
|
||||||
&& (tableOrFigureMatcher.reset().find() || headlineWithSingleIdentifierMatcher.reset().find()) //
|
&& !isAmount
|
||||||
&& !isAmount //
|
&& !headlineWithSlashesMatches
|
||||||
&& !headlineWithSlashesMatches) {
|
&& enoughChars) {
|
||||||
|
|
||||||
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
|
setAsHeadline(headlineClassificationService, textBlock, document, headlineFontSizes);
|
||||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
||||||
@ -205,6 +211,26 @@ public class DocuMineClassificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private int countChars(TextPageBlock textBlock) {
|
||||||
|
|
||||||
|
int count = 0;
|
||||||
|
|
||||||
|
for (int i = 0; i < textBlock.getText().length(); i++) {
|
||||||
|
if (Character.isAlphabetic(textBlock.getText().charAt(i))) {
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static boolean greaterOrEqualThanFontPageAverage(TextPageBlock textBlock, ClassificationPage page) {
|
||||||
|
|
||||||
|
return textBlock.getMostPopularWordHeight() >= page.getTextHeightCounter().getMostPopular() //
|
||||||
|
|| textBlock.getMostPopularWordFontSize() >= page.getFontSizeCounter().getMostPopular();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private static boolean isAllCaps(TextPageBlock textBlock) {
|
private static boolean isAllCaps(TextPageBlock textBlock) {
|
||||||
|
|
||||||
return textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT));
|
return textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT));
|
||||||
|
|||||||
@ -289,7 +289,7 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
|
|||||||
new int[]{code},
|
new int[]{code},
|
||||||
font,
|
font,
|
||||||
fontSize,
|
fontSize,
|
||||||
(int) (fontSize * textMatrix.getScalingFactorX())));
|
(int) (fontSize * textMatrix.getScalingFactorX() * textMatrix.getScalingFactorY())));
|
||||||
processTextPosition(new TextPosition(pageRotation,
|
processTextPosition(new TextPosition(pageRotation,
|
||||||
pageSize.getWidth(),
|
pageSize.getWidth(),
|
||||||
pageSize.getHeight(),
|
pageSize.getHeight(),
|
||||||
@ -303,7 +303,7 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
|
|||||||
new int[]{code},
|
new int[]{code},
|
||||||
font,
|
font,
|
||||||
fontSize,
|
fontSize,
|
||||||
(int) (fontSize * textMatrix.getScalingFactorX())));
|
(int) (fontSize * textMatrix.getScalingFactorX() * textMatrix.getScalingFactorY())));
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
processTextPosition(new TextPosition(pageRotation,
|
processTextPosition(new TextPosition(pageRotation,
|
||||||
@ -319,7 +319,7 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
|
|||||||
new int[]{code},
|
new int[]{code},
|
||||||
font,
|
font,
|
||||||
fontSize,
|
fontSize,
|
||||||
(int) (fontSize * textMatrix.getScalingFactorX())));
|
(int) (fontSize * textMatrix.getScalingFactorX() * textMatrix.getScalingFactorY())));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -14,8 +14,6 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextB
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
|
|
||||||
@ -102,11 +100,16 @@ public class TextPositionOperations {
|
|||||||
double normalizedVerticalDistance = Math.abs(sequence.getBBoxDirAdj().getCenterY() - sequence2.getBBoxDirAdj().getCenterY()) / maxLineDistance;
|
double normalizedVerticalDistance = Math.abs(sequence.getBBoxDirAdj().getCenterY() - sequence2.getBBoxDirAdj().getCenterY()) / maxLineDistance;
|
||||||
double normalizedHorizontalDistance = Math.abs(sequence.getBBoxDirAdj().getCenterX() - sequence2.getBBoxDirAdj().getCenterX()) / maxXGap;
|
double normalizedHorizontalDistance = Math.abs(sequence.getBBoxDirAdj().getCenterX() - sequence2.getBBoxDirAdj().getCenterX()) / maxXGap;
|
||||||
|
|
||||||
if (sequence.getDir() != sequence2.getDir()
|
if (sequence.getDir() != sequence2.getDir()) {
|
||||||
|| Math.abs(sequence.getFontSize() - sequence2.getFontSize()) > 0.5 * Math.min(sequence.getFontSize(),
|
continue;
|
||||||
sequence2.getFontSize())
|
}
|
||||||
|| Math.pow(normalizedVerticalDistance, 2) + Math.pow(normalizedHorizontalDistance, 2) > 1
|
if (Math.abs(sequence.getFontSize() - sequence2.getFontSize()) > 0.5 * Math.max(sequence.getFontSize(), sequence2.getFontSize())) {
|
||||||
|| !ANGLE_FILTER.matches(angle)) {
|
continue;
|
||||||
|
}
|
||||||
|
if (Math.pow(normalizedVerticalDistance, 2) + Math.pow(normalizedHorizontalDistance, 2) > 1) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (!ANGLE_FILTER.matches(angle)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user