RED-8826: Do not classify textblocks in graphics as headlines
This commit is contained in:
parent
b53930328a
commit
683f7f1fb8
@ -269,7 +269,7 @@ public class LayoutParsingPipeline {
|
||||
|
||||
pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>())
|
||||
.addAll(graphics.stream()
|
||||
.map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), ImageType.GRAPHICS, false, stripper.getPageNumber()))
|
||||
.map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), ImageType.GRAPHIC, false, stripper.getPageNumber()))
|
||||
.toList());
|
||||
|
||||
ClassificationPage classificationPage = switch (layoutParsingType) {
|
||||
|
||||
@ -10,7 +10,7 @@ public enum ImageType {
|
||||
SIGNATURE_VISUAL,
|
||||
OTHER,
|
||||
OCR,
|
||||
GRAPHICS;
|
||||
GRAPHIC;
|
||||
|
||||
|
||||
public static ImageType fromString(String imageType) {
|
||||
@ -20,6 +20,7 @@ public enum ImageType {
|
||||
case "formula" -> ImageType.FORMULA;
|
||||
case "signature" -> ImageType.SIGNATURE;
|
||||
case "ocr" -> ImageType.OCR;
|
||||
case "graphic" -> ImageType.GRAPHIC;
|
||||
default -> ImageType.OTHER;
|
||||
};
|
||||
}
|
||||
|
||||
@ -3,14 +3,15 @@ package com.knecon.fforesight.service.layoutparser.processor.services.classifica
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
@ -21,7 +22,6 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@RequiredArgsConstructor
|
||||
public class RedactManagerClassificationService {
|
||||
|
||||
|
||||
public void classifyDocument(ClassificationDocument document) {
|
||||
|
||||
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
||||
@ -52,14 +52,25 @@ public class RedactManagerClassificationService {
|
||||
textBlock.setClassification(PageBlockType.OTHER);
|
||||
return;
|
||||
}
|
||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
||||
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
|
||||
|
||||
if (page.getImages()
|
||||
.stream()
|
||||
.filter(image -> image.getImageType().equals(ImageType.GRAPHIC))
|
||||
.anyMatch(graphic -> graphic.getPosition().intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()))) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||
return;
|
||||
}
|
||||
|
||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || PositionUtils.isOverBodyTextFrame(bodyTextFrame,
|
||||
textBlock,
|
||||
page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||
.getMostPopular())) {
|
||||
textBlock.setClassification(PageBlockType.HEADER);
|
||||
|
||||
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|
||||
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
|
||||
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) || PositionUtils.isUnderBodyTextFrame(bodyTextFrame,
|
||||
textBlock,
|
||||
page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||
.getMostPopular())) {
|
||||
textBlock.setClassification(PageBlockType.FOOTER);
|
||||
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
|
||||
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
|
||||
|
||||
@ -218,12 +218,12 @@ public class GraphicBBDetector extends PDFGraphicsStreamEngine {
|
||||
}
|
||||
|
||||
|
||||
private class NullOp extends OperatorProcessor {
|
||||
private final class NullOp extends OperatorProcessor {
|
||||
|
||||
private final String name;
|
||||
|
||||
|
||||
public NullOp(String name, PDFStreamEngine context) {
|
||||
private NullOp(String name, PDFStreamEngine context) {
|
||||
|
||||
super(context);
|
||||
this.name = name;
|
||||
|
||||
@ -44,6 +44,7 @@ public class GraphicExtractorService {
|
||||
var graphicBBoxes = graphicBBDetector.findGraphicBB();
|
||||
|
||||
if (graphicsRaster) {
|
||||
// This should only be used if OCR was performed; it is currently in an early stage and needs to be improved.
|
||||
graphicBBoxes.addAll(findGraphicsRaster.findCCBoundingBoxes(pdDocument,
|
||||
characterBBoxes.stream().map(box -> new Rectangle2D.Double(box.x1 - 2, box.y1 - 2, box.width() + 4, box.height() + 4)).collect(Collectors.toList()),
|
||||
PageInformation.fromPDPage(pageNumber, pdPage)));
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user