RED-8826: Do not classify textblocks in graphics as headlines

This commit is contained in:
Dominique Eifländer 2024-04-23 09:28:28 +02:00
parent b53930328a
commit 683f7f1fb8
5 changed files with 25 additions and 12 deletions

View File

@ -269,7 +269,7 @@ public class LayoutParsingPipeline {
pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>()) pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>())
.addAll(graphics.stream() .addAll(graphics.stream()
.map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), ImageType.GRAPHICS, false, stripper.getPageNumber())) .map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), ImageType.GRAPHIC, false, stripper.getPageNumber()))
.toList()); .toList());
ClassificationPage classificationPage = switch (layoutParsingType) { ClassificationPage classificationPage = switch (layoutParsingType) {

View File

@ -10,7 +10,7 @@ public enum ImageType {
SIGNATURE_VISUAL, SIGNATURE_VISUAL,
OTHER, OTHER,
OCR, OCR,
GRAPHICS; GRAPHIC;
public static ImageType fromString(String imageType) { public static ImageType fromString(String imageType) {
@ -20,6 +20,7 @@ public enum ImageType {
case "formula" -> ImageType.FORMULA; case "formula" -> ImageType.FORMULA;
case "signature" -> ImageType.SIGNATURE; case "signature" -> ImageType.SIGNATURE;
case "ocr" -> ImageType.OCR; case "ocr" -> ImageType.OCR;
case "graphic" -> ImageType.GRAPHIC;
default -> ImageType.OTHER; default -> ImageType.OTHER;
}; };
} }

View File

@ -3,14 +3,15 @@ package com.knecon.fforesight.service.layoutparser.processor.services.classifica
import java.util.List; import java.util.List;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
@ -21,7 +22,6 @@ import lombok.extern.slf4j.Slf4j;
@RequiredArgsConstructor @RequiredArgsConstructor
public class RedactManagerClassificationService { public class RedactManagerClassificationService {
public void classifyDocument(ClassificationDocument document) { public void classifyDocument(ClassificationDocument document) {
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular(); List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
@ -52,14 +52,25 @@ public class RedactManagerClassificationService {
textBlock.setClassification(PageBlockType.OTHER); textBlock.setClassification(PageBlockType.OTHER);
return; return;
} }
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() if (page.getImages()
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) { .stream()
.filter(image -> image.getImageType().equals(ImageType.GRAPHIC))
.anyMatch(graphic -> graphic.getPosition().intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()))) {
textBlock.setClassification(PageBlockType.PARAGRAPH);
return;
}
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || PositionUtils.isOverBodyTextFrame(bodyTextFrame,
textBlock,
page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
.getMostPopular())) {
textBlock.setClassification(PageBlockType.HEADER); textBlock.setClassification(PageBlockType.HEADER);
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) || PositionUtils.isUnderBodyTextFrame(bodyTextFrame,
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() textBlock,
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) { page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
.getMostPopular())) {
textBlock.setClassification(PageBlockType.FOOTER); textBlock.setClassification(PageBlockType.FOOTER);
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks() document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()

View File

@ -218,12 +218,12 @@ public class GraphicBBDetector extends PDFGraphicsStreamEngine {
} }
private class NullOp extends OperatorProcessor { private final class NullOp extends OperatorProcessor {
private final String name; private final String name;
public NullOp(String name, PDFStreamEngine context) { private NullOp(String name, PDFStreamEngine context) {
super(context); super(context);
this.name = name; this.name = name;

View File

@ -44,6 +44,7 @@ public class GraphicExtractorService {
var graphicBBoxes = graphicBBDetector.findGraphicBB(); var graphicBBoxes = graphicBBDetector.findGraphicBB();
if (graphicsRaster) { if (graphicsRaster) {
// This should only be used if ocr was performed, it is currently in an early stage and needs to be improved.
graphicBBoxes.addAll(findGraphicsRaster.findCCBoundingBoxes(pdDocument, graphicBBoxes.addAll(findGraphicsRaster.findCCBoundingBoxes(pdDocument,
characterBBoxes.stream().map(box -> new Rectangle2D.Double(box.x1 - 2, box.y1 - 2, box.width() + 4, box.height() + 4)).collect(Collectors.toList()), characterBBoxes.stream().map(box -> new Rectangle2D.Double(box.x1 - 2, box.y1 - 2, box.width() + 4, box.height() + 4)).collect(Collectors.toList()),
PageInformation.fromPDPage(pageNumber, pdPage))); PageInformation.fromPDPage(pageNumber, pdPage)));