RED-8826: Do not classify textblocks in graphics as headlines
This commit is contained in:
parent
b53930328a
commit
683f7f1fb8
@ -269,7 +269,7 @@ public class LayoutParsingPipeline {
|
|||||||
|
|
||||||
pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>())
|
pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>())
|
||||||
.addAll(graphics.stream()
|
.addAll(graphics.stream()
|
||||||
.map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), ImageType.GRAPHICS, false, stripper.getPageNumber()))
|
.map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), ImageType.GRAPHIC, false, stripper.getPageNumber()))
|
||||||
.toList());
|
.toList());
|
||||||
|
|
||||||
ClassificationPage classificationPage = switch (layoutParsingType) {
|
ClassificationPage classificationPage = switch (layoutParsingType) {
|
||||||
|
|||||||
@ -10,7 +10,7 @@ public enum ImageType {
|
|||||||
SIGNATURE_VISUAL,
|
SIGNATURE_VISUAL,
|
||||||
OTHER,
|
OTHER,
|
||||||
OCR,
|
OCR,
|
||||||
GRAPHICS;
|
GRAPHIC;
|
||||||
|
|
||||||
|
|
||||||
public static ImageType fromString(String imageType) {
|
public static ImageType fromString(String imageType) {
|
||||||
@ -20,6 +20,7 @@ public enum ImageType {
|
|||||||
case "formula" -> ImageType.FORMULA;
|
case "formula" -> ImageType.FORMULA;
|
||||||
case "signature" -> ImageType.SIGNATURE;
|
case "signature" -> ImageType.SIGNATURE;
|
||||||
case "ocr" -> ImageType.OCR;
|
case "ocr" -> ImageType.OCR;
|
||||||
|
case "graphic" -> ImageType.GRAPHIC;
|
||||||
default -> ImageType.OTHER;
|
default -> ImageType.OTHER;
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@ -3,14 +3,15 @@ package com.knecon.fforesight.service.layoutparser.processor.services.classifica
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
@ -21,7 +22,6 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
public class RedactManagerClassificationService {
|
public class RedactManagerClassificationService {
|
||||||
|
|
||||||
|
|
||||||
public void classifyDocument(ClassificationDocument document) {
|
public void classifyDocument(ClassificationDocument document) {
|
||||||
|
|
||||||
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
||||||
@ -52,14 +52,25 @@ public class RedactManagerClassificationService {
|
|||||||
textBlock.setClassification(PageBlockType.OTHER);
|
textBlock.setClassification(PageBlockType.OTHER);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
|
||||||
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
if (page.getImages()
|
||||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
|
.stream()
|
||||||
|
.filter(image -> image.getImageType().equals(ImageType.GRAPHIC))
|
||||||
|
.anyMatch(graphic -> graphic.getPosition().intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()))) {
|
||||||
|
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || PositionUtils.isOverBodyTextFrame(bodyTextFrame,
|
||||||
|
textBlock,
|
||||||
|
page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||||
|
.getMostPopular())) {
|
||||||
textBlock.setClassification(PageBlockType.HEADER);
|
textBlock.setClassification(PageBlockType.HEADER);
|
||||||
|
|
||||||
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) || PositionUtils.isUnderBodyTextFrame(bodyTextFrame,
|
||||||
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
textBlock,
|
||||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
|
page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||||
|
.getMostPopular())) {
|
||||||
textBlock.setClassification(PageBlockType.FOOTER);
|
textBlock.setClassification(PageBlockType.FOOTER);
|
||||||
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
|
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
|
||||||
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
|
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
|
||||||
|
|||||||
@ -218,12 +218,12 @@ public class GraphicBBDetector extends PDFGraphicsStreamEngine {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private class NullOp extends OperatorProcessor {
|
private final class NullOp extends OperatorProcessor {
|
||||||
|
|
||||||
private final String name;
|
private final String name;
|
||||||
|
|
||||||
|
|
||||||
public NullOp(String name, PDFStreamEngine context) {
|
private NullOp(String name, PDFStreamEngine context) {
|
||||||
|
|
||||||
super(context);
|
super(context);
|
||||||
this.name = name;
|
this.name = name;
|
||||||
|
|||||||
@ -44,6 +44,7 @@ public class GraphicExtractorService {
|
|||||||
var graphicBBoxes = graphicBBDetector.findGraphicBB();
|
var graphicBBoxes = graphicBBDetector.findGraphicBB();
|
||||||
|
|
||||||
if (graphicsRaster) {
|
if (graphicsRaster) {
|
||||||
|
// This should only be used if OCR was performed; it is currently at an early stage and needs to be improved.
|
||||||
graphicBBoxes.addAll(findGraphicsRaster.findCCBoundingBoxes(pdDocument,
|
graphicBBoxes.addAll(findGraphicsRaster.findCCBoundingBoxes(pdDocument,
|
||||||
characterBBoxes.stream().map(box -> new Rectangle2D.Double(box.x1 - 2, box.y1 - 2, box.width() + 4, box.height() + 4)).collect(Collectors.toList()),
|
characterBBoxes.stream().map(box -> new Rectangle2D.Double(box.x1 - 2, box.y1 - 2, box.width() + 4, box.height() + 4)).collect(Collectors.toList()),
|
||||||
PageInformation.fromPDPage(pageNumber, pdPage)));
|
PageInformation.fromPDPage(pageNumber, pdPage)));
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user