RED-7461: Use marked content to classify headers and footers if available

This commit is contained in:
deiflaender 2023-08-21 15:27:03 +02:00
parent 60615ec5d8
commit b270b9c942
9 changed files with 102 additions and 147 deletions

View File

@ -2,15 +2,19 @@ package com.knecon.fforesight.service.layoutparser.processor;
import static java.lang.String.format;
import java.awt.geom.Rectangle2D;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentConverter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.apache.pdfbox.text.PDFMarkedContentExtractor;
import org.springframework.stereotype.Service;
@ -173,7 +177,9 @@ public class LayoutParsingPipeline {
classificationPage.setPageNumber(pageNumber);
classificationPage.setPageWidth(cropbox.getWidth());
classificationPage.setPageHeight(cropbox.getHeight());
classificationPage.setMarkedContents(stripper.getMarkedContents());
// Marked content needs to be converted at this point; otherwise it leads to GC problems in PDFBox.
classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents()));
// Whether an image is OCR needs to be determined before textBlocks are moved into tables; otherwise the findOcr algorithm needs to be adapted.
if (pdfImages != null && pdfImages.containsKey(pageNumber)) {
@ -203,6 +209,13 @@ public class LayoutParsingPipeline {
}
/**
 * Reduces the PDFBox marked contents of a page to one bounding box per artifact subtype.
 *
 * @param pdMarkedContents marked contents collected for the page
 * @return map with the keys "Header" and "Footer"; each value is the result of
 *         MarkedContentConverter.getMarkedContentBboxPerLine for that subtype and may be null
 */
private Map<String, Rectangle2D> convertMarkedContents(List<PDMarkedContent> pdMarkedContents) {

    Map<String, Rectangle2D> bboxPerSubtype = new HashMap<>();
    // Same insertion order as before: "Header" first, then "Footer".
    for (String subtype : List.of("Header", "Footer")) {
        bboxPerSubtype.put(subtype, MarkedContentConverter.getMarkedContentBboxPerLine(pdMarkedContents, subtype));
    }
    return bboxPerSubtype;
}
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
if (!classificationPage.isLandscape()) {

View File

@ -1,7 +1,10 @@
package com.knecon.fforesight.service.layoutparser.processor.model;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
@ -39,6 +42,6 @@ public class ClassificationPage {
CleanRulings cleanRulings;
private List<PDMarkedContent> markedContents;
private Map<String, Rectangle2D> markedContentBboxPerType = new HashMap<>();
}

View File

@ -1,18 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.apache.pdfbox.text.TextPosition;
import org.springframework.stereotype.Service;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
@ -36,75 +25,11 @@ public class BodyTextFrameService {
Rectangle bodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), false, layoutParsingType);
Rectangle landscapeBodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), true, layoutParsingType);
for (ClassificationPage page : classificationDocument.getPages()) {
var markupHeaderBBoxes = getMarkupRectangleByTextPositions(page, "Header");
var markupFooterBBoxes = getMarkupRectangleByTextPositions(page, "Footer");
if(markupHeaderBBoxes.isEmpty() && markupFooterBBoxes.isEmpty()){
setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
} else {
double minHeader = page.getPageHeight();
if(!markupHeaderBBoxes.isEmpty()) {
minHeader = RectangleTransformations.rectangle2DBBox(markupHeaderBBoxes).getMinY();
}
double maxFooter = 0;
if(!markupFooterBBoxes.isEmpty()) {
maxFooter = RectangleTransformations.rectangle2DBBox(markupFooterBBoxes).getMaxY();
}
var btf = new Rectangle(new Point(0f, (float)maxFooter), page.getPageWidth(), ((float)minHeader) - ((float)maxFooter), page.getPageNumber());
page.setBodyTextFrame(btf);
}
setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
}
}
/**
 * Collects the rectangles declared via the "BBox" property of all "Artifact" marked contents
 * of the page whose "Subtype" property equals the given subtype.
 *
 * @param page    page whose marked contents are inspected
 * @param subtype required value of the "Subtype" property, e.g. "Header" or "Footer"
 * @return one rectangle per matching marked content that declares a usable BBox;
 *         an empty list if the page has no marked contents or nothing matches
 */
private List<Rectangle2D> getMarkupRectangle(ClassificationPage page, String subtype) {

    if (page.getMarkedContents() == null) {
        return new ArrayList<>();
    }

    return page.getMarkedContents().stream()
            // Constant-first comparison: a marked content without a tag must not cause an NPE.
            .filter(m -> "Artifact".equals(m.getTag()))
            .map(PDMarkedContent::getProperties)
            // Marked content may carry no property dictionary at all.
            .filter(p -> p != null)
            // instanceof covers both a missing entry and a Subtype that is not a name object.
            .filter(p -> p.getItem("Subtype") instanceof COSName name && name.getName().equals(subtype))
            .map(p -> p.getItem("BBox"))
            .filter(bbox -> bbox instanceof COSArray)
            .map(bbox -> ((COSArray) bbox).toFloatArray())
            // A BBox needs four coordinates; skip malformed arrays instead of failing on index access.
            .filter(f -> f.length >= 4)
            .map(f -> new Rectangle2D.Float(f[0], f[1], f[2] - f[0], f[3] - f[1]))
            .collect(Collectors.toList());
}
/**
 * Derives rectangles from the text positions contained in all "Artifact" marked contents of the page
 * whose "Subtype" property equals the given subtype. Text positions are grouped by their exact
 * y coordinate and one rectangle is produced per group.
 *
 * @param page    page whose marked contents are inspected
 * @param subtype required value of the "Subtype" property, e.g. "Header" or "Footer"
 * @return one rectangle per distinct y coordinate of matching text; an empty list if the page has
 *         no marked contents or no matching text was found
 */
private List<Rectangle2D> getMarkupRectangleByTextPositions(ClassificationPage page, String subtype) {

    if (page.getMarkedContents() == null) {
        return new ArrayList<>();
    }

    var textPositionsByY = page.getMarkedContents().stream()
            // Constant-first comparison: a marked content without a tag must not cause an NPE.
            .filter(m -> "Artifact".equals(m.getTag()))
            // Marked content may carry no property dictionary at all.
            .filter(m -> m.getProperties() != null)
            // instanceof covers both a missing entry and a Subtype that is not a name object.
            .filter(m -> m.getProperties().getItem("Subtype") instanceof COSName name && name.getName().equals(subtype))
            .map(PDMarkedContent::getContents)
            .flatMap(Collection::stream)
            // The contents list is not limited to text; anything else would break the cast below.
            .filter(content -> content instanceof TextPosition)
            .map(content -> (TextPosition) content)
            .collect(Collectors.groupingBy(TextPosition::getY));

    if (textPositionsByY.isEmpty()) {
        return new ArrayList<>();
    }

    return textPositionsByY.values().stream()
            .map(textPositions -> new TextPositionSequence(textPositions.stream().toList(), page.getPageNumber(), true).getRectangle())
            // y is shifted by the absolute height and the absolute height is used as rectangle height,
            // so a negative height reported by the sequence rectangle cannot yield a negative-height result.
            .map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight())))
            .collect(Collectors.toList());
}
/*
private Rectangle calculateBodyTextFrameByRulings(List<ClassificationPage> pages) {
@ -231,6 +156,11 @@ public class BodyTextFrameService {
continue;
}
if (page.getMarkedContentBboxPerType().get("Header") != null && page.getMarkedContentBboxPerType().get("Header").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight())
|| page.getMarkedContentBboxPerType().get("Footer") != null && page.getMarkedContentBboxPerType().get("Footer").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight())) {
continue;
}
float approxLineCount = PositionUtils.getApproxLineCount(textBlock);
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE) && approxLineCount < approximateHeaderLineCount && textBlock.getMaxY() >= page.getPageHeight() - (page.getPageHeight() / 10) || !layoutParsingType.equals(
LayoutParsingType.DOCUMINE) && approxLineCount < approximateHeaderLineCount) {
@ -307,4 +237,4 @@ public class BodyTextFrameService {
}
}
}

View File

@ -62,10 +62,16 @@ public class DocuMineClassificationService {
textBlock.setClassification(PageBlockType.OTHER);
return;
}
if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) {
if (page.getMarkedContentBboxPerType().get("Header") != null && page.getMarkedContentBboxPerType().get("Header").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight())
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())
) {
textBlock.setClassification(PageBlockType.HEADER);
} else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) {
} else if (page.getMarkedContentBboxPerType().get("Footer") != null && page.getMarkedContentBboxPerType().get("Footer").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight())
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())
) {
textBlock.setClassification(PageBlockType.FOOTER);
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
@ -105,4 +111,4 @@ public class DocuMineClassificationService {
}
}
}
}

View File

@ -51,11 +51,13 @@ public class RedactManagerClassificationService {
textBlock.setClassification(PageBlockType.OTHER);
return;
}
if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
if (page.getMarkedContentBboxPerType().get("Header") != null && page.getMarkedContentBboxPerType().get("Header").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight())
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
textBlock.setClassification(PageBlockType.HEADER);
} else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
} else if (page.getMarkedContentBboxPerType().get("Footer") != null && page.getMarkedContentBboxPerType().get("Footer").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight())
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
textBlock.setClassification(PageBlockType.FOOTER);
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,

View File

@ -56,9 +56,11 @@ public class TaasClassificationService {
textBlock.setClassification(PageBlockType.OTHER);
return;
}
if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) {
if (page.getMarkedContentBboxPerType().get("Header") != null && page.getMarkedContentBboxPerType().get("Header").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight())
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) {
textBlock.setClassification(PageBlockType.HEADER);
} else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) {
} else if (page.getMarkedContentBboxPerType().get("Footer") != null && page.getMarkedContentBboxPerType().get("Footer").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight())
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) {
textBlock.setClassification(PageBlockType.FOOTER);
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()

View File

@ -0,0 +1,45 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.experimental.UtilityClass;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.apache.pdfbox.text.TextPosition;
import java.awt.geom.Rectangle2D;
import java.util.Collection;
import java.util.List;
import java.util.stream.Collectors;
/**
 * Converts PDFBox marked content into plain geometry so that callers do not have to keep
 * references to the PDFBox objects.
 */
@UtilityClass
public class MarkedContentConverter {

    /**
     * Computes a single bounding box around the non-blank text of all "Artifact" marked contents
     * whose "Subtype" property equals the given subtype.
     * <p>
     * Text positions are grouped by their exact y coordinate, one rectangle is built per group and
     * the per-group rectangles are combined via RectangleTransformations.rectangle2DBBox.
     *
     * @param markedContents marked contents of a page; may be null
     * @param subtype        required value of the "Subtype" property, e.g. "Header" or "Footer"
     * @return the combined bounding box, or null if markedContents is null or no matching
     *         non-blank text was found
     */
    public Rectangle2D getMarkedContentBboxPerLine(List<PDMarkedContent> markedContents, String subtype) {

        if (markedContents == null) {
            return null;
        }

        var markedContentByYPosition = markedContents.stream()
                // Constant-first comparison: a marked content without a tag must not cause an NPE.
                .filter(m -> "Artifact".equals(m.getTag()))
                .filter(m -> m.getProperties() != null)
                // instanceof covers both a missing entry and a Subtype that is not a name object,
                // which previously ended in a ClassCastException.
                .filter(m -> m.getProperties().getItem("Subtype") instanceof COSName name && name.getName().equals(subtype))
                .map(PDMarkedContent::getContents)
                .flatMap(Collection::stream)
                // The contents list is not limited to text; keep text positions only.
                .filter(t -> t instanceof TextPosition)
                .map(t -> (TextPosition) t)
                // Single-space glyphs are ignored; constant-first keeps this safe if no unicode is set.
                .filter(t -> !" ".equals(t.getUnicode()))
                .collect(Collectors.groupingBy(TextPosition::getY));

        if (markedContentByYPosition.isEmpty()) {
            return null;
        }

        return RectangleTransformations.rectangle2DBBox(markedContentByYPosition.values().stream()
                // Page number 0 is passed as a placeholder; only the resulting rectangle is used here.
                .map(textPositions -> new TextPositionSequence(textPositions.stream().toList(), 0, true).getRectangle())
                // y is shifted by the absolute height and the absolute height is used as rectangle height.
                .map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight())))
                .collect(Collectors.toList()));
    }

}

View File

@ -4,6 +4,7 @@ import java.io.FileOutputStream;
import java.nio.file.Path;
import org.apache.pdfbox.Loader;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.springframework.core.io.ClassPathResource;
@ -17,12 +18,13 @@ import lombok.SneakyThrows;
public class ViewerDocumentTest extends BuildDocumentGraphTest {
@Test
@Disabled
@SneakyThrows
public void testViewerDocument() {
LayoutGridService layoutGridService = new LayoutGridService();
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService);
String fileName = "files/BASF/2013-1110704.pdf";
String fileName = "files/new/VV-511309_OCR.pdf";
Document document = buildGraph(fileName, LayoutParsingType.DOCUMINE);
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getInputStream()); var out = new FileOutputStream(tmpFileName)) {

View File

@ -1,24 +1,5 @@
package com.knecon.fforesight.service.layoutparser.server.segmentation;
import static org.assertj.core.api.Assertions.assertThat;
import java.awt.geom.Rectangle2D;
import java.io.IOException;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ClassPathResource;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
@ -34,8 +15,19 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
import lombok.SneakyThrows;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ClassPathResource;
import java.awt.geom.Rectangle2D;
import java.io.IOException;
import java.util.*;
import java.util.stream.Collectors;
import static org.assertj.core.api.Assertions.assertThat;
public class PdfSegmentationServiceTest extends AbstractTest {
@ -492,46 +484,6 @@ public class PdfSegmentationServiceTest extends AbstractTest {
}
/**
 * Visual debugging aid without assertions: gathers the "Footer" and "Header" artifact rectangles
 * of every page of the sample PDF and hands them to PdfDraw together with the target "/tmp/test.pdf".
 */
@Test
public void testDoc30Page5dfgfdg() throws IOException {

    ClassPathResource pdfFileResource = new ClassPathResource("files/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (4).pdf");
    ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getInputStream()));

    // Table checks are currently disabled:
    // validateTableSize(document, 1);
    // validateTable(document, 0, 3, 5, 0, 0);

    // One list per page, footer rectangles first, header rectangles after them.
    var rects = document.getPages().stream()
            .map(page -> {
                var footerRects = getMarkupRectangle(page, "Footer");
                var headerRects = getMarkupRectangle(page, "Header");
                return Stream.concat(footerRects.stream(), headerRects.stream()).toList();
            })
            .toList();

    PdfDraw.drawRectanglesPerPage("files/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica (4).pdf", rects, "/tmp/test.pdf");
}
/**
 * Collects the rectangles declared via the "BBox" property of all "Artifact" marked contents
 * of the page whose "Subtype" property equals the given subtype.
 *
 * @param page    page whose marked contents are inspected
 * @param subtype required value of the "Subtype" property, e.g. "Header" or "Footer"
 * @return one rectangle per matching marked content that declares a usable BBox;
 *         an empty list if the page has no marked contents or nothing matches
 */
private List<Rectangle2D> getMarkupRectangle(ClassificationPage page, String subtype) {

    if (page.getMarkedContents() == null) {
        return new ArrayList<>();
    }

    return page.getMarkedContents().stream()
            // Constant-first comparison: a marked content without a tag must not cause an NPE.
            .filter(m -> "Artifact".equals(m.getTag()))
            .map(PDMarkedContent::getProperties)
            // Marked content may carry no property dictionary at all.
            .filter(p -> p != null)
            // instanceof covers both a missing entry and a Subtype that is not a name object.
            .filter(p -> p.getItem("Subtype") instanceof COSName name && name.getName().equals(subtype))
            .map(p -> p.getItem("BBox"))
            // Artifacts without a BBox entry previously caused an NPE on toFloatArray().
            .filter(bbox -> bbox instanceof COSArray)
            .map(bbox -> ((COSArray) bbox).toFloatArray())
            // A BBox needs four coordinates; skip malformed arrays instead of failing on index access.
            .filter(f -> f.length >= 4)
            .map(f -> new Rectangle2D.Float(f[0], f[1], f[2] - f[0], f[3] - f[1]))
            .collect(Collectors.toList());
}
private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) {
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex);