Merge branch 'RED-7461' into 'main'

RED-7461: Fixed wrong textblock classifation if footer is marked as header

See merge request fforesight/layout-parser!72
This commit is contained in:
Dominique Eifländer 2023-09-01 12:14:38 +02:00
commit 8dba392904
8 changed files with 206 additions and 10 deletions

View File

@ -238,9 +238,9 @@ public class LayoutParsingPipeline {
}
private Map<String, Rectangle2D> convertMarkedContents(List<PDMarkedContent> pdMarkedContents) {
private Map<String, List<Rectangle2D>> convertMarkedContents(List<PDMarkedContent> pdMarkedContents) {
Map<String, Rectangle2D> markedContentBboxes = new HashMap<>();
Map<String, List<Rectangle2D>> markedContentBboxes = new HashMap<>();
markedContentBboxes.put(MarkedContentUtils.HEADER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.HEADER));
markedContentBboxes.put(MarkedContentUtils.FOOTER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.FOOTER));
return markedContentBboxes;

View File

@ -42,6 +42,6 @@ public class ClassificationPage {
CleanRulings cleanRulings;
private Map<String, Rectangle2D> markedContentBboxPerType = new HashMap<>();
private Map<String, List<Rectangle2D>> markedContentBboxPerType = new HashMap<>();
}

View File

@ -19,7 +19,7 @@ public class MarkedContentUtils {
public static final String HEADER = "Header";
public static final String FOOTER = "Footer";
public Rectangle2D getMarkedContentBboxPerLine(List<PDMarkedContent> markedContents, String subtype) {
public List<Rectangle2D> getMarkedContentBboxPerLine(List<PDMarkedContent> markedContents, String subtype) {
if (markedContents == null) {
return null;
@ -40,16 +40,16 @@ public class MarkedContentUtils {
return null;
}
return RectangleTransformations.rectangle2DBBox(markedContentByYPosition.values().stream()
return markedContentByYPosition.values().stream()
.map(textPositions -> new TextPositionSequence(textPositions.stream()
.toList(), 0, true)
.getRectangle())
.map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))).collect(Collectors.toList()));
.map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))).collect(Collectors.toList());
}
public boolean intersects(TextPageBlock textBlock, Map<String, Rectangle2D> markedContentBboxPerType, String type) {
return markedContentBboxPerType.get(type) != null && markedContentBboxPerType.get(type).intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight());
public boolean intersects(TextPageBlock textBlock, Map<String, List<Rectangle2D>> markedContentBboxPerType, String type) {
return markedContentBboxPerType.get(type) != null && markedContentBboxPerType.get(type).stream().anyMatch(rectangle -> rectangle.intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()));
}
}

View File

@ -0,0 +1,196 @@
package com.knecon.fforesight.service.layoutparser.server.graph;
import java.io.File;
import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import lombok.SneakyThrows;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSNumber;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureElement;
import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureNode;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.apache.pdfbox.text.PDFMarkedContentExtractor;
import org.apache.pdfbox.text.TextPosition;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.springframework.core.io.ClassPathResource;
/**
* @author mkl
*/
@Disabled
public class ExtractMarkedContentTest {
final static File RESULT_FOLDER = new File("target/test-outputs", "extract");
@BeforeEach
public void setUpBeforeClass() throws Exception {
RESULT_FOLDER.mkdirs();
}
/**
* <a href="https://stackoverflow.com/questions/54956720/how-to-replace-a-space-with-a-word-while-extract-the-data-from-pdf-using-pdfbox">
* How to replace a space with a word while extract the data from PDF using PDFBox
* </a>
* <br/>
* <a href="https://drive.google.com/open?id=10ZkdPlGWzMJeahwnQPzE6V7s09d1nvwq">
* test.pdf
* </a> as "testWPhromma.pdf"
* <p>
* This test shows how to, in principle, extract tagged text.
* </p>
*/
@Test
@SneakyThrows
public void testExtractTestWPhromma() throws IOException {
System.out.printf("\n\n===\n%s\n===\n", "testWPhromma.pdf");
PDDocument document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile());
Map<PDPage, Map<Integer, PDMarkedContent>> markedContents = new HashMap<>();
for (PDPage page : document.getPages()) {
PDFMarkedContentExtractor extractor = new PDFMarkedContentExtractor();
extractor.processPage(page);
Map<Integer, PDMarkedContent> theseMarkedContents = new HashMap<>();
markedContents.put(page, theseMarkedContents);
for (PDMarkedContent markedContent : extractor.getMarkedContents()) {
theseMarkedContents.put(markedContent.getMCID(), markedContent);
}
}
PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot();
showStructure(root, markedContents);
}
/**
* <a href="https://stackoverflow.com/questions/59192443/get-tags-related-bboxs-even-though-there-is-no-attributes-a-in-document-cata">
* Get tag's related BBox's even though there is no attributes (/A in document catalog structure) related to Layout in PDFBox?
* </a>
* <br/>
* <a href="https://drive.google.com/file/d/1_-tuWuReaTvrDsqQwldTnPYrMHSpXIWp/view?usp=sharing">
* res_multipage.pdf
* </a>
* <p>
* This test shows how to, in principle, extract tagged text from this document.
* </p>
*/
@Test
public void testExtractResMultipage() throws IOException {
System.out.printf("\n\n===\n%s\n===\n", "res_multipage.pdf");
PDDocument document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile());
Map<PDPage, Map<Integer, PDMarkedContent>> markedContents = new HashMap<>();
for (PDPage page : document.getPages()) {
PDFMarkedContentExtractor extractor = new PDFMarkedContentExtractor();
extractor.processPage(page);
Map<Integer, PDMarkedContent> theseMarkedContents = new HashMap<>();
markedContents.put(page, theseMarkedContents);
for (PDMarkedContent markedContent : extractor.getMarkedContents()) {
theseMarkedContents.put(markedContent.getMCID(), markedContent);
}
}
PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot();
showStructure(root, markedContents);
}
/**
* <a href="https://issues.apache.org/jira/browse/PDFBOX-5613">
* PDFBOX-5613 - uncorrent paragraph split
* </a>
* <br/>
* <a href="https://issues.apache.org/jira/secure/attachment/13058595/Daily%20Report.pdf">
* Daily Report.pdf
* </a>
* <p>
* Making use of the marked content, the contents of the file can better be extracted.
* </p>
*/
@Test
public void testExtractDailyReport() throws IOException {
System.out.printf("\n\n===\n%s\n===\n", "Daily Report.pdf");
PDDocument document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile());
Map<PDPage, Map<Integer, PDMarkedContent>> markedContents = new HashMap<>();
for (PDPage page : document.getPages()) {
PDFMarkedContentExtractor extractor = new PDFMarkedContentExtractor();
extractor.processPage(page);
Map<Integer, PDMarkedContent> theseMarkedContents = new HashMap<>();
markedContents.put(page, theseMarkedContents);
for (PDMarkedContent markedContent : extractor.getMarkedContents()) {
theseMarkedContents.put(markedContent.getMCID(), markedContent);
}
}
PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot();
showStructure(root, markedContents);
}
/**
* @see #testExtractTestWPhromma()
*/
void showStructure(PDStructureNode node, Map<PDPage, Map<Integer, PDMarkedContent>> markedContents) {
String structType = null;
PDPage page = null;
if (node instanceof PDStructureElement) {
PDStructureElement element = (PDStructureElement) node;
structType = element.getStructureType();
page = element.getPage();
}
Map<Integer, PDMarkedContent> theseMarkedContents = markedContents.get(page);
System.out.printf("<%s>\n", structType);
for (Object object : node.getKids()) {
if (object instanceof COSArray) {
for (COSBase base : (COSArray) object) {
if (base instanceof COSDictionary) {
showStructure(PDStructureNode.create((COSDictionary) base), markedContents);
} else if (base instanceof COSNumber) {
showContent(((COSNumber)base).intValue(), theseMarkedContents);
} else {
System.out.printf("?%s\n", base);
}
}
} else if (object instanceof PDStructureNode) {
showStructure((PDStructureNode) object, markedContents);
} else if (object instanceof Integer) {
showContent((Integer)object, theseMarkedContents);
} else {
System.out.printf("?%s\n", object);
}
}
System.out.printf("</%s>\n", structType);
}
/**
* @see #showStructure(PDStructureNode, Map)
* @see #testExtractTestWPhromma()
*/
void showContent(int mcid, Map<Integer, PDMarkedContent> theseMarkedContents) {
PDMarkedContent markedContent = theseMarkedContents != null ? theseMarkedContents.get(mcid) : null;
List<Object> contents = markedContent != null ? markedContent.getContents() : Collections.emptyList();
StringBuilder textContent = new StringBuilder();
for (Object object : contents) {
if (object instanceof TextPosition) {
textContent.append(((TextPosition)object).getUnicode());
} else {
textContent.append("?" + object);
}
}
System.out.printf("%s\n", textContent);
}
}

View File

@ -25,8 +25,8 @@ public class ViewerDocumentTest extends BuildDocumentTest {
LayoutGridService layoutGridService = new LayoutGridService();
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService);
String fileName = "files/new/VV-511309_OCR.pdf";
Document document = buildGraph(fileName, LayoutParsingType.DOCUMINE);
String fileName = "files/marked_content/Header-Header.pdf";
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getFile()); var out = new FileOutputStream(tmpFileName)) {
viewerDocumentService.createViewerDocument(pdDocument, document, out, true);