Merge branch 'RED-7461' into 'main'

RED-7461: Fixed wrong textblock classification if footer is marked as header. See merge request fforesight/layout-parser!72
2023-09-01 12:14:38 +02:00 · 2023-09-01 12:14:38 +02:00 · 8dba392904
commit 8dba392904
parent 754fd8f933 306a53ea79
8 changed files with 206 additions and 10 deletions
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java
@ -238,9 +238,9 @@ public class LayoutParsingPipeline {
    }


-    private Map<String, Rectangle2D> convertMarkedContents(List<PDMarkedContent> pdMarkedContents) {
+    private Map<String, List<Rectangle2D>> convertMarkedContents(List<PDMarkedContent> pdMarkedContents) {

-        Map<String, Rectangle2D> markedContentBboxes = new HashMap<>();
+        Map<String, List<Rectangle2D>> markedContentBboxes = new HashMap<>();
        markedContentBboxes.put(MarkedContentUtils.HEADER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.HEADER));
        markedContentBboxes.put(MarkedContentUtils.FOOTER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.FOOTER));
        return markedContentBboxes;
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java
@ -42,6 +42,6 @@ public class ClassificationPage {

    CleanRulings cleanRulings;

-    private Map<String, Rectangle2D> markedContentBboxPerType = new HashMap<>();
+    private Map<String, List<Rectangle2D>> markedContentBboxPerType = new HashMap<>();

 }
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java
@ -19,7 +19,7 @@ public class MarkedContentUtils {
    public static final String HEADER = "Header";
    public static final String FOOTER = "Footer";

-    public Rectangle2D getMarkedContentBboxPerLine(List<PDMarkedContent> markedContents, String subtype) {
+    public List<Rectangle2D> getMarkedContentBboxPerLine(List<PDMarkedContent> markedContents, String subtype) {

        if (markedContents == null) {
            return null;
@ -40,16 +40,16 @@ public class MarkedContentUtils {
            return null;
        }

-        return RectangleTransformations.rectangle2DBBox(markedContentByYPosition.values().stream()
+        return markedContentByYPosition.values().stream()
                .map(textPositions -> new TextPositionSequence(textPositions.stream()
                        .toList(), 0, true)
                        .getRectangle())
-                .map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))).collect(Collectors.toList()));
+                .map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))).collect(Collectors.toList());
    }


-    public boolean intersects(TextPageBlock textBlock, Map<String, Rectangle2D> markedContentBboxPerType, String type) {
-        return markedContentBboxPerType.get(type) != null && markedContentBboxPerType.get(type).intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight());
+    public boolean intersects(TextPageBlock textBlock, Map<String, List<Rectangle2D>> markedContentBboxPerType, String type) {
+        return markedContentBboxPerType.get(type) != null && markedContentBboxPerType.get(type).stream().anyMatch(rectangle -> rectangle.intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()));
    }

 }
--- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ExtractMarkedContentTest.java
+++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ExtractMarkedContentTest.java
@ -0,0 +1,196 @@
+package com.knecon.fforesight.service.layoutparser.server.graph;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import lombok.SneakyThrows;
+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.cos.COSArray;
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.cos.COSDictionary;
+import org.apache.pdfbox.cos.COSNumber;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureElement;
+import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureNode;
+import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
+import org.apache.pdfbox.text.PDFMarkedContentExtractor;
+import org.apache.pdfbox.text.TextPosition;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
+import org.springframework.core.io.ClassPathResource;
+
+/**
+ * Exploratory tests that print the tagged structure tree of a PDF, together with the text of
+ * its marked content, to standard output. They assert nothing and the class is disabled.
+ *
+ * @author mkl
+ */
+@Disabled
+public class ExtractMarkedContentTest {
+
+    static final File RESULT_FOLDER = new File("target/test-outputs", "extract");
+
+    /** Classpath resource that every test in this class loads, whatever label it prints. */
+    private static final String TEST_PDF = "files/bdr/Drucksache_19_9865.pdf";
+
+    /** Creates the result folder, including missing parent directories, before each test. */
+    @BeforeEach
+    public void setUpBeforeClass() throws Exception {
+        RESULT_FOLDER.mkdirs();
+    }
+
+    /**
+     * <a href="https://stackoverflow.com/questions/54956720/how-to-replace-a-space-with-a-word-while-extract-the-data-from-pdf-using-pdfbox">
+     * How to replace a space with a word while extract the data from PDF using PDFBox
+     * </a>
+     * <br/>
+     * <a href="https://drive.google.com/open?id=10ZkdPlGWzMJeahwnQPzE6V7s09d1nvwq">
+     * test.pdf
+     * </a> as "testWPhromma.pdf"
+     * <p>
+     * This test shows how to, in principle, extract tagged text.
+     * NOTE: the name above is only the printed label; the file loaded is {@code TEST_PDF}.
+     * </p>
+     */
+    @Test
+    @SneakyThrows
+    public void testExtractTestWPhromma() throws IOException {
+        dumpTaggedStructure("testWPhromma.pdf");
+    }
+
+    /**
+     * <a href="https://stackoverflow.com/questions/59192443/get-tags-related-bboxs-even-though-there-is-no-attributes-a-in-document-cata">
+     * Get tag's related BBox's even though there is no attributes (/A in document catalog structure) related to Layout in PDFBox?
+     * </a>
+     * <br/>
+     * <a href="https://drive.google.com/file/d/1_-tuWuReaTvrDsqQwldTnPYrMHSpXIWp/view?usp=sharing">
+     * res_multipage.pdf
+     * </a>
+     * <p>
+     * This test shows how to, in principle, extract tagged text from this document.
+     * NOTE: the name above is only the printed label; the file loaded is {@code TEST_PDF}.
+     * </p>
+     */
+    @Test
+    public void testExtractResMultipage() throws IOException {
+        dumpTaggedStructure("res_multipage.pdf");
+    }
+
+    /**
+     * <a href="https://issues.apache.org/jira/browse/PDFBOX-5613">
+     * PDFBOX-5613 - uncorrent paragraph split
+     * </a>
+     * <br/>
+     * <a href="https://issues.apache.org/jira/secure/attachment/13058595/Daily%20Report.pdf">
+     * Daily Report.pdf
+     * </a>
+     * <p>
+     * Making use of the marked content, the contents of the file can better be extracted.
+     * NOTE: the name above is only the printed label; the file loaded is {@code TEST_PDF}.
+     * </p>
+     */
+    @Test
+    public void testExtractDailyReport() throws IOException {
+        dumpTaggedStructure("Daily Report.pdf");
+    }
+
+    /**
+     * Prints {@code label} as a banner, maps the marked content of every page of
+     * {@code TEST_PDF} by its MCID and prints the document's structure tree.
+     * The document is opened in try-with-resources so it is closed even when a step throws;
+     * the previous copies of this code in each test never closed it.
+     *
+     * @param label text printed in the banner; it does not select the file that is loaded
+     * @throws IOException if the resource cannot be resolved, loaded or processed
+     */
+    private void dumpTaggedStructure(String label) throws IOException {
+        System.out.printf("\n\n===\n%s\n===\n", label);
+        try (PDDocument document = Loader.loadPDF(new ClassPathResource(TEST_PDF).getFile())) {
+            Map<PDPage, Map<Integer, PDMarkedContent>> markedContents = new HashMap<>();
+
+            for (PDPage page : document.getPages()) {
+                PDFMarkedContentExtractor extractor = new PDFMarkedContentExtractor();
+                extractor.processPage(page);
+
+                Map<Integer, PDMarkedContent> theseMarkedContents = new HashMap<>();
+                markedContents.put(page, theseMarkedContents);
+                for (PDMarkedContent markedContent : extractor.getMarkedContents()) {
+                    theseMarkedContents.put(markedContent.getMCID(), markedContent);
+                }
+            }
+
+            PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot();
+            showStructure(root, markedContents);
+        }
+    }
+
+    /**
+     * Recursively prints {@code node} as an XML-like element named after its structure type
+     * ({@code null} for nodes that are not structure elements), with its kids printed inside.
+     *
+     * @param node structure node to print
+     * @param markedContents marked content per page, keyed by MCID
+     * @see #testExtractTestWPhromma()
+     */
+    void showStructure(PDStructureNode node, Map<PDPage, Map<Integer, PDMarkedContent>> markedContents) {
+        String structType = null;
+        PDPage page = null;
+        if (node instanceof PDStructureElement) {
+            PDStructureElement element = (PDStructureElement) node;
+            structType = element.getStructureType();
+            page = element.getPage();
+        }
+        // page stays null for non-element nodes; the lookup then normally yields null, which showContent accepts
+        Map<Integer, PDMarkedContent> theseMarkedContents = markedContents.get(page);
+        System.out.printf("<%s>\n", structType);
+        for (Object object : node.getKids()) {
+            if (object instanceof COSArray) {
+                for (COSBase base : (COSArray) object) {
+                    if (base instanceof COSDictionary) {
+                        showStructure(PDStructureNode.create((COSDictionary) base), markedContents);
+                    } else if (base instanceof COSNumber) {
+                        showContent(((COSNumber) base).intValue(), theseMarkedContents);
+                    } else {
+                        System.out.printf("?%s\n", base);
+                    }
+                }
+            } else if (object instanceof PDStructureNode) {
+                showStructure((PDStructureNode) object, markedContents);
+            } else if (object instanceof Integer) {
+                showContent((Integer) object, theseMarkedContents);
+            } else {
+                System.out.printf("?%s\n", object);
+            }
+        }
+        System.out.printf("</%s>\n", structType);
+    }
+
+    /**
+     * Prints, on one line, the text of the marked content with the given MCID; content objects
+     * that are not text positions are printed as {@code ?} followed by their string form.
+     * Prints an empty line when the page map is {@code null} or has no entry for the MCID.
+     *
+     * @param mcid marked-content identifier to look up
+     * @param theseMarkedContents marked content of one page keyed by MCID, may be {@code null}
+     * @see #showStructure(PDStructureNode, Map)
+     */
+    void showContent(int mcid, Map<Integer, PDMarkedContent> theseMarkedContents) {
+        PDMarkedContent markedContent = theseMarkedContents != null ? theseMarkedContents.get(mcid) : null;
+        List<Object> contents = markedContent != null ? markedContent.getContents() : Collections.emptyList();
+        StringBuilder textContent = new StringBuilder();
+        for (Object object : contents) {
+            if (object instanceof TextPosition) {
+                textContent.append(((TextPosition) object).getUnicode());
+            } else {
+                textContent.append('?').append(object);
+            }
+        }
+        System.out.printf("%s\n", textContent);
+    }
+}
--- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java
+++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java
@ -25,8 +25,8 @@ public class ViewerDocumentTest extends BuildDocumentTest {

        LayoutGridService layoutGridService = new LayoutGridService();
        ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService);
-        String fileName = "files/new/VV-511309_OCR.pdf";
-        Document document = buildGraph(fileName, LayoutParsingType.DOCUMINE);
+        String fileName = "files/marked_content/Header-Header.pdf";
+        Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
        String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
        try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getFile()); var out = new FileOutputStream(tmpFileName)) {
            viewerDocumentService.createViewerDocument(pdDocument, document, out, true);
--- a/layoutparser-service/layoutparser-service-server/src/test/resources/files/marked_content/Footer-Footer.pdf
+++ b/layoutparser-service/layoutparser-service-server/src/test/resources/files/marked_content/Footer-Footer.pdf
--- a/layoutparser-service/layoutparser-service-server/src/test/resources/files/marked_content/Header-Footer.pdf
+++ b/layoutparser-service/layoutparser-service-server/src/test/resources/files/marked_content/Header-Footer.pdf
--- a/layoutparser-service/layoutparser-service-server/src/test/resources/files/marked_content/Header-Header.pdf
+++ b/layoutparser-service/layoutparser-service-server/src/test/resources/files/marked_content/Header-Header.pdf