diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index e62c55a..acb23fc 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -238,9 +238,9 @@ public class LayoutParsingPipeline { } - private Map convertMarkedContents(List pdMarkedContents) { + private Map> convertMarkedContents(List pdMarkedContents) { - Map markedContentBboxes = new HashMap<>(); + Map> markedContentBboxes = new HashMap<>(); markedContentBboxes.put(MarkedContentUtils.HEADER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.HEADER)); markedContentBboxes.put(MarkedContentUtils.FOOTER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.FOOTER)); return markedContentBboxes; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java index 798612e..a654636 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/ClassificationPage.java @@ -42,6 +42,6 @@ public class ClassificationPage { CleanRulings cleanRulings; - private Map markedContentBboxPerType = new HashMap<>(); + private Map> markedContentBboxPerType = new HashMap<>(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java index 49d46f3..af4676f 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java @@ -19,7 +19,7 @@ public class MarkedContentUtils { public static final String HEADER = "Header"; public static final String FOOTER = "Footer"; - public Rectangle2D getMarkedContentBboxPerLine(List markedContents, String subtype) { + public List getMarkedContentBboxPerLine(List markedContents, String subtype) { if (markedContents == null) { return null; @@ -40,16 +40,16 @@ public class MarkedContentUtils { return null; } - return RectangleTransformations.rectangle2DBBox(markedContentByYPosition.values().stream() + return markedContentByYPosition.values().stream() .map(textPositions -> new TextPositionSequence(textPositions.stream() .toList(), 0, true) .getRectangle()) - .map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))).collect(Collectors.toList())); + .map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))).collect(Collectors.toList()); } - public boolean intersects(TextPageBlock textBlock, Map markedContentBboxPerType, String type) { - return markedContentBboxPerType.get(type) != null && markedContentBboxPerType.get(type).intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()); + public boolean intersects(TextPageBlock textBlock, Map> markedContentBboxPerType, String type) { + return markedContentBboxPerType.get(type) != null && markedContentBboxPerType.get(type).stream().anyMatch(rectangle -> rectangle.intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight())); } } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ExtractMarkedContentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ExtractMarkedContentTest.java new file mode 100644 index 0000000..a811fe7 --- /dev/null +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ExtractMarkedContentTest.java @@ -0,0 +1,196 @@ +package com.knecon.fforesight.service.layoutparser.server.graph; + +import java.io.File; +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import lombok.SneakyThrows; +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.cos.COSArray; +import org.apache.pdfbox.cos.COSBase; +import org.apache.pdfbox.cos.COSDictionary; +import org.apache.pdfbox.cos.COSNumber; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureElement; +import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureNode; +import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent; +import org.apache.pdfbox.text.PDFMarkedContentExtractor; +import org.apache.pdfbox.text.TextPosition; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; +import org.springframework.core.io.ClassPathResource; + +/** + * @author mkl + */ +@Disabled +public class ExtractMarkedContentTest { + final static File RESULT_FOLDER = new File("target/test-outputs", "extract"); + + @BeforeEach + public void setUpBeforeClass() throws Exception { + RESULT_FOLDER.mkdirs(); + } + + /** + * + * How to replace a space with a word while extract the data from PDF using PDFBox + * + *
+ * + * test.pdf + * as "testWPhromma.pdf" + *

+ * This test shows how to, in principle, extract tagged text. + *

+ */ + @Test + @SneakyThrows + public void testExtractTestWPhromma() throws IOException { + System.out.printf("\n\n===\n%s\n===\n", "testWPhromma.pdf"); + PDDocument document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile()); + + Map> markedContents = new HashMap<>(); + + for (PDPage page : document.getPages()) { + PDFMarkedContentExtractor extractor = new PDFMarkedContentExtractor(); + extractor.processPage(page); + + Map theseMarkedContents = new HashMap<>(); + markedContents.put(page, theseMarkedContents); + for (PDMarkedContent markedContent : extractor.getMarkedContents()) { + theseMarkedContents.put(markedContent.getMCID(), markedContent); + } + } + + PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot(); + showStructure(root, markedContents); + } + + /** + * + * Get tag's related BBox's even though there is no attributes (/A in document catalog structure) related to Layout in PDFBox? + * + *
+ * + * res_multipage.pdf + * + *

+ * This test shows how to, in principle, extract tagged text from this document. + *

+ */ + @Test + public void testExtractResMultipage() throws IOException { + System.out.printf("\n\n===\n%s\n===\n", "res_multipage.pdf"); + PDDocument document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile()); + + Map> markedContents = new HashMap<>(); + + for (PDPage page : document.getPages()) { + PDFMarkedContentExtractor extractor = new PDFMarkedContentExtractor(); + extractor.processPage(page); + + Map theseMarkedContents = new HashMap<>(); + markedContents.put(page, theseMarkedContents); + for (PDMarkedContent markedContent : extractor.getMarkedContents()) { + theseMarkedContents.put(markedContent.getMCID(), markedContent); + } + } + + PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot(); + showStructure(root, markedContents); + } + + /** + * + * PDFBOX-5613 - uncorrent paragraph split + * + *
+ * + * Daily Report.pdf + * + *

+ * Making use of the marked content, the contents of the file can better be extracted. + *

+ */ + @Test + public void testExtractDailyReport() throws IOException { + System.out.printf("\n\n===\n%s\n===\n", "Daily Report.pdf"); + PDDocument document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile()); + + Map> markedContents = new HashMap<>(); + + for (PDPage page : document.getPages()) { + PDFMarkedContentExtractor extractor = new PDFMarkedContentExtractor(); + extractor.processPage(page); + + Map theseMarkedContents = new HashMap<>(); + markedContents.put(page, theseMarkedContents); + for (PDMarkedContent markedContent : extractor.getMarkedContents()) { + theseMarkedContents.put(markedContent.getMCID(), markedContent); + } + } + + PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot(); + showStructure(root, markedContents); + } + + /** + * @see #testExtractTestWPhromma() + */ + void showStructure(PDStructureNode node, Map> markedContents) { + String structType = null; + PDPage page = null; + if (node instanceof PDStructureElement) { + PDStructureElement element = (PDStructureElement) node; + structType = element.getStructureType(); + page = element.getPage(); + } + Map theseMarkedContents = markedContents.get(page); + System.out.printf("<%s>\n", structType); + for (Object object : node.getKids()) { + if (object instanceof COSArray) { + for (COSBase base : (COSArray) object) { + if (base instanceof COSDictionary) { + showStructure(PDStructureNode.create((COSDictionary) base), markedContents); + } else if (base instanceof COSNumber) { + showContent(((COSNumber)base).intValue(), theseMarkedContents); + } else { + System.out.printf("?%s\n", base); + } + } + } else if (object instanceof PDStructureNode) { + showStructure((PDStructureNode) object, markedContents); + } else if (object instanceof Integer) { + showContent((Integer)object, theseMarkedContents); + } else { + System.out.printf("?%s\n", object); + } + + } + System.out.printf("\n", structType); + } + + /** + * @see #showStructure(PDStructureNode, Map) + * @see #testExtractTestWPhromma() + */ + void showContent(int mcid, Map theseMarkedContents) { + PDMarkedContent markedContent = theseMarkedContents != null ? theseMarkedContents.get(mcid) : null; + List contents = markedContent != null ? markedContent.getContents() : Collections.emptyList(); + StringBuilder textContent = new StringBuilder(); + for (Object object : contents) { + if (object instanceof TextPosition) { + textContent.append(((TextPosition)object).getUnicode()); + } else { + textContent.append("?" + object); + } + } + System.out.printf("%s\n", textContent); + } +} \ No newline at end of file diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index 78a4785..9db7867 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -25,8 +25,8 @@ public class ViewerDocumentTest extends BuildDocumentTest { LayoutGridService layoutGridService = new LayoutGridService(); ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService); - String fileName = "files/new/VV-511309_OCR.pdf"; - Document document = buildGraph(fileName, LayoutParsingType.DOCUMINE); + String fileName = "files/marked_content/Header-Header.pdf"; + Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER); String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getFile()); var out = new FileOutputStream(tmpFileName)) { viewerDocumentService.createViewerDocument(pdDocument, document, out, true); diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/marked_content/Footer-Footer.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/marked_content/Footer-Footer.pdf new file mode 100644 index 0000000..b2dc11e Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/marked_content/Footer-Footer.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/marked_content/Header-Footer.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/marked_content/Header-Footer.pdf new file mode 100644 index 0000000..eba7e07 Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/marked_content/Header-Footer.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/marked_content/Header-Header.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/marked_content/Header-Header.pdf new file mode 100644 index 0000000..b616ee5 Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/marked_content/Header-Header.pdf differ