Merge branch 'RED-7461' into 'main'
RED-7461: Fixed wrong textblock classification if footer is marked as header. See merge request fforesight/layout-parser!72
This commit is contained in:
commit
8dba392904
@ -238,9 +238,9 @@ public class LayoutParsingPipeline {
|
||||
}
|
||||
|
||||
|
||||
private Map<String, Rectangle2D> convertMarkedContents(List<PDMarkedContent> pdMarkedContents) {
|
||||
private Map<String, List<Rectangle2D>> convertMarkedContents(List<PDMarkedContent> pdMarkedContents) {
|
||||
|
||||
Map<String, Rectangle2D> markedContentBboxes = new HashMap<>();
|
||||
Map<String, List<Rectangle2D>> markedContentBboxes = new HashMap<>();
|
||||
markedContentBboxes.put(MarkedContentUtils.HEADER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.HEADER));
|
||||
markedContentBboxes.put(MarkedContentUtils.FOOTER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.FOOTER));
|
||||
return markedContentBboxes;
|
||||
|
||||
@ -42,6 +42,6 @@ public class ClassificationPage {
|
||||
|
||||
CleanRulings cleanRulings;
|
||||
|
||||
private Map<String, Rectangle2D> markedContentBboxPerType = new HashMap<>();
|
||||
private Map<String, List<Rectangle2D>> markedContentBboxPerType = new HashMap<>();
|
||||
|
||||
}
|
||||
|
||||
@ -19,7 +19,7 @@ public class MarkedContentUtils {
|
||||
public static final String HEADER = "Header";
|
||||
public static final String FOOTER = "Footer";
|
||||
|
||||
public Rectangle2D getMarkedContentBboxPerLine(List<PDMarkedContent> markedContents, String subtype) {
|
||||
public List<Rectangle2D> getMarkedContentBboxPerLine(List<PDMarkedContent> markedContents, String subtype) {
|
||||
|
||||
if (markedContents == null) {
|
||||
return null;
|
||||
@ -40,16 +40,16 @@ public class MarkedContentUtils {
|
||||
return null;
|
||||
}
|
||||
|
||||
return RectangleTransformations.rectangle2DBBox(markedContentByYPosition.values().stream()
|
||||
return markedContentByYPosition.values().stream()
|
||||
.map(textPositions -> new TextPositionSequence(textPositions.stream()
|
||||
.toList(), 0, true)
|
||||
.getRectangle())
|
||||
.map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))).collect(Collectors.toList()));
|
||||
.map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
public boolean intersects(TextPageBlock textBlock, Map<String, Rectangle2D> markedContentBboxPerType, String type) {
|
||||
return markedContentBboxPerType.get(type) != null && markedContentBboxPerType.get(type).intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight());
|
||||
public boolean intersects(TextPageBlock textBlock, Map<String, List<Rectangle2D>> markedContentBboxPerType, String type) {
|
||||
return markedContentBboxPerType.get(type) != null && markedContentBboxPerType.get(type).stream().anyMatch(rectangle -> rectangle.intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,196 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.graph;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.cos.COSArray;
|
||||
import org.apache.pdfbox.cos.COSBase;
|
||||
import org.apache.pdfbox.cos.COSDictionary;
|
||||
import org.apache.pdfbox.cos.COSNumber;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureElement;
|
||||
import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureNode;
|
||||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
||||
import org.apache.pdfbox.text.PDFMarkedContentExtractor;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
/**
|
||||
* @author mkl
|
||||
*/
|
||||
@Disabled
|
||||
public class ExtractMarkedContentTest {
|
||||
final static File RESULT_FOLDER = new File("target/test-outputs", "extract");
|
||||
|
||||
@BeforeEach
|
||||
public void setUpBeforeClass() throws Exception {
|
||||
RESULT_FOLDER.mkdirs();
|
||||
}
|
||||
|
||||
/**
|
||||
* <a href="https://stackoverflow.com/questions/54956720/how-to-replace-a-space-with-a-word-while-extract-the-data-from-pdf-using-pdfbox">
|
||||
* How to replace a space with a word while extract the data from PDF using PDFBox
|
||||
* </a>
|
||||
* <br/>
|
||||
* <a href="https://drive.google.com/open?id=10ZkdPlGWzMJeahwnQPzE6V7s09d1nvwq">
|
||||
* test.pdf
|
||||
* </a> as "testWPhromma.pdf"
|
||||
* <p>
|
||||
* This test shows how to, in principle, extract tagged text.
|
||||
* </p>
|
||||
*/
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testExtractTestWPhromma() throws IOException {
|
||||
System.out.printf("\n\n===\n%s\n===\n", "testWPhromma.pdf");
|
||||
PDDocument document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile());
|
||||
|
||||
Map<PDPage, Map<Integer, PDMarkedContent>> markedContents = new HashMap<>();
|
||||
|
||||
for (PDPage page : document.getPages()) {
|
||||
PDFMarkedContentExtractor extractor = new PDFMarkedContentExtractor();
|
||||
extractor.processPage(page);
|
||||
|
||||
Map<Integer, PDMarkedContent> theseMarkedContents = new HashMap<>();
|
||||
markedContents.put(page, theseMarkedContents);
|
||||
for (PDMarkedContent markedContent : extractor.getMarkedContents()) {
|
||||
theseMarkedContents.put(markedContent.getMCID(), markedContent);
|
||||
}
|
||||
}
|
||||
|
||||
PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot();
|
||||
showStructure(root, markedContents);
|
||||
}
|
||||
|
||||
/**
|
||||
* <a href="https://stackoverflow.com/questions/59192443/get-tags-related-bboxs-even-though-there-is-no-attributes-a-in-document-cata">
|
||||
* Get tag's related BBox's even though there is no attributes (/A in document catalog structure) related to Layout in PDFBox?
|
||||
* </a>
|
||||
* <br/>
|
||||
* <a href="https://drive.google.com/file/d/1_-tuWuReaTvrDsqQwldTnPYrMHSpXIWp/view?usp=sharing">
|
||||
* res_multipage.pdf
|
||||
* </a>
|
||||
* <p>
|
||||
* This test shows how to, in principle, extract tagged text from this document.
|
||||
* </p>
|
||||
*/
|
||||
@Test
|
||||
public void testExtractResMultipage() throws IOException {
|
||||
System.out.printf("\n\n===\n%s\n===\n", "res_multipage.pdf");
|
||||
PDDocument document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile());
|
||||
|
||||
Map<PDPage, Map<Integer, PDMarkedContent>> markedContents = new HashMap<>();
|
||||
|
||||
for (PDPage page : document.getPages()) {
|
||||
PDFMarkedContentExtractor extractor = new PDFMarkedContentExtractor();
|
||||
extractor.processPage(page);
|
||||
|
||||
Map<Integer, PDMarkedContent> theseMarkedContents = new HashMap<>();
|
||||
markedContents.put(page, theseMarkedContents);
|
||||
for (PDMarkedContent markedContent : extractor.getMarkedContents()) {
|
||||
theseMarkedContents.put(markedContent.getMCID(), markedContent);
|
||||
}
|
||||
}
|
||||
|
||||
PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot();
|
||||
showStructure(root, markedContents);
|
||||
}
|
||||
|
||||
/**
|
||||
* <a href="https://issues.apache.org/jira/browse/PDFBOX-5613">
|
||||
* PDFBOX-5613 - uncorrent paragraph split
|
||||
* </a>
|
||||
* <br/>
|
||||
* <a href="https://issues.apache.org/jira/secure/attachment/13058595/Daily%20Report.pdf">
|
||||
* Daily Report.pdf
|
||||
* </a>
|
||||
* <p>
|
||||
* Making use of the marked content, the contents of the file can better be extracted.
|
||||
* </p>
|
||||
*/
|
||||
@Test
|
||||
public void testExtractDailyReport() throws IOException {
|
||||
System.out.printf("\n\n===\n%s\n===\n", "Daily Report.pdf");
|
||||
PDDocument document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile());
|
||||
|
||||
Map<PDPage, Map<Integer, PDMarkedContent>> markedContents = new HashMap<>();
|
||||
|
||||
for (PDPage page : document.getPages()) {
|
||||
PDFMarkedContentExtractor extractor = new PDFMarkedContentExtractor();
|
||||
extractor.processPage(page);
|
||||
|
||||
Map<Integer, PDMarkedContent> theseMarkedContents = new HashMap<>();
|
||||
markedContents.put(page, theseMarkedContents);
|
||||
for (PDMarkedContent markedContent : extractor.getMarkedContents()) {
|
||||
theseMarkedContents.put(markedContent.getMCID(), markedContent);
|
||||
}
|
||||
}
|
||||
|
||||
PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot();
|
||||
showStructure(root, markedContents);
|
||||
}
|
||||
|
||||
/**
|
||||
* @see #testExtractTestWPhromma()
|
||||
*/
|
||||
void showStructure(PDStructureNode node, Map<PDPage, Map<Integer, PDMarkedContent>> markedContents) {
|
||||
String structType = null;
|
||||
PDPage page = null;
|
||||
if (node instanceof PDStructureElement) {
|
||||
PDStructureElement element = (PDStructureElement) node;
|
||||
structType = element.getStructureType();
|
||||
page = element.getPage();
|
||||
}
|
||||
Map<Integer, PDMarkedContent> theseMarkedContents = markedContents.get(page);
|
||||
System.out.printf("<%s>\n", structType);
|
||||
for (Object object : node.getKids()) {
|
||||
if (object instanceof COSArray) {
|
||||
for (COSBase base : (COSArray) object) {
|
||||
if (base instanceof COSDictionary) {
|
||||
showStructure(PDStructureNode.create((COSDictionary) base), markedContents);
|
||||
} else if (base instanceof COSNumber) {
|
||||
showContent(((COSNumber)base).intValue(), theseMarkedContents);
|
||||
} else {
|
||||
System.out.printf("?%s\n", base);
|
||||
}
|
||||
}
|
||||
} else if (object instanceof PDStructureNode) {
|
||||
showStructure((PDStructureNode) object, markedContents);
|
||||
} else if (object instanceof Integer) {
|
||||
showContent((Integer)object, theseMarkedContents);
|
||||
} else {
|
||||
System.out.printf("?%s\n", object);
|
||||
}
|
||||
|
||||
}
|
||||
System.out.printf("</%s>\n", structType);
|
||||
}
|
||||
|
||||
/**
|
||||
* @see #showStructure(PDStructureNode, Map)
|
||||
* @see #testExtractTestWPhromma()
|
||||
*/
|
||||
void showContent(int mcid, Map<Integer, PDMarkedContent> theseMarkedContents) {
|
||||
PDMarkedContent markedContent = theseMarkedContents != null ? theseMarkedContents.get(mcid) : null;
|
||||
List<Object> contents = markedContent != null ? markedContent.getContents() : Collections.emptyList();
|
||||
StringBuilder textContent = new StringBuilder();
|
||||
for (Object object : contents) {
|
||||
if (object instanceof TextPosition) {
|
||||
textContent.append(((TextPosition)object).getUnicode());
|
||||
} else {
|
||||
textContent.append("?" + object);
|
||||
}
|
||||
}
|
||||
System.out.printf("%s\n", textContent);
|
||||
}
|
||||
}
|
||||
@ -25,8 +25,8 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
|
||||
LayoutGridService layoutGridService = new LayoutGridService();
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService);
|
||||
String fileName = "files/new/VV-511309_OCR.pdf";
|
||||
Document document = buildGraph(fileName, LayoutParsingType.DOCUMINE);
|
||||
String fileName = "files/marked_content/Header-Header.pdf";
|
||||
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getFile()); var out = new FileOutputStream(tmpFileName)) {
|
||||
viewerDocumentService.createViewerDocument(pdDocument, document, out, true);
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user