Merge branch 'RED-7607-WIP' into 'main'

RED-7607 - Rotating pages leads to lost annotations (RM & DM)

See merge request fforesight/layout-parser!75
This commit is contained in:
Kilian Schüttler 2023-10-05 13:34:12 +02:00
commit 310c07b200
5 changed files with 20 additions and 11 deletions

View File

@ -249,9 +249,9 @@ public class LayoutParsingPipeline {
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) { private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
if (!classificationPage.isLandscape()) { // if (!classificationPage.isLandscape()) {
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue()); document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
} // }
document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue()); document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue());
document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue()); document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue());
document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue()); document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue());

View File

@ -1711,7 +1711,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
int numberOfStrings = line.size(); int numberOfStrings = line.size();
for (int i = 0; i < numberOfStrings; i++) { for (int i = 0; i < numberOfStrings; i++) {
WordWithTextPositions word = line.get(i); WordWithTextPositions word = line.get(i);
word.getTextPositions().sort(Comparator.comparing(TextPosition::getX)); word.getTextPositions().sort(Comparator.comparing(TextPosition::getXDirAdj));
writeString(word.getText(), word.getTextPositions(), isParagraphEnd && i == numberOfStrings - 1); writeString(word.getText(), word.getTextPositions(), isParagraphEnd && i == numberOfStrings - 1);
if (i < numberOfStrings - 1) { if (i < numberOfStrings - 1) {
writeWordSeparator(); writeWordSeparator();

View File

@ -9,6 +9,7 @@ import org.apache.pdfbox.text.TextPosition;
import java.awt.geom.Rectangle2D; import java.awt.geom.Rectangle2D;
import java.util.Collection; import java.util.Collection;
import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.stream.Collectors; import java.util.stream.Collectors;
@ -22,7 +23,7 @@ public class MarkedContentUtils {
public List<Rectangle2D> getMarkedContentBboxPerLine(List<PDMarkedContent> markedContents, String subtype) { public List<Rectangle2D> getMarkedContentBboxPerLine(List<PDMarkedContent> markedContents, String subtype) {
if (markedContents == null) { if (markedContents == null) {
return null; return Collections.emptyList();
} }
var markedContentByYPosition = markedContents.stream() var markedContentByYPosition = markedContents.stream()
@ -37,7 +38,7 @@ public class MarkedContentUtils {
.collect(Collectors.groupingBy(TextPosition::getY)); .collect(Collectors.groupingBy(TextPosition::getY));
if (markedContentByYPosition.isEmpty()) { if (markedContentByYPosition.isEmpty()) {
return null; return Collections.emptyList();
} }
return markedContentByYPosition.values().stream() return markedContentByYPosition.values().stream()

View File

@ -19,9 +19,10 @@ public final class PositionUtils {
double threshold = textBlock.getMostPopularWordHeight() * 3; double threshold = textBlock.getMostPopularWordHeight() * 3;
if (textBlock.getPdfMinX() + threshold > btf.getTopLeft().getX() && textBlock.getPdfMaxX() - threshold < btf.getTopLeft() if (textBlock.getPdfMinX() + threshold > btf.getTopLeft().getX()
.getX() + btf.getWidth() && textBlock.getPdfMinY() + threshold > btf.getTopLeft().getY() && textBlock.getPdfMaxY() - threshold < btf.getTopLeft() && textBlock.getPdfMaxX() - threshold < btf.getTopLeft().getX() + btf.getWidth()
.getY() + btf.getHeight()) { && textBlock.getPdfMinY() + threshold > btf.getTopLeft().getY()
&& textBlock.getPdfMaxY() - threshold < btf.getTopLeft().getY() + btf.getHeight()) {
return true; return true;
} else { } else {
return false; return false;

View File

@ -53,7 +53,7 @@ public class ExtractMarkedContentTest {
@SneakyThrows @SneakyThrows
public void testExtractTestWPhromma() throws IOException { public void testExtractTestWPhromma() throws IOException {
System.out.printf("\n\n===\n%s\n===\n", "testWPhromma.pdf"); System.out.printf("\n\n===\n%s\n===\n", "testWPhromma.pdf");
PDDocument document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile()); try (PDDocument document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile())) {
Map<PDPage, Map<Integer, PDMarkedContent>> markedContents = new HashMap<>(); Map<PDPage, Map<Integer, PDMarkedContent>> markedContents = new HashMap<>();
@ -70,6 +70,8 @@ public class ExtractMarkedContentTest {
PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot(); PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot();
showStructure(root, markedContents); showStructure(root, markedContents);
document.close();
}
} }
/** /**
@ -87,7 +89,8 @@ public class ExtractMarkedContentTest {
@Test @Test
public void testExtractResMultipage() throws IOException { public void testExtractResMultipage() throws IOException {
System.out.printf("\n\n===\n%s\n===\n", "res_multipage.pdf"); System.out.printf("\n\n===\n%s\n===\n", "res_multipage.pdf");
PDDocument document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile());
try(PDDocument document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile())) {
Map<PDPage, Map<Integer, PDMarkedContent>> markedContents = new HashMap<>(); Map<PDPage, Map<Integer, PDMarkedContent>> markedContents = new HashMap<>();
@ -104,6 +107,8 @@ public class ExtractMarkedContentTest {
PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot(); PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot();
showStructure(root, markedContents); showStructure(root, markedContents);
document.close();
}
} }
/** /**
@ -121,7 +126,7 @@ public class ExtractMarkedContentTest {
@Test @Test
public void testExtractDailyReport() throws IOException { public void testExtractDailyReport() throws IOException {
System.out.printf("\n\n===\n%s\n===\n", "Daily Report.pdf"); System.out.printf("\n\n===\n%s\n===\n", "Daily Report.pdf");
PDDocument document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile()); try (PDDocument document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile())) {
Map<PDPage, Map<Integer, PDMarkedContent>> markedContents = new HashMap<>(); Map<PDPage, Map<Integer, PDMarkedContent>> markedContents = new HashMap<>();
@ -138,7 +143,9 @@ public class ExtractMarkedContentTest {
PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot(); PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot();
showStructure(root, markedContents); showStructure(root, markedContents);
document.close();
} }
}
/** /**
* @see #testExtractTestWPhromma() * @see #testExtractTestWPhromma()