Merge branch 'RED-7607-WIP' into 'main'
RED-7607 - Rotating pages leads to lost annotations (RM & DM) See merge request fforesight/layout-parser!75
This commit is contained in:
commit
310c07b200
@ -249,9 +249,9 @@ public class LayoutParsingPipeline {
|
|||||||
|
|
||||||
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
|
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
|
||||||
|
|
||||||
if (!classificationPage.isLandscape()) {
|
// if (!classificationPage.isLandscape()) {
|
||||||
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
|
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
|
||||||
}
|
// }
|
||||||
document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue());
|
document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue());
|
||||||
document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue());
|
document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue());
|
||||||
document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue());
|
document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue());
|
||||||
|
|||||||
@ -1711,7 +1711,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
|||||||
int numberOfStrings = line.size();
|
int numberOfStrings = line.size();
|
||||||
for (int i = 0; i < numberOfStrings; i++) {
|
for (int i = 0; i < numberOfStrings; i++) {
|
||||||
WordWithTextPositions word = line.get(i);
|
WordWithTextPositions word = line.get(i);
|
||||||
word.getTextPositions().sort(Comparator.comparing(TextPosition::getX));
|
word.getTextPositions().sort(Comparator.comparing(TextPosition::getXDirAdj));
|
||||||
writeString(word.getText(), word.getTextPositions(), isParagraphEnd && i == numberOfStrings - 1);
|
writeString(word.getText(), word.getTextPositions(), isParagraphEnd && i == numberOfStrings - 1);
|
||||||
if (i < numberOfStrings - 1) {
|
if (i < numberOfStrings - 1) {
|
||||||
writeWordSeparator();
|
writeWordSeparator();
|
||||||
|
|||||||
@ -9,6 +9,7 @@ import org.apache.pdfbox.text.TextPosition;
|
|||||||
|
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
|
import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
@ -22,7 +23,7 @@ public class MarkedContentUtils {
|
|||||||
public List<Rectangle2D> getMarkedContentBboxPerLine(List<PDMarkedContent> markedContents, String subtype) {
|
public List<Rectangle2D> getMarkedContentBboxPerLine(List<PDMarkedContent> markedContents, String subtype) {
|
||||||
|
|
||||||
if (markedContents == null) {
|
if (markedContents == null) {
|
||||||
return null;
|
return Collections.emptyList();
|
||||||
}
|
}
|
||||||
|
|
||||||
var markedContentByYPosition = markedContents.stream()
|
var markedContentByYPosition = markedContents.stream()
|
||||||
@ -37,7 +38,7 @@ public class MarkedContentUtils {
|
|||||||
.collect(Collectors.groupingBy(TextPosition::getY));
|
.collect(Collectors.groupingBy(TextPosition::getY));
|
||||||
|
|
||||||
if (markedContentByYPosition.isEmpty()) {
|
if (markedContentByYPosition.isEmpty()) {
|
||||||
return null;
|
return Collections.emptyList();
|
||||||
}
|
}
|
||||||
|
|
||||||
return markedContentByYPosition.values().stream()
|
return markedContentByYPosition.values().stream()
|
||||||
|
|||||||
@ -19,9 +19,10 @@ public final class PositionUtils {
|
|||||||
|
|
||||||
double threshold = textBlock.getMostPopularWordHeight() * 3;
|
double threshold = textBlock.getMostPopularWordHeight() * 3;
|
||||||
|
|
||||||
if (textBlock.getPdfMinX() + threshold > btf.getTopLeft().getX() && textBlock.getPdfMaxX() - threshold < btf.getTopLeft()
|
if (textBlock.getPdfMinX() + threshold > btf.getTopLeft().getX()
|
||||||
.getX() + btf.getWidth() && textBlock.getPdfMinY() + threshold > btf.getTopLeft().getY() && textBlock.getPdfMaxY() - threshold < btf.getTopLeft()
|
&& textBlock.getPdfMaxX() - threshold < btf.getTopLeft().getX() + btf.getWidth()
|
||||||
.getY() + btf.getHeight()) {
|
&& textBlock.getPdfMinY() + threshold > btf.getTopLeft().getY()
|
||||||
|
&& textBlock.getPdfMaxY() - threshold < btf.getTopLeft().getY() + btf.getHeight()) {
|
||||||
return true;
|
return true;
|
||||||
} else {
|
} else {
|
||||||
return false;
|
return false;
|
||||||
|
|||||||
@ -53,7 +53,7 @@ public class ExtractMarkedContentTest {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void testExtractTestWPhromma() throws IOException {
|
public void testExtractTestWPhromma() throws IOException {
|
||||||
System.out.printf("\n\n===\n%s\n===\n", "testWPhromma.pdf");
|
System.out.printf("\n\n===\n%s\n===\n", "testWPhromma.pdf");
|
||||||
PDDocument document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile());
|
try (PDDocument document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile())) {
|
||||||
|
|
||||||
Map<PDPage, Map<Integer, PDMarkedContent>> markedContents = new HashMap<>();
|
Map<PDPage, Map<Integer, PDMarkedContent>> markedContents = new HashMap<>();
|
||||||
|
|
||||||
@ -70,6 +70,8 @@ public class ExtractMarkedContentTest {
|
|||||||
|
|
||||||
PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot();
|
PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot();
|
||||||
showStructure(root, markedContents);
|
showStructure(root, markedContents);
|
||||||
|
document.close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -87,7 +89,8 @@ public class ExtractMarkedContentTest {
|
|||||||
@Test
|
@Test
|
||||||
public void testExtractResMultipage() throws IOException {
|
public void testExtractResMultipage() throws IOException {
|
||||||
System.out.printf("\n\n===\n%s\n===\n", "res_multipage.pdf");
|
System.out.printf("\n\n===\n%s\n===\n", "res_multipage.pdf");
|
||||||
PDDocument document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile());
|
|
||||||
|
try(PDDocument document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile())) {
|
||||||
|
|
||||||
Map<PDPage, Map<Integer, PDMarkedContent>> markedContents = new HashMap<>();
|
Map<PDPage, Map<Integer, PDMarkedContent>> markedContents = new HashMap<>();
|
||||||
|
|
||||||
@ -104,6 +107,8 @@ public class ExtractMarkedContentTest {
|
|||||||
|
|
||||||
PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot();
|
PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot();
|
||||||
showStructure(root, markedContents);
|
showStructure(root, markedContents);
|
||||||
|
document.close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -121,7 +126,7 @@ public class ExtractMarkedContentTest {
|
|||||||
@Test
|
@Test
|
||||||
public void testExtractDailyReport() throws IOException {
|
public void testExtractDailyReport() throws IOException {
|
||||||
System.out.printf("\n\n===\n%s\n===\n", "Daily Report.pdf");
|
System.out.printf("\n\n===\n%s\n===\n", "Daily Report.pdf");
|
||||||
PDDocument document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile());
|
try (PDDocument document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile())) {
|
||||||
|
|
||||||
Map<PDPage, Map<Integer, PDMarkedContent>> markedContents = new HashMap<>();
|
Map<PDPage, Map<Integer, PDMarkedContent>> markedContents = new HashMap<>();
|
||||||
|
|
||||||
@ -138,7 +143,9 @@ public class ExtractMarkedContentTest {
|
|||||||
|
|
||||||
PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot();
|
PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot();
|
||||||
showStructure(root, markedContents);
|
showStructure(root, markedContents);
|
||||||
|
document.close();
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @see #testExtractTestWPhromma()
|
* @see #testExtractTestWPhromma()
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user