diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index acb23fc..b14fb8a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -249,9 +249,9 @@ public class LayoutParsingPipeline { private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) { - if (!classificationPage.isLandscape()) { +// if (!classificationPage.isLandscape()) { document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue()); - } +// } document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue()); document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue()); document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue()); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFTextStripper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFTextStripper.java index 2f2d6ea..1ca5b43 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFTextStripper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFTextStripper.java @@ -1711,7 +1711,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { int numberOfStrings = line.size(); for (int i = 0; i < numberOfStrings; i++) { WordWithTextPositions word = line.get(i); - word.getTextPositions().sort(Comparator.comparing(TextPosition::getX)); + word.getTextPositions().sort(Comparator.comparing(TextPosition::getXDirAdj)); writeString(word.getText(), word.getTextPositions(), isParagraphEnd && i == numberOfStrings - 1); if (i < numberOfStrings - 1) { writeWordSeparator(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java index af4676f..799ac99 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java @@ -9,6 +9,7 @@ import org.apache.pdfbox.text.TextPosition; import java.awt.geom.Rectangle2D; import java.util.Collection; +import java.util.Collections; import java.util.List; import java.util.Map; import java.util.stream.Collectors; @@ -22,7 +23,7 @@ public class MarkedContentUtils { public List getMarkedContentBboxPerLine(List markedContents, String subtype) { if (markedContents == null) { - return null; + return Collections.emptyList(); } var markedContentByYPosition = markedContents.stream() @@ -37,7 +38,7 @@ public class MarkedContentUtils { .collect(Collectors.groupingBy(TextPosition::getY)); if (markedContentByYPosition.isEmpty()) { - return null; + return Collections.emptyList(); } return markedContentByYPosition.values().stream() diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PositionUtils.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PositionUtils.java index 3aecb92..48b720d 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PositionUtils.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PositionUtils.java @@ -19,9 +19,10 @@ public final class PositionUtils { double threshold = textBlock.getMostPopularWordHeight() * 3; - if (textBlock.getPdfMinX() + threshold > btf.getTopLeft().getX() && textBlock.getPdfMaxX() - threshold < btf.getTopLeft() - .getX() + btf.getWidth() && textBlock.getPdfMinY() + threshold > btf.getTopLeft().getY() && textBlock.getPdfMaxY() - threshold < btf.getTopLeft() - .getY() + btf.getHeight()) { + if (textBlock.getPdfMinX() + threshold > btf.getTopLeft().getX() + && textBlock.getPdfMaxX() - threshold < btf.getTopLeft().getX() + btf.getWidth() + && textBlock.getPdfMinY() + threshold > btf.getTopLeft().getY() + && textBlock.getPdfMaxY() - threshold < btf.getTopLeft().getY() + btf.getHeight()) { return true; } else { return false; diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ExtractMarkedContentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ExtractMarkedContentTest.java index a811fe7..e8ba602 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ExtractMarkedContentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ExtractMarkedContentTest.java @@ -53,7 +53,7 @@ public class ExtractMarkedContentTest { @SneakyThrows public void testExtractTestWPhromma() throws IOException { System.out.printf("\n\n===\n%s\n===\n", "testWPhromma.pdf"); - PDDocument document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile()); + try (PDDocument document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile())) { Map> markedContents = new HashMap<>(); @@ -70,6 +70,8 @@ public class ExtractMarkedContentTest { PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot(); showStructure(root, markedContents); + document.close(); + } } /** @@ -87,7 +89,8 @@ public class ExtractMarkedContentTest { @Test public void testExtractResMultipage() throws IOException { System.out.printf("\n\n===\n%s\n===\n", "res_multipage.pdf"); - PDDocument document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile()); + + try(PDDocument document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile())) { Map> markedContents = new HashMap<>(); @@ -104,6 +107,8 @@ public class ExtractMarkedContentTest { PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot(); showStructure(root, markedContents); + document.close(); + } } /** @@ -121,7 +126,7 @@ public class ExtractMarkedContentTest { @Test public void testExtractDailyReport() throws IOException { System.out.printf("\n\n===\n%s\n===\n", "Daily Report.pdf"); - PDDocument document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile()); + try (PDDocument document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile())) { Map> markedContents = new HashMap<>(); @@ -138,7 +143,9 @@ public class ExtractMarkedContentTest { PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot(); showStructure(root, markedContents); + document.close(); } + } /** * @see #testExtractTestWPhromma()