From f2c0991987e48bb773594a894d5bbf755e78a839 Mon Sep 17 00:00:00 2001 From: Corina Olariu Date: Wed, 4 Oct 2023 14:09:46 +0300 Subject: [PATCH 1/5] RED-7607 - Rotating pages leads to lost annotations (RM & DM) - fix PMD findings --- .../processor/utils/MarkedContentUtils.java | 5 ++-- .../graph/ExtractMarkedContentTest.java | 25 ++++++++++++++++--- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java index af4676f..799ac99 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/MarkedContentUtils.java @@ -9,6 +9,7 @@ import org.apache.pdfbox.text.TextPosition; import java.awt.geom.Rectangle2D; import java.util.Collection; +import java.util.Collections; import java.util.List; import java.util.Map; import java.util.stream.Collectors; @@ -22,7 +23,7 @@ public class MarkedContentUtils { public List getMarkedContentBboxPerLine(List markedContents, String subtype) { if (markedContents == null) { - return null; + return Collections.emptyList(); } var markedContentByYPosition = markedContents.stream() @@ -37,7 +38,7 @@ public class MarkedContentUtils { .collect(Collectors.groupingBy(TextPosition::getY)); if (markedContentByYPosition.isEmpty()) { - return null; + return Collections.emptyList(); } return markedContentByYPosition.values().stream() diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ExtractMarkedContentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ExtractMarkedContentTest.java index a811fe7..176ed49 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ExtractMarkedContentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ExtractMarkedContentTest.java @@ -53,7 +53,9 @@ public class ExtractMarkedContentTest { @SneakyThrows public void testExtractTestWPhromma() throws IOException { System.out.printf("\n\n===\n%s\n===\n", "testWPhromma.pdf"); - PDDocument document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile()); + PDDocument document = null; + try { + document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile()); Map> markedContents = new HashMap<>(); @@ -70,6 +72,10 @@ public class ExtractMarkedContentTest { PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot(); showStructure(root, markedContents); + } finally { + if (null != document ) + document.close(); + } } /** @@ -87,7 +93,10 @@ public class ExtractMarkedContentTest { @Test public void testExtractResMultipage() throws IOException { System.out.printf("\n\n===\n%s\n===\n", "res_multipage.pdf"); - PDDocument document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile()); + + PDDocument document = null; + try { + document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile()); Map> markedContents = new HashMap<>(); @@ -104,6 +113,10 @@ public class ExtractMarkedContentTest { PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot(); showStructure(root, markedContents); + } finally { + if (document != null) + document.close(); + } } /** @@ -121,7 +134,9 @@ public class ExtractMarkedContentTest { @Test public void testExtractDailyReport() throws IOException { System.out.printf("\n\n===\n%s\n===\n", "Daily Report.pdf"); - PDDocument document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile()); + PDDocument document = null; + try { + document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile()); Map> markedContents = new HashMap<>(); @@ -138,7 +153,11 @@ public class ExtractMarkedContentTest { PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot(); showStructure(root, markedContents); + } finally { + if (null != document) + document.close(); } + } /** * @see #testExtractTestWPhromma() From 99ed331a1e1ef220d6eefafe8f375140eff08544 Mon Sep 17 00:00:00 2001 From: Corina Olariu Date: Wed, 4 Oct 2023 14:13:38 +0300 Subject: [PATCH 2/5] RED-7607 - Rotating pages leads to lost annotations (RM & DM) - use getXDirAdj instead of getX - add fontSizeCounter for landscape pages also --- .../layoutparser/processor/LayoutParsingPipeline.java | 4 ++-- .../processor/services/parsing/PDFTextStripper.java | 2 +- .../layoutparser/processor/utils/PositionUtils.java | 7 ++++--- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index acb23fc..b14fb8a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -249,9 +249,9 @@ public class LayoutParsingPipeline { private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) { - if (!classificationPage.isLandscape()) { +// if (!classificationPage.isLandscape()) { document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue()); - } +// } document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue()); document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue()); document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue()); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFTextStripper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFTextStripper.java index 2f2d6ea..1ca5b43 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFTextStripper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFTextStripper.java @@ -1711,7 +1711,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine { int numberOfStrings = line.size(); for (int i = 0; i < numberOfStrings; i++) { WordWithTextPositions word = line.get(i); - word.getTextPositions().sort(Comparator.comparing(TextPosition::getX)); + word.getTextPositions().sort(Comparator.comparing(TextPosition::getXDirAdj)); writeString(word.getText(), word.getTextPositions(), isParagraphEnd && i == numberOfStrings - 1); if (i < numberOfStrings - 1) { writeWordSeparator(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PositionUtils.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PositionUtils.java index 3aecb92..48b720d 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PositionUtils.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PositionUtils.java @@ -19,9 +19,10 @@ public final class PositionUtils { double threshold = textBlock.getMostPopularWordHeight() * 3; - if (textBlock.getPdfMinX() + threshold > btf.getTopLeft().getX() && textBlock.getPdfMaxX() - threshold < btf.getTopLeft() - .getX() + btf.getWidth() && textBlock.getPdfMinY() + threshold > btf.getTopLeft().getY() && textBlock.getPdfMaxY() - threshold < btf.getTopLeft() - .getY() + btf.getHeight()) { + if (textBlock.getPdfMinX() + threshold > btf.getTopLeft().getX() + && textBlock.getPdfMaxX() - threshold < btf.getTopLeft().getX() + btf.getWidth() + && textBlock.getPdfMinY() + threshold > btf.getTopLeft().getY() + && textBlock.getPdfMaxY() - threshold < btf.getTopLeft().getY() + btf.getHeight()) { return true; } else { return false; From b4d68594f1ff57c2d047e57dac28a7c452dc07b8 Mon Sep 17 00:00:00 2001 From: Corina Olariu Date: Wed, 4 Oct 2023 14:22:15 +0300 Subject: [PATCH 3/5] RED-7607 - Rotating pages leads to lost annotations (RM & DM) - use rotation instead of getDir().getDegrees() --- .../processor/model/text/TextPageBlock.java | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java index 0442af6..6f0ccb6 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java @@ -147,12 +147,12 @@ public class TextPageBlock extends AbstractPageBlock { @JsonIgnore public float getPdfMinX() { - if (getDir().getDegrees() == 90) { + if (rotation == 90) { return minY; - } else if (getDir().getDegrees() == 180) { + } else if (rotation == 180) { return getPageWidth() - maxX; - } else if (getDir().getDegrees() == 270) { + } else if (rotation == 270) { return getPageWidth() - maxY; } else { @@ -174,11 +174,11 @@ public class TextPageBlock extends AbstractPageBlock { @JsonIgnore public float getPdfMaxX() { - if (getDir().getDegrees() == 90) { + if (rotation == 90) { return maxY; - } else if (getDir().getDegrees() == 180) { + } else if (rotation == 180) { return getPageWidth() - minX; - } else if (getDir().getDegrees() == 270) { + } else if (rotation == 270) { return getPageWidth() - minY; } else { @@ -200,12 +200,12 @@ public class TextPageBlock extends AbstractPageBlock { @JsonIgnore public float getPdfMinY() { - if (getDir().getDegrees() == 90) { + if (rotation == 90) { return minX; - } else if (getDir().getDegrees() == 180) { + } else if (rotation == 180) { return maxY; - } else if (getDir().getDegrees() == 270) { + } else if (rotation == 270) { return getPageHeight() - maxX; } else { @@ -227,12 +227,12 @@ public class TextPageBlock extends AbstractPageBlock { @JsonIgnore public float getPdfMaxY() { - if (getDir().getDegrees() == 90) { + if (rotation == 90) { return maxX; - } else if (getDir().getDegrees() == 180) { + } else if (rotation == 180) { return minY; - } else if (getDir().getDegrees() == 270) { + } else if (rotation == 270) { return getPageHeight() - minX; } else { return getPageHeight() - minY; From 3839de215c9a0c75cf56eb93b34be580a1557787 Mon Sep 17 00:00:00 2001 From: Corina Olariu Date: Wed, 4 Oct 2023 15:27:13 +0300 Subject: [PATCH 4/5] RED-7607 - Rotating pages leads to lost annotations (RM & DM) - rollback to getDir().getDegrees() --- .../processor/model/text/TextPageBlock.java | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java index 6f0ccb6..0442af6 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java @@ -147,12 +147,12 @@ public class TextPageBlock extends AbstractPageBlock { @JsonIgnore public float getPdfMinX() { - if (rotation == 90) { + if (getDir().getDegrees() == 90) { return minY; - } else if (rotation == 180) { + } else if (getDir().getDegrees() == 180) { return getPageWidth() - maxX; - } else if (rotation == 270) { + } else if (getDir().getDegrees() == 270) { return getPageWidth() - maxY; } else { @@ -174,11 +174,11 @@ public class TextPageBlock extends AbstractPageBlock { @JsonIgnore public float getPdfMaxX() { - if (rotation == 90) { + if (getDir().getDegrees() == 90) { return maxY; - } else if (rotation == 180) { + } else if (getDir().getDegrees() == 180) { return getPageWidth() - minX; - } else if (rotation == 270) { + } else if (getDir().getDegrees() == 270) { return getPageWidth() - minY; } else { @@ -200,12 +200,12 @@ public class TextPageBlock extends AbstractPageBlock { @JsonIgnore public float getPdfMinY() { - if (rotation == 90) { + if (getDir().getDegrees() == 90) { return minX; - } else if (rotation == 180) { + } else if (getDir().getDegrees() == 180) { return maxY; - } else if (rotation == 270) { + } else if (getDir().getDegrees() == 270) { return getPageHeight() - maxX; } else { @@ -227,12 +227,12 @@ public class TextPageBlock extends AbstractPageBlock { @JsonIgnore public float getPdfMaxY() { - if (rotation == 90) { + if (getDir().getDegrees() == 90) { return maxX; - } else if (rotation == 180) { + } else if (getDir().getDegrees() == 180) { return minY; - } else if (rotation == 270) { + } else if (getDir().getDegrees() == 270) { return getPageHeight() - minX; } else { return getPageHeight() - minY; From daba0bf8a6ba1caf4f641de7b3cb0fda7ebaf857 Mon Sep 17 00:00:00 2001 From: Corina Olariu Date: Wed, 4 Oct 2023 17:46:46 +0300 Subject: [PATCH 5/5] RED-7607 - Rotating pages leads to lost annotations (RM & DM) - remove finally clause --- .../graph/ExtractMarkedContentTest.java | 24 +++++-------------- 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ExtractMarkedContentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ExtractMarkedContentTest.java index 176ed49..e8ba602 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ExtractMarkedContentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ExtractMarkedContentTest.java @@ -53,9 +53,7 @@ public class ExtractMarkedContentTest { @SneakyThrows public void testExtractTestWPhromma() throws IOException { System.out.printf("\n\n===\n%s\n===\n", "testWPhromma.pdf"); - PDDocument document = null; - try { - document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile()); + try (PDDocument document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile())) { Map> markedContents = new HashMap<>(); @@ -72,9 +70,7 @@ public class ExtractMarkedContentTest { PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot(); showStructure(root, markedContents); - } finally { - if (null != document ) - document.close(); + document.close(); } } @@ -94,9 +90,7 @@ public class ExtractMarkedContentTest { public void testExtractResMultipage() throws IOException { System.out.printf("\n\n===\n%s\n===\n", "res_multipage.pdf"); - PDDocument document = null; - try { - document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile()); + try(PDDocument document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile())) { Map> markedContents = new HashMap<>(); @@ -113,9 +107,7 @@ public class ExtractMarkedContentTest { PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot(); showStructure(root, markedContents); - } finally { - if (document != null) - document.close(); + document.close(); } } @@ -134,9 +126,7 @@ public class ExtractMarkedContentTest { @Test public void testExtractDailyReport() throws IOException { System.out.printf("\n\n===\n%s\n===\n", "Daily Report.pdf"); - PDDocument document = null; - try { - document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile()); + try (PDDocument document = Loader.loadPDF(new ClassPathResource("files/bdr/Drucksache_19_9865.pdf").getFile())) { Map> markedContents = new HashMap<>(); @@ -153,9 +143,7 @@ public class ExtractMarkedContentTest { PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot(); showStructure(root, markedContents); - } finally { - if (null != document) - document.close(); + document.close(); } }