RED-9746: improve invisible element removal

2024-08-19 14:47:48 +02:00 · 2024-08-19 14:47:48 +02:00 · 2cadfdb8bf
commit 2cadfdb8bf
parent b019ca8e2d
6 changed files with 12 additions and 10 deletions
--- a/README.md
+++ b/README.md
@@ -70,7 +70,7 @@ int concurrency = 8;
 int batchSize = 128;
 boolean debug; // writes the ocr layer visibly to the viewer doc pdf
 boolean idpEnabled; // Enables table detection, paragraph classification, section detection, key-value detection.
-boolean tableDetection; // writes the tables to the PDF as invisible lines.
+boolean drawTablesAsLines; // writes the tables to the PDF as invisible lines.
 boolean processAllPages; // if this parameter is set, ocr will be performed on any page, regardless if it has images or not
 boolean fontStyleDetection; // Enables bold detection using ghostscript and leptonica
 String contentFormat; // Either markdown or text. But, for whatever reason, with markdown enabled, key-values are not written by azure....
--- a/azure-ocr-service/azure-ocr-service-processor/build.gradle.kts
+++ b/azure-ocr-service/azure-ocr-service-processor/build.gradle.kts
@@ -21,7 +21,7 @@ dependencies {
    api("org.apache.commons:commons-math3:3.6.1")
    api("com.amazonaws:aws-java-sdk-kms:1.12.440")
    api("com.google.guava:guava:31.1-jre")
-    api("com.iqser.red.commons:pdftron-logic-commons:2.27.0")
+    api("com.iqser.red.commons:pdftron-logic-commons:2.30.0")
    api("com.knecon.fforesight:viewer-doc-processor:0.148.0")
    api("com.azure:azure-ai-documentintelligence:1.0.0-beta.3")
    testImplementation("org.junit.jupiter:junit-jupiter:5.8.1")
--- a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/OcrServiceSettings.java
+++ b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/OcrServiceSettings.java
@@ -18,7 +18,7 @@ public class OcrServiceSettings {

    boolean debug; // writes the ocr layer visibly to the viewer doc pdf
    boolean idpEnabled; // Enables table detection, paragraph classification, section detection, key-value detection.
-    boolean tableDetection; // writes the tables to the PDF as invisible lines.
+    boolean drawTablesAsLines; // writes the tables to the PDF as invisible lines.
    boolean processAllPages; // if this parameter is set, ocr will be performed on any page, regardless if it has images or not
    boolean fontStyleDetection; // Enables bold detection using ghostscript and leptonica
    String contentFormat; // Either markdown or text. But, for whatever reason, with markdown enabled, key-values are not written by azure....
--- a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/PageInformation.java
+++ b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/PageInformation.java
@@ -20,10 +20,12 @@ public record PageInformation(Rectangle2D mediabox, int number, int rotationDegr

        ConcurrentHashMap<Integer, PageInformation> pageInformationMap = new ConcurrentHashMap<>();
        int pageNumber = 1;
-        for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); pageNumber++) {
-
-            Page page = iterator.next();
-            pageInformationMap.put(pageNumber, PageInformation.fromPage(pageNumber, page));
+        try (PageIterator iterator = pdfDoc.getPageIterator()) {
+            while (iterator.hasNext()) {
+                Page page = iterator.next();
+                pageInformationMap.put(pageNumber, PageInformation.fromPage(pageNumber, page));
+                pageNumber++;
+            }
        }
        return pageInformationMap;
    }
--- a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/imageprocessing/ImageProcessingSupervisor.java
+++ b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/imageprocessing/ImageProcessingSupervisor.java
@@ -53,7 +53,7 @@ public class ImageProcessingSupervisor {

    public ImageFile awaitProcessedPage(Integer pageNumber) throws InterruptedException {

-        if (hasErros()) {
+        if (hasErrors()) {
            return null;
        }
        getPageLatch(pageNumber).await();
@@ -61,7 +61,7 @@ public class ImageProcessingSupervisor {
    }


-    private boolean hasErros() {
+    private boolean hasErrors() {

        return errors.isEmpty();
    }
--- a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/visualizations/WritableOcrResultFactory.java
+++ b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/visualizations/WritableOcrResultFactory.java
@@ -88,7 +88,7 @@ public class WritableOcrResultFactory {

            var builder = WritableOcrResult.builder().pageNumber(pageInformation.number()).textPositionInImage(words);

-            if (settings.isTableDetection()) {
+            if (settings.isDrawTablesAsLines()) {
                builder.tableLines(getTableLines(analyzeResult, pageInformation, pageCtm));
            }