From 2cadfdb8bf7a89586bc13df57a77bb3043b0f14d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kilian=20Sch=C3=BCttler?= Date: Mon, 19 Aug 2024 14:47:48 +0200 Subject: [PATCH] RED-9746: improve invisible element removal --- README.md | 2 +- .../azure-ocr-service-processor/build.gradle.kts | 2 +- .../service/ocr/processor/OcrServiceSettings.java | 2 +- .../service/ocr/processor/model/PageInformation.java | 10 ++++++---- .../imageprocessing/ImageProcessingSupervisor.java | 4 ++-- .../visualizations/WritableOcrResultFactory.java | 2 +- 6 files changed, 12 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 423ebd6..c310b2c 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,7 @@ int concurrency = 8; int batchSize = 128; boolean debug; // writes the ocr layer visibly to the viewer doc pdf boolean idpEnabled; // Enables table detection, paragraph classification, section detection, key-value detection. -boolean tableDetection; // writes the tables to the PDF as invisible lines. +boolean drawTablesAsLines; // writes the tables to the PDF as invisible lines. boolean processAllPages; // if this parameter is set, ocr will be performed on any page, regardless if it has images or not boolean fontStyleDetection; // Enables bold detection using ghostscript and leptonica String contentFormat; // Either markdown or text. But, for whatever reason, with markdown enabled, key-values are not written by azure.... 
diff --git a/azure-ocr-service/azure-ocr-service-processor/build.gradle.kts b/azure-ocr-service/azure-ocr-service-processor/build.gradle.kts index 45ca213..4e52d83 100644 --- a/azure-ocr-service/azure-ocr-service-processor/build.gradle.kts +++ b/azure-ocr-service/azure-ocr-service-processor/build.gradle.kts @@ -21,7 +21,7 @@ dependencies { api("org.apache.commons:commons-math3:3.6.1") api("com.amazonaws:aws-java-sdk-kms:1.12.440") api("com.google.guava:guava:31.1-jre") - api("com.iqser.red.commons:pdftron-logic-commons:2.27.0") + api("com.iqser.red.commons:pdftron-logic-commons:2.30.0") api("com.knecon.fforesight:viewer-doc-processor:0.148.0") api("com.azure:azure-ai-documentintelligence:1.0.0-beta.3") testImplementation("org.junit.jupiter:junit-jupiter:5.8.1") diff --git a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/OcrServiceSettings.java b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/OcrServiceSettings.java index d1a6afd..af11d10 100644 --- a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/OcrServiceSettings.java +++ b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/OcrServiceSettings.java @@ -18,7 +18,7 @@ public class OcrServiceSettings { boolean debug; // writes the ocr layer visibly to the viewer doc pdf boolean idpEnabled; // Enables table detection, paragraph classification, section detection, key-value detection. - boolean tableDetection; // writes the tables to the PDF as invisible lines. + boolean drawTablesAsLines; // writes the tables to the PDF as invisible lines. boolean processAllPages; // if this parameter is set, ocr will be performed on any page, regardless if it has images or not boolean fontStyleDetection; // Enables bold detection using ghostscript and leptonica String contentFormat; // Either markdown or text. 
But, for whatever reason, with markdown enabled, key-values are not written by azure.... diff --git a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/PageInformation.java b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/PageInformation.java index e4532eb..5f15461 100644 --- a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/PageInformation.java +++ b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/PageInformation.java @@ -20,10 +20,12 @@ public record PageInformation(Rectangle2D mediabox, int number, int rotationDegr ConcurrentHashMap pageInformationMap = new ConcurrentHashMap<>(); int pageNumber = 1; - for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); pageNumber++) { - - Page page = iterator.next(); - pageInformationMap.put(pageNumber, PageInformation.fromPage(pageNumber, page)); + try (PageIterator iterator = pdfDoc.getPageIterator()) { + while (iterator.hasNext()) { + Page page = iterator.next(); + pageInformationMap.put(pageNumber, PageInformation.fromPage(pageNumber, page)); + pageNumber++; + } } return pageInformationMap; } diff --git a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/imageprocessing/ImageProcessingSupervisor.java b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/imageprocessing/ImageProcessingSupervisor.java index 1e8f4cc..7d6487f 100644 --- a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/imageprocessing/ImageProcessingSupervisor.java +++ 
b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/imageprocessing/ImageProcessingSupervisor.java @@ -53,7 +53,7 @@ public class ImageProcessingSupervisor { public ImageFile awaitProcessedPage(Integer pageNumber) throws InterruptedException { - if (hasErros()) { + if (hasErrors()) { return null; } getPageLatch(pageNumber).await(); @@ -61,7 +61,8 @@ public class ImageProcessingSupervisor { } - private boolean hasErros() { - return errors.isEmpty(); + /** Returns true when {@code errors} is non-empty. */ + private boolean hasErrors() { + return !errors.isEmpty(); } diff --git a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/visualizations/WritableOcrResultFactory.java b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/visualizations/WritableOcrResultFactory.java index 6666caa..aa30f24 100644 --- a/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/visualizations/WritableOcrResultFactory.java +++ b/azure-ocr-service/azure-ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/visualizations/WritableOcrResultFactory.java @@ -88,7 +88,7 @@ public class WritableOcrResultFactory { var builder = WritableOcrResult.builder().pageNumber(pageInformation.number()).textPositionInImage(words); - if (settings.isTableDetection()) { + if (settings.isDrawTablesAsLines()) { builder.tableLines(getTableLines(analyzeResult, pageInformation, pageCtm)); }