From fd9241989540e9056d4882376e1b583f5649cd23 Mon Sep 17 00:00:00 2001 From: Thomas Beyer Date: Mon, 20 Mar 2023 10:01:33 +0100 Subject: [PATCH] RED-4875 - set version of common pdftron logics to newest and move PdfTextExtraction to this new repo --- ocr-service-v1/ocr-service-server-v1/pom.xml | 2 +- .../v1/server/OcrServiceIntegrationTest.java | 2 +- .../InvisibleElementRemovalServiceTest.java | 3 +- .../v1/server/utils/PdfTextExtraction.java | 35 ------------------- 4 files changed, 4 insertions(+), 38 deletions(-) delete mode 100644 ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/utils/PdfTextExtraction.java diff --git a/ocr-service-v1/ocr-service-server-v1/pom.xml b/ocr-service-v1/ocr-service-server-v1/pom.xml index 1854007..44641c5 100644 --- a/ocr-service-v1/ocr-service-server-v1/pom.xml +++ b/ocr-service-v1/ocr-service-server-v1/pom.xml @@ -26,7 +26,7 @@ com.iqser.red.commons pdftron-logic-commons - dev_red4875_a3a2a + dev_red4875_e8b89 diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java index e719877..6d0f5f8 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java +++ b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java @@ -1,7 +1,7 @@ package com.iqser.red.service.ocr.v1.server; +import static com.iqser.red.pdftronlogic.commons.PdfTextExtraction.extractAllTextFromDocument; import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory; -import static com.iqser.red.service.ocr.v1.server.utils.PdfTextExtraction.extractAllTextFromDocument; import static org.assertj.core.api.Assertions.assertThat; import java.io.FileInputStream; diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalServiceTest.java b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalServiceTest.java index 39d979e..32d8875 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalServiceTest.java +++ b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalServiceTest.java @@ -1,7 +1,7 @@ package com.iqser.red.service.ocr.v1.server.service; +import static com.iqser.red.pdftronlogic.commons.PdfTextExtraction.extractAllTextFromDocument; import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory; -import static com.iqser.red.service.ocr.v1.server.utils.PdfTextExtraction.extractAllTextFromDocument; import static org.assertj.core.api.AssertionsForClassTypes.assertThat; import java.io.FileInputStream; @@ -46,5 +46,6 @@ public class InvisibleElementRemovalServiceTest extends AbstractTest { String[] text = extractAllTextFromDocument(fileStream).split("\n"); assertThat(text).contains("APPENDIX 16 Pathology Report", "Amendment 1", "Page 255 of 260"); } + } } \ No newline at end of file diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/utils/PdfTextExtraction.java b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/utils/PdfTextExtraction.java deleted file mode 100644 index c3f195d..0000000 --- a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/utils/PdfTextExtraction.java +++ /dev/null @@ -1,35 +0,0 @@ -package com.iqser.red.service.ocr.v1.server.utils; - -import java.io.IOException; -import java.io.InputStream; -import java.util.ArrayList; -import java.util.List; - -import com.pdftron.common.PDFNetException; -import com.pdftron.pdf.PDFDoc; -import com.pdftron.pdf.Page; -import com.pdftron.pdf.PageIterator; -import com.pdftron.pdf.TextExtractor; - - -public class PdfTextExtraction { - - public static String extractAllTextFromDocument(InputStream fileStream) throws IOException, PDFNetException { - - PDFDoc pdfDoc = new PDFDoc(fileStream); - TextExtractor extractor = new TextExtractor(); - List texts = new ArrayList<>(); - - PageIterator iterator = pdfDoc.getPageIterator(); - while (iterator.hasNext()) { - Page page = iterator.next(); - extractor.begin(page); - texts.add(extractor.getAsText()); - } - - extractor.destroy(); - pdfDoc.close(); - return String.join("\n", texts); - } - -}