From 8a35a1bd901c6dfe7f747d0cffcb8060a2d02e1a Mon Sep 17 00:00:00 2001
From: Thomas Beyer
Date: Mon, 20 Mar 2023 08:21:22 +0100
Subject: [PATCH] RED-4875 - move PdfTextExtraction.java from ocr-service to here

---
 .../commons/PdfTextExtraction.java            | 56 +++++++++++++++++++
 1 file changed, 56 insertions(+)
 create mode 100644 src/main/java/com/iqser/red/pdftronlogic/commons/PdfTextExtraction.java

diff --git a/src/main/java/com/iqser/red/pdftronlogic/commons/PdfTextExtraction.java b/src/main/java/com/iqser/red/pdftronlogic/commons/PdfTextExtraction.java
new file mode 100644
index 0000000..d489edf
--- /dev/null
+++ b/src/main/java/com/iqser/red/pdftronlogic/commons/PdfTextExtraction.java
@@ -0,0 +1,56 @@
+package com.iqser.red.pdftronlogic.commons;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+
+import com.pdftron.common.PDFNetException;
+import com.pdftron.pdf.PDFDoc;
+import com.pdftron.pdf.Page;
+import com.pdftron.pdf.PageIterator;
+import com.pdftron.pdf.TextExtractor;
+
+
+/**
+ * Extracts plain text from PDF documents using the PDFTron SDK.
+ */
+public class PdfTextExtraction {
+
+    /**
+     * Extracts the text of every page of the given PDF and joins the per-page texts with
+     * a single newline ({@code "\n"}), in page-iterator order.
+     *
+     * <p>This method does not itself close {@code fileStream}. The extractor's
+     * {@code destroy()} and the document's {@code close()} are invoked even when extraction fails.
+     *
+     * @param fileStream stream the PDF document is read from
+     * @return the text of all pages separated by newlines; an empty string if there are no pages
+     * @throws IOException declared for I/O failures while reading the document
+     * @throws PDFNetException if a PDFTron call reports an error
+     */
+    public static String extractAllTextFromDocument(InputStream fileStream) throws IOException, PDFNetException {
+
+        PDFDoc pdfDoc = new PDFDoc(fileStream);
+        try {
+            TextExtractor extractor = new TextExtractor();
+            try {
+                List<String> texts = new ArrayList<>();
+                PageIterator iterator = pdfDoc.getPageIterator();
+                while (iterator.hasNext()) {
+                    Page page = iterator.next();
+                    extractor.begin(page);
+                    texts.add(extractor.getAsText());
+                }
+                return String.join("\n", texts);
+            } finally {
+                // Release the extractor even if a page fails to extract.
+                extractor.destroy();
+            }
+        } finally {
+            // Release the document even if extraction throws.
+            pdfDoc.close();
+        }
+    }
+
+}