RED-4875 - move PdfTextExtraction.java from ocr-service to here

This commit is contained in:
Thomas Beyer 2023-03-20 08:21:22 +01:00
parent a3a2a9ac03
commit 8a35a1bd90

View File

@@ -0,0 +1,35 @@
package com.iqser.red.pdftronlogic.commons;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
import com.pdftron.pdf.TextExtractor;
/**
 * Helper for pulling the plain text out of a PDF document via the PDFTron SDK.
 */
public class PdfTextExtraction {

    /**
     * Extracts the text of every page of a PDF and returns it as a single string.
     *
     * <p>Pages are visited in the order produced by the document's page iterator; the text of
     * each page becomes one entry and the entries are joined with {@code "\n"}. A document
     * without pages therefore yields an empty string.
     *
     * <p>The PDFTron document and text extractor created here are always released before this
     * method returns, including when extraction fails part-way through. This method does not
     * itself close {@code fileStream}.
     * NOTE(review): whether {@code PDFDoc} consumes or closes the stream is not visible here —
     * callers should presumably keep managing the stream's lifecycle themselves; confirm.
     *
     * @param fileStream stream providing the PDF content
     * @return the text of all pages, separated by newline characters
     * @throws IOException declared for callers; propagated from the underlying calls if raised
     * @throws PDFNetException if a PDFTron call (open, iterate, extract, release) fails
     */
    public static String extractAllTextFromDocument(InputStream fileStream) throws IOException, PDFNetException {
        PDFDoc pdfDoc = new PDFDoc(fileStream);
        try {
            TextExtractor extractor = new TextExtractor();
            try {
                List<String> texts = new ArrayList<>();
                PageIterator iterator = pdfDoc.getPageIterator();
                while (iterator.hasNext()) {
                    Page page = iterator.next();
                    // begin() points the extractor at this page; getAsText() then returns its text.
                    extractor.begin(page);
                    texts.add(extractor.getAsText());
                }
                return String.join("\n", texts);
            } finally {
                // Release the extractor even when a page fails to process.
                extractor.destroy();
            }
        } finally {
            // Nested finally: the document is closed even if destroying the extractor throws.
            pdfDoc.close();
        }
    }
}