RED-4875 - move PdfTextExtraction.java from ocr-service to here
This commit is contained in:
parent
a3a2a9ac03
commit
8a35a1bd90
@ -0,0 +1,35 @@
|
|||||||
|
package com.iqser.red.pdftronlogic.commons;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import com.pdftron.common.PDFNetException;
|
||||||
|
import com.pdftron.pdf.PDFDoc;
|
||||||
|
import com.pdftron.pdf.Page;
|
||||||
|
import com.pdftron.pdf.PageIterator;
|
||||||
|
import com.pdftron.pdf.TextExtractor;
|
||||||
|
|
||||||
|
|
||||||
|
public class PdfTextExtraction {
|
||||||
|
|
||||||
|
public static String extractAllTextFromDocument(InputStream fileStream) throws IOException, PDFNetException {
|
||||||
|
|
||||||
|
PDFDoc pdfDoc = new PDFDoc(fileStream);
|
||||||
|
TextExtractor extractor = new TextExtractor();
|
||||||
|
List<String> texts = new ArrayList<>();
|
||||||
|
|
||||||
|
PageIterator iterator = pdfDoc.getPageIterator();
|
||||||
|
while (iterator.hasNext()) {
|
||||||
|
Page page = iterator.next();
|
||||||
|
extractor.begin(page);
|
||||||
|
texts.add(extractor.getAsText());
|
||||||
|
}
|
||||||
|
|
||||||
|
extractor.destroy();
|
||||||
|
pdfDoc.close();
|
||||||
|
return String.join("\n", texts);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
Loading…
x
Reference in New Issue
Block a user