RED-4875 - Update pdftron-logic-commons to the newest version and move PdfTextExtraction into that shared library

This commit is contained in:
Thomas Beyer 2023-03-20 10:01:33 +01:00
parent 142e8cf957
commit fd92419895
4 changed files with 4 additions and 38 deletions

View File

@ -26,7 +26,7 @@
<dependency>
<groupId>com.iqser.red.commons</groupId>
<artifactId>pdftron-logic-commons</artifactId>
<version>dev_red4875_a3a2a</version>
<version>dev_red4875_e8b89</version>
</dependency>
<dependency>

View File

@ -1,7 +1,7 @@
package com.iqser.red.service.ocr.v1.server;
import static com.iqser.red.pdftronlogic.commons.PdfTextExtraction.extractAllTextFromDocument;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import static com.iqser.red.service.ocr.v1.server.utils.PdfTextExtraction.extractAllTextFromDocument;
import static org.assertj.core.api.Assertions.assertThat;
import java.io.FileInputStream;

View File

@ -1,7 +1,7 @@
package com.iqser.red.service.ocr.v1.server.service;
import static com.iqser.red.pdftronlogic.commons.PdfTextExtraction.extractAllTextFromDocument;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import static com.iqser.red.service.ocr.v1.server.utils.PdfTextExtraction.extractAllTextFromDocument;
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
import java.io.FileInputStream;
@ -46,5 +46,6 @@ public class InvisibleElementRemovalServiceTest extends AbstractTest {
String[] text = extractAllTextFromDocument(fileStream).split("\n");
assertThat(text).contains("APPENDIX 16 Pathology Report", "Amendment 1", "Page 255 of 260");
}
}
}

View File

@ -1,35 +0,0 @@
package com.iqser.red.service.ocr.v1.server.utils;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
import com.pdftron.pdf.TextExtractor;
/**
 * Helper for pulling the plain text out of a PDF using the PDFTron SDK.
 */
public class PdfTextExtraction {

    /**
     * Extracts the text of every page of the given PDF and concatenates the
     * per-page texts, separated by a single {@code "\n"}.
     *
     * <p>The native PDFTron resources created here (the document and the text
     * extractor) are released even when extraction fails part-way through.
     * This method does not itself call {@code close()} on {@code fileStream}.
     *
     * @param fileStream stream containing the PDF document to read
     * @return the text of all pages joined with {@code "\n"}; an empty string
     *         if the document has no pages
     * @throws IOException     declared for callers; propagated from reading the stream
     * @throws PDFNetException if PDFTron fails to open or process the document
     */
    public static String extractAllTextFromDocument(InputStream fileStream) throws IOException, PDFNetException {
        PDFDoc pdfDoc = new PDFDoc(fileStream);
        try {
            TextExtractor extractor = new TextExtractor();
            try {
                List<String> texts = new ArrayList<>();
                PageIterator iterator = pdfDoc.getPageIterator();
                while (iterator.hasNext()) {
                    Page page = iterator.next();
                    // The same extractor is re-bound to each page in turn.
                    extractor.begin(page);
                    texts.add(extractor.getAsText());
                }
                return String.join("\n", texts);
            } finally {
                // Release the extractor before the document, matching the original order.
                extractor.destroy();
            }
        } finally {
            pdfDoc.close();
        }
    }
}