Compare commits

...

1 Commits

Author SHA1 Message Date
Kilian Schuettler
90b4869761 RED-6126: In the OCRService, OCR Text is not applied to Document
*removed init/terminate calls again
*added destroy()/close() at every opportunity
2023-02-24 13:07:42 +01:00
12 changed files with 254 additions and 213 deletions

View File

@ -6,6 +6,8 @@ import lombok.SneakyThrows;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import javax.annotation.PostConstruct;
@Component
@RequiredArgsConstructor
public class PDFNetInitializer {
@ -18,14 +20,12 @@ public class PDFNetInitializer {
// Initializes PDFNet with the configured license, sets its temp path to /tmp/pdftron and
// registers the OCR module directory as a resource search path. Runs as a @PostConstruct hook.
@SneakyThrows
@PostConstruct
// Do not change back to application runner, if it is application runner it takes messages from the queue before PDFNet is initialized, that leads to UnsatisfiedLinkError.
// NOTE(review): PDFNet.initialize(pdftronLicense) appears both before and after the path setup
// below. This looks like a moved line shown in both positions - confirm which single call is intended.
public void init() {
PDFNet.initialize(pdftronLicense);
PDFNet.setTempPath("/tmp/pdftron");
PDFNet.addResourceSearchPath(ocrModulePath);
PDFNet.initialize(pdftronLicense);
}
}

View File

@ -0,0 +1,2 @@
// DataExtractionService currently declares no fields or methods.
// NOTE(review): looks like a placeholder class - confirm it is meant to be committed empty.
package com.iqser.red.service.ocr.v1.server.service;public class DataExtractionService {
}

View File

@ -47,16 +47,17 @@ public class ImagePositionRetrievalService {
ElementReader reader = new ElementReader();
for (int pageId = 1; pageId <= pdfDoc.getPageCount(); ++pageId) {
RectCollection imagePositions = new RectCollection();
reader.begin(pdfDoc.getPage(pageId));
reader.begin(pdfDoc.getPage(pageId));
findImagePositionsOnPage(reader, imagePositions, pdfDoc.getPage(pageId), mirrorY);
imagePositions = mergeOverlappingRects(imagePositions);
reader.end();
if (imagePositions.getNumRects() > 0) {
pageIdToImagePositions.put(pageId, imagePositions);
}
}
reader.destroy();
return pageIdToImagePositions;
}

View File

@ -87,13 +87,27 @@ public class InvisibleElementRemovalService {
.visitedXObjIds(visitedXObjIds)
.build();
removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context);
context.visitedXObjIds().clear();
removeOverlappedElements(page, writer, context);
reader.end();
writer.end();
}
pdfDoc.save(out, SDFDoc.SaveMode.REMOVE_UNUSED, null);
try {
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
} catch (Exception e) {
log.error("File could not be saved after invisible element removal");
throw new RuntimeException(e);
}
writer.destroy();
reader.destroy();
pdfDoc.close();
}
@ -216,7 +230,7 @@ public class InvisibleElementRemovalService {
formWriter.setDefaultGState(context.reader());
processElements(formWriter, context);
formWriter.end();
formWriter.destroy();
context.reader().end();
}
}
@ -352,7 +366,7 @@ public class InvisibleElementRemovalService {
formWriter.setDefaultGState(context.reader());
processOverlappedElements(formWriter, context);
formWriter.end();
formWriter.destroy();
context.reader().end();
}
}
@ -376,7 +390,8 @@ public class InvisibleElementRemovalService {
switch (operator) {
case PathData.e_moveto -> linePath.moveTo(points.next(), points.next());
case PathData.e_lineto -> linePath.lineTo(points.next(), points.next());
case PathData.e_cubicto -> linePath.curveTo(points.next(), points.next(), points.next(), points.next(), points.next(), points.next());
case PathData.e_cubicto ->
linePath.curveTo(points.next(), points.next(), points.next(), points.next(), points.next(), points.next());
case PathData.e_closepath -> linePath.closePath();
case PathData.e_rect -> {
double x = points.next();
@ -427,6 +442,9 @@ public class InvisibleElementRemovalService {
rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
rect.getGState().setStrokeColor(colorPt);
writer.writePlacedElement(rect);
colorPt.destroy();
eb.destroy();
}

View File

@ -98,6 +98,7 @@ public class OCRService {
getAllOcrTextZonesAsString(pageIdToRectCollection, pageId));
replaceOriginalPageWithOcrPage(pdfDoc, pageId, singlePagePdfDoc);
singlePagePdfDoc.close();
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,

View File

@ -1,26 +1,23 @@
package com.iqser.red.service.ocr.v1.server.service;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import org.springframework.amqp.AmqpRejectAndDontRequeueException;
import org.springframework.amqp.rabbit.annotation.RabbitHandler;
import org.springframework.amqp.rabbit.annotation.RabbitListener;
import org.springframework.http.HttpStatus;
import org.springframework.stereotype.Service;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.ocr.v1.api.model.DocumentRequest;
import com.iqser.red.service.ocr.v1.server.client.FileStatusProcessingUpdateClient;
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
import com.iqser.red.service.ocr.v1.server.initializer.PDFNetInitializer;
import com.pdftron.pdf.PDFNet;
import feign.FeignException;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.amqp.AmqpRejectAndDontRequeueException;
import org.springframework.amqp.rabbit.annotation.RabbitHandler;
import org.springframework.amqp.rabbit.annotation.RabbitListener;
import org.springframework.http.HttpStatus;
import org.springframework.stereotype.Service;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
@Slf4j
@Service
@ -30,7 +27,6 @@ public class OcrMessageReceiver {
private final ObjectMapper objectMapper;
private final FileStorageService fileStorageService;
private final FileStatusProcessingUpdateClient fileStatusProcessingUpdateClient;
private final PDFNetInitializer pdfNetInitializer;
private final OCRService ocrService;
@ -39,7 +35,6 @@ public class OcrMessageReceiver {
@RabbitListener(queues = MessagingConfiguration.OCR_QUEUE, concurrency = "1")
public void receiveOcr(String in) throws JsonProcessingException {
pdfNetInitializer.init();
DocumentRequest ocrRequestMessage = objectMapper.readValue(in, DocumentRequest.class);
log.info("Start ocr for file with dossierId {} and fileId {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
@ -62,8 +57,6 @@ public class OcrMessageReceiver {
}
fileStatusProcessingUpdateClient.ocrSuccessful(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
PDFNet.terminate();
}

View File

@ -1,24 +1,28 @@
package com.iqser.red.service.ocr.v1.server;
import com.iqser.red.service.ocr.v1.server.initializer.PDFNetInitializer;
import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService;
import com.iqser.red.storage.commons.StorageAutoConfiguration;
import com.iqser.red.storage.commons.service.StorageService;
import com.pdftron.pdf.PDFNet;
import lombok.SneakyThrows;
import org.junit.jupiter.api.*;
import static org.assertj.core.api.Assertions.assertThat;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.extension.ExtendWith;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.Import;
import org.springframework.context.annotation.Primary;
import org.springframework.test.context.junit.jupiter.SpringExtension;
import static org.assertj.core.api.Assertions.assertThat;
import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService;
import com.iqser.red.storage.commons.StorageAutoConfiguration;
import com.iqser.red.storage.commons.service.StorageService;
import com.pdftron.pdf.PDFNet;
import lombok.SneakyThrows;
@ExtendWith(SpringExtension.class)
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
@ -28,19 +32,14 @@ public class AbstractTest {
@Autowired
protected StorageService storageService;
@Autowired
private PDFNetInitializer pdfNetInitializer;
@MockBean
protected RabbitTemplate rabbitTemplate;
// Per-test setup hook: delegates to PDFNetInitializer.init() before each test.
// NOTE(review): @Disabled is documented for test classes and test methods; confirm it has the
// intended effect on a @BeforeEach lifecycle method.
@BeforeEach
@SneakyThrows
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
public void initPDFNet() {
pdfNetInitializer.init();
}
// Class-level teardown: terminates PDFNet once after all tests have run and prints a
// confirmation line to stdout.
@AfterAll
public static void terminatePDFNet() {
PDFNet.terminate();
System.out.println("PDFNet Terminated");
}

View File

@ -1,43 +1,36 @@
package com.iqser.red.service.ocr.v1.server;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import static com.iqser.red.service.ocr.v1.server.utils.PdfUtils.extractAllTextFromDocument;
import static org.assertj.core.api.Assertions.assertThat;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.util.concurrent.TimeUnit;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.core.io.ClassPathResource;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.ocr.v1.server.service.FileStorageService;
import com.iqser.red.service.ocr.v1.server.service.OCRService;
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.*;
import com.pdftron.pdf.OCRModule;
import io.micrometer.prometheus.PrometheusMeterRegistry;
import io.micrometer.prometheus.PrometheusTimer;
import lombok.SneakyThrows;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.core.io.ClassPathResource;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import static org.assertj.core.api.Assertions.assertThat;
@SpringBootTest(properties = {"pdftron.ocrmodule.path=/YourOCRModulePath"})
@SpringBootTest(properties = {"pdftron.ocrmodule.path=/home/kschuettler/iqser/PDFTron/ocr/Lib/"})
public class OcrServiceIntegrationTest extends AbstractTest {
@Autowired
protected ObjectMapper objectMapper;
@MockBean
protected RabbitTemplate rabbitTemplate;
@Autowired
private OCRService ocrService;
@ -72,6 +65,29 @@ public class OcrServiceIntegrationTest extends AbstractTest {
public void testOcr() {
    // The recognised text for most images is close to nonsense, so the result has to be checked
    // visually; this test only makes sure the OCR run goes through and yields text.
    String text = testOCR("StitchedImagesMultiPage");

    // Process the same document 22 times back to back
    // (presumably to surface problems that only show up across repeated runs - confirm).
    final int repetitions = 22;
    for (int run = 0; run < repetitions; run++) {
        testOCR("131 IDD0000261725");
    }
}
@ -139,25 +155,5 @@ public class OcrServiceIntegrationTest extends AbstractTest {
try (var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
return extractAllTextFromDocument(fileStream);
}
}
/**
 * Reads a PDF from the given stream and returns the text of all pages, one page per entry,
 * joined with {@code "\n"}.
 *
 * <p>The native {@link TextExtractor} and {@link PDFDoc} are released in {@code finally}
 * blocks so they are freed even when extraction fails. The stream itself is not closed here.
 *
 * @param fileStream stream positioned at the start of a PDF document
 * @return the concatenated page texts
 * @throws IOException if the stream cannot be read
 * @throws PDFNetException if PDFNet fails to parse the document or extract text
 */
private static String extractAllTextFromDocument(InputStream fileStream) throws IOException, PDFNetException {
    PDFDoc pdfDoc = new PDFDoc(fileStream);
    try {
        TextExtractor extractor = new TextExtractor();
        try {
            List<String> texts = new ArrayList<>();
            PageIterator iterator = pdfDoc.getPageIterator();
            while (iterator.hasNext()) {
                Page page = iterator.next();
                extractor.begin(page);
                texts.add(extractor.getAsText());
            }
            return String.join("\n", texts);
        } finally {
            extractor.destroy();
        }
    } finally {
        pdfDoc.close();
    }
}
}

View File

@ -0,0 +1,14 @@
package com.iqser.red.service.ocr.v1.server.service;
import static org.junit.jupiter.api.Assertions.*;
import org.junit.jupiter.api.Test;
// Test class named after DataExtractionService.
// NOTE(review): its only test has an empty body and therefore always passes - presumably a
// generated stub; confirm whether assertions are still to be added.
class DataExtractionServiceTest {
@Test
void extractData() {
// empty body: nothing is exercised or asserted
}
}

View File

@ -1,15 +1,9 @@
package com.iqser.red.service.ocr.v1.server.service;
import com.iqser.red.service.ocr.v1.server.AbstractTest;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.*;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
import org.junit.jupiter.api.Test;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.core.io.ClassPathResource;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import static com.iqser.red.service.ocr.v1.server.utils.PdfUtils.drawGrid;
import static com.iqser.red.service.ocr.v1.server.utils.PdfUtils.drawRectCollection;
import static org.assertj.core.api.Assertions.assertThat;
import java.io.FileInputStream;
import java.io.FileOutputStream;
@ -20,8 +14,18 @@ import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import static org.assertj.core.api.Assertions.assertThat;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ClassPathResource;
import com.iqser.red.service.ocr.v1.server.AbstractTest;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.RectCollection;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
class ImagePositionRetrievalServiceTest extends AbstractTest {
@ -29,10 +33,6 @@ class ImagePositionRetrievalServiceTest extends AbstractTest {
@Autowired
private ImagePositionRetrievalService imagePositionRetrievalService;
@MockBean
protected RabbitTemplate rabbitTemplate;
@Test
@SneakyThrows
public void testImagePositionRetrievalForRotateTestFileWithImages() {
@ -116,37 +116,38 @@ class ImagePositionRetrievalServiceTest extends AbstractTest {
private List<int[]> testImagePositionDetection(String fileName) throws IOException, PDFNetException {
InputStream fileStream = new FileInputStream(new ClassPathResource("files/" + fileName + ".pdf").getFile().getAbsolutePath());
PDFDoc pdfDoc = new PDFDoc(fileStream);
try (InputStream fileStream = new FileInputStream(new ClassPathResource("files/" + fileName + ".pdf").getFile().getAbsolutePath())) {
PDFDoc pdfDoc = new PDFDoc(fileStream);
Map<Integer, RectCollection> pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, false);
Map<Integer, RectCollection> pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, false);
ElementWriter writer = new ElementWriter();
pageIdToRectCollection.forEach((pageId, rectCollection) -> {
try {
writer.begin(pdfDoc.getPage(pageId));
drawRectCollection(writer, rectCollection, pdfDoc.getPage(pageId));
drawGrid(writer, pdfDoc.getPage(pageId));
writer.end();
StringBuilder zonesString = new StringBuilder();
for (int j = 0; j < rectCollection.getNumRects(); ++j) {
var r = rectCollection.getRectAt(j);
zonesString.append(String.format("%d:[lower left (%.1f|%.1f) upper right (%.1f|%.1f)]; ", j, r.getX1(), r.getY1(), r.getX2(), r.getY2()));
ElementWriter writer = new ElementWriter();
pageIdToRectCollection.forEach((pageId, rectCollection) -> {
try {
writer.begin(pdfDoc.getPage(pageId));
drawRectCollection(writer, rectCollection);
drawGrid(writer, pdfDoc.getPage(pageId));
writer.end();
StringBuilder zonesString = new StringBuilder();
for (int j = 0; j < rectCollection.getNumRects(); ++j) {
var r = rectCollection.getRectAt(j);
zonesString.append(String.format("%d:[lower left (%.1f|%.1f) upper right (%.1f|%.1f)]; ", j, r.getX1(), r.getY1(), r.getX2(), r.getY2()));
}
System.out.printf("Page %d: rotation: %d OCR regions %s \n", pageId, pdfDoc.getPage(pageId).getRotation(), zonesString);
} catch (PDFNetException e) {
throw new RuntimeException(e);
}
System.out.printf("Page %d: rotation: %d OCR regions %s \n", pageId, pdfDoc.getPage(pageId).getRotation(), zonesString);
} catch (PDFNetException e) {
throw new RuntimeException(e);
}
});
});
// Check visually for red Rectangles to match images in the saved pdf file
try (var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + "_IMAGE_BBOX.pdf")) {
out.write(pdfDoc.save(SDFDoc.SaveMode.LINEARIZED, null));
// Check visually for red Rectangles to match images in the saved pdf file
try (var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + "_IMAGE_BBOX.pdf")) {
out.write(pdfDoc.save(SDFDoc.SaveMode.LINEARIZED, null));
}
pdfDoc.close();
System.out.println("result file: " + getTemporaryDirectory() + "/" + fileName + "_IMAGE_BBOX.pdf");
// round all coords to nearest int to account for inconsistencies with the calculation of the bounding box
return pageIdToRectCollection.values().stream().map(this::toRoundedCoordinateArrayList).flatMap(List::stream).collect(Collectors.toList());
}
System.out.println("result file: " + getTemporaryDirectory() + "/" + fileName + "_IMAGE_BBOX.pdf");
fileStream.close();
// round all coords to nearest int to account for inconsistencies with the calculation of the bounding box
return pageIdToRectCollection.values().stream().map(this::toRoundedCoordinateArrayList).flatMap(List::stream).collect(Collectors.toList());
}
@ -161,59 +162,4 @@ class ImagePositionRetrievalServiceTest extends AbstractTest {
return coords;
}
/**
 * Draws a grid of 15x15 cells over the whole page through the given writer.
 *
 * <p>Every cell is stroked with line width 1 and stroke opacity 0.1. The cell at row 0 /
 * column 0 is additionally filled blue with opacity 0.8; all other cells are unfilled and
 * stroked in dark gray.
 *
 * <p>The two colours are allocated once and reused for every cell, and they are destroyed
 * together with the {@link ElementBuilder} in {@code finally}, so no native objects are left
 * behind even if writing fails.
 *
 * @param writer element writer that has already been started on {@code page}
 * @param page   page whose width and height determine the number of grid cells
 */
@SneakyThrows
private void drawGrid(ElementWriter writer, Page page) {
    ElementBuilder eb = null;
    ColorPt blue = null;
    ColorPt darkGray = null;
    try {
        eb = new ElementBuilder();
        blue = new ColorPt(0, 0, 1);
        darkGray = new ColorPt(0.1, 0.1, 0.1);

        double dX = 15;
        double dY = 15;
        int nRows = (int) (page.getPageHeight() / dY) + 1;
        int nCols = (int) (page.getPageWidth() / dX) + 1;
        for (int row = 0; row < nRows; ++row) {
            for (int col = 0; col < nCols; ++col) {
                Element cell = eb.createRect(col * dX, row * dY, dX, dY);
                cell.setPathStroke(true);
                cell.getGState().setLineWidth(1);
                cell.getGState().setStrokeOpacity(0.1);
                cell.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
                if (row == 0 && col == 0) {
                    // Highlight the first cell so it stands out from the rest of the grid.
                    cell.getGState().setStrokeColor(blue);
                    cell.setPathFill(true);
                    cell.getGState().setFillOpacity(0.8);
                    cell.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
                    cell.getGState().setFillColor(blue);
                } else {
                    cell.setPathFill(false);
                    cell.getGState().setStrokeColor(darkGray);
                }
                writer.writePlacedElement(cell);
            }
        }
    } finally {
        if (darkGray != null) {
            darkGray.destroy();
        }
        if (blue != null) {
            blue.destroy();
        }
        if (eb != null) {
            eb.destroy();
        }
    }
}
/**
 * Draws every rectangle of the collection through the given writer as a red box: stroked with
 * line width 5 and filled red with opacity 0.5.
 *
 * <p>The shared {@link ColorPt} and the {@link ElementBuilder} are destroyed in
 * {@code finally} blocks so they are released even if writing fails.
 *
 * @param writer         element writer that has already been started on the target page
 * @param rectCollection rectangles to draw
 * @param page           not used by this method; kept so existing callers keep compiling
 */
@SneakyThrows
public static void drawRectCollection(ElementWriter writer, RectCollection rectCollection, Page page) {
    ColorPt colorPt = new ColorPt(1, 0, 0);
    try {
        ElementBuilder eb = new ElementBuilder();
        try {
            for (int i = 0; i < rectCollection.getNumRects(); ++i) {
                Rect r = rectCollection.getRectAt(i);
                Element rect = eb.createRect(r.getX1(), r.getY1(), r.getWidth(), r.getHeight());
                rect.setPathStroke(true);
                rect.getGState().setLineWidth(5);
                rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
                rect.getGState().setStrokeColor(colorPt);
                rect.setPathFill(true);
                rect.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
                rect.getGState().setFillColor(colorPt);
                rect.getGState().setFillOpacity(0.5);
                writer.writePlacedElement(rect);
            }
        } finally {
            eb.destroy();
        }
    } finally {
        colorPt.destroy();
    }
}
}

View File

@ -1,31 +1,25 @@
package com.iqser.red.service.ocr.v1.server.service;
import com.iqser.red.service.ocr.v1.server.AbstractTest;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
import com.pdftron.pdf.TextExtractor;
import lombok.SneakyThrows;
import org.junit.jupiter.api.Test;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.core.io.ClassPathResource;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import static com.iqser.red.service.ocr.v1.server.utils.PdfUtils.extractAllTextFromDocument;
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ClassPathResource;
import com.iqser.red.service.ocr.v1.server.AbstractTest;
import lombok.SneakyThrows;
public class InvisibleElementRemovalServiceTest extends AbstractTest {
@Autowired
private InvisibleElementRemovalService invisibleElementRemovalService;
@MockBean
protected RabbitTemplate rabbitTemplate;
@Test
@SneakyThrows
@ -36,27 +30,18 @@ public class InvisibleElementRemovalServiceTest extends AbstractTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf");
try (var initialFileStream = pdfFileResource.getInputStream(); var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
invisibleElementRemovalService.removeInvisibleElements(initialFileStream, out,false);
invisibleElementRemovalService.removeInvisibleElements(initialFileStream, out, false);
}
try (var initialFileStream = pdfFileResource.getInputStream(); var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + "_delta.pdf")) {
invisibleElementRemovalService.removeInvisibleElements(initialFileStream, out,true);
invisibleElementRemovalService.removeInvisibleElements(initialFileStream, out, true);
}
System.out.println("Output File without invisible elements: files/" + fileName + ".pdf");
System.out.println("Output Delta File without invisible elements: files/" + fileName + "_delta.pdf");
TextExtractor extractor = new TextExtractor();
PDFDoc pdfDoc;
try(var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
pdfDoc = new PDFDoc(fileStream);
}
PageIterator iterator = pdfDoc.getPageIterator();
while (iterator.hasNext()) {
Page page = iterator.next();
extractor.begin(page);
String[] text = extractor.getAsText().split("\n");
try (var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
String[] text = extractAllTextFromDocument(fileStream).split("\n");
assertThat(text).contains("APPENDIX 16 Pathology Report", "Amendment 1", "Page 255 of 260");
}
}

View File

@ -0,0 +1,86 @@
package com.iqser.red.service.ocr.v1.server.utils;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.*;
import lombok.SneakyThrows;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
/**
 * Static helpers for extracting text from PDFTron documents and for drawing debug overlays
 * (a grid and highlighted rectangles) onto pages.
 *
 * <p>All native PDFNet objects created by these helpers are released in {@code finally} blocks.
 */
public class PdfUtils {

    /**
     * Reads a PDF from the given stream and returns the text of all pages, one page per entry,
     * joined with {@code "\n"}.
     *
     * <p>The {@link TextExtractor} and the {@link PDFDoc} are released even when extraction
     * fails. The stream itself is not closed here.
     *
     * @param fileStream stream positioned at the start of a PDF document
     * @return the concatenated page texts
     * @throws IOException if the stream cannot be read
     * @throws PDFNetException if PDFNet fails to parse the document or extract text
     */
    public static String extractAllTextFromDocument(InputStream fileStream) throws IOException, PDFNetException {
        PDFDoc pdfDoc = new PDFDoc(fileStream);
        try {
            TextExtractor extractor = new TextExtractor();
            try {
                List<String> texts = new ArrayList<>();
                PageIterator iterator = pdfDoc.getPageIterator();
                while (iterator.hasNext()) {
                    Page page = iterator.next();
                    extractor.begin(page);
                    texts.add(extractor.getAsText());
                }
                return String.join("\n", texts);
            } finally {
                extractor.destroy();
            }
        } finally {
            pdfDoc.close();
        }
    }

    /**
     * Draws a grid of 15x15 cells over the whole page through the given writer.
     *
     * <p>Every cell is stroked with line width 1 and stroke opacity 0.1. The cell at row 0 /
     * column 0 is additionally filled blue with opacity 0.8; all other cells are unfilled and
     * stroked in dark gray. The two colours are allocated once, reused for every cell and
     * destroyed afterwards, matching how {@link #drawRectCollection} handles its colour.
     *
     * @param writer element writer that has already been started on {@code page}
     * @param page   page whose width and height determine the number of grid cells
     */
    @SneakyThrows
    public static void drawGrid(ElementWriter writer, Page page) {
        ElementBuilder eb = null;
        ColorPt blue = null;
        ColorPt darkGray = null;
        try {
            eb = new ElementBuilder();
            blue = new ColorPt(0, 0, 1);
            darkGray = new ColorPt(0.1, 0.1, 0.1);

            double dX = 15;
            double dY = 15;
            int nRows = (int) (page.getPageHeight() / dY) + 1;
            int nCols = (int) (page.getPageWidth() / dX) + 1;
            for (int row = 0; row < nRows; ++row) {
                for (int col = 0; col < nCols; ++col) {
                    Element cell = eb.createRect(col * dX, row * dY, dX, dY);
                    cell.setPathStroke(true);
                    cell.getGState().setLineWidth(1);
                    cell.getGState().setStrokeOpacity(0.1);
                    cell.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
                    if (row == 0 && col == 0) {
                        // Highlight the first cell so it stands out from the rest of the grid.
                        cell.getGState().setStrokeColor(blue);
                        cell.setPathFill(true);
                        cell.getGState().setFillOpacity(0.8);
                        cell.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
                        cell.getGState().setFillColor(blue);
                    } else {
                        cell.setPathFill(false);
                        cell.getGState().setStrokeColor(darkGray);
                    }
                    writer.writePlacedElement(cell);
                }
            }
        } finally {
            if (darkGray != null) {
                darkGray.destroy();
            }
            if (blue != null) {
                blue.destroy();
            }
            if (eb != null) {
                eb.destroy();
            }
        }
    }

    /**
     * Draws every rectangle of the collection through the given writer as a red box: stroked
     * with line width 5 and filled red with opacity 0.5.
     *
     * <p>The shared {@link ColorPt} and the {@link ElementBuilder} are destroyed in
     * {@code finally} blocks so they are released even if writing fails.
     *
     * @param writer         element writer that has already been started on the target page
     * @param rectCollection rectangles to draw
     */
    @SneakyThrows
    public static void drawRectCollection(ElementWriter writer, RectCollection rectCollection) {
        ColorPt colorPt = new ColorPt(1, 0, 0);
        try {
            ElementBuilder eb = new ElementBuilder();
            try {
                for (int i = 0; i < rectCollection.getNumRects(); ++i) {
                    Rect r = rectCollection.getRectAt(i);
                    Element rect = eb.createRect(r.getX1(), r.getY1(), r.getWidth(), r.getHeight());
                    rect.setPathStroke(true);
                    rect.getGState().setLineWidth(5);
                    rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
                    rect.getGState().setStrokeColor(colorPt);
                    rect.setPathFill(true);
                    rect.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
                    rect.getGState().setFillColor(colorPt);
                    rect.getGState().setFillOpacity(0.5);
                    writer.writePlacedElement(rect);
                }
            } finally {
                eb.destroy();
            }
        } finally {
            colorPt.destroy();
        }
    }

}