Compare commits

...

1 Commits

Author SHA1 Message Date
Kilian Schuettler
90b4869761 RED-6126: In the OCRService, OCR Text is not applied to Document
*removed init/terminate calls again
*added destroy()/close() at every opportunity
2023-02-24 13:07:42 +01:00
12 changed files with 254 additions and 213 deletions

View File

@ -6,6 +6,8 @@ import lombok.SneakyThrows;
import org.springframework.beans.factory.annotation.Value; import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component; import org.springframework.stereotype.Component;
import javax.annotation.PostConstruct;
@Component @Component
@RequiredArgsConstructor @RequiredArgsConstructor
public class PDFNetInitializer { public class PDFNetInitializer {
@ -18,14 +20,12 @@ public class PDFNetInitializer {
@SneakyThrows @SneakyThrows
@PostConstruct
// Do not change back to application runner, if it is application runner it takes messages from the queue before PDFNet is initialized, that leads to UnsatisfiedLinkError. // Do not change back to application runner, if it is application runner it takes messages from the queue before PDFNet is initialized, that leads to UnsatisfiedLinkError.
public void init() { public void init() {
PDFNet.initialize(pdftronLicense);
PDFNet.setTempPath("/tmp/pdftron"); PDFNet.setTempPath("/tmp/pdftron");
PDFNet.addResourceSearchPath(ocrModulePath); PDFNet.addResourceSearchPath(ocrModulePath);
PDFNet.initialize(pdftronLicense);
} }
} }

View File

@ -0,0 +1,2 @@
package com.iqser.red.service.ocr.v1.server.service;public class DataExtractionService {
// Empty placeholder: this class declares no fields or methods yet.
}

View File

@ -47,16 +47,17 @@ public class ImagePositionRetrievalService {
ElementReader reader = new ElementReader(); ElementReader reader = new ElementReader();
for (int pageId = 1; pageId <= pdfDoc.getPageCount(); ++pageId) { for (int pageId = 1; pageId <= pdfDoc.getPageCount(); ++pageId) {
RectCollection imagePositions = new RectCollection(); RectCollection imagePositions = new RectCollection();
reader.begin(pdfDoc.getPage(pageId));
reader.begin(pdfDoc.getPage(pageId));
findImagePositionsOnPage(reader, imagePositions, pdfDoc.getPage(pageId), mirrorY); findImagePositionsOnPage(reader, imagePositions, pdfDoc.getPage(pageId), mirrorY);
imagePositions = mergeOverlappingRects(imagePositions); imagePositions = mergeOverlappingRects(imagePositions);
reader.end(); reader.end();
if (imagePositions.getNumRects() > 0) { if (imagePositions.getNumRects() > 0) {
pageIdToImagePositions.put(pageId, imagePositions); pageIdToImagePositions.put(pageId, imagePositions);
} }
} }
reader.destroy();
return pageIdToImagePositions; return pageIdToImagePositions;
} }

View File

@ -87,13 +87,27 @@ public class InvisibleElementRemovalService {
.visitedXObjIds(visitedXObjIds) .visitedXObjIds(visitedXObjIds)
.build(); .build();
removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context); removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context);
context.visitedXObjIds().clear(); context.visitedXObjIds().clear();
removeOverlappedElements(page, writer, context); removeOverlappedElements(page, writer, context);
reader.end();
writer.end();
} }
pdfDoc.save(out, SDFDoc.SaveMode.REMOVE_UNUSED, null);
try {
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
} catch (Exception e) {
log.error("File could not be saved after invisible element removal");
throw new RuntimeException(e);
}
writer.destroy();
reader.destroy();
pdfDoc.close();
} }
@ -216,7 +230,7 @@ public class InvisibleElementRemovalService {
formWriter.setDefaultGState(context.reader()); formWriter.setDefaultGState(context.reader());
processElements(formWriter, context); processElements(formWriter, context);
formWriter.end(); formWriter.destroy();
context.reader().end(); context.reader().end();
} }
} }
@ -352,7 +366,7 @@ public class InvisibleElementRemovalService {
formWriter.setDefaultGState(context.reader()); formWriter.setDefaultGState(context.reader());
processOverlappedElements(formWriter, context); processOverlappedElements(formWriter, context);
formWriter.end(); formWriter.destroy();
context.reader().end(); context.reader().end();
} }
} }
@ -376,7 +390,8 @@ public class InvisibleElementRemovalService {
switch (operator) { switch (operator) {
case PathData.e_moveto -> linePath.moveTo(points.next(), points.next()); case PathData.e_moveto -> linePath.moveTo(points.next(), points.next());
case PathData.e_lineto -> linePath.lineTo(points.next(), points.next()); case PathData.e_lineto -> linePath.lineTo(points.next(), points.next());
case PathData.e_cubicto -> linePath.curveTo(points.next(), points.next(), points.next(), points.next(), points.next(), points.next()); case PathData.e_cubicto ->
linePath.curveTo(points.next(), points.next(), points.next(), points.next(), points.next(), points.next());
case PathData.e_closepath -> linePath.closePath(); case PathData.e_closepath -> linePath.closePath();
case PathData.e_rect -> { case PathData.e_rect -> {
double x = points.next(); double x = points.next();
@ -427,6 +442,9 @@ public class InvisibleElementRemovalService {
rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB()); rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
rect.getGState().setStrokeColor(colorPt); rect.getGState().setStrokeColor(colorPt);
writer.writePlacedElement(rect); writer.writePlacedElement(rect);
colorPt.destroy();
eb.destroy();
} }

View File

@ -98,6 +98,7 @@ public class OCRService {
getAllOcrTextZonesAsString(pageIdToRectCollection, pageId)); getAllOcrTextZonesAsString(pageIdToRectCollection, pageId));
replaceOriginalPageWithOcrPage(pdfDoc, pageId, singlePagePdfDoc); replaceOriginalPageWithOcrPage(pdfDoc, pageId, singlePagePdfDoc);
singlePagePdfDoc.close(); singlePagePdfDoc.close();
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE, rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,

View File

@ -1,26 +1,23 @@
package com.iqser.red.service.ocr.v1.server.service; package com.iqser.red.service.ocr.v1.server.service;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import org.springframework.amqp.AmqpRejectAndDontRequeueException;
import org.springframework.amqp.rabbit.annotation.RabbitHandler;
import org.springframework.amqp.rabbit.annotation.RabbitListener;
import org.springframework.http.HttpStatus;
import org.springframework.stereotype.Service;
import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.ocr.v1.api.model.DocumentRequest; import com.iqser.red.service.ocr.v1.api.model.DocumentRequest;
import com.iqser.red.service.ocr.v1.server.client.FileStatusProcessingUpdateClient; import com.iqser.red.service.ocr.v1.server.client.FileStatusProcessingUpdateClient;
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration; import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
import com.iqser.red.service.ocr.v1.server.initializer.PDFNetInitializer; import com.iqser.red.service.ocr.v1.server.initializer.PDFNetInitializer;
import com.pdftron.pdf.PDFNet;
import feign.FeignException; import feign.FeignException;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.springframework.amqp.AmqpRejectAndDontRequeueException;
import org.springframework.amqp.rabbit.annotation.RabbitHandler;
import org.springframework.amqp.rabbit.annotation.RabbitListener;
import org.springframework.http.HttpStatus;
import org.springframework.stereotype.Service;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
@Slf4j @Slf4j
@Service @Service
@ -30,7 +27,6 @@ public class OcrMessageReceiver {
private final ObjectMapper objectMapper; private final ObjectMapper objectMapper;
private final FileStorageService fileStorageService; private final FileStorageService fileStorageService;
private final FileStatusProcessingUpdateClient fileStatusProcessingUpdateClient; private final FileStatusProcessingUpdateClient fileStatusProcessingUpdateClient;
private final PDFNetInitializer pdfNetInitializer;
private final OCRService ocrService; private final OCRService ocrService;
@ -39,7 +35,6 @@ public class OcrMessageReceiver {
@RabbitListener(queues = MessagingConfiguration.OCR_QUEUE, concurrency = "1") @RabbitListener(queues = MessagingConfiguration.OCR_QUEUE, concurrency = "1")
public void receiveOcr(String in) throws JsonProcessingException { public void receiveOcr(String in) throws JsonProcessingException {
pdfNetInitializer.init();
DocumentRequest ocrRequestMessage = objectMapper.readValue(in, DocumentRequest.class); DocumentRequest ocrRequestMessage = objectMapper.readValue(in, DocumentRequest.class);
log.info("Start ocr for file with dossierId {} and fileId {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId()); log.info("Start ocr for file with dossierId {} and fileId {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
@ -62,8 +57,6 @@ public class OcrMessageReceiver {
} }
fileStatusProcessingUpdateClient.ocrSuccessful(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId()); fileStatusProcessingUpdateClient.ocrSuccessful(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
PDFNet.terminate();
} }

View File

@ -1,24 +1,28 @@
package com.iqser.red.service.ocr.v1.server; package com.iqser.red.service.ocr.v1.server;
import com.iqser.red.service.ocr.v1.server.initializer.PDFNetInitializer; import static org.assertj.core.api.Assertions.assertThat;
import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService;
import com.iqser.red.storage.commons.StorageAutoConfiguration; import org.junit.jupiter.api.AfterAll;
import com.iqser.red.storage.commons.service.StorageService; import org.junit.jupiter.api.AfterEach;
import com.pdftron.pdf.PDFNet;
import lombok.SneakyThrows;
import org.junit.jupiter.api.*;
import org.junit.jupiter.api.extension.ExtendWith; import org.junit.jupiter.api.extension.ExtendWith;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.autoconfigure.EnableAutoConfiguration; import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration; import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration;
import org.springframework.boot.test.context.SpringBootTest; import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration; import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.Import; import org.springframework.context.annotation.Import;
import org.springframework.context.annotation.Primary; import org.springframework.context.annotation.Primary;
import org.springframework.test.context.junit.jupiter.SpringExtension; import org.springframework.test.context.junit.jupiter.SpringExtension;
import static org.assertj.core.api.Assertions.assertThat; import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService;
import com.iqser.red.storage.commons.StorageAutoConfiguration;
import com.iqser.red.storage.commons.service.StorageService;
import com.pdftron.pdf.PDFNet;
import lombok.SneakyThrows;
@ExtendWith(SpringExtension.class) @ExtendWith(SpringExtension.class)
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT) @SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
@ -28,19 +32,14 @@ public class AbstractTest {
@Autowired @Autowired
protected StorageService storageService; protected StorageService storageService;
@Autowired @MockBean
private PDFNetInitializer pdfNetInitializer; protected RabbitTemplate rabbitTemplate;
@BeforeEach
@SneakyThrows
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
public void initPDFNet() {
pdfNetInitializer.init();
}
@AfterAll @AfterAll
public static void terminatePDFNet() { public static void terminatePDFNet() {
PDFNet.terminate(); PDFNet.terminate();
System.out.println("PDFNet Terminated");
} }

View File

@ -1,43 +1,36 @@
package com.iqser.red.service.ocr.v1.server; package com.iqser.red.service.ocr.v1.server;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import static com.iqser.red.service.ocr.v1.server.utils.PdfUtils.extractAllTextFromDocument;
import static org.assertj.core.api.Assertions.assertThat;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.util.concurrent.TimeUnit;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.core.io.ClassPathResource;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.ocr.v1.server.service.FileStorageService; import com.iqser.red.service.ocr.v1.server.service.FileStorageService;
import com.iqser.red.service.ocr.v1.server.service.OCRService; import com.iqser.red.service.ocr.v1.server.service.OCRService;
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType; import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType;
import com.pdftron.common.PDFNetException; import com.pdftron.pdf.OCRModule;
import com.pdftron.pdf.*;
import io.micrometer.prometheus.PrometheusMeterRegistry; import io.micrometer.prometheus.PrometheusMeterRegistry;
import io.micrometer.prometheus.PrometheusTimer; import io.micrometer.prometheus.PrometheusTimer;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.core.io.ClassPathResource;
import java.io.FileInputStream; @SpringBootTest(properties = {"pdftron.ocrmodule.path=/home/kschuettler/iqser/PDFTron/ocr/Lib/"})
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import static org.assertj.core.api.Assertions.assertThat;
@SpringBootTest(properties = {"pdftron.ocrmodule.path=/YourOCRModulePath"})
public class OcrServiceIntegrationTest extends AbstractTest { public class OcrServiceIntegrationTest extends AbstractTest {
@Autowired @Autowired
protected ObjectMapper objectMapper; protected ObjectMapper objectMapper;
@MockBean
protected RabbitTemplate rabbitTemplate;
@Autowired @Autowired
private OCRService ocrService; private OCRService ocrService;
@ -72,6 +65,29 @@ public class OcrServiceIntegrationTest extends AbstractTest {
public void testOcr() { public void testOcr() {
// check visually for most of the images containing text, the resulting text is kind of nonsense, just ensure it is there // check visually for most of the images containing text, the resulting text is kind of nonsense, just ensure it is there
String text = testOCR("StitchedImagesMultiPage"); String text = testOCR("StitchedImagesMultiPage");
testOCR("131 IDD0000261725");
testOCR("131 IDD0000261725");
testOCR("131 IDD0000261725");
testOCR("131 IDD0000261725");
testOCR("131 IDD0000261725");
testOCR("131 IDD0000261725");
testOCR("131 IDD0000261725");
testOCR("131 IDD0000261725");
testOCR("131 IDD0000261725");
testOCR("131 IDD0000261725");
testOCR("131 IDD0000261725");
testOCR("131 IDD0000261725");
testOCR("131 IDD0000261725");
testOCR("131 IDD0000261725");
testOCR("131 IDD0000261725");
testOCR("131 IDD0000261725");
testOCR("131 IDD0000261725");
testOCR("131 IDD0000261725");
testOCR("131 IDD0000261725");
testOCR("131 IDD0000261725");
testOCR("131 IDD0000261725");
testOCR("131 IDD0000261725");
} }
@ -139,25 +155,5 @@ public class OcrServiceIntegrationTest extends AbstractTest {
try (var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) { try (var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
return extractAllTextFromDocument(fileStream); return extractAllTextFromDocument(fileStream);
} }
} }
private static String extractAllTextFromDocument(InputStream fileStream) throws IOException, PDFNetException {
PDFDoc pdfDoc = new PDFDoc(fileStream);
TextExtractor extractor = new TextExtractor();
List<String> texts = new ArrayList<>();
PageIterator iterator = pdfDoc.getPageIterator();
while (iterator.hasNext()) {
Page page = iterator.next();
extractor.begin(page);
texts.add(extractor.getAsText());
}
return String.join("\n", texts);
}
} }

View File

@ -0,0 +1,14 @@
package com.iqser.red.service.ocr.v1.server.service;
import static org.junit.jupiter.api.Assertions.*;
import org.junit.jupiter.api.Test;
// Placeholder test class: its single test method has an empty body.
class DataExtractionServiceTest {
// TODO: no assertions yet - this test always passes and exercises no code.
@Test
void extractData() {
}
}

View File

@ -1,15 +1,9 @@
package com.iqser.red.service.ocr.v1.server.service; package com.iqser.red.service.ocr.v1.server.service;
import com.iqser.red.service.ocr.v1.server.AbstractTest; import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import com.pdftron.common.PDFNetException; import static com.iqser.red.service.ocr.v1.server.utils.PdfUtils.drawGrid;
import com.pdftron.pdf.*; import static com.iqser.red.service.ocr.v1.server.utils.PdfUtils.drawRectCollection;
import com.pdftron.sdf.SDFDoc; import static org.assertj.core.api.Assertions.assertThat;
import lombok.SneakyThrows;
import org.junit.jupiter.api.Test;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.core.io.ClassPathResource;
import java.io.FileInputStream; import java.io.FileInputStream;
import java.io.FileOutputStream; import java.io.FileOutputStream;
@ -20,8 +14,18 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory; import org.junit.jupiter.api.Test;
import static org.assertj.core.api.Assertions.assertThat; import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ClassPathResource;
import com.iqser.red.service.ocr.v1.server.AbstractTest;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.RectCollection;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
class ImagePositionRetrievalServiceTest extends AbstractTest { class ImagePositionRetrievalServiceTest extends AbstractTest {
@ -29,10 +33,6 @@ class ImagePositionRetrievalServiceTest extends AbstractTest {
@Autowired @Autowired
private ImagePositionRetrievalService imagePositionRetrievalService; private ImagePositionRetrievalService imagePositionRetrievalService;
@MockBean
protected RabbitTemplate rabbitTemplate;
@Test @Test
@SneakyThrows @SneakyThrows
public void testImagePositionRetrievalForRotateTestFileWithImages() { public void testImagePositionRetrievalForRotateTestFileWithImages() {
@ -116,37 +116,38 @@ class ImagePositionRetrievalServiceTest extends AbstractTest {
private List<int[]> testImagePositionDetection(String fileName) throws IOException, PDFNetException { private List<int[]> testImagePositionDetection(String fileName) throws IOException, PDFNetException {
InputStream fileStream = new FileInputStream(new ClassPathResource("files/" + fileName + ".pdf").getFile().getAbsolutePath()); try (InputStream fileStream = new FileInputStream(new ClassPathResource("files/" + fileName + ".pdf").getFile().getAbsolutePath())) {
PDFDoc pdfDoc = new PDFDoc(fileStream); PDFDoc pdfDoc = new PDFDoc(fileStream);
Map<Integer, RectCollection> pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, false); Map<Integer, RectCollection> pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, false);
ElementWriter writer = new ElementWriter(); ElementWriter writer = new ElementWriter();
pageIdToRectCollection.forEach((pageId, rectCollection) -> { pageIdToRectCollection.forEach((pageId, rectCollection) -> {
try { try {
writer.begin(pdfDoc.getPage(pageId)); writer.begin(pdfDoc.getPage(pageId));
drawRectCollection(writer, rectCollection, pdfDoc.getPage(pageId)); drawRectCollection(writer, rectCollection);
drawGrid(writer, pdfDoc.getPage(pageId)); drawGrid(writer, pdfDoc.getPage(pageId));
writer.end(); writer.end();
StringBuilder zonesString = new StringBuilder(); StringBuilder zonesString = new StringBuilder();
for (int j = 0; j < rectCollection.getNumRects(); ++j) { for (int j = 0; j < rectCollection.getNumRects(); ++j) {
var r = rectCollection.getRectAt(j); var r = rectCollection.getRectAt(j);
zonesString.append(String.format("%d:[lower left (%.1f|%.1f) upper right (%.1f|%.1f)]; ", j, r.getX1(), r.getY1(), r.getX2(), r.getY2())); zonesString.append(String.format("%d:[lower left (%.1f|%.1f) upper right (%.1f|%.1f)]; ", j, r.getX1(), r.getY1(), r.getX2(), r.getY2()));
}
System.out.printf("Page %d: rotation: %d OCR regions %s \n", pageId, pdfDoc.getPage(pageId).getRotation(), zonesString);
} catch (PDFNetException e) {
throw new RuntimeException(e);
} }
System.out.printf("Page %d: rotation: %d OCR regions %s \n", pageId, pdfDoc.getPage(pageId).getRotation(), zonesString); });
} catch (PDFNetException e) {
throw new RuntimeException(e);
}
});
// Check visually for red Rectangles to match images in the saved pdf file // Check visually for red Rectangles to match images in the saved pdf file
try (var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + "_IMAGE_BBOX.pdf")) { try (var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + "_IMAGE_BBOX.pdf")) {
out.write(pdfDoc.save(SDFDoc.SaveMode.LINEARIZED, null)); out.write(pdfDoc.save(SDFDoc.SaveMode.LINEARIZED, null));
}
pdfDoc.close();
System.out.println("result file: " + getTemporaryDirectory() + "/" + fileName + "_IMAGE_BBOX.pdf");
// round all coords to nearest int to account for inconsistencies with the calculation of the bounding box
return pageIdToRectCollection.values().stream().map(this::toRoundedCoordinateArrayList).flatMap(List::stream).collect(Collectors.toList());
} }
System.out.println("result file: " + getTemporaryDirectory() + "/" + fileName + "_IMAGE_BBOX.pdf");
fileStream.close();
// round all coords to nearest int to account for inconsistencies with the calculation of the bounding box
return pageIdToRectCollection.values().stream().map(this::toRoundedCoordinateArrayList).flatMap(List::stream).collect(Collectors.toList());
} }
@ -161,59 +162,4 @@ class ImagePositionRetrievalServiceTest extends AbstractTest {
return coords; return coords;
} }
@SneakyThrows
private void drawGrid(ElementWriter writer, Page page) {
ElementBuilder eb = new ElementBuilder();
double dX = 15;
double dY = 15;
int nRows = (int) (page.getPageHeight() / dY) + 1;
int nCols = (int) (page.getPageWidth() / dX) + 1;
for (int row = 0; row < nRows; ++row) {
for (int col = 0; col < nCols; ++col) {
Element cell = eb.createRect(col * dX, row * dY, dX, dY);
cell.setPathStroke(true);
cell.getGState().setLineWidth(1);
cell.getGState().setStrokeOpacity(0.1);
cell.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
if (row == 0 && col == 0) {
cell.getGState().setStrokeColor(new ColorPt(0, 0, 1));
cell.setPathFill(true);
cell.getGState().setFillOpacity(0.8);
cell.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
cell.getGState().setFillColor(new ColorPt(0, 0, 1));
} else {
cell.setPathFill(false);
cell.getGState().setStrokeColor(new ColorPt(0.1, 0.1, 0.1));
}
writer.writePlacedElement(cell);
}
}
}
@SneakyThrows
public static void drawRectCollection(ElementWriter writer, RectCollection rectCollection, Page page) {
ColorPt colorPt = new ColorPt(1, 0, 0);
ElementBuilder eb = new ElementBuilder();
for (int i = 0; i < rectCollection.getNumRects(); ++i) {
Rect r = rectCollection.getRectAt(i);
Element rect = eb.createRect(r.getX1(), r.getY1(), r.getWidth(), r.getHeight());
rect.setPathStroke(true);
rect.getGState().setLineWidth(5);
rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
rect.getGState().setStrokeColor(colorPt);
rect.setPathFill(true);
rect.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
rect.getGState().setFillColor(colorPt);
rect.getGState().setFillOpacity(0.5);
writer.writePlacedElement(rect);
}
}
} }

View File

@ -1,31 +1,25 @@
package com.iqser.red.service.ocr.v1.server.service; package com.iqser.red.service.ocr.v1.server.service;
import com.iqser.red.service.ocr.v1.server.AbstractTest; import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import com.pdftron.pdf.PDFDoc; import static com.iqser.red.service.ocr.v1.server.utils.PdfUtils.extractAllTextFromDocument;
import com.pdftron.pdf.Page; import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
import com.pdftron.pdf.PageIterator;
import com.pdftron.pdf.TextExtractor;
import lombok.SneakyThrows;
import org.junit.jupiter.api.Test;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.core.io.ClassPathResource;
import java.io.FileInputStream; import java.io.FileInputStream;
import java.io.FileOutputStream; import java.io.FileOutputStream;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory; import org.junit.jupiter.api.Test;
import static org.assertj.core.api.AssertionsForClassTypes.assertThat; import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ClassPathResource;
import com.iqser.red.service.ocr.v1.server.AbstractTest;
import lombok.SneakyThrows;
public class InvisibleElementRemovalServiceTest extends AbstractTest { public class InvisibleElementRemovalServiceTest extends AbstractTest {
@Autowired @Autowired
private InvisibleElementRemovalService invisibleElementRemovalService; private InvisibleElementRemovalService invisibleElementRemovalService;
@MockBean
protected RabbitTemplate rabbitTemplate;
@Test @Test
@SneakyThrows @SneakyThrows
@ -36,27 +30,18 @@ public class InvisibleElementRemovalServiceTest extends AbstractTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf"); ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf");
try (var initialFileStream = pdfFileResource.getInputStream(); var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) { try (var initialFileStream = pdfFileResource.getInputStream(); var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
invisibleElementRemovalService.removeInvisibleElements(initialFileStream, out,false); invisibleElementRemovalService.removeInvisibleElements(initialFileStream, out, false);
} }
try (var initialFileStream = pdfFileResource.getInputStream(); var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + "_delta.pdf")) { try (var initialFileStream = pdfFileResource.getInputStream(); var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + "_delta.pdf")) {
invisibleElementRemovalService.removeInvisibleElements(initialFileStream, out,true); invisibleElementRemovalService.removeInvisibleElements(initialFileStream, out, true);
} }
System.out.println("Output File without invisible elements: files/" + fileName + ".pdf"); System.out.println("Output File without invisible elements: files/" + fileName + ".pdf");
System.out.println("Output Delta File without invisible elements: files/" + fileName + "_delta.pdf"); System.out.println("Output Delta File without invisible elements: files/" + fileName + "_delta.pdf");
TextExtractor extractor = new TextExtractor();
PDFDoc pdfDoc; try (var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
try(var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) { String[] text = extractAllTextFromDocument(fileStream).split("\n");
pdfDoc = new PDFDoc(fileStream);
}
PageIterator iterator = pdfDoc.getPageIterator();
while (iterator.hasNext()) {
Page page = iterator.next();
extractor.begin(page);
String[] text = extractor.getAsText().split("\n");
assertThat(text).contains("APPENDIX 16 Pathology Report", "Amendment 1", "Page 255 of 260"); assertThat(text).contains("APPENDIX 16 Pathology Report", "Amendment 1", "Page 255 of 260");
} }
} }

View File

@ -0,0 +1,86 @@
package com.iqser.red.service.ocr.v1.server.utils;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.*;
import lombok.SneakyThrows;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
/**
 * Static PDFTron helpers shared by the OCR service tests: text extraction and debug drawing.
 *
 * <p>Every PDFNet object created here ({@code PDFDoc}, {@code TextExtractor}, {@code ElementBuilder},
 * {@code ColorPt}) is released in a {@code finally} block, so it is freed even when a PDFNet call fails.
 */
public class PdfUtils {

    /**
     * Extracts the text of every page of a PDF and joins the per-page texts with a newline.
     *
     * @param fileStream stream the PDF is read from; this method does not close it explicitly
     * @return the text of all pages in page order, separated by {@code "\n"}
     * @throws IOException     if reading the stream fails
     * @throws PDFNetException if PDFNet fails while opening the document or extracting text
     */
    public static String extractAllTextFromDocument(InputStream fileStream) throws IOException, PDFNetException {

        PDFDoc pdfDoc = new PDFDoc(fileStream);
        try {
            TextExtractor extractor = new TextExtractor();
            try {
                List<String> texts = new ArrayList<>();
                PageIterator iterator = pdfDoc.getPageIterator();
                while (iterator.hasNext()) {
                    Page page = iterator.next();
                    extractor.begin(page);
                    texts.add(extractor.getAsText());
                }
                return String.join("\n", texts);
            } finally {
                // Release the extractor even if a page could not be processed.
                extractor.destroy();
            }
        } finally {
            pdfDoc.close();
        }
    }


    /**
     * Draws a grid of 15x15 cells over the whole page as a visual coordinate aid.
     *
     * <p>The cell at row 0 / column 0 is stroked and filled in blue (fill opacity 0.8); every other
     * cell is only stroked in dark grey. All strokes use line width 1 and stroke opacity 0.1.
     *
     * @param writer element writer that has already been begun on {@code page}
     * @param page   page whose width and height determine the number of grid cells
     */
    @SneakyThrows
    public static void drawGrid(ElementWriter writer, Page page) {

        double dX = 15;
        double dY = 15;
        // Read the page size before allocating PDFNet objects so a failure here leaks nothing.
        int nRows = (int) (page.getPageHeight() / dY) + 1;
        int nCols = (int) (page.getPageWidth() / dX) + 1;

        ElementBuilder eb = new ElementBuilder();
        // Two shared colors instead of a fresh, never-destroyed ColorPt per cell.
        ColorPt originColor = new ColorPt(0, 0, 1);
        ColorPt gridColor = new ColorPt(0.1, 0.1, 0.1);
        try {
            for (int row = 0; row < nRows; ++row) {
                for (int col = 0; col < nCols; ++col) {
                    Element cell = eb.createRect(col * dX, row * dY, dX, dY);
                    cell.setPathStroke(true);
                    cell.getGState().setLineWidth(1);
                    cell.getGState().setStrokeOpacity(0.1);
                    cell.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
                    if (row == 0 && col == 0) {
                        // Highlight the first cell so the grid's starting corner is easy to spot.
                        cell.getGState().setStrokeColor(originColor);
                        cell.setPathFill(true);
                        cell.getGState().setFillOpacity(0.8);
                        cell.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
                        cell.getGState().setFillColor(originColor);
                    } else {
                        cell.setPathFill(false);
                        cell.getGState().setStrokeColor(gridColor);
                    }
                    writer.writePlacedElement(cell);
                }
            }
        } finally {
            gridColor.destroy();
            originColor.destroy();
            eb.destroy();
        }
    }


    /**
     * Draws every rectangle of the collection in red: stroked with line width 5 and filled with
     * fill opacity 0.5.
     *
     * @param writer         element writer that has already been begun on the target page
     * @param rectCollection rectangles to draw; an empty collection draws nothing
     */
    @SneakyThrows
    public static void drawRectCollection(ElementWriter writer, RectCollection rectCollection) {

        ColorPt colorPt = new ColorPt(1, 0, 0);
        ElementBuilder eb = new ElementBuilder();
        try {
            for (int i = 0; i < rectCollection.getNumRects(); ++i) {
                Rect r = rectCollection.getRectAt(i);
                Element rect = eb.createRect(r.getX1(), r.getY1(), r.getWidth(), r.getHeight());

                rect.setPathStroke(true);
                rect.getGState().setLineWidth(5);
                rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
                rect.getGState().setStrokeColor(colorPt);

                rect.setPathFill(true);
                rect.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
                rect.getGState().setFillColor(colorPt);
                rect.getGState().setFillOpacity(0.5);

                writer.writePlacedElement(rect);
            }
        } finally {
            // Release the PDFNet objects even if drawing one of the rectangles failed.
            colorPt.destroy();
            eb.destroy();
        }
    }

}