From 742725834933ad74ad582366b2b62015524bedb3 Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Wed, 22 Feb 2023 14:51:40 +0100 Subject: [PATCH] RED-6280: Performance Test Issue with OCR-Service *removed init/terminate calls again *manual memory cleanup at every opportunity --- .../server/initializer/PDFNetInitializer.java | 8 +- .../ImagePositionRetrievalService.java | 5 +- .../InvisibleElementRemovalService.java | 23 ++- .../ocr/v1/server/service/OCRService.java | 1 + .../v1/server/service/OcrMessageReceiver.java | 25 ++-- .../service/ocr/v1/server/AbstractTest.java | 31 ++-- .../v1/server/OcrServiceIntegrationTest.java | 61 +++----- .../ImagePositionRetrievalServiceTest.java | 140 ++++++------------ .../InvisibleElementRemovalServiceTest.java | 43 ++---- .../service/ocr/v1/server/utils/PdfDraw.java | 74 +++++++++ .../v1/server/utils/PdfTextExtraction.java | 35 +++++ 11 files changed, 234 insertions(+), 212 deletions(-) create mode 100644 ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/utils/PdfDraw.java create mode 100644 ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/utils/PdfTextExtraction.java diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/initializer/PDFNetInitializer.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/initializer/PDFNetInitializer.java index a432800..9fa6654 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/initializer/PDFNetInitializer.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/initializer/PDFNetInitializer.java @@ -6,6 +6,8 @@ import lombok.SneakyThrows; import org.springframework.beans.factory.annotation.Value; import org.springframework.stereotype.Component; +import javax.annotation.PostConstruct; + @Component @RequiredArgsConstructor public class PDFNetInitializer { @@ -18,14 +20,12 @@ public class PDFNetInitializer { @SneakyThrows + @PostConstruct // Do not change back to application runner, if it is application runner it takes messages from the queue before PDFNet is initialized, that leads to UnsatisfiedLinkError. public void init() { - PDFNet.initialize(pdftronLicense); PDFNet.setTempPath("/tmp/pdftron"); PDFNet.addResourceSearchPath(ocrModulePath); - + PDFNet.initialize(pdftronLicense); } - - } diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/ImagePositionRetrievalService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/ImagePositionRetrievalService.java index 3dbbb2b..fe88142 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/ImagePositionRetrievalService.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/ImagePositionRetrievalService.java @@ -47,16 +47,17 @@ public class ImagePositionRetrievalService { ElementReader reader = new ElementReader(); for (int pageId = 1; pageId <= pdfDoc.getPageCount(); ++pageId) { RectCollection imagePositions = new RectCollection(); - reader.begin(pdfDoc.getPage(pageId)); + reader.begin(pdfDoc.getPage(pageId)); findImagePositionsOnPage(reader, imagePositions, pdfDoc.getPage(pageId), mirrorY); imagePositions = mergeOverlappingRects(imagePositions); - reader.end(); + if (imagePositions.getNumRects() > 0) { pageIdToImagePositions.put(pageId, imagePositions); } } + reader.destroy(); return pageIdToImagePositions; } diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java index c747a9d..a9da1a1 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java @@ -77,7 +77,6 @@ public class InvisibleElementRemovalService { visitedXObjIds.add(page.getSDFObj().getObjNum()); - InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder() .reader(reader) .clippingPathStack(new ClippingPathStack(page.getMediaBox())) @@ -92,8 +91,20 @@ public class InvisibleElementRemovalService { context.visitedXObjIds().clear(); removeOverlappedElements(page, writer, context); + reader.end(); + writer.end(); } - pdfDoc.save(out, SDFDoc.SaveMode.REMOVE_UNUSED, null); + + try { + pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null); + } catch (Exception e) { + log.error("File could not be saved after invisible element removal"); + throw new RuntimeException(e); + } + + writer.destroy(); + reader.destroy(); + pdfDoc.close(); } @@ -216,13 +227,14 @@ public class InvisibleElementRemovalService { formWriter.setDefaultGState(context.reader()); processElements(formWriter, context); - formWriter.end(); + formWriter.destroy(); context.reader().end(); } } private void processPath(Element pathElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { + PathData pathData = pathElement.getPathData(); if (pathData.getOperators().length == 0 && pathData.getPoints().length == 0) { @@ -352,7 +364,7 @@ public class InvisibleElementRemovalService { formWriter.setDefaultGState(context.reader()); processOverlappedElements(formWriter, context); - formWriter.end(); + formWriter.destroy(); context.reader().end(); } } @@ -427,6 +439,9 @@ public class InvisibleElementRemovalService { rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB()); rect.getGState().setStrokeColor(colorPt); writer.writePlacedElement(rect); + + colorPt.destroy(); + eb.destroy(); } diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java index a6d7d8e..5fd671d 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java @@ -98,6 +98,7 @@ public class OCRService { getAllOcrTextZonesAsString(pageIdToRectCollection, pageId)); replaceOriginalPageWithOcrPage(pdfDoc, pageId, singlePagePdfDoc); + singlePagePdfDoc.close(); rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE, diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OcrMessageReceiver.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OcrMessageReceiver.java index 6452d50..3500929 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OcrMessageReceiver.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OcrMessageReceiver.java @@ -1,26 +1,23 @@ package com.iqser.red.service.ocr.v1.server.service; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.IOException; - -import org.springframework.amqp.AmqpRejectAndDontRequeueException; -import org.springframework.amqp.rabbit.annotation.RabbitHandler; -import org.springframework.amqp.rabbit.annotation.RabbitListener; -import org.springframework.http.HttpStatus; -import org.springframework.stereotype.Service; - import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.iqser.red.service.ocr.v1.api.model.DocumentRequest; import com.iqser.red.service.ocr.v1.server.client.FileStatusProcessingUpdateClient; import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration; import com.iqser.red.service.ocr.v1.server.initializer.PDFNetInitializer; -import com.pdftron.pdf.PDFNet; - import feign.FeignException; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; +import org.springframework.amqp.AmqpRejectAndDontRequeueException; +import org.springframework.amqp.rabbit.annotation.RabbitHandler; +import org.springframework.amqp.rabbit.annotation.RabbitListener; +import org.springframework.http.HttpStatus; +import org.springframework.stereotype.Service; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; @Slf4j @Service @@ -30,7 +27,6 @@ public class OcrMessageReceiver { private final ObjectMapper objectMapper; private final FileStorageService fileStorageService; private final FileStatusProcessingUpdateClient fileStatusProcessingUpdateClient; - private final PDFNetInitializer pdfNetInitializer; private final OCRService ocrService; @@ -39,7 +35,6 @@ public class OcrMessageReceiver { @RabbitListener(queues = MessagingConfiguration.OCR_QUEUE, concurrency = "1") public void receiveOcr(String in) throws JsonProcessingException { - pdfNetInitializer.init(); DocumentRequest ocrRequestMessage = objectMapper.readValue(in, DocumentRequest.class); log.info("Start ocr for file with dossierId {} and fileId {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId()); @@ -62,8 +57,6 @@ public class OcrMessageReceiver { } fileStatusProcessingUpdateClient.ocrSuccessful(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId()); - PDFNet.terminate(); - } diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/AbstractTest.java b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/AbstractTest.java index 46e684c..61a23a2 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/AbstractTest.java +++ b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/AbstractTest.java @@ -1,24 +1,28 @@ package com.iqser.red.service.ocr.v1.server; -import com.iqser.red.service.ocr.v1.server.initializer.PDFNetInitializer; -import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService; -import com.iqser.red.storage.commons.StorageAutoConfiguration; -import com.iqser.red.storage.commons.service.StorageService; -import com.pdftron.pdf.PDFNet; -import lombok.SneakyThrows; -import org.junit.jupiter.api.*; +import static org.assertj.core.api.Assertions.assertThat; + +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.extension.ExtendWith; +import org.springframework.amqp.rabbit.core.RabbitTemplate; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.autoconfigure.EnableAutoConfiguration; import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration; import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.boot.test.mock.mockito.MockBean; import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Configuration; import org.springframework.context.annotation.Import; import org.springframework.context.annotation.Primary; import org.springframework.test.context.junit.jupiter.SpringExtension; -import static org.assertj.core.api.Assertions.assertThat; +import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService; +import com.iqser.red.storage.commons.StorageAutoConfiguration; +import com.iqser.red.storage.commons.service.StorageService; +import com.pdftron.pdf.PDFNet; + +import lombok.SneakyThrows; @ExtendWith(SpringExtension.class) @SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT) @@ -28,19 +32,14 @@ public class AbstractTest { @Autowired protected StorageService storageService; - @Autowired - private PDFNetInitializer pdfNetInitializer; + @MockBean + protected RabbitTemplate rabbitTemplate; - @BeforeEach - @SneakyThrows - @Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top. - public void initPDFNet() { - pdfNetInitializer.init(); - } @AfterAll public static void terminatePDFNet() { PDFNet.terminate(); + System.out.println("PDFNet Terminated"); } diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java index afef495..2c5b1de 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java +++ b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java @@ -1,33 +1,29 @@ package com.iqser.red.service.ocr.v1.server; +import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory; +import static com.iqser.red.service.ocr.v1.server.utils.PdfTextExtraction.extractAllTextFromDocument; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.util.concurrent.TimeUnit; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.core.io.ClassPathResource; + import com.fasterxml.jackson.databind.ObjectMapper; import com.iqser.red.service.ocr.v1.server.service.FileStorageService; import com.iqser.red.service.ocr.v1.server.service.OCRService; import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType; -import com.pdftron.common.PDFNetException; -import com.pdftron.pdf.*; +import com.pdftron.pdf.OCRModule; + import io.micrometer.prometheus.PrometheusMeterRegistry; import io.micrometer.prometheus.PrometheusTimer; import lombok.SneakyThrows; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; -import org.springframework.amqp.rabbit.core.RabbitTemplate; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.boot.test.context.SpringBootTest; -import org.springframework.boot.test.mock.mockito.MockBean; -import org.springframework.core.io.ClassPathResource; - -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.util.ArrayList; -import java.util.List; -import java.util.concurrent.TimeUnit; - -import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory; -import static org.assertj.core.api.Assertions.assertThat; @SpringBootTest(properties = {"pdftron.ocrmodule.path=/YourOCRModulePath"}) public class OcrServiceIntegrationTest extends AbstractTest { @@ -35,9 +31,6 @@ public class OcrServiceIntegrationTest extends AbstractTest { @Autowired protected ObjectMapper objectMapper; - @MockBean - protected RabbitTemplate rabbitTemplate; - @Autowired private OCRService ocrService; @@ -139,25 +132,5 @@ public class OcrServiceIntegrationTest extends AbstractTest { try (var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) { return extractAllTextFromDocument(fileStream); } - - } - - private static String extractAllTextFromDocument(InputStream fileStream) throws IOException, PDFNetException { - - PDFDoc pdfDoc = new PDFDoc(fileStream); - TextExtractor extractor = new TextExtractor(); - List texts = new ArrayList<>(); - - PageIterator iterator = pdfDoc.getPageIterator(); - while (iterator.hasNext()) { - Page page = iterator.next(); - extractor.begin(page); - texts.add(extractor.getAsText()); - } - - return String.join("\n", texts); - } - - } diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/ImagePositionRetrievalServiceTest.java b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/ImagePositionRetrievalServiceTest.java index 698e5de..41555e7 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/ImagePositionRetrievalServiceTest.java +++ b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/ImagePositionRetrievalServiceTest.java @@ -1,15 +1,9 @@ package com.iqser.red.service.ocr.v1.server.service; -import com.iqser.red.service.ocr.v1.server.AbstractTest; -import com.pdftron.common.PDFNetException; -import com.pdftron.pdf.*; -import com.pdftron.sdf.SDFDoc; -import lombok.SneakyThrows; -import org.junit.jupiter.api.Test; -import org.springframework.amqp.rabbit.core.RabbitTemplate; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.boot.test.mock.mockito.MockBean; -import org.springframework.core.io.ClassPathResource; +import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory; +import static com.iqser.red.service.ocr.v1.server.utils.PdfDraw.drawGrid; +import static com.iqser.red.service.ocr.v1.server.utils.PdfDraw.drawRectCollection; +import static org.assertj.core.api.Assertions.assertThat; import java.io.FileInputStream; import java.io.FileOutputStream; @@ -20,8 +14,18 @@ import java.util.List; import java.util.Map; import java.util.stream.Collectors; -import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory; -import static org.assertj.core.api.Assertions.assertThat; +import org.junit.jupiter.api.Test; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.core.io.ClassPathResource; + +import com.iqser.red.service.ocr.v1.server.AbstractTest; +import com.pdftron.common.PDFNetException; +import com.pdftron.pdf.ElementWriter; +import com.pdftron.pdf.PDFDoc; +import com.pdftron.pdf.RectCollection; +import com.pdftron.sdf.SDFDoc; + +import lombok.SneakyThrows; class ImagePositionRetrievalServiceTest extends AbstractTest { @@ -29,10 +33,6 @@ class ImagePositionRetrievalServiceTest extends AbstractTest { @Autowired private ImagePositionRetrievalService imagePositionRetrievalService; - @MockBean - protected RabbitTemplate rabbitTemplate; - - @Test @SneakyThrows public void testImagePositionRetrievalForRotateTestFileWithImages() { @@ -116,37 +116,38 @@ class ImagePositionRetrievalServiceTest extends AbstractTest { private List testImagePositionDetection(String fileName) throws IOException, PDFNetException { - InputStream fileStream = new FileInputStream(new ClassPathResource("files/" + fileName + ".pdf").getFile().getAbsolutePath()); - PDFDoc pdfDoc = new PDFDoc(fileStream); + try (InputStream fileStream = new FileInputStream(new ClassPathResource("files/" + fileName + ".pdf").getFile().getAbsolutePath())) { + PDFDoc pdfDoc = new PDFDoc(fileStream); - Map pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, false); + Map pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, false); - ElementWriter writer = new ElementWriter(); - pageIdToRectCollection.forEach((pageId, rectCollection) -> { - try { - writer.begin(pdfDoc.getPage(pageId)); - drawRectCollection(writer, rectCollection, pdfDoc.getPage(pageId)); - drawGrid(writer, pdfDoc.getPage(pageId)); - writer.end(); - StringBuilder zonesString = new StringBuilder(); - for (int j = 0; j < rectCollection.getNumRects(); ++j) { - var r = rectCollection.getRectAt(j); - zonesString.append(String.format("%d:[lower left (%.1f|%.1f) upper right (%.1f|%.1f)]; ", j, r.getX1(), r.getY1(), r.getX2(), r.getY2())); + ElementWriter writer = new ElementWriter(); + pageIdToRectCollection.forEach((pageId, rectCollection) -> { + try { + writer.begin(pdfDoc.getPage(pageId)); + drawRectCollection(writer, rectCollection); + drawGrid(writer, pdfDoc.getPage(pageId)); + writer.end(); + StringBuilder zonesString = new StringBuilder(); + for (int j = 0; j < rectCollection.getNumRects(); ++j) { + var r = rectCollection.getRectAt(j); + zonesString.append(String.format("%d:[lower left (%.1f|%.1f) upper right (%.1f|%.1f)]; ", j, r.getX1(), r.getY1(), r.getX2(), r.getY2())); + } + System.out.printf("Page %d: rotation: %d OCR regions %s \n", pageId, pdfDoc.getPage(pageId).getRotation(), zonesString); + } catch (PDFNetException e) { + throw new RuntimeException(e); } - System.out.printf("Page %d: rotation: %d OCR regions %s \n", pageId, pdfDoc.getPage(pageId).getRotation(), zonesString); - } catch (PDFNetException e) { - throw new RuntimeException(e); - } - }); + }); - // Check visually for red Rectangles to match images in the saved pdf file - try (var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + "_IMAGE_BBOX.pdf")) { - out.write(pdfDoc.save(SDFDoc.SaveMode.LINEARIZED, null)); + // Check visually for red Rectangles to match images in the saved pdf file + try (var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + "_IMAGE_BBOX.pdf")) { + out.write(pdfDoc.save(SDFDoc.SaveMode.LINEARIZED, null)); + } + pdfDoc.close(); + System.out.println("result file: " + getTemporaryDirectory() + "/" + fileName + "_IMAGE_BBOX.pdf"); + // round all coords to nearest int to account for inconsistencies with the calculation of the bounding box + return pageIdToRectCollection.values().stream().map(this::toRoundedCoordinateArrayList).flatMap(List::stream).collect(Collectors.toList()); } - System.out.println("result file: " + getTemporaryDirectory() + "/" + fileName + "_IMAGE_BBOX.pdf"); - fileStream.close(); - // round all coords to nearest int to account for inconsistencies with the calculation of the bounding box - return pageIdToRectCollection.values().stream().map(this::toRoundedCoordinateArrayList).flatMap(List::stream).collect(Collectors.toList()); } @@ -161,59 +162,4 @@ class ImagePositionRetrievalServiceTest extends AbstractTest { return coords; } - - @SneakyThrows - private void drawGrid(ElementWriter writer, Page page) { - - ElementBuilder eb = new ElementBuilder(); - double dX = 15; - double dY = 15; - int nRows = (int) (page.getPageHeight() / dY) + 1; - int nCols = (int) (page.getPageWidth() / dX) + 1; - for (int row = 0; row < nRows; ++row) { - for (int col = 0; col < nCols; ++col) { - Element cell = eb.createRect(col * dX, row * dY, dX, dY); - cell.setPathStroke(true); - cell.getGState().setLineWidth(1); - cell.getGState().setStrokeOpacity(0.1); - cell.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB()); - if (row == 0 && col == 0) { - cell.getGState().setStrokeColor(new ColorPt(0, 0, 1)); - cell.setPathFill(true); - cell.getGState().setFillOpacity(0.8); - cell.getGState().setFillColorSpace(ColorSpace.createDeviceRGB()); - cell.getGState().setFillColor(new ColorPt(0, 0, 1)); - } else { - cell.setPathFill(false); - cell.getGState().setStrokeColor(new ColorPt(0.1, 0.1, 0.1)); - } - writer.writePlacedElement(cell); - } - } - } - - - @SneakyThrows - public static void drawRectCollection(ElementWriter writer, RectCollection rectCollection, Page page) { - - ColorPt colorPt = new ColorPt(1, 0, 0); - ElementBuilder eb = new ElementBuilder(); - for (int i = 0; i < rectCollection.getNumRects(); ++i) { - Rect r = rectCollection.getRectAt(i); - Element rect = eb.createRect(r.getX1(), r.getY1(), r.getWidth(), r.getHeight()); - - rect.setPathStroke(true); - rect.getGState().setLineWidth(5); - rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB()); - rect.getGState().setStrokeColor(colorPt); - - rect.setPathFill(true); - rect.getGState().setFillColorSpace(ColorSpace.createDeviceRGB()); - rect.getGState().setFillColor(colorPt); - rect.getGState().setFillOpacity(0.5); - - writer.writePlacedElement(rect); - } - } - } \ No newline at end of file diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalServiceTest.java b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalServiceTest.java index af2ecb2..ce6e3a1 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalServiceTest.java +++ b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalServiceTest.java @@ -1,31 +1,25 @@ package com.iqser.red.service.ocr.v1.server.service; -import com.iqser.red.service.ocr.v1.server.AbstractTest; -import com.pdftron.pdf.PDFDoc; -import com.pdftron.pdf.Page; -import com.pdftron.pdf.PageIterator; -import com.pdftron.pdf.TextExtractor; -import lombok.SneakyThrows; -import org.junit.jupiter.api.Test; -import org.springframework.amqp.rabbit.core.RabbitTemplate; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.boot.test.mock.mockito.MockBean; -import org.springframework.core.io.ClassPathResource; +import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory; +import static com.iqser.red.service.ocr.v1.server.utils.PdfTextExtraction.extractAllTextFromDocument; +import static org.assertj.core.api.AssertionsForClassTypes.assertThat; import java.io.FileInputStream; import java.io.FileOutputStream; -import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory; -import static org.assertj.core.api.AssertionsForClassTypes.assertThat; +import org.junit.jupiter.api.Test; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.core.io.ClassPathResource; + +import com.iqser.red.service.ocr.v1.server.AbstractTest; + +import lombok.SneakyThrows; public class InvisibleElementRemovalServiceTest extends AbstractTest { @Autowired private InvisibleElementRemovalService invisibleElementRemovalService; - @MockBean - protected RabbitTemplate rabbitTemplate; - @Test @SneakyThrows @@ -36,27 +30,18 @@ public class InvisibleElementRemovalServiceTest extends AbstractTest { ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf"); try (var initialFileStream = pdfFileResource.getInputStream(); var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) { - invisibleElementRemovalService.removeInvisibleElements(initialFileStream, out,false); + invisibleElementRemovalService.removeInvisibleElements(initialFileStream, out, false); } try (var initialFileStream = pdfFileResource.getInputStream(); var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + "_delta.pdf")) { - invisibleElementRemovalService.removeInvisibleElements(initialFileStream, out,true); + invisibleElementRemovalService.removeInvisibleElements(initialFileStream, out, true); } System.out.println("Output File without invisible elements: files/" + fileName + ".pdf"); System.out.println("Output Delta File without invisible elements: files/" + fileName + "_delta.pdf"); - TextExtractor extractor = new TextExtractor(); - PDFDoc pdfDoc; - try(var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) { - pdfDoc = new PDFDoc(fileStream); - } - - PageIterator iterator = pdfDoc.getPageIterator(); - while (iterator.hasNext()) { - Page page = iterator.next(); - extractor.begin(page); - String[] text = extractor.getAsText().split("\n"); + try (var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) { + String[] text = extractAllTextFromDocument(fileStream).split("\n"); assertThat(text).contains("APPENDIX 16 Pathology Report", "Amendment 1", "Page 255 of 260"); } } diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/utils/PdfDraw.java b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/utils/PdfDraw.java new file mode 100644 index 0000000..bea3663 --- /dev/null +++ b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/utils/PdfDraw.java @@ -0,0 +1,74 @@ +package com.iqser.red.service.ocr.v1.server.utils; + +import com.pdftron.pdf.ColorPt; +import com.pdftron.pdf.ColorSpace; +import com.pdftron.pdf.Element; +import com.pdftron.pdf.ElementBuilder; +import com.pdftron.pdf.ElementWriter; +import com.pdftron.pdf.Page; +import com.pdftron.pdf.Rect; +import com.pdftron.pdf.RectCollection; + +import lombok.SneakyThrows; + + +public class PdfDraw { + + @SneakyThrows + public static void drawGrid(ElementWriter writer, Page page) { + + ElementBuilder eb = new ElementBuilder(); + double dX = 15; + double dY = 15; + int nRows = (int) (page.getPageHeight() / dY) + 1; + int nCols = (int) (page.getPageWidth() / dX) + 1; + for (int row = 0; row < nRows; ++row) { + for (int col = 0; col < nCols; ++col) { + Element cell = eb.createRect(col * dX, row * dY, dX, dY); + cell.setPathStroke(true); + cell.getGState().setLineWidth(1); + cell.getGState().setStrokeOpacity(0.1); + cell.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB()); + if (row == 0 && col == 0) { + cell.getGState().setStrokeColor(new ColorPt(0, 0, 1)); + cell.setPathFill(true); + cell.getGState().setFillOpacity(0.8); + cell.getGState().setFillColorSpace(ColorSpace.createDeviceRGB()); + cell.getGState().setFillColor(new ColorPt(0, 0, 1)); + } else { + cell.setPathFill(false); + cell.getGState().setStrokeColor(new ColorPt(0.1, 0.1, 0.1)); + } + writer.writePlacedElement(cell); + } + } + eb.destroy(); + } + + + @SneakyThrows + public static void drawRectCollection(ElementWriter writer, RectCollection rectCollection) { + + ColorPt colorPt = new ColorPt(1, 0, 0); + ElementBuilder eb = new ElementBuilder(); + for (int i = 0; i < rectCollection.getNumRects(); ++i) { + Rect r = rectCollection.getRectAt(i); + Element rect = eb.createRect(r.getX1(), r.getY1(), r.getWidth(), r.getHeight()); + + rect.setPathStroke(true); + rect.getGState().setLineWidth(5); + rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB()); + rect.getGState().setStrokeColor(colorPt); + + rect.setPathFill(true); + rect.getGState().setFillColorSpace(ColorSpace.createDeviceRGB()); + rect.getGState().setFillColor(colorPt); + rect.getGState().setFillOpacity(0.5); + + writer.writePlacedElement(rect); + } + colorPt.destroy(); + eb.destroy(); + } + +} diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/utils/PdfTextExtraction.java b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/utils/PdfTextExtraction.java new file mode 100644 index 0000000..c3f195d --- /dev/null +++ b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/utils/PdfTextExtraction.java @@ -0,0 +1,35 @@ +package com.iqser.red.service.ocr.v1.server.utils; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.List; + +import com.pdftron.common.PDFNetException; +import com.pdftron.pdf.PDFDoc; +import com.pdftron.pdf.Page; +import com.pdftron.pdf.PageIterator; +import com.pdftron.pdf.TextExtractor; + + +public class PdfTextExtraction { + + public static String extractAllTextFromDocument(InputStream fileStream) throws IOException, PDFNetException { + + PDFDoc pdfDoc = new PDFDoc(fileStream); + TextExtractor extractor = new TextExtractor(); + List texts = new ArrayList<>(); + + PageIterator iterator = pdfDoc.getPageIterator(); + while (iterator.hasNext()) { + Page page = iterator.next(); + extractor.begin(page); + texts.add(extractor.getAsText()); + } + + extractor.destroy(); + pdfDoc.close(); + return String.join("\n", texts); + } + +}