diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/initializer/PDFNetInitializer.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/initializer/PDFNetInitializer.java index babf7b3..2ad2450 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/initializer/PDFNetInitializer.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/initializer/PDFNetInitializer.java @@ -1,14 +1,12 @@ package com.iqser.red.service.ocr.v1.server.initializer; -import javax.annotation.PostConstruct; - +import com.pdftron.pdf.PDFNet; +import lombok.RequiredArgsConstructor; +import lombok.SneakyThrows; import org.springframework.beans.factory.annotation.Value; import org.springframework.stereotype.Component; -import com.pdftron.pdf.PDFNet; - -import lombok.RequiredArgsConstructor; -import lombok.SneakyThrows; +import javax.annotation.PostConstruct; @Component @RequiredArgsConstructor @@ -22,7 +20,6 @@ public class PDFNetInitializer { @SneakyThrows - @PostConstruct // Do not change back to application runner, if it is application runner it takes messages from the queue before PDFNet is initialized, that leads to UnsatisfiedLinkError. public void init() { @@ -32,4 +29,5 @@ public class PDFNetInitializer { } + } diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java index c1cb635..a6d7d8e 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java @@ -1,34 +1,23 @@ package com.iqser.red.service.ocr.v1.server.service; -import static java.lang.String.format; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.util.Map; - -import org.springframework.amqp.rabbit.core.RabbitTemplate; -import org.springframework.stereotype.Service; - import com.fasterxml.jackson.databind.ObjectMapper; import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse; import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration; import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings; import com.pdftron.common.PDFNetException; -import com.pdftron.pdf.OCRModule; -import com.pdftron.pdf.OCROptions; -import com.pdftron.pdf.Optimizer; -import com.pdftron.pdf.PDFDoc; -import com.pdftron.pdf.Page; -import com.pdftron.pdf.RectCollection; +import com.pdftron.pdf.*; import com.pdftron.sdf.SDFDoc; - import io.micrometer.core.annotation.Timed; import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; +import org.springframework.amqp.rabbit.core.RabbitTemplate; +import org.springframework.stereotype.Service; + +import java.io.*; +import java.util.Map; + +import static java.lang.String.format; @Slf4j @Service @@ -67,17 +56,17 @@ public class OCRService { try (ByteArrayOutputStream transferOutputStream = new ByteArrayOutputStream()) { try (InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId)) { - long start = System.currentTimeMillis(); + long removalStart = System.currentTimeMillis(); log.debug("Start invisible element removal for file with dossierId {} and fileId {}", dossierId, fileId); invisibleElementRemovalService.removeInvisibleElements(fileStream, transferOutputStream, false); - long end = System.currentTimeMillis(); - log.debug("Invisible element removal successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, format("%.1f", (end - start) / 1000.0)); + long removalEnd = System.currentTimeMillis(); + log.debug("Invisible element removal successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, format("%.1f", (removalEnd - removalStart) / 1000.0)); } try (InputStream transferInputStream = new ByteArrayInputStream(transferOutputStream.toByteArray())) { - long start = System.currentTimeMillis(); + long ocrStart = System.currentTimeMillis(); runOcr(transferInputStream, out, fileId); - long end = System.currentTimeMillis(); - log.info("ocr successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, format("%.1f", (end - start) / 1000.0)); + long ocrEnd = System.currentTimeMillis(); + log.info("ocr successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, format("%.1f", (ocrEnd - ocrStart) / 1000.0)); } } } @@ -94,7 +83,7 @@ public class OCRService { // When a page does not have a TextZone, PDFTron whites out the page. But, PDFTron scans it anyway, resulting in a longer runtime. // So, we need to remove pages without images. // Furthermore, creating a new document is *much* faster than reusing the same document and adding/removing pages one by one. - // Therefore, we create a new Document with a single page for every page that contains text. + // Therefore, we create a new Document with a single page for every page that contains an image. int numProcessedPages = 0; for (Integer pageId : pageIdToRectCollection.keySet()) { try { @@ -139,6 +128,7 @@ public class OCRService { log.error("Processed File with fileId {} could not be saved", fileId); throw new RuntimeException(e); } + pdfDoc.close(); } diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OcrMessageReceiver.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OcrMessageReceiver.java index 1683ae1..6452d50 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OcrMessageReceiver.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OcrMessageReceiver.java @@ -15,6 +15,8 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.iqser.red.service.ocr.v1.api.model.DocumentRequest; import com.iqser.red.service.ocr.v1.server.client.FileStatusProcessingUpdateClient; import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration; +import com.iqser.red.service.ocr.v1.server.initializer.PDFNetInitializer; +import com.pdftron.pdf.PDFNet; import feign.FeignException; import lombok.RequiredArgsConstructor; @@ -28,6 +30,7 @@ public class OcrMessageReceiver { private final ObjectMapper objectMapper; private final FileStorageService fileStorageService; private final FileStatusProcessingUpdateClient fileStatusProcessingUpdateClient; + private final PDFNetInitializer pdfNetInitializer; private final OCRService ocrService; @@ -36,6 +39,7 @@ public class OcrMessageReceiver { @RabbitListener(queues = MessagingConfiguration.OCR_QUEUE, concurrency = "1") public void receiveOcr(String in) throws JsonProcessingException { + pdfNetInitializer.init(); DocumentRequest ocrRequestMessage = objectMapper.readValue(in, DocumentRequest.class); log.info("Start ocr for file with dossierId {} and fileId {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId()); @@ -58,6 +62,7 @@ public class OcrMessageReceiver { } fileStatusProcessingUpdateClient.ocrSuccessful(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId()); + PDFNet.terminate(); }