diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java index a9da1a1..e64c1fd 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java @@ -77,6 +77,7 @@ public class InvisibleElementRemovalService { visitedXObjIds.add(page.getSDFObj().getObjNum()); + InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder() .reader(reader) .clippingPathStack(new ClippingPathStack(page.getMediaBox())) @@ -91,8 +92,6 @@ public class InvisibleElementRemovalService { context.visitedXObjIds().clear(); removeOverlappedElements(page, writer, context); - reader.end(); - writer.end(); } try { @@ -227,6 +226,7 @@ public class InvisibleElementRemovalService { formWriter.setDefaultGState(context.reader()); processElements(formWriter, context); + formWriter.end(); formWriter.destroy(); context.reader().end(); } @@ -364,6 +364,7 @@ public class InvisibleElementRemovalService { formWriter.setDefaultGState(context.reader()); processOverlappedElements(formWriter, context); + formWriter.end(); formWriter.destroy(); context.reader().end(); } diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java index 5fd671d..2c5fe31 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java @@ 
-1,23 +1,35 @@ package com.iqser.red.service.ocr.v1.server.service; +import static java.lang.String.format; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.HashMap; +import java.util.Map; + +import org.springframework.amqp.rabbit.core.RabbitTemplate; +import org.springframework.stereotype.Service; + import com.fasterxml.jackson.databind.ObjectMapper; import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse; import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration; import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings; import com.pdftron.common.PDFNetException; -import com.pdftron.pdf.*; +import com.pdftron.pdf.OCRModule; +import com.pdftron.pdf.OCROptions; +import com.pdftron.pdf.Optimizer; +import com.pdftron.pdf.PDFDoc; +import com.pdftron.pdf.Page; +import com.pdftron.pdf.RectCollection; import com.pdftron.sdf.SDFDoc; + import io.micrometer.core.annotation.Timed; import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; -import org.springframework.amqp.rabbit.core.RabbitTemplate; -import org.springframework.stereotype.Service; - -import java.io.*; -import java.util.Map; - -import static java.lang.String.format; @Slf4j @Service @@ -60,7 +72,10 @@ public class OCRService { log.debug("Start invisible element removal for file with dossierId {} and fileId {}", dossierId, fileId); invisibleElementRemovalService.removeInvisibleElements(fileStream, transferOutputStream, false); long removalEnd = System.currentTimeMillis(); - log.debug("Invisible element removal successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, format("%.1f", (removalEnd - removalStart) / 1000.0)); + log.debug("Invisible element removal successful for file with dossierId {} and fileId {}, took {}s", + dossierId, + fileId, + format("%.1f", 
(removalEnd - removalStart) / 1000.0)); } try (InputStream transferInputStream = new ByteArrayInputStream(transferOutputStream.toByteArray())) { long ocrStart = System.currentTimeMillis(); @@ -78,18 +93,22 @@ public class OCRService { PDFDoc pdfDoc = new PDFDoc(fileStream); Map pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, true); + rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE, + objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder().fileId(fileId).numberOfPagesToOCR(pageIdToRectCollection.size()).build())); // Optimization: // When a page does not have a TextZone, PDFTron whites out the page. But, PDFTron scans it anyway, resulting in a longer runtime. // So, we need to remove pages without images. // Furthermore, creating a new document is *much* faster than reusing the same document and adding/removing pages one by one. // Therefore, we create a new Document with a single page for every page that contains an image. + // For some reason, if we insert the OCRed singlePagePdfDoc into the original PDFDoc inside the loop, the RAM usage increases exponentially with every page for some documents. + // This is why we replace the OCRed pages outside the main loop. 
int numProcessedPages = 0; + Map pageIdToSinglePagePdfDoc = new HashMap<>(); for (Integer pageId : pageIdToRectCollection.keySet()) { try { PDFDoc singlePagePdfDoc = extractSinglePagePdfDoc(pdfDoc, pageId); processOcr(pageIdToRectCollection, pageId, singlePagePdfDoc); - ++numProcessedPages; log.info("{}/{} Page {} done, OCR regions {}", numProcessedPages, @@ -97,9 +116,8 @@ public class OCRService { pageId, getAllOcrTextZonesAsString(pageIdToRectCollection, pageId)); - replaceOriginalPageWithOcrPage(pdfDoc, pageId, singlePagePdfDoc); - - singlePagePdfDoc.close(); + pageIdToSinglePagePdfDoc.put(pageId, singlePagePdfDoc); + ++numProcessedPages; rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE, objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder() @@ -113,6 +131,9 @@ public class OCRService { throw new RuntimeException(e); } } + log.info("Copying {} OCRed Pages into original Document", pageIdToSinglePagePdfDoc.size()); + pageIdToSinglePagePdfDoc.forEach((pageId, singlePagePdfDoc) -> replaceOriginalPageWithOcrPage(pdfDoc, pageId, singlePagePdfDoc)); + Optimizer.optimize(pdfDoc); rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE, objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder() @@ -122,7 +143,6 @@ public class OCRService { .ocrFinished(true) .build())); - Optimizer.optimize(pdfDoc); try { pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null); } catch (Exception e) { @@ -154,11 +174,13 @@ public class OCRService { } - private static void replaceOriginalPageWithOcrPage(PDFDoc pdfDoc, Integer pageId, PDFDoc ocrPageDoc) throws PDFNetException { + @SneakyThrows + private static void replaceOriginalPageWithOcrPage(PDFDoc pdfDoc, Integer pageId, PDFDoc singlePagePdfDoc) { - Page ocrPage = ocrPageDoc.getPage(1); + Page ocrPage = singlePagePdfDoc.getPage(1); pdfDoc.pageInsert(pdfDoc.getPageIterator(pageId), ocrPage); pdfDoc.pageRemove(pdfDoc.getPageIterator(pageId + 1)); + 
singlePagePdfDoc.close(); }