diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java index a9da1a1..e64c1fd 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java @@ -77,6 +77,7 @@ public class InvisibleElementRemovalService { visitedXObjIds.add(page.getSDFObj().getObjNum()); + InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder() .reader(reader) .clippingPathStack(new ClippingPathStack(page.getMediaBox())) @@ -91,8 +92,6 @@ public class InvisibleElementRemovalService { context.visitedXObjIds().clear(); removeOverlappedElements(page, writer, context); - reader.end(); - writer.end(); } try { @@ -227,6 +226,7 @@ public class InvisibleElementRemovalService { formWriter.setDefaultGState(context.reader()); processElements(formWriter, context); + formWriter.end(); formWriter.destroy(); context.reader().end(); } @@ -364,6 +364,7 @@ public class InvisibleElementRemovalService { formWriter.setDefaultGState(context.reader()); processOverlappedElements(formWriter, context); + formWriter.end(); formWriter.destroy(); context.reader().end(); } diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java index 5fd671d..01c396e 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java @@ 
-1,23 +1,35 @@ package com.iqser.red.service.ocr.v1.server.service; +import static java.lang.String.format; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.HashMap; +import java.util.Map; + +import org.springframework.amqp.rabbit.core.RabbitTemplate; +import org.springframework.stereotype.Service; + import com.fasterxml.jackson.databind.ObjectMapper; import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse; import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration; import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings; import com.pdftron.common.PDFNetException; -import com.pdftron.pdf.*; +import com.pdftron.pdf.OCRModule; +import com.pdftron.pdf.OCROptions; +import com.pdftron.pdf.Optimizer; +import com.pdftron.pdf.PDFDoc; +import com.pdftron.pdf.Page; +import com.pdftron.pdf.RectCollection; import com.pdftron.sdf.SDFDoc; + import io.micrometer.core.annotation.Timed; import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; -import org.springframework.amqp.rabbit.core.RabbitTemplate; -import org.springframework.stereotype.Service; - -import java.io.*; -import java.util.Map; - -import static java.lang.String.format; @Slf4j @Service @@ -84,11 +96,15 @@ public class OCRService { // So, we need to remove pages without images. // Furthermore, creating a new document is *much* faster than reusing the same document and adding/removing pages one by one. // Therefore, we create a new Document with a single page for every page that contains an image. + // For some documents, inserting the OCRed singlePagePdfDoc into the original PDFDoc inside the loop makes the RAM usage increase exponentially with every page (the cause is unknown). + // This is why we replace the OCRed pages after the main loop instead. 
int numProcessedPages = 0; + Map pageIdToSinglePagePdfDoc = new HashMap<>(); for (Integer pageId : pageIdToRectCollection.keySet()) { try { PDFDoc singlePagePdfDoc = extractSinglePagePdfDoc(pdfDoc, pageId); processOcr(pageIdToRectCollection, pageId, singlePagePdfDoc); + pageIdToSinglePagePdfDoc.put(pageId, singlePagePdfDoc); ++numProcessedPages; log.info("{}/{} Page {} done, OCR regions {}", @@ -97,9 +113,6 @@ public class OCRService { pageId, getAllOcrTextZonesAsString(pageIdToRectCollection, pageId)); - replaceOriginalPageWithOcrPage(pdfDoc, pageId, singlePagePdfDoc); - - singlePagePdfDoc.close(); rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE, objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder() @@ -113,6 +126,9 @@ public class OCRService { throw new RuntimeException(e); } } + log.info("Copying {} OCRed Pages into original Document", pageIdToSinglePagePdfDoc.size()); + pageIdToSinglePagePdfDoc.forEach((pageId, singlePagePdfDoc) -> replaceOriginalPageWithOcrPage(pdfDoc, pageId, singlePagePdfDoc)); + Optimizer.optimize(pdfDoc); rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE, objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder() @@ -122,7 +138,6 @@ public class OCRService { .ocrFinished(true) .build())); - Optimizer.optimize(pdfDoc); try { pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null); } catch (Exception e) { @@ -154,11 +169,13 @@ public class OCRService { } - private static void replaceOriginalPageWithOcrPage(PDFDoc pdfDoc, Integer pageId, PDFDoc ocrPageDoc) throws PDFNetException { + @SneakyThrows + private static void replaceOriginalPageWithOcrPage(PDFDoc pdfDoc, Integer pageIndex, PDFDoc singlePagePdfDoc) { - Page ocrPage = ocrPageDoc.getPage(1); - pdfDoc.pageInsert(pdfDoc.getPageIterator(pageId), ocrPage); - pdfDoc.pageRemove(pdfDoc.getPageIterator(pageId + 1)); + Page ocrPage = singlePagePdfDoc.getPage(1); + 
pdfDoc.pageInsert(pdfDoc.getPageIterator(pageIndex), ocrPage); + pdfDoc.pageRemove(pdfDoc.getPageIterator(pageIndex + 1)); + singlePagePdfDoc.close(); }