diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java index a9da1a1..e64c1fd 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java @@ -77,6 +77,7 @@ public class InvisibleElementRemovalService { visitedXObjIds.add(page.getSDFObj().getObjNum()); + InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder() .reader(reader) .clippingPathStack(new ClippingPathStack(page.getMediaBox())) @@ -91,8 +92,6 @@ public class InvisibleElementRemovalService { context.visitedXObjIds().clear(); removeOverlappedElements(page, writer, context); - reader.end(); - writer.end(); } try { @@ -227,6 +226,7 @@ public class InvisibleElementRemovalService { formWriter.setDefaultGState(context.reader()); processElements(formWriter, context); + formWriter.end(); formWriter.destroy(); context.reader().end(); } @@ -364,6 +364,7 @@ public class InvisibleElementRemovalService { formWriter.setDefaultGState(context.reader()); processOverlappedElements(formWriter, context); + formWriter.end(); formWriter.destroy(); context.reader().end(); } diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java index 5fd671d..8fb9f2d 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java @@ 
-1,23 +1,35 @@ package com.iqser.red.service.ocr.v1.server.service; +import static java.lang.String.format; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.HashMap; +import java.util.Map; + +import org.springframework.amqp.rabbit.core.RabbitTemplate; +import org.springframework.stereotype.Service; + import com.fasterxml.jackson.databind.ObjectMapper; import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse; import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration; import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings; import com.pdftron.common.PDFNetException; -import com.pdftron.pdf.*; +import com.pdftron.pdf.OCRModule; +import com.pdftron.pdf.OCROptions; +import com.pdftron.pdf.Optimizer; +import com.pdftron.pdf.PDFDoc; +import com.pdftron.pdf.Page; +import com.pdftron.pdf.RectCollection; import com.pdftron.sdf.SDFDoc; + import io.micrometer.core.annotation.Timed; import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; -import org.springframework.amqp.rabbit.core.RabbitTemplate; -import org.springframework.stereotype.Service; - -import java.io.*; -import java.util.Map; - -import static java.lang.String.format; @Slf4j @Service @@ -84,11 +96,14 @@ public class OCRService { // So, we need to remove pages without images. // Furthermore, creating a new document is *much* faster than reusing the same document and adding/removing pages one by one. // Therefore, we create a new Document with a single page for every page that contains an image. + // Moreover, for some documents, inserting the OCR page into the PDFDoc inside the loop makes RAM usage increase exponentially with every page (the cause is not understood). The OCRed pages are therefore collected first and copied into the original document after the loop. 
int numProcessedPages = 0; + Map pageIdToSinglePagePdfDoc = new HashMap<>(); for (Integer pageId : pageIdToRectCollection.keySet()) { try { PDFDoc singlePagePdfDoc = extractSinglePagePdfDoc(pdfDoc, pageId); processOcr(pageIdToRectCollection, pageId, singlePagePdfDoc); + pageIdToSinglePagePdfDoc.put(pageId, singlePagePdfDoc); ++numProcessedPages; log.info("{}/{} Page {} done, OCR regions {}", @@ -97,9 +112,6 @@ public class OCRService { pageId, getAllOcrTextZonesAsString(pageIdToRectCollection, pageId)); - replaceOriginalPageWithOcrPage(pdfDoc, pageId, singlePagePdfDoc); - - singlePagePdfDoc.close(); rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE, objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder() @@ -113,6 +125,9 @@ public class OCRService { throw new RuntimeException(e); } } + log.info("Copying {} OCRed Pages into original Document", pageIdToSinglePagePdfDoc.size()); + pageIdToSinglePagePdfDoc.forEach((pageId, singlePagePdfDoc) -> replaceOriginalPageWithOcrPage(pdfDoc, pageId, singlePagePdfDoc)); + Optimizer.optimize(pdfDoc); rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE, objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder() @@ -122,7 +137,6 @@ public class OCRService { .ocrFinished(true) .build())); - Optimizer.optimize(pdfDoc); try { pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null); } catch (Exception e) { @@ -154,11 +168,13 @@ public class OCRService { } - private static void replaceOriginalPageWithOcrPage(PDFDoc pdfDoc, Integer pageId, PDFDoc ocrPageDoc) throws PDFNetException { + @SneakyThrows + private static void replaceOriginalPageWithOcrPage(PDFDoc pdfDoc, Integer pageIndex, PDFDoc singlePagePdfDoc) { - Page ocrPage = ocrPageDoc.getPage(1); - pdfDoc.pageInsert(pdfDoc.getPageIterator(pageId), ocrPage); - pdfDoc.pageRemove(pdfDoc.getPageIterator(pageId + 1)); + Page ocrPage = singlePagePdfDoc.getPage(1); + 
pdfDoc.pageInsert(pdfDoc.getPageIterator(pageIndex), ocrPage); + pdfDoc.pageRemove(pdfDoc.getPageIterator(pageIndex + 1)); + singlePagePdfDoc.close(); }