diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java index 2e7bda6..a0005ae 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java @@ -89,7 +89,7 @@ public class InvisibleElementRemovalService { context.visitedXObjIds().clear(); - // removeOverlappedElements(page, writer, context); + removeOverlappedElements(page, writer, context); } pdfDoc.save(out, SDFDoc.SaveMode.REMOVE_UNUSED, null); } diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java index 8f8626e..afa3b93 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java @@ -6,7 +6,6 @@ import java.io.InputStream; import java.io.OutputStream; import java.util.Map; -import io.micrometer.core.annotation.Timed; import org.springframework.amqp.rabbit.core.RabbitTemplate; import org.springframework.stereotype.Service; @@ -23,6 +22,7 @@ import com.pdftron.pdf.Page; import com.pdftron.pdf.RectCollection; import com.pdftron.sdf.SDFDoc; +import io.micrometer.core.annotation.Timed; import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; @@ -81,15 +81,16 @@ public class OCRService { Map pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, true); - OCROptions options = new OCROptions(); - PDFDoc ocrPageDoc = new PDFDoc(); - int numProcessedPages = 0; for (Integer pageId : pageIdToRectCollection.keySet()) { try { - // optimization by only scanning pages that contain images + // optimization: creating a new document is faster than reusing the same and adding/removing pages one by one + OCROptions options = new OCROptions(); + PDFDoc ocrPageDoc = new PDFDoc(); + // optimization: only scanning pages that contain images Page pdfPage = pdfDoc.getPage(pageId); - pdfPage.setMediaBox(pdfPage.getCropBox()); // this line ensures the ocr text is placed correctly by PDFTron + // optimization: this line ensures the ocr text is placed correctly by PDFTron + pdfPage.setMediaBox(pdfPage.getCropBox()); ocrPageDoc.pagePushBack(pdfPage); options.addTextZonesForPage(pageIdToRectCollection.get(pageId), 1); options.addLang(ENGLISH); @@ -109,7 +110,7 @@ public class OCRService { Page ocrPage = ocrPageDoc.getPage(1); pdfDoc.pageInsert(pdfDoc.getPageIterator(pageId), ocrPage); pdfDoc.pageRemove(pdfDoc.getPageIterator(pageId + 1)); - ocrPageDoc.pageRemove(ocrPageDoc.getPageIterator(1)); + ocrPageDoc.close(); rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE, objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder() @@ -124,8 +125,6 @@ public class OCRService { } } - ocrPageDoc.close(); - rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE, objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder() .fileId(fileId) @@ -137,4 +136,5 @@ public class OCRService { Optimizer.optimize(pdfDoc); pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null); } + } diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/resources/log4j2-test.xml b/ocr-service-v1/ocr-service-server-v1/src/test/resources/log4j2-test.xml new file mode 100644 index 0000000..b4895cf --- /dev/null +++ b/ocr-service-v1/ocr-service-server-v1/src/test/resources/log4j2-test.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + +