From 37f1e03ebcd5356e0f0b403a5c0cdd20fc133997 Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Fri, 10 Feb 2023 14:49:10 +0100 Subject: [PATCH] RED-6126: performance-test *refactor to improve cleanness *closed inputStream --- .../ImagePositionRetrievalService.java | 24 ++++-- .../ocr/v1/server/service/OCRService.java | 78 +++++++++++++------ .../v1/server/service/OcrMessageReceiver.java | 12 +-- .../v1/server/OcrServiceIntegrationTest.java | 36 +++++---- 4 files changed, 99 insertions(+), 51 deletions(-) diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/ImagePositionRetrievalService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/ImagePositionRetrievalService.java index a6c8a0e..ae251a1 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/ImagePositionRetrievalService.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/ImagePositionRetrievalService.java @@ -101,18 +101,13 @@ public class ImagePositionRetrievalService { // Sometimes images are split up into stripes, here we merge the positions of aligned and intersecting rectangles into one larger rectangle private void mergeRectangleList(List rectangleList) { - int idx = 0; - while (rectangleList.size() >= idx + 2) { + for (int idx = 0; rectangleList.size() >= idx + 2; ) { var rect1 = rectangleList.get(idx); var rect2 = rectangleList.get(idx + 1); - boolean isAlignedX = Math.abs(rect1.getMinX() - rect2.getMinX()) < TOLERANCE && Math.abs(rect1.getMaxX() - rect2.getMaxX()) < TOLERANCE; - boolean isAlignedY = Math.abs(rect1.getMinY() - rect2.getMinY()) < TOLERANCE && Math.abs(rect1.getMaxY() - rect2.getMaxY()) < TOLERANCE; - boolean intersects = rect1.intersects(rect2.getMinX() - TOLERANCE, rect2.getMinY() - TOLERANCE, rect2.getWidth() + (2 * TOLERANCE), rect2.getHeight() + (2 * TOLERANCE)); - - if (intersects && (isAlignedX || isAlignedY)) { + if (intersects(rect1, rect2) && isAlignedXOrY(rect1, rect2)) { rectangleList.remove(idx + 1); rectangleList.remove(idx); rectangleList.add(idx, rect1.createUnion(rect2)); @@ -123,6 +118,21 @@ public class ImagePositionRetrievalService { } + private boolean intersects(Rectangle2D rect1, Rectangle2D rect2) { + + return rect1.intersects(rect2.getMinX() - TOLERANCE, rect2.getMinY() - TOLERANCE, rect2.getWidth() + (2 * TOLERANCE), rect2.getHeight() + (2 * TOLERANCE)); + } + + + private boolean isAlignedXOrY(Rectangle2D rect1, Rectangle2D rect2) { + + boolean isAlignedX = Math.abs(rect1.getMinX() - rect2.getMinX()) < TOLERANCE && Math.abs(rect1.getMaxX() - rect2.getMaxX()) < TOLERANCE; + boolean isAlignedY = Math.abs(rect1.getMinY() - rect2.getMinY()) < TOLERANCE && Math.abs(rect1.getMaxY() - rect2.getMaxY()) < TOLERANCE; + + return isAlignedX || isAlignedY; + } + + private Rect toRotationAdjustedRect(Rect bbox, Page page, boolean mirrorY) throws PDFNetException { int rotation = page.getRotation(); diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java index 7901ca3..c1cb635 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java @@ -90,36 +90,26 @@ public class OCRService { Map pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, true); + // Optimization: + // When a page does not have a TextZone, PDFTron whites out the page. But, PDFTron scans it anyway, resulting in a longer runtime. + // So, we need to remove pages without images. + // Furthermore, creating a new document is *much* faster than reusing the same document and adding/removing pages one by one. + // Therefore, we create a new Document with a single page for every page that contains text. int numProcessedPages = 0; - // optimization: only scanning pages that contain images for (Integer pageId : pageIdToRectCollection.keySet()) { try { - // optimization: creating a new document is faster than reusing the same and adding/removing pages one by one - OCROptions options = new OCROptions(); - PDFDoc ocrPageDoc = new PDFDoc(); - Page pdfPage = pdfDoc.getPage(pageId); - // optimization: this line ensures the ocr text is placed correctly by PDFTron - pdfPage.setMediaBox(pdfPage.getCropBox()); - ocrPageDoc.pagePushBack(pdfPage); - options.addTextZonesForPage(pageIdToRectCollection.get(pageId), 1); - options.addLang(ENGLISH); - options.addDPI(settings.getOcrDPI()); - - OCRModule.processPDF(ocrPageDoc, options); + PDFDoc singlePagePdfDoc = extractSinglePagePdfDoc(pdfDoc, pageId); + processOcr(pageIdToRectCollection, pageId, singlePagePdfDoc); ++numProcessedPages; - StringBuilder zonesString = new StringBuilder(); - for (int j = 0; j < pageIdToRectCollection.get(pageId).getNumRects(); ++j) { - var r = pageIdToRectCollection.get(pageId).getRectAt(j); - zonesString.append(format("[lower left (%.1f|%.1f) upper right (%.1f|%.1f)]", r.getX1(), r.getY1(), r.getX2(), r.getY2())); - } - log.info("{}/{} Page {} done, OCR regions {}", numProcessedPages, pageIdToRectCollection.size(), pageId, zonesString); + log.info("{}/{} Page {} done, OCR regions {}", + numProcessedPages, + pageIdToRectCollection.size(), + pageId, + getAllOcrTextZonesAsString(pageIdToRectCollection, pageId)); - // re-adding OCR pages - Page ocrPage = ocrPageDoc.getPage(1); - pdfDoc.pageInsert(pdfDoc.getPageIterator(pageId), ocrPage); - pdfDoc.pageRemove(pdfDoc.getPageIterator(pageId + 1)); - ocrPageDoc.close(); + replaceOriginalPageWithOcrPage(pdfDoc, pageId, singlePagePdfDoc); + singlePagePdfDoc.close(); rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE, objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder() @@ -151,4 +141,44 @@ public class OCRService { } } + + private void processOcr(Map pageIdToRectCollection, Integer pageId, PDFDoc singlePagePdfDoc) throws PDFNetException { + + OCROptions options = new OCROptions(); + options.addTextZonesForPage(pageIdToRectCollection.get(pageId), 1); + options.addLang(ENGLISH); + options.addDPI(settings.getOcrDPI()); + + OCRModule.processPDF(singlePagePdfDoc, options); + } + + + private static PDFDoc extractSinglePagePdfDoc(PDFDoc pdfDoc, Integer pageId) throws PDFNetException { + + PDFDoc singlePagePdfDoc = new PDFDoc(); + Page page = pdfDoc.getPage(pageId); + page.setMediaBox(page.getCropBox()); // this line ensures the ocr text is placed correctly by PDFTron, see TestFile MediaBoxBiggerThanCropBox.pdf + singlePagePdfDoc.pagePushBack(page); + return singlePagePdfDoc; + } + + + private static void replaceOriginalPageWithOcrPage(PDFDoc pdfDoc, Integer pageId, PDFDoc ocrPageDoc) throws PDFNetException { + + Page ocrPage = ocrPageDoc.getPage(1); + pdfDoc.pageInsert(pdfDoc.getPageIterator(pageId), ocrPage); + pdfDoc.pageRemove(pdfDoc.getPageIterator(pageId + 1)); + } + + + private static StringBuilder getAllOcrTextZonesAsString(Map pageIdToRectCollection, Integer pageId) throws PDFNetException { + + StringBuilder zonesString = new StringBuilder(); + for (int j = 0; j < pageIdToRectCollection.get(pageId).getNumRects(); ++j) { + var r = pageIdToRectCollection.get(pageId).getRectAt(j); + zonesString.append(format("[lower left (%.1f|%.1f) upper right (%.1f|%.1f)]", r.getX1(), r.getY1(), r.getX2(), r.getY2())); + } + return zonesString; + } + } diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OcrMessageReceiver.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OcrMessageReceiver.java index 7ddbdf4..1683ae1 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OcrMessageReceiver.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OcrMessageReceiver.java @@ -31,13 +31,13 @@ public class OcrMessageReceiver { private final OCRService ocrService; + @RabbitHandler @RabbitListener(queues = MessagingConfiguration.OCR_QUEUE, concurrency = "1") public void receiveOcr(String in) throws JsonProcessingException { DocumentRequest ocrRequestMessage = objectMapper.readValue(in, DocumentRequest.class); - log.info("Start ocr for file with dossierId {} and fileId {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId()); setStatusOcrProcessing(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId()); @@ -47,16 +47,16 @@ public class OcrMessageReceiver { fileStorageService.storeUntouchedFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), originalFile); } - try (var out = new ByteArrayOutputStream()) { - ocrService.runOcrOnDocument(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), out); - - fileStorageService.storeOriginalFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), new ByteArrayInputStream(out.toByteArray())); + try (var transferStream = new ByteArrayOutputStream()) { + ocrService.runOcrOnDocument(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), transferStream); + try (var inputStream = new ByteArrayInputStream(transferStream.toByteArray())) { + fileStorageService.storeOriginalFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), inputStream); + } } catch (IOException e) { log.error("Failed to store file with dossierId {} and fileId {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId()); throw new RuntimeException(e); } - fileStatusProcessingUpdateClient.ocrSuccessful(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId()); } diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java index b21a331..cbd57ff 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java +++ b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java @@ -5,12 +5,12 @@ import static org.assertj.core.api.Assertions.assertThat; import java.io.FileInputStream; import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; import java.util.ArrayList; import java.util.List; import java.util.concurrent.TimeUnit; -import io.micrometer.prometheus.PrometheusMeterRegistry; -import io.micrometer.prometheus.PrometheusTimer; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Disabled; @@ -36,12 +36,15 @@ import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService; import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType; import com.iqser.red.storage.commons.StorageAutoConfiguration; import com.iqser.red.storage.commons.service.StorageService; +import com.pdftron.common.PDFNetException; import com.pdftron.pdf.OCRModule; import com.pdftron.pdf.PDFDoc; import com.pdftron.pdf.Page; import com.pdftron.pdf.PageIterator; import com.pdftron.pdf.TextExtractor; +import io.micrometer.prometheus.PrometheusMeterRegistry; +import io.micrometer.prometheus.PrometheusTimer; import lombok.SneakyThrows; @ExtendWith(SpringExtension.class) @@ -80,19 +83,20 @@ public class OcrServiceIntegrationTest { @Test @Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top. - public void testOCRMetrics(){ + public void testOCRMetrics() { + testOCR("Watermark"); testOCR("Watermark"); testOCR("Watermark"); - var ocrOnDocumentMeter = registry.getMeters().stream() - .filter(m -> m.getId().getName().equalsIgnoreCase("redactmanager_runOcrOnDocument")).findAny(); + var ocrOnDocumentMeter = registry.getMeters().stream().filter(m -> m.getId().getName().equalsIgnoreCase("redactmanager_runOcrOnDocument")).findAny(); assertThat(ocrOnDocumentMeter.isPresent()).isTrue(); PrometheusTimer timer = (PrometheusTimer) ocrOnDocumentMeter.get(); assertThat(timer.count()).isEqualTo(3); assertThat(timer.mean(TimeUnit.SECONDS)).isGreaterThan(0.1); } + @Test @Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top. public void testOcr() { @@ -153,30 +157,34 @@ public class OcrServiceIntegrationTest { private String testOCR(String fileName) { ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf"); - var originId = FileStorageService.getStorageId("dossier", "file", FileType.ORIGIN); try (var fileStream = pdfFileResource.getInputStream()) { storageService.storeObject(originId, fileStream); } - try (var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) { ocrService.runOcrOnDocument("dossier", "file", out); } + System.out.println("File:" + getTemporaryDirectory() + "/" + fileName + ".pdf"); + try (var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) { + return extractAllTextFromDocument(fileStream); + } + } + + + private static String extractAllTextFromDocument(InputStream fileStream) throws IOException, PDFNetException { + + PDFDoc pdfDoc = new PDFDoc(fileStream); TextExtractor extractor = new TextExtractor(); List texts = new ArrayList<>(); - PDFDoc pdfDoc; - try (var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) { - pdfDoc = new PDFDoc(fileStream); - } PageIterator iterator = pdfDoc.getPageIterator(); while (iterator.hasNext()) { Page page = iterator.next(); extractor.begin(page); texts.add(extractor.getAsText()); } - System.out.println("File:" + getTemporaryDirectory() + "/" + fileName + ".pdf"); + return String.join("\n", texts); } @@ -184,7 +192,7 @@ public class OcrServiceIntegrationTest { @SneakyThrows public void dummyTest() { - // Build needs one text to not fail. + // Build needs one test to not fail. assertThat(1).isEqualTo(1); } @@ -204,7 +212,7 @@ public class OcrServiceIntegrationTest { @Bean @Primary - public StorageService inmemoryStorage() { + public StorageService inMemoryStorage() { return new FileSystemBackedStorageService(); }