From e705f869fdab123719c9adeb691c6f4cb7849a17 Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Wed, 8 Feb 2023 17:05:00 +0100 Subject: [PATCH] RED-6126: Performance Tests *moved to streams for pdf file transfer *disabled overlap detection --- .../InvisibleElementRemovalService.java | 30 +++++++------ .../ocr/v1/server/service/OCRService.java | 29 +++++++------ .../v1/server/service/OcrMessageReceiver.java | 15 +++++-- .../v1/server/OcrServiceIntegrationTest.java | 42 ++++++++++--------- .../InvisibleElementRemovalServiceTest.java | 42 +++++++------------ 5 files changed, 84 insertions(+), 74 deletions(-) diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java index 0ca60ee..2e7bda6 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java @@ -5,6 +5,7 @@ import java.awt.geom.AffineTransform; import java.awt.geom.GeneralPath; import java.awt.geom.Rectangle2D; import java.io.InputStream; +import java.io.OutputStream; import java.util.ArrayList; import java.util.Iterator; import java.util.List; @@ -59,10 +60,10 @@ public class InvisibleElementRemovalService { * @param pdfFile The PDF file to process * @param delta If this flag is set only the removed Elements will be written to the output file. * The Elements are red if they are removed by clipping path, blue for transparency, and a green bounding box for overlap. - * @return The resulting PDF File as bytes. + * @param out OutputStream to write the resulting file to **/ @SneakyThrows - public byte[] removeInvisibleElements(InputStream pdfFile, boolean delta) { + public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta) { PDFDoc pdfDoc = new PDFDoc(pdfFile); @@ -88,9 +89,9 @@ public class InvisibleElementRemovalService { context.visitedXObjIds().clear(); - removeOverlappedElements(page, writer, context); + // removeOverlappedElements(page, writer, context); } - return pdfDoc.save(SDFDoc.SaveMode.REMOVE_UNUSED, null); + pdfDoc.save(out, SDFDoc.SaveMode.REMOVE_UNUSED, null); } @@ -225,7 +226,7 @@ public class InvisibleElementRemovalService { //transform path to initial user space var ctm = pathElement.getCTM(); - var affineTransform = getAffineTransform(ctm); + var affineTransform = toAffineTransform(ctm); linePath.transform(affineTransform); var rect = linePath.getBounds2D(); @@ -244,8 +245,13 @@ public class InvisibleElementRemovalService { writer.writeElement(pathElement); } else { + if (pathElement.isWindingFill()) { + linePath.setWindingRule(GeneralPath.WIND_NON_ZERO); + } else { + linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD); + } + if (inClippingPath) { - // TODO: WINDING RULE if (isFilledAndNonTransparent(pathElement)) { List currentOverlappedElements = context.visibleElements() .stream() @@ -270,12 +276,6 @@ public class InvisibleElementRemovalService { } - private static AffineTransform getAffineTransform(Matrix2D ctm) throws PDFNetException { - - return new AffineTransform(ctm.getA(), ctm.getB(), ctm.getC(), ctm.getD(), ctm.getH(), ctm.getV()); - } - - private void removeOverlappedElements(Page page, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { context.reader().begin(page); @@ -422,6 +422,12 @@ public class InvisibleElementRemovalService { } + private static AffineTransform toAffineTransform(Matrix2D ctm) throws PDFNetException { + + return new AffineTransform(ctm.getA(), ctm.getB(), ctm.getC(), ctm.getD(), ctm.getH(), ctm.getV()); + } + + @Builder private record InvisibleElementRemovalContext( boolean delta, diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java index 63b1925..8f8626e 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java @@ -1,7 +1,9 @@ package com.iqser.red.service.ocr.v1.server.service; import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; import java.io.InputStream; +import java.io.OutputStream; import java.util.Map; import io.micrometer.core.annotation.Timed; @@ -55,32 +57,33 @@ public class OCRService { * * @param dossierId The dossier id * @param fileId The file id - * @return the resulting PDF file as an InputStream + * @param out OutputStream to write the file to */ @SneakyThrows @Timed("redactmanager_runOcrOnDocument") - public InputStream runOcrOnDocument(String dossierId, String fileId) { - - InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId); - - byte[] fileWithoutInvisibleTextBytes = invisibleElementRemovalService.removeInvisibleElements(fileStream, false); - - byte[] ocrBytes = runOcr(fileWithoutInvisibleTextBytes, fileId); - - return new ByteArrayInputStream(ocrBytes); + public void runOcrOnDocument(String dossierId, String fileId, OutputStream out) { + try (ByteArrayOutputStream transferOutputStream = new ByteArrayOutputStream()) { + try (InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId)) { + invisibleElementRemovalService.removeInvisibleElements(fileStream, transferOutputStream, false); + } + try (InputStream transferInputStream = new ByteArrayInputStream(transferOutputStream.toByteArray())) { + runOcr(transferInputStream, out, fileId); + } + } } @SneakyThrows - private byte[] runOcr(byte[] file, String fileId) { + private void runOcr(InputStream fileStream, OutputStream out, String fileId) { - PDFDoc pdfDoc = new PDFDoc(file); + PDFDoc pdfDoc = new PDFDoc(fileStream); Map pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, true); OCROptions options = new OCROptions(); PDFDoc ocrPageDoc = new PDFDoc(); + int numProcessedPages = 0; for (Integer pageId : pageIdToRectCollection.keySet()) { try { @@ -132,6 +135,6 @@ public class OCRService { .build())); Optimizer.optimize(pdfDoc); - return pdfDoc.save(SDFDoc.SaveMode.LINEARIZED, null); + pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null); } } diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OcrMessageReceiver.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OcrMessageReceiver.java index 8c28d3a..72af005 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OcrMessageReceiver.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OcrMessageReceiver.java @@ -1,5 +1,9 @@ package com.iqser.red.service.ocr.v1.server.service; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; + import org.springframework.amqp.AmqpRejectAndDontRequeueException; import org.springframework.amqp.rabbit.annotation.RabbitHandler; import org.springframework.amqp.rabbit.annotation.RabbitListener; @@ -8,9 +12,9 @@ import org.springframework.stereotype.Service; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; +import com.iqser.red.service.ocr.v1.api.model.DocumentRequest; import com.iqser.red.service.ocr.v1.server.client.FileStatusProcessingUpdateClient; import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration; -import com.iqser.red.service.ocr.v1.api.model.DocumentRequest; import feign.FeignException; import lombok.RequiredArgsConstructor; @@ -44,9 +48,14 @@ public class OcrMessageReceiver { fileStorageService.storeUntouchedFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), originalFile); } - var ocrResult = ocrService.runOcrOnDocument(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId()); + try (var out = new ByteArrayOutputStream()) { + ocrService.runOcrOnDocument(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), out); + + fileStorageService.storeOriginalFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), new ByteArrayInputStream(out.toByteArray())); + } catch (IOException e) { + throw new RuntimeException(e); + } - fileStorageService.storeOriginalFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), ocrResult); long end = System.currentTimeMillis(); log.info("Successfully processed ocr for file with dossierId {} and fileId {}, took {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), end - start); diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java index 0007973..b21a331 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java +++ b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java @@ -3,8 +3,8 @@ package com.iqser.red.service.ocr.v1.server; import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory; import static org.assertj.core.api.Assertions.assertThat; +import java.io.FileInputStream; import java.io.FileOutputStream; -import java.io.InputStream; import java.util.ArrayList; import java.util.List; import java.util.concurrent.TimeUnit; @@ -155,25 +155,29 @@ public class OcrServiceIntegrationTest { ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf"); var originId = FileStorageService.getStorageId("dossier", "file", FileType.ORIGIN); - storageService.storeObject(originId, pdfFileResource.getInputStream()); - - try (InputStream ocrDocument = ocrService.runOcrOnDocument("dossier", "file")) { - byte[] ocrDocumentBytes = ocrDocument.readAllBytes(); - try (var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) { - out.write(ocrDocumentBytes); - } - TextExtractor extractor = new TextExtractor(); - List texts = new ArrayList<>(); - PDFDoc pdfDoc = new PDFDoc(ocrDocumentBytes); - PageIterator iterator = pdfDoc.getPageIterator(); - while (iterator.hasNext()) { - Page page = iterator.next(); - extractor.begin(page); - texts.add(extractor.getAsText()); - } - System.out.println("File:" + getTemporaryDirectory() + "/" + fileName + ".pdf"); - return String.join("\n", texts); + try (var fileStream = pdfFileResource.getInputStream()) { + storageService.storeObject(originId, fileStream); } + + try (var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) { + ocrService.runOcrOnDocument("dossier", "file", out); + } + + TextExtractor extractor = new TextExtractor(); + List texts = new ArrayList<>(); + PDFDoc pdfDoc; + + try (var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) { + pdfDoc = new PDFDoc(fileStream); + } + PageIterator iterator = pdfDoc.getPageIterator(); + while (iterator.hasNext()) { + Page page = iterator.next(); + extractor.begin(page); + texts.add(extractor.getAsText()); + } + System.out.println("File:" + getTemporaryDirectory() + "/" + fileName + ".pdf"); + return String.join("\n", texts); } diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalServiceTest.java b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalServiceTest.java index fbe81a3..9aa1f26 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalServiceTest.java +++ b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalServiceTest.java @@ -3,9 +3,8 @@ package com.iqser.red.service.ocr.v1.server.service; import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory; import static org.assertj.core.api.AssertionsForClassTypes.assertThat; +import java.io.FileInputStream; import java.io.FileOutputStream; -import java.io.IOException; -import java.nio.file.Files; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; @@ -46,22 +45,23 @@ public class InvisibleElementRemovalServiceTest { ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf"); - var initialFileStream = Files.newInputStream(pdfFileResource.getFile().toPath()); - var fileWithoutInvisibleElements = invisibleElementRemovalService.removeInvisibleElements(initialFileStream, false); + try (var initialFileStream = pdfFileResource.getInputStream(); var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) { + invisibleElementRemovalService.removeInvisibleElements(initialFileStream, out,false); + } - initialFileStream = Files.newInputStream(pdfFileResource.getFile().toPath()); - var deltaFile = invisibleElementRemovalService.removeInvisibleElements(initialFileStream, true); + try (var initialFileStream = pdfFileResource.getInputStream(); var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + "_delta.pdf")) { + invisibleElementRemovalService.removeInvisibleElements(initialFileStream, out,true); + } - String fileWithoutInvisibleTextLocation = getTemporaryDirectory() + "/" + fileName + ".pdf"; - String deltaFileLocation = getTemporaryDirectory() + "/" + fileName + "_delta.pdf"; - - saveToFile(fileWithoutInvisibleTextLocation, fileWithoutInvisibleElements); - saveToFile(deltaFileLocation, deltaFile); - - System.out.println("Output File without invisible elements: " + fileWithoutInvisibleTextLocation); - System.out.println("Output Delta File: " + deltaFileLocation); + System.out.println("Output File without invisible elements: files/" + fileName + ".pdf"); + System.out.println("Output Delta File without invisible elements: files/" + fileName + "_delta.pdf"); TextExtractor extractor = new TextExtractor(); - PDFDoc pdfDoc = new PDFDoc(fileWithoutInvisibleElements); + + PDFDoc pdfDoc; + try(var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) { + pdfDoc = new PDFDoc(fileStream); + } + PageIterator iterator = pdfDoc.getPageIterator(); while (iterator.hasNext()) { Page page = iterator.next(); @@ -70,16 +70,4 @@ public class InvisibleElementRemovalServiceTest { assertThat(text).contains("APPENDIX 16 Pathology Report", "Amendment 1", "Page 255 of 260"); } } - - - private void saveToFile(String location, byte[] fileBytes) { - - try (var f_out = new FileOutputStream(location)) { - f_out.write(fileBytes); - } catch (IOException e) { - throw new RuntimeException("File location: " + location + "could not be openend, no file will be saved"); - } - - } - } \ No newline at end of file