diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/ImagePositionRetrievalService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/ImagePositionRetrievalService.java index 9f32d3c..ae251a1 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/ImagePositionRetrievalService.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/ImagePositionRetrievalService.java @@ -15,6 +15,7 @@ import com.pdftron.common.Matrix2D; import com.pdftron.common.PDFNetException; import com.pdftron.pdf.Element; import com.pdftron.pdf.ElementReader; +import com.pdftron.pdf.Image; import com.pdftron.pdf.PDFDoc; import com.pdftron.pdf.Page; import com.pdftron.pdf.Rect; @@ -27,9 +28,12 @@ public class ImagePositionRetrievalService { private static final double TOLERANCE = 1e-1; + // any image with smaller height and width than this gets thrown out, see everyPointInDashedLineIsImage.pdf + private static final int PIXEL_THRESHOLD = 10; + /** - * Iterates over all elements in a PDF Document and retrieves the bounding box for each image, + * Iterates over all elements in a PDF Document and retrieves the bounding box for each image, that is larger than the pixel threshold of 10 in either dimension. * Then it adjusts the bounding boxes for the page rotation. * If the mirrorY flag is set, the Y Coordinates are mirrored and moved up by the page height. This is required for PDFTrons OCRModule. * @@ -63,7 +67,13 @@ public class ImagePositionRetrievalService { Element element; while ((element = reader.next()) != null) { switch (element.getType()) { - case Element.e_image, Element.e_inline_image -> imagePositions.addRect(toRotationAdjustedRect(element.getBBox(), currentPage, mirrorY)); + case Element.e_image, Element.e_inline_image -> { + Image image = new Image(element.getXObject()); + // see everyPointInDashedLineIsImage.pdf TestFile + if (image.getImageHeight() > PIXEL_THRESHOLD || image.getImageWidth() > PIXEL_THRESHOLD) { + imagePositions.addRect(toRotationAdjustedRect(element.getBBox(), currentPage, mirrorY)); + } + } case Element.e_form -> { reader.formBegin(); findImagePositionsOnPage(reader, imagePositions, currentPage, mirrorY); @@ -77,39 +87,49 @@ public class ImagePositionRetrievalService { @SneakyThrows public RectCollection mergeOverlappingRects(RectCollection imagePositions) { - if (imagePositions.getNumRects() == 1) { + if (imagePositions.getNumRects() < 2) { return imagePositions; } List rectangleList = toSortedRectangleList(imagePositions); - rectangleList = mergeRectangleListRecursive(rectangleList, 0); + mergeRectangleList(rectangleList); + return toRectCollection(rectangleList); } // Sometimes images are split up into stripes, here we merge the positions of aligned and intersecting rectangles into one larger rectangle - private List mergeRectangleListRecursive(List rectangleList, int currentIdx) { + private void mergeRectangleList(List rectangleList) { - if (rectangleList.size() < currentIdx + 2) { - return rectangleList; + for (int idx = 0; rectangleList.size() >= idx + 2; ) { + + var rect1 = rectangleList.get(idx); + var rect2 = rectangleList.get(idx + 1); + + if (intersects(rect1, rect2) && isAlignedXOrY(rect1, rect2)) { + rectangleList.remove(idx + 1); + rectangleList.remove(idx); + rectangleList.add(idx, rect1.createUnion(rect2)); + } else { + ++idx; + } } + } - var rect1 = rectangleList.get(currentIdx); - var rect2 = rectangleList.get(currentIdx + 1); + + private boolean intersects(Rectangle2D rect1, Rectangle2D rect2) { + + return rect1.intersects(rect2.getMinX() - TOLERANCE, rect2.getMinY() - TOLERANCE, rect2.getWidth() + (2 * TOLERANCE), rect2.getHeight() + (2 * TOLERANCE)); + } + + + private boolean isAlignedXOrY(Rectangle2D rect1, Rectangle2D rect2) { boolean isAlignedX = Math.abs(rect1.getMinX() - rect2.getMinX()) < TOLERANCE && Math.abs(rect1.getMaxX() - rect2.getMaxX()) < TOLERANCE; boolean isAlignedY = Math.abs(rect1.getMinY() - rect2.getMinY()) < TOLERANCE && Math.abs(rect1.getMaxY() - rect2.getMaxY()) < TOLERANCE; - boolean intersects = rect1.intersects(rect2.getMinX() - TOLERANCE, rect2.getMinY() - TOLERANCE, rect2.getWidth() + (2 * TOLERANCE), rect2.getHeight() + (2 * TOLERANCE)); - if (intersects && (isAlignedX || isAlignedY)) { - rectangleList.remove(currentIdx + 1); - rectangleList.remove(currentIdx); - rectangleList.add(currentIdx, rect1.createUnion(rect2)); - return mergeRectangleListRecursive(rectangleList, currentIdx); - } else { - return mergeRectangleListRecursive(rectangleList, currentIdx + 1); - } + return isAlignedX || isAlignedY; } diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java index 0ca60ee..c747a9d 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java @@ -5,6 +5,7 @@ import java.awt.geom.AffineTransform; import java.awt.geom.GeneralPath; import java.awt.geom.Rectangle2D; import java.io.InputStream; +import java.io.OutputStream; import java.util.ArrayList; import java.util.Iterator; import java.util.List; @@ -59,10 +60,10 @@ public class InvisibleElementRemovalService { * @param pdfFile The PDF file to process * @param delta If this flag is set only the removed Elements will be written to the output file. * The Elements are red if they are removed by clipping path, blue for transparency, and a green bounding box for overlap. - * @return The resulting PDF File as bytes. + * @param out OutputStream to write the resulting file to **/ @SneakyThrows - public byte[] removeInvisibleElements(InputStream pdfFile, boolean delta) { + public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta) { PDFDoc pdfDoc = new PDFDoc(pdfFile); @@ -75,6 +76,8 @@ public class InvisibleElementRemovalService { Page page = iterator.next(); visitedXObjIds.add(page.getSDFObj().getObjNum()); + + InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder() .reader(reader) .clippingPathStack(new ClippingPathStack(page.getMediaBox())) @@ -90,7 +93,7 @@ public class InvisibleElementRemovalService { removeOverlappedElements(page, writer, context); } - return pdfDoc.save(SDFDoc.SaveMode.REMOVE_UNUSED, null); + pdfDoc.save(out, SDFDoc.SaveMode.REMOVE_UNUSED, null); } @@ -220,12 +223,18 @@ public class InvisibleElementRemovalService { private void processPath(Element pathElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { + PathData pathData = pathElement.getPathData(); - GeneralPath linePath = convertToGeneralPath(pathElement.getPathData()); + if (pathData.getOperators().length == 0 && pathData.getPoints().length == 0) { + writer.writeGStateChanges(pathElement); + return; + } + + GeneralPath linePath = convertToGeneralPath(pathData); //transform path to initial user space var ctm = pathElement.getCTM(); - var affineTransform = getAffineTransform(ctm); + var affineTransform = toAffineTransform(ctm); linePath.transform(affineTransform); var rect = linePath.getBounds2D(); @@ -244,8 +253,13 @@ public class InvisibleElementRemovalService { writer.writeElement(pathElement); } else { + if (pathElement.isWindingFill()) { + linePath.setWindingRule(GeneralPath.WIND_NON_ZERO); + } else { + linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD); + } + if (inClippingPath) { - // TODO: WINDING RULE if (isFilledAndNonTransparent(pathElement)) { List currentOverlappedElements = context.visibleElements() .stream() @@ -270,12 +284,6 @@ public class InvisibleElementRemovalService { } - private static AffineTransform getAffineTransform(Matrix2D ctm) throws PDFNetException { - - return new AffineTransform(ctm.getA(), ctm.getB(), ctm.getC(), ctm.getD(), ctm.getH(), ctm.getV()); - } - - private void removeOverlappedElements(Page page, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { context.reader().begin(page); @@ -422,6 +430,12 @@ public class InvisibleElementRemovalService { } + private static AffineTransform toAffineTransform(Matrix2D ctm) throws PDFNetException { + + return new AffineTransform(ctm.getA(), ctm.getB(), ctm.getC(), ctm.getD(), ctm.getH(), ctm.getV()); + } + + @Builder private record InvisibleElementRemovalContext( boolean delta, diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java index 63b1925..c1cb635 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java @@ -1,10 +1,14 @@ package com.iqser.red.service.ocr.v1.server.service; +import static java.lang.String.format; + import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; import java.io.InputStream; +import java.io.OutputStream; import java.util.Map; -import io.micrometer.core.annotation.Timed; import org.springframework.amqp.rabbit.core.RabbitTemplate; import org.springframework.stereotype.Service; @@ -21,6 +25,7 @@ import com.pdftron.pdf.Page; import com.pdftron.pdf.RectCollection; import com.pdftron.sdf.SDFDoc; +import io.micrometer.core.annotation.Timed; import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; @@ -55,58 +60,56 @@ public class OCRService { * * @param dossierId The dossier id * @param fileId The file id - * @return the resulting PDF file as an InputStream + * @param out OutputStream to write the file to */ - @SneakyThrows @Timed("redactmanager_runOcrOnDocument") - public InputStream runOcrOnDocument(String dossierId, String fileId) { - - InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId); - - byte[] fileWithoutInvisibleTextBytes = invisibleElementRemovalService.removeInvisibleElements(fileStream, false); - - byte[] ocrBytes = runOcr(fileWithoutInvisibleTextBytes, fileId); - - return new ByteArrayInputStream(ocrBytes); + public void runOcrOnDocument(String dossierId, String fileId, OutputStream out) throws IOException { + try (ByteArrayOutputStream transferOutputStream = new ByteArrayOutputStream()) { + try (InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId)) { + long start = System.currentTimeMillis(); + log.debug("Start invisible element removal for file with dossierId {} and fileId {}", dossierId, fileId); + invisibleElementRemovalService.removeInvisibleElements(fileStream, transferOutputStream, false); + long end = System.currentTimeMillis(); + log.debug("Invisible element removal successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, format("%.1f", (end - start) / 1000.0)); + } + try (InputStream transferInputStream = new ByteArrayInputStream(transferOutputStream.toByteArray())) { + long start = System.currentTimeMillis(); + runOcr(transferInputStream, out, fileId); + long end = System.currentTimeMillis(); + log.info("ocr successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, format("%.1f", (end - start) / 1000.0)); + } + } } @SneakyThrows - private byte[] runOcr(byte[] file, String fileId) { + private void runOcr(InputStream fileStream, OutputStream out, String fileId) { - PDFDoc pdfDoc = new PDFDoc(file); + PDFDoc pdfDoc = new PDFDoc(fileStream); Map pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, true); - OCROptions options = new OCROptions(); - PDFDoc ocrPageDoc = new PDFDoc(); + // Optimization: + // When a page does not have a TextZone, PDFTron whites out the page. But, PDFTron scans it anyway, resulting in a longer runtime. + // So, we need to remove pages without images. + // Furthermore, creating a new document is *much* faster than reusing the same document and adding/removing pages one by one. + // Therefore, we create a new Document with a single page for every page that contains text. int numProcessedPages = 0; for (Integer pageId : pageIdToRectCollection.keySet()) { try { - // optimization by only scanning pages that contain images - Page pdfPage = pdfDoc.getPage(pageId); - pdfPage.setMediaBox(pdfPage.getCropBox()); // this line ensures the ocr text is placed correctly by PDFTron - ocrPageDoc.pagePushBack(pdfPage); - options.addTextZonesForPage(pageIdToRectCollection.get(pageId), 1); - options.addLang(ENGLISH); - options.addDPI(settings.getOcrDPI()); - - OCRModule.processPDF(ocrPageDoc, options); + PDFDoc singlePagePdfDoc = extractSinglePagePdfDoc(pdfDoc, pageId); + processOcr(pageIdToRectCollection, pageId, singlePagePdfDoc); ++numProcessedPages; - StringBuilder zonesString = new StringBuilder(); - for (int j = 0; j < pageIdToRectCollection.get(pageId).getNumRects(); ++j) { - var r = pageIdToRectCollection.get(pageId).getRectAt(j); - zonesString.append(String.format("[lower left (%.1f|%.1f) upper right (%.1f|%.1f)]", r.getX1(), r.getY1(), r.getX2(), r.getY2())); - } - log.info("{}/{} Page {} done, OCR regions {}", numProcessedPages, pageIdToRectCollection.size(), pageId, zonesString); + log.info("{}/{} Page {} done, OCR regions {}", + numProcessedPages, + pageIdToRectCollection.size(), + pageId, + getAllOcrTextZonesAsString(pageIdToRectCollection, pageId)); - // re-adding OCR pages - Page ocrPage = ocrPageDoc.getPage(1); - pdfDoc.pageInsert(pdfDoc.getPageIterator(pageId), ocrPage); - pdfDoc.pageRemove(pdfDoc.getPageIterator(pageId + 1)); - ocrPageDoc.pageRemove(ocrPageDoc.getPageIterator(1)); + replaceOriginalPageWithOcrPage(pdfDoc, pageId, singlePagePdfDoc); + singlePagePdfDoc.close(); rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE, objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder() @@ -116,13 +119,11 @@ public class OCRService { .build())); } catch (PDFNetException e) { - log.error("failed to process page {}", pageId); + log.error("Failed to process page {}", pageId); throw new RuntimeException(e); } } - ocrPageDoc.close(); - rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE, objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder() .fileId(fileId) @@ -132,6 +133,52 @@ public class OCRService { .build())); Optimizer.optimize(pdfDoc); - return pdfDoc.save(SDFDoc.SaveMode.LINEARIZED, null); + try { + pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null); + } catch (Exception e) { + log.error("Processed File with fileId {} could not be saved", fileId); + throw new RuntimeException(e); + } } + + + private void processOcr(Map pageIdToRectCollection, Integer pageId, PDFDoc singlePagePdfDoc) throws PDFNetException { + + OCROptions options = new OCROptions(); + options.addTextZonesForPage(pageIdToRectCollection.get(pageId), 1); + options.addLang(ENGLISH); + options.addDPI(settings.getOcrDPI()); + + OCRModule.processPDF(singlePagePdfDoc, options); + } + + + private static PDFDoc extractSinglePagePdfDoc(PDFDoc pdfDoc, Integer pageId) throws PDFNetException { + + PDFDoc singlePagePdfDoc = new PDFDoc(); + Page page = pdfDoc.getPage(pageId); + page.setMediaBox(page.getCropBox()); // this line ensures the ocr text is placed correctly by PDFTron, see TestFile MediaBoxBiggerThanCropBox.pdf + singlePagePdfDoc.pagePushBack(page); + return singlePagePdfDoc; + } + + + private static void replaceOriginalPageWithOcrPage(PDFDoc pdfDoc, Integer pageId, PDFDoc ocrPageDoc) throws PDFNetException { + + Page ocrPage = ocrPageDoc.getPage(1); + pdfDoc.pageInsert(pdfDoc.getPageIterator(pageId), ocrPage); + pdfDoc.pageRemove(pdfDoc.getPageIterator(pageId + 1)); + } + + + private static StringBuilder getAllOcrTextZonesAsString(Map pageIdToRectCollection, Integer pageId) throws PDFNetException { + + StringBuilder zonesString = new StringBuilder(); + for (int j = 0; j < pageIdToRectCollection.get(pageId).getNumRects(); ++j) { + var r = pageIdToRectCollection.get(pageId).getRectAt(j); + zonesString.append(format("[lower left (%.1f|%.1f) upper right (%.1f|%.1f)]", r.getX1(), r.getY1(), r.getX2(), r.getY2())); + } + return zonesString; + } + } diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OcrMessageReceiver.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OcrMessageReceiver.java index 8c28d3a..1683ae1 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OcrMessageReceiver.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OcrMessageReceiver.java @@ -1,5 +1,9 @@ package com.iqser.red.service.ocr.v1.server.service; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; + import org.springframework.amqp.AmqpRejectAndDontRequeueException; import org.springframework.amqp.rabbit.annotation.RabbitHandler; import org.springframework.amqp.rabbit.annotation.RabbitListener; @@ -8,9 +12,9 @@ import org.springframework.stereotype.Service; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; +import com.iqser.red.service.ocr.v1.api.model.DocumentRequest; import com.iqser.red.service.ocr.v1.server.client.FileStatusProcessingUpdateClient; import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration; -import com.iqser.red.service.ocr.v1.api.model.DocumentRequest; import feign.FeignException; import lombok.RequiredArgsConstructor; @@ -34,7 +38,6 @@ public class OcrMessageReceiver { DocumentRequest ocrRequestMessage = objectMapper.readValue(in, DocumentRequest.class); - long start = System.currentTimeMillis(); log.info("Start ocr for file with dossierId {} and fileId {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId()); setStatusOcrProcessing(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId()); @@ -44,14 +47,18 @@ public class OcrMessageReceiver { fileStorageService.storeUntouchedFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), originalFile); } - var ocrResult = ocrService.runOcrOnDocument(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId()); - - fileStorageService.storeOriginalFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), ocrResult); - - long end = System.currentTimeMillis(); - log.info("Successfully processed ocr for file with dossierId {} and fileId {}, took {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), end - start); + try (var transferStream = new ByteArrayOutputStream()) { + ocrService.runOcrOnDocument(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), transferStream); + try (var inputStream = new ByteArrayInputStream(transferStream.toByteArray())) { + fileStorageService.storeOriginalFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), inputStream); + } + } catch (IOException e) { + log.error("Failed to store file with dossierId {} and fileId {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId()); + throw new RuntimeException(e); + } fileStatusProcessingUpdateClient.ocrSuccessful(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId()); + } diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java index 0007973..cbd57ff 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java +++ b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java @@ -3,14 +3,14 @@ package com.iqser.red.service.ocr.v1.server; import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory; import static org.assertj.core.api.Assertions.assertThat; +import java.io.FileInputStream; import java.io.FileOutputStream; +import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.List; import java.util.concurrent.TimeUnit; -import io.micrometer.prometheus.PrometheusMeterRegistry; -import io.micrometer.prometheus.PrometheusTimer; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Disabled; @@ -36,12 +36,15 @@ import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService; import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType; import com.iqser.red.storage.commons.StorageAutoConfiguration; import com.iqser.red.storage.commons.service.StorageService; +import com.pdftron.common.PDFNetException; import com.pdftron.pdf.OCRModule; import com.pdftron.pdf.PDFDoc; import com.pdftron.pdf.Page; import com.pdftron.pdf.PageIterator; import com.pdftron.pdf.TextExtractor; +import io.micrometer.prometheus.PrometheusMeterRegistry; +import io.micrometer.prometheus.PrometheusTimer; import lombok.SneakyThrows; @ExtendWith(SpringExtension.class) @@ -80,19 +83,20 @@ public class OcrServiceIntegrationTest { @Test @Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top. - public void testOCRMetrics(){ + public void testOCRMetrics() { + testOCR("Watermark"); testOCR("Watermark"); testOCR("Watermark"); - var ocrOnDocumentMeter = registry.getMeters().stream() - .filter(m -> m.getId().getName().equalsIgnoreCase("redactmanager_runOcrOnDocument")).findAny(); + var ocrOnDocumentMeter = registry.getMeters().stream().filter(m -> m.getId().getName().equalsIgnoreCase("redactmanager_runOcrOnDocument")).findAny(); assertThat(ocrOnDocumentMeter.isPresent()).isTrue(); PrometheusTimer timer = (PrometheusTimer) ocrOnDocumentMeter.get(); assertThat(timer.count()).isEqualTo(3); assertThat(timer.mean(TimeUnit.SECONDS)).isGreaterThan(0.1); } + @Test @Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top. public void testOcr() { @@ -153,34 +157,42 @@ public class OcrServiceIntegrationTest { private String testOCR(String fileName) { ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf"); - var originId = FileStorageService.getStorageId("dossier", "file", FileType.ORIGIN); - storageService.storeObject(originId, pdfFileResource.getInputStream()); - - try (InputStream ocrDocument = ocrService.runOcrOnDocument("dossier", "file")) { - byte[] ocrDocumentBytes = ocrDocument.readAllBytes(); - try (var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) { - out.write(ocrDocumentBytes); - } - TextExtractor extractor = new TextExtractor(); - List texts = new ArrayList<>(); - PDFDoc pdfDoc = new PDFDoc(ocrDocumentBytes); - PageIterator iterator = pdfDoc.getPageIterator(); - while (iterator.hasNext()) { - Page page = iterator.next(); - extractor.begin(page); - texts.add(extractor.getAsText()); - } - System.out.println("File:" + getTemporaryDirectory() + "/" + fileName + ".pdf"); - return String.join("\n", texts); + try (var fileStream = pdfFileResource.getInputStream()) { + storageService.storeObject(originId, fileStream); } + try (var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) { + ocrService.runOcrOnDocument("dossier", "file", out); + } + + System.out.println("File:" + getTemporaryDirectory() + "/" + fileName + ".pdf"); + try (var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) { + return extractAllTextFromDocument(fileStream); + } + } + + + private static String extractAllTextFromDocument(InputStream fileStream) throws IOException, PDFNetException { + + PDFDoc pdfDoc = new PDFDoc(fileStream); + TextExtractor extractor = new TextExtractor(); + List texts = new ArrayList<>(); + + PageIterator iterator = pdfDoc.getPageIterator(); + while (iterator.hasNext()) { + Page page = iterator.next(); + extractor.begin(page); + texts.add(extractor.getAsText()); + } + + return String.join("\n", texts); } @SneakyThrows public void dummyTest() { - // Build needs one text to not fail. + // Build needs one test to not fail. assertThat(1).isEqualTo(1); } @@ -200,7 +212,7 @@ public class OcrServiceIntegrationTest { @Bean @Primary - public StorageService inmemoryStorage() { + public StorageService inMemoryStorage() { return new FileSystemBackedStorageService(); } diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/ImagePositionRetrievalServiceTest.java b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/ImagePositionRetrievalServiceTest.java index 9783979..d4eb08e 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/ImagePositionRetrievalServiceTest.java +++ b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/ImagePositionRetrievalServiceTest.java @@ -122,6 +122,14 @@ class ImagePositionRetrievalServiceTest { assertThat(allRectCoords.size()).isEqualTo(48); } + @Test + @SneakyThrows + public void testEveryPointInDashedLineIsImage() { + String fileName = "everyPointInDashedLineIsImage"; + List allRectCoords = testImagePositionDetection(fileName); + assertThat(allRectCoords.size()).isEqualTo(0); + } + private List testImagePositionDetection(String fileName) throws IOException, PDFNetException { diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalServiceTest.java b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalServiceTest.java index fbe81a3..9aa1f26 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalServiceTest.java +++ b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalServiceTest.java @@ -3,9 +3,8 @@ package com.iqser.red.service.ocr.v1.server.service; import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory; import static org.assertj.core.api.AssertionsForClassTypes.assertThat; +import java.io.FileInputStream; import java.io.FileOutputStream; -import java.io.IOException; -import java.nio.file.Files; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; @@ -46,22 +45,23 @@ public class InvisibleElementRemovalServiceTest { ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf"); - var initialFileStream = Files.newInputStream(pdfFileResource.getFile().toPath()); - var fileWithoutInvisibleElements = invisibleElementRemovalService.removeInvisibleElements(initialFileStream, false); + try (var initialFileStream = pdfFileResource.getInputStream(); var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) { + invisibleElementRemovalService.removeInvisibleElements(initialFileStream, out,false); + } - initialFileStream = Files.newInputStream(pdfFileResource.getFile().toPath()); - var deltaFile = invisibleElementRemovalService.removeInvisibleElements(initialFileStream, true); + try (var initialFileStream = pdfFileResource.getInputStream(); var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + "_delta.pdf")) { + invisibleElementRemovalService.removeInvisibleElements(initialFileStream, out,true); + } - String fileWithoutInvisibleTextLocation = getTemporaryDirectory() + "/" + fileName + ".pdf"; - String deltaFileLocation = getTemporaryDirectory() + "/" + fileName + "_delta.pdf"; - - saveToFile(fileWithoutInvisibleTextLocation, fileWithoutInvisibleElements); - saveToFile(deltaFileLocation, deltaFile); - - System.out.println("Output File without invisible elements: " + fileWithoutInvisibleTextLocation); - System.out.println("Output Delta File: " + deltaFileLocation); + System.out.println("Output File without invisible elements: files/" + fileName + ".pdf"); + System.out.println("Output Delta File without invisible elements: files/" + fileName + "_delta.pdf"); TextExtractor extractor = new TextExtractor(); - PDFDoc pdfDoc = new PDFDoc(fileWithoutInvisibleElements); + + PDFDoc pdfDoc; + try(var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) { + pdfDoc = new PDFDoc(fileStream); + } + PageIterator iterator = pdfDoc.getPageIterator(); while (iterator.hasNext()) { Page page = iterator.next(); @@ -70,16 +70,4 @@ public class InvisibleElementRemovalServiceTest { assertThat(text).contains("APPENDIX 16 Pathology Report", "Amendment 1", "Page 255 of 260"); } } - - - private void saveToFile(String location, byte[] fileBytes) { - - try (var f_out = new FileOutputStream(location)) { - f_out.write(fileBytes); - } catch (IOException e) { - throw new RuntimeException("File location: " + location + "could not be openend, no file will be saved"); - } - - } - } \ No newline at end of file diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/everyPointInDashedLineIsImage.pdf b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/everyPointInDashedLineIsImage.pdf new file mode 100644 index 0000000..8ee74c9 Binary files /dev/null and b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/everyPointInDashedLineIsImage.pdf differ diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/resources/log4j2-test.xml b/ocr-service-v1/ocr-service-server-v1/src/test/resources/log4j2-test.xml new file mode 100644 index 0000000..b4895cf --- /dev/null +++ b/ocr-service-v1/ocr-service-server-v1/src/test/resources/log4j2-test.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + +