diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/ImagePositionRetrievalService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/ImagePositionRetrievalService.java new file mode 100644 index 0000000..9f32d3c --- /dev/null +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/ImagePositionRetrievalService.java @@ -0,0 +1,177 @@ +package com.iqser.red.service.ocr.v1.server.service; + +import java.awt.geom.Point2D; +import java.awt.geom.Rectangle2D; +import java.awt.geom.RectangularShape; +import java.util.Comparator; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; + +import org.springframework.stereotype.Service; + +import com.pdftron.common.Matrix2D; +import com.pdftron.common.PDFNetException; +import com.pdftron.pdf.Element; +import com.pdftron.pdf.ElementReader; +import com.pdftron.pdf.PDFDoc; +import com.pdftron.pdf.Page; +import com.pdftron.pdf.Rect; +import com.pdftron.pdf.RectCollection; + +import lombok.SneakyThrows; + +@Service +public class ImagePositionRetrievalService { + + private static final double TOLERANCE = 1e-1; + + + /** + * Iterates over all elements in a PDF Document and retrieves the bounding box for each image. + * Then it adjusts the bounding boxes for the page rotation. + * If the mirrorY flag is set, the Y Coordinates are mirrored and moved up by the page height. This is required for PDFTron's OCRModule. 
+ * + * @param pdfDoc a PDF File as PDFTron PDFDoc class + * @param mirrorY if this flag is set, all coordinates are calculated with upper left corner as (0,0), else initial user space + * @return a map with the page indices as keys and the image bounding boxes on that page as a RectCollection + */ + @SneakyThrows + public Map getImagePositionPerPage(PDFDoc pdfDoc, boolean mirrorY) { + + Map pageIdToImagePositions = new HashMap<>(); + ElementReader reader = new ElementReader(); + for (int pageId = 1; pageId <= pdfDoc.getPageCount(); ++pageId) { + RectCollection imagePositions = new RectCollection(); + reader.begin(pdfDoc.getPage(pageId)); + + findImagePositionsOnPage(reader, imagePositions, pdfDoc.getPage(pageId), mirrorY); + imagePositions = mergeOverlappingRects(imagePositions); + + reader.end(); + if (imagePositions.getNumRects() > 0) { + pageIdToImagePositions.put(pageId, imagePositions); + } + } + return pageIdToImagePositions; + } + + + private void findImagePositionsOnPage(ElementReader reader, RectCollection imagePositions, Page currentPage, boolean mirrorY) throws PDFNetException { + + Element element; + while ((element = reader.next()) != null) { + switch (element.getType()) { + case Element.e_image, Element.e_inline_image -> imagePositions.addRect(toRotationAdjustedRect(element.getBBox(), currentPage, mirrorY)); + case Element.e_form -> { + reader.formBegin(); + findImagePositionsOnPage(reader, imagePositions, currentPage, mirrorY); + reader.end(); + } + } + } + } + + + @SneakyThrows + public RectCollection mergeOverlappingRects(RectCollection imagePositions) { + + if (imagePositions.getNumRects() == 1) { + return imagePositions; + } + + List rectangleList = toSortedRectangleList(imagePositions); + + rectangleList = mergeRectangleListRecursive(rectangleList, 0); + return toRectCollection(rectangleList); + } + + + // Sometimes images are split up into stripes, here we merge the positions of aligned and intersecting rectangles into one larger rectangle + 
private List mergeRectangleListRecursive(List rectangleList, int currentIdx) { + + if (rectangleList.size() < currentIdx + 2) { + return rectangleList; + } + + var rect1 = rectangleList.get(currentIdx); + var rect2 = rectangleList.get(currentIdx + 1); + + boolean isAlignedX = Math.abs(rect1.getMinX() - rect2.getMinX()) < TOLERANCE && Math.abs(rect1.getMaxX() - rect2.getMaxX()) < TOLERANCE; + boolean isAlignedY = Math.abs(rect1.getMinY() - rect2.getMinY()) < TOLERANCE && Math.abs(rect1.getMaxY() - rect2.getMaxY()) < TOLERANCE; + boolean intersects = rect1.intersects(rect2.getMinX() - TOLERANCE, rect2.getMinY() - TOLERANCE, rect2.getWidth() + (2 * TOLERANCE), rect2.getHeight() + (2 * TOLERANCE)); + + if (intersects && (isAlignedX || isAlignedY)) { + rectangleList.remove(currentIdx + 1); + rectangleList.remove(currentIdx); + rectangleList.add(currentIdx, rect1.createUnion(rect2)); + return mergeRectangleListRecursive(rectangleList, currentIdx); + } else { + return mergeRectangleListRecursive(rectangleList, currentIdx + 1); + } + } + + + private Rect toRotationAdjustedRect(Rect bbox, Page page, boolean mirrorY) throws PDFNetException { + + int rotation = page.getRotation(); + double height = page.getPageHeight(); + double width = page.getPageWidth(); + + // Even though PDFTron almost always has the origin in the lower left corner, for some reason, the OCRModule's addTextZonesForPage() uses the upper left corner as origin... 
+ Matrix2D mirrorMatrix; + if (mirrorY) { + mirrorMatrix = new Matrix2D(1, 0, 0, -1, 0, height); + } else { + mirrorMatrix = new Matrix2D(); + } + + // We need to rotate the rects to fit to the page rotation + Matrix2D rotationMatrix = switch (rotation) { + case 1 -> new Matrix2D(0, -1, 1, 0, 0, height); + case 2 -> new Matrix2D(-1, 0, 0, -1, width, height); + case 3 -> new Matrix2D(0, 1, -1, 0, width, 0); + default -> new Matrix2D(); + }; + + Matrix2D finalMatrix = mirrorMatrix.multiply(rotationMatrix); + + Point2D.Double p1 = finalMatrix.multPoint(bbox.getX1(), bbox.getY1()); + Point2D.Double p2 = finalMatrix.multPoint(bbox.getX2(), bbox.getY2()); + + // PDFTron Rect *needs* lower left and upper right coordinates to calculate width and height correctly, even though the documentation states otherwise + Point2D.Double lowerLeft = new Point2D.Double(Math.min(p1.x, p2.x), Math.min(p1.y, p2.y)); + Point2D.Double upperRight = new Point2D.Double(Math.max(p1.x, p2.x), Math.max(p1.y, p2.y)); + + return new Rect(lowerLeft.x, lowerLeft.y, upperRight.x, upperRight.y); + } + + + private RectCollection toRectCollection(List rectangleList) { + + RectCollection rectCollection = new RectCollection(); + rectangleList.forEach(r -> { + try { + rectCollection.addRect(new Rect(r.getMinX(), r.getMinY(), r.getMaxX(), r.getMaxY())); + } catch (PDFNetException e) { + throw new RuntimeException(e); + } + }); + return rectCollection; + } + + + @SneakyThrows + private List toSortedRectangleList(RectCollection rectCollection) { + + List list = new LinkedList<>(); + for (int i = 0; i < rectCollection.getNumRects(); ++i) { + Rect r = rectCollection.getRectAt(i); + list.add(new Rectangle2D.Double(r.getX1(), r.getY1(), r.getWidth(), r.getHeight())); + } + list.sort(Comparator.comparingDouble(RectangularShape::getMinY).thenComparing(RectangularShape::getMinX)); + return list; + } + +} diff --git 
a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java index a473a10..0ca60ee 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java @@ -423,12 +423,13 @@ public class InvisibleElementRemovalService { @Builder - private record InvisibleElementRemovalContext(boolean delta, // - ElementReader reader, // - ClippingPathStack clippingPathStack, // - List overlappedElements, // - List visibleElements, // - Set visitedXObjIds) { + private record InvisibleElementRemovalContext( + boolean delta, + ElementReader reader, + ClippingPathStack clippingPathStack, + List overlappedElements, + List visibleElements, + Set visitedXObjIds) { } diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java index 964a507..49e7f33 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java @@ -1,12 +1,7 @@ package com.iqser.red.service.ocr.v1.server.service; import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; import java.io.InputStream; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; import java.util.Map; import org.springframework.amqp.rabbit.core.RabbitTemplate; @@ -15,11 +10,8 @@ import 
org.springframework.stereotype.Service; import com.fasterxml.jackson.databind.ObjectMapper; import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse; import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration; -import com.iqser.red.service.ocr.v1.server.model.ImagePosition; -import com.iqser.red.service.ocr.v1.server.model.image.ImageServiceResponse; import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings; -import com.iqser.red.service.redaction.v1.model.Point; -import com.iqser.red.service.redaction.v1.model.Rectangle; +import com.pdftron.common.PDFNetException; import com.pdftron.pdf.OCRModule; import com.pdftron.pdf.OCROptions; import com.pdftron.pdf.Optimizer; @@ -48,118 +40,96 @@ public class OCRService { private final InvisibleElementRemovalService invisibleElementRemovalService; + private final ImagePositionRetrievalService imagePositionRetrievalService; + + /** + * First loads the PDF Document from storage. + * Then removes all invisible Elements from the PDF; see InvisibleElementRemovalService for details. + * Then gets Image Position Information; see ImagePositionRetrievalService for details. + * Then runs OCR page by page, exclusively on pages which have images on them. It does so by creating a new PDFDoc and inserting a single page at a time. + * This is because PDFTron OCROptions overlays all regions where OCR should not be run with white images. It does not check for empty pages. + * For Documents with many pages but few Images this results in major performance improvements. + * It then re-adds the OCRed Pages to the original document and saves it. 
+ * + * @param dossierId The dossier id + * @param fileId The file id + * @return the resulting PDF file as an InputStream + */ @SneakyThrows - public InputStream ocrDocument(String dossierId, String fileId) { + public InputStream runOcrOnDocument(String dossierId, String fileId) { InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId); - ImageServiceResponse imageServiceResponse = fileStorageService.getImageServiceResponse(dossierId, fileId); - byte[] fileWithoutInvisibleTextStream = invisibleElementRemovalService.removeInvisibleElements(fileStream, false); + byte[] fileWithoutInvisibleTextBytes = invisibleElementRemovalService.removeInvisibleElements(fileStream, false); - byte[] ocrBytes = ocr(fileWithoutInvisibleTextStream, fileId, imageServiceResponse); + byte[] ocrBytes = runOcr(fileWithoutInvisibleTextBytes, fileId); return new ByteArrayInputStream(ocrBytes); } - private byte[] ocr(byte[] file, String fileId, ImageServiceResponse imageServiceResponse) { - PDFDoc pdfDoc = null; - try (ByteArrayOutputStream out = new ByteArrayOutputStream()) { - pdfDoc = new PDFDoc(file); - - Map> pages = new HashMap<>(); - - imageServiceResponse.getData() - .forEach(imageMetadata -> pages.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>()) - .add(new ImagePosition(new Rectangle(new Point(imageMetadata.getPosition().getX1(), imageMetadata.getPosition().getY1()), - imageMetadata.getGeometry().getWidth(), - imageMetadata.getGeometry().getHeight(), - imageMetadata.getPosition().getPageNumber()), imageMetadata.isAlpha()))); - - Map pdfDocMap = Collections.synchronizedMap(new HashMap<>()); - - rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE, - objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder().fileId(fileId).numberOfPagesToOCR(pages.keySet().size()).build())); - - ocrPages(pdfDoc, fileId, pages, pdfDocMap); - - Optimizer.optimize(pdfDoc); - pdfDoc.save(out, 
SDFDoc.SaveMode.LINEARIZED, null); - pdfDoc.close(); - - rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE, - objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder() - .fileId(fileId) - .numberOfPagesToOCR(pages.keySet().size()) - .numberOfOCRedPages(pages.keySet().size()) - .ocrFinished(true) - .build())); - - return out.toByteArray(); - } catch (Exception e) { - throw new RuntimeException(e); - } finally { - if (pdfDoc != null) { - try { - pdfDoc.close(); - } catch (Exception e) { - log.debug("Failed to close document", e); - } - } - } - } - - @SneakyThrows - private void ocrPages(PDFDoc pdfDoc, String fileId, Map> pages, Map pdfDocMap) { + private byte[] runOcr(byte[] file, String fileId) { - int numberOfOCRedPages = 0; - for (var pageEntry : pages.entrySet()) { + PDFDoc pdfDoc = new PDFDoc(file); + Map pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, true); + + OCROptions options = new OCROptions(); + PDFDoc ocrPageDoc = new PDFDoc(); + int numProcessedPages = 0; + for (Integer pageId : pageIdToRectCollection.keySet()) { try { - RectCollection rectCollection = new RectCollection(); - - var page = pageEntry.getKey(); - - Page pdfPage = pdfDoc.getPageIterator(page).next(); - - pdfPage.setMediaBox(pdfPage.getCropBox()); - - for (ImagePosition imagePosition : pageEntry.getValue()) { - Rectangle rectangle = imagePosition.getRectangle(); - - // Warning coordinate system is different in this call macOs/Linux - double y = -rectangle.getTopLeft().getY() + pdfPage.getCropBox().getY2() - rectangle.getHeight(); - rectCollection.addRect(rectangle.getTopLeft().getX(), y, rectangle.getTopLeft().getX() + rectangle.getWidth(), y + rectangle.getHeight()); - } - - PDFDoc ocrDoc = new PDFDoc(); - ocrDoc.pagePushBack(pdfPage); - pdfDocMap.put(pageEntry.getKey(), ocrDoc); - - OCROptions options = new OCROptions(); - options.addTextZonesForPage(rectCollection, 1); + // optimization by only scanning 
pages that contain images + Page pdfPage = pdfDoc.getPage(pageId); + pdfPage.setMediaBox(pdfPage.getCropBox()); // this line ensures the ocr text is placed correctly by PDFTron + ocrPageDoc.pagePushBack(pdfPage); + options.addTextZonesForPage(pageIdToRectCollection.get(pageId), 1); options.addLang(ENGLISH); options.addDPI(settings.getOcrDPI()); - OCRModule.processPDF(ocrDoc, options); - rectCollection.clear(); + OCRModule.processPDF(ocrPageDoc, options); + ++numProcessedPages; - } catch (Exception e) { - log.warn("Failed to process PDF page {}", pageEntry.getKey()); + StringBuilder zonesString = new StringBuilder(); + for (int j = 0; j < pageIdToRectCollection.get(pageId).getNumRects(); ++j) { + var r = pageIdToRectCollection.get(pageId).getRectAt(j); + zonesString.append(String.format("[lower left (%.1f|%.1f) upper right (%.1f|%.1f)]", r.getX1(), r.getY1(), r.getX2(), r.getY2())); + } + log.info("{}/{} Page {} done, OCR regions {}", numProcessedPages, pageIdToRectCollection.size(), pageId, zonesString); + + // re-adding OCR pages + Page ocrPage = ocrPageDoc.getPage(1); + pdfDoc.pageInsert(pdfDoc.getPageIterator(pageId), ocrPage); + pdfDoc.pageRemove(pdfDoc.getPageIterator(pageId + 1)); + ocrPageDoc.pageRemove(ocrPageDoc.getPageIterator(1)); + + rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE, + objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder() + .fileId(fileId) + .numberOfPagesToOCR(pageIdToRectCollection.size()) + .numberOfOCRedPages(numProcessedPages) + .build())); + + } catch (PDFNetException e) { + log.error("failed to process page {}", pageId); + throw new RuntimeException(e); } - - rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE, - objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder() - .fileId(fileId) - .numberOfPagesToOCR(pages.keySet().size()) - .numberOfOCRedPages(++numberOfOCRedPages) - .build())); - - log.warn("Done page {}", pageEntry); - } + + 
ocrPageDoc.close(); + + rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE, + objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder() + .fileId(fileId) + .numberOfPagesToOCR(pageIdToRectCollection.size()) + .numberOfOCRedPages(numProcessedPages) + .ocrFinished(true) + .build())); + + Optimizer.optimize(pdfDoc); + return pdfDoc.save(SDFDoc.SaveMode.LINEARIZED, null); } } diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OcrMessageReceiver.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OcrMessageReceiver.java index 772ffb8..8c28d3a 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OcrMessageReceiver.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OcrMessageReceiver.java @@ -44,7 +44,7 @@ public class OcrMessageReceiver { fileStorageService.storeUntouchedFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), originalFile); } - var ocrResult = ocrService.ocrDocument(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId()); + var ocrResult = ocrService.runOcrOnDocument(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId()); fileStorageService.storeOriginalFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), ocrResult); diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java index 6293a62..f93651c 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java +++ b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java @@ -3,11 +3,13 
@@ package com.iqser.red.service.ocr.v1.server; import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory; import static org.assertj.core.api.Assertions.assertThat; -import java.io.File; +import java.io.FileOutputStream; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.List; -import org.apache.commons.io.FileUtils; -import org.apache.commons.io.IOUtils; import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; @@ -25,12 +27,17 @@ import org.springframework.core.io.ClassPathResource; import org.springframework.test.context.junit.jupiter.SpringExtension; import com.fasterxml.jackson.databind.ObjectMapper; -import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService; -import com.iqser.red.service.ocr.v1.server.service.OCRService; import com.iqser.red.service.ocr.v1.server.service.FileStorageService; +import com.iqser.red.service.ocr.v1.server.service.OCRService; +import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService; import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType; import com.iqser.red.storage.commons.StorageAutoConfiguration; import com.iqser.red.storage.commons.service.StorageService; +import com.pdftron.pdf.OCRModule; +import com.pdftron.pdf.PDFDoc; +import com.pdftron.pdf.Page; +import com.pdftron.pdf.PageIterator; +import com.pdftron.pdf.TextExtractor; import lombok.SneakyThrows; @@ -56,28 +63,96 @@ public class OcrServiceIntegrationTest { private OCRService ocrService; + @BeforeEach + @SneakyThrows + @Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top. + public void assertOCRModuleIsLoaded() { + + assert OCRModule.isModuleAvailable(); + } + + @Test @Disabled // OCRModule is not available on build server. 
If you want to run the test set the property at the top. + public void testOcr() { + // check visually for most of the images containing text, the resulting text is kind of nonsense, just ensure it is there + String text = testOCR("StitchedImagesMultiPage"); + } + + + @Test + @Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top. + public void testManyRotatedImages() { + // check visually for most of the images containing text, the resulting text is kind of nonsense, just ensure it is there + String text = testOCR("manyRotatedImages"); + assertThat(text).contains("Michela", "Gregori", "DVM", "PhD", "Pathologist"); + } + + + @Test + @Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top. + public void testMergeImages() { + // check visually for most of the images containing text, the resulting text is kind of nonsense, just ensure it is there + String text = testOCR("merge_images"); + assertThat(text).contains("Bodyweight change of dams with live young - group mean values", + "Control", + "mg/g day", + "10 mg/kg/day", + "20 mg/kg/", + "Days", + "50", + "-200", + "—250", + "150", + "200", + "250", + "—150"); + } + + + @Test + @Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top. + public void testOCRWatermark() { + + assertThat(testOCR("Watermark")).contains("syngenta"); + } + + + @Test + @Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top. 
+ public void testOCRInvisibleText() { + + String text = testOCR("InvisibleText"); + assertThat(text).contains("Michela", "Gregori", "DVM", "PhD", "Pathologist", "SIGNATURE PAGE"); + assertThat(text).doesNotContain("COMPLETION DATE:", "LABORATORY PROJECT ID:", "AUTHOR(S):", "Substance"); + } + + @SneakyThrows - public void testOCR() { + private String testOCR(String fileName) { - String fileName = "Watermark"; - - ClassPathResource imageInfoResource = new ClassPathResource("files/" + fileName + ".IMAGE_INFO.json"); ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf"); var originId = FileStorageService.getStorageId("dossier", "file", FileType.ORIGIN); storageService.storeObject(originId, pdfFileResource.getInputStream()); - var imageId = FileStorageService.getStorageId("dossier", "file", FileType.IMAGE_INFO); - storageService.storeObject(imageId, imageInfoResource.getInputStream()); - - var response = ocrService.ocrDocument("dossier", "file"); - - var out = FileUtils.openOutputStream(new File(getTemporaryDirectory() + "/" + fileName + ".pdf")); - IOUtils.copy(response, out); - - System.out.println("File:" + getTemporaryDirectory() + "/" + fileName + ".pdf"); + try (InputStream ocrDocument = ocrService.runOcrOnDocument("dossier", "file")) { + byte[] ocrDocumentBytes = ocrDocument.readAllBytes(); + try (var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) { + out.write(ocrDocumentBytes); + } + TextExtractor extractor = new TextExtractor(); + List texts = new ArrayList<>(); + PDFDoc pdfDoc = new PDFDoc(ocrDocumentBytes); + PageIterator iterator = pdfDoc.getPageIterator(); + while (iterator.hasNext()) { + Page page = iterator.next(); + extractor.begin(page); + texts.add(extractor.getAsText()); + } + System.out.println("File:" + getTemporaryDirectory() + "/" + fileName + ".pdf"); + return String.join("\n", texts); + } } diff --git 
a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/ImagePositionRetrievalServiceTest.java b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/ImagePositionRetrievalServiceTest.java new file mode 100644 index 0000000..9783979 --- /dev/null +++ b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/ImagePositionRetrievalServiceTest.java @@ -0,0 +1,228 @@ +package com.iqser.red.service.ocr.v1.server.service; + +import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.springframework.amqp.rabbit.core.RabbitTemplate; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.boot.test.mock.mockito.MockBean; +import org.springframework.context.annotation.Import; +import org.springframework.core.io.ClassPathResource; +import org.springframework.test.context.junit.jupiter.SpringExtension; + +import com.iqser.red.service.ocr.v1.server.Application; +import com.iqser.red.service.ocr.v1.server.OcrServiceIntegrationTest; +import com.pdftron.common.PDFNetException; +import com.pdftron.pdf.ColorPt; +import com.pdftron.pdf.ColorSpace; +import com.pdftron.pdf.Element; +import com.pdftron.pdf.ElementBuilder; +import com.pdftron.pdf.ElementWriter; +import com.pdftron.pdf.PDFDoc; +import com.pdftron.pdf.Page; +import com.pdftron.pdf.Rect; +import com.pdftron.pdf.RectCollection; +import com.pdftron.sdf.SDFDoc; + +import 
lombok.SneakyThrows; + +@ExtendWith(SpringExtension.class) +@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT) +@Import(OcrServiceIntegrationTest.TestConfiguration.class) +class ImagePositionRetrievalServiceTest { + + @Autowired + private ImagePositionRetrievalService imagePositionRetrievalService; + + @MockBean + protected RabbitTemplate rabbitTemplate; + + + @Test + @SneakyThrows + public void testImagePositionRetrievalForRotateTestFileWithImages() { + + String fileName = "RotateTestFileWithImages"; + + List allRectCoords = testImagePositionDetection(fileName); + + assertThat(allRectCoords).contains(new int[]{48, 572, 295, 721}, + new int[]{54, 279, 301, 428}, + new int[]{360, 173, 509, 419}, + new int[]{362, 522, 511, 768}, + new int[]{459, 354, 608, 600}, + new int[]{145, 404, 392, 553}, + new int[]{151, 111, 398, 260}, + new int[]{457, 5, 606, 251}, + new int[]{395, 480, 545, 726}, + new int[]{393, 130, 542, 377}, + new int[]{88, 236, 334, 386}, + new int[]{82, 530, 328, 679}, + new int[]{465, 11, 614, 257}, + new int[]{159, 117, 406, 266}, + new int[]{467, 360, 617, 607}, + new int[]{153, 410, 400, 559}); + } + + + @Test + @SneakyThrows + public void testImagePositionRetrievalForRotateTestFileWithImagesExtremeCropbox() { + + String fileName = "RotateTestFileWithImagesExtremeCropbox"; + + List allRectCoords = testImagePositionDetection(fileName); + assertThat(allRectCoords).contains(new int[]{48, 572, 295, 721}, + new int[]{362, 522, 511, 768}, + new int[]{360, 173, 509, 419}, + new int[]{54, 279, 301, 428}, + new int[]{145, 192, 392, 341}, + new int[]{459, 142, 608, 388}, + new int[]{457, -207, 606, 39}, + new int[]{151, -101, 398, 48}, + new int[]{-30, 238, 216, 387}, + new int[]{283, 188, 433, 434}, + new int[]{281, -162, 430, 85}, + new int[]{-24, -56, 222, 94}, + new int[]{-39, 410, 208, 559}, + new int[]{275, 360, 425, 607}, + new int[]{273, 11, 422, 257}, + new int[]{-33, 117, 214, 266}); + } + + + 
@Test + @SneakyThrows + public void testMergeImages() { + + String fileName = "merge_images"; + List allRectCoords = testImagePositionDetection(fileName); + assertThat(allRectCoords).contains(new int[]{90, 284, 398, 770}); + } + + + @Test + @SneakyThrows + public void testStitchedImagesMultiPage() { + + String fileName = "StitchedImagesMultiPage"; + List allRectCoords = testImagePositionDetection(fileName); + assertThat(allRectCoords.size()).isEqualTo(48); + } + + + private List testImagePositionDetection(String fileName) throws IOException, PDFNetException { + + InputStream fileStream = new FileInputStream(new ClassPathResource("files/" + fileName + ".pdf").getFile().getAbsolutePath()); + PDFDoc pdfDoc = new PDFDoc(fileStream); + + Map pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, false); + + ElementWriter writer = new ElementWriter(); + pageIdToRectCollection.forEach((pageId, rectCollection) -> { + try { + writer.begin(pdfDoc.getPage(pageId)); + drawRectCollection(writer, rectCollection, pdfDoc.getPage(pageId)); + drawGrid(writer, pdfDoc.getPage(pageId)); + writer.end(); + StringBuilder zonesString = new StringBuilder(); + for (int j = 0; j < rectCollection.getNumRects(); ++j) { + var r = rectCollection.getRectAt(j); + zonesString.append(String.format("%d:[lower left (%.1f|%.1f) upper right (%.1f|%.1f)]; ", j, r.getX1(), r.getY1(), r.getX2(), r.getY2())); + } + System.out.printf("Page %d: rotation: %d OCR regions %s \n", pageId, pdfDoc.getPage(pageId).getRotation(), zonesString); + } catch (PDFNetException e) { + throw new RuntimeException(e); + } + }); + + // Check visually for red Rectangles to match images in the saved pdf file + try (var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + "_IMAGE_BBOX.pdf")) { + out.write(pdfDoc.save(SDFDoc.SaveMode.LINEARIZED, null)); + } + System.out.println("result file: " + getTemporaryDirectory() + "/" + fileName + "_IMAGE_BBOX.pdf"); + fileStream.close(); + // 
round all coords to nearest int to account for inconsistencies with the calculation of the bounding box + return pageIdToRectCollection.values().stream().map(this::toRoundedCoordinateArrayList).flatMap(List::stream).collect(Collectors.toList()); + } + + + @SneakyThrows + private List toRoundedCoordinateArrayList(RectCollection rectCollection) { + + List coords = new ArrayList<>(rectCollection.getNumRects()); + for (int i = 0; i < rectCollection.getNumRects(); ++i) { + var r = rectCollection.getRectAt(i); + coords.add(new int[]{(int) Math.round(r.getX1()), (int) Math.round(r.getY1()), (int) Math.round(r.getX2()), (int) Math.round(r.getY2())}); + } + return coords; + } + + + @SneakyThrows + private void drawGrid(ElementWriter writer, Page page) { + + ElementBuilder eb = new ElementBuilder(); + double dX = 15; + double dY = 15; + int nRows = (int) (page.getPageHeight() / dY) + 1; + int nCols = (int) (page.getPageWidth() / dX) + 1; + for (int row = 0; row < nRows; ++row) { + for (int col = 0; col < nCols; ++col) { + Element cell = eb.createRect(col * dX, row * dY, dX, dY); + cell.setPathStroke(true); + cell.getGState().setLineWidth(1); + cell.getGState().setStrokeOpacity(0.1); + cell.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB()); + if (row == 0 && col == 0) { + cell.getGState().setStrokeColor(new ColorPt(0, 0, 1)); + cell.setPathFill(true); + cell.getGState().setFillOpacity(0.8); + cell.getGState().setFillColorSpace(ColorSpace.createDeviceRGB()); + cell.getGState().setFillColor(new ColorPt(0, 0, 1)); + } else { + cell.setPathFill(false); + cell.getGState().setStrokeColor(new ColorPt(0.1, 0.1, 0.1)); + } + writer.writePlacedElement(cell); + } + } + } + + + @SneakyThrows + public static void drawRectCollection(ElementWriter writer, RectCollection rectCollection, Page page) { + + ColorPt colorPt = new ColorPt(1, 0, 0); + ElementBuilder eb = new ElementBuilder(); + for (int i = 0; i < rectCollection.getNumRects(); ++i) { + Rect r = 
rectCollection.getRectAt(i); + Element rect = eb.createRect(r.getX1(), r.getY1(), r.getWidth(), r.getHeight()); + + rect.setPathStroke(true); + rect.getGState().setLineWidth(5); + rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB()); + rect.getGState().setStrokeColor(colorPt); + + rect.setPathFill(true); + rect.getGState().setFillColorSpace(ColorSpace.createDeviceRGB()); + rect.getGState().setFillColor(colorPt); + rect.getGState().setFillOpacity(0.5); + + writer.writePlacedElement(rect); + } + } + +} \ No newline at end of file diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/InvisibleElementRemovalServiceTest.java b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalServiceTest.java similarity index 93% rename from ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/InvisibleElementRemovalServiceTest.java rename to ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalServiceTest.java index 8517775..fbe81a3 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/InvisibleElementRemovalServiceTest.java +++ b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalServiceTest.java @@ -1,4 +1,4 @@ -package com.iqser.red.service.ocr.v1.server; +package com.iqser.red.service.ocr.v1.server.service; import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory; import static org.assertj.core.api.AssertionsForClassTypes.assertThat; @@ -17,7 +17,8 @@ import org.springframework.context.annotation.Import; import org.springframework.core.io.ClassPathResource; import org.springframework.test.context.junit.jupiter.SpringExtension; -import com.iqser.red.service.ocr.v1.server.service.InvisibleElementRemovalService; +import 
com.iqser.red.service.ocr.v1.server.Application; +import com.iqser.red.service.ocr.v1.server.OcrServiceIntegrationTest; import com.pdftron.pdf.PDFDoc; import com.pdftron.pdf.Page; import com.pdftron.pdf.PageIterator; @@ -26,8 +27,7 @@ import com.pdftron.pdf.TextExtractor; import lombok.SneakyThrows; @ExtendWith(SpringExtension.class) -@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT // - , properties = {"pdftron.ocrmodule.path=/YourOCRModulePath"}) +@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT) @Import(OcrServiceIntegrationTest.TestConfiguration.class) public class InvisibleElementRemovalServiceTest { diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/ComplexLayeredElements.IMAGE_INFO.json b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/ComplexLayeredElements.IMAGE_INFO.json deleted file mode 100644 index 940d444..0000000 --- a/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/ComplexLayeredElements.IMAGE_INFO.json +++ /dev/null @@ -1 +0,0 @@ -{"dossierId": "c8553cbd-409f-4e1a-baf4-34b11d49deac", "fileId": "bd6f93ed896dd0e2f641b0568f13ddf1", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": [{"classification": {"label": "other", "probabilities": {"other": 0.9999, "logo": 0.0001, "formula": 0.0, "signature": 0.0}}, "representation": "FFFFFEFBF7EFCFFFFFFFFFFFF", "position": {"x1": -3, "x2": 795, "y1": 0, "y2": 612, "pageNumber": 1}, "geometry": {"width": 798, "height": 612}, "alpha": false, "filters": {"geometry": {"imageSize": {"quotient": 1.0038, "tooLarge": true, "tooSmall": false}, "imageFormat": {"quotient": 1.3039, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "logo", "probabilities": {"logo": 0.9966, "other": 0.0025, "signature": 0.0005, "formula": 0.0003}}, 
"representation": "FFC33D3C323CCF3390C1F8C72", "position": {"x1": 120, "x2": 131, "y1": 264, "y2": 380, "pageNumber": 1}, "geometry": {"width": 11, "height": 116}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0513, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.0948, "tooTall": true, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "other", "probabilities": {"other": 1.0, "formula": 0.0, "logo": 0.0, "signature": 0.0}}, "representation": "EF8FF6381060800318F0E187", "position": {"x1": 152, "x2": 205, "y1": 115, "y2": 533, "pageNumber": 1}, "geometry": {"width": 53, "height": 418}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.2138, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.1268, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": true}}, {"classification": {"label": "other", "probabilities": {"other": 0.5967, "logo": 0.1756, "signature": 0.1218, "formula": 0.106}}, "representation": "EC9377C9170E1070C3070C30F", "position": {"x1": 196, "x2": 221, "y1": 121, "y2": 245, "pageNumber": 1}, "geometry": {"width": 25, "height": 124}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.08, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.2016, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": true}}, {"classification": {"label": "logo", "probabilities": {"logo": 1.0, "formula": 0.0, "other": 0.0, "signature": 0.0}}, "representation": "CF1F1F70F140036860F1441B5", "position": {"x1": 707, "x2": 718, "y1": 481, "y2": 531, "pageNumber": 1}, "geometry": {"width": 11, "height": 50}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0337, "tooLarge": false, "tooSmall": true}, "imageFormat": {"quotient": 0.22, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": 
false}}, {"classification": {"label": "logo", "probabilities": {"logo": 0.9704, "other": 0.0223, "formula": 0.0044, "signature": 0.0029}}, "representation": "CF0F1C70F1C7090081F7CF073", "position": {"x1": 732, "x2": 744, "y1": 118, "y2": 241, "pageNumber": 1}, "geometry": {"width": 12, "height": 123}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0552, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.0976, "tooTall": true, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}], "dataCV": []} \ No newline at end of file diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/CropboxNotEqualToMediaBox.IMAGE_INFO.json b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/CropboxNotEqualToMediaBox.IMAGE_INFO.json deleted file mode 100644 index 7c13b44..0000000 --- a/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/CropboxNotEqualToMediaBox.IMAGE_INFO.json +++ /dev/null @@ -1 +0,0 @@ -{"dossierId": "55547c91-6b0e-4aa6-9009-2e7c4cd90f13", "fileId": "917b9d9c9f548f85fef3679db45ff46c", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": [{"classification": {"label": "signature", "probabilities": {"signature": 0.9927, "logo": 0.0038, "other": 0.0034, "formula": 0.0}}, "representation": "FFF2CF0F7C74FFC1070830FFF", "position": {"x1": -7, "x2": 603, "y1": 0, "y2": 852, "pageNumber": 1}, "geometry": {"width": 610, "height": 852}, "alpha": false, "filters": {"geometry": {"imageSize": {"quotient": 1.0096, "tooLarge": true, "tooSmall": false}, "imageFormat": {"quotient": 0.716, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}], "dataCV": []} \ No newline at end of file diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/InvisibleText.IMAGE_INFO.json b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/InvisibleText.IMAGE_INFO.json deleted file mode 
100644 index 940d444..0000000 --- a/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/InvisibleText.IMAGE_INFO.json +++ /dev/null @@ -1 +0,0 @@ -{"dossierId": "c8553cbd-409f-4e1a-baf4-34b11d49deac", "fileId": "bd6f93ed896dd0e2f641b0568f13ddf1", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": [{"classification": {"label": "other", "probabilities": {"other": 0.9999, "logo": 0.0001, "formula": 0.0, "signature": 0.0}}, "representation": "FFFFFEFBF7EFCFFFFFFFFFFFF", "position": {"x1": -3, "x2": 795, "y1": 0, "y2": 612, "pageNumber": 1}, "geometry": {"width": 798, "height": 612}, "alpha": false, "filters": {"geometry": {"imageSize": {"quotient": 1.0038, "tooLarge": true, "tooSmall": false}, "imageFormat": {"quotient": 1.3039, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "logo", "probabilities": {"logo": 0.9966, "other": 0.0025, "signature": 0.0005, "formula": 0.0003}}, "representation": "FFC33D3C323CCF3390C1F8C72", "position": {"x1": 120, "x2": 131, "y1": 264, "y2": 380, "pageNumber": 1}, "geometry": {"width": 11, "height": 116}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0513, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.0948, "tooTall": true, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "other", "probabilities": {"other": 1.0, "formula": 0.0, "logo": 0.0, "signature": 0.0}}, "representation": "EF8FF6381060800318F0E187", "position": {"x1": 152, "x2": 205, "y1": 115, "y2": 533, "pageNumber": 1}, "geometry": {"width": 53, "height": 418}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.2138, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.1268, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": true}}, {"classification": {"label": "other", 
"probabilities": {"other": 0.5967, "logo": 0.1756, "signature": 0.1218, "formula": 0.106}}, "representation": "EC9377C9170E1070C3070C30F", "position": {"x1": 196, "x2": 221, "y1": 121, "y2": 245, "pageNumber": 1}, "geometry": {"width": 25, "height": 124}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.08, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.2016, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": true}}, {"classification": {"label": "logo", "probabilities": {"logo": 1.0, "formula": 0.0, "other": 0.0, "signature": 0.0}}, "representation": "CF1F1F70F140036860F1441B5", "position": {"x1": 707, "x2": 718, "y1": 481, "y2": 531, "pageNumber": 1}, "geometry": {"width": 11, "height": 50}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0337, "tooLarge": false, "tooSmall": true}, "imageFormat": {"quotient": 0.22, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "logo", "probabilities": {"logo": 0.9704, "other": 0.0223, "formula": 0.0044, "signature": 0.0029}}, "representation": "CF0F1C70F1C7090081F7CF073", "position": {"x1": 732, "x2": 744, "y1": 118, "y2": 241, "pageNumber": 1}, "geometry": {"width": 12, "height": 123}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0552, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.0976, "tooTall": true, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}], "dataCV": []} \ No newline at end of file diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/MediaBoxBiggerThanCropBox.IMAGE_INFO.json b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/MediaBoxBiggerThanCropBox.IMAGE_INFO.json deleted file mode 100644 index ac6c30f..0000000 --- a/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/MediaBoxBiggerThanCropBox.IMAGE_INFO.json +++ 
/dev/null @@ -1 +0,0 @@ -{"dossierId": "55547c91-6b0e-4aa6-9009-2e7c4cd90f13", "fileId": "d7f1e0e37cba4e28ebdf894a79d3bd67", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": [{"classification": {"label": "signature", "probabilities": {"signature": 0.9872, "logo": 0.0064, "other": 0.0063, "formula": 0.0001}}, "representation": "FFFCF10608F6F89747BFFC301", "position": {"x1": -9, "x2": 584, "y1": 9, "y2": 849, "pageNumber": 1}, "geometry": {"width": 593, "height": 840}, "alpha": false, "filters": {"geometry": {"imageSize": {"quotient": 0.9992, "tooLarge": true, "tooSmall": false}, "imageFormat": {"quotient": 0.706, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}], "dataCV": []} \ No newline at end of file diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/RotateTestFileWithImages.pdf b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/RotateTestFileWithImages.pdf new file mode 100644 index 0000000..2b009d1 Binary files /dev/null and b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/RotateTestFileWithImages.pdf differ diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/RotateTestFileWithImagesExtremeCropbox.pdf b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/RotateTestFileWithImagesExtremeCropbox.pdf new file mode 100644 index 0000000..ced5d01 Binary files /dev/null and b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/RotateTestFileWithImagesExtremeCropbox.pdf differ diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/StitchedImagesMultiPage.pdf b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/StitchedImagesMultiPage.pdf new file mode 100644 index 0000000..7432246 Binary files /dev/null and b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/StitchedImagesMultiPage.pdf differ diff --git 
a/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/Watermark.IMAGE_INFO.json b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/Watermark.IMAGE_INFO.json deleted file mode 100644 index a4eb3b4..0000000 --- a/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/Watermark.IMAGE_INFO.json +++ /dev/null @@ -1 +0,0 @@ -{"dossierId": "55547c91-6b0e-4aa6-9009-2e7c4cd90f13", "fileId": "32b19ec38896f5105c09041def470c90", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": [{"classification": {"label": "logo", "probabilities": {"logo": 0.9999, "signature": 0.0001, "formula": 0.0, "other": 0.0}}, "representation": "307EF8F6E9833CE9D7AF9EFFF", "position": {"x1": 26, "x2": 586, "y1": -2, "y2": 794, "pageNumber": 1}, "geometry": {"width": 560, "height": 796}, "alpha": false, "filters": {"geometry": {"imageSize": {"quotient": 0.959, "tooLarge": true, "tooSmall": false}, "imageFormat": {"quotient": 0.7035, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "logo", "probabilities": {"logo": 1.0, "formula": 0.0, "other": 0.0, "signature": 0.0}}, "representation": "FFF7FFD2000000018F3FFEFFF", "position": {"x1": 90, "x2": 210, "y1": 676, "y2": 720, "pageNumber": 1}, "geometry": {"width": 120, "height": 44}, "alpha": false, "filters": {"geometry": {"imageSize": {"quotient": 0.1044, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 2.7273, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": true}}], "dataCV": []} \ No newline at end of file diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/manyRotatedImages.pdf b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/manyRotatedImages.pdf new file mode 100644 index 0000000..c91f1d3 Binary files /dev/null and b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/manyRotatedImages.pdf differ