diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/ImagePositionRetrievalService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/ImagePositionRetrievalService.java index 9f32d3c..a6c8a0e 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/ImagePositionRetrievalService.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/ImagePositionRetrievalService.java @@ -15,6 +15,7 @@ import com.pdftron.common.Matrix2D; import com.pdftron.common.PDFNetException; import com.pdftron.pdf.Element; import com.pdftron.pdf.ElementReader; +import com.pdftron.pdf.Image; import com.pdftron.pdf.PDFDoc; import com.pdftron.pdf.Page; import com.pdftron.pdf.Rect; @@ -27,9 +28,12 @@ public class ImagePositionRetrievalService { private static final double TOLERANCE = 1e-1; + // any image with smaller height and width than this gets thrown out, see everyPointInDashedLineIsImage.pdf + private static final int PIXEL_THRESHOLD = 10; + /** - * Iterates over all elements in a PDF Document and retrieves the bounding box for each image, + * Iterates over all elements in a PDF Document and retrieves the bounding box for each image, that is larger than the pixel threshold of 10 in either dimension. * Then it adjusts the bounding boxes for the page rotation. * If the mirrorY flag is set, the Y Coordinates are mirrored and moved up by the page height. This is required for PDFTrons OCRModule. * @@ -63,7 +67,13 @@ public class ImagePositionRetrievalService { Element element; while ((element = reader.next()) != null) { switch (element.getType()) { - case Element.e_image, Element.e_inline_image -> imagePositions.addRect(toRotationAdjustedRect(element.getBBox(), currentPage, mirrorY)); + case Element.e_image, Element.e_inline_image -> { + Image image = new Image(element.getXObject()); + // see everyPointInDashedLineIsImage.pdf TestFile + if (image.getImageHeight() > PIXEL_THRESHOLD || image.getImageWidth() > PIXEL_THRESHOLD) { + imagePositions.addRect(toRotationAdjustedRect(element.getBBox(), currentPage, mirrorY)); + } + } case Element.e_form -> { reader.formBegin(); findImagePositionsOnPage(reader, imagePositions, currentPage, mirrorY); @@ -77,38 +87,38 @@ public class ImagePositionRetrievalService { @SneakyThrows public RectCollection mergeOverlappingRects(RectCollection imagePositions) { - if (imagePositions.getNumRects() == 1) { + if (imagePositions.getNumRects() < 2) { return imagePositions; } List rectangleList = toSortedRectangleList(imagePositions); - rectangleList = mergeRectangleListRecursive(rectangleList, 0); + mergeRectangleList(rectangleList); + return toRectCollection(rectangleList); } // Sometimes images are split up into stripes, here we merge the positions of aligned and intersecting rectangles into one larger rectangle - private List mergeRectangleListRecursive(List rectangleList, int currentIdx) { + private void mergeRectangleList(List rectangleList) { + int idx = 0; - if (rectangleList.size() < currentIdx + 2) { - return rectangleList; - } + while (rectangleList.size() >= idx + 2) { - var rect1 = rectangleList.get(currentIdx); - var rect2 = rectangleList.get(currentIdx + 1); + var rect1 = rectangleList.get(idx); + var rect2 = rectangleList.get(idx + 1); - boolean isAlignedX = Math.abs(rect1.getMinX() - rect2.getMinX()) < TOLERANCE && Math.abs(rect1.getMaxX() - rect2.getMaxX()) < TOLERANCE; - boolean isAlignedY = Math.abs(rect1.getMinY() - rect2.getMinY()) < TOLERANCE && Math.abs(rect1.getMaxY() - rect2.getMaxY()) < TOLERANCE; - boolean intersects = rect1.intersects(rect2.getMinX() - TOLERANCE, rect2.getMinY() - TOLERANCE, rect2.getWidth() + (2 * TOLERANCE), rect2.getHeight() + (2 * TOLERANCE)); + boolean isAlignedX = Math.abs(rect1.getMinX() - rect2.getMinX()) < TOLERANCE && Math.abs(rect1.getMaxX() - rect2.getMaxX()) < TOLERANCE; + boolean isAlignedY = Math.abs(rect1.getMinY() - rect2.getMinY()) < TOLERANCE && Math.abs(rect1.getMaxY() - rect2.getMaxY()) < TOLERANCE; + boolean intersects = rect1.intersects(rect2.getMinX() - TOLERANCE, rect2.getMinY() - TOLERANCE, rect2.getWidth() + (2 * TOLERANCE), rect2.getHeight() + (2 * TOLERANCE)); - if (intersects && (isAlignedX || isAlignedY)) { - rectangleList.remove(currentIdx + 1); - rectangleList.remove(currentIdx); - rectangleList.add(currentIdx, rect1.createUnion(rect2)); - return mergeRectangleListRecursive(rectangleList, currentIdx); - } else { - return mergeRectangleListRecursive(rectangleList, currentIdx + 1); + if (intersects && (isAlignedX || isAlignedY)) { + rectangleList.remove(idx + 1); + rectangleList.remove(idx); + rectangleList.add(idx, rect1.createUnion(rect2)); + } else { + ++idx; + } } } diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java index a0005ae..c747a9d 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java @@ -76,6 +76,8 @@ public class InvisibleElementRemovalService { Page page = iterator.next(); visitedXObjIds.add(page.getSDFObj().getObjNum()); + + InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder() .reader(reader) .clippingPathStack(new ClippingPathStack(page.getMediaBox())) @@ -221,8 +223,14 @@ public class InvisibleElementRemovalService { private void processPath(Element pathElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { + PathData pathData = pathElement.getPathData(); - GeneralPath linePath = convertToGeneralPath(pathElement.getPathData()); + if (pathData.getOperators().length == 0 && pathData.getPoints().length == 0) { + writer.writeGStateChanges(pathElement); + return; + } + + GeneralPath linePath = convertToGeneralPath(pathData); //transform path to initial user space var ctm = pathElement.getCTM(); diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java index d7bc889..7901ca3 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java @@ -12,7 +12,6 @@ import java.util.Map; import org.springframework.amqp.rabbit.core.RabbitTemplate; import org.springframework.stereotype.Service; -import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse; import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration; @@ -28,6 +27,7 @@ import com.pdftron.sdf.SDFDoc; import io.micrometer.core.annotation.Timed; import lombok.RequiredArgsConstructor; +import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; @Slf4j @@ -68,10 +68,10 @@ public class OCRService { try (ByteArrayOutputStream transferOutputStream = new ByteArrayOutputStream()) { try (InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId)) { long start = System.currentTimeMillis(); - log.info("Start invisible element removal for file with dossierId {} and fileId {}", dossierId, fileId); + log.debug("Start invisible element removal for file with dossierId {} and fileId {}", dossierId, fileId); invisibleElementRemovalService.removeInvisibleElements(fileStream, transferOutputStream, false); long end = System.currentTimeMillis(); - log.info("Invisible element removal successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, format("%.1f", (end - start) / 1000.0)); + log.debug("Invisible element removal successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, format("%.1f", (end - start) / 1000.0)); } try (InputStream transferInputStream = new ByteArrayInputStream(transferOutputStream.toByteArray())) { long start = System.currentTimeMillis(); @@ -83,15 +83,10 @@ public class OCRService { } + @SneakyThrows private void runOcr(InputStream fileStream, OutputStream out, String fileId) { - PDFDoc pdfDoc; - try { - pdfDoc = new PDFDoc(fileStream); - } catch (Exception e) { - log.error("Couldn't parse file with fileId {} from InputStream ", fileId); - throw new RuntimeException(e); - } + PDFDoc pdfDoc = new PDFDoc(fileStream); Map pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, true); @@ -136,24 +131,16 @@ public class OCRService { } catch (PDFNetException e) { log.error("Failed to process page {}", pageId); throw new RuntimeException(e); - } catch (JsonProcessingException e) { - log.error("Failed to send \"processed\" message to rabbitMQ for file with fileID {} on OCR page {}/{}", fileId, numProcessedPages, pageIdToRectCollection.size()); - throw new RuntimeException(e); } } - try { - rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE, - objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder() - .fileId(fileId) - .numberOfPagesToOCR(pageIdToRectCollection.size()) - .numberOfOCRedPages(numProcessedPages) - .ocrFinished(true) - .build())); - } catch (JsonProcessingException e) { - log.error("Failed to send message to rabbitMQ for file with fileID {} on OCR page {}/{}", fileId, numProcessedPages, pageIdToRectCollection.size()); - throw new RuntimeException(e); - } + rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE, + objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder() + .fileId(fileId) + .numberOfPagesToOCR(pageIdToRectCollection.size()) + .numberOfOCRedPages(numProcessedPages) + .ocrFinished(true) + .build())); Optimizer.optimize(pdfDoc); try { diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/ImagePositionRetrievalServiceTest.java b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/ImagePositionRetrievalServiceTest.java index 9783979..d4eb08e 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/ImagePositionRetrievalServiceTest.java +++ b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/ImagePositionRetrievalServiceTest.java @@ -122,6 +122,14 @@ class ImagePositionRetrievalServiceTest { assertThat(allRectCoords.size()).isEqualTo(48); } + @Test + @SneakyThrows + public void testEveryPointInDashedLineIsImage() { + String fileName = "everyPointInDashedLineIsImage"; + List allRectCoords = testImagePositionDetection(fileName); + assertThat(allRectCoords.size()).isEqualTo(0); + } + private List testImagePositionDetection(String fileName) throws IOException, PDFNetException { diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/everyPointInDashedLineIsImage.pdf b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/everyPointInDashedLineIsImage.pdf new file mode 100644 index 0000000..8ee74c9 Binary files /dev/null and b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/everyPointInDashedLineIsImage.pdf differ