diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java
index 964a507..3a007ff 100644
--- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java
+++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java
@@ -20,12 +20,14 @@ import com.iqser.red.service.ocr.v1.server.model.image.ImageServiceResponse;
 import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings;
 import com.iqser.red.service.redaction.v1.model.Point;
 import com.iqser.red.service.redaction.v1.model.Rectangle;
+import com.pdftron.common.PDFNetException;
 import com.pdftron.pdf.OCRModule;
 import com.pdftron.pdf.OCROptions;
 import com.pdftron.pdf.Optimizer;
 import com.pdftron.pdf.PDFDoc;
 import com.pdftron.pdf.Page;
 import com.pdftron.pdf.RectCollection;
+import com.pdftron.pdf.TextExtractor;
 import com.pdftron.sdf.SDFDoc;
 
 import lombok.RequiredArgsConstructor;
@@ -55,35 +57,37 @@ public class OCRService {
         InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId);
         ImageServiceResponse imageServiceResponse = fileStorageService.getImageServiceResponse(dossierId, fileId);
 
-        byte[] fileWithoutInvisibleTextStream = invisibleElementRemovalService.removeInvisibleElements(fileStream, false);
+        byte[] fileWithoutInvisibleTextBytes = invisibleElementRemovalService.removeInvisibleElements(fileStream, false);
 
-        byte[] ocrBytes = ocr(fileWithoutInvisibleTextStream, fileId, imageServiceResponse);
+        byte[] ocrBytes = runOcrOnImages(fileWithoutInvisibleTextBytes, fileId, imageServiceResponse);
 
         return new ByteArrayInputStream(ocrBytes);
     }
 
 
-    private byte[] ocr(byte[] file, String fileId, ImageServiceResponse imageServiceResponse) {
+    private byte[] runOcrOnImages(byte[] file, String fileId, ImageServiceResponse imageServiceResponse) {
+
         PDFDoc pdfDoc = null;
         try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
             pdfDoc = new PDFDoc(file);
 
-            Map<Integer, List<ImagePosition>> pages = new HashMap<>();
+            Map<Integer, List<ImagePosition>> pageIdToImgPos = new HashMap<>();
             imageServiceResponse.getData()
-                    .forEach(imageMetadata -> pages.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
+                    .forEach(imageMetadata -> pageIdToImgPos.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
                             .add(new ImagePosition(new Rectangle(new Point(imageMetadata.getPosition().getX1(),
                                     imageMetadata.getPosition().getY1()),
                                     imageMetadata.getGeometry().getWidth(),
                                     imageMetadata.getGeometry().getHeight(),
                                     imageMetadata.getPosition().getPageNumber()),
                                     imageMetadata.isAlpha())));
 
-            Map<Integer, PDFDoc> pdfDocMap = Collections.synchronizedMap(new HashMap<>());
-
             rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
-                    objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder().fileId(fileId).numberOfPagesToOCR(pages.keySet().size()).build()));
+                    objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder().fileId(fileId).numberOfPagesToOCR(pageIdToImgPos.size()).build()));
 
-            ocrPages(pdfDoc, fileId, pages, pdfDocMap);
+            // the PDFDoc is a helper document, which contains exactly one page
+            Map<Integer, PDFDoc> pageIdToOcrPageMap = runOcrPerPage(pdfDoc, fileId, pageIdToImgPos);
+
+            addOCRPagesToDocIfAdditionalWordsFound(pdfDoc, pageIdToOcrPageMap);
 
             Optimizer.optimize(pdfDoc);
             pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
@@ -92,8 +96,8 @@ public class OCRService {
             rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
                     objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder()
                             .fileId(fileId)
-                            .numberOfPagesToOCR(pages.keySet().size())
-                            .numberOfOCRedPages(pages.keySet().size())
+                            .numberOfPagesToOCR(pageIdToImgPos.size())
+                            .numberOfOCRedPages(pageIdToOcrPageMap.size())
                             .ocrFinished(true)
                             .build()));
 
@@ -113,21 +117,23 @@ public class OCRService {
 
 
     @SneakyThrows
-    private void ocrPages(PDFDoc pdfDoc, String fileId, Map<Integer, List<ImagePosition>> pages, Map<Integer, PDFDoc> pdfDocMap) {
+    private Map<Integer, PDFDoc> runOcrPerPage(PDFDoc pdfDoc, String fileId, Map<Integer, List<ImagePosition>> pageIdToImgPosMap) {
 
-        int numberOfOCRedPages = 0;
-        for (var pageEntry : pages.entrySet()) {
+        Map<Integer, PDFDoc> pageIdToOcrPageMap = Collections.synchronizedMap(new HashMap<>());
+
+        int numberOfRunPages = 0;
+        for (var pageIdToImgPos : pageIdToImgPosMap.entrySet()) {
 
             try {
 
-                RectCollection rectCollection = new RectCollection();
-                var page = pageEntry.getKey();
+                Integer pageIndex = pageIdToImgPos.getKey();
 
-                Page pdfPage = pdfDoc.getPageIterator(page).next();
+                Page pdfPage = pdfDoc.getPageIterator(pageIndex).next();
 
                 pdfPage.setMediaBox(pdfPage.getCropBox());
 
-                for (ImagePosition imagePosition : pageEntry.getValue()) {
+                RectCollection rectCollection = new RectCollection();
+                for (ImagePosition imagePosition : pageIdToImgPos.getValue()) {
                     Rectangle rectangle = imagePosition.getRectangle();
 
                     // Warning coordinate system is different in this call macOs/Linux
@@ -135,31 +141,58 @@ public class OCRService {
                     rectCollection.addRect(rectangle.getTopLeft().getX(), y, rectangle.getTopLeft().getX() + rectangle.getWidth(), y + rectangle.getHeight());
                 }
 
-                PDFDoc ocrDoc = new PDFDoc();
-                ocrDoc.pagePushBack(pdfPage);
-                pdfDocMap.put(pageEntry.getKey(), ocrDoc);
+                // technically a document, but it always contains exactly one page
+                PDFDoc ocrPage = new PDFDoc();
+                ocrPage.pagePushBack(pdfPage);
+                pageIdToOcrPageMap.put(pageIndex, ocrPage);
 
                 OCROptions options = new OCROptions();
                 options.addTextZonesForPage(rectCollection, 1);
                 options.addLang(ENGLISH);
                 options.addDPI(settings.getOcrDPI());
 
-                OCRModule.processPDF(ocrDoc, options);
+                OCRModule.processPDF(ocrPage, options);
 
                 rectCollection.clear();
 
             } catch (Exception e) {
-                log.warn("Failed to process PDF page {}", pageEntry.getKey());
+                log.warn("Failed to process PDF page {}", pageIdToImgPos.getKey(), e);
             }
 
             rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
                     objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder()
                             .fileId(fileId)
-                            .numberOfPagesToOCR(pages.keySet().size())
-                            .numberOfOCRedPages(++numberOfOCRedPages)
+                            .numberOfPagesToOCR(pageIdToImgPosMap.size())
+                            .numberOfOCRedPages(++numberOfRunPages)
                             .build()));
 
-            log.warn("Done page {}", pageEntry);
+            log.warn("Done page {}", pageIdToImgPos);
         }
+        return pageIdToOcrPageMap;
     }
+
+
+    private void addOCRPagesToDocIfAdditionalWordsFound(PDFDoc pdfDoc, Map<Integer, PDFDoc> ocrDocPagesMap) throws PDFNetException {
+
+        for (var ocrDocPagesEntry : ocrDocPagesMap.entrySet()) {
+            int pageIndex = ocrDocPagesEntry.getKey();
+
+            Page ocrPage = ocrDocPagesEntry.getValue().getPage(1);
+            Page page = pdfDoc.getPage(pageIndex);
+
+            if (getWordCount(ocrPage) >= getWordCount(page)) {
+                pdfDoc.pageInsert(pdfDoc.getPageIterator(pageIndex), ocrPage);
+                pdfDoc.pageRemove(pdfDoc.getPageIterator(pageIndex + 1));
+            }
+        }
+    }
+
+
+    private static int getWordCount(Page pdfPage) {
+
+        TextExtractor txt = new TextExtractor();
+        txt.begin(pdfPage);
+        return txt.getWordCount();
+    }
+
 }
diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java
index 6293a62..46a42ab 100644
--- a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java
+++ b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java
@@ -3,10 +3,11 @@ package com.iqser.red.service.ocr.v1.server;
 import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
 import static org.assertj.core.api.Assertions.assertThat;
 
-import java.io.File;
+import java.io.FileOutputStream;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
 
-import org.apache.commons.io.FileUtils;
-import org.apache.commons.io.IOUtils;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
@@ -25,18 +26,22 @@ import org.springframework.core.io.ClassPathResource;
 import org.springframework.test.context.junit.jupiter.SpringExtension;
 
 import com.fasterxml.jackson.databind.ObjectMapper;
-import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService;
-import com.iqser.red.service.ocr.v1.server.service.OCRService;
 import com.iqser.red.service.ocr.v1.server.service.FileStorageService;
+import com.iqser.red.service.ocr.v1.server.service.OCRService;
+import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService;
 import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType;
 import com.iqser.red.storage.commons.StorageAutoConfiguration;
 import com.iqser.red.storage.commons.service.StorageService;
+import com.pdftron.pdf.PDFDoc;
+import com.pdftron.pdf.Page;
+import com.pdftron.pdf.PageIterator;
+import com.pdftron.pdf.TextExtractor;
 
 import lombok.SneakyThrows;
 
 @ExtendWith(SpringExtension.class)
 @SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT //
         , properties = {"pdftron.ocrmodule.path=/YourOCRModulePath"})
 @Import(OcrServiceIntegrationTest.TestConfiguration.class)
 public class OcrServiceIntegrationTest {
 
@@ -58,10 +63,23 @@ public class OcrServiceIntegrationTest {
 
     @Test
     @Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
-    @SneakyThrows
-    public void testOCR() {
+    public void testOCRWatermark() {
 
-        String fileName = "Watermark";
+        assertThat(testOCR("Watermark")).contains("syngenta");
+    }
+
+
+    @Test
+    @Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
+    public void testOCRInvisibleText() {
+        String text = testOCR("InvisibleText");
+        assertThat(text).contains("Michela", "Gregori", "DVM", "PhD", "Pathologist", "SIGNATURE PAGE");
+        assertThat(text).doesNotContain("COMPLETION DATE:", "LABORATORY PROJECT ID:", "AUTHOR(S):", "Substance");
+    }
+
+
+    @SneakyThrows
+    private String testOCR(String fileName) {
 
         ClassPathResource imageInfoResource = new ClassPathResource("files/" + fileName + ".IMAGE_INFO.json");
         ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf");
@@ -72,12 +90,23 @@ public class OcrServiceIntegrationTest {
         var imageId = FileStorageService.getStorageId("dossier", "file", FileType.IMAGE_INFO);
         storageService.storeObject(imageId, imageInfoResource.getInputStream());
 
-        var response = ocrService.ocrDocument("dossier", "file");
-
-        var out = FileUtils.openOutputStream(new File(getTemporaryDirectory() + "/" + fileName + ".pdf"));
-        IOUtils.copy(response, out);
-
-        System.out.println("File:" + getTemporaryDirectory() + "/" + fileName + ".pdf");
+        try (InputStream ocrDocument = ocrService.ocrDocument("dossier", "file")) {
+            byte[] ocrDocumentBytes = ocrDocument.readAllBytes();
+            try (var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
+                out.write(ocrDocumentBytes);
+            }
+            TextExtractor extractor = new TextExtractor();
+            List<String> texts = new ArrayList<>();
+            PDFDoc pdfDoc = new PDFDoc(ocrDocumentBytes);
+            PageIterator iterator = pdfDoc.getPageIterator();
+            while (iterator.hasNext()) {
+                Page page = iterator.next();
+                extractor.begin(page);
+                texts.add(extractor.getAsText());
+            }
+            System.out.println("File:" + getTemporaryDirectory() + "/" + fileName + ".pdf");
+            return String.join("\n", texts);
+        }
     }
 
 