diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java index dfd45f9..2a25bfa 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java @@ -6,11 +6,8 @@ import java.io.InputStream; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; -import java.util.HashSet; import java.util.List; import java.util.Map; -import java.util.Set; -import java.util.TreeSet; import org.apache.commons.io.IOUtils; import org.springframework.amqp.rabbit.core.RabbitTemplate; @@ -26,21 +23,17 @@ import com.iqser.red.service.persistence.service.v1.api.utils.SuppressFBWarnings import com.iqser.red.service.redaction.v1.model.Point; import com.iqser.red.service.redaction.v1.model.Rectangle; import com.pdftron.common.PDFNetException; -import com.pdftron.pdf.Element; -import com.pdftron.pdf.ElementReader; -import com.pdftron.pdf.ElementWriter; +import com.pdftron.pdf.ContentReplacer; import com.pdftron.pdf.OCRModule; import com.pdftron.pdf.OCROptions; import com.pdftron.pdf.Optimizer; import com.pdftron.pdf.PDFDoc; import com.pdftron.pdf.Page; -import com.pdftron.pdf.PageIterator; import com.pdftron.pdf.Rect; import com.pdftron.pdf.RectCollection; -import com.pdftron.sdf.Obj; +import com.pdftron.pdf.TextExtractor; import com.pdftron.sdf.SDFDoc; -import io.micrometer.core.annotation.Timed; import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; @@ -51,6 +44,7 @@ import lombok.extern.slf4j.Slf4j; public class OCRService { public static final String ENGLISH = "eng"; + public static final String REPLACEMENT_TEXT = ""; private final FileStorageService fileStorageService; private final OcrServiceSettings settings; @@ -60,12 +54,10 @@ public class OCRService { private final ObjectMapper objectMapper; - @Timed("redactmanager_PDFTron-ocrDocument") @SneakyThrows public InputStream ocrDocument(String dossierId, String fileId) { var fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId); - var imageServiceResponse = fileStorageService.getImageServiceResponse(dossierId, fileId); var fileBytes = IOUtils.toByteArray(fileStream); @@ -83,12 +75,9 @@ public class OCRService { PDFDoc pdfDoc = null; try (ByteArrayOutputStream out = new ByteArrayOutputStream()) { pdfDoc = new PDFDoc(file); - removeInvisibleText(pdfDoc); Map> pages = new HashMap<>(); - // TODO take logic to ignore small and combine images from image-service. - // TODO Then replace logic so ocr-service is independent from image-service. imageServiceResponse.getData() .forEach(imageMetadata -> pages.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>()) .add(new ImagePosition(new Rectangle(new Point(imageMetadata.getPosition().getX1(), imageMetadata.getPosition().getY1()), @@ -97,11 +86,12 @@ public class OCRService { imageMetadata.getPosition().getPageNumber()), imageMetadata.isAlpha()))); Map pdfDocMap = Collections.synchronizedMap(new HashMap<>()); + Map wordCountPerPage = Collections.synchronizedMap(new HashMap<>()); rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE, objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder().fileId(fileId).numberOfPagesToOCR(pages.keySet().size()).build())); - ocrPages(pdfDoc, fileId, pages, pdfDocMap); + ocrPages(pdfDoc, fileId, pages, pdfDocMap, wordCountPerPage); for (var entry : pdfDocMap.entrySet()) { @@ -109,8 +99,14 @@ public class OCRService { var page = entry.getKey(); Page ocrPage = ocrDoc.getPageIterator(1).next(); - pdfDoc.pageInsert(pdfDoc.getPageIterator(page), ocrPage); - pdfDoc.pageRemove(pdfDoc.getPageIterator(page + 1)); + + TextExtractor txt = new TextExtractor(); + txt.begin(ocrPage); + int wordCount = txt.getWordCount(); + if (wordCount >= wordCountPerPage.get(page)) { + pdfDoc.pageInsert(pdfDoc.getPageIterator(page), ocrPage); + pdfDoc.pageRemove(pdfDoc.getPageIterator(page + 1)); + } ocrDoc.close(); } @@ -143,7 +139,7 @@ public class OCRService { @SneakyThrows - private void ocrPages(PDFDoc pdfDoc, String fileId, Map> pages, Map pdfDocMap) { + private void ocrPages(PDFDoc pdfDoc, String fileId, Map> pages, Map pdfDocMap, Map wordCountPerPage) { int numberOfOCRedPages = 0; for (var pageEntry : pages.entrySet()) { @@ -153,20 +149,35 @@ public class OCRService { var page = pageEntry.getKey(); + var areasToRemoveInOcrDoc = new ArrayList(); + Page pdfPage = pdfDoc.getPageIterator(page).next(); pdfPage.setMediaBox(pdfPage.getCropBox()); + TextExtractor txt = new TextExtractor(); + txt.begin(pdfPage); + int wordCount = txt.getWordCount(); + wordCountPerPage.put(page, wordCount); + for (ImagePosition imagePosition : pageEntry.getValue()) { Rectangle rectangle = imagePosition.getRectangle(); + Rect rect = convert(rectangle, pdfPage.getCropBox(), pdfPage.getMediaBox()); // Warning coordinate system is different in this call macOs/Linux double y = -rectangle.getTopLeft().getY() + pdfPage.getCropBox().getY2() - rectangle.getHeight(); rectCollection.addRect(rectangle.getTopLeft().getX(), y, rectangle.getTopLeft().getX() + rectangle.getWidth(), y + rectangle.getHeight()); + + if (!imagePosition.isHasTransparency()) { + areasToRemoveInOcrDoc.add(rect); + } } + rectCollection.clear(); + PDFDoc ocrDoc = new PDFDoc(); ocrDoc.pagePushBack(pdfPage); + removeTextFromOCRPage(areasToRemoveInOcrDoc, ocrDoc); pdfDocMap.put(pageEntry.getKey(), ocrDoc); OCROptions options = new OCROptions(); @@ -175,10 +186,8 @@ public class OCRService { options.addDPI(settings.getOcrDPI()); OCRModule.processPDF(ocrDoc, options); - rectCollection.clear(); - } catch (Exception e) { - log.warn("Failed to process PDF page {} - {}", pageEntry.getKey(), e); + log.warn("Failed to process PDF page {}", pageEntry.getKey()); } rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE, @@ -194,131 +203,36 @@ public class OCRService { } - /** - * There are 2 possibilities to have invisible Text in pdfs. - * 1. gState is set to invisible, this is ocr text. - * 2. Filled Path elements in front of the text. - */ @SneakyThrows - private void removeInvisibleText(PDFDoc pdfDoc) { + private void removeTextFromOCRPage(List areasToRemoveInOcrDoc, PDFDoc ocrDoc) { - ElementWriter writer = new ElementWriter(); - ElementReader reader = new ElementReader(); - Set visited = new TreeSet<>(); - - for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) { - Page page = iterator.next(); - removeOverlapText(page, reader, writer, visited); - } - } - - - @SneakyThrows - private void removeOverlapText(Page page, ElementReader reader, ElementWriter writer, Set visited) { - - visited.add((int) page.getSDFObj().getObjNum()); - reader.begin(page); - writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict()); - processElements(reader, writer, visited, false); - writer.end(); - reader.end(); - } - - - @SneakyThrows - private void processElements(ElementReader reader, ElementWriter writer, Set visited, boolean isInForm) { - - Set filledRectangles = new HashSet<>(); - for (Element element = reader.next(); element != null; element = reader.next()) - - switch (element.getType()) { - case Element.e_image: - case Element.e_inline_image: - processImage(element, writer, isInForm); - break; - - case Element.e_text: - processText(element, writer, filledRectangles); - break; - - case Element.e_path: - processPath(element, writer, filledRectangles); - break; - - case Element.e_form: - processForm(reader, writer, element, visited); - break; - - default: - writer.writeElement(element); - } - } - - - @SneakyThrows - private void processImage(Element element, ElementWriter writer, boolean isInForm) { - - if (!isInForm || !settings.isRemoveWatermark()) { - writer.writeElement(element); - } - } - - - @SneakyThrows - private void processText(Element element, ElementWriter writer, Set filledRectangles) { - - if (element.getBBox() == null) { - writer.writeElement(element); - return; - } - - double x = element.getBBox().getX1(); - double y = element.getBBox().getY1(); - boolean filledRectangleIntersection = filledRectangles.stream().anyMatch(r -> { + Page ocrPage = ocrDoc.getPage(1); + for (var rect : areasToRemoveInOcrDoc) { try { - return r.contains(x, y); - } catch (PDFNetException e) { - throw new RuntimeException("Internal pdftron error during removal of overlap text", e); + ContentReplacer replacer = new ContentReplacer(); // Reinitialize is needed in loop. + replacer.addText(rect, REPLACEMENT_TEXT); + replacer.process(ocrPage); + } catch (Exception e) { + log.warn("Skipping removing text behind image because of: {}", e.getMessage()); + break; } - }); - - var gState = element.getGState(); - - //See PDF Reference 5.3 Text rendering modes, 3 = Invisible, however this ocr does not use it. - if (!filledRectangleIntersection && gState.getTextRenderMode() != 3 || filledRectangleIntersection && gState.getTextRenderMode() == 0) { - writer.writeElement(element); } } - @SneakyThrows - private void processPath(Element element, ElementWriter writer, Set filledRectangles) { + public Rect convert(Rectangle rectangle, Rect cropBox, Rect mediaBox) { - writer.writeElement(element); - if (element.getPathData() != null && element.getPathData().getPoints().length > 4) { - filledRectangles.add(element.getBBox()); - } - } + try { + var offset = 0.01; + var x1 = rectangle.getTopLeft().getX() + cropBox.getX1() - mediaBox.getX1() + (cropBox.equals(mediaBox) ? cropBox.getX1() : 0f) - offset; + var y1 = rectangle.getTopLeft().getY() + rectangle.getHeight() - (mediaBox.getY1() - cropBox.getY1()) + (cropBox.equals(mediaBox) ? cropBox.getY1() : 0f) + offset; + var x2 = rectangle.getTopLeft().getX() + rectangle.getWidth() + cropBox.getX1() - mediaBox.getX1() + (cropBox.equals(mediaBox) ? cropBox.getX1() : 0f) + offset; + var y2 = rectangle.getTopLeft().getY() - (mediaBox.getY1() - cropBox.getY1()) + (cropBox.equals(mediaBox) ? cropBox.getY1() : 0f) - offset; - - @SneakyThrows - private void processForm(ElementReader reader, ElementWriter writer, Element element, Set visited) { - - writer.writeElement(element); - Obj formObj = element.getXObject(); - - if (!visited.contains((int) formObj.getObjNum())) { - visited.add((int) formObj.getObjNum()); - ElementWriter new_writer = new ElementWriter(); - reader.formBegin(); - new_writer.begin(formObj); - - reader.clearChangeList(); - new_writer.setDefaultGState(reader); - - processElements(reader, new_writer, visited, true); - new_writer.end(); - reader.end(); + // Rect is specified by lower-left and upperright corner. + return new Rect(x1, y1, x2, y2); + } catch (PDFNetException e) { + throw new RuntimeException(e); } } diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/InvisibleText.IMAGE_INFO.json b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/InvisibleText.IMAGE_INFO.json new file mode 100644 index 0000000..940d444 --- /dev/null +++ b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/InvisibleText.IMAGE_INFO.json @@ -0,0 +1 @@ +{"dossierId": "c8553cbd-409f-4e1a-baf4-34b11d49deac", "fileId": "bd6f93ed896dd0e2f641b0568f13ddf1", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": [{"classification": {"label": "other", "probabilities": {"other": 0.9999, "logo": 0.0001, "formula": 0.0, "signature": 0.0}}, "representation": "FFFFFEFBF7EFCFFFFFFFFFFFF", "position": {"x1": -3, "x2": 795, "y1": 0, "y2": 612, "pageNumber": 1}, "geometry": {"width": 798, "height": 612}, "alpha": false, "filters": {"geometry": {"imageSize": {"quotient": 1.0038, "tooLarge": true, "tooSmall": false}, "imageFormat": {"quotient": 1.3039, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "logo", "probabilities": {"logo": 0.9966, "other": 0.0025, "signature": 0.0005, "formula": 0.0003}}, "representation": "FFC33D3C323CCF3390C1F8C72", "position": {"x1": 120, "x2": 131, "y1": 264, "y2": 380, "pageNumber": 1}, "geometry": {"width": 11, "height": 116}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0513, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.0948, "tooTall": true, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "other", "probabilities": {"other": 1.0, "formula": 0.0, "logo": 0.0, "signature": 0.0}}, "representation": "EF8FF6381060800318F0E187", "position": {"x1": 152, "x2": 205, "y1": 115, "y2": 533, "pageNumber": 1}, "geometry": {"width": 53, "height": 418}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.2138, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.1268, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": true}}, {"classification": {"label": "other", "probabilities": {"other": 0.5967, "logo": 0.1756, "signature": 0.1218, "formula": 0.106}}, "representation": "EC9377C9170E1070C3070C30F", "position": {"x1": 196, "x2": 221, "y1": 121, "y2": 245, "pageNumber": 1}, "geometry": {"width": 25, "height": 124}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.08, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.2016, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": true}}, {"classification": {"label": "logo", "probabilities": {"logo": 1.0, "formula": 0.0, "other": 0.0, "signature": 0.0}}, "representation": "CF1F1F70F140036860F1441B5", "position": {"x1": 707, "x2": 718, "y1": 481, "y2": 531, "pageNumber": 1}, "geometry": {"width": 11, "height": 50}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0337, "tooLarge": false, "tooSmall": true}, "imageFormat": {"quotient": 0.22, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "logo", "probabilities": {"logo": 0.9704, "other": 0.0223, "formula": 0.0044, "signature": 0.0029}}, "representation": "CF0F1C70F1C7090081F7CF073", "position": {"x1": 732, "x2": 744, "y1": 118, "y2": 241, "pageNumber": 1}, "geometry": {"width": 12, "height": 123}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0552, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.0976, "tooTall": true, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}], "dataCV": []} \ No newline at end of file diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/InvisibleText.pdf b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/InvisibleText.pdf new file mode 100644 index 0000000..0eb7128 Binary files /dev/null and b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/InvisibleText.pdf differ