From 4df80612abad47ebeae98828caa4c92cb8274bd4 Mon Sep 17 00:00:00 2001 From: deiflaender Date: Mon, 3 Jul 2023 12:37:42 +0200 Subject: [PATCH] DM-307: Added none production ready code remove watermarks from SCM Flora prototype files --- .../ocr/v1/server/service/OCRService.java | 22 ++- .../service/WatermarkRemovalService.java | 125 ++++++++++++++++++ 2 files changed, 143 insertions(+), 4 deletions(-) create mode 100644 ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/WatermarkRemovalService.java diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java index f96e405..7f01e69 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java @@ -44,7 +44,7 @@ public class OCRService { private final RabbitTemplate rabbitTemplate; - private final ObjectMapper objectMapper; + private final WatermarkRemovalService watermarkRemovalService; private final InvisibleElementRemovalService invisibleElementRemovalService; @@ -67,16 +67,30 @@ public class OCRService { @Timed("redactmanager_runOcrOnDocument") public void runOcrOnDocument(String dossierId, String fileId, OutputStream out) throws IOException { - try (ByteArrayOutputStream transferOutputStream = new ByteArrayOutputStream()) { - try (InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId)) { - invisibleElementRemovalService.removeInvisibleElements(fileStream, transferOutputStream, false); + ByteArrayOutputStream transferOutputStream = new ByteArrayOutputStream(); + InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId); + + try { + if 
(settings.isRemoveWatermark()) { + watermarkRemovalService.removeWatermarks(fileStream, transferOutputStream); + fileStream.close(); + fileStream = new ByteArrayInputStream(transferOutputStream.toByteArray()); + transferOutputStream.close(); + transferOutputStream = new ByteArrayOutputStream(); } + + invisibleElementRemovalService.removeInvisibleElements(fileStream, transferOutputStream, false); + try (InputStream transferInputStream = new ByteArrayInputStream(transferOutputStream.toByteArray())) { long ocrStart = System.currentTimeMillis(); runOcr(transferInputStream, out, fileId); long ocrEnd = System.currentTimeMillis(); log.info("ocr successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, format("%.1f", (ocrEnd - ocrStart) / 1000.0)); } + + } finally { + fileStream.close(); + transferOutputStream.close(); } } diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/WatermarkRemovalService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/WatermarkRemovalService.java new file mode 100644 index 0000000..2558b8b --- /dev/null +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/WatermarkRemovalService.java @@ -0,0 +1,125 @@ +package com.iqser.red.service.ocr.v1.server.service; + +import java.io.InputStream; +import java.io.OutputStream; +import java.util.Set; +import java.util.TreeSet; + +import org.springframework.stereotype.Service; + +import com.pdftron.pdf.Element; +import com.pdftron.pdf.ElementReader; +import com.pdftron.pdf.ElementWriter; +import com.pdftron.pdf.PDFDoc; +import com.pdftron.pdf.Page; +import com.pdftron.pdf.PageIterator; +import com.pdftron.sdf.Obj; +import com.pdftron.sdf.SDFDoc; + +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@Service +public class WatermarkRemovalService { + + /** + * !!!Warning!! 
This logic is definitely wrong and should NEVER run in production; + * however, it was used in the second DocuMine (SCM) prototype and we currently need it to compare the results. + * NOTE(review): execute(pdfDoc) runs before the try block, so pdfDoc is not closed if it throws. + * @param pdfFile the PDF file as an InputStream. + * @param transferOutputStream the OutputStream the resulting file is saved to. + */ + @SneakyThrows + public void removeWatermarks(InputStream pdfFile, OutputStream transferOutputStream) { + + PDFDoc pdfDoc = new PDFDoc(pdfFile); + this.execute(pdfDoc); + + try { + pdfDoc.save(transferOutputStream, SDFDoc.SaveMode.LINEARIZED, null); + } catch (Exception var10) { + log.error("File could not be saved after watermark removal"); + throw new RuntimeException(var10); + } finally { + pdfDoc.close(); + } + } + + + @SneakyThrows + private void execute(PDFDoc pdfDoc) { + + ElementWriter writer = new ElementWriter(); + ElementReader reader = new ElementReader(); + Set visited = new TreeSet<>(); + + for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) { + Page page = iterator.next(); + removeOverlapText(page, reader, writer, visited); + } + + reader.destroy(); + writer.destroy(); + } + + + @SneakyThrows + private void removeOverlapText(Page page, ElementReader reader, ElementWriter writer, Set visited) { + + visited.add((int) page.getSDFObj().getObjNum()); + reader.begin(page); + writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict()); + processElements(reader, writer, visited, false); + writer.end(); + reader.end(); + } + + + @SneakyThrows + private void processElements(ElementReader reader, ElementWriter writer, Set visited, boolean isInForm) { + + for (Element element = reader.next(); element != null; element = reader.next()) + switch (element.getType()) { + case Element.e_image, Element.e_inline_image -> processImage(element, writer, isInForm); + case Element.e_form -> processForm(reader, writer, 
element, visited); + default -> writer.writeElement(element); + } + } + + + @SneakyThrows + private void 
processForm(ElementReader reader, ElementWriter writer, Element element, Set visited) { + + writer.writeElement(element); + Obj formObj = element.getXObject(); + + if (!visited.contains((int) formObj.getObjNum())) { + visited.add((int) formObj.getObjNum()); + ElementWriter formWriter = new ElementWriter(); + reader.formBegin(); + formWriter.begin(formObj); + + reader.clearChangeList(); + formWriter.setDefaultGState(reader); + + processElements(reader, formWriter, visited, true); + formWriter.end(); + formWriter.destroy(); + reader.end(); + } + } + + + @SneakyThrows + private void processImage(Element element, ElementWriter writer, boolean isInForm) { + + // Warning: every image encountered inside a form (isInForm) is dropped, so non-watermark images are removed from files as well. + // Idea: remove watermarks by comparing image hash values; the watermarks to remove should be uploaded to the dossier/dossierTemplate. + // Removing watermarks should be done in preprocessing, not during OCR. + if (!isInForm) { + writer.writeElement(element); + } + } + +}