Merge branch 'DM-307' into 'master'

DM-307: Added non-production-ready code to remove watermarks from SCM Flora prototype files

Closes DM-307

See merge request redactmanager/ocr-service!7
This commit is contained in:
Dominique Eifländer 2023-07-03 12:44:20 +02:00
commit a4f6b2c0d2
2 changed files with 143 additions and 4 deletions

View File

@ -44,7 +44,7 @@ public class OCRService {
private final RabbitTemplate rabbitTemplate;
private final ObjectMapper objectMapper;
private final WatermarkRemovalService watermarkRemovalService;
private final InvisibleElementRemovalService invisibleElementRemovalService;
@ -67,16 +67,30 @@ public class OCRService {
/**
 * Loads the original file of the given dossier/file, removes invisible elements from it
 * (and, when {@code settings.isRemoveWatermark()} is true, runs the watermark removal first),
 * then runs OCR on the intermediate result and writes the OCR output to {@code out}.
 * The duration of the OCR step is logged in seconds.
 *
 * NOTE(review): this hunk appears to show removed and added diff lines interleaved without
 * +/- markers ({@code transferOutputStream} and {@code fileStream} are each declared twice) -
 * confirm against the real file before relying on the exact statement order.
 *
 * @param dossierId id of the dossier, used to look up the original file and in the log line.
 * @param fileId id of the file, used to look up the original file, passed to {@code runOcr} and logged.
 * @param out stream handed to {@code runOcr} as the OCR output target.
 * @throws IOException declared by the signature; the stream {@code close()} calls below can raise it.
 */
@Timed("redactmanager_runOcrOnDocument")
public void runOcrOnDocument(String dossierId, String fileId, OutputStream out) throws IOException {
// NOTE(review): the next three lines are presumably the pre-change (removed) side of the hunk - TODO confirm.
try (ByteArrayOutputStream transferOutputStream = new ByteArrayOutputStream()) {
try (InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId)) {
invisibleElementRemovalService.removeInvisibleElements(fileStream, transferOutputStream, false);
// Streams are managed by hand (not try-with-resources) because both variables are reassigned below.
ByteArrayOutputStream transferOutputStream = new ByteArrayOutputStream();
InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId);
try {
// Optional watermark pass, switched on via settings; its output is buffered in memory.
if (settings.isRemoveWatermark()) {
watermarkRemovalService.removeWatermarks(fileStream, transferOutputStream);
// Swap the streams: the buffered bytes become the input of the next step and a fresh
// buffer collects that step's output.
fileStream.close();
fileStream = new ByteArrayInputStream(transferOutputStream.toByteArray());
transferOutputStream.close();
transferOutputStream = new ByteArrayOutputStream();
}
invisibleElementRemovalService.removeInvisibleElements(fileStream, transferOutputStream, false);
// Feed the buffered intermediate document into the OCR step.
try (InputStream transferInputStream = new ByteArrayInputStream(transferOutputStream.toByteArray())) {
// Only the runOcr call is timed for the log line below.
long ocrStart = System.currentTimeMillis();
runOcr(transferInputStream, out, fileId);
long ocrEnd = System.currentTimeMillis();
log.info("ocr successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, format("%.1f", (ocrEnd - ocrStart) / 1000.0));
}
} finally {
// Close whichever streams the two variables currently refer to.
fileStream.close();
transferOutputStream.close();
}
}

View File

@ -0,0 +1,125 @@
package com.iqser.red.service.ocr.v1.server.service;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Set;
import java.util.TreeSet;
import org.springframework.stereotype.Service;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
import com.pdftron.sdf.Obj;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
public class WatermarkRemovalService {

    /**
     * Rewrites every page of the given PDF and drops all image elements that are located inside
     * form XObjects; everything else is copied unchanged. The result is saved linearized.
     *
     * <p>!!!Warning!!! This logic is definitely wrong and should NEVER run in production,
     * however it was used in the second DocuMine (SCM) prototype and we currently need it to compare the results.
     *
     * <p>The document is closed in all cases, including when rewriting the page content fails.
     *
     * @param pdfFile the file as InputStream.
     * @param transferOutputStream the resulting file as OutputStream.
     * @throws RuntimeException if the processed document cannot be saved (the original exception is the cause).
     */
    @SneakyThrows
    public void removeWatermarks(InputStream pdfFile, OutputStream transferOutputStream) {

        PDFDoc pdfDoc = new PDFDoc(pdfFile);
        try {
            // Must run inside the try block, otherwise a failure here would leak the document.
            execute(pdfDoc);
            save(pdfDoc, transferOutputStream);
        } finally {
            pdfDoc.close();
        }
    }


    /**
     * Saves the document linearized to the given stream.
     * Failures are logged and rethrown wrapped in a {@link RuntimeException}.
     */
    private void save(PDFDoc pdfDoc, OutputStream transferOutputStream) {

        try {
            pdfDoc.save(transferOutputStream, SDFDoc.SaveMode.LINEARIZED, null);
        } catch (Exception e) {
            log.error("File could not be saved after watermark removal");
            throw new RuntimeException(e);
        }
    }


    /**
     * Processes all pages of the document with one shared reader/writer pair.
     * Both are destroyed afterwards, even if processing a page throws.
     */
    @SneakyThrows
    private void execute(PDFDoc pdfDoc) {

        ElementWriter writer = new ElementWriter();
        try {
            ElementReader reader = new ElementReader();
            try {
                // Object numbers of pages and form XObjects that have already been handled.
                Set<Integer> visited = new TreeSet<>();
                for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
                    Page page = iterator.next();
                    removeOverlapText(page, reader, writer, visited);
                }
            } finally {
                reader.destroy();
            }
        } finally {
            writer.destroy();
        }
    }


    /**
     * Replaces the content of a single page with the filtered element stream.
     * The page's object number is recorded in {@code visited}.
     */
    @SneakyThrows
    private void removeOverlapText(Page page, ElementReader reader, ElementWriter writer, Set<Integer> visited) {

        visited.add((int) page.getSDFObj().getObjNum());
        reader.begin(page);
        writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
        processElements(reader, writer, visited, false);
        // Deliberately not in a finally block: ending the writer after a failure would
        // commit a half-written content stream to the page.
        writer.end();
        reader.end();
    }


    /**
     * Drains the reader and dispatches each element by type: images go to
     * {@link #processImage}, form XObjects to {@link #processForm}, everything else is copied as is.
     *
     * @param isInForm whether the elements being read belong to a form XObject.
     */
    @SneakyThrows
    private void processElements(ElementReader reader, ElementWriter writer, Set<Integer> visited, boolean isInForm) {

        for (Element element = reader.next(); element != null; element = reader.next()) {
            switch (element.getType()) {
                case Element.e_image, Element.e_inline_image -> processImage(element, writer, isInForm);
                case Element.e_form -> processForm(reader, writer, element, visited);
                default -> writer.writeElement(element);
            }
        }
    }


    /**
     * Writes the form reference to the current stream and, if this form XObject has not been
     * processed yet, rewrites its content recursively with a dedicated writer.
     */
    @SneakyThrows
    private void processForm(ElementReader reader, ElementWriter writer, Element element, Set<Integer> visited) {

        writer.writeElement(element);
        Obj formObj = element.getXObject();
        // Set.add returns false when the object number is already present, i.e. the form was handled before.
        if (!visited.add((int) formObj.getObjNum())) {
            return;
        }

        ElementWriter formWriter = new ElementWriter();
        try {
            reader.formBegin();
            formWriter.begin(formObj);
            reader.clearChangeList();
            formWriter.setDefaultGState(reader);
            processElements(reader, formWriter, visited, true);
            formWriter.end();
        } finally {
            // Release the per-form writer even if processing the form content failed.
            formWriter.destroy();
        }
        reader.end();
    }


    /**
     * Copies an image element only when it is not part of a form XObject; images inside forms are dropped.
     */
    @SneakyThrows
    private void processImage(Element element, ElementWriter writer, boolean isInForm) {

        // !!! Warning, this will also remove non-watermark images from files.
        // Idea: Remove watermarks by comparing (hash values) images. Watermarks to remove should be uploaded in dossier/dossierTemplate.
        // Removing watermarks should be done in preprocessing, not at ocr.
        if (!isInForm) {
            writer.writeElement(element);
        }
    }

}