Merge branch 'DM-307' into 'master'
DM-307: Added non-production-ready code to remove watermarks from SCM Flora prototype files. Closes DM-307. See merge request redactmanager/ocr-service!7
This commit is contained in:
commit
a4f6b2c0d2
@ -44,7 +44,7 @@ public class OCRService {
|
||||
|
||||
private final RabbitTemplate rabbitTemplate;
|
||||
|
||||
private final ObjectMapper objectMapper;
|
||||
private final WatermarkRemovalService watermarkRemovalService;
|
||||
|
||||
private final InvisibleElementRemovalService invisibleElementRemovalService;
|
||||
|
||||
@ -67,16 +67,30 @@ public class OCRService {
|
||||
@Timed("redactmanager_runOcrOnDocument")
|
||||
public void runOcrOnDocument(String dossierId, String fileId, OutputStream out) throws IOException {
|
||||
|
||||
try (ByteArrayOutputStream transferOutputStream = new ByteArrayOutputStream()) {
|
||||
try (InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId)) {
|
||||
invisibleElementRemovalService.removeInvisibleElements(fileStream, transferOutputStream, false);
|
||||
ByteArrayOutputStream transferOutputStream = new ByteArrayOutputStream();
|
||||
InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId);
|
||||
|
||||
try {
|
||||
if (settings.isRemoveWatermark()) {
|
||||
watermarkRemovalService.removeWatermarks(fileStream, transferOutputStream);
|
||||
fileStream.close();
|
||||
fileStream = new ByteArrayInputStream(transferOutputStream.toByteArray());
|
||||
transferOutputStream.close();
|
||||
transferOutputStream = new ByteArrayOutputStream();
|
||||
}
|
||||
|
||||
invisibleElementRemovalService.removeInvisibleElements(fileStream, transferOutputStream, false);
|
||||
|
||||
try (InputStream transferInputStream = new ByteArrayInputStream(transferOutputStream.toByteArray())) {
|
||||
long ocrStart = System.currentTimeMillis();
|
||||
runOcr(transferInputStream, out, fileId);
|
||||
long ocrEnd = System.currentTimeMillis();
|
||||
log.info("ocr successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, format("%.1f", (ocrEnd - ocrStart) / 1000.0));
|
||||
}
|
||||
|
||||
} finally {
|
||||
fileStream.close();
|
||||
transferOutputStream.close();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -0,0 +1,125 @@
|
||||
package com.iqser.red.service.ocr.v1.server.service;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.ElementReader;
|
||||
import com.pdftron.pdf.ElementWriter;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.pdf.PageIterator;
|
||||
import com.pdftron.sdf.Obj;
|
||||
import com.pdftron.sdf.SDFDoc;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
public class WatermarkRemovalService {
|
||||
|
||||
/**
|
||||
* !!!Warning!! This logic is definitive wrong and should NEVER run in production,
|
||||
* however it was used in second DocuMine (SCM) prototype and we currently need it to compare the results.
|
||||
*
|
||||
* @param pdfFile the file as Inputstream.
|
||||
* @param transferOutputStream the resulting file as Outputstream.
|
||||
*/
|
||||
@SneakyThrows
|
||||
public void removeWatermarks(InputStream pdfFile, OutputStream transferOutputStream) {
|
||||
|
||||
PDFDoc pdfDoc = new PDFDoc(pdfFile);
|
||||
this.execute(pdfDoc);
|
||||
|
||||
try {
|
||||
pdfDoc.save(transferOutputStream, SDFDoc.SaveMode.LINEARIZED, null);
|
||||
} catch (Exception var10) {
|
||||
log.error("File could not be saved after watermark removal");
|
||||
throw new RuntimeException(var10);
|
||||
} finally {
|
||||
pdfDoc.close();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void execute(PDFDoc pdfDoc) {
|
||||
|
||||
ElementWriter writer = new ElementWriter();
|
||||
ElementReader reader = new ElementReader();
|
||||
Set<Integer> visited = new TreeSet<>();
|
||||
|
||||
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
|
||||
Page page = iterator.next();
|
||||
removeOverlapText(page, reader, writer, visited);
|
||||
}
|
||||
|
||||
reader.destroy();
|
||||
writer.destroy();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void removeOverlapText(Page page, ElementReader reader, ElementWriter writer, Set<Integer> visited) {
|
||||
|
||||
visited.add((int) page.getSDFObj().getObjNum());
|
||||
reader.begin(page);
|
||||
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
|
||||
processElements(reader, writer, visited, false);
|
||||
writer.end();
|
||||
reader.end();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void processElements(ElementReader reader, ElementWriter writer, Set<Integer> visited, boolean isInForm) {
|
||||
|
||||
for (Element element = reader.next(); element != null; element = reader.next())
|
||||
switch (element.getType()) {
|
||||
case Element.e_image, Element.e_inline_image -> processImage(element, writer, isInForm);
|
||||
case Element.e_form -> processForm(reader, writer, element, visited);
|
||||
default -> writer.writeElement(element);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void processForm(ElementReader reader, ElementWriter writer, Element element, Set<Integer> visited) {
|
||||
|
||||
writer.writeElement(element);
|
||||
Obj formObj = element.getXObject();
|
||||
|
||||
if (!visited.contains((int) formObj.getObjNum())) {
|
||||
visited.add((int) formObj.getObjNum());
|
||||
ElementWriter formWriter = new ElementWriter();
|
||||
reader.formBegin();
|
||||
formWriter.begin(formObj);
|
||||
|
||||
reader.clearChangeList();
|
||||
formWriter.setDefaultGState(reader);
|
||||
|
||||
processElements(reader, formWriter, visited, true);
|
||||
formWriter.end();
|
||||
formWriter.destroy();
|
||||
reader.end();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void processImage(Element element, ElementWriter writer, boolean isInForm) {
|
||||
|
||||
// !!! Warning, this will also remove none watermark images form files.
|
||||
// Idea: Remove watermarks by comparing (hash values) images. Watermarks to remove should be uploaded in dossier/dossierTemplate.
|
||||
// Removing watermarks should be done in preprocessing, not at ocr.
|
||||
if (!isInForm) {
|
||||
writer.writeElement(element);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user