Merge branch 'RED-7075' into 'master'

RED-7075: WIP

Closes RED-7075

See merge request redactmanager/ocr-service!15
This commit is contained in:
Raphael Arnold 2023-08-07 13:33:10 +02:00
commit 04a0925a6c
5 changed files with 12 additions and 132 deletions

View File

@ -15,7 +15,7 @@
<properties>
<tennat-commons.version>0.10.0</tennat-commons.version>
<persistence-service.version>2.118.0</persistence-service.version>
<pdftron-logic-commons.version>2.10.0</pdftron-logic-commons.version>
<pdftron-logic-commons.version>2.14.0</pdftron-logic-commons.version>
</properties>
<dependencies>

View File

@ -12,6 +12,7 @@ import org.springframework.context.annotation.Import;
import org.springframework.scheduling.annotation.EnableAsync;
import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService;
import com.iqser.red.pdftronlogic.commons.WatermarkRemovalService;
import com.iqser.red.service.ocr.v1.server.client.FileStatusProcessingUpdateClient;
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings;
@ -25,7 +26,7 @@ import io.micrometer.core.instrument.MeterRegistry;
@ImportAutoConfiguration({MultiTenancyAutoConfiguration.class})
@EnableConfigurationProperties(OcrServiceSettings.class)
@SpringBootApplication(exclude = {SecurityAutoConfiguration.class, ManagementWebSecurityAutoConfiguration.class})
@Import({ MessagingConfiguration.class, StorageAutoConfiguration.class})
@Import({MessagingConfiguration.class, StorageAutoConfiguration.class})
@EnableFeignClients(basePackageClasses = FileStatusProcessingUpdateClient.class)
public class Application {
@ -53,4 +54,11 @@ public class Application {
return new InvisibleElementRemovalService();
}
/**
 * Exposes a {@link WatermarkRemovalService} as a Spring bean.
 * The instance is created with its no-argument constructor.
 *
 * @return a new {@link WatermarkRemovalService} instance.
 */
@Bean
public WatermarkRemovalService watermarkRemovalService() {
return new WatermarkRemovalService();
}
}

View File

@ -14,9 +14,8 @@ import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.stereotype.Service;
import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService;
import com.iqser.red.pdftronlogic.commons.WatermarkRemovalService;
import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse;
import com.iqser.red.service.ocr.v1.server.client.DossierClient;
import com.iqser.red.service.ocr.v1.server.client.DossierTemplateClient;
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings;
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.DossierTemplate;

View File

@ -1,125 +0,0 @@
package com.iqser.red.service.ocr.v1.server.service;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Set;
import java.util.TreeSet;
import org.springframework.stereotype.Service;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
import com.pdftron.sdf.Obj;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
/**
 * Prototype service that strips images nested inside form XObjects from a PDF,
 * on the assumption that such images are watermarks.
 */
@Slf4j
@Service
public class WatermarkRemovalService {

    /**
     * !!!Warning!! This logic is definitely wrong and should NEVER run in production,
     * however it was used in the second DocuMine (SCM) prototype and we currently need it to compare the results.
     *
     * @param pdfFile              the file as InputStream.
     * @param transferOutputStream the resulting file as OutputStream.
     */
    @SneakyThrows
    public void removeWatermarks(InputStream pdfFile, OutputStream transferOutputStream) {

        PDFDoc pdfDoc = new PDFDoc(pdfFile);
        try {
            execute(pdfDoc);
            save(pdfDoc, transferOutputStream);
        } finally {
            // Close the document on every path, including failures while processing the elements.
            pdfDoc.close();
        }
    }


    /**
     * Writes the document to the given stream in linearized mode.
     * Any failure is logged and rethrown wrapped in a {@link RuntimeException}.
     */
    private void save(PDFDoc pdfDoc, OutputStream target) {

        try {
            pdfDoc.save(target, SDFDoc.SaveMode.LINEARIZED, null);
        } catch (Exception e) {
            log.error("File could not be saved after watermark removal");
            throw new RuntimeException(e);
        }
    }


    /**
     * Runs the image removal over every page of the document, sharing one reader/writer pair
     * and one set of already visited object numbers across all pages.
     */
    @SneakyThrows
    private void execute(PDFDoc pdfDoc) {

        ElementWriter writer = new ElementWriter();
        try {
            ElementReader reader = new ElementReader();
            try {
                Set<Integer> visited = new TreeSet<>();
                for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
                    removeOverlapText(iterator.next(), reader, writer, visited);
                }
            } finally {
                // Destroy the reader before the writer, also when a page fails to process.
                reader.destroy();
            }
        } finally {
            writer.destroy();
        }
    }


    /**
     * Rewrites the content of a single page, replacing it with the filtered element stream.
     */
    @SneakyThrows
    private void removeOverlapText(Page page, ElementReader reader, ElementWriter writer, Set<Integer> visited) {

        visited.add((int) page.getSDFObj().getObjNum());
        reader.begin(page);
        writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
        processElements(reader, writer, visited, false);
        writer.end();
        reader.end();
    }


    /**
     * Consumes all remaining elements of the reader and dispatches them by type:
     * images go to {@link #processImage}, forms to {@link #processForm}, everything else is copied unchanged.
     *
     * @param isInForm {@code true} when the elements being read belong to a form XObject.
     */
    @SneakyThrows
    private void processElements(ElementReader reader, ElementWriter writer, Set<Integer> visited, boolean isInForm) {

        for (Element element = reader.next(); element != null; element = reader.next()) {
            switch (element.getType()) {
                case Element.e_image, Element.e_inline_image -> processImage(element, writer, isInForm);
                case Element.e_form -> processForm(reader, writer, element, visited);
                default -> writer.writeElement(element);
            }
        }
    }


    /**
     * Copies the form reference to the output and, the first time a form object is seen,
     * rewrites the content of the form itself with a dedicated writer.
     */
    @SneakyThrows
    private void processForm(ElementReader reader, ElementWriter writer, Element element, Set<Integer> visited) {

        writer.writeElement(element);
        Obj formObj = element.getXObject();

        // Set.add returns false for an object number that was already recorded, so each form is rewritten only once.
        if (!visited.add((int) formObj.getObjNum())) {
            return;
        }

        ElementWriter formWriter = new ElementWriter();
        try {
            reader.formBegin();
            formWriter.begin(formObj);
            reader.clearChangeList();
            formWriter.setDefaultGState(reader);
            processElements(reader, formWriter, visited, true);
            formWriter.end();
        } finally {
            // Destroy the per-form writer even when processing the form content fails.
            formWriter.destroy();
        }
        reader.end();
    }


    /**
     * Writes the image only when it is not part of a form; images inside forms are dropped.
     */
    @SneakyThrows
    private void processImage(Element element, ElementWriter writer, boolean isInForm) {
        // !!! Warning, this will also remove non-watermark images from files.
        // Idea: Remove watermarks by comparing (hash values of) images. Watermarks to remove should be uploaded in dossier/dossierTemplate.
        // Removing watermarks should be done in preprocessing, not at OCR.
        if (!isInForm) {
            writer.writeElement(element);
        }
    }

}

View File

@ -1,13 +1,12 @@
package com.iqser.red.service.ocr.v1.server.service;
import static org.junit.jupiter.api.Assertions.*;
import java.io.FileOutputStream;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ClassPathResource;
import com.iqser.red.pdftronlogic.commons.WatermarkRemovalService;
import com.iqser.red.service.ocr.v1.server.AbstractTest;
import com.iqser.red.service.ocr.v1.server.utils.OsUtils;
@ -29,5 +28,4 @@ class WatermarkRemovalServiceTest extends AbstractTest {
}
}
}