Merge branch 'RED-7075' into 'master'
RED-7075: WIP Closes RED-7075 See merge request redactmanager/ocr-service!15
This commit is contained in:
commit
04a0925a6c
@ -15,7 +15,7 @@
|
||||
<properties>
|
||||
<tennat-commons.version>0.10.0</tennat-commons.version>
|
||||
<persistence-service.version>2.118.0</persistence-service.version>
|
||||
<pdftron-logic-commons.version>2.10.0</pdftron-logic-commons.version>
|
||||
<pdftron-logic-commons.version>2.14.0</pdftron-logic-commons.version>
|
||||
</properties>
|
||||
|
||||
<dependencies>
|
||||
|
||||
@ -12,6 +12,7 @@ import org.springframework.context.annotation.Import;
|
||||
import org.springframework.scheduling.annotation.EnableAsync;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService;
|
||||
import com.iqser.red.pdftronlogic.commons.WatermarkRemovalService;
|
||||
import com.iqser.red.service.ocr.v1.server.client.FileStatusProcessingUpdateClient;
|
||||
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
|
||||
import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings;
|
||||
@ -25,7 +26,7 @@ import io.micrometer.core.instrument.MeterRegistry;
|
||||
@ImportAutoConfiguration({MultiTenancyAutoConfiguration.class})
|
||||
@EnableConfigurationProperties(OcrServiceSettings.class)
|
||||
@SpringBootApplication(exclude = {SecurityAutoConfiguration.class, ManagementWebSecurityAutoConfiguration.class})
|
||||
@Import({ MessagingConfiguration.class, StorageAutoConfiguration.class})
|
||||
@Import({MessagingConfiguration.class, StorageAutoConfiguration.class})
|
||||
@EnableFeignClients(basePackageClasses = FileStatusProcessingUpdateClient.class)
|
||||
public class Application {
|
||||
|
||||
@ -53,4 +54,11 @@ public class Application {
|
||||
return new InvisibleElementRemovalService();
|
||||
}
|
||||
|
||||
|
||||
@Bean
|
||||
public WatermarkRemovalService watermarkRemovalService() {
|
||||
|
||||
return new WatermarkRemovalService();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -14,9 +14,8 @@ import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService;
|
||||
import com.iqser.red.pdftronlogic.commons.WatermarkRemovalService;
|
||||
import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse;
|
||||
import com.iqser.red.service.ocr.v1.server.client.DossierClient;
|
||||
import com.iqser.red.service.ocr.v1.server.client.DossierTemplateClient;
|
||||
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
|
||||
import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.DossierTemplate;
|
||||
|
||||
@ -1,125 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.service;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.ElementReader;
|
||||
import com.pdftron.pdf.ElementWriter;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.pdf.PageIterator;
|
||||
import com.pdftron.sdf.Obj;
|
||||
import com.pdftron.sdf.SDFDoc;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
public class WatermarkRemovalService {
|
||||
|
||||
/**
|
||||
* !!!Warning!! This logic is definitive wrong and should NEVER run in production,
|
||||
* however it was used in second DocuMine (SCM) prototype and we currently need it to compare the results.
|
||||
*
|
||||
* @param pdfFile the file as Inputstream.
|
||||
* @param transferOutputStream the resulting file as Outputstream.
|
||||
*/
|
||||
@SneakyThrows
|
||||
public void removeWatermarks(InputStream pdfFile, OutputStream transferOutputStream) {
|
||||
|
||||
PDFDoc pdfDoc = new PDFDoc(pdfFile);
|
||||
this.execute(pdfDoc);
|
||||
|
||||
try {
|
||||
pdfDoc.save(transferOutputStream, SDFDoc.SaveMode.LINEARIZED, null);
|
||||
} catch (Exception var10) {
|
||||
log.error("File could not be saved after watermark removal");
|
||||
throw new RuntimeException(var10);
|
||||
} finally {
|
||||
pdfDoc.close();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void execute(PDFDoc pdfDoc) {
|
||||
|
||||
ElementWriter writer = new ElementWriter();
|
||||
ElementReader reader = new ElementReader();
|
||||
Set<Integer> visited = new TreeSet<>();
|
||||
|
||||
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
|
||||
Page page = iterator.next();
|
||||
removeOverlapText(page, reader, writer, visited);
|
||||
}
|
||||
|
||||
reader.destroy();
|
||||
writer.destroy();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void removeOverlapText(Page page, ElementReader reader, ElementWriter writer, Set<Integer> visited) {
|
||||
|
||||
visited.add((int) page.getSDFObj().getObjNum());
|
||||
reader.begin(page);
|
||||
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
|
||||
processElements(reader, writer, visited, false);
|
||||
writer.end();
|
||||
reader.end();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void processElements(ElementReader reader, ElementWriter writer, Set<Integer> visited, boolean isInForm) {
|
||||
|
||||
for (Element element = reader.next(); element != null; element = reader.next())
|
||||
switch (element.getType()) {
|
||||
case Element.e_image, Element.e_inline_image -> processImage(element, writer, isInForm);
|
||||
case Element.e_form -> processForm(reader, writer, element, visited);
|
||||
default -> writer.writeElement(element);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void processForm(ElementReader reader, ElementWriter writer, Element element, Set<Integer> visited) {
|
||||
|
||||
writer.writeElement(element);
|
||||
Obj formObj = element.getXObject();
|
||||
|
||||
if (!visited.contains((int) formObj.getObjNum())) {
|
||||
visited.add((int) formObj.getObjNum());
|
||||
ElementWriter formWriter = new ElementWriter();
|
||||
reader.formBegin();
|
||||
formWriter.begin(formObj);
|
||||
|
||||
reader.clearChangeList();
|
||||
formWriter.setDefaultGState(reader);
|
||||
|
||||
processElements(reader, formWriter, visited, true);
|
||||
formWriter.end();
|
||||
formWriter.destroy();
|
||||
reader.end();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void processImage(Element element, ElementWriter writer, boolean isInForm) {
|
||||
|
||||
// !!! Warning, this will also remove none watermark images form files.
|
||||
// Idea: Remove watermarks by comparing (hash values) images. Watermarks to remove should be uploaded in dossier/dossierTemplate.
|
||||
// Removing watermarks should be done in preprocessing, not at ocr.
|
||||
if (!isInForm) {
|
||||
writer.writeElement(element);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,13 +1,12 @@
|
||||
package com.iqser.red.service.ocr.v1.server.service;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
import java.io.FileOutputStream;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.WatermarkRemovalService;
|
||||
import com.iqser.red.service.ocr.v1.server.AbstractTest;
|
||||
import com.iqser.red.service.ocr.v1.server.utils.OsUtils;
|
||||
|
||||
@ -29,5 +28,4 @@ class WatermarkRemovalServiceTest extends AbstractTest {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user