RED-6126: In the OCRService, OCR Text is not applied to Document

* called PDFNet.initialize before and PDFNet.terminate after each received message
* updated comments
* renamed some variables
This commit is contained in:
Kilian Schuettler 2023-02-21 10:42:14 +01:00
parent a6d99f5916
commit ec8cf3c324
2 changed files with 22 additions and 27 deletions

View File

@ -1,34 +1,23 @@
package com.iqser.red.service.ocr.v1.server.service; package com.iqser.red.service.ocr.v1.server.service;
import static java.lang.String.format;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Map;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.stereotype.Service;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse; import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse;
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration; import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings; import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings;
import com.pdftron.common.PDFNetException; import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.OCRModule; import com.pdftron.pdf.*;
import com.pdftron.pdf.OCROptions;
import com.pdftron.pdf.Optimizer;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.RectCollection;
import com.pdftron.sdf.SDFDoc; import com.pdftron.sdf.SDFDoc;
import io.micrometer.core.annotation.Timed; import io.micrometer.core.annotation.Timed;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.stereotype.Service;
import java.io.*;
import java.util.Map;
import static java.lang.String.format;
@Slf4j @Slf4j
@Service @Service
@ -67,17 +56,17 @@ public class OCRService {
try (ByteArrayOutputStream transferOutputStream = new ByteArrayOutputStream()) { try (ByteArrayOutputStream transferOutputStream = new ByteArrayOutputStream()) {
try (InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId)) { try (InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId)) {
long start = System.currentTimeMillis(); long removalStart = System.currentTimeMillis();
log.debug("Start invisible element removal for file with dossierId {} and fileId {}", dossierId, fileId); log.debug("Start invisible element removal for file with dossierId {} and fileId {}", dossierId, fileId);
invisibleElementRemovalService.removeInvisibleElements(fileStream, transferOutputStream, false); invisibleElementRemovalService.removeInvisibleElements(fileStream, transferOutputStream, false);
long end = System.currentTimeMillis(); long removalEnd = System.currentTimeMillis();
log.debug("Invisible element removal successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, format("%.1f", (end - start) / 1000.0)); log.debug("Invisible element removal successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, format("%.1f", (removalEnd - removalStart) / 1000.0));
}
try (InputStream transferInputStream = new ByteArrayInputStream(transferOutputStream.toByteArray())) { try (InputStream transferInputStream = new ByteArrayInputStream(transferOutputStream.toByteArray())) {
long start = System.currentTimeMillis(); long ocrStart = System.currentTimeMillis();
runOcr(transferInputStream, out, fileId); runOcr(transferInputStream, out, fileId);
long end = System.currentTimeMillis(); long ocrEnd = System.currentTimeMillis();
log.info("ocr successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, format("%.1f", (end - start) / 1000.0)); log.info("ocr successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, format("%.1f", (ocrEnd - ocrStart) / 1000.0));
}
} }
} }
} }
@ -94,7 +83,7 @@ public class OCRService {
// When a page does not have a TextZone, PDFTron whites out the page. But, PDFTron scans it anyway, resulting in a longer runtime. // When a page does not have a TextZone, PDFTron whites out the page. But, PDFTron scans it anyway, resulting in a longer runtime.
// So, we need to remove pages without images. // So, we need to remove pages without images.
// Furthermore, creating a new document is *much* faster than reusing the same document and adding/removing pages one by one. // Furthermore, creating a new document is *much* faster than reusing the same document and adding/removing pages one by one.
// Therefore, we create a new Document with a single page for every page that contains text. // Therefore, we create a new Document with a single page for every page that contains an image.
int numProcessedPages = 0; int numProcessedPages = 0;
for (Integer pageId : pageIdToRectCollection.keySet()) { for (Integer pageId : pageIdToRectCollection.keySet()) {
try { try {
@ -139,6 +128,7 @@ public class OCRService {
log.error("Processed File with fileId {} could not be saved", fileId); log.error("Processed File with fileId {} could not be saved", fileId);
throw new RuntimeException(e); throw new RuntimeException(e);
} }
pdfDoc.close();
} }

View File

@ -15,6 +15,8 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.ocr.v1.api.model.DocumentRequest; import com.iqser.red.service.ocr.v1.api.model.DocumentRequest;
import com.iqser.red.service.ocr.v1.server.client.FileStatusProcessingUpdateClient; import com.iqser.red.service.ocr.v1.server.client.FileStatusProcessingUpdateClient;
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration; import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
import com.iqser.red.service.ocr.v1.server.initializer.PDFNetInitializer;
import com.pdftron.pdf.PDFNet;
import feign.FeignException; import feign.FeignException;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
@ -28,6 +30,7 @@ public class OcrMessageReceiver {
private final ObjectMapper objectMapper; private final ObjectMapper objectMapper;
private final FileStorageService fileStorageService; private final FileStorageService fileStorageService;
private final FileStatusProcessingUpdateClient fileStatusProcessingUpdateClient; private final FileStatusProcessingUpdateClient fileStatusProcessingUpdateClient;
private final PDFNetInitializer pdfNetInitializer;
private final OCRService ocrService; private final OCRService ocrService;
@ -36,6 +39,7 @@ public class OcrMessageReceiver {
@RabbitListener(queues = MessagingConfiguration.OCR_QUEUE, concurrency = "1") @RabbitListener(queues = MessagingConfiguration.OCR_QUEUE, concurrency = "1")
public void receiveOcr(String in) throws JsonProcessingException { public void receiveOcr(String in) throws JsonProcessingException {
pdfNetInitializer.init();
DocumentRequest ocrRequestMessage = objectMapper.readValue(in, DocumentRequest.class); DocumentRequest ocrRequestMessage = objectMapper.readValue(in, DocumentRequest.class);
log.info("Start ocr for file with dossierId {} and fileId {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId()); log.info("Start ocr for file with dossierId {} and fileId {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
@ -58,6 +62,7 @@ public class OcrMessageReceiver {
} }
fileStatusProcessingUpdateClient.ocrSuccessful(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId()); fileStatusProcessingUpdateClient.ocrSuccessful(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
PDFNet.terminate();
} }