RED-6126: In the OCRService, OCR Text is not applied to Document
* Called PDFNet.initialize and terminate before and after message receive
* Updated comments
* Renamed some variables
This commit is contained in:
parent
a6d99f5916
commit
ec8cf3c324
@ -1,34 +1,23 @@
|
|||||||
package com.iqser.red.service.ocr.v1.server.service;
|
package com.iqser.red.service.ocr.v1.server.service;
|
||||||
|
|
||||||
import static java.lang.String.format;
|
|
||||||
|
|
||||||
import java.io.ByteArrayInputStream;
|
|
||||||
import java.io.ByteArrayOutputStream;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.InputStream;
|
|
||||||
import java.io.OutputStream;
|
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
|
||||||
import org.springframework.stereotype.Service;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse;
|
import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse;
|
||||||
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
|
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
|
||||||
import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings;
|
import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings;
|
||||||
import com.pdftron.common.PDFNetException;
|
import com.pdftron.common.PDFNetException;
|
||||||
import com.pdftron.pdf.OCRModule;
|
import com.pdftron.pdf.*;
|
||||||
import com.pdftron.pdf.OCROptions;
|
|
||||||
import com.pdftron.pdf.Optimizer;
|
|
||||||
import com.pdftron.pdf.PDFDoc;
|
|
||||||
import com.pdftron.pdf.Page;
|
|
||||||
import com.pdftron.pdf.RectCollection;
|
|
||||||
import com.pdftron.sdf.SDFDoc;
|
import com.pdftron.sdf.SDFDoc;
|
||||||
|
|
||||||
import io.micrometer.core.annotation.Timed;
|
import io.micrometer.core.annotation.Timed;
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import static java.lang.String.format;
|
||||||
|
|
||||||
@Slf4j
|
@Slf4j
|
||||||
@Service
|
@Service
|
||||||
@ -67,17 +56,17 @@ public class OCRService {
|
|||||||
|
|
||||||
try (ByteArrayOutputStream transferOutputStream = new ByteArrayOutputStream()) {
|
try (ByteArrayOutputStream transferOutputStream = new ByteArrayOutputStream()) {
|
||||||
try (InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId)) {
|
try (InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId)) {
|
||||||
long start = System.currentTimeMillis();
|
long removalStart = System.currentTimeMillis();
|
||||||
log.debug("Start invisible element removal for file with dossierId {} and fileId {}", dossierId, fileId);
|
log.debug("Start invisible element removal for file with dossierId {} and fileId {}", dossierId, fileId);
|
||||||
invisibleElementRemovalService.removeInvisibleElements(fileStream, transferOutputStream, false);
|
invisibleElementRemovalService.removeInvisibleElements(fileStream, transferOutputStream, false);
|
||||||
long end = System.currentTimeMillis();
|
long removalEnd = System.currentTimeMillis();
|
||||||
log.debug("Invisible element removal successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, format("%.1f", (end - start) / 1000.0));
|
log.debug("Invisible element removal successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, format("%.1f", (removalEnd - removalStart) / 1000.0));
|
||||||
}
|
|
||||||
try (InputStream transferInputStream = new ByteArrayInputStream(transferOutputStream.toByteArray())) {
|
try (InputStream transferInputStream = new ByteArrayInputStream(transferOutputStream.toByteArray())) {
|
||||||
long start = System.currentTimeMillis();
|
long ocrStart = System.currentTimeMillis();
|
||||||
runOcr(transferInputStream, out, fileId);
|
runOcr(transferInputStream, out, fileId);
|
||||||
long end = System.currentTimeMillis();
|
long ocrEnd = System.currentTimeMillis();
|
||||||
log.info("ocr successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, format("%.1f", (end - start) / 1000.0));
|
log.info("ocr successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, format("%.1f", (ocrEnd - ocrStart) / 1000.0));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -94,7 +83,7 @@ public class OCRService {
|
|||||||
// When a page does not have a TextZone, PDFTron whites out the page. But, PDFTron scans it anyway, resulting in a longer runtime.
|
// When a page does not have a TextZone, PDFTron whites out the page. But, PDFTron scans it anyway, resulting in a longer runtime.
|
||||||
// So, we need to remove pages without images.
|
// So, we need to remove pages without images.
|
||||||
// Furthermore, creating a new document is *much* faster than reusing the same document and adding/removing pages one by one.
|
// Furthermore, creating a new document is *much* faster than reusing the same document and adding/removing pages one by one.
|
||||||
// Therefore, we create a new Document with a single page for every page that contains text.
|
// Therefore, we create a new Document with a single page for every page that contains an image.
|
||||||
int numProcessedPages = 0;
|
int numProcessedPages = 0;
|
||||||
for (Integer pageId : pageIdToRectCollection.keySet()) {
|
for (Integer pageId : pageIdToRectCollection.keySet()) {
|
||||||
try {
|
try {
|
||||||
@ -139,6 +128,7 @@ public class OCRService {
|
|||||||
log.error("Processed File with fileId {} could not be saved", fileId);
|
log.error("Processed File with fileId {} could not be saved", fileId);
|
||||||
throw new RuntimeException(e);
|
throw new RuntimeException(e);
|
||||||
}
|
}
|
||||||
|
pdfDoc.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -15,6 +15,8 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
|||||||
import com.iqser.red.service.ocr.v1.api.model.DocumentRequest;
|
import com.iqser.red.service.ocr.v1.api.model.DocumentRequest;
|
||||||
import com.iqser.red.service.ocr.v1.server.client.FileStatusProcessingUpdateClient;
|
import com.iqser.red.service.ocr.v1.server.client.FileStatusProcessingUpdateClient;
|
||||||
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
|
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
|
||||||
|
import com.iqser.red.service.ocr.v1.server.initializer.PDFNetInitializer;
|
||||||
|
import com.pdftron.pdf.PDFNet;
|
||||||
|
|
||||||
import feign.FeignException;
|
import feign.FeignException;
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
@ -28,6 +30,7 @@ public class OcrMessageReceiver {
|
|||||||
private final ObjectMapper objectMapper;
|
private final ObjectMapper objectMapper;
|
||||||
private final FileStorageService fileStorageService;
|
private final FileStorageService fileStorageService;
|
||||||
private final FileStatusProcessingUpdateClient fileStatusProcessingUpdateClient;
|
private final FileStatusProcessingUpdateClient fileStatusProcessingUpdateClient;
|
||||||
|
private final PDFNetInitializer pdfNetInitializer;
|
||||||
|
|
||||||
private final OCRService ocrService;
|
private final OCRService ocrService;
|
||||||
|
|
||||||
@ -36,6 +39,7 @@ public class OcrMessageReceiver {
|
|||||||
@RabbitListener(queues = MessagingConfiguration.OCR_QUEUE, concurrency = "1")
|
@RabbitListener(queues = MessagingConfiguration.OCR_QUEUE, concurrency = "1")
|
||||||
public void receiveOcr(String in) throws JsonProcessingException {
|
public void receiveOcr(String in) throws JsonProcessingException {
|
||||||
|
|
||||||
|
pdfNetInitializer.init();
|
||||||
DocumentRequest ocrRequestMessage = objectMapper.readValue(in, DocumentRequest.class);
|
DocumentRequest ocrRequestMessage = objectMapper.readValue(in, DocumentRequest.class);
|
||||||
|
|
||||||
log.info("Start ocr for file with dossierId {} and fileId {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
|
log.info("Start ocr for file with dossierId {} and fileId {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
|
||||||
@ -58,6 +62,7 @@ public class OcrMessageReceiver {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fileStatusProcessingUpdateClient.ocrSuccessful(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
|
fileStatusProcessingUpdateClient.ocrSuccessful(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
|
||||||
|
PDFNet.terminate();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user