Pull request #14: RED-6321: OCR not working correctly with 3.6.0
Merge in RED/ocr-service from RED-6321 to master * commit '2d503c74a6fba0b02ac4c92af82a493165e45761': RED-6321: OCR not working correctly with 3.6.0 * Added end() calls to the formWriters so that their changes are written to the PDF content stream * Moved replaceOriginalPageWithOcrPage outside the main OCR loop, since calling it inside the loop caused exponential RAM usage in some cases
This commit is contained in:
commit
d91fb737cb
@ -77,6 +77,7 @@ public class InvisibleElementRemovalService {
|
||||
|
||||
visitedXObjIds.add(page.getSDFObj().getObjNum());
|
||||
|
||||
|
||||
InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder()
|
||||
.reader(reader)
|
||||
.clippingPathStack(new ClippingPathStack(page.getMediaBox()))
|
||||
@ -91,8 +92,6 @@ public class InvisibleElementRemovalService {
|
||||
context.visitedXObjIds().clear();
|
||||
|
||||
removeOverlappedElements(page, writer, context);
|
||||
reader.end();
|
||||
writer.end();
|
||||
}
|
||||
|
||||
try {
|
||||
@ -227,6 +226,7 @@ public class InvisibleElementRemovalService {
|
||||
formWriter.setDefaultGState(context.reader());
|
||||
|
||||
processElements(formWriter, context);
|
||||
formWriter.end();
|
||||
formWriter.destroy();
|
||||
context.reader().end();
|
||||
}
|
||||
@ -364,6 +364,7 @@ public class InvisibleElementRemovalService {
|
||||
formWriter.setDefaultGState(context.reader());
|
||||
|
||||
processOverlappedElements(formWriter, context);
|
||||
formWriter.end();
|
||||
formWriter.destroy();
|
||||
context.reader().end();
|
||||
}
|
||||
|
||||
@ -1,23 +1,35 @@
|
||||
package com.iqser.red.service.ocr.v1.server.service;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse;
|
||||
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
|
||||
import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.*;
|
||||
import com.pdftron.pdf.OCRModule;
|
||||
import com.pdftron.pdf.OCROptions;
|
||||
import com.pdftron.pdf.Optimizer;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.pdf.RectCollection;
|
||||
import com.pdftron.sdf.SDFDoc;
|
||||
|
||||
import io.micrometer.core.annotation.Timed;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.Map;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@ -60,7 +72,10 @@ public class OCRService {
|
||||
log.debug("Start invisible element removal for file with dossierId {} and fileId {}", dossierId, fileId);
|
||||
invisibleElementRemovalService.removeInvisibleElements(fileStream, transferOutputStream, false);
|
||||
long removalEnd = System.currentTimeMillis();
|
||||
log.debug("Invisible element removal successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, format("%.1f", (removalEnd - removalStart) / 1000.0));
|
||||
log.debug("Invisible element removal successful for file with dossierId {} and fileId {}, took {}s",
|
||||
dossierId,
|
||||
fileId,
|
||||
format("%.1f", (removalEnd - removalStart) / 1000.0));
|
||||
}
|
||||
try (InputStream transferInputStream = new ByteArrayInputStream(transferOutputStream.toByteArray())) {
|
||||
long ocrStart = System.currentTimeMillis();
|
||||
@ -78,18 +93,22 @@ public class OCRService {
|
||||
PDFDoc pdfDoc = new PDFDoc(fileStream);
|
||||
|
||||
Map<Integer, RectCollection> pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, true);
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
|
||||
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder().fileId(fileId).numberOfPagesToOCR(pageIdToRectCollection.size()).build()));
|
||||
|
||||
// Optimization:
|
||||
// When a page does not have a TextZone, PDFTron whites out the page. However, PDFTron still scans it, resulting in a longer runtime.
|
||||
// So, we need to remove pages without images.
|
||||
// Furthermore, creating a new document is *much* faster than reusing the same document and adding/removing pages one by one.
|
||||
// Therefore, we create a new Document with a single page for every page that contains an image.
|
||||
// For some reason, inserting the OCRed singlePagePdfDoc into the original PDFDoc inside the loop makes the RAM usage of some documents increase exponentially with every page.
|
||||
// This is why we replace the OCRed pages outside the main loop.
|
||||
int numProcessedPages = 0;
|
||||
Map<Integer, PDFDoc> pageIdToSinglePagePdfDoc = new HashMap<>();
|
||||
for (Integer pageId : pageIdToRectCollection.keySet()) {
|
||||
try {
|
||||
PDFDoc singlePagePdfDoc = extractSinglePagePdfDoc(pdfDoc, pageId);
|
||||
processOcr(pageIdToRectCollection, pageId, singlePagePdfDoc);
|
||||
++numProcessedPages;
|
||||
|
||||
log.info("{}/{} Page {} done, OCR regions {}",
|
||||
numProcessedPages,
|
||||
@ -97,9 +116,8 @@ public class OCRService {
|
||||
pageId,
|
||||
getAllOcrTextZonesAsString(pageIdToRectCollection, pageId));
|
||||
|
||||
replaceOriginalPageWithOcrPage(pdfDoc, pageId, singlePagePdfDoc);
|
||||
|
||||
singlePagePdfDoc.close();
|
||||
pageIdToSinglePagePdfDoc.put(pageId, singlePagePdfDoc);
|
||||
++numProcessedPages;
|
||||
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
|
||||
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder()
|
||||
@ -113,6 +131,9 @@ public class OCRService {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
log.info("Copying {} OCRed Pages into original Document", pageIdToSinglePagePdfDoc.size());
|
||||
pageIdToSinglePagePdfDoc.forEach((pageId, singlePagePdfDoc) -> replaceOriginalPageWithOcrPage(pdfDoc, pageId, singlePagePdfDoc));
|
||||
Optimizer.optimize(pdfDoc);
|
||||
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
|
||||
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder()
|
||||
@ -122,7 +143,6 @@ public class OCRService {
|
||||
.ocrFinished(true)
|
||||
.build()));
|
||||
|
||||
Optimizer.optimize(pdfDoc);
|
||||
try {
|
||||
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
|
||||
} catch (Exception e) {
|
||||
@ -154,11 +174,13 @@ public class OCRService {
|
||||
}
|
||||
|
||||
|
||||
private static void replaceOriginalPageWithOcrPage(PDFDoc pdfDoc, Integer pageId, PDFDoc ocrPageDoc) throws PDFNetException {
|
||||
@SneakyThrows
|
||||
private static void replaceOriginalPageWithOcrPage(PDFDoc pdfDoc, Integer pageId, PDFDoc singlePagePdfDoc) {
|
||||
|
||||
Page ocrPage = ocrPageDoc.getPage(1);
|
||||
Page ocrPage = singlePagePdfDoc.getPage(1);
|
||||
pdfDoc.pageInsert(pdfDoc.getPageIterator(pageId), ocrPage);
|
||||
pdfDoc.pageRemove(pdfDoc.getPageIterator(pageId + 1));
|
||||
singlePagePdfDoc.close();
|
||||
}
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user