Pull request #14: RED-6321: OCR not working correctly with 3.6.0

Merge in RED/ocr-service from RED-6321 to master

* commit '2d503c74a6fba0b02ac4c92af82a493165e45761':
  RED-6321: OCR not working correctly with 3.6.0 * added end() calls to the formWriters so that their changes are written to the PDF content stream * moved replaceOriginalPageWithOcrPage outside the main OCR loop, since it caused exponential RAM usage in some cases
This commit is contained in:
Kilian Schuettler 2023-03-06 10:16:02 +01:00
commit d91fb737cb
2 changed files with 41 additions and 18 deletions

View File

@ -77,6 +77,7 @@ public class InvisibleElementRemovalService {
visitedXObjIds.add(page.getSDFObj().getObjNum());
InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder()
.reader(reader)
.clippingPathStack(new ClippingPathStack(page.getMediaBox()))
@ -91,8 +92,6 @@ public class InvisibleElementRemovalService {
context.visitedXObjIds().clear();
removeOverlappedElements(page, writer, context);
reader.end();
writer.end();
}
try {
@ -227,6 +226,7 @@ public class InvisibleElementRemovalService {
formWriter.setDefaultGState(context.reader());
processElements(formWriter, context);
formWriter.end();
formWriter.destroy();
context.reader().end();
}
@ -364,6 +364,7 @@ public class InvisibleElementRemovalService {
formWriter.setDefaultGState(context.reader());
processOverlappedElements(formWriter, context);
formWriter.end();
formWriter.destroy();
context.reader().end();
}

View File

@ -1,23 +1,35 @@
package com.iqser.red.service.ocr.v1.server.service;
import static java.lang.String.format;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.HashMap;
import java.util.Map;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.stereotype.Service;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse;
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.*;
import com.pdftron.pdf.OCRModule;
import com.pdftron.pdf.OCROptions;
import com.pdftron.pdf.Optimizer;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.RectCollection;
import com.pdftron.sdf.SDFDoc;
import io.micrometer.core.annotation.Timed;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.stereotype.Service;
import java.io.*;
import java.util.Map;
import static java.lang.String.format;
@Slf4j
@Service
@ -60,7 +72,10 @@ public class OCRService {
log.debug("Start invisible element removal for file with dossierId {} and fileId {}", dossierId, fileId);
invisibleElementRemovalService.removeInvisibleElements(fileStream, transferOutputStream, false);
long removalEnd = System.currentTimeMillis();
log.debug("Invisible element removal successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, format("%.1f", (removalEnd - removalStart) / 1000.0));
log.debug("Invisible element removal successful for file with dossierId {} and fileId {}, took {}s",
dossierId,
fileId,
format("%.1f", (removalEnd - removalStart) / 1000.0));
}
try (InputStream transferInputStream = new ByteArrayInputStream(transferOutputStream.toByteArray())) {
long ocrStart = System.currentTimeMillis();
@ -78,18 +93,22 @@ public class OCRService {
PDFDoc pdfDoc = new PDFDoc(fileStream);
Map<Integer, RectCollection> pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, true);
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder().fileId(fileId).numberOfPagesToOCR(pageIdToRectCollection.size()).build()));
// Optimization:
// When a page does not have a TextZone, PDFTron whites out the page. But, PDFTron scans it anyway, resulting in a longer runtime.
// So, we need to remove pages without images.
// Furthermore, creating a new document is *much* faster than reusing the same document and adding/removing pages one by one.
// Therefore, we create a new Document with a single page for every page that contains an image.
// For some reason, inserting the OCRed singlePageDoc into the original PDFDoc inside the loop makes the RAM usage grow exponentially with every page for some documents.
// This is why we replace the OCRed pages outside the main loop.
int numProcessedPages = 0;
Map<Integer, PDFDoc> pageIdToSinglePagePdfDoc = new HashMap<>();
for (Integer pageId : pageIdToRectCollection.keySet()) {
try {
PDFDoc singlePagePdfDoc = extractSinglePagePdfDoc(pdfDoc, pageId);
processOcr(pageIdToRectCollection, pageId, singlePagePdfDoc);
++numProcessedPages;
log.info("{}/{} Page {} done, OCR regions {}",
numProcessedPages,
@ -97,9 +116,8 @@ public class OCRService {
pageId,
getAllOcrTextZonesAsString(pageIdToRectCollection, pageId));
replaceOriginalPageWithOcrPage(pdfDoc, pageId, singlePagePdfDoc);
singlePagePdfDoc.close();
pageIdToSinglePagePdfDoc.put(pageId, singlePagePdfDoc);
++numProcessedPages;
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder()
@ -113,6 +131,9 @@ public class OCRService {
throw new RuntimeException(e);
}
}
log.info("Copying {} OCRed Pages into original Document", pageIdToSinglePagePdfDoc.size());
pageIdToSinglePagePdfDoc.forEach((pageId, singlePagePdfDoc) -> replaceOriginalPageWithOcrPage(pdfDoc, pageId, singlePagePdfDoc));
Optimizer.optimize(pdfDoc);
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder()
@ -122,7 +143,6 @@ public class OCRService {
.ocrFinished(true)
.build()));
Optimizer.optimize(pdfDoc);
try {
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
} catch (Exception e) {
@ -154,11 +174,13 @@ public class OCRService {
}
private static void replaceOriginalPageWithOcrPage(PDFDoc pdfDoc, Integer pageId, PDFDoc ocrPageDoc) throws PDFNetException {
@SneakyThrows
private static void replaceOriginalPageWithOcrPage(PDFDoc pdfDoc, Integer pageId, PDFDoc singlePagePdfDoc) {
Page ocrPage = ocrPageDoc.getPage(1);
Page ocrPage = singlePagePdfDoc.getPage(1);
pdfDoc.pageInsert(pdfDoc.getPageIterator(pageId), ocrPage);
pdfDoc.pageRemove(pdfDoc.getPageIterator(pageId + 1));
singlePagePdfDoc.close();
}