RED-6321: OCR not working correctly with 3.6.0

* Added end() calls to the formWriters so that their changes are written to the PDF content stream.
* Moved replaceOriginalPageWithOcrPage outside the main OCR loop, since calling it inside the loop caused exponential RAM usage in some cases.
This commit is contained in:
Kilian Schuettler 2023-03-03 10:16:46 +01:00
parent 7427258349
commit ac3947962a
2 changed files with 35 additions and 18 deletions

View File

@ -77,6 +77,7 @@ public class InvisibleElementRemovalService {
visitedXObjIds.add(page.getSDFObj().getObjNum());
InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder()
.reader(reader)
.clippingPathStack(new ClippingPathStack(page.getMediaBox()))
@ -91,8 +92,6 @@ public class InvisibleElementRemovalService {
context.visitedXObjIds().clear();
removeOverlappedElements(page, writer, context);
reader.end();
writer.end();
}
try {
@ -227,6 +226,7 @@ public class InvisibleElementRemovalService {
formWriter.setDefaultGState(context.reader());
processElements(formWriter, context);
formWriter.end();
formWriter.destroy();
context.reader().end();
}
@ -364,6 +364,7 @@ public class InvisibleElementRemovalService {
formWriter.setDefaultGState(context.reader());
processOverlappedElements(formWriter, context);
formWriter.end();
formWriter.destroy();
context.reader().end();
}

View File

@ -1,23 +1,35 @@
package com.iqser.red.service.ocr.v1.server.service;
import static java.lang.String.format;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.HashMap;
import java.util.Map;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.stereotype.Service;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse;
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.*;
import com.pdftron.pdf.OCRModule;
import com.pdftron.pdf.OCROptions;
import com.pdftron.pdf.Optimizer;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.RectCollection;
import com.pdftron.sdf.SDFDoc;
import io.micrometer.core.annotation.Timed;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.stereotype.Service;
import java.io.*;
import java.util.Map;
import static java.lang.String.format;
@Slf4j
@Service
@ -84,11 +96,14 @@ public class OCRService {
// So, we need to remove pages without images.
// Furthermore, creating a new document is *much* faster than reusing the same document and adding/removing pages one by one.
// Therefore, we create a new Document with a single page for every page that contains an image.
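// Moreover, if we insert the OCR page into the PDFDoc inside the loop, the RAM usage for some documents increases exponentially with every page (the cause of this is not understood). Therefore, the OCR pages are collected first and only inserted into the original document after the loop.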
int numProcessedPages = 0;
Map<Integer, PDFDoc> pageIdToSinglePagePdfDoc = new HashMap<>();
for (Integer pageId : pageIdToRectCollection.keySet()) {
try {
PDFDoc singlePagePdfDoc = extractSinglePagePdfDoc(pdfDoc, pageId);
processOcr(pageIdToRectCollection, pageId, singlePagePdfDoc);
pageIdToSinglePagePdfDoc.put(pageId, singlePagePdfDoc);
++numProcessedPages;
log.info("{}/{} Page {} done, OCR regions {}",
@ -97,9 +112,6 @@ public class OCRService {
pageId,
getAllOcrTextZonesAsString(pageIdToRectCollection, pageId));
replaceOriginalPageWithOcrPage(pdfDoc, pageId, singlePagePdfDoc);
singlePagePdfDoc.close();
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder()
@ -113,6 +125,9 @@ public class OCRService {
throw new RuntimeException(e);
}
}
log.info("Copying {} OCRed Pages into original Document", pageIdToSinglePagePdfDoc.size());
pageIdToSinglePagePdfDoc.forEach((pageId, singlePagePdfDoc) -> replaceOriginalPageWithOcrPage(pdfDoc, pageId, singlePagePdfDoc));
Optimizer.optimize(pdfDoc);
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder()
@ -122,7 +137,6 @@ public class OCRService {
.ocrFinished(true)
.build()));
Optimizer.optimize(pdfDoc);
try {
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
} catch (Exception e) {
@ -154,11 +168,13 @@ public class OCRService {
}
private static void replaceOriginalPageWithOcrPage(PDFDoc pdfDoc, Integer pageId, PDFDoc ocrPageDoc) throws PDFNetException {
@SneakyThrows
private static void replaceOriginalPageWithOcrPage(PDFDoc pdfDoc, Integer pageIndex, PDFDoc singlePagePdfDoc) {
Page ocrPage = ocrPageDoc.getPage(1);
pdfDoc.pageInsert(pdfDoc.getPageIterator(pageId), ocrPage);
pdfDoc.pageRemove(pdfDoc.getPageIterator(pageId + 1));
Page ocrPage = singlePagePdfDoc.getPage(1);
pdfDoc.pageInsert(pdfDoc.getPageIterator(pageIndex), ocrPage);
pdfDoc.pageRemove(pdfDoc.getPageIterator(pageIndex + 1));
singlePagePdfDoc.close();
}