RED-6321: OCR not working correctly with 3.6.0
* Added end() calls to the formWriters so that their changes are written to the PDF content stream.
* Moved replaceOriginalPageWithOcrPage outside the main OCR loop, since it caused exponential RAM usage in some cases.
This commit is contained in:
parent
7427258349
commit
ac3947962a
@ -77,6 +77,7 @@ public class InvisibleElementRemovalService {
|
||||
|
||||
visitedXObjIds.add(page.getSDFObj().getObjNum());
|
||||
|
||||
|
||||
InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder()
|
||||
.reader(reader)
|
||||
.clippingPathStack(new ClippingPathStack(page.getMediaBox()))
|
||||
@ -91,8 +92,6 @@ public class InvisibleElementRemovalService {
|
||||
context.visitedXObjIds().clear();
|
||||
|
||||
removeOverlappedElements(page, writer, context);
|
||||
reader.end();
|
||||
writer.end();
|
||||
}
|
||||
|
||||
try {
|
||||
@ -227,6 +226,7 @@ public class InvisibleElementRemovalService {
|
||||
formWriter.setDefaultGState(context.reader());
|
||||
|
||||
processElements(formWriter, context);
|
||||
formWriter.end();
|
||||
formWriter.destroy();
|
||||
context.reader().end();
|
||||
}
|
||||
@ -364,6 +364,7 @@ public class InvisibleElementRemovalService {
|
||||
formWriter.setDefaultGState(context.reader());
|
||||
|
||||
processOverlappedElements(formWriter, context);
|
||||
formWriter.end();
|
||||
formWriter.destroy();
|
||||
context.reader().end();
|
||||
}
|
||||
|
||||
@ -1,23 +1,35 @@
|
||||
package com.iqser.red.service.ocr.v1.server.service;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse;
|
||||
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
|
||||
import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.*;
|
||||
import com.pdftron.pdf.OCRModule;
|
||||
import com.pdftron.pdf.OCROptions;
|
||||
import com.pdftron.pdf.Optimizer;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.pdf.RectCollection;
|
||||
import com.pdftron.sdf.SDFDoc;
|
||||
|
||||
import io.micrometer.core.annotation.Timed;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.Map;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@ -84,11 +96,14 @@ public class OCRService {
|
||||
// So, we need to remove pages without images.
|
||||
// Furthermore, creating a new document is *much* faster than reusing the same document and adding/removing pages one by one.
|
||||
// Therefore, we create a new Document with a single page for every page that contains an image.
|
||||
// Moreover, if we insert the OCR page into the PDFDoc inside the loop, the RAM usage increases exponentially with every page for some documents. The cause is unknown (it does not make any sense to me), so the OCR pages are collected here and inserted after the loop.
|
||||
int numProcessedPages = 0;
|
||||
Map<Integer, PDFDoc> pageIdToSinglePagePdfDoc = new HashMap<>();
|
||||
for (Integer pageId : pageIdToRectCollection.keySet()) {
|
||||
try {
|
||||
PDFDoc singlePagePdfDoc = extractSinglePagePdfDoc(pdfDoc, pageId);
|
||||
processOcr(pageIdToRectCollection, pageId, singlePagePdfDoc);
|
||||
pageIdToSinglePagePdfDoc.put(pageId, singlePagePdfDoc);
|
||||
++numProcessedPages;
|
||||
|
||||
log.info("{}/{} Page {} done, OCR regions {}",
|
||||
@ -97,9 +112,6 @@ public class OCRService {
|
||||
pageId,
|
||||
getAllOcrTextZonesAsString(pageIdToRectCollection, pageId));
|
||||
|
||||
replaceOriginalPageWithOcrPage(pdfDoc, pageId, singlePagePdfDoc);
|
||||
|
||||
singlePagePdfDoc.close();
|
||||
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
|
||||
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder()
|
||||
@ -113,6 +125,9 @@ public class OCRService {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
log.info("Copying {} OCRed Pages into original Document", pageIdToSinglePagePdfDoc.size());
|
||||
pageIdToSinglePagePdfDoc.forEach((pageId, singlePagePdfDoc) -> replaceOriginalPageWithOcrPage(pdfDoc, pageId, singlePagePdfDoc));
|
||||
Optimizer.optimize(pdfDoc);
|
||||
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
|
||||
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder()
|
||||
@ -122,7 +137,6 @@ public class OCRService {
|
||||
.ocrFinished(true)
|
||||
.build()));
|
||||
|
||||
Optimizer.optimize(pdfDoc);
|
||||
try {
|
||||
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
|
||||
} catch (Exception e) {
|
||||
@ -154,11 +168,13 @@ public class OCRService {
|
||||
}
|
||||
|
||||
|
||||
private static void replaceOriginalPageWithOcrPage(PDFDoc pdfDoc, Integer pageId, PDFDoc ocrPageDoc) throws PDFNetException {
|
||||
@SneakyThrows
|
||||
private static void replaceOriginalPageWithOcrPage(PDFDoc pdfDoc, Integer pageIndex, PDFDoc singlePagePdfDoc) {
|
||||
|
||||
Page ocrPage = ocrPageDoc.getPage(1);
|
||||
pdfDoc.pageInsert(pdfDoc.getPageIterator(pageId), ocrPage);
|
||||
pdfDoc.pageRemove(pdfDoc.getPageIterator(pageId + 1));
|
||||
Page ocrPage = singlePagePdfDoc.getPage(1);
|
||||
pdfDoc.pageInsert(pdfDoc.getPageIterator(pageIndex), ocrPage);
|
||||
pdfDoc.pageRemove(pdfDoc.getPageIterator(pageIndex + 1));
|
||||
singlePagePdfDoc.close();
|
||||
}
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user