RED-6126: In the OCRService, OCR Text is not applied to Document

* Reverted the way OCR text is applied to the document back to its previous behavior
* Refactored the OCR service slightly
* Added meaningful test cases
This commit is contained in:
Kilian Schuettler 2023-02-03 13:01:01 +01:00
parent b37ec5afc9
commit edd044395e
2 changed files with 104 additions and 41 deletions

View File

@ -20,12 +20,14 @@ import com.iqser.red.service.ocr.v1.server.model.image.ImageServiceResponse;
import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.OCRModule;
import com.pdftron.pdf.OCROptions;
import com.pdftron.pdf.Optimizer;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.RectCollection;
import com.pdftron.pdf.TextExtractor;
import com.pdftron.sdf.SDFDoc;
import lombok.RequiredArgsConstructor;
@ -55,35 +57,37 @@ public class OCRService {
InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId);
ImageServiceResponse imageServiceResponse = fileStorageService.getImageServiceResponse(dossierId, fileId);
byte[] fileWithoutInvisibleTextStream = invisibleElementRemovalService.removeInvisibleElements(fileStream, false);
byte[] fileWithoutInvisibleTextBytes = invisibleElementRemovalService.removeInvisibleElements(fileStream, false);
byte[] ocrBytes = ocr(fileWithoutInvisibleTextStream, fileId, imageServiceResponse);
byte[] ocrBytes = runOcrOnImages(fileWithoutInvisibleTextBytes, fileId, imageServiceResponse);
return new ByteArrayInputStream(ocrBytes);
}
private byte[] ocr(byte[] file, String fileId, ImageServiceResponse imageServiceResponse) {
private byte[] runOcrOnImages(byte[] file, String fileId, ImageServiceResponse imageServiceResponse) {
PDFDoc pdfDoc = null;
try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
pdfDoc = new PDFDoc(file);
Map<Integer, List<ImagePosition>> pages = new HashMap<>();
Map<Integer, List<ImagePosition>> pageIdToImgPos = new HashMap<>();
imageServiceResponse.getData()
.forEach(imageMetadata -> pages.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
.forEach(imageMetadata -> pageIdToImgPos.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
.add(new ImagePosition(new Rectangle(new Point(imageMetadata.getPosition().getX1(), imageMetadata.getPosition().getY1()),
imageMetadata.getGeometry().getWidth(),
imageMetadata.getGeometry().getHeight(),
imageMetadata.getPosition().getPageNumber()), imageMetadata.isAlpha())));
Map<Integer, PDFDoc> pdfDocMap = Collections.synchronizedMap(new HashMap<>());
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder().fileId(fileId).numberOfPagesToOCR(pages.keySet().size()).build()));
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder().fileId(fileId).numberOfPagesToOCR(pageIdToImgPos.size()).build()));
ocrPages(pdfDoc, fileId, pages, pdfDocMap);
// the PDFDoc is a helper document, which contains exactly one page
Map<Integer, PDFDoc> pageIdToOcrPageMap = runOcrPerPage(pdfDoc, fileId, pageIdToImgPos);
addOCRPagesToDocIfAdditionalWordsFound(pdfDoc, pageIdToOcrPageMap);
Optimizer.optimize(pdfDoc);
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
@ -92,8 +96,8 @@ public class OCRService {
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder()
.fileId(fileId)
.numberOfPagesToOCR(pages.keySet().size())
.numberOfOCRedPages(pages.keySet().size())
.numberOfPagesToOCR(pageIdToImgPos.size())
.numberOfOCRedPages(pageIdToOcrPageMap.size())
.ocrFinished(true)
.build()));
@ -113,21 +117,23 @@ public class OCRService {
@SneakyThrows
private void ocrPages(PDFDoc pdfDoc, String fileId, Map<Integer, List<ImagePosition>> pages, Map<Integer, PDFDoc> pdfDocMap) {
private Map<Integer, PDFDoc> runOcrPerPage(PDFDoc pdfDoc, String fileId, Map<Integer, List<ImagePosition>> pageIdToImgPosMap) {
int numberOfOCRedPages = 0;
for (var pageEntry : pages.entrySet()) {
Map<Integer, PDFDoc> pageIdToOcrPageMap = Collections.synchronizedMap(new HashMap<>());
int numberOfRunPages = 0;
for (var pageIdToImgPos : pageIdToImgPosMap.entrySet()) {
try {
RectCollection rectCollection = new RectCollection();
var page = pageEntry.getKey();
Integer pageIndex = pageIdToImgPos.getKey();
Page pdfPage = pdfDoc.getPageIterator(page).next();
Page pdfPage = pdfDoc.getPageIterator(pageIndex).next();
pdfPage.setMediaBox(pdfPage.getCropBox());
for (ImagePosition imagePosition : pageEntry.getValue()) {
RectCollection rectCollection = new RectCollection();
for (ImagePosition imagePosition : pageIdToImgPos.getValue()) {
Rectangle rectangle = imagePosition.getRectangle();
// Warning coordinate system is different in this call macOs/Linux
@ -135,31 +141,58 @@ public class OCRService {
rectCollection.addRect(rectangle.getTopLeft().getX(), y, rectangle.getTopLeft().getX() + rectangle.getWidth(), y + rectangle.getHeight());
}
PDFDoc ocrDoc = new PDFDoc();
ocrDoc.pagePushBack(pdfPage);
pdfDocMap.put(pageEntry.getKey(), ocrDoc);
// technically a document, but it always contains exactly one page
PDFDoc ocrPage = new PDFDoc();
ocrPage.pagePushBack(pdfPage);
pageIdToOcrPageMap.put(pageIndex, ocrPage);
OCROptions options = new OCROptions();
options.addTextZonesForPage(rectCollection, 1);
options.addLang(ENGLISH);
options.addDPI(settings.getOcrDPI());
OCRModule.processPDF(ocrDoc, options);
OCRModule.processPDF(ocrPage, options);
rectCollection.clear();
} catch (Exception e) {
log.warn("Failed to process PDF page {}", pageEntry.getKey());
log.warn("Failed to process PDF page {}", pageIdToImgPos.getKey());
}
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder()
.fileId(fileId)
.numberOfPagesToOCR(pages.keySet().size())
.numberOfOCRedPages(++numberOfOCRedPages)
.numberOfPagesToOCR(pageIdToImgPosMap.size())
.numberOfOCRedPages(++numberOfRunPages)
.build()));
log.warn("Done page {}", pageEntry);
log.warn("Done page {}", pageIdToImgPos);
}
return pageIdToOcrPageMap;
}
/**
 * Swaps pages of {@code pdfDoc} for their OCR-processed counterparts.
 * <p>
 * For every entry of {@code ocrDocPagesMap} the word count of the OCR page is compared with the word count of the
 * page currently stored in {@code pdfDoc} under the same page index. The page is replaced only when the OCR page
 * contains at least as many words as the current one; otherwise the current page is left untouched.
 *
 * @param pdfDoc         the document whose pages are replaced in place
 * @param ocrDocPagesMap page index mapped to a helper document; only the first page of each helper document is read
 * @throws PDFNetException if one of the PDF operations fails
 */
private void addOCRPagesToDocIfAdditionalWordsFound(PDFDoc pdfDoc, Map<Integer, PDFDoc> ocrDocPagesMap) throws PDFNetException {

    for (Map.Entry<Integer, PDFDoc> entry : ocrDocPagesMap.entrySet()) {
        int targetIndex = entry.getKey();
        PDFDoc singlePageDoc = entry.getValue();

        Page ocrCandidate = singlePageDoc.getPage(1);
        Page currentPage = pdfDoc.getPage(targetIndex);

        int ocrWords = getWordCount(ocrCandidate);
        int currentWords = getWordCount(currentPage);
        if (ocrWords < currentWords) {
            // OCR produced fewer words than the page already has, keep the current page
            continue;
        }

        // Replace the page: put the OCR page at targetIndex first, then drop the page that follows it.
        // NOTE(review): relies on pageInsert placing the new page before the given position, so that the
        // previous page ends up at targetIndex + 1 - confirm against the PDFDoc API documentation.
        pdfDoc.pageInsert(pdfDoc.getPageIterator(targetIndex), ocrCandidate);
        pdfDoc.pageRemove(pdfDoc.getPageIterator(targetIndex + 1));
    }
}
/**
 * Counts the words a {@link TextExtractor} finds on the given page.
 * <p>
 * The extractor is released explicitly once the count has been read. This method runs twice for every OCRed page,
 * so the extractors would otherwise pile up until the garbage collector gets around to finalizing them.
 *
 * @param pdfPage the page to extract text from
 * @return the number of words reported by the extractor for {@code pdfPage}
 */
private static int getWordCount(Page pdfPage) {

    TextExtractor extractor = new TextExtractor();
    try {
        extractor.begin(pdfPage);
        return extractor.getWordCount();
    } finally {
        // free the extractor deterministically instead of waiting for finalization
        extractor.destroy();
    }
}
}

View File

@ -3,10 +3,11 @@ package com.iqser.red.service.ocr.v1.server;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import static org.assertj.core.api.Assertions.assertThat;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
@ -25,18 +26,22 @@ import org.springframework.core.io.ClassPathResource;
import org.springframework.test.context.junit.jupiter.SpringExtension;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService;
import com.iqser.red.service.ocr.v1.server.service.OCRService;
import com.iqser.red.service.ocr.v1.server.service.FileStorageService;
import com.iqser.red.service.ocr.v1.server.service.OCRService;
import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService;
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType;
import com.iqser.red.storage.commons.StorageAutoConfiguration;
import com.iqser.red.storage.commons.service.StorageService;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
import com.pdftron.pdf.TextExtractor;
import lombok.SneakyThrows;
@ExtendWith(SpringExtension.class)
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT //
, properties = {"pdftron.ocrmodule.path=/YourOCRModulePath"})
, properties = {"pdftron.ocrmodule.path=/home/kschuettler/iqser/PDFTron/ocr/Lib/"})
@Import(OcrServiceIntegrationTest.TestConfiguration.class)
public class OcrServiceIntegrationTest {
@ -58,10 +63,23 @@ public class OcrServiceIntegrationTest {
@Test
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
@SneakyThrows
public void testOCR() {
public void testOCRWatermark() {
String fileName = "Watermark";
assertThat(testOCR("Watermark")).contains("syngenta");
}
/**
 * Runs the OCR pipeline on the "InvisibleText" fixture and checks the text extracted from the result:
 * one set of strings must be present, another set must be absent.
 * NOTE(review): presumably the absent strings are the invisible text layer of the fixture that is stripped
 * before OCR - confirm against the fixture file.
 */
@Test
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
public void testOCRInvisibleText() {

    String extractedText = testOCR("InvisibleText");

    assertThat(extractedText) //
            .contains("Michela", "Gregori", "DVM", "PhD", "Pathologist", "SIGNATURE PAGE") //
            .doesNotContain("COMPLETION DATE:", "LABORATORY PROJECT ID:", "AUTHOR(S):", "Substance");
}
@SneakyThrows
private String testOCR(String fileName) {
ClassPathResource imageInfoResource = new ClassPathResource("files/" + fileName + ".IMAGE_INFO.json");
ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf");
@ -72,12 +90,24 @@ public class OcrServiceIntegrationTest {
var imageId = FileStorageService.getStorageId("dossier", "file", FileType.IMAGE_INFO);
storageService.storeObject(imageId, imageInfoResource.getInputStream());
var response = ocrService.ocrDocument("dossier", "file");
var out = FileUtils.openOutputStream(new File(getTemporaryDirectory() + "/" + fileName + ".pdf"));
IOUtils.copy(response, out);
System.out.println("File:" + getTemporaryDirectory() + "/" + fileName + ".pdf");
try (InputStream ocrDocument = ocrService.ocrDocument("dossier", "file")) {
byte[] ocrDocumentBytes = ocrDocument.readAllBytes();
try (var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
out.write(ocrDocumentBytes);
}
PDFDoc ocrDoc = new PDFDoc(ocrDocumentBytes);
TextExtractor extractor = new TextExtractor();
List<String> texts = new ArrayList<>();
PDFDoc pdfDoc = new PDFDoc(ocrDocumentBytes);
PageIterator iterator = pdfDoc.getPageIterator();
while (iterator.hasNext()) {
Page page = iterator.next();
extractor.begin(page);
texts.add(extractor.getAsText());
}
System.out.println("File:" + getTemporaryDirectory() + "/" + fileName + ".pdf");
return String.join("\n", texts);
}
}