RED-6126: In the OCRService, OCR Text is not applied to Document
* Reverted the application of OCR text to the document to its old state
* Refactored the OCR service slightly
* Added meaningful test cases
This commit is contained in:
parent
b37ec5afc9
commit
edd044395e
@ -20,12 +20,14 @@ import com.iqser.red.service.ocr.v1.server.model.image.ImageServiceResponse;
|
||||
import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings;
|
||||
import com.iqser.red.service.redaction.v1.model.Point;
|
||||
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.OCRModule;
|
||||
import com.pdftron.pdf.OCROptions;
|
||||
import com.pdftron.pdf.Optimizer;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.pdf.RectCollection;
|
||||
import com.pdftron.pdf.TextExtractor;
|
||||
import com.pdftron.sdf.SDFDoc;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
@ -55,35 +57,37 @@ public class OCRService {
|
||||
InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId);
|
||||
ImageServiceResponse imageServiceResponse = fileStorageService.getImageServiceResponse(dossierId, fileId);
|
||||
|
||||
byte[] fileWithoutInvisibleTextStream = invisibleElementRemovalService.removeInvisibleElements(fileStream, false);
|
||||
byte[] fileWithoutInvisibleTextBytes = invisibleElementRemovalService.removeInvisibleElements(fileStream, false);
|
||||
|
||||
byte[] ocrBytes = ocr(fileWithoutInvisibleTextStream, fileId, imageServiceResponse);
|
||||
byte[] ocrBytes = runOcrOnImages(fileWithoutInvisibleTextBytes, fileId, imageServiceResponse);
|
||||
|
||||
return new ByteArrayInputStream(ocrBytes);
|
||||
|
||||
}
|
||||
|
||||
|
||||
private byte[] ocr(byte[] file, String fileId, ImageServiceResponse imageServiceResponse) {
|
||||
private byte[] runOcrOnImages(byte[] file, String fileId, ImageServiceResponse imageServiceResponse) {
|
||||
|
||||
PDFDoc pdfDoc = null;
|
||||
try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
|
||||
pdfDoc = new PDFDoc(file);
|
||||
|
||||
Map<Integer, List<ImagePosition>> pages = new HashMap<>();
|
||||
Map<Integer, List<ImagePosition>> pageIdToImgPos = new HashMap<>();
|
||||
|
||||
imageServiceResponse.getData()
|
||||
.forEach(imageMetadata -> pages.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
|
||||
.forEach(imageMetadata -> pageIdToImgPos.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
|
||||
.add(new ImagePosition(new Rectangle(new Point(imageMetadata.getPosition().getX1(), imageMetadata.getPosition().getY1()),
|
||||
imageMetadata.getGeometry().getWidth(),
|
||||
imageMetadata.getGeometry().getHeight(),
|
||||
imageMetadata.getPosition().getPageNumber()), imageMetadata.isAlpha())));
|
||||
|
||||
Map<Integer, PDFDoc> pdfDocMap = Collections.synchronizedMap(new HashMap<>());
|
||||
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
|
||||
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder().fileId(fileId).numberOfPagesToOCR(pages.keySet().size()).build()));
|
||||
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder().fileId(fileId).numberOfPagesToOCR(pageIdToImgPos.size()).build()));
|
||||
|
||||
ocrPages(pdfDoc, fileId, pages, pdfDocMap);
|
||||
// the PDFDoc is a helper document, which contains exactly one page
|
||||
Map<Integer, PDFDoc> pageIdToOcrPageMap = runOcrPerPage(pdfDoc, fileId, pageIdToImgPos);
|
||||
|
||||
addOCRPagesToDocIfAdditionalWordsFound(pdfDoc, pageIdToOcrPageMap);
|
||||
|
||||
Optimizer.optimize(pdfDoc);
|
||||
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
|
||||
@ -92,8 +96,8 @@ public class OCRService {
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
|
||||
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder()
|
||||
.fileId(fileId)
|
||||
.numberOfPagesToOCR(pages.keySet().size())
|
||||
.numberOfOCRedPages(pages.keySet().size())
|
||||
.numberOfPagesToOCR(pageIdToImgPos.size())
|
||||
.numberOfOCRedPages(pageIdToOcrPageMap.size())
|
||||
.ocrFinished(true)
|
||||
.build()));
|
||||
|
||||
@ -113,21 +117,23 @@ public class OCRService {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void ocrPages(PDFDoc pdfDoc, String fileId, Map<Integer, List<ImagePosition>> pages, Map<Integer, PDFDoc> pdfDocMap) {
|
||||
private Map<Integer, PDFDoc> runOcrPerPage(PDFDoc pdfDoc, String fileId, Map<Integer, List<ImagePosition>> pageIdToImgPosMap) {
|
||||
|
||||
int numberOfOCRedPages = 0;
|
||||
for (var pageEntry : pages.entrySet()) {
|
||||
Map<Integer, PDFDoc> pageIdToOcrPageMap = Collections.synchronizedMap(new HashMap<>());
|
||||
|
||||
int numberOfRunPages = 0;
|
||||
for (var pageIdToImgPos : pageIdToImgPosMap.entrySet()) {
|
||||
|
||||
try {
|
||||
RectCollection rectCollection = new RectCollection();
|
||||
|
||||
var page = pageEntry.getKey();
|
||||
Integer pageIndex = pageIdToImgPos.getKey();
|
||||
|
||||
Page pdfPage = pdfDoc.getPageIterator(page).next();
|
||||
Page pdfPage = pdfDoc.getPageIterator(pageIndex).next();
|
||||
|
||||
pdfPage.setMediaBox(pdfPage.getCropBox());
|
||||
|
||||
for (ImagePosition imagePosition : pageEntry.getValue()) {
|
||||
RectCollection rectCollection = new RectCollection();
|
||||
for (ImagePosition imagePosition : pageIdToImgPos.getValue()) {
|
||||
Rectangle rectangle = imagePosition.getRectangle();
|
||||
|
||||
// Warning coordinate system is different in this call macOs/Linux
|
||||
@ -135,31 +141,58 @@ public class OCRService {
|
||||
rectCollection.addRect(rectangle.getTopLeft().getX(), y, rectangle.getTopLeft().getX() + rectangle.getWidth(), y + rectangle.getHeight());
|
||||
}
|
||||
|
||||
PDFDoc ocrDoc = new PDFDoc();
|
||||
ocrDoc.pagePushBack(pdfPage);
|
||||
pdfDocMap.put(pageEntry.getKey(), ocrDoc);
|
||||
// technically a document, but it always contains exactly one page
|
||||
PDFDoc ocrPage = new PDFDoc();
|
||||
ocrPage.pagePushBack(pdfPage);
|
||||
pageIdToOcrPageMap.put(pageIndex, ocrPage);
|
||||
|
||||
OCROptions options = new OCROptions();
|
||||
options.addTextZonesForPage(rectCollection, 1);
|
||||
options.addLang(ENGLISH);
|
||||
options.addDPI(settings.getOcrDPI());
|
||||
OCRModule.processPDF(ocrDoc, options);
|
||||
OCRModule.processPDF(ocrPage, options);
|
||||
|
||||
rectCollection.clear();
|
||||
|
||||
} catch (Exception e) {
|
||||
log.warn("Failed to process PDF page {}", pageEntry.getKey());
|
||||
log.warn("Failed to process PDF page {}", pageIdToImgPos.getKey());
|
||||
}
|
||||
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
|
||||
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder()
|
||||
.fileId(fileId)
|
||||
.numberOfPagesToOCR(pages.keySet().size())
|
||||
.numberOfOCRedPages(++numberOfOCRedPages)
|
||||
.numberOfPagesToOCR(pageIdToImgPosMap.size())
|
||||
.numberOfOCRedPages(++numberOfRunPages)
|
||||
.build()));
|
||||
|
||||
log.warn("Done page {}", pageEntry);
|
||||
log.warn("Done page {}", pageIdToImgPos);
|
||||
|
||||
}
|
||||
return pageIdToOcrPageMap;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Swaps pages of {@code pdfDoc} for their OCR-processed counterparts, based on a word-count comparison.
 * <p>
 * For every entry of {@code ocrDocPagesMap}, the key is used as the page number in {@code pdfDoc} and
 * the first page of the mapped document is taken as the OCR result for that page. The OCR page is put
 * in place of the existing page when its word count is greater than or equal to the existing page's.
 *
 * @param pdfDoc         the document whose pages may be replaced; modified in place
 * @param ocrDocPagesMap page number mapped to a document whose page 1 is the OCR result for that page
 * @throws PDFNetException declared for the PDFDoc page operations used here
 */
private void addOCRPagesToDocIfAdditionalWordsFound(PDFDoc pdfDoc, Map<Integer, PDFDoc> ocrDocPagesMap) throws PDFNetException {
|
||||
|
||||
for (var ocrDocPagesEntry : ocrDocPagesMap.entrySet()) {
|
||||
int pageIndex = ocrDocPagesEntry.getKey();
|
||||
|
||||
// Only page 1 of the mapped document is read.
Page ocrPage = ocrDocPagesEntry.getValue().getPage(1);
|
||||
Page page = pdfDoc.getPage(pageIndex);
|
||||
|
||||
// NOTE(review): ">=" also replaces the page when both word counts are equal, although the method
// name suggests replacement only when additional words were found - confirm this is intended.
if (getWordCount(ocrPage) >= getWordCount(page)) {
|
||||
// Insert the OCR page at pageIndex, then remove the page at pageIndex + 1.
// Presumably pageInsert places the new page before the given position, so the original page
// ends up at pageIndex + 1 - TODO confirm against the PDFDoc API documentation.
pdfDoc.pageInsert(pdfDoc.getPageIterator(pageIndex), ocrPage);
|
||||
pdfDoc.pageRemove(pdfDoc.getPageIterator(pageIndex + 1));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
 * Returns the word count that a {@link TextExtractor} reports for the given page.
 *
 * @param pdfPage the page to run text extraction on
 * @return the value of {@code TextExtractor.getWordCount()} after extraction has begun on {@code pdfPage}
 */
private static int getWordCount(Page pdfPage) {
|
||||
|
||||
// NOTE(review): a new extractor is created per call and never explicitly released here;
// verify whether TextExtractor holds native resources that need destroy()/close().
TextExtractor txt = new TextExtractor();
|
||||
txt.begin(pdfPage);
|
||||
return txt.getWordCount();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -3,10 +3,11 @@ package com.iqser.red.service.ocr.v1.server;
|
||||
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
@ -25,18 +26,22 @@ import org.springframework.core.io.ClassPathResource;
|
||||
import org.springframework.test.context.junit.jupiter.SpringExtension;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService;
|
||||
import com.iqser.red.service.ocr.v1.server.service.OCRService;
|
||||
import com.iqser.red.service.ocr.v1.server.service.FileStorageService;
|
||||
import com.iqser.red.service.ocr.v1.server.service.OCRService;
|
||||
import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService;
|
||||
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType;
|
||||
import com.iqser.red.storage.commons.StorageAutoConfiguration;
|
||||
import com.iqser.red.storage.commons.service.StorageService;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.pdf.PageIterator;
|
||||
import com.pdftron.pdf.TextExtractor;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@ExtendWith(SpringExtension.class)
|
||||
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT //
|
||||
, properties = {"pdftron.ocrmodule.path=/YourOCRModulePath"})
|
||||
, properties = {"pdftron.ocrmodule.path=/home/kschuettler/iqser/PDFTron/ocr/Lib/"})
|
||||
@Import(OcrServiceIntegrationTest.TestConfiguration.class)
|
||||
public class OcrServiceIntegrationTest {
|
||||
|
||||
@ -58,10 +63,23 @@ public class OcrServiceIntegrationTest {
|
||||
|
||||
@Test
|
||||
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
|
||||
@SneakyThrows
|
||||
public void testOCR() {
|
||||
public void testOCRWatermark() {
|
||||
|
||||
String fileName = "Watermark";
|
||||
assertThat(testOCR("Watermark")).contains("syngenta");
|
||||
}
|
||||
|
||||
|
||||
/**
 * Runs the {@code testOCR} helper on the "InvisibleText" fixture and checks the returned text:
 * a set of expected strings must be present and another set must be absent.
 * Presumably the absent strings are the invisible text of the fixture that OCR should not
 * reintroduce - TODO confirm against the test file.
 */
@Test
|
||||
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
|
||||
public void testOCRInvisibleText() {
|
||||
String text = testOCR("InvisibleText");
|
||||
// Strings that must appear in the text extracted from the OCR result.
assertThat(text).contains("Michela", "Gregori", "DVM", "PhD", "Pathologist", "SIGNATURE PAGE");
|
||||
// Strings that must not appear in the text extracted from the OCR result.
assertThat(text).doesNotContain("COMPLETION DATE:", "LABORATORY PROJECT ID:", "AUTHOR(S):", "Substance");
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private String testOCR(String fileName) {
|
||||
|
||||
ClassPathResource imageInfoResource = new ClassPathResource("files/" + fileName + ".IMAGE_INFO.json");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf");
|
||||
@ -72,12 +90,24 @@ public class OcrServiceIntegrationTest {
|
||||
var imageId = FileStorageService.getStorageId("dossier", "file", FileType.IMAGE_INFO);
|
||||
storageService.storeObject(imageId, imageInfoResource.getInputStream());
|
||||
|
||||
var response = ocrService.ocrDocument("dossier", "file");
|
||||
|
||||
var out = FileUtils.openOutputStream(new File(getTemporaryDirectory() + "/" + fileName + ".pdf"));
|
||||
IOUtils.copy(response, out);
|
||||
|
||||
System.out.println("File:" + getTemporaryDirectory() + "/" + fileName + ".pdf");
|
||||
try (InputStream ocrDocument = ocrService.ocrDocument("dossier", "file")) {
|
||||
byte[] ocrDocumentBytes = ocrDocument.readAllBytes();
|
||||
try (var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
|
||||
out.write(ocrDocumentBytes);
|
||||
}
|
||||
PDFDoc ocrDoc = new PDFDoc(ocrDocumentBytes);
|
||||
TextExtractor extractor = new TextExtractor();
|
||||
List<String> texts = new ArrayList<>();
|
||||
PDFDoc pdfDoc = new PDFDoc(ocrDocumentBytes);
|
||||
PageIterator iterator = pdfDoc.getPageIterator();
|
||||
while (iterator.hasNext()) {
|
||||
Page page = iterator.next();
|
||||
extractor.begin(page);
|
||||
texts.add(extractor.getAsText());
|
||||
}
|
||||
System.out.println("File:" + getTemporaryDirectory() + "/" + fileName + ".pdf");
|
||||
return String.join("\n", texts);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user