Pull request #9: RED-6126: In the OCRService, OCR Text is not applied to Document

Merge in RED/ocr-service from RED-6126 to master

* commit 'caff5580dda644451433921a6a66dafe1cfa5dca':
  RED-6126: In the OCRService, OCR Text is not applied to Document * refactored tests with inheritance * called PDFNet init/terminate in tests * don't call init on startup
  RED-6126: In the OCRService, OCR Text is not applied to Document * called PDFNet.initialize and terminate before and after message receive * updated comments * renamed some variables
This commit is contained in:
Kilian Schuettler 2023-02-22 13:33:49 +01:00 committed by Dominique Eiflaender
commit bc661b7ea4
7 changed files with 153 additions and 171 deletions

View File

@ -1,14 +1,10 @@
package com.iqser.red.service.ocr.v1.server.initializer;
import javax.annotation.PostConstruct;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import com.pdftron.pdf.PDFNet;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
@Component
@RequiredArgsConstructor
@ -22,14 +18,14 @@ public class PDFNetInitializer {
/**
 * Initializes PDFNet with {@code pdftronLicense}, sets its temp path to {@code /tmp/pdftron}
 * and adds {@code ocrModulePath} to the PDFNet resource search paths.
 */
@SneakyThrows
@PostConstruct
// Do not change this back to an ApplicationRunner: with an ApplicationRunner, messages are taken from the queue before PDFNet is initialized, which leads to an UnsatisfiedLinkError.
public void init() {
PDFNet.initialize(pdftronLicense);
PDFNet.setTempPath("/tmp/pdftron");
PDFNet.addResourceSearchPath(ocrModulePath);
// NOTE(review): PDFNet.initialize is called a second time here with the same license.
// This may be the old/new line pair of a diff rather than intended code — confirm which call should remain.
PDFNet.initialize(pdftronLicense);
}
}

View File

@ -1,34 +1,23 @@
package com.iqser.red.service.ocr.v1.server.service;
import static java.lang.String.format;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Map;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.stereotype.Service;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse;
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.OCRModule;
import com.pdftron.pdf.OCROptions;
import com.pdftron.pdf.Optimizer;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.RectCollection;
import com.pdftron.pdf.*;
import com.pdftron.sdf.SDFDoc;
import io.micrometer.core.annotation.Timed;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.stereotype.Service;
import java.io.*;
import java.util.Map;
import static java.lang.String.format;
@Slf4j
@Service
@ -67,17 +56,17 @@ public class OCRService {
try (ByteArrayOutputStream transferOutputStream = new ByteArrayOutputStream()) {
try (InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId)) {
long start = System.currentTimeMillis();
long removalStart = System.currentTimeMillis();
log.debug("Start invisible element removal for file with dossierId {} and fileId {}", dossierId, fileId);
invisibleElementRemovalService.removeInvisibleElements(fileStream, transferOutputStream, false);
long end = System.currentTimeMillis();
log.debug("Invisible element removal successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, format("%.1f", (end - start) / 1000.0));
long removalEnd = System.currentTimeMillis();
log.debug("Invisible element removal successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, format("%.1f", (removalEnd - removalStart) / 1000.0));
}
try (InputStream transferInputStream = new ByteArrayInputStream(transferOutputStream.toByteArray())) {
long start = System.currentTimeMillis();
long ocrStart = System.currentTimeMillis();
runOcr(transferInputStream, out, fileId);
long end = System.currentTimeMillis();
log.info("ocr successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, format("%.1f", (end - start) / 1000.0));
long ocrEnd = System.currentTimeMillis();
log.info("ocr successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, format("%.1f", (ocrEnd - ocrStart) / 1000.0));
}
}
}
@ -94,7 +83,7 @@ public class OCRService {
// When a page does not have a TextZone, PDFTron whites out the page. But, PDFTron scans it anyway, resulting in a longer runtime.
// So, we need to remove pages without images.
// Furthermore, creating a new document is *much* faster than reusing the same document and adding/removing pages one by one.
// Therefore, we create a new Document with a single page for every page that contains text.
// Therefore, we create a new Document with a single page for every page that contains an image.
int numProcessedPages = 0;
for (Integer pageId : pageIdToRectCollection.keySet()) {
try {
@ -139,6 +128,7 @@ public class OCRService {
log.error("Processed File with fileId {} could not be saved", fileId);
throw new RuntimeException(e);
}
pdfDoc.close();
}

View File

@ -15,6 +15,8 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.ocr.v1.api.model.DocumentRequest;
import com.iqser.red.service.ocr.v1.server.client.FileStatusProcessingUpdateClient;
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
import com.iqser.red.service.ocr.v1.server.initializer.PDFNetInitializer;
import com.pdftron.pdf.PDFNet;
import feign.FeignException;
import lombok.RequiredArgsConstructor;
@ -28,6 +30,7 @@ public class OcrMessageReceiver {
private final ObjectMapper objectMapper;
private final FileStorageService fileStorageService;
private final FileStatusProcessingUpdateClient fileStatusProcessingUpdateClient;
private final PDFNetInitializer pdfNetInitializer;
private final OCRService ocrService;
@ -36,6 +39,7 @@ public class OcrMessageReceiver {
@RabbitListener(queues = MessagingConfiguration.OCR_QUEUE, concurrency = "1")
public void receiveOcr(String in) throws JsonProcessingException {
pdfNetInitializer.init();
DocumentRequest ocrRequestMessage = objectMapper.readValue(in, DocumentRequest.class);
log.info("Start ocr for file with dossierId {} and fileId {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
@ -58,6 +62,7 @@ public class OcrMessageReceiver {
}
fileStatusProcessingUpdateClient.ocrSuccessful(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
PDFNet.terminate();
}

View File

@ -0,0 +1,77 @@
package com.iqser.red.service.ocr.v1.server;
import com.iqser.red.service.ocr.v1.server.initializer.PDFNetInitializer;
import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService;
import com.iqser.red.storage.commons.StorageAutoConfiguration;
import com.iqser.red.storage.commons.service.StorageService;
import com.pdftron.pdf.PDFNet;
import lombok.SneakyThrows;
import org.junit.jupiter.api.*;
import org.junit.jupiter.api.extension.ExtendWith;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.Import;
import org.springframework.context.annotation.Primary;
import org.springframework.test.context.junit.jupiter.SpringExtension;
import static org.assertj.core.api.Assertions.assertThat;
/**
 * Common base class for the Spring Boot tests of the OCR service.
 *
 * <p>It boots the {@link Application} context on a random port, replaces the storage
 * auto-configuration with a {@link FileSystemBackedStorageService}, initializes PDFNet
 * before every test and clears the storage after every test.
 */
@ExtendWith(SpringExtension.class)
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
@Import(AbstractTest.TestConfiguration.class)
public class AbstractTest {

    @Autowired
    protected StorageService storageService;

    @Autowired
    private PDFNetInitializer pdfNetInitializer;


    /**
     * Initializes PDFNet before each test.
     *
     * <p>JUnit Jupiter only honors {@code @Disabled} on test classes and test methods, not on
     * lifecycle callbacks, so no {@code @Disabled} is placed here: it would have no effect.
     */
    @BeforeEach
    @SneakyThrows
    public void initPDFNet() {

        pdfNetInitializer.init();
    }


    /**
     * Terminates PDFNet once all tests of the class have run.
     */
    // NOTE(review): init runs before every test, terminate only once per class — confirm PDFNet tolerates the unbalanced calls.
    @AfterAll
    public static void terminatePDFNet() {

        PDFNet.terminate();
    }


    /**
     * Placeholder test: the build needs at least one test that does not fail.
     * It must carry {@code @Test}, otherwise JUnit never executes it.
     */
    @Test
    public void dummyTest() {

        assertThat(1).isEqualTo(1);
    }


    /**
     * Clears the storage after each test when it is the file-system backed test storage.
     */
    @AfterEach
    public void cleanupStorage() {

        if (this.storageService instanceof FileSystemBackedStorageService) {
            ((FileSystemBackedStorageService) this.storageService).clearStorage();
        }
    }


    /**
     * Test configuration that excludes the storage and RabbitMQ auto-configurations and
     * supplies a {@link FileSystemBackedStorageService} as the primary {@link StorageService}.
     */
    @Configuration
    @EnableAutoConfiguration(exclude = {StorageAutoConfiguration.class, RabbitAutoConfiguration.class})
    public static class TestConfiguration {

        @Bean
        @Primary
        public StorageService inMemoryStorage() {

            return new FileSystemBackedStorageService();
        }

    }

}

View File

@ -1,7 +1,22 @@
package com.iqser.red.service.ocr.v1.server;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import static org.assertj.core.api.Assertions.assertThat;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.ocr.v1.server.service.FileStorageService;
import com.iqser.red.service.ocr.v1.server.service.OCRService;
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.*;
import io.micrometer.prometheus.PrometheusMeterRegistry;
import io.micrometer.prometheus.PrometheusTimer;
import lombok.SneakyThrows;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.core.io.ClassPathResource;
import java.io.FileInputStream;
import java.io.FileOutputStream;
@ -11,53 +26,11 @@ import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.Import;
import org.springframework.context.annotation.Primary;
import org.springframework.core.io.ClassPathResource;
import org.springframework.test.context.junit.jupiter.SpringExtension;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import static org.assertj.core.api.Assertions.assertThat;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.ocr.v1.server.service.FileStorageService;
import com.iqser.red.service.ocr.v1.server.service.OCRService;
import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService;
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType;
import com.iqser.red.storage.commons.StorageAutoConfiguration;
import com.iqser.red.storage.commons.service.StorageService;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.OCRModule;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
import com.pdftron.pdf.TextExtractor;
import io.micrometer.prometheus.PrometheusMeterRegistry;
import io.micrometer.prometheus.PrometheusTimer;
import lombok.SneakyThrows;
@ExtendWith(SpringExtension.class)
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT //
, properties = {"pdftron.ocrmodule.path=/YourOCRModulePath"})
@Import(OcrServiceIntegrationTest.TestConfiguration.class)
public class OcrServiceIntegrationTest {
@Autowired
protected StorageService storageService;
@Autowired
protected FileStorageService fileStorageService;
@SpringBootTest(properties = {"pdftron.ocrmodule.path=/YourOCRModulePath"})
public class OcrServiceIntegrationTest extends AbstractTest {
@Autowired
protected ObjectMapper objectMapper;
@ -71,16 +44,13 @@ public class OcrServiceIntegrationTest {
@Autowired
private PrometheusMeterRegistry registry;
// Checks before each test that the PDFTron OCR module is available.
// NOTE(review): @Disabled appears twice below; this looks like the old/new line pair of a diff — only one can remain in compilable code.
// NOTE(review): JUnit Jupiter documents @Disabled for test classes and test methods; it is presumably ignored on a @BeforeEach method — confirm.
@BeforeEach
@SneakyThrows
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
@Disabled
public void assertOCRModuleIsLoaded() {
// Plain Java assert: only evaluated when the JVM runs with assertions enabled (-ea).
assert OCRModule.isModuleAvailable();
}
@Test
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
public void testOCRMetrics() {
@ -163,15 +133,16 @@ public class OcrServiceIntegrationTest {
}
try (var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
ocrService.runOcrOnDocument("dossier", "file", out);
System.out.println("File:" + getTemporaryDirectory() + "/" + fileName + ".pdf");
}
System.out.println("File:" + getTemporaryDirectory() + "/" + fileName + ".pdf");
try (var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
return extractAllTextFromDocument(fileStream);
}
}
}
private static String extractAllTextFromDocument(InputStream fileStream) throws IOException, PDFNetException {
PDFDoc pdfDoc = new PDFDoc(fileStream);
@ -189,34 +160,4 @@ public class OcrServiceIntegrationTest {
}
// Trivial always-passing check.
// NOTE(review): no @Test annotation is visible on this method in this view; without it JUnit does not run it — confirm.
@SneakyThrows
public void dummyTest() {
// Build needs one test to not fail.
assertThat(1).isEqualTo(1);
}
/**
 * Clears the storage after each test, but only when the injected storage is a
 * {@link FileSystemBackedStorageService}; any other storage implementation is left untouched.
 */
@AfterEach
public void cleanupStorage() {
    if (!(this.storageService instanceof FileSystemBackedStorageService)) {
        return;
    }
    FileSystemBackedStorageService fileSystemStorage = (FileSystemBackedStorageService) this.storageService;
    fileSystemStorage.clearStorage();
}
/**
 * Test configuration that excludes the storage and RabbitMQ auto-configurations and
 * registers a {@link FileSystemBackedStorageService} as the primary {@link StorageService} bean.
 */
@Configuration
@EnableAutoConfiguration(exclude = {StorageAutoConfiguration.class, RabbitAutoConfiguration.class})
public static class TestConfiguration {
// Marked @Primary so this bean is preferred wherever a StorageService is injected.
@Bean
@Primary
public StorageService inMemoryStorage() {
return new FileSystemBackedStorageService();
}
}
}

View File

@ -1,7 +1,15 @@
package com.iqser.red.service.ocr.v1.server.service;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import static org.assertj.core.api.Assertions.assertThat;
import com.iqser.red.service.ocr.v1.server.AbstractTest;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.*;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
import org.junit.jupiter.api.Test;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.core.io.ClassPathResource;
import java.io.FileInputStream;
import java.io.FileOutputStream;
@ -12,36 +20,11 @@ import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.context.annotation.Import;
import org.springframework.core.io.ClassPathResource;
import org.springframework.test.context.junit.jupiter.SpringExtension;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import static org.assertj.core.api.Assertions.assertThat;
import com.iqser.red.service.ocr.v1.server.Application;
import com.iqser.red.service.ocr.v1.server.OcrServiceIntegrationTest;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.ColorPt;
import com.pdftron.pdf.ColorSpace;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementBuilder;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.Rect;
import com.pdftron.pdf.RectCollection;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
@ExtendWith(SpringExtension.class)
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
@Import(OcrServiceIntegrationTest.TestConfiguration.class)
class ImagePositionRetrievalServiceTest {
class ImagePositionRetrievalServiceTest extends AbstractTest {
@Autowired
private ImagePositionRetrievalService imagePositionRetrievalService;

View File

@ -1,34 +1,24 @@
package com.iqser.red.service.ocr.v1.server.service;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.context.annotation.Import;
import org.springframework.core.io.ClassPathResource;
import org.springframework.test.context.junit.jupiter.SpringExtension;
import com.iqser.red.service.ocr.v1.server.Application;
import com.iqser.red.service.ocr.v1.server.OcrServiceIntegrationTest;
import com.iqser.red.service.ocr.v1.server.AbstractTest;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
import com.pdftron.pdf.TextExtractor;
import lombok.SneakyThrows;
import org.junit.jupiter.api.Test;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.core.io.ClassPathResource;
@ExtendWith(SpringExtension.class)
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
@Import(OcrServiceIntegrationTest.TestConfiguration.class)
public class InvisibleElementRemovalServiceTest {
import java.io.FileInputStream;
import java.io.FileOutputStream;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
public class InvisibleElementRemovalServiceTest extends AbstractTest {
@Autowired
private InvisibleElementRemovalService invisibleElementRemovalService;