RED-6126: performance-test

* refactor to improve code cleanliness
* close the inputStream after storing the OCR result
This commit is contained in:
Kilian Schuettler 2023-02-10 14:49:10 +01:00
parent b3fa14b342
commit 37f1e03ebc
4 changed files with 99 additions and 51 deletions

View File

@ -101,18 +101,13 @@ public class ImagePositionRetrievalService {
// Sometimes images are split up into stripes; here we merge the positions of aligned and intersecting rectangles into one larger rectangle.
private void mergeRectangleList(List<Rectangle2D> rectangleList) {
int idx = 0;
while (rectangleList.size() >= idx + 2) {
for (int idx = 0; rectangleList.size() >= idx + 2; ) {
var rect1 = rectangleList.get(idx);
var rect2 = rectangleList.get(idx + 1);
boolean isAlignedX = Math.abs(rect1.getMinX() - rect2.getMinX()) < TOLERANCE && Math.abs(rect1.getMaxX() - rect2.getMaxX()) < TOLERANCE;
boolean isAlignedY = Math.abs(rect1.getMinY() - rect2.getMinY()) < TOLERANCE && Math.abs(rect1.getMaxY() - rect2.getMaxY()) < TOLERANCE;
boolean intersects = rect1.intersects(rect2.getMinX() - TOLERANCE, rect2.getMinY() - TOLERANCE, rect2.getWidth() + (2 * TOLERANCE), rect2.getHeight() + (2 * TOLERANCE));
if (intersects && (isAlignedX || isAlignedY)) {
if (intersects(rect1, rect2) && isAlignedXOrY(rect1, rect2)) {
rectangleList.remove(idx + 1);
rectangleList.remove(idx);
rectangleList.add(idx, rect1.createUnion(rect2));
@ -123,6 +118,21 @@ public class ImagePositionRetrievalService {
}
/**
 * Tests whether {@code rect1} overlaps {@code rect2} once {@code rect2} has been enlarged by
 * {@code TOLERANCE} on each of its four sides.
 *
 * @param rect1 the rectangle whose interior is tested
 * @param rect2 the rectangle that is padded by {@code TOLERANCE} before the test
 * @return {@code true} if {@code rect1} intersects the padded area of {@code rect2}
 */
private boolean intersects(Rectangle2D rect1, Rectangle2D rect2) {
    // Moving the origin by one TOLERANCE and growing each dimension by two pads every side equally.
    double paddedMinX = rect2.getMinX() - TOLERANCE;
    double paddedMinY = rect2.getMinY() - TOLERANCE;
    double paddedWidth = rect2.getWidth() + (2 * TOLERANCE);
    double paddedHeight = rect2.getHeight() + (2 * TOLERANCE);
    return rect1.intersects(paddedMinX, paddedMinY, paddedWidth, paddedHeight);
}
/**
 * Tests whether two rectangles share (within {@code TOLERANCE}) either the same horizontal
 * extent or the same vertical extent.
 *
 * @param rect1 first rectangle
 * @param rect2 second rectangle
 * @return {@code true} if both the min and max X coordinates differ by less than
 *         {@code TOLERANCE}, or both the min and max Y coordinates do
 */
private boolean isAlignedXOrY(Rectangle2D rect1, Rectangle2D rect2) {
    boolean sameHorizontalExtent = Math.abs(rect1.getMinX() - rect2.getMinX()) < TOLERANCE
            && Math.abs(rect1.getMaxX() - rect2.getMaxX()) < TOLERANCE;
    if (sameHorizontalExtent) {
        return true;
    }
    // Not aligned along X, so the result depends solely on the vertical extent.
    return Math.abs(rect1.getMinY() - rect2.getMinY()) < TOLERANCE
            && Math.abs(rect1.getMaxY() - rect2.getMaxY()) < TOLERANCE;
}
private Rect toRotationAdjustedRect(Rect bbox, Page page, boolean mirrorY) throws PDFNetException {
int rotation = page.getRotation();

View File

@ -90,36 +90,26 @@ public class OCRService {
Map<Integer, RectCollection> pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, true);
// Optimization:
// When a page does not have a TextZone, PDFTron whites out the page. But, PDFTron scans it anyway, resulting in a longer runtime.
// So, we need to remove pages without images.
// Furthermore, creating a new document is *much* faster than reusing the same document and adding/removing pages one by one.
// Therefore, we create a new Document with a single page for every page that contains text.
int numProcessedPages = 0;
// optimization: only scanning pages that contain images
for (Integer pageId : pageIdToRectCollection.keySet()) {
try {
// optimization: creating a new document is faster than reusing the same and adding/removing pages one by one
OCROptions options = new OCROptions();
PDFDoc ocrPageDoc = new PDFDoc();
Page pdfPage = pdfDoc.getPage(pageId);
// optimization: this line ensures the ocr text is placed correctly by PDFTron
pdfPage.setMediaBox(pdfPage.getCropBox());
ocrPageDoc.pagePushBack(pdfPage);
options.addTextZonesForPage(pageIdToRectCollection.get(pageId), 1);
options.addLang(ENGLISH);
options.addDPI(settings.getOcrDPI());
OCRModule.processPDF(ocrPageDoc, options);
PDFDoc singlePagePdfDoc = extractSinglePagePdfDoc(pdfDoc, pageId);
processOcr(pageIdToRectCollection, pageId, singlePagePdfDoc);
++numProcessedPages;
StringBuilder zonesString = new StringBuilder();
for (int j = 0; j < pageIdToRectCollection.get(pageId).getNumRects(); ++j) {
var r = pageIdToRectCollection.get(pageId).getRectAt(j);
zonesString.append(format("[lower left (%.1f|%.1f) upper right (%.1f|%.1f)]", r.getX1(), r.getY1(), r.getX2(), r.getY2()));
}
log.info("{}/{} Page {} done, OCR regions {}", numProcessedPages, pageIdToRectCollection.size(), pageId, zonesString);
log.info("{}/{} Page {} done, OCR regions {}",
numProcessedPages,
pageIdToRectCollection.size(),
pageId,
getAllOcrTextZonesAsString(pageIdToRectCollection, pageId));
// re-adding OCR pages
Page ocrPage = ocrPageDoc.getPage(1);
pdfDoc.pageInsert(pdfDoc.getPageIterator(pageId), ocrPage);
pdfDoc.pageRemove(pdfDoc.getPageIterator(pageId + 1));
ocrPageDoc.close();
replaceOriginalPageWithOcrPage(pdfDoc, pageId, singlePagePdfDoc);
singlePagePdfDoc.close();
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder()
@ -151,4 +141,44 @@ public class OCRService {
}
}
/**
 * Runs the OCR module on the given document, restricted to the text zones registered for
 * {@code pageId}.
 *
 * @param pageIdToRectCollection text zones keyed by page id
 * @param pageId                 key used to look up the text zones to scan
 * @param singlePagePdfDoc       document that is processed by the OCR module
 * @throws PDFNetException if configuring the options or running the OCR fails
 */
private void processOcr(Map<Integer, RectCollection> pageIdToRectCollection, Integer pageId, PDFDoc singlePagePdfDoc) throws PDFNetException {
    var ocrOptions = new OCROptions();
    var textZones = pageIdToRectCollection.get(pageId);
    // The zones are registered for page 1 - presumably because the document passed in holds only the extracted page.
    ocrOptions.addTextZonesForPage(textZones, 1);
    ocrOptions.addLang(ENGLISH);
    ocrOptions.addDPI(settings.getOcrDPI());

    OCRModule.processPDF(singlePagePdfDoc, ocrOptions);
}
/**
 * Creates a new document that contains only the page {@code pageId} of {@code pdfDoc}.
 * <p>
 * Before copying, the page's media box is set to its crop box; note that this modifies the page
 * inside {@code pdfDoc} as well. The caller owns the returned document and must close it.
 *
 * @param pdfDoc source document
 * @param pageId number of the page to extract
 * @return a new document holding the extracted page
 * @throws PDFNetException if the page cannot be read or appended; the partially built document
 *                         is closed before the exception propagates
 */
private static PDFDoc extractSinglePagePdfDoc(PDFDoc pdfDoc, Integer pageId) throws PDFNetException {
    PDFDoc singlePagePdfDoc = new PDFDoc();
    try {
        Page page = pdfDoc.getPage(pageId);
        page.setMediaBox(page.getCropBox()); // this line ensures the ocr text is placed correctly by PDFTron, see TestFile MediaBoxBiggerThanCropBox.pdf
        singlePagePdfDoc.pagePushBack(page);
        return singlePagePdfDoc;
    } catch (PDFNetException | RuntimeException e) {
        // The caller never receives the document on failure, so release it here to avoid leaking it.
        try {
            singlePagePdfDoc.close();
        } catch (Exception closeFailure) {
            e.addSuppressed(closeFailure);
        }
        throw e;
    }
}
/**
 * Swaps the page {@code pageId} of {@code pdfDoc} for the first page of {@code ocrPageDoc}.
 *
 * @param pdfDoc     document whose page is replaced
 * @param pageId     number of the page to replace
 * @param ocrPageDoc document whose first page is inserted
 * @throws PDFNetException if a page cannot be read, inserted or removed
 */
private static void replaceOriginalPageWithOcrPage(PDFDoc pdfDoc, Integer pageId, PDFDoc ocrPageDoc) throws PDFNetException {
    Page processedPage = ocrPageDoc.getPage(1);

    // Insert first: the processed page takes position pageId ...
    var insertPosition = pdfDoc.getPageIterator(pageId);
    pdfDoc.pageInsert(insertPosition, processedPage);

    // ... and the page at pageId + 1 is removed afterwards. The iterator is fetched only after the insert.
    var removePosition = pdfDoc.getPageIterator(pageId + 1);
    pdfDoc.pageRemove(removePosition);
}
/**
 * Renders every text zone registered for {@code pageId} as
 * {@code [lower left (x1|y1) upper right (x2|y2)]} and concatenates the results without a
 * separator.
 *
 * @param pageIdToRectCollection text zones keyed by page id
 * @param pageId                 key of the zones to render
 * @return the concatenated zone descriptions; empty if the collection holds no rectangles
 * @throws PDFNetException if a rectangle or its coordinates cannot be read
 */
private static StringBuilder getAllOcrTextZonesAsString(Map<Integer, RectCollection> pageIdToRectCollection, Integer pageId) throws PDFNetException {
    var textZones = pageIdToRectCollection.get(pageId);
    StringBuilder description = new StringBuilder();
    int index = 0;
    while (index < textZones.getNumRects()) {
        var zone = textZones.getRectAt(index);
        description.append(format("[lower left (%.1f|%.1f) upper right (%.1f|%.1f)]", zone.getX1(), zone.getY1(), zone.getX2(), zone.getY2()));
        ++index;
    }
    return description;
}
}

View File

@ -31,13 +31,13 @@ public class OcrMessageReceiver {
private final OCRService ocrService;
@RabbitHandler
@RabbitListener(queues = MessagingConfiguration.OCR_QUEUE, concurrency = "1")
public void receiveOcr(String in) throws JsonProcessingException {
DocumentRequest ocrRequestMessage = objectMapper.readValue(in, DocumentRequest.class);
log.info("Start ocr for file with dossierId {} and fileId {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
setStatusOcrProcessing(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
@ -47,16 +47,16 @@ public class OcrMessageReceiver {
fileStorageService.storeUntouchedFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), originalFile);
}
try (var out = new ByteArrayOutputStream()) {
ocrService.runOcrOnDocument(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), out);
fileStorageService.storeOriginalFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), new ByteArrayInputStream(out.toByteArray()));
try (var transferStream = new ByteArrayOutputStream()) {
ocrService.runOcrOnDocument(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), transferStream);
try (var inputStream = new ByteArrayInputStream(transferStream.toByteArray())) {
fileStorageService.storeOriginalFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), inputStream);
}
} catch (IOException e) {
log.error("Failed to store file with dossierId {} and fileId {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
throw new RuntimeException(e);
}
fileStatusProcessingUpdateClient.ocrSuccessful(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
}

View File

@ -5,12 +5,12 @@ import static org.assertj.core.api.Assertions.assertThat;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import io.micrometer.prometheus.PrometheusMeterRegistry;
import io.micrometer.prometheus.PrometheusTimer;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
@ -36,12 +36,15 @@ import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService;
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType;
import com.iqser.red.storage.commons.StorageAutoConfiguration;
import com.iqser.red.storage.commons.service.StorageService;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.OCRModule;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
import com.pdftron.pdf.TextExtractor;
import io.micrometer.prometheus.PrometheusMeterRegistry;
import io.micrometer.prometheus.PrometheusTimer;
import lombok.SneakyThrows;
@ExtendWith(SpringExtension.class)
@ -80,19 +83,20 @@ public class OcrServiceIntegrationTest {
@Test
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
public void testOCRMetrics(){
public void testOCRMetrics() {
testOCR("Watermark");
testOCR("Watermark");
testOCR("Watermark");
var ocrOnDocumentMeter = registry.getMeters().stream()
.filter(m -> m.getId().getName().equalsIgnoreCase("redactmanager_runOcrOnDocument")).findAny();
var ocrOnDocumentMeter = registry.getMeters().stream().filter(m -> m.getId().getName().equalsIgnoreCase("redactmanager_runOcrOnDocument")).findAny();
assertThat(ocrOnDocumentMeter.isPresent()).isTrue();
PrometheusTimer timer = (PrometheusTimer) ocrOnDocumentMeter.get();
assertThat(timer.count()).isEqualTo(3);
assertThat(timer.mean(TimeUnit.SECONDS)).isGreaterThan(0.1);
}
@Test
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
public void testOcr() {
@ -153,30 +157,34 @@ public class OcrServiceIntegrationTest {
/**
 * Test helper: stores the classpath resource {@code files/<fileName>.pdf} under the ORIGIN
 * storage id for ("dossier", "file"), runs the OCR service on it, writes the result to
 * {@code <temporary directory>/<fileName>.pdf} and returns the text extracted from that file.
 *
 * NOTE(review): the try-with-resources blocks can throw checked exceptions although the method
 * declares none - presumably an annotation such as @SneakyThrows sits above this view; confirm.
 *
 * @param fileName name of the test PDF without the ".pdf" extension
 * @return the text returned by extractAllTextFromDocument for the OCR output file
 */
private String testOCR(String fileName) {
ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf");
var originId = FileStorageService.getStorageId("dossier", "file", FileType.ORIGIN);
// Upload the test file; the resource stream is closed as soon as it has been stored.
try (var fileStream = pdfFileResource.getInputStream()) {
storageService.storeObject(originId, fileStream);
}
// Run the OCR and write its output into the temporary directory.
try (var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
ocrService.runOcrOnDocument("dossier", "file", out);
}
System.out.println("File:" + getTemporaryDirectory() + "/" + fileName + ".pdf");
// Re-open the written file and extract its text for the assertions of the calling test.
try (var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
return extractAllTextFromDocument(fileStream);
}
}
private static String extractAllTextFromDocument(InputStream fileStream) throws IOException, PDFNetException {
PDFDoc pdfDoc = new PDFDoc(fileStream);
TextExtractor extractor = new TextExtractor();
List<String> texts = new ArrayList<>();
PDFDoc pdfDoc;
try (var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
pdfDoc = new PDFDoc(fileStream);
}
PageIterator iterator = pdfDoc.getPageIterator();
while (iterator.hasNext()) {
Page page = iterator.next();
extractor.begin(page);
texts.add(extractor.getAsText());
}
System.out.println("File:" + getTemporaryDirectory() + "/" + fileName + ".pdf");
return String.join("\n", texts);
}
@ -184,7 +192,7 @@ public class OcrServiceIntegrationTest {
@SneakyThrows
public void dummyTest() {
// Build needs one text to not fail.
// Build needs one test to not fail.
assertThat(1).isEqualTo(1);
}
@ -204,7 +212,7 @@ public class OcrServiceIntegrationTest {
@Bean
@Primary
public StorageService inmemoryStorage() {
public StorageService inMemoryStorage() {
return new FileSystemBackedStorageService();
}