RED-6126: performance-test
* refactor to improve code cleanliness
* close inputStream
This commit is contained in:
parent
b3fa14b342
commit
37f1e03ebc
@ -101,18 +101,13 @@ public class ImagePositionRetrievalService {
|
||||
|
||||
// Sometimes images are split up into stripes, here we merge the positions of aligned and intersecting rectangles into one larger rectangle
|
||||
private void mergeRectangleList(List<Rectangle2D> rectangleList) {
|
||||
int idx = 0;
|
||||
|
||||
while (rectangleList.size() >= idx + 2) {
|
||||
for (int idx = 0; rectangleList.size() >= idx + 2; ) {
|
||||
|
||||
var rect1 = rectangleList.get(idx);
|
||||
var rect2 = rectangleList.get(idx + 1);
|
||||
|
||||
boolean isAlignedX = Math.abs(rect1.getMinX() - rect2.getMinX()) < TOLERANCE && Math.abs(rect1.getMaxX() - rect2.getMaxX()) < TOLERANCE;
|
||||
boolean isAlignedY = Math.abs(rect1.getMinY() - rect2.getMinY()) < TOLERANCE && Math.abs(rect1.getMaxY() - rect2.getMaxY()) < TOLERANCE;
|
||||
boolean intersects = rect1.intersects(rect2.getMinX() - TOLERANCE, rect2.getMinY() - TOLERANCE, rect2.getWidth() + (2 * TOLERANCE), rect2.getHeight() + (2 * TOLERANCE));
|
||||
|
||||
if (intersects && (isAlignedX || isAlignedY)) {
|
||||
if (intersects(rect1, rect2) && isAlignedXOrY(rect1, rect2)) {
|
||||
rectangleList.remove(idx + 1);
|
||||
rectangleList.remove(idx);
|
||||
rectangleList.add(idx, rect1.createUnion(rect2));
|
||||
@ -123,6 +118,21 @@ public class ImagePositionRetrievalService {
|
||||
}
|
||||
|
||||
|
||||
private boolean intersects(Rectangle2D rect1, Rectangle2D rect2) {
|
||||
|
||||
return rect1.intersects(rect2.getMinX() - TOLERANCE, rect2.getMinY() - TOLERANCE, rect2.getWidth() + (2 * TOLERANCE), rect2.getHeight() + (2 * TOLERANCE));
|
||||
}
|
||||
|
||||
|
||||
private boolean isAlignedXOrY(Rectangle2D rect1, Rectangle2D rect2) {
|
||||
|
||||
boolean isAlignedX = Math.abs(rect1.getMinX() - rect2.getMinX()) < TOLERANCE && Math.abs(rect1.getMaxX() - rect2.getMaxX()) < TOLERANCE;
|
||||
boolean isAlignedY = Math.abs(rect1.getMinY() - rect2.getMinY()) < TOLERANCE && Math.abs(rect1.getMaxY() - rect2.getMaxY()) < TOLERANCE;
|
||||
|
||||
return isAlignedX || isAlignedY;
|
||||
}
|
||||
|
||||
|
||||
private Rect toRotationAdjustedRect(Rect bbox, Page page, boolean mirrorY) throws PDFNetException {
|
||||
|
||||
int rotation = page.getRotation();
|
||||
|
||||
@ -90,36 +90,26 @@ public class OCRService {
|
||||
|
||||
Map<Integer, RectCollection> pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, true);
|
||||
|
||||
// Optimization:
|
||||
// When a page does not have a TextZone, PDFTron whites out the page. But, PDFTron scans it anyway, resulting in a longer runtime.
|
||||
// So, we need to remove pages without images.
|
||||
// Furthermore, creating a new document is *much* faster than reusing the same document and adding/removing pages one by one.
|
||||
// Therefore, we create a new Document with a single page for every page that contains text.
|
||||
int numProcessedPages = 0;
|
||||
// optimization: only scanning pages that contain images
|
||||
for (Integer pageId : pageIdToRectCollection.keySet()) {
|
||||
try {
|
||||
// optimization: creating a new document is faster than reusing the same and adding/removing pages one by one
|
||||
OCROptions options = new OCROptions();
|
||||
PDFDoc ocrPageDoc = new PDFDoc();
|
||||
Page pdfPage = pdfDoc.getPage(pageId);
|
||||
// optimization: this line ensures the ocr text is placed correctly by PDFTron
|
||||
pdfPage.setMediaBox(pdfPage.getCropBox());
|
||||
ocrPageDoc.pagePushBack(pdfPage);
|
||||
options.addTextZonesForPage(pageIdToRectCollection.get(pageId), 1);
|
||||
options.addLang(ENGLISH);
|
||||
options.addDPI(settings.getOcrDPI());
|
||||
|
||||
OCRModule.processPDF(ocrPageDoc, options);
|
||||
PDFDoc singlePagePdfDoc = extractSinglePagePdfDoc(pdfDoc, pageId);
|
||||
processOcr(pageIdToRectCollection, pageId, singlePagePdfDoc);
|
||||
++numProcessedPages;
|
||||
|
||||
StringBuilder zonesString = new StringBuilder();
|
||||
for (int j = 0; j < pageIdToRectCollection.get(pageId).getNumRects(); ++j) {
|
||||
var r = pageIdToRectCollection.get(pageId).getRectAt(j);
|
||||
zonesString.append(format("[lower left (%.1f|%.1f) upper right (%.1f|%.1f)]", r.getX1(), r.getY1(), r.getX2(), r.getY2()));
|
||||
}
|
||||
log.info("{}/{} Page {} done, OCR regions {}", numProcessedPages, pageIdToRectCollection.size(), pageId, zonesString);
|
||||
log.info("{}/{} Page {} done, OCR regions {}",
|
||||
numProcessedPages,
|
||||
pageIdToRectCollection.size(),
|
||||
pageId,
|
||||
getAllOcrTextZonesAsString(pageIdToRectCollection, pageId));
|
||||
|
||||
// re-adding OCR pages
|
||||
Page ocrPage = ocrPageDoc.getPage(1);
|
||||
pdfDoc.pageInsert(pdfDoc.getPageIterator(pageId), ocrPage);
|
||||
pdfDoc.pageRemove(pdfDoc.getPageIterator(pageId + 1));
|
||||
ocrPageDoc.close();
|
||||
replaceOriginalPageWithOcrPage(pdfDoc, pageId, singlePagePdfDoc);
|
||||
singlePagePdfDoc.close();
|
||||
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
|
||||
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder()
|
||||
@ -151,4 +141,44 @@ public class OCRService {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void processOcr(Map<Integer, RectCollection> pageIdToRectCollection, Integer pageId, PDFDoc singlePagePdfDoc) throws PDFNetException {
|
||||
|
||||
OCROptions options = new OCROptions();
|
||||
options.addTextZonesForPage(pageIdToRectCollection.get(pageId), 1);
|
||||
options.addLang(ENGLISH);
|
||||
options.addDPI(settings.getOcrDPI());
|
||||
|
||||
OCRModule.processPDF(singlePagePdfDoc, options);
|
||||
}
|
||||
|
||||
|
||||
private static PDFDoc extractSinglePagePdfDoc(PDFDoc pdfDoc, Integer pageId) throws PDFNetException {
|
||||
|
||||
PDFDoc singlePagePdfDoc = new PDFDoc();
|
||||
Page page = pdfDoc.getPage(pageId);
|
||||
page.setMediaBox(page.getCropBox()); // this line ensures the ocr text is placed correctly by PDFTron, see TestFile MediaBoxBiggerThanCropBox.pdf
|
||||
singlePagePdfDoc.pagePushBack(page);
|
||||
return singlePagePdfDoc;
|
||||
}
|
||||
|
||||
|
||||
private static void replaceOriginalPageWithOcrPage(PDFDoc pdfDoc, Integer pageId, PDFDoc ocrPageDoc) throws PDFNetException {
|
||||
|
||||
Page ocrPage = ocrPageDoc.getPage(1);
|
||||
pdfDoc.pageInsert(pdfDoc.getPageIterator(pageId), ocrPage);
|
||||
pdfDoc.pageRemove(pdfDoc.getPageIterator(pageId + 1));
|
||||
}
|
||||
|
||||
|
||||
private static StringBuilder getAllOcrTextZonesAsString(Map<Integer, RectCollection> pageIdToRectCollection, Integer pageId) throws PDFNetException {
|
||||
|
||||
StringBuilder zonesString = new StringBuilder();
|
||||
for (int j = 0; j < pageIdToRectCollection.get(pageId).getNumRects(); ++j) {
|
||||
var r = pageIdToRectCollection.get(pageId).getRectAt(j);
|
||||
zonesString.append(format("[lower left (%.1f|%.1f) upper right (%.1f|%.1f)]", r.getX1(), r.getY1(), r.getX2(), r.getY2()));
|
||||
}
|
||||
return zonesString;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -31,13 +31,13 @@ public class OcrMessageReceiver {
|
||||
|
||||
private final OCRService ocrService;
|
||||
|
||||
|
||||
@RabbitHandler
|
||||
@RabbitListener(queues = MessagingConfiguration.OCR_QUEUE, concurrency = "1")
|
||||
public void receiveOcr(String in) throws JsonProcessingException {
|
||||
|
||||
DocumentRequest ocrRequestMessage = objectMapper.readValue(in, DocumentRequest.class);
|
||||
|
||||
|
||||
log.info("Start ocr for file with dossierId {} and fileId {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
|
||||
|
||||
setStatusOcrProcessing(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
|
||||
@ -47,16 +47,16 @@ public class OcrMessageReceiver {
|
||||
fileStorageService.storeUntouchedFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), originalFile);
|
||||
}
|
||||
|
||||
try (var out = new ByteArrayOutputStream()) {
|
||||
ocrService.runOcrOnDocument(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), out);
|
||||
|
||||
fileStorageService.storeOriginalFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), new ByteArrayInputStream(out.toByteArray()));
|
||||
try (var transferStream = new ByteArrayOutputStream()) {
|
||||
ocrService.runOcrOnDocument(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), transferStream);
|
||||
try (var inputStream = new ByteArrayInputStream(transferStream.toByteArray())) {
|
||||
fileStorageService.storeOriginalFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), inputStream);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
log.error("Failed to store file with dossierId {} and fileId {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
|
||||
fileStatusProcessingUpdateClient.ocrSuccessful(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
|
||||
|
||||
}
|
||||
|
||||
@ -5,12 +5,12 @@ import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import io.micrometer.prometheus.PrometheusMeterRegistry;
|
||||
import io.micrometer.prometheus.PrometheusTimer;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
@ -36,12 +36,15 @@ import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService;
|
||||
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType;
|
||||
import com.iqser.red.storage.commons.StorageAutoConfiguration;
|
||||
import com.iqser.red.storage.commons.service.StorageService;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.OCRModule;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.pdf.PageIterator;
|
||||
import com.pdftron.pdf.TextExtractor;
|
||||
|
||||
import io.micrometer.prometheus.PrometheusMeterRegistry;
|
||||
import io.micrometer.prometheus.PrometheusTimer;
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@ExtendWith(SpringExtension.class)
|
||||
@ -80,19 +83,20 @@ public class OcrServiceIntegrationTest {
|
||||
|
||||
@Test
|
||||
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
|
||||
public void testOCRMetrics(){
|
||||
public void testOCRMetrics() {
|
||||
|
||||
testOCR("Watermark");
|
||||
testOCR("Watermark");
|
||||
testOCR("Watermark");
|
||||
|
||||
var ocrOnDocumentMeter = registry.getMeters().stream()
|
||||
.filter(m -> m.getId().getName().equalsIgnoreCase("redactmanager_runOcrOnDocument")).findAny();
|
||||
var ocrOnDocumentMeter = registry.getMeters().stream().filter(m -> m.getId().getName().equalsIgnoreCase("redactmanager_runOcrOnDocument")).findAny();
|
||||
assertThat(ocrOnDocumentMeter.isPresent()).isTrue();
|
||||
PrometheusTimer timer = (PrometheusTimer) ocrOnDocumentMeter.get();
|
||||
assertThat(timer.count()).isEqualTo(3);
|
||||
assertThat(timer.mean(TimeUnit.SECONDS)).isGreaterThan(0.1);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
|
||||
public void testOcr() {
|
||||
@ -153,30 +157,34 @@ public class OcrServiceIntegrationTest {
|
||||
private String testOCR(String fileName) {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf");
|
||||
|
||||
var originId = FileStorageService.getStorageId("dossier", "file", FileType.ORIGIN);
|
||||
try (var fileStream = pdfFileResource.getInputStream()) {
|
||||
storageService.storeObject(originId, fileStream);
|
||||
}
|
||||
|
||||
try (var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
|
||||
ocrService.runOcrOnDocument("dossier", "file", out);
|
||||
}
|
||||
|
||||
System.out.println("File:" + getTemporaryDirectory() + "/" + fileName + ".pdf");
|
||||
try (var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
|
||||
return extractAllTextFromDocument(fileStream);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static String extractAllTextFromDocument(InputStream fileStream) throws IOException, PDFNetException {
|
||||
|
||||
PDFDoc pdfDoc = new PDFDoc(fileStream);
|
||||
TextExtractor extractor = new TextExtractor();
|
||||
List<String> texts = new ArrayList<>();
|
||||
PDFDoc pdfDoc;
|
||||
|
||||
try (var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
|
||||
pdfDoc = new PDFDoc(fileStream);
|
||||
}
|
||||
PageIterator iterator = pdfDoc.getPageIterator();
|
||||
while (iterator.hasNext()) {
|
||||
Page page = iterator.next();
|
||||
extractor.begin(page);
|
||||
texts.add(extractor.getAsText());
|
||||
}
|
||||
System.out.println("File:" + getTemporaryDirectory() + "/" + fileName + ".pdf");
|
||||
|
||||
return String.join("\n", texts);
|
||||
}
|
||||
|
||||
@ -184,7 +192,7 @@ public class OcrServiceIntegrationTest {
|
||||
@SneakyThrows
|
||||
public void dummyTest() {
|
||||
|
||||
// Build needs one text to not fail.
|
||||
// Build needs one test to not fail.
|
||||
assertThat(1).isEqualTo(1);
|
||||
}
|
||||
|
||||
@ -204,7 +212,7 @@ public class OcrServiceIntegrationTest {
|
||||
|
||||
@Bean
|
||||
@Primary
|
||||
public StorageService inmemoryStorage() {
|
||||
public StorageService inMemoryStorage() {
|
||||
|
||||
return new FileSystemBackedStorageService();
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user