Pull request #7: RED-6126 performance test

Merge in RED/ocr-service from RED-6126-performance-test to master

* commit '37f1e03ebcd5356e0f0b403a5c0cdd20fc133997':
  RED-6126: performance-test *refactor to improve cleanness *closed inputStream
  RED-6126: performance-test *fixed NullPointerException *fixed StackOverflowError by ignoring very small images and moving to while loop instead of recursion
  RED-6126: performance-test *fixed time calculation
  RED-6126: performance-test *improved error logging
  RED-6126: performance-test *re-enabled overlap detection *re-creating helper document for every page instead of reusing and adding/removing pages
  RED-6126: Performance Tests *moved to streams for pdf file transfer *disabled overlap detection
This commit is contained in:
Kilian Schuettler 2023-02-10 15:00:55 +01:00
commit 001719a34c
9 changed files with 243 additions and 131 deletions

View File

@ -15,6 +15,7 @@ import com.pdftron.common.Matrix2D;
import com.pdftron.common.PDFNetException; import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.Element; import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementReader; import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.Image;
import com.pdftron.pdf.PDFDoc; import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page; import com.pdftron.pdf.Page;
import com.pdftron.pdf.Rect; import com.pdftron.pdf.Rect;
@ -27,9 +28,12 @@ public class ImagePositionRetrievalService {
private static final double TOLERANCE = 1e-1; private static final double TOLERANCE = 1e-1;
// any image with smaller height and width than this gets thrown out, see everyPointInDashedLineIsImage.pdf
private static final int PIXEL_THRESHOLD = 10;
/** /**
* Iterates over all elements in a PDF Document and retrieves the bounding box for each image, * Iterates over all elements in a PDF Document and retrieves the bounding box for each image, that is larger than the pixel threshold of 10 in either dimension.
* Then it adjusts the bounding boxes for the page rotation. * Then it adjusts the bounding boxes for the page rotation.
* If the mirrorY flag is set, the Y Coordinates are mirrored and moved up by the page height. This is required for PDFTrons OCRModule. * If the mirrorY flag is set, the Y Coordinates are mirrored and moved up by the page height. This is required for PDFTrons OCRModule.
* *
@ -63,7 +67,13 @@ public class ImagePositionRetrievalService {
Element element; Element element;
while ((element = reader.next()) != null) { while ((element = reader.next()) != null) {
switch (element.getType()) { switch (element.getType()) {
case Element.e_image, Element.e_inline_image -> imagePositions.addRect(toRotationAdjustedRect(element.getBBox(), currentPage, mirrorY)); case Element.e_image, Element.e_inline_image -> {
Image image = new Image(element.getXObject());
// see everyPointInDashedLineIsImage.pdf TestFile
if (image.getImageHeight() > PIXEL_THRESHOLD || image.getImageWidth() > PIXEL_THRESHOLD) {
imagePositions.addRect(toRotationAdjustedRect(element.getBBox(), currentPage, mirrorY));
}
}
case Element.e_form -> { case Element.e_form -> {
reader.formBegin(); reader.formBegin();
findImagePositionsOnPage(reader, imagePositions, currentPage, mirrorY); findImagePositionsOnPage(reader, imagePositions, currentPage, mirrorY);
@ -77,39 +87,49 @@ public class ImagePositionRetrievalService {
@SneakyThrows @SneakyThrows
public RectCollection mergeOverlappingRects(RectCollection imagePositions) { public RectCollection mergeOverlappingRects(RectCollection imagePositions) {
if (imagePositions.getNumRects() == 1) { if (imagePositions.getNumRects() < 2) {
return imagePositions; return imagePositions;
} }
List<Rectangle2D> rectangleList = toSortedRectangleList(imagePositions); List<Rectangle2D> rectangleList = toSortedRectangleList(imagePositions);
rectangleList = mergeRectangleListRecursive(rectangleList, 0); mergeRectangleList(rectangleList);
return toRectCollection(rectangleList); return toRectCollection(rectangleList);
} }
// Sometimes images are split up into stripes, here we merge the positions of aligned and intersecting rectangles into one larger rectangle // Sometimes images are split up into stripes, here we merge the positions of aligned and intersecting rectangles into one larger rectangle
private List<Rectangle2D> mergeRectangleListRecursive(List<Rectangle2D> rectangleList, int currentIdx) { private void mergeRectangleList(List<Rectangle2D> rectangleList) {
if (rectangleList.size() < currentIdx + 2) { for (int idx = 0; rectangleList.size() >= idx + 2; ) {
return rectangleList;
var rect1 = rectangleList.get(idx);
var rect2 = rectangleList.get(idx + 1);
if (intersects(rect1, rect2) && isAlignedXOrY(rect1, rect2)) {
rectangleList.remove(idx + 1);
rectangleList.remove(idx);
rectangleList.add(idx, rect1.createUnion(rect2));
} else {
++idx;
}
} }
}
var rect1 = rectangleList.get(currentIdx);
var rect2 = rectangleList.get(currentIdx + 1); private boolean intersects(Rectangle2D rect1, Rectangle2D rect2) {
return rect1.intersects(rect2.getMinX() - TOLERANCE, rect2.getMinY() - TOLERANCE, rect2.getWidth() + (2 * TOLERANCE), rect2.getHeight() + (2 * TOLERANCE));
}
private boolean isAlignedXOrY(Rectangle2D rect1, Rectangle2D rect2) {
boolean isAlignedX = Math.abs(rect1.getMinX() - rect2.getMinX()) < TOLERANCE && Math.abs(rect1.getMaxX() - rect2.getMaxX()) < TOLERANCE; boolean isAlignedX = Math.abs(rect1.getMinX() - rect2.getMinX()) < TOLERANCE && Math.abs(rect1.getMaxX() - rect2.getMaxX()) < TOLERANCE;
boolean isAlignedY = Math.abs(rect1.getMinY() - rect2.getMinY()) < TOLERANCE && Math.abs(rect1.getMaxY() - rect2.getMaxY()) < TOLERANCE; boolean isAlignedY = Math.abs(rect1.getMinY() - rect2.getMinY()) < TOLERANCE && Math.abs(rect1.getMaxY() - rect2.getMaxY()) < TOLERANCE;
boolean intersects = rect1.intersects(rect2.getMinX() - TOLERANCE, rect2.getMinY() - TOLERANCE, rect2.getWidth() + (2 * TOLERANCE), rect2.getHeight() + (2 * TOLERANCE));
if (intersects && (isAlignedX || isAlignedY)) { return isAlignedX || isAlignedY;
rectangleList.remove(currentIdx + 1);
rectangleList.remove(currentIdx);
rectangleList.add(currentIdx, rect1.createUnion(rect2));
return mergeRectangleListRecursive(rectangleList, currentIdx);
} else {
return mergeRectangleListRecursive(rectangleList, currentIdx + 1);
}
} }

View File

@ -5,6 +5,7 @@ import java.awt.geom.AffineTransform;
import java.awt.geom.GeneralPath; import java.awt.geom.GeneralPath;
import java.awt.geom.Rectangle2D; import java.awt.geom.Rectangle2D;
import java.io.InputStream; import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
@ -59,10 +60,10 @@ public class InvisibleElementRemovalService {
* @param pdfFile The PDF file to process * @param pdfFile The PDF file to process
* @param delta If this flag is set only the removed Elements will be written to the output file. * @param delta If this flag is set only the removed Elements will be written to the output file.
* The Elements are red if they are removed by clipping path, blue for transparency, and a green bounding box for overlap. * The Elements are red if they are removed by clipping path, blue for transparency, and a green bounding box for overlap.
* @return The resulting PDF File as bytes. * @param out OutputStream to write the resulting file to
**/ **/
@SneakyThrows @SneakyThrows
public byte[] removeInvisibleElements(InputStream pdfFile, boolean delta) { public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta) {
PDFDoc pdfDoc = new PDFDoc(pdfFile); PDFDoc pdfDoc = new PDFDoc(pdfFile);
@ -75,6 +76,8 @@ public class InvisibleElementRemovalService {
Page page = iterator.next(); Page page = iterator.next();
visitedXObjIds.add(page.getSDFObj().getObjNum()); visitedXObjIds.add(page.getSDFObj().getObjNum());
InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder() InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder()
.reader(reader) .reader(reader)
.clippingPathStack(new ClippingPathStack(page.getMediaBox())) .clippingPathStack(new ClippingPathStack(page.getMediaBox()))
@ -90,7 +93,7 @@ public class InvisibleElementRemovalService {
removeOverlappedElements(page, writer, context); removeOverlappedElements(page, writer, context);
} }
return pdfDoc.save(SDFDoc.SaveMode.REMOVE_UNUSED, null); pdfDoc.save(out, SDFDoc.SaveMode.REMOVE_UNUSED, null);
} }
@ -220,12 +223,18 @@ public class InvisibleElementRemovalService {
private void processPath(Element pathElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { private void processPath(Element pathElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
PathData pathData = pathElement.getPathData();
GeneralPath linePath = convertToGeneralPath(pathElement.getPathData()); if (pathData.getOperators().length == 0 && pathData.getPoints().length == 0) {
writer.writeGStateChanges(pathElement);
return;
}
GeneralPath linePath = convertToGeneralPath(pathData);
//transform path to initial user space //transform path to initial user space
var ctm = pathElement.getCTM(); var ctm = pathElement.getCTM();
var affineTransform = getAffineTransform(ctm); var affineTransform = toAffineTransform(ctm);
linePath.transform(affineTransform); linePath.transform(affineTransform);
var rect = linePath.getBounds2D(); var rect = linePath.getBounds2D();
@ -244,8 +253,13 @@ public class InvisibleElementRemovalService {
writer.writeElement(pathElement); writer.writeElement(pathElement);
} else { } else {
if (pathElement.isWindingFill()) {
linePath.setWindingRule(GeneralPath.WIND_NON_ZERO);
} else {
linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD);
}
if (inClippingPath) { if (inClippingPath) {
// TODO: WINDING RULE
if (isFilledAndNonTransparent(pathElement)) { if (isFilledAndNonTransparent(pathElement)) {
List<ElementFeatures> currentOverlappedElements = context.visibleElements() List<ElementFeatures> currentOverlappedElements = context.visibleElements()
.stream() .stream()
@ -270,12 +284,6 @@ public class InvisibleElementRemovalService {
} }
private static AffineTransform getAffineTransform(Matrix2D ctm) throws PDFNetException {
return new AffineTransform(ctm.getA(), ctm.getB(), ctm.getC(), ctm.getD(), ctm.getH(), ctm.getV());
}
private void removeOverlappedElements(Page page, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { private void removeOverlappedElements(Page page, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
context.reader().begin(page); context.reader().begin(page);
@ -422,6 +430,12 @@ public class InvisibleElementRemovalService {
} }
/**
 * Converts a PDFTron {@link Matrix2D} into an AWT {@link AffineTransform}.
 * The six matrix components are read in the order a, b, c, d, h, v and handed to the
 * six-argument {@code AffineTransform} constructor in that same order.
 *
 * @param ctm the matrix to convert, e.g. the current transformation matrix of an element
 * @return a new AffineTransform built from the components of {@code ctm}
 * @throws PDFNetException if reading a matrix component fails
 */
private static AffineTransform toAffineTransform(Matrix2D ctm) throws PDFNetException {

    double a = ctm.getA();
    double b = ctm.getB();
    double c = ctm.getC();
    double d = ctm.getD();
    double h = ctm.getH();
    double v = ctm.getV();
    return new AffineTransform(a, b, c, d, h, v);
}
@Builder @Builder
private record InvisibleElementRemovalContext( private record InvisibleElementRemovalContext(
boolean delta, boolean delta,

View File

@ -1,10 +1,14 @@
package com.iqser.red.service.ocr.v1.server.service; package com.iqser.red.service.ocr.v1.server.service;
import static java.lang.String.format;
import java.io.ByteArrayInputStream; import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.OutputStream;
import java.util.Map; import java.util.Map;
import io.micrometer.core.annotation.Timed;
import org.springframework.amqp.rabbit.core.RabbitTemplate; import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
@ -21,6 +25,7 @@ import com.pdftron.pdf.Page;
import com.pdftron.pdf.RectCollection; import com.pdftron.pdf.RectCollection;
import com.pdftron.sdf.SDFDoc; import com.pdftron.sdf.SDFDoc;
import io.micrometer.core.annotation.Timed;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
@ -55,58 +60,56 @@ public class OCRService {
* *
* @param dossierId The dossier id * @param dossierId The dossier id
* @param fileId The file id * @param fileId The file id
* @return the resulting PDF file as an InputStream * @param out OutputStream to write the file to
*/ */
@SneakyThrows
@Timed("redactmanager_runOcrOnDocument") @Timed("redactmanager_runOcrOnDocument")
public InputStream runOcrOnDocument(String dossierId, String fileId) { public void runOcrOnDocument(String dossierId, String fileId, OutputStream out) throws IOException {
InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId);
byte[] fileWithoutInvisibleTextBytes = invisibleElementRemovalService.removeInvisibleElements(fileStream, false);
byte[] ocrBytes = runOcr(fileWithoutInvisibleTextBytes, fileId);
return new ByteArrayInputStream(ocrBytes);
try (ByteArrayOutputStream transferOutputStream = new ByteArrayOutputStream()) {
try (InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId)) {
long start = System.currentTimeMillis();
log.debug("Start invisible element removal for file with dossierId {} and fileId {}", dossierId, fileId);
invisibleElementRemovalService.removeInvisibleElements(fileStream, transferOutputStream, false);
long end = System.currentTimeMillis();
log.debug("Invisible element removal successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, format("%.1f", (end - start) / 1000.0));
}
try (InputStream transferInputStream = new ByteArrayInputStream(transferOutputStream.toByteArray())) {
long start = System.currentTimeMillis();
runOcr(transferInputStream, out, fileId);
long end = System.currentTimeMillis();
log.info("ocr successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, format("%.1f", (end - start) / 1000.0));
}
}
} }
@SneakyThrows @SneakyThrows
private byte[] runOcr(byte[] file, String fileId) { private void runOcr(InputStream fileStream, OutputStream out, String fileId) {
PDFDoc pdfDoc = new PDFDoc(file); PDFDoc pdfDoc = new PDFDoc(fileStream);
Map<Integer, RectCollection> pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, true); Map<Integer, RectCollection> pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, true);
OCROptions options = new OCROptions(); // Optimization:
PDFDoc ocrPageDoc = new PDFDoc(); // When a page does not have a TextZone, PDFTron whites out the page. But, PDFTron scans it anyway, resulting in a longer runtime.
// So, we need to remove pages without images.
// Furthermore, creating a new document is *much* faster than reusing the same document and adding/removing pages one by one.
// Therefore, we create a new Document with a single page for every page that contains text.
int numProcessedPages = 0; int numProcessedPages = 0;
for (Integer pageId : pageIdToRectCollection.keySet()) { for (Integer pageId : pageIdToRectCollection.keySet()) {
try { try {
// optimization by only scanning pages that contain images PDFDoc singlePagePdfDoc = extractSinglePagePdfDoc(pdfDoc, pageId);
Page pdfPage = pdfDoc.getPage(pageId); processOcr(pageIdToRectCollection, pageId, singlePagePdfDoc);
pdfPage.setMediaBox(pdfPage.getCropBox()); // this line ensures the ocr text is placed correctly by PDFTron
ocrPageDoc.pagePushBack(pdfPage);
options.addTextZonesForPage(pageIdToRectCollection.get(pageId), 1);
options.addLang(ENGLISH);
options.addDPI(settings.getOcrDPI());
OCRModule.processPDF(ocrPageDoc, options);
++numProcessedPages; ++numProcessedPages;
StringBuilder zonesString = new StringBuilder(); log.info("{}/{} Page {} done, OCR regions {}",
for (int j = 0; j < pageIdToRectCollection.get(pageId).getNumRects(); ++j) { numProcessedPages,
var r = pageIdToRectCollection.get(pageId).getRectAt(j); pageIdToRectCollection.size(),
zonesString.append(String.format("[lower left (%.1f|%.1f) upper right (%.1f|%.1f)]", r.getX1(), r.getY1(), r.getX2(), r.getY2())); pageId,
} getAllOcrTextZonesAsString(pageIdToRectCollection, pageId));
log.info("{}/{} Page {} done, OCR regions {}", numProcessedPages, pageIdToRectCollection.size(), pageId, zonesString);
// re-adding OCR pages replaceOriginalPageWithOcrPage(pdfDoc, pageId, singlePagePdfDoc);
Page ocrPage = ocrPageDoc.getPage(1); singlePagePdfDoc.close();
pdfDoc.pageInsert(pdfDoc.getPageIterator(pageId), ocrPage);
pdfDoc.pageRemove(pdfDoc.getPageIterator(pageId + 1));
ocrPageDoc.pageRemove(ocrPageDoc.getPageIterator(1));
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE, rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder() objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder()
@ -116,13 +119,11 @@ public class OCRService {
.build())); .build()));
} catch (PDFNetException e) { } catch (PDFNetException e) {
log.error("failed to process page {}", pageId); log.error("Failed to process page {}", pageId);
throw new RuntimeException(e); throw new RuntimeException(e);
} }
} }
ocrPageDoc.close();
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE, rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder() objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder()
.fileId(fileId) .fileId(fileId)
@ -132,6 +133,52 @@ public class OCRService {
.build())); .build()));
Optimizer.optimize(pdfDoc); Optimizer.optimize(pdfDoc);
return pdfDoc.save(SDFDoc.SaveMode.LINEARIZED, null); try {
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
} catch (Exception e) {
log.error("Processed File with fileId {} could not be saved", fileId);
throw new RuntimeException(e);
}
} }
/**
 * Runs PDFTron's OCR module on a single-page document, restricted to the text zones known for the given page.
 *
 * @param pageIdToRectCollection text zones (image positions) keyed by page id of the original document
 * @param pageId                 page id whose text zones are used
 * @param singlePagePdfDoc       document that is passed to the OCR module
 * @throws PDFNetException if configuring the options or the OCR run fails
 */
private void processOcr(Map<Integer, RectCollection> pageIdToRectCollection, Integer pageId, PDFDoc singlePagePdfDoc) throws PDFNetException {

    OCROptions ocrOptions = new OCROptions();
    RectCollection textZones = pageIdToRectCollection.get(pageId);

    // The zones are registered for page 1 of the document handed to the OCR module, not for pageId.
    ocrOptions.addTextZonesForPage(textZones, 1);
    ocrOptions.addLang(ENGLISH);
    ocrOptions.addDPI(settings.getOcrDPI());

    OCRModule.processPDF(singlePagePdfDoc, ocrOptions);
}
/**
 * Creates a new document that contains only the requested page of the given document.
 * Before the page is copied, its media box is set to its crop box.
 *
 * @param pdfDoc the document to take the page from; the media box of that page is modified
 * @param pageId the number of the page to extract
 * @return a new document holding the single page; the caller is responsible for closing it
 * @throws PDFNetException if the page cannot be read, adjusted or added to the new document
 */
private static PDFDoc extractSinglePagePdfDoc(PDFDoc pdfDoc, Integer pageId) throws PDFNetException {

    // Prepare the page before allocating the new document, so a failure here cannot leave an unclosed PDFDoc behind.
    Page page = pdfDoc.getPage(pageId);
    page.setMediaBox(page.getCropBox()); // this line ensures the ocr text is placed correctly by PDFTron, see TestFile MediaBoxBiggerThanCropBox.pdf

    PDFDoc singlePagePdfDoc = new PDFDoc();
    try {
        singlePagePdfDoc.pagePushBack(page);
    } catch (PDFNetException | RuntimeException e) {
        // The document never reaches the caller in this case, so it has to be released here.
        try {
            singlePagePdfDoc.close();
        } catch (Exception closeFailure) {
            e.addSuppressed(closeFailure);
        }
        throw e;
    }
    return singlePagePdfDoc;
}
/**
 * Swaps a page of the original document for the first page of the given OCR document.
 * The OCR page is inserted at the position of {@code pageId}; afterwards the page at {@code pageId + 1}
 * is removed.
 *
 * @param pdfDoc     the document whose page is replaced
 * @param pageId     the number of the page to replace
 * @param ocrPageDoc the document whose first page is inserted
 * @throws PDFNetException if reading, inserting or removing a page fails
 */
private static void replaceOriginalPageWithOcrPage(PDFDoc pdfDoc, Integer pageId, PDFDoc ocrPageDoc) throws PDFNetException {

    Page processedPage = ocrPageDoc.getPage(1);

    var insertPosition = pdfDoc.getPageIterator(pageId);
    pdfDoc.pageInsert(insertPosition, processedPage);

    // After the insert, the page that was previously at pageId is addressed as pageId + 1.
    var replacedPagePosition = pdfDoc.getPageIterator(pageId + 1);
    pdfDoc.pageRemove(replacedPagePosition);
}
/**
 * Builds a textual listing of all text zones registered for a page, intended for log output.
 * Every zone is rendered as {@code [lower left (x1|y1) upper right (x2|y2)]} with one decimal place.
 *
 * @param pageIdToRectCollection text zones keyed by page id
 * @param pageId                 the page whose zones are listed
 * @return a StringBuilder holding the concatenated zone descriptions; empty if the page has no zones
 * @throws PDFNetException if reading a rectangle from the collection fails
 */
private static StringBuilder getAllOcrTextZonesAsString(Map<Integer, RectCollection> pageIdToRectCollection, Integer pageId) throws PDFNetException {

    StringBuilder zoneDescriptions = new StringBuilder();
    RectCollection textZones = pageIdToRectCollection.get(pageId);

    int zoneIdx = 0;
    while (zoneIdx < textZones.getNumRects()) {
        var zone = textZones.getRectAt(zoneIdx);
        String description = format("[lower left (%.1f|%.1f) upper right (%.1f|%.1f)]", zone.getX1(), zone.getY1(), zone.getX2(), zone.getY2());
        zoneDescriptions.append(description);
        ++zoneIdx;
    }
    return zoneDescriptions;
}
} }

View File

@ -1,5 +1,9 @@
package com.iqser.red.service.ocr.v1.server.service; package com.iqser.red.service.ocr.v1.server.service;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import org.springframework.amqp.AmqpRejectAndDontRequeueException; import org.springframework.amqp.AmqpRejectAndDontRequeueException;
import org.springframework.amqp.rabbit.annotation.RabbitHandler; import org.springframework.amqp.rabbit.annotation.RabbitHandler;
import org.springframework.amqp.rabbit.annotation.RabbitListener; import org.springframework.amqp.rabbit.annotation.RabbitListener;
@ -8,9 +12,9 @@ import org.springframework.stereotype.Service;
import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.ocr.v1.api.model.DocumentRequest;
import com.iqser.red.service.ocr.v1.server.client.FileStatusProcessingUpdateClient; import com.iqser.red.service.ocr.v1.server.client.FileStatusProcessingUpdateClient;
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration; import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
import com.iqser.red.service.ocr.v1.api.model.DocumentRequest;
import feign.FeignException; import feign.FeignException;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
@ -34,7 +38,6 @@ public class OcrMessageReceiver {
DocumentRequest ocrRequestMessage = objectMapper.readValue(in, DocumentRequest.class); DocumentRequest ocrRequestMessage = objectMapper.readValue(in, DocumentRequest.class);
long start = System.currentTimeMillis();
log.info("Start ocr for file with dossierId {} and fileId {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId()); log.info("Start ocr for file with dossierId {} and fileId {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
setStatusOcrProcessing(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId()); setStatusOcrProcessing(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
@ -44,14 +47,18 @@ public class OcrMessageReceiver {
fileStorageService.storeUntouchedFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), originalFile); fileStorageService.storeUntouchedFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), originalFile);
} }
var ocrResult = ocrService.runOcrOnDocument(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId()); try (var transferStream = new ByteArrayOutputStream()) {
ocrService.runOcrOnDocument(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), transferStream);
fileStorageService.storeOriginalFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), ocrResult); try (var inputStream = new ByteArrayInputStream(transferStream.toByteArray())) {
fileStorageService.storeOriginalFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), inputStream);
long end = System.currentTimeMillis(); }
log.info("Successfully processed ocr for file with dossierId {} and fileId {}, took {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), end - start); } catch (IOException e) {
log.error("Failed to store file with dossierId {} and fileId {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
throw new RuntimeException(e);
}
fileStatusProcessingUpdateClient.ocrSuccessful(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId()); fileStatusProcessingUpdateClient.ocrSuccessful(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
} }

View File

@ -3,14 +3,14 @@ package com.iqser.red.service.ocr.v1.server;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory; import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThat;
import java.io.FileInputStream;
import java.io.FileOutputStream; import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import io.micrometer.prometheus.PrometheusMeterRegistry;
import io.micrometer.prometheus.PrometheusTimer;
import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Disabled;
@ -36,12 +36,15 @@ import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService;
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType; import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType;
import com.iqser.red.storage.commons.StorageAutoConfiguration; import com.iqser.red.storage.commons.StorageAutoConfiguration;
import com.iqser.red.storage.commons.service.StorageService; import com.iqser.red.storage.commons.service.StorageService;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.OCRModule; import com.pdftron.pdf.OCRModule;
import com.pdftron.pdf.PDFDoc; import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page; import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator; import com.pdftron.pdf.PageIterator;
import com.pdftron.pdf.TextExtractor; import com.pdftron.pdf.TextExtractor;
import io.micrometer.prometheus.PrometheusMeterRegistry;
import io.micrometer.prometheus.PrometheusTimer;
import lombok.SneakyThrows; import lombok.SneakyThrows;
@ExtendWith(SpringExtension.class) @ExtendWith(SpringExtension.class)
@ -80,19 +83,20 @@ public class OcrServiceIntegrationTest {
@Test @Test
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top. @Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
public void testOCRMetrics(){ public void testOCRMetrics() {
testOCR("Watermark"); testOCR("Watermark");
testOCR("Watermark"); testOCR("Watermark");
testOCR("Watermark"); testOCR("Watermark");
var ocrOnDocumentMeter = registry.getMeters().stream() var ocrOnDocumentMeter = registry.getMeters().stream().filter(m -> m.getId().getName().equalsIgnoreCase("redactmanager_runOcrOnDocument")).findAny();
.filter(m -> m.getId().getName().equalsIgnoreCase("redactmanager_runOcrOnDocument")).findAny();
assertThat(ocrOnDocumentMeter.isPresent()).isTrue(); assertThat(ocrOnDocumentMeter.isPresent()).isTrue();
PrometheusTimer timer = (PrometheusTimer) ocrOnDocumentMeter.get(); PrometheusTimer timer = (PrometheusTimer) ocrOnDocumentMeter.get();
assertThat(timer.count()).isEqualTo(3); assertThat(timer.count()).isEqualTo(3);
assertThat(timer.mean(TimeUnit.SECONDS)).isGreaterThan(0.1); assertThat(timer.mean(TimeUnit.SECONDS)).isGreaterThan(0.1);
} }
@Test @Test
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top. @Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
public void testOcr() { public void testOcr() {
@ -153,34 +157,42 @@ public class OcrServiceIntegrationTest {
private String testOCR(String fileName) { private String testOCR(String fileName) {
ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf"); ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf");
var originId = FileStorageService.getStorageId("dossier", "file", FileType.ORIGIN); var originId = FileStorageService.getStorageId("dossier", "file", FileType.ORIGIN);
storageService.storeObject(originId, pdfFileResource.getInputStream()); try (var fileStream = pdfFileResource.getInputStream()) {
storageService.storeObject(originId, fileStream);
try (InputStream ocrDocument = ocrService.runOcrOnDocument("dossier", "file")) {
byte[] ocrDocumentBytes = ocrDocument.readAllBytes();
try (var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
out.write(ocrDocumentBytes);
}
TextExtractor extractor = new TextExtractor();
List<String> texts = new ArrayList<>();
PDFDoc pdfDoc = new PDFDoc(ocrDocumentBytes);
PageIterator iterator = pdfDoc.getPageIterator();
while (iterator.hasNext()) {
Page page = iterator.next();
extractor.begin(page);
texts.add(extractor.getAsText());
}
System.out.println("File:" + getTemporaryDirectory() + "/" + fileName + ".pdf");
return String.join("\n", texts);
} }
try (var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
ocrService.runOcrOnDocument("dossier", "file", out);
}
System.out.println("File:" + getTemporaryDirectory() + "/" + fileName + ".pdf");
try (var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
return extractAllTextFromDocument(fileStream);
}
}
private static String extractAllTextFromDocument(InputStream fileStream) throws IOException, PDFNetException {
PDFDoc pdfDoc = new PDFDoc(fileStream);
TextExtractor extractor = new TextExtractor();
List<String> texts = new ArrayList<>();
PageIterator iterator = pdfDoc.getPageIterator();
while (iterator.hasNext()) {
Page page = iterator.next();
extractor.begin(page);
texts.add(extractor.getAsText());
}
return String.join("\n", texts);
} }
@SneakyThrows @SneakyThrows
public void dummyTest() { public void dummyTest() {
// Build needs one text to not fail. // Build needs one test to not fail.
assertThat(1).isEqualTo(1); assertThat(1).isEqualTo(1);
} }
@ -200,7 +212,7 @@ public class OcrServiceIntegrationTest {
@Bean @Bean
@Primary @Primary
public StorageService inmemoryStorage() { public StorageService inMemoryStorage() {
return new FileSystemBackedStorageService(); return new FileSystemBackedStorageService();
} }

View File

@ -122,6 +122,14 @@ class ImagePositionRetrievalServiceTest {
assertThat(allRectCoords.size()).isEqualTo(48); assertThat(allRectCoords.size()).isEqualTo(48);
} }
/**
 * Runs the image position detection on the test file "everyPointInDashedLineIsImage"
 * and expects that no image position is reported for it.
 */
@Test
@SneakyThrows
public void testEveryPointInDashedLineIsImage() {

    List<int[]> detectedRectCoords = testImagePositionDetection("everyPointInDashedLineIsImage");

    int detectedRectCount = detectedRectCoords.size();
    assertThat(detectedRectCount).isEqualTo(0);
}
private List<int[]> testImagePositionDetection(String fileName) throws IOException, PDFNetException { private List<int[]> testImagePositionDetection(String fileName) throws IOException, PDFNetException {

View File

@ -3,9 +3,8 @@ package com.iqser.red.service.ocr.v1.server.service;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory; import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import static org.assertj.core.api.AssertionsForClassTypes.assertThat; import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
import java.io.FileInputStream;
import java.io.FileOutputStream; import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.file.Files;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith; import org.junit.jupiter.api.extension.ExtendWith;
@ -46,22 +45,23 @@ public class InvisibleElementRemovalServiceTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf"); ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf");
var initialFileStream = Files.newInputStream(pdfFileResource.getFile().toPath()); try (var initialFileStream = pdfFileResource.getInputStream(); var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
var fileWithoutInvisibleElements = invisibleElementRemovalService.removeInvisibleElements(initialFileStream, false); invisibleElementRemovalService.removeInvisibleElements(initialFileStream, out,false);
}
initialFileStream = Files.newInputStream(pdfFileResource.getFile().toPath()); try (var initialFileStream = pdfFileResource.getInputStream(); var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + "_delta.pdf")) {
var deltaFile = invisibleElementRemovalService.removeInvisibleElements(initialFileStream, true); invisibleElementRemovalService.removeInvisibleElements(initialFileStream, out,true);
}
String fileWithoutInvisibleTextLocation = getTemporaryDirectory() + "/" + fileName + ".pdf"; System.out.println("Output File without invisible elements: files/" + fileName + ".pdf");
String deltaFileLocation = getTemporaryDirectory() + "/" + fileName + "_delta.pdf"; System.out.println("Output Delta File without invisible elements: files/" + fileName + "_delta.pdf");
saveToFile(fileWithoutInvisibleTextLocation, fileWithoutInvisibleElements);
saveToFile(deltaFileLocation, deltaFile);
System.out.println("Output File without invisible elements: " + fileWithoutInvisibleTextLocation);
System.out.println("Output Delta File: " + deltaFileLocation);
TextExtractor extractor = new TextExtractor(); TextExtractor extractor = new TextExtractor();
PDFDoc pdfDoc = new PDFDoc(fileWithoutInvisibleElements);
PDFDoc pdfDoc;
try(var fileStream = new FileInputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
pdfDoc = new PDFDoc(fileStream);
}
PageIterator iterator = pdfDoc.getPageIterator(); PageIterator iterator = pdfDoc.getPageIterator();
while (iterator.hasNext()) { while (iterator.hasNext()) {
Page page = iterator.next(); Page page = iterator.next();
@ -70,16 +70,4 @@ public class InvisibleElementRemovalServiceTest {
assertThat(text).contains("APPENDIX 16 Pathology Report", "Amendment 1", "Page 255 of 260"); assertThat(text).contains("APPENDIX 16 Pathology Report", "Amendment 1", "Page 255 of 260");
} }
} }
private void saveToFile(String location, byte[] fileBytes) {
try (var f_out = new FileOutputStream(location)) {
f_out.write(fileBytes);
} catch (IOException e) {
throw new RuntimeException("File location: " + location + "could not be openend, no file will be saved");
}
}
} }

View File

@ -0,0 +1,16 @@
<!-- Logging configuration: everything is written to the console; the root level is warn, com.iqser loggers are set to info. -->
<Configuration>
<Appenders>
<!-- Single appender that writes to standard out. -->
<Console name="CONSOLE" target="SYSTEM_OUT">
<!-- Each line consists of time of day, thread name, level, logger name and the message. -->
<PatternLayout pattern="%d{HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
</Console>
</Appenders>
<Loggers>
<!-- Default for all loggers: warn and above, sent to the console appender. -->
<Root level="warn">
<AppenderRef ref="CONSOLE"/>
</Root>
<!-- Loggers below com.iqser are configured with level info. -->
<Logger name="com.iqser" level="info"/>
</Loggers>
</Configuration>