RED-6126: performance-test
* fixed NullPointerException * fixed StackOverflowError by ignoring very small images and replacing the recursion with a while loop
This commit is contained in:
parent
7065d098f3
commit
b3fa14b342
@ -15,6 +15,7 @@ import com.pdftron.common.Matrix2D;
|
|||||||
import com.pdftron.common.PDFNetException;
|
import com.pdftron.common.PDFNetException;
|
||||||
import com.pdftron.pdf.Element;
|
import com.pdftron.pdf.Element;
|
||||||
import com.pdftron.pdf.ElementReader;
|
import com.pdftron.pdf.ElementReader;
|
||||||
|
import com.pdftron.pdf.Image;
|
||||||
import com.pdftron.pdf.PDFDoc;
|
import com.pdftron.pdf.PDFDoc;
|
||||||
import com.pdftron.pdf.Page;
|
import com.pdftron.pdf.Page;
|
||||||
import com.pdftron.pdf.Rect;
|
import com.pdftron.pdf.Rect;
|
||||||
@ -27,9 +28,12 @@ public class ImagePositionRetrievalService {
|
|||||||
|
|
||||||
private static final double TOLERANCE = 1e-1;
|
private static final double TOLERANCE = 1e-1;
|
||||||
|
|
||||||
|
// any image whose height and width are both at or below this threshold gets thrown out, see everyPointInDashedLineIsImage.pdf
|
||||||
|
private static final int PIXEL_THRESHOLD = 10;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Iterates over all elements in a PDF Document and retrieves the bounding box for each image,
|
* Iterates over all elements in a PDF Document and retrieves the bounding box for each image whose height or width exceeds the pixel threshold of 10.
|
||||||
* Then it adjusts the bounding boxes for the page rotation.
|
* Then it adjusts the bounding boxes for the page rotation.
|
||||||
* If the mirrorY flag is set, the Y Coordinates are mirrored and moved up by the page height. This is required for PDFTrons OCRModule.
|
* If the mirrorY flag is set, the Y Coordinates are mirrored and moved up by the page height. This is required for PDFTrons OCRModule.
|
||||||
*
|
*
|
||||||
@ -63,7 +67,13 @@ public class ImagePositionRetrievalService {
|
|||||||
Element element;
|
Element element;
|
||||||
while ((element = reader.next()) != null) {
|
while ((element = reader.next()) != null) {
|
||||||
switch (element.getType()) {
|
switch (element.getType()) {
|
||||||
case Element.e_image, Element.e_inline_image -> imagePositions.addRect(toRotationAdjustedRect(element.getBBox(), currentPage, mirrorY));
|
case Element.e_image, Element.e_inline_image -> {
|
||||||
|
Image image = new Image(element.getXObject());
|
||||||
|
// see everyPointInDashedLineIsImage.pdf TestFile
|
||||||
|
if (image.getImageHeight() > PIXEL_THRESHOLD || image.getImageWidth() > PIXEL_THRESHOLD) {
|
||||||
|
imagePositions.addRect(toRotationAdjustedRect(element.getBBox(), currentPage, mirrorY));
|
||||||
|
}
|
||||||
|
}
|
||||||
case Element.e_form -> {
|
case Element.e_form -> {
|
||||||
reader.formBegin();
|
reader.formBegin();
|
||||||
findImagePositionsOnPage(reader, imagePositions, currentPage, mirrorY);
|
findImagePositionsOnPage(reader, imagePositions, currentPage, mirrorY);
|
||||||
@ -77,38 +87,38 @@ public class ImagePositionRetrievalService {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public RectCollection mergeOverlappingRects(RectCollection imagePositions) {
|
public RectCollection mergeOverlappingRects(RectCollection imagePositions) {
|
||||||
|
|
||||||
if (imagePositions.getNumRects() == 1) {
|
if (imagePositions.getNumRects() < 2) {
|
||||||
return imagePositions;
|
return imagePositions;
|
||||||
}
|
}
|
||||||
|
|
||||||
List<Rectangle2D> rectangleList = toSortedRectangleList(imagePositions);
|
List<Rectangle2D> rectangleList = toSortedRectangleList(imagePositions);
|
||||||
|
|
||||||
rectangleList = mergeRectangleListRecursive(rectangleList, 0);
|
mergeRectangleList(rectangleList);
|
||||||
|
|
||||||
return toRectCollection(rectangleList);
|
return toRectCollection(rectangleList);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// Sometimes images are split up into stripes, here we merge the positions of aligned and intersecting rectangles into one larger rectangle
|
// Sometimes images are split up into stripes, here we merge the positions of aligned and intersecting rectangles into one larger rectangle
|
||||||
private List<Rectangle2D> mergeRectangleListRecursive(List<Rectangle2D> rectangleList, int currentIdx) {
|
private void mergeRectangleList(List<Rectangle2D> rectangleList) {
|
||||||
|
int idx = 0;
|
||||||
|
|
||||||
if (rectangleList.size() < currentIdx + 2) {
|
while (rectangleList.size() >= idx + 2) {
|
||||||
return rectangleList;
|
|
||||||
}
|
|
||||||
|
|
||||||
var rect1 = rectangleList.get(currentIdx);
|
var rect1 = rectangleList.get(idx);
|
||||||
var rect2 = rectangleList.get(currentIdx + 1);
|
var rect2 = rectangleList.get(idx + 1);
|
||||||
|
|
||||||
boolean isAlignedX = Math.abs(rect1.getMinX() - rect2.getMinX()) < TOLERANCE && Math.abs(rect1.getMaxX() - rect2.getMaxX()) < TOLERANCE;
|
boolean isAlignedX = Math.abs(rect1.getMinX() - rect2.getMinX()) < TOLERANCE && Math.abs(rect1.getMaxX() - rect2.getMaxX()) < TOLERANCE;
|
||||||
boolean isAlignedY = Math.abs(rect1.getMinY() - rect2.getMinY()) < TOLERANCE && Math.abs(rect1.getMaxY() - rect2.getMaxY()) < TOLERANCE;
|
boolean isAlignedY = Math.abs(rect1.getMinY() - rect2.getMinY()) < TOLERANCE && Math.abs(rect1.getMaxY() - rect2.getMaxY()) < TOLERANCE;
|
||||||
boolean intersects = rect1.intersects(rect2.getMinX() - TOLERANCE, rect2.getMinY() - TOLERANCE, rect2.getWidth() + (2 * TOLERANCE), rect2.getHeight() + (2 * TOLERANCE));
|
boolean intersects = rect1.intersects(rect2.getMinX() - TOLERANCE, rect2.getMinY() - TOLERANCE, rect2.getWidth() + (2 * TOLERANCE), rect2.getHeight() + (2 * TOLERANCE));
|
||||||
|
|
||||||
if (intersects && (isAlignedX || isAlignedY)) {
|
if (intersects && (isAlignedX || isAlignedY)) {
|
||||||
rectangleList.remove(currentIdx + 1);
|
rectangleList.remove(idx + 1);
|
||||||
rectangleList.remove(currentIdx);
|
rectangleList.remove(idx);
|
||||||
rectangleList.add(currentIdx, rect1.createUnion(rect2));
|
rectangleList.add(idx, rect1.createUnion(rect2));
|
||||||
return mergeRectangleListRecursive(rectangleList, currentIdx);
|
} else {
|
||||||
} else {
|
++idx;
|
||||||
return mergeRectangleListRecursive(rectangleList, currentIdx + 1);
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -76,6 +76,8 @@ public class InvisibleElementRemovalService {
|
|||||||
Page page = iterator.next();
|
Page page = iterator.next();
|
||||||
|
|
||||||
visitedXObjIds.add(page.getSDFObj().getObjNum());
|
visitedXObjIds.add(page.getSDFObj().getObjNum());
|
||||||
|
|
||||||
|
|
||||||
InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder()
|
InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder()
|
||||||
.reader(reader)
|
.reader(reader)
|
||||||
.clippingPathStack(new ClippingPathStack(page.getMediaBox()))
|
.clippingPathStack(new ClippingPathStack(page.getMediaBox()))
|
||||||
@ -221,8 +223,14 @@ public class InvisibleElementRemovalService {
|
|||||||
|
|
||||||
|
|
||||||
private void processPath(Element pathElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
private void processPath(Element pathElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
||||||
|
PathData pathData = pathElement.getPathData();
|
||||||
|
|
||||||
GeneralPath linePath = convertToGeneralPath(pathElement.getPathData());
|
if (pathData.getOperators().length == 0 && pathData.getPoints().length == 0) {
|
||||||
|
writer.writeGStateChanges(pathElement);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
GeneralPath linePath = convertToGeneralPath(pathData);
|
||||||
|
|
||||||
//transform path to initial user space
|
//transform path to initial user space
|
||||||
var ctm = pathElement.getCTM();
|
var ctm = pathElement.getCTM();
|
||||||
|
|||||||
@ -12,7 +12,6 @@ import java.util.Map;
|
|||||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse;
|
import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse;
|
||||||
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
|
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
|
||||||
@ -28,6 +27,7 @@ import com.pdftron.sdf.SDFDoc;
|
|||||||
|
|
||||||
import io.micrometer.core.annotation.Timed;
|
import io.micrometer.core.annotation.Timed;
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.SneakyThrows;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
@Slf4j
|
@Slf4j
|
||||||
@ -68,10 +68,10 @@ public class OCRService {
|
|||||||
try (ByteArrayOutputStream transferOutputStream = new ByteArrayOutputStream()) {
|
try (ByteArrayOutputStream transferOutputStream = new ByteArrayOutputStream()) {
|
||||||
try (InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId)) {
|
try (InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId)) {
|
||||||
long start = System.currentTimeMillis();
|
long start = System.currentTimeMillis();
|
||||||
log.info("Start invisible element removal for file with dossierId {} and fileId {}", dossierId, fileId);
|
log.debug("Start invisible element removal for file with dossierId {} and fileId {}", dossierId, fileId);
|
||||||
invisibleElementRemovalService.removeInvisibleElements(fileStream, transferOutputStream, false);
|
invisibleElementRemovalService.removeInvisibleElements(fileStream, transferOutputStream, false);
|
||||||
long end = System.currentTimeMillis();
|
long end = System.currentTimeMillis();
|
||||||
log.info("Invisible element removal successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, format("%.1f", (end - start) / 1000.0));
|
log.debug("Invisible element removal successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, format("%.1f", (end - start) / 1000.0));
|
||||||
}
|
}
|
||||||
try (InputStream transferInputStream = new ByteArrayInputStream(transferOutputStream.toByteArray())) {
|
try (InputStream transferInputStream = new ByteArrayInputStream(transferOutputStream.toByteArray())) {
|
||||||
long start = System.currentTimeMillis();
|
long start = System.currentTimeMillis();
|
||||||
@ -83,15 +83,10 @@ public class OCRService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
private void runOcr(InputStream fileStream, OutputStream out, String fileId) {
|
private void runOcr(InputStream fileStream, OutputStream out, String fileId) {
|
||||||
|
|
||||||
PDFDoc pdfDoc;
|
PDFDoc pdfDoc = new PDFDoc(fileStream);
|
||||||
try {
|
|
||||||
pdfDoc = new PDFDoc(fileStream);
|
|
||||||
} catch (Exception e) {
|
|
||||||
log.error("Couldn't parse file with fileId {} from InputStream ", fileId);
|
|
||||||
throw new RuntimeException(e);
|
|
||||||
}
|
|
||||||
|
|
||||||
Map<Integer, RectCollection> pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, true);
|
Map<Integer, RectCollection> pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, true);
|
||||||
|
|
||||||
@ -136,24 +131,16 @@ public class OCRService {
|
|||||||
} catch (PDFNetException e) {
|
} catch (PDFNetException e) {
|
||||||
log.error("Failed to process page {}", pageId);
|
log.error("Failed to process page {}", pageId);
|
||||||
throw new RuntimeException(e);
|
throw new RuntimeException(e);
|
||||||
} catch (JsonProcessingException e) {
|
|
||||||
log.error("Failed to send \"processed\" message to rabbitMQ for file with fileID {} on OCR page {}/{}", fileId, numProcessedPages, pageIdToRectCollection.size());
|
|
||||||
throw new RuntimeException(e);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
|
||||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
|
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder()
|
||||||
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder()
|
.fileId(fileId)
|
||||||
.fileId(fileId)
|
.numberOfPagesToOCR(pageIdToRectCollection.size())
|
||||||
.numberOfPagesToOCR(pageIdToRectCollection.size())
|
.numberOfOCRedPages(numProcessedPages)
|
||||||
.numberOfOCRedPages(numProcessedPages)
|
.ocrFinished(true)
|
||||||
.ocrFinished(true)
|
.build()));
|
||||||
.build()));
|
|
||||||
} catch (JsonProcessingException e) {
|
|
||||||
log.error("Failed to send message to rabbitMQ for file with fileID {} on OCR page {}/{}", fileId, numProcessedPages, pageIdToRectCollection.size());
|
|
||||||
throw new RuntimeException(e);
|
|
||||||
}
|
|
||||||
|
|
||||||
Optimizer.optimize(pdfDoc);
|
Optimizer.optimize(pdfDoc);
|
||||||
try {
|
try {
|
||||||
|
|||||||
@ -122,6 +122,14 @@ class ImagePositionRetrievalServiceTest {
|
|||||||
assertThat(allRectCoords.size()).isEqualTo(48);
|
assertThat(allRectCoords.size()).isEqualTo(48);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@SneakyThrows
|
||||||
|
public void testEveryPointInDashedLineIsImage() {
|
||||||
|
String fileName = "everyPointInDashedLineIsImage";
|
||||||
|
List<int[]> allRectCoords = testImagePositionDetection(fileName);
|
||||||
|
assertThat(allRectCoords.size()).isEqualTo(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private List<int[]> testImagePositionDetection(String fileName) throws IOException, PDFNetException {
|
private List<int[]> testImagePositionDetection(String fileName) throws IOException, PDFNetException {
|
||||||
|
|
||||||
|
|||||||
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user