RED-6126: performance-test

*fixed NullPointerException
*fixed StackOverflowError by ignoring very small images and replacing the recursion with a while loop
This commit is contained in:
Kilian Schuettler 2023-02-09 17:01:57 +01:00
parent 7065d098f3
commit 7b96322e50
4 changed files with 47 additions and 21 deletions

View File

@ -15,6 +15,7 @@ import com.pdftron.common.Matrix2D;
import com.pdftron.common.PDFNetException; import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.Element; import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementReader; import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.Image;
import com.pdftron.pdf.PDFDoc; import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page; import com.pdftron.pdf.Page;
import com.pdftron.pdf.Rect; import com.pdftron.pdf.Rect;
@ -27,9 +28,12 @@ public class ImagePositionRetrievalService {
private static final double TOLERANCE = 1e-1; private static final double TOLERANCE = 1e-1;
// any image with smaller height and width than this gets thrown out, see everyPointInDashedLineIsImage.pdf
private static final int PIXEL_THRESHOLD = 10;
/** /**
* Iterates over all elements in a PDF Document and retrieves the bounding box for each image, * Iterates over all elements in a PDF Document and retrieves the bounding box for each image that is larger than the pixel threshold of 10 in at least one dimension (images that are small in both height and width are skipped),
* Then it adjusts the bounding boxes for the page rotation. * Then it adjusts the bounding boxes for the page rotation.
* If the mirrorY flag is set, the Y Coordinates are mirrored and moved up by the page height. This is required for PDFTrons OCRModule. * If the mirrorY flag is set, the Y Coordinates are mirrored and moved up by the page height. This is required for PDFTrons OCRModule.
* *
@ -63,7 +67,13 @@ public class ImagePositionRetrievalService {
Element element; Element element;
while ((element = reader.next()) != null) { while ((element = reader.next()) != null) {
switch (element.getType()) { switch (element.getType()) {
case Element.e_image, Element.e_inline_image -> imagePositions.addRect(toRotationAdjustedRect(element.getBBox(), currentPage, mirrorY)); case Element.e_image, Element.e_inline_image -> {
Image image = new Image(element.getXObject());
// see everyPointInDashedLineIsImage.pdf TestFile
if (image.getImageHeight() > PIXEL_THRESHOLD || image.getImageWidth() > PIXEL_THRESHOLD) {
imagePositions.addRect(toRotationAdjustedRect(element.getBBox(), currentPage, mirrorY));
}
}
case Element.e_form -> { case Element.e_form -> {
reader.formBegin(); reader.formBegin();
findImagePositionsOnPage(reader, imagePositions, currentPage, mirrorY); findImagePositionsOnPage(reader, imagePositions, currentPage, mirrorY);
@ -77,38 +87,38 @@ public class ImagePositionRetrievalService {
@SneakyThrows @SneakyThrows
public RectCollection mergeOverlappingRects(RectCollection imagePositions) { public RectCollection mergeOverlappingRects(RectCollection imagePositions) {
if (imagePositions.getNumRects() == 1) { if (imagePositions.getNumRects() < 2) {
return imagePositions; return imagePositions;
} }
List<Rectangle2D> rectangleList = toSortedRectangleList(imagePositions); List<Rectangle2D> rectangleList = toSortedRectangleList(imagePositions);
rectangleList = mergeRectangleListRecursive(rectangleList, 0); mergeRectangleListRecursive(rectangleList);
return toRectCollection(rectangleList); return toRectCollection(rectangleList);
} }
// Sometimes images are split up into stripes, here we merge the positions of aligned and intersecting rectangles into one larger rectangle // Sometimes images are split up into stripes, here we merge the positions of aligned and intersecting rectangles into one larger rectangle
private List<Rectangle2D> mergeRectangleListRecursive(List<Rectangle2D> rectangleList, int currentIdx) { private void mergeRectangleListRecursive(List<Rectangle2D> rectangleList) {
int idx = 0;
if (rectangleList.size() < currentIdx + 2) { while (rectangleList.size() >= idx + 2) {
return rectangleList;
}
var rect1 = rectangleList.get(currentIdx); var rect1 = rectangleList.get(idx);
var rect2 = rectangleList.get(currentIdx + 1); var rect2 = rectangleList.get(idx + 1);
boolean isAlignedX = Math.abs(rect1.getMinX() - rect2.getMinX()) < TOLERANCE && Math.abs(rect1.getMaxX() - rect2.getMaxX()) < TOLERANCE; boolean isAlignedX = Math.abs(rect1.getMinX() - rect2.getMinX()) < TOLERANCE && Math.abs(rect1.getMaxX() - rect2.getMaxX()) < TOLERANCE;
boolean isAlignedY = Math.abs(rect1.getMinY() - rect2.getMinY()) < TOLERANCE && Math.abs(rect1.getMaxY() - rect2.getMaxY()) < TOLERANCE; boolean isAlignedY = Math.abs(rect1.getMinY() - rect2.getMinY()) < TOLERANCE && Math.abs(rect1.getMaxY() - rect2.getMaxY()) < TOLERANCE;
boolean intersects = rect1.intersects(rect2.getMinX() - TOLERANCE, rect2.getMinY() - TOLERANCE, rect2.getWidth() + (2 * TOLERANCE), rect2.getHeight() + (2 * TOLERANCE)); boolean intersects = rect1.intersects(rect2.getMinX() - TOLERANCE, rect2.getMinY() - TOLERANCE, rect2.getWidth() + (2 * TOLERANCE), rect2.getHeight() + (2 * TOLERANCE));
if (intersects && (isAlignedX || isAlignedY)) { if (intersects && (isAlignedX || isAlignedY)) {
rectangleList.remove(currentIdx + 1); rectangleList.remove(idx + 1);
rectangleList.remove(currentIdx); rectangleList.remove(idx);
rectangleList.add(currentIdx, rect1.createUnion(rect2)); rectangleList.add(idx, rect1.createUnion(rect2));
return mergeRectangleListRecursive(rectangleList, currentIdx); } else {
} else { ++idx;
return mergeRectangleListRecursive(rectangleList, currentIdx + 1); }
} }
} }

View File

@ -76,6 +76,8 @@ public class InvisibleElementRemovalService {
Page page = iterator.next(); Page page = iterator.next();
visitedXObjIds.add(page.getSDFObj().getObjNum()); visitedXObjIds.add(page.getSDFObj().getObjNum());
InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder() InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder()
.reader(reader) .reader(reader)
.clippingPathStack(new ClippingPathStack(page.getMediaBox())) .clippingPathStack(new ClippingPathStack(page.getMediaBox()))
@ -221,8 +223,14 @@ public class InvisibleElementRemovalService {
private void processPath(Element pathElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { private void processPath(Element pathElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
PathData pathData = pathElement.getPathData();
GeneralPath linePath = convertToGeneralPath(pathElement.getPathData()); if (pathData.getOperators().length == 0 && pathData.getPoints().length == 0) {
writer.writeGStateChanges(pathElement);
return;
}
GeneralPath linePath = convertToGeneralPath(pathData);
//transform path to initial user space //transform path to initial user space
var ctm = pathElement.getCTM(); var ctm = pathElement.getCTM();

View File

@ -122,6 +122,14 @@ class ImagePositionRetrievalServiceTest {
assertThat(allRectCoords.size()).isEqualTo(48); assertThat(allRectCoords.size()).isEqualTo(48);
} }
// Regression test for the fixture "everyPointInDashedLineIsImage": image position detection is expected
// to report no rectangles at all for this file.
// NOTE(review): presumably every dot of the dashed line is its own tiny image that falls below the pixel
// threshold - inferred from the fixture name, confirm against the PDF.
@Test
@SneakyThrows
public void testEveryPointInDashedLineIsImage() {
String fileName = "everyPointInDashedLineIsImage";
List<int[]> allRectCoords = testImagePositionDetection(fileName);
assertThat(allRectCoords.size()).isEqualTo(0);
}
private List<int[]> testImagePositionDetection(String fileName) throws IOException, PDFNetException { private List<int[]> testImagePositionDetection(String fileName) throws IOException, PDFNetException {