RED-6126: In the OCRService, OCR Text is not applied to Document

* updated some comments
* very slight refactor
This commit is contained in:
Kilian Schuettler 2023-02-07 12:09:04 +01:00
parent a415224db5
commit d0d6bf70a4
2 changed files with 23 additions and 13 deletions

View File

@ -43,20 +43,22 @@ public class ImagePositionRetrievalService {
Map<Integer, RectCollection> pageIdToImagePositions = new HashMap<>();
ElementReader reader = new ElementReader();
for (int pageId = 1; pageId <= pdfDoc.getPageCount(); ++pageId) {
reader.begin(pdfDoc.getPage(pageId));
RectCollection imagePositions = new RectCollection();
processElements(reader, imagePositions, pdfDoc.getPage(pageId), mirrorY);
reader.begin(pdfDoc.getPage(pageId));
findImagePositionsOnPage(reader, imagePositions, pdfDoc.getPage(pageId), mirrorY);
imagePositions = mergeOverlappingRects(imagePositions);
reader.end();
if (imagePositions.getNumRects() > 0) {
pageIdToImagePositions.put(pageId, imagePositions);
}
reader.end();
}
return pageIdToImagePositions;
}
private void processElements(ElementReader reader, RectCollection imagePositions, Page currentPage, boolean mirrorY) throws PDFNetException {
private void findImagePositionsOnPage(ElementReader reader, RectCollection imagePositions, Page currentPage, boolean mirrorY) throws PDFNetException {
Element element;
while ((element = reader.next()) != null) {
@ -64,10 +66,9 @@ public class ImagePositionRetrievalService {
case Element.e_image, Element.e_inline_image -> imagePositions.addRect(toRotationAdjustedRect(element.getBBox(), currentPage, mirrorY));
case Element.e_form -> {
reader.formBegin();
processElements(reader, imagePositions, currentPage, mirrorY);
findImagePositionsOnPage(reader, imagePositions, currentPage, mirrorY);
reader.end();
}
}
}
}
@ -76,28 +77,31 @@ public class ImagePositionRetrievalService {
@SneakyThrows
public RectCollection mergeOverlappingRects(RectCollection imagePositions) {
if (imagePositions.getNumRects() < 2) {
if (imagePositions.getNumRects() == 1) {
return imagePositions;
}
List<Rectangle2D> rectangleList = toSortedRectangleList(imagePositions);
rectangleList = mergeRectangleListRecursive(rectangleList, 0);
return toRectCollection(rectangleList);
}
// Sometimes images are split up into stripes, here we try to merge the positions into one larger rectangle
// Sometimes images are split up into stripes, here we merge the positions of aligned and intersecting rectangles into one larger rectangle
private List<Rectangle2D> mergeRectangleListRecursive(List<Rectangle2D> rectangleList, int currentIdx) {
if (rectangleList.size() < currentIdx + 2) {
return rectangleList;
}
var rect1 = rectangleList.get(currentIdx);
var rect2 = rectangleList.get(currentIdx + 1);
boolean isAlignedX = Math.abs(rect1.getMinX() - rect2.getMinX()) < TOLERANCE && Math.abs(rect1.getMaxX() - rect2.getMaxX()) < TOLERANCE;
boolean isAlignedY = Math.abs(rect1.getMinY() - rect2.getMinY()) < TOLERANCE && Math.abs(rect1.getMaxY() - rect2.getMaxY()) < TOLERANCE;
boolean intersects = rect1.intersects(rect2.getMinX() - TOLERANCE, rect2.getMinY() - TOLERANCE, rect2.getWidth() + 2 * TOLERANCE, rect2.getHeight() + 2 * TOLERANCE);
boolean intersects = rect1.intersects(rect2.getMinX() - TOLERANCE, rect2.getMinY() - TOLERANCE, rect2.getWidth() + (2 * TOLERANCE), rect2.getHeight() + (2 * TOLERANCE));
if (intersects && (isAlignedX || isAlignedY)) {
rectangleList.remove(currentIdx + 1);
rectangleList.remove(currentIdx);
@ -114,26 +118,29 @@ public class ImagePositionRetrievalService {
int rotation = page.getRotation();
double height = page.getPageHeight();
double width = page.getPageWidth();
//Even though the getBBox() method returns coordinates with (0,0) in the lower left corner, the OCRModule's addTextZonesForPage() wants to have its coordinates with (0,0) in the upper left corner
// Even though PDFTron almost always has the origin in the lower left corner, for some reason, the OCRModule's addTextZonesForPage() uses the upper left corner as origin...
Matrix2D mirrorMatrix;
if (mirrorY) {
mirrorMatrix = new Matrix2D(1, 0, 0, -1, 0, height);
} else {
mirrorMatrix = new Matrix2D();
}
// We need to rotate the rects to fit to the page rotation
Matrix2D rotationMatrix = switch (rotation) {
case 1 -> new Matrix2D(0, -1, 1, 0, 0, height);
case 2 -> new Matrix2D(-1, 0, 0, -1, width, height);
case 3 -> new Matrix2D(0, 1, -1, 0, width, 0);
default -> new Matrix2D(1, 0, 0, 1, 0, 0);
default -> new Matrix2D();
};
Matrix2D finalMatrix = mirrorMatrix.multiply(rotationMatrix);
Point2D.Double p1 = finalMatrix.multPoint(bbox.getX1(), bbox.getY1());
Point2D.Double p2 = finalMatrix.multPoint(bbox.getX2(), bbox.getY2());
//PDFTron Rect needs lower left and upper right coordinates to calculate width and height correctly
// PDFTron Rect *needs* lower left and upper right coordinates to calculate width and height correctly, even though the documentation states otherwise
Point2D.Double lowerLeft = new Point2D.Double(Math.min(p1.x, p2.x), Math.min(p1.y, p2.y));
Point2D.Double upperRight = new Point2D.Double(Math.max(p1.x, p2.x), Math.max(p1.y, p2.y));

View File

@ -112,12 +112,15 @@ public class OCRService {
.numberOfPagesToOCR(pageIdToRectCollection.size())
.numberOfOCRedPages(numProcessedPages)
.build()));
} catch (PDFNetException e) {
log.error("failed to process page {}", pageId);
throw new RuntimeException(e);
}
}
ocrPageDoc.close();
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder()
.fileId(fileId)