RED-6126: In the OCRService, OCR Text is not applied to Document
* updated some comments
* very slight refactor
This commit is contained in:
parent
a415224db5
commit
d0d6bf70a4
@ -43,20 +43,22 @@ public class ImagePositionRetrievalService {
|
||||
Map<Integer, RectCollection> pageIdToImagePositions = new HashMap<>();
|
||||
ElementReader reader = new ElementReader();
|
||||
for (int pageId = 1; pageId <= pdfDoc.getPageCount(); ++pageId) {
|
||||
reader.begin(pdfDoc.getPage(pageId));
|
||||
RectCollection imagePositions = new RectCollection();
|
||||
processElements(reader, imagePositions, pdfDoc.getPage(pageId), mirrorY);
|
||||
reader.begin(pdfDoc.getPage(pageId));
|
||||
|
||||
findImagePositionsOnPage(reader, imagePositions, pdfDoc.getPage(pageId), mirrorY);
|
||||
imagePositions = mergeOverlappingRects(imagePositions);
|
||||
|
||||
reader.end();
|
||||
if (imagePositions.getNumRects() > 0) {
|
||||
pageIdToImagePositions.put(pageId, imagePositions);
|
||||
}
|
||||
reader.end();
|
||||
}
|
||||
return pageIdToImagePositions;
|
||||
}
|
||||
|
||||
|
||||
private void processElements(ElementReader reader, RectCollection imagePositions, Page currentPage, boolean mirrorY) throws PDFNetException {
|
||||
private void findImagePositionsOnPage(ElementReader reader, RectCollection imagePositions, Page currentPage, boolean mirrorY) throws PDFNetException {
|
||||
|
||||
Element element;
|
||||
while ((element = reader.next()) != null) {
|
||||
@ -64,10 +66,9 @@ public class ImagePositionRetrievalService {
|
||||
case Element.e_image, Element.e_inline_image -> imagePositions.addRect(toRotationAdjustedRect(element.getBBox(), currentPage, mirrorY));
|
||||
case Element.e_form -> {
|
||||
reader.formBegin();
|
||||
processElements(reader, imagePositions, currentPage, mirrorY);
|
||||
findImagePositionsOnPage(reader, imagePositions, currentPage, mirrorY);
|
||||
reader.end();
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -76,28 +77,31 @@ public class ImagePositionRetrievalService {
|
||||
@SneakyThrows
|
||||
public RectCollection mergeOverlappingRects(RectCollection imagePositions) {
|
||||
|
||||
if (imagePositions.getNumRects() < 2) {
|
||||
if (imagePositions.getNumRects() == 1) {
|
||||
return imagePositions;
|
||||
}
|
||||
|
||||
List<Rectangle2D> rectangleList = toSortedRectangleList(imagePositions);
|
||||
|
||||
rectangleList = mergeRectangleListRecursive(rectangleList, 0);
|
||||
|
||||
return toRectCollection(rectangleList);
|
||||
}
|
||||
|
||||
|
||||
// Sometimes images are split up into stripes, here we try to merge the positions into one larger rectangle
|
||||
// Sometimes images are split up into stripes, here we merge the positions of aligned and intersecting rectangles into one larger rectangle
|
||||
private List<Rectangle2D> mergeRectangleListRecursive(List<Rectangle2D> rectangleList, int currentIdx) {
|
||||
|
||||
if (rectangleList.size() < currentIdx + 2) {
|
||||
return rectangleList;
|
||||
}
|
||||
|
||||
var rect1 = rectangleList.get(currentIdx);
|
||||
var rect2 = rectangleList.get(currentIdx + 1);
|
||||
|
||||
boolean isAlignedX = Math.abs(rect1.getMinX() - rect2.getMinX()) < TOLERANCE && Math.abs(rect1.getMaxX() - rect2.getMaxX()) < TOLERANCE;
|
||||
boolean isAlignedY = Math.abs(rect1.getMinY() - rect2.getMinY()) < TOLERANCE && Math.abs(rect1.getMaxY() - rect2.getMaxY()) < TOLERANCE;
|
||||
boolean intersects = rect1.intersects(rect2.getMinX() - TOLERANCE, rect2.getMinY() - TOLERANCE, rect2.getWidth() + 2 * TOLERANCE, rect2.getHeight() + 2 * TOLERANCE);
|
||||
boolean intersects = rect1.intersects(rect2.getMinX() - TOLERANCE, rect2.getMinY() - TOLERANCE, rect2.getWidth() + (2 * TOLERANCE), rect2.getHeight() + (2 * TOLERANCE));
|
||||
|
||||
if (intersects && (isAlignedX || isAlignedY)) {
|
||||
rectangleList.remove(currentIdx + 1);
|
||||
rectangleList.remove(currentIdx);
|
||||
@ -114,26 +118,29 @@ public class ImagePositionRetrievalService {
|
||||
int rotation = page.getRotation();
|
||||
double height = page.getPageHeight();
|
||||
double width = page.getPageWidth();
|
||||
//Even though the getBBox() method returns coordinates with (0,0) in the lower left corner, the OCRModule's addTextZonesForPage() wants to have its coordinates with (0,0) in the upper left corner
|
||||
|
||||
// Even though PDFTron almost always has the origin in the lower left corner, the OCRModule's addTextZonesForPage() uses the upper left corner as origin, so the y-axis is mirrored when mirrorY is set
|
||||
Matrix2D mirrorMatrix;
|
||||
if (mirrorY) {
|
||||
mirrorMatrix = new Matrix2D(1, 0, 0, -1, 0, height);
|
||||
} else {
|
||||
mirrorMatrix = new Matrix2D();
|
||||
}
|
||||
|
||||
// We need to rotate the rects to match the page rotation
|
||||
Matrix2D rotationMatrix = switch (rotation) {
|
||||
case 1 -> new Matrix2D(0, -1, 1, 0, 0, height);
|
||||
case 2 -> new Matrix2D(-1, 0, 0, -1, width, height);
|
||||
case 3 -> new Matrix2D(0, 1, -1, 0, width, 0);
|
||||
default -> new Matrix2D(1, 0, 0, 1, 0, 0);
|
||||
default -> new Matrix2D();
|
||||
};
|
||||
|
||||
Matrix2D finalMatrix = mirrorMatrix.multiply(rotationMatrix);
|
||||
|
||||
Point2D.Double p1 = finalMatrix.multPoint(bbox.getX1(), bbox.getY1());
|
||||
Point2D.Double p2 = finalMatrix.multPoint(bbox.getX2(), bbox.getY2());
|
||||
|
||||
//PDFTron Rect needs lower left and upper right coordinates to calculate width and height correctly
|
||||
// PDFTron Rect *needs* lower left and upper right coordinates to calculate width and height correctly, even though the documentation states otherwise
|
||||
Point2D.Double lowerLeft = new Point2D.Double(Math.min(p1.x, p2.x), Math.min(p1.y, p2.y));
|
||||
Point2D.Double upperRight = new Point2D.Double(Math.max(p1.x, p2.x), Math.max(p1.y, p2.y));
|
||||
|
||||
|
||||
@ -112,12 +112,15 @@ public class OCRService {
|
||||
.numberOfPagesToOCR(pageIdToRectCollection.size())
|
||||
.numberOfOCRedPages(numProcessedPages)
|
||||
.build()));
|
||||
|
||||
} catch (PDFNetException e) {
|
||||
log.error("failed to process page {}", pageId);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
ocrPageDoc.close();
|
||||
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
|
||||
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder()
|
||||
.fileId(fileId)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user