RED-6126: In the OCRService, OCR Text is not applied to Document

*complete refactor of the OCRService
*moved image position retrieval to new class instead of image service
*added new tests for image rotation
This commit is contained in:
Kilian Schuettler 2023-02-06 14:12:15 +01:00
parent 355887c865
commit a415224db5
15 changed files with 515 additions and 144 deletions

View File

@ -0,0 +1,170 @@
package com.iqser.red.service.ocr.v1.server.service;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.awt.geom.RectangularShape;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import org.springframework.stereotype.Service;
import com.pdftron.common.Matrix2D;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.Rect;
import com.pdftron.pdf.RectCollection;
import lombok.SneakyThrows;
@Service
public class ImagePositionRetrievalService {

    /**
     * Maximum deviation (in the same units as the rectangle coordinates) that is still treated as
     * "equal" when comparing rectangle edges, and the margin by which a rectangle is grown before
     * the intersection test.
     */
    private static final double TOLERANCE = 1e-1;


    /**
     * Iterates over all elements in a PDF document and retrieves the bounding box of each image.
     * The bounding boxes are then adjusted for the page rotation, and boxes that look like stripes
     * of one image are merged (see {@link #mergeOverlappingRects(RectCollection)}).
     * If the mirrorY flag is set, the Y coordinates are mirrored and moved up by the page height.
     * This is required for PDFTron's OCRModule.
     *
     * @param pdfDoc  a PDF file as PDFTron PDFDoc class
     * @param mirrorY if this flag is set, all coordinates are calculated with the upper left corner as (0,0), else initial user space
     * @return a map with the page indices (1-based) as keys and the image bounding boxes on that page as a RectCollection;
     * pages without images have no entry
     */
    @SneakyThrows
    public Map<Integer, RectCollection> getImagePositionPerPage(PDFDoc pdfDoc, boolean mirrorY) {

        Map<Integer, RectCollection> pageIdToImagePositions = new HashMap<>();
        ElementReader reader = new ElementReader();
        try {
            int pageCount = pdfDoc.getPageCount();
            for (int pageId = 1; pageId <= pageCount; ++pageId) {
                Page page = pdfDoc.getPage(pageId);
                RectCollection imagePositions = new RectCollection();

                reader.begin(page);
                try {
                    processElements(reader, imagePositions, page, mirrorY);
                } finally {
                    // always leave the page, even if reading one of its elements failed
                    reader.end();
                }

                imagePositions = mergeOverlappingRects(imagePositions);
                if (imagePositions.getNumRects() > 0) {
                    pageIdToImagePositions.put(pageId, imagePositions);
                }
            }
        } finally {
            // the reader wraps a native object, release it deterministically instead of waiting for the GC
            reader.destroy();
        }
        return pageIdToImagePositions;
    }


    /**
     * Reads all remaining elements of the current reader level and collects the rotation adjusted
     * bounding box of every (inline) image. Form XObjects are descended into recursively.
     * All other element types are ignored.
     */
    private void processElements(ElementReader reader, RectCollection imagePositions, Page currentPage, boolean mirrorY) throws PDFNetException {

        Element element;
        while ((element = reader.next()) != null) {
            switch (element.getType()) {
                case Element.e_image, Element.e_inline_image -> imagePositions.addRect(toRotationAdjustedRect(element.getBBox(), currentPage, mirrorY));
                case Element.e_form -> {
                    reader.formBegin();
                    processElements(reader, imagePositions, currentPage, mirrorY);
                    // closes the form level opened by formBegin()
                    reader.end();
                }
            }
        }
    }


    /**
     * Merges rectangles that touch or overlap (within {@link #TOLERANCE}) and share either both
     * vertical or both horizontal edges. Only rectangles that are neighbours after sorting by
     * lower Y, then lower X are compared with each other.
     *
     * @param imagePositions the rectangles to merge
     * @return the given collection itself if it holds fewer than two rectangles, otherwise a new
     * collection with the merged rectangles, sorted by lower Y, then lower X
     */
    @SneakyThrows
    public RectCollection mergeOverlappingRects(RectCollection imagePositions) {

        if (imagePositions.getNumRects() < 2) {
            return imagePositions;
        }
        List<Rectangle2D> sortedRectangles = toSortedRectangleList(imagePositions);
        return toRectCollection(mergeNeighbouringRectangles(sortedRectangles));
    }


    /**
     * Sometimes images are split up into stripes, here we try to merge the positions into one larger rectangle.
     * Walks the list once: the rectangle currently being built is united with its successor as long
     * as they are mergeable, otherwise it is emitted and the successor becomes the new current one.
     * This is done iteratively so the stack depth does not grow with the number of images on a page.
     */
    private List<Rectangle2D> mergeNeighbouringRectangles(List<Rectangle2D> rectangleList) {

        List<Rectangle2D> merged = new LinkedList<>();
        Rectangle2D current = null;
        for (Rectangle2D next : rectangleList) {
            if (current == null) {
                current = next;
            } else if (isMergeable(current, next)) {
                current = current.createUnion(next);
            } else {
                merged.add(current);
                current = next;
            }
        }
        if (current != null) {
            merged.add(current);
        }
        return merged;
    }


    /**
     * Two rectangles are merged if they intersect (rect2 is grown by the tolerance on every side)
     * and either their left and right edges or their lower and upper edges coincide within the tolerance.
     */
    private boolean isMergeable(Rectangle2D rect1, Rectangle2D rect2) {

        boolean isAlignedX = Math.abs(rect1.getMinX() - rect2.getMinX()) < TOLERANCE && Math.abs(rect1.getMaxX() - rect2.getMaxX()) < TOLERANCE;
        boolean isAlignedY = Math.abs(rect1.getMinY() - rect2.getMinY()) < TOLERANCE && Math.abs(rect1.getMaxY() - rect2.getMaxY()) < TOLERANCE;
        boolean intersects = rect1.intersects(rect2.getMinX() - TOLERANCE, rect2.getMinY() - TOLERANCE, rect2.getWidth() + 2 * TOLERANCE, rect2.getHeight() + 2 * TOLERANCE);
        return intersects && (isAlignedX || isAlignedY);
    }


    /**
     * Transforms an element bounding box according to the page rotation and, optionally, mirrors it on the Y axis.
     *
     * @param bbox    bounding box as returned by Element.getBBox()
     * @param page    the page the element is located on, provides rotation, width and height
     * @param mirrorY whether to flip the Y axis so that (0,0) is the upper left corner
     * @return a rectangle whose first point is the lower left and whose second point is the upper right corner
     */
    private Rect toRotationAdjustedRect(Rect bbox, Page page, boolean mirrorY) throws PDFNetException {

        int rotation = page.getRotation();
        double height = page.getPageHeight();
        double width = page.getPageWidth();

        // Even though the getBBox() method returns coordinates with (0,0) in the lower left corner,
        // the OCRModule's addTextZonesForPage() wants to have its coordinates with (0,0) in the upper left corner
        Matrix2D mirrorMatrix = mirrorY ? new Matrix2D(1, 0, 0, -1, 0, height) : new Matrix2D();

        // We need to rotate the rects to fit to the page rotation
        // NOTE(review): 1/2/3 presumably correspond to PDFTron's Page rotation constants for 90/180/270 degrees - confirm
        Matrix2D rotationMatrix = switch (rotation) {
            case 1 -> new Matrix2D(0, -1, 1, 0, 0, height);
            case 2 -> new Matrix2D(-1, 0, 0, -1, width, height);
            case 3 -> new Matrix2D(0, 1, -1, 0, width, 0);
            default -> new Matrix2D(1, 0, 0, 1, 0, 0);
        };

        Matrix2D finalMatrix = mirrorMatrix.multiply(rotationMatrix);
        Point2D.Double p1 = finalMatrix.multPoint(bbox.getX1(), bbox.getY1());
        Point2D.Double p2 = finalMatrix.multPoint(bbox.getX2(), bbox.getY2());

        // PDFTron Rect needs lower left and upper right coordinates to calculate width and height correctly,
        // the transformation may have swapped the corners
        return new Rect(Math.min(p1.x, p2.x), Math.min(p1.y, p2.y), Math.max(p1.x, p2.x), Math.max(p1.y, p2.y));
    }


    /**
     * Converts AWT rectangles back into a PDFTron RectCollection, keeping their order.
     * A PDFNetException is rethrown wrapped in a RuntimeException.
     */
    private RectCollection toRectCollection(List<Rectangle2D> rectangleList) {

        RectCollection rectCollection = new RectCollection();
        try {
            for (Rectangle2D r : rectangleList) {
                rectCollection.addRect(new Rect(r.getMinX(), r.getMinY(), r.getMaxX(), r.getMaxY()));
            }
        } catch (PDFNetException e) {
            throw new RuntimeException(e);
        }
        return rectCollection;
    }


    /**
     * Converts a RectCollection into AWT rectangles, sorted by lower Y first and lower X second,
     * so that stripes of the same image end up next to each other.
     */
    @SneakyThrows
    private List<Rectangle2D> toSortedRectangleList(RectCollection rectCollection) {

        List<Rectangle2D> list = new LinkedList<>();
        int numRects = rectCollection.getNumRects();
        for (int i = 0; i < numRects; ++i) {
            Rect r = rectCollection.getRectAt(i);
            list.add(new Rectangle2D.Double(r.getX1(), r.getY1(), r.getWidth(), r.getHeight()));
        }
        list.sort(Comparator.comparingDouble(RectangularShape::getMinY).thenComparingDouble(RectangularShape::getMinX));
        return list;
    }

}

View File

@ -1,12 +1,7 @@
package com.iqser.red.service.ocr.v1.server.service;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
@ -15,11 +10,7 @@ import org.springframework.stereotype.Service;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse;
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
import com.iqser.red.service.ocr.v1.server.model.ImagePosition;
import com.iqser.red.service.ocr.v1.server.model.image.ImageServiceResponse;
import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.OCRModule;
import com.pdftron.pdf.OCROptions;
@ -27,7 +18,6 @@ import com.pdftron.pdf.Optimizer;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.RectCollection;
import com.pdftron.pdf.TextExtractor;
import com.pdftron.sdf.SDFDoc;
import lombok.RequiredArgsConstructor;
@ -50,149 +40,93 @@ public class OCRService {
private final InvisibleElementRemovalService invisibleElementRemovalService;
private final ImagePositionRetrievalService imagePositionRetrievalService;
/**
* First loads the PDF Document from storage.
* Then removes all invisible Elements from the PDF, check InvisibleElementRemovalService for details.
* Then gets Image Position Information, check ImagePositionRetrievalService for details.
* Then runs OCR page by page, exclusively on pages which have images on them. It does so, by creating a new PDFDoc and inserting a single page at a time.
* This is because PDFTron OCROptions overlays all regions where OCR should not be run with white images. It does not check for empty pages.
* For Documents with many pages but few Images this results in major performance improvements.
* It then re-adds the OCRed Pages to the original document and saves it.
*
* @param dossierId The dossier id
* @param fileId The file id
* @return the resulting PDF file as an InputStream
*/
@SneakyThrows
public InputStream ocrDocument(String dossierId, String fileId) {
public InputStream runOcrOnDocument(String dossierId, String fileId) {
InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId);
ImageServiceResponse imageServiceResponse = fileStorageService.getImageServiceResponse(dossierId, fileId);
byte[] fileWithoutInvisibleTextBytes = invisibleElementRemovalService.removeInvisibleElements(fileStream, false);
byte[] ocrBytes = runOcrOnImages(fileWithoutInvisibleTextBytes, fileId, imageServiceResponse);
byte[] ocrBytes = runOcr(fileWithoutInvisibleTextBytes, fileId);
return new ByteArrayInputStream(ocrBytes);
}
private byte[] runOcrOnImages(byte[] file, String fileId, ImageServiceResponse imageServiceResponse) {
PDFDoc pdfDoc = null;
try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
pdfDoc = new PDFDoc(file);
Map<Integer, List<ImagePosition>> pageIdToImgPos = new HashMap<>();
imageServiceResponse.getData()
.forEach(imageMetadata -> pageIdToImgPos.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
.add(new ImagePosition(new Rectangle(new Point(imageMetadata.getPosition().getX1(), imageMetadata.getPosition().getY1()),
imageMetadata.getGeometry().getWidth(),
imageMetadata.getGeometry().getHeight(),
imageMetadata.getPosition().getPageNumber()), imageMetadata.isAlpha())));
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder().fileId(fileId).numberOfPagesToOCR(pageIdToImgPos.size()).build()));
// the PDFDoc is a helper document, which contains exactly one page
Map<Integer, PDFDoc> pageIdToOcrPageMap = runOcrPerPage(pdfDoc, fileId, pageIdToImgPos);
addOCRPagesToDocIfAdditionalWordsFound(pdfDoc, pageIdToOcrPageMap);
Optimizer.optimize(pdfDoc);
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
pdfDoc.close();
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder()
.fileId(fileId)
.numberOfPagesToOCR(pageIdToImgPos.size())
.numberOfOCRedPages(pageIdToOcrPageMap.size())
.ocrFinished(true)
.build()));
return out.toByteArray();
} catch (Exception e) {
throw new RuntimeException(e);
} finally {
if (pdfDoc != null) {
try {
pdfDoc.close();
} catch (Exception e) {
log.debug("Failed to close document", e);
}
}
}
}
@SneakyThrows
private Map<Integer, PDFDoc> runOcrPerPage(PDFDoc pdfDoc, String fileId, Map<Integer, List<ImagePosition>> pageIdToImgPosMap) {
private byte[] runOcr(byte[] file, String fileId) {
Map<Integer, PDFDoc> pageIdToOcrPageMap = Collections.synchronizedMap(new HashMap<>());
PDFDoc pdfDoc = new PDFDoc(file);
int numberOfRunPages = 0;
for (var pageIdToImgPos : pageIdToImgPosMap.entrySet()) {
Map<Integer, RectCollection> pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, true);
OCROptions options = new OCROptions();
PDFDoc ocrPageDoc = new PDFDoc();
int numProcessedPages = 0;
for (Integer pageId : pageIdToRectCollection.keySet()) {
try {
Integer pageIndex = pageIdToImgPos.getKey();
Page pdfPage = pdfDoc.getPageIterator(pageIndex).next();
pdfPage.setMediaBox(pdfPage.getCropBox());
RectCollection rectCollection = new RectCollection();
for (ImagePosition imagePosition : pageIdToImgPos.getValue()) {
Rectangle rectangle = imagePosition.getRectangle();
// Warning coordinate system is different in this call macOs/Linux
double y = -rectangle.getTopLeft().getY() + pdfPage.getCropBox().getY2() - rectangle.getHeight();
rectCollection.addRect(rectangle.getTopLeft().getX(), y, rectangle.getTopLeft().getX() + rectangle.getWidth(), y + rectangle.getHeight());
}
// technically a document, but it always contains exactly one page
PDFDoc ocrPage = new PDFDoc();
ocrPage.pagePushBack(pdfPage);
pageIdToOcrPageMap.put(pageIndex, ocrPage);
OCROptions options = new OCROptions();
options.addTextZonesForPage(rectCollection, 1);
// optimization by only scanning pages that contain images
Page pdfPage = pdfDoc.getPage(pageId);
pdfPage.setMediaBox(pdfPage.getCropBox()); // this line ensures the ocr text is placed correctly by PDFTron
ocrPageDoc.pagePushBack(pdfPage);
options.addTextZonesForPage(pageIdToRectCollection.get(pageId), 1);
options.addLang(ENGLISH);
options.addDPI(settings.getOcrDPI());
OCRModule.processPDF(ocrPage, options);
rectCollection.clear();
OCRModule.processPDF(ocrPageDoc, options);
++numProcessedPages;
} catch (Exception e) {
log.warn("Failed to process PDF page {}", pageIdToImgPos.getKey());
}
StringBuilder zonesString = new StringBuilder();
for (int j = 0; j < pageIdToRectCollection.get(pageId).getNumRects(); ++j) {
var r = pageIdToRectCollection.get(pageId).getRectAt(j);
zonesString.append(String.format("[lower left (%.1f|%.1f) upper right (%.1f|%.1f)]", r.getX1(), r.getY1(), r.getX2(), r.getY2()));
}
log.info("{}/{} Page {} done, OCR regions {}", numProcessedPages, pageIdToRectCollection.size(), pageId, zonesString);
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder()
.fileId(fileId)
.numberOfPagesToOCR(pageIdToImgPosMap.size())
.numberOfOCRedPages(++numberOfRunPages)
.build()));
// re-adding OCR pages
Page ocrPage = ocrPageDoc.getPage(1);
pdfDoc.pageInsert(pdfDoc.getPageIterator(pageId), ocrPage);
pdfDoc.pageRemove(pdfDoc.getPageIterator(pageId + 1));
ocrPageDoc.pageRemove(ocrPageDoc.getPageIterator(1));
log.warn("Done page {}", pageIdToImgPos);
}
return pageIdToOcrPageMap;
}
private void addOCRPagesToDocIfAdditionalWordsFound(PDFDoc pdfDoc, Map<Integer, PDFDoc> ocrDocPagesMap) throws PDFNetException {
for (var ocrDocPagesEntry : ocrDocPagesMap.entrySet()) {
int pageIndex = ocrDocPagesEntry.getKey();
Page ocrPage = ocrDocPagesEntry.getValue().getPage(1);
Page page = pdfDoc.getPage(pageIndex);
if (getWordCount(ocrPage) >= getWordCount(page)) {
pdfDoc.pageInsert(pdfDoc.getPageIterator(pageIndex), ocrPage);
pdfDoc.pageRemove(pdfDoc.getPageIterator(pageIndex + 1));
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder()
.fileId(fileId)
.numberOfPagesToOCR(pageIdToRectCollection.size())
.numberOfOCRedPages(numProcessedPages)
.build()));
} catch (PDFNetException e) {
log.error("failed to process page {}", pageId);
throw new RuntimeException(e);
}
}
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder()
.fileId(fileId)
.numberOfPagesToOCR(pageIdToRectCollection.size())
.numberOfOCRedPages(numProcessedPages)
.ocrFinished(true)
.build()));
Optimizer.optimize(pdfDoc);
return pdfDoc.save(SDFDoc.SaveMode.LINEARIZED, null);
}
private static int getWordCount(Page pdfPage) {
TextExtractor txt = new TextExtractor();
txt.begin(pdfPage);
return txt.getWordCount();
}
}

View File

@ -44,7 +44,7 @@ public class OcrMessageReceiver {
fileStorageService.storeUntouchedFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), originalFile);
}
var ocrResult = ocrService.ocrDocument(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
var ocrResult = ocrService.runOcrOnDocument(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
fileStorageService.storeOriginalFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), ocrResult);

View File

@ -9,6 +9,7 @@ import java.util.ArrayList;
import java.util.List;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
@ -32,6 +33,7 @@ import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService;
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType;
import com.iqser.red.storage.commons.StorageAutoConfiguration;
import com.iqser.red.storage.commons.service.StorageService;
import com.pdftron.pdf.OCRModule;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
@ -61,6 +63,53 @@ public class OcrServiceIntegrationTest {
private OCRService ocrService;
@BeforeEach
@SneakyThrows
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
public void assertOCRModuleIsLoaded() {
assert OCRModule.isModuleAvailable();
}
@Test
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
public void testOcr() {
// check visually for most of the images containing text, the resulting text is kind of nonsense, just ensure it is there
String text = testOCR("StitchedImagesMultiPage");
}
@Test
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
public void testManyRotatedImages() {
// check visually for most of the images containing text, the resulting text is kind of nonsense, just ensure it is there
String text = testOCR("manyRotatedImages");
assertThat(text).contains("Michela", "Gregori", "DVM", "PhD", "Pathologist");
}
@Test
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
public void testMergeImages() {
// check visually for most of the images containing text, the resulting text is kind of nonsense, just ensure it is there
String text = testOCR("merge_images");
assertThat(text).contains("Bodyweight change of dams with live young - group mean values",
"Control",
"mg/g day",
"10 mg/kg/day",
"20 mg/kg/",
"Days",
"50",
"-200",
"—250",
"150",
"200",
"250",
"—150");
}
@Test
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
public void testOCRWatermark() {
@ -82,21 +131,16 @@ public class OcrServiceIntegrationTest {
@SneakyThrows
private String testOCR(String fileName) {
ClassPathResource imageInfoResource = new ClassPathResource("files/" + fileName + ".IMAGE_INFO.json");
ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf");
var originId = FileStorageService.getStorageId("dossier", "file", FileType.ORIGIN);
storageService.storeObject(originId, pdfFileResource.getInputStream());
var imageId = FileStorageService.getStorageId("dossier", "file", FileType.IMAGE_INFO);
storageService.storeObject(imageId, imageInfoResource.getInputStream());
try (InputStream ocrDocument = ocrService.ocrDocument("dossier", "file")) {
try (InputStream ocrDocument = ocrService.runOcrOnDocument("dossier", "file")) {
byte[] ocrDocumentBytes = ocrDocument.readAllBytes();
try (var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
out.write(ocrDocumentBytes);
}
PDFDoc ocrDoc = new PDFDoc(ocrDocumentBytes);
TextExtractor extractor = new TextExtractor();
List<String> texts = new ArrayList<>();
PDFDoc pdfDoc = new PDFDoc(ocrDocumentBytes);

View File

@ -0,0 +1,228 @@
package com.iqser.red.service.ocr.v1.server.service;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import static org.assertj.core.api.Assertions.assertThat;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.context.annotation.Import;
import org.springframework.core.io.ClassPathResource;
import org.springframework.test.context.junit.jupiter.SpringExtension;
import com.iqser.red.service.ocr.v1.server.Application;
import com.iqser.red.service.ocr.v1.server.OcrServiceIntegrationTest;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.ColorPt;
import com.pdftron.pdf.ColorSpace;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementBuilder;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.Rect;
import com.pdftron.pdf.RectCollection;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
/**
 * Tests the image bounding box detection of {@link ImagePositionRetrievalService} on PDF files from
 * the test resources. Besides the assertions, every test writes a copy of the PDF with the detected
 * boxes drawn in red and a helper grid to the temporary directory for visual inspection.
 */
@ExtendWith(SpringExtension.class)
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
@Import(OcrServiceIntegrationTest.TestConfiguration.class)
class ImagePositionRetrievalServiceTest {

    @Autowired
    private ImagePositionRetrievalService imagePositionRetrievalService;

    @MockBean
    protected RabbitTemplate rabbitTemplate;


    @Test
    @SneakyThrows
    public void testImagePositionRetrievalForRotateTestFileWithImages() {

        String fileName = "RotateTestFileWithImages";
        List<int[]> allRectCoords = testImagePositionDetection(fileName);

        assertThat(allRectCoords).contains(new int[]{48, 572, 295, 721},
                new int[]{54, 279, 301, 428},
                new int[]{360, 173, 509, 419},
                new int[]{362, 522, 511, 768},
                new int[]{459, 354, 608, 600},
                new int[]{145, 404, 392, 553},
                new int[]{151, 111, 398, 260},
                new int[]{457, 5, 606, 251},
                new int[]{395, 480, 545, 726},
                new int[]{393, 130, 542, 377},
                new int[]{88, 236, 334, 386},
                new int[]{82, 530, 328, 679},
                new int[]{465, 11, 614, 257},
                new int[]{159, 117, 406, 266},
                new int[]{467, 360, 617, 607},
                new int[]{153, 410, 400, 559});
    }


    @Test
    @SneakyThrows
    public void testImagePositionRetrievalForRotateTestFileWithImagesExtremeCropbox() {

        String fileName = "RotateTestFileWithImagesExtremeCropbox";
        List<int[]> allRectCoords = testImagePositionDetection(fileName);

        assertThat(allRectCoords).contains(new int[]{48, 572, 295, 721},
                new int[]{362, 522, 511, 768},
                new int[]{360, 173, 509, 419},
                new int[]{54, 279, 301, 428},
                new int[]{145, 192, 392, 341},
                new int[]{459, 142, 608, 388},
                new int[]{457, -207, 606, 39},
                new int[]{151, -101, 398, 48},
                new int[]{-30, 238, 216, 387},
                new int[]{283, 188, 433, 434},
                new int[]{281, -162, 430, 85},
                new int[]{-24, -56, 222, 94},
                new int[]{-39, 410, 208, 559},
                new int[]{275, 360, 425, 607},
                new int[]{273, 11, 422, 257},
                new int[]{-33, 117, 214, 266});
    }


    @Test
    @SneakyThrows
    public void testMergeImages() {

        String fileName = "merge_images";
        List<int[]> allRectCoords = testImagePositionDetection(fileName);
        assertThat(allRectCoords).contains(new int[]{90, 284, 398, 770});
    }


    @Test
    @SneakyThrows
    public void testStitchedImagesMultiPage() {

        String fileName = "StitchedImagesMultiPage";
        List<int[]> allRectCoords = testImagePositionDetection(fileName);
        assertThat(allRectCoords.size()).isEqualTo(48);
    }


    /**
     * Runs the image position detection (without Y mirroring) on {@code files/<fileName>.pdf},
     * draws the detected boxes and a grid onto the pages, saves the result as
     * {@code <fileName>_IMAGE_BBOX.pdf} in the temporary directory and prints the boxes to stdout.
     *
     * @param fileName name of the test resource without the ".pdf" extension
     * @return the coordinates {x1, y1, x2, y2} of all detected boxes of all pages, rounded to the nearest int
     */
    @SneakyThrows
    private List<int[]> testImagePositionDetection(String fileName) throws IOException, PDFNetException {

        String resultFile = getTemporaryDirectory() + "/" + fileName + "_IMAGE_BBOX.pdf";

        // the stream is closed by try-with-resources, the document in the finally block, also if a step in between fails
        try (InputStream fileStream = new FileInputStream(new ClassPathResource("files/" + fileName + ".pdf").getFile().getAbsolutePath())) {
            PDFDoc pdfDoc = new PDFDoc(fileStream);
            try {
                Map<Integer, RectCollection> pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, false);

                ElementWriter writer = new ElementWriter();
                for (Map.Entry<Integer, RectCollection> entry : pageIdToRectCollection.entrySet()) {
                    Integer pageId = entry.getKey();
                    RectCollection rectCollection = entry.getValue();
                    Page page = pdfDoc.getPage(pageId);

                    writer.begin(page);
                    drawRectCollection(writer, rectCollection, page);
                    drawGrid(writer, page);
                    writer.end();

                    StringBuilder zonesString = new StringBuilder();
                    for (int j = 0; j < rectCollection.getNumRects(); ++j) {
                        var r = rectCollection.getRectAt(j);
                        zonesString.append(String.format("%d:[lower left (%.1f|%.1f) upper right (%.1f|%.1f)]; ", j, r.getX1(), r.getY1(), r.getX2(), r.getY2()));
                    }
                    System.out.printf("Page %d: rotation: %d OCR regions %s \n", pageId, page.getRotation(), zonesString);
                }

                // Check visually for red Rectangles to match images in the saved pdf file
                try (var out = new FileOutputStream(resultFile)) {
                    out.write(pdfDoc.save(SDFDoc.SaveMode.LINEARIZED, null));
                }
                System.out.println("result file: " + resultFile);

                // round all coords to nearest int to account for inconsistencies with the calculation of the bounding box;
                // done before the document is closed so no native object is read afterwards
                return pageIdToRectCollection.values().stream().map(this::toRoundedCoordinateArrayList).flatMap(List::stream).collect(Collectors.toList());
            } finally {
                pdfDoc.close();
            }
        }
    }


    /**
     * Converts every rectangle of the collection into an int array {x1, y1, x2, y2} with rounded coordinates.
     */
    @SneakyThrows
    private List<int[]> toRoundedCoordinateArrayList(RectCollection rectCollection) {

        int numRects = rectCollection.getNumRects();
        List<int[]> coords = new ArrayList<>(numRects);
        for (int i = 0; i < numRects; ++i) {
            var r = rectCollection.getRectAt(i);
            coords.add(new int[]{(int) Math.round(r.getX1()), (int) Math.round(r.getY1()), (int) Math.round(r.getX2()), (int) Math.round(r.getY2())});
        }
        return coords;
    }


    /**
     * Draws a faint grid of 15x15 cells over the page. The cell at the origin is filled blue so the
     * position of (0,0) can be identified in the result file.
     */
    @SneakyThrows
    private void drawGrid(ElementWriter writer, Page page) {

        ElementBuilder eb = new ElementBuilder();
        double dX = 15;
        double dY = 15;
        // one extra row/column so the grid also covers a partially filled last cell
        int nRows = (int) (page.getPageHeight() / dY) + 1;
        int nCols = (int) (page.getPageWidth() / dX) + 1;
        for (int row = 0; row < nRows; ++row) {
            for (int col = 0; col < nCols; ++col) {
                Element cell = eb.createRect(col * dX, row * dY, dX, dY);
                cell.setPathStroke(true);
                cell.getGState().setLineWidth(1);
                cell.getGState().setStrokeOpacity(0.1);
                cell.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
                boolean isOriginCell = row == 0 && col == 0;
                if (isOriginCell) {
                    cell.getGState().setStrokeColor(new ColorPt(0, 0, 1));
                    cell.setPathFill(true);
                    cell.getGState().setFillOpacity(0.8);
                    cell.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
                    cell.getGState().setFillColor(new ColorPt(0, 0, 1));
                } else {
                    cell.setPathFill(false);
                    cell.getGState().setStrokeColor(new ColorPt(0.1, 0.1, 0.1));
                }
                writer.writePlacedElement(cell);
            }
        }
    }


    /**
     * Draws every rectangle of the collection as a half transparent red box with a red border.
     *
     * @param writer         a writer that has already been started on the target page
     * @param rectCollection the rectangles to draw
     * @param page           the target page (currently not used by this method)
     */
    @SneakyThrows
    public static void drawRectCollection(ElementWriter writer, RectCollection rectCollection, Page page) {

        ColorPt colorPt = new ColorPt(1, 0, 0);
        ElementBuilder eb = new ElementBuilder();
        for (int i = 0; i < rectCollection.getNumRects(); ++i) {
            Rect r = rectCollection.getRectAt(i);
            Element rect = eb.createRect(r.getX1(), r.getY1(), r.getWidth(), r.getHeight());

            rect.setPathStroke(true);
            rect.getGState().setLineWidth(5);
            rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
            rect.getGState().setStrokeColor(colorPt);

            rect.setPathFill(true);
            rect.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
            rect.getGState().setFillColor(colorPt);
            rect.getGState().setFillOpacity(0.5);

            writer.writePlacedElement(rect);
        }
    }

}

View File

@ -1,4 +1,4 @@
package com.iqser.red.service.ocr.v1.server;
package com.iqser.red.service.ocr.v1.server.service;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
@ -17,7 +17,8 @@ import org.springframework.context.annotation.Import;
import org.springframework.core.io.ClassPathResource;
import org.springframework.test.context.junit.jupiter.SpringExtension;
import com.iqser.red.service.ocr.v1.server.service.InvisibleElementRemovalService;
import com.iqser.red.service.ocr.v1.server.Application;
import com.iqser.red.service.ocr.v1.server.OcrServiceIntegrationTest;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
@ -26,8 +27,7 @@ import com.pdftron.pdf.TextExtractor;
import lombok.SneakyThrows;
@ExtendWith(SpringExtension.class)
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT //
, properties = {"pdftron.ocrmodule.path=/YourOCRModulePath"})
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
@Import(OcrServiceIntegrationTest.TestConfiguration.class)
public class InvisibleElementRemovalServiceTest {

View File

@ -1 +0,0 @@
{"dossierId": "c8553cbd-409f-4e1a-baf4-34b11d49deac", "fileId": "bd6f93ed896dd0e2f641b0568f13ddf1", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": [{"classification": {"label": "other", "probabilities": {"other": 0.9999, "logo": 0.0001, "formula": 0.0, "signature": 0.0}}, "representation": "FFFFFEFBF7EFCFFFFFFFFFFFF", "position": {"x1": -3, "x2": 795, "y1": 0, "y2": 612, "pageNumber": 1}, "geometry": {"width": 798, "height": 612}, "alpha": false, "filters": {"geometry": {"imageSize": {"quotient": 1.0038, "tooLarge": true, "tooSmall": false}, "imageFormat": {"quotient": 1.3039, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "logo", "probabilities": {"logo": 0.9966, "other": 0.0025, "signature": 0.0005, "formula": 0.0003}}, "representation": "FFC33D3C323CCF3390C1F8C72", "position": {"x1": 120, "x2": 131, "y1": 264, "y2": 380, "pageNumber": 1}, "geometry": {"width": 11, "height": 116}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0513, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.0948, "tooTall": true, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "other", "probabilities": {"other": 1.0, "formula": 0.0, "logo": 0.0, "signature": 0.0}}, "representation": "EF8FF6381060800318F0E187", "position": {"x1": 152, "x2": 205, "y1": 115, "y2": 533, "pageNumber": 1}, "geometry": {"width": 53, "height": 418}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.2138, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.1268, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": true}}, {"classification": {"label": "other", "probabilities": {"other": 0.5967, "logo": 0.1756, "signature": 0.1218, "formula": 0.106}}, "representation": "EC9377C9170E1070C3070C30F", "position": {"x1": 
196, "x2": 221, "y1": 121, "y2": 245, "pageNumber": 1}, "geometry": {"width": 25, "height": 124}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.08, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.2016, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": true}}, {"classification": {"label": "logo", "probabilities": {"logo": 1.0, "formula": 0.0, "other": 0.0, "signature": 0.0}}, "representation": "CF1F1F70F140036860F1441B5", "position": {"x1": 707, "x2": 718, "y1": 481, "y2": 531, "pageNumber": 1}, "geometry": {"width": 11, "height": 50}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0337, "tooLarge": false, "tooSmall": true}, "imageFormat": {"quotient": 0.22, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "logo", "probabilities": {"logo": 0.9704, "other": 0.0223, "formula": 0.0044, "signature": 0.0029}}, "representation": "CF0F1C70F1C7090081F7CF073", "position": {"x1": 732, "x2": 744, "y1": 118, "y2": 241, "pageNumber": 1}, "geometry": {"width": 12, "height": 123}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0552, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.0976, "tooTall": true, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}], "dataCV": []}

View File

@ -1 +0,0 @@
{"dossierId": "55547c91-6b0e-4aa6-9009-2e7c4cd90f13", "fileId": "917b9d9c9f548f85fef3679db45ff46c", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": [{"classification": {"label": "signature", "probabilities": {"signature": 0.9927, "logo": 0.0038, "other": 0.0034, "formula": 0.0}}, "representation": "FFF2CF0F7C74FFC1070830FFF", "position": {"x1": -7, "x2": 603, "y1": 0, "y2": 852, "pageNumber": 1}, "geometry": {"width": 610, "height": 852}, "alpha": false, "filters": {"geometry": {"imageSize": {"quotient": 1.0096, "tooLarge": true, "tooSmall": false}, "imageFormat": {"quotient": 0.716, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}], "dataCV": []}

View File

@ -1 +0,0 @@
{"dossierId": "c8553cbd-409f-4e1a-baf4-34b11d49deac", "fileId": "bd6f93ed896dd0e2f641b0568f13ddf1", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": [{"classification": {"label": "other", "probabilities": {"other": 0.9999, "logo": 0.0001, "formula": 0.0, "signature": 0.0}}, "representation": "FFFFFEFBF7EFCFFFFFFFFFFFF", "position": {"x1": -3, "x2": 795, "y1": 0, "y2": 612, "pageNumber": 1}, "geometry": {"width": 798, "height": 612}, "alpha": false, "filters": {"geometry": {"imageSize": {"quotient": 1.0038, "tooLarge": true, "tooSmall": false}, "imageFormat": {"quotient": 1.3039, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "logo", "probabilities": {"logo": 0.9966, "other": 0.0025, "signature": 0.0005, "formula": 0.0003}}, "representation": "FFC33D3C323CCF3390C1F8C72", "position": {"x1": 120, "x2": 131, "y1": 264, "y2": 380, "pageNumber": 1}, "geometry": {"width": 11, "height": 116}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0513, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.0948, "tooTall": true, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "other", "probabilities": {"other": 1.0, "formula": 0.0, "logo": 0.0, "signature": 0.0}}, "representation": "EF8FF6381060800318F0E187", "position": {"x1": 152, "x2": 205, "y1": 115, "y2": 533, "pageNumber": 1}, "geometry": {"width": 53, "height": 418}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.2138, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.1268, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": true}}, {"classification": {"label": "other", "probabilities": {"other": 0.5967, "logo": 0.1756, "signature": 0.1218, "formula": 0.106}}, "representation": "EC9377C9170E1070C3070C30F", "position": {"x1": 
196, "x2": 221, "y1": 121, "y2": 245, "pageNumber": 1}, "geometry": {"width": 25, "height": 124}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.08, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.2016, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": true}}, {"classification": {"label": "logo", "probabilities": {"logo": 1.0, "formula": 0.0, "other": 0.0, "signature": 0.0}}, "representation": "CF1F1F70F140036860F1441B5", "position": {"x1": 707, "x2": 718, "y1": 481, "y2": 531, "pageNumber": 1}, "geometry": {"width": 11, "height": 50}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0337, "tooLarge": false, "tooSmall": true}, "imageFormat": {"quotient": 0.22, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "logo", "probabilities": {"logo": 0.9704, "other": 0.0223, "formula": 0.0044, "signature": 0.0029}}, "representation": "CF0F1C70F1C7090081F7CF073", "position": {"x1": 732, "x2": 744, "y1": 118, "y2": 241, "pageNumber": 1}, "geometry": {"width": 12, "height": 123}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0552, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.0976, "tooTall": true, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}], "dataCV": []}

View File

@ -1 +0,0 @@
{"dossierId": "55547c91-6b0e-4aa6-9009-2e7c4cd90f13", "fileId": "d7f1e0e37cba4e28ebdf894a79d3bd67", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": [{"classification": {"label": "signature", "probabilities": {"signature": 0.9872, "logo": 0.0064, "other": 0.0063, "formula": 0.0001}}, "representation": "FFFCF10608F6F89747BFFC301", "position": {"x1": -9, "x2": 584, "y1": 9, "y2": 849, "pageNumber": 1}, "geometry": {"width": 593, "height": 840}, "alpha": false, "filters": {"geometry": {"imageSize": {"quotient": 0.9992, "tooLarge": true, "tooSmall": false}, "imageFormat": {"quotient": 0.706, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}], "dataCV": []}

View File

@ -1 +0,0 @@
{"dossierId": "55547c91-6b0e-4aa6-9009-2e7c4cd90f13", "fileId": "32b19ec38896f5105c09041def470c90", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": [{"classification": {"label": "logo", "probabilities": {"logo": 0.9999, "signature": 0.0001, "formula": 0.0, "other": 0.0}}, "representation": "307EF8F6E9833CE9D7AF9EFFF", "position": {"x1": 26, "x2": 586, "y1": -2, "y2": 794, "pageNumber": 1}, "geometry": {"width": 560, "height": 796}, "alpha": false, "filters": {"geometry": {"imageSize": {"quotient": 0.959, "tooLarge": true, "tooSmall": false}, "imageFormat": {"quotient": 0.7035, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "logo", "probabilities": {"logo": 1.0, "formula": 0.0, "other": 0.0, "signature": 0.0}}, "representation": "FFF7FFD2000000018F3FFEFFF", "position": {"x1": 90, "x2": 210, "y1": 676, "y2": 720, "pageNumber": 1}, "geometry": {"width": 120, "height": 44}, "alpha": false, "filters": {"geometry": {"imageSize": {"quotient": 0.1044, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 2.7273, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": true}}], "dataCV": []}