Pull request #5: RED-6126

Merge in RED/ocr-service from RED-6126 to master

* commit '00cfe9e44948c153857ad59442dbc9349e1d4555':
  RED-6126: In the OCRService, OCR Text is not applied to Document *reformatted InvisibleElementRemovalService with new Code Style
  RED-6126: In the OCRService, OCR Text is not applied to Document *updated some comments *very slight refactor
  RED-6126: In the OCRService, OCR Text is not applied to Document *complete refactor of the OCRService *moved image position retrieval to new class instead of image service *added new tests for image rotation
  RED-6126: In the OCRService, OCR Text is not applied to Document *removed private configuration
  RED-6126: In the OCRService, OCR Text is not applied to Document *formatted one line
  RED-6126: In the OCRService, OCR Text is not applied to Document *reverted application of OCR Text to Document to old state *refactored OCR Service slightly *added meaningful test cases
This commit is contained in:
Kilian Schuettler 2023-02-07 13:35:32 +01:00 committed by Dominique Eiflaender
commit b0a658213d
16 changed files with 580 additions and 134 deletions

View File

@ -0,0 +1,177 @@
package com.iqser.red.service.ocr.v1.server.service;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.awt.geom.RectangularShape;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import org.springframework.stereotype.Service;
import com.pdftron.common.Matrix2D;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.Rect;
import com.pdftron.pdf.RectCollection;
import lombok.SneakyThrows;
@Service
public class ImagePositionRetrievalService {
private static final double TOLERANCE = 1e-1;
/**
 * Iterates over all elements of every page in a PDF document and collects the bounding box of each image.
 * The bounding boxes are adjusted for the page rotation, and aligned, intersecting boxes on a page are merged.
 * If the mirrorY flag is set, the Y coordinates are mirrored and moved up by the page height.
 * This is required for PDFTron's OCRModule.
 * Pages on which no image was found are not present in the returned map.
 *
 * @param pdfDoc  a PDF file as PDFTron PDFDoc class
 * @param mirrorY if this flag is set, all coordinates are calculated with the upper left corner as (0,0), else initial user space
 * @return a map with the 1-based page numbers as keys and the image bounding boxes on that page as a RectCollection
 */
@SneakyThrows
public Map<Integer, RectCollection> getImagePositionPerPage(PDFDoc pdfDoc, boolean mirrorY) {

    Map<Integer, RectCollection> pageIdToImagePositions = new HashMap<>();
    ElementReader reader = new ElementReader();
    try {
        int pageCount = pdfDoc.getPageCount();
        for (int pageId = 1; pageId <= pageCount; ++pageId) {
            // fetch the page once and reuse it for reading and for the rotation adjustment
            Page page = pdfDoc.getPage(pageId);
            RectCollection imagePositions = new RectCollection();

            reader.begin(page);
            findImagePositionsOnPage(reader, imagePositions, page, mirrorY);
            reader.end();

            imagePositions = mergeOverlappingRects(imagePositions);
            if (imagePositions.getNumRects() > 0) {
                pageIdToImagePositions.put(pageId, imagePositions);
            }
        }
    } finally {
        // release the native reader deterministically, also when reading a page fails
        reader.destroy();
    }
    return pageIdToImagePositions;
}
/**
 * Consumes the remaining elements of the reader's current element stream and adds the rotation-adjusted
 * bounding box of every image and inline image to {@code imagePositions}.
 * Form elements are entered and processed recursively, so images nested inside forms are collected as well.
 *
 * @param reader         the reader positioned on the element stream to walk
 * @param imagePositions the collection the found bounding boxes are appended to
 * @param currentPage    the page being read, used for the rotation adjustment
 * @param mirrorY        passed through to the rotation adjustment of each bounding box
 * @throws PDFNetException if PDFTron fails to read an element
 */
private void findImagePositionsOnPage(ElementReader reader, RectCollection imagePositions, Page currentPage, boolean mirrorY) throws PDFNetException {

    for (Element current = reader.next(); current != null; current = reader.next()) {
        int type = current.getType();
        if (type == Element.e_image || type == Element.e_inline_image) {
            Rect adjusted = toRotationAdjustedRect(current.getBBox(), currentPage, mirrorY);
            imagePositions.addRect(adjusted);
        } else if (type == Element.e_form) {
            // descend into the form's own element stream, then return to the enclosing one
            reader.formBegin();
            findImagePositionsOnPage(reader, imagePositions, currentPage, mirrorY);
            reader.end();
        }
    }
}
/**
 * Merges aligned and intersecting rectangles of the given collection into larger rectangles.
 * A collection holding exactly one rectangle is returned as is; otherwise a new collection is built
 * from the sorted and merged rectangles.
 *
 * @param imagePositions the rectangles to merge
 * @return the given collection if it holds exactly one rectangle, else a new collection with the merged rectangles
 */
@SneakyThrows
public RectCollection mergeOverlappingRects(RectCollection imagePositions) {

    boolean hasSingleRect = imagePositions.getNumRects() == 1;
    if (!hasSingleRect) {
        List<Rectangle2D> sorted = toSortedRectangleList(imagePositions);
        List<Rectangle2D> merged = mergeRectangleListRecursive(sorted, 0);
        return toRectCollection(merged);
    }
    return imagePositions;
}
/**
 * Sometimes images are split up into stripes, here we merge the positions of aligned and intersecting
 * rectangles into one larger rectangle.
 * <p>
 * Starting at {@code currentIdx}, each rectangle is compared with its successor in the list. If both
 * intersect (within TOLERANCE) and share either their X extent or their Y extent (within TOLERANCE),
 * the pair is replaced by its union and the union is compared with the next successor.
 * The list is modified in place.
 * <p>
 * Implemented as a loop: the former recursive version needed one stack frame per list element and could
 * overflow the stack on pages with very many image stripes.
 *
 * @param rectangleList the rectangles to merge, expected to be sorted so that stripes are neighbours; modified in place
 * @param currentIdx    the index of the first rectangle to consider
 * @return the same list instance, with neighbouring aligned and intersecting rectangles merged
 */
private List<Rectangle2D> mergeRectangleListRecursive(List<Rectangle2D> rectangleList, int currentIdx) {

    int idx = currentIdx;
    while (idx + 1 < rectangleList.size()) {
        Rectangle2D rect1 = rectangleList.get(idx);
        Rectangle2D rect2 = rectangleList.get(idx + 1);

        boolean isAlignedX = Math.abs(rect1.getMinX() - rect2.getMinX()) < TOLERANCE && Math.abs(rect1.getMaxX() - rect2.getMaxX()) < TOLERANCE;
        boolean isAlignedY = Math.abs(rect1.getMinY() - rect2.getMinY()) < TOLERANCE && Math.abs(rect1.getMaxY() - rect2.getMaxY()) < TOLERANCE;
        // grow rect2 by TOLERANCE on every side so that rectangles which merely touch count as intersecting
        boolean intersects = rect1.intersects(rect2.getMinX() - TOLERANCE, rect2.getMinY() - TOLERANCE, rect2.getWidth() + (2 * TOLERANCE), rect2.getHeight() + (2 * TOLERANCE));

        if (intersects && (isAlignedX || isAlignedY)) {
            // replace the pair by its union and stay on this index to try merging the union with the next rectangle
            rectangleList.set(idx, rect1.createUnion(rect2));
            rectangleList.remove(idx + 1);
        } else {
            ++idx;
        }
    }
    return rectangleList;
}
/**
 * Transforms an image bounding box so that it fits the rotation of the page and, optionally, mirrors it
 * along the Y axis.
 *
 * @param bbox    the bounding box as reported by the element
 * @param page    the page the element is on; its rotation, width and height define the transformation
 * @param mirrorY if set, the Y coordinates are mirrored and moved up by the page height
 * @return a new Rect given by the lower left and upper right corner of the transformed bounding box
 * @throws PDFNetException if PDFTron fails to read the page or to apply the transformation
 */
private Rect toRotationAdjustedRect(Rect bbox, Page page, boolean mirrorY) throws PDFNetException {

    int pageRotation = page.getRotation();
    double pageHeight = page.getPageHeight();
    double pageWidth = page.getPageWidth();

    // Even though PDFTron almost always has the origin in the lower left corner, for some reason, the OCRModule's addTextZonesForPage() uses the upper left corner as origin...
    Matrix2D mirroring = mirrorY ? new Matrix2D(1, 0, 0, -1, 0, pageHeight) : new Matrix2D();

    // We need to rotate the rects to fit to the page rotation
    Matrix2D rotating = switch (pageRotation) {
        case Page.e_90 -> new Matrix2D(0, -1, 1, 0, 0, pageHeight);
        case Page.e_180 -> new Matrix2D(-1, 0, 0, -1, pageWidth, pageHeight);
        case Page.e_270 -> new Matrix2D(0, 1, -1, 0, pageWidth, 0);
        default -> new Matrix2D();
    };

    Matrix2D transformation = mirroring.multiply(rotating);
    Point2D.Double cornerA = transformation.multPoint(bbox.getX1(), bbox.getY1());
    Point2D.Double cornerB = transformation.multPoint(bbox.getX2(), bbox.getY2());

    // PDFTron Rect *needs* lower left and upper right coordinates to calculate width and height correctly, even though the documentation states otherwise
    double lowerLeftX = Math.min(cornerA.x, cornerB.x);
    double lowerLeftY = Math.min(cornerA.y, cornerB.y);
    double upperRightX = Math.max(cornerA.x, cornerB.x);
    double upperRightY = Math.max(cornerA.y, cornerB.y);
    return new Rect(lowerLeftX, lowerLeftY, upperRightX, upperRightY);
}
/**
 * Converts a list of AWT rectangles into a PDFTron RectCollection, keeping the order of the list.
 *
 * @param rectangleList the rectangles to convert
 * @return a new RectCollection with one Rect (min corner to max corner) per rectangle
 * @throws RuntimeException wrapping the PDFNetException if a Rect cannot be created or added
 */
private RectCollection toRectCollection(List<Rectangle2D> rectangleList) {

    RectCollection converted = new RectCollection();
    for (Rectangle2D rectangle : rectangleList) {
        try {
            Rect rect = new Rect(rectangle.getMinX(), rectangle.getMinY(), rectangle.getMaxX(), rectangle.getMaxY());
            converted.addRect(rect);
        } catch (PDFNetException e) {
            throw new RuntimeException(e);
        }
    }
    return converted;
}
/**
 * Converts a PDFTron RectCollection into a list of AWT rectangles, sorted ascending by their minimal Y
 * coordinate and, for equal Y, by their minimal X coordinate.
 *
 * @param rectCollection the rectangles to convert
 * @return a new, sorted, mutable list with one Rectangle2D per Rect of the collection
 */
@SneakyThrows
private List<Rectangle2D> toSortedRectangleList(RectCollection rectCollection) {

    List<Rectangle2D> rectangles = new LinkedList<>();
    int rectCount = rectCollection.getNumRects();
    for (int idx = 0; idx < rectCount; ++idx) {
        Rect rect = rectCollection.getRectAt(idx);
        rectangles.add(new Rectangle2D.Double(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight()));
    }

    Comparator<Rectangle2D> byMinY = Comparator.comparingDouble(Rectangle2D::getMinY);
    rectangles.sort(byMinY.thenComparingDouble(Rectangle2D::getMinX));
    return rectangles;
}
}

View File

@ -423,12 +423,13 @@ public class InvisibleElementRemovalService {
@Builder
private record InvisibleElementRemovalContext(boolean delta, //
ElementReader reader, //
ClippingPathStack clippingPathStack, //
List<ElementFeatures> overlappedElements, //
List<ElementFeatures> visibleElements, //
Set<Long> visitedXObjIds) {
private record InvisibleElementRemovalContext(
boolean delta,
ElementReader reader,
ClippingPathStack clippingPathStack,
List<ElementFeatures> overlappedElements,
List<ElementFeatures> visibleElements,
Set<Long> visitedXObjIds) {
}

View File

@ -1,12 +1,7 @@
package com.iqser.red.service.ocr.v1.server.service;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
@ -15,11 +10,8 @@ import org.springframework.stereotype.Service;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse;
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
import com.iqser.red.service.ocr.v1.server.model.ImagePosition;
import com.iqser.red.service.ocr.v1.server.model.image.ImageServiceResponse;
import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.OCRModule;
import com.pdftron.pdf.OCROptions;
import com.pdftron.pdf.Optimizer;
@ -48,118 +40,96 @@ public class OCRService {
private final InvisibleElementRemovalService invisibleElementRemovalService;
private final ImagePositionRetrievalService imagePositionRetrievalService;
/**
* First loads the PDF Document from storage.
* Then removes all invisible Elements from the PDF, check InvisibleElementRemovalService for details.
* Then gets Image Position Information, check ImagePositionRetrievalService for details.
* Then runs OCR page by page, exclusively on pages which have images on them. It does so, by creating a new PDFDoc and inserting a single page at a time.
* This is because PDFTron OCROptions overlays all regions where OCR should not be run with white images. It does not check for empty pages.
* For Documents with many pages but few Images this results in major performance improvements.
* It then re-adds the OCRed Pages to the original document and saves it.
*
* @param dossierId The dossier id
* @param fileId The file id
* @return the resulting PDF file as an InputStream
*/
@SneakyThrows
public InputStream ocrDocument(String dossierId, String fileId) {
public InputStream runOcrOnDocument(String dossierId, String fileId) {
InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId);
ImageServiceResponse imageServiceResponse = fileStorageService.getImageServiceResponse(dossierId, fileId);
byte[] fileWithoutInvisibleTextStream = invisibleElementRemovalService.removeInvisibleElements(fileStream, false);
byte[] fileWithoutInvisibleTextBytes = invisibleElementRemovalService.removeInvisibleElements(fileStream, false);
byte[] ocrBytes = ocr(fileWithoutInvisibleTextStream, fileId, imageServiceResponse);
byte[] ocrBytes = runOcr(fileWithoutInvisibleTextBytes, fileId);
return new ByteArrayInputStream(ocrBytes);
}
private byte[] ocr(byte[] file, String fileId, ImageServiceResponse imageServiceResponse) {
PDFDoc pdfDoc = null;
try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
pdfDoc = new PDFDoc(file);
Map<Integer, List<ImagePosition>> pages = new HashMap<>();
imageServiceResponse.getData()
.forEach(imageMetadata -> pages.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
.add(new ImagePosition(new Rectangle(new Point(imageMetadata.getPosition().getX1(), imageMetadata.getPosition().getY1()),
imageMetadata.getGeometry().getWidth(),
imageMetadata.getGeometry().getHeight(),
imageMetadata.getPosition().getPageNumber()), imageMetadata.isAlpha())));
Map<Integer, PDFDoc> pdfDocMap = Collections.synchronizedMap(new HashMap<>());
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder().fileId(fileId).numberOfPagesToOCR(pages.keySet().size()).build()));
ocrPages(pdfDoc, fileId, pages, pdfDocMap);
Optimizer.optimize(pdfDoc);
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
pdfDoc.close();
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder()
.fileId(fileId)
.numberOfPagesToOCR(pages.keySet().size())
.numberOfOCRedPages(pages.keySet().size())
.ocrFinished(true)
.build()));
return out.toByteArray();
} catch (Exception e) {
throw new RuntimeException(e);
} finally {
if (pdfDoc != null) {
try {
pdfDoc.close();
} catch (Exception e) {
log.debug("Failed to close document", e);
}
}
}
}
@SneakyThrows
private void ocrPages(PDFDoc pdfDoc, String fileId, Map<Integer, List<ImagePosition>> pages, Map<Integer, PDFDoc> pdfDocMap) {
private byte[] runOcr(byte[] file, String fileId) {
int numberOfOCRedPages = 0;
for (var pageEntry : pages.entrySet()) {
PDFDoc pdfDoc = new PDFDoc(file);
Map<Integer, RectCollection> pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, true);
OCROptions options = new OCROptions();
PDFDoc ocrPageDoc = new PDFDoc();
int numProcessedPages = 0;
for (Integer pageId : pageIdToRectCollection.keySet()) {
try {
RectCollection rectCollection = new RectCollection();
var page = pageEntry.getKey();
Page pdfPage = pdfDoc.getPageIterator(page).next();
pdfPage.setMediaBox(pdfPage.getCropBox());
for (ImagePosition imagePosition : pageEntry.getValue()) {
Rectangle rectangle = imagePosition.getRectangle();
// Warning coordinate system is different in this call macOs/Linux
double y = -rectangle.getTopLeft().getY() + pdfPage.getCropBox().getY2() - rectangle.getHeight();
rectCollection.addRect(rectangle.getTopLeft().getX(), y, rectangle.getTopLeft().getX() + rectangle.getWidth(), y + rectangle.getHeight());
}
PDFDoc ocrDoc = new PDFDoc();
ocrDoc.pagePushBack(pdfPage);
pdfDocMap.put(pageEntry.getKey(), ocrDoc);
OCROptions options = new OCROptions();
options.addTextZonesForPage(rectCollection, 1);
// optimization by only scanning pages that contain images
Page pdfPage = pdfDoc.getPage(pageId);
pdfPage.setMediaBox(pdfPage.getCropBox()); // this line ensures the ocr text is placed correctly by PDFTron
ocrPageDoc.pagePushBack(pdfPage);
options.addTextZonesForPage(pageIdToRectCollection.get(pageId), 1);
options.addLang(ENGLISH);
options.addDPI(settings.getOcrDPI());
OCRModule.processPDF(ocrDoc, options);
rectCollection.clear();
OCRModule.processPDF(ocrPageDoc, options);
++numProcessedPages;
} catch (Exception e) {
log.warn("Failed to process PDF page {}", pageEntry.getKey());
StringBuilder zonesString = new StringBuilder();
for (int j = 0; j < pageIdToRectCollection.get(pageId).getNumRects(); ++j) {
var r = pageIdToRectCollection.get(pageId).getRectAt(j);
zonesString.append(String.format("[lower left (%.1f|%.1f) upper right (%.1f|%.1f)]", r.getX1(), r.getY1(), r.getX2(), r.getY2()));
}
log.info("{}/{} Page {} done, OCR regions {}", numProcessedPages, pageIdToRectCollection.size(), pageId, zonesString);
// re-adding OCR pages
Page ocrPage = ocrPageDoc.getPage(1);
pdfDoc.pageInsert(pdfDoc.getPageIterator(pageId), ocrPage);
pdfDoc.pageRemove(pdfDoc.getPageIterator(pageId + 1));
ocrPageDoc.pageRemove(ocrPageDoc.getPageIterator(1));
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder()
.fileId(fileId)
.numberOfPagesToOCR(pageIdToRectCollection.size())
.numberOfOCRedPages(numProcessedPages)
.build()));
} catch (PDFNetException e) {
log.error("failed to process page {}", pageId);
throw new RuntimeException(e);
}
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder()
.fileId(fileId)
.numberOfPagesToOCR(pages.keySet().size())
.numberOfOCRedPages(++numberOfOCRedPages)
.build()));
log.warn("Done page {}", pageEntry);
}
ocrPageDoc.close();
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder()
.fileId(fileId)
.numberOfPagesToOCR(pageIdToRectCollection.size())
.numberOfOCRedPages(numProcessedPages)
.ocrFinished(true)
.build()));
Optimizer.optimize(pdfDoc);
return pdfDoc.save(SDFDoc.SaveMode.LINEARIZED, null);
}
}

View File

@ -44,7 +44,7 @@ public class OcrMessageReceiver {
fileStorageService.storeUntouchedFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), originalFile);
}
var ocrResult = ocrService.ocrDocument(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
var ocrResult = ocrService.runOcrOnDocument(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
fileStorageService.storeOriginalFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), ocrResult);

View File

@ -3,11 +3,13 @@ package com.iqser.red.service.ocr.v1.server;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import static org.assertj.core.api.Assertions.assertThat;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
@ -25,12 +27,17 @@ import org.springframework.core.io.ClassPathResource;
import org.springframework.test.context.junit.jupiter.SpringExtension;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService;
import com.iqser.red.service.ocr.v1.server.service.OCRService;
import com.iqser.red.service.ocr.v1.server.service.FileStorageService;
import com.iqser.red.service.ocr.v1.server.service.OCRService;
import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService;
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType;
import com.iqser.red.storage.commons.StorageAutoConfiguration;
import com.iqser.red.storage.commons.service.StorageService;
import com.pdftron.pdf.OCRModule;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
import com.pdftron.pdf.TextExtractor;
import lombok.SneakyThrows;
@ -56,28 +63,96 @@ public class OcrServiceIntegrationTest {
private OCRService ocrService;
@BeforeEach
@SneakyThrows
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
public void assertOCRModuleIsLoaded() {
assert OCRModule.isModuleAvailable();
}
@Test
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
public void testOcr() {
// check visually for most of the images containing text, the resulting text is kind of nonsense, just ensure it is there
String text = testOCR("StitchedImagesMultiPage");
}
@Test
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
public void testManyRotatedImages() {
// check visually for most of the images containing text, the resulting text is kind of nonsense, just ensure it is there
String text = testOCR("manyRotatedImages");
assertThat(text).contains("Michela", "Gregori", "DVM", "PhD", "Pathologist");
}
@Test
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
public void testMergeImages() {
// check visually for most of the images containing text, the resulting text is kind of nonsense, just ensure it is there
String text = testOCR("merge_images");
assertThat(text).contains("Bodyweight change of dams with live young - group mean values",
"Control",
"mg/g day",
"10 mg/kg/day",
"20 mg/kg/",
"Days",
"50",
"-200",
"—250",
"150",
"200",
"250",
"—150");
}
@Test
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
public void testOCRWatermark() {
assertThat(testOCR("Watermark")).contains("syngenta");
}
@Test
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
public void testOCRInvisibleText() {
String text = testOCR("InvisibleText");
assertThat(text).contains("Michela", "Gregori", "DVM", "PhD", "Pathologist", "SIGNATURE PAGE");
assertThat(text).doesNotContain("COMPLETION DATE:", "LABORATORY PROJECT ID:", "AUTHOR(S):", "Substance");
}
@SneakyThrows
public void testOCR() {
private String testOCR(String fileName) {
String fileName = "Watermark";
ClassPathResource imageInfoResource = new ClassPathResource("files/" + fileName + ".IMAGE_INFO.json");
ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf");
var originId = FileStorageService.getStorageId("dossier", "file", FileType.ORIGIN);
storageService.storeObject(originId, pdfFileResource.getInputStream());
var imageId = FileStorageService.getStorageId("dossier", "file", FileType.IMAGE_INFO);
storageService.storeObject(imageId, imageInfoResource.getInputStream());
var response = ocrService.ocrDocument("dossier", "file");
var out = FileUtils.openOutputStream(new File(getTemporaryDirectory() + "/" + fileName + ".pdf"));
IOUtils.copy(response, out);
System.out.println("File:" + getTemporaryDirectory() + "/" + fileName + ".pdf");
try (InputStream ocrDocument = ocrService.runOcrOnDocument("dossier", "file")) {
byte[] ocrDocumentBytes = ocrDocument.readAllBytes();
try (var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
out.write(ocrDocumentBytes);
}
TextExtractor extractor = new TextExtractor();
List<String> texts = new ArrayList<>();
PDFDoc pdfDoc = new PDFDoc(ocrDocumentBytes);
PageIterator iterator = pdfDoc.getPageIterator();
while (iterator.hasNext()) {
Page page = iterator.next();
extractor.begin(page);
texts.add(extractor.getAsText());
}
System.out.println("File:" + getTemporaryDirectory() + "/" + fileName + ".pdf");
return String.join("\n", texts);
}
}

View File

@ -0,0 +1,228 @@
package com.iqser.red.service.ocr.v1.server.service;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import static org.assertj.core.api.Assertions.assertThat;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.context.annotation.Import;
import org.springframework.core.io.ClassPathResource;
import org.springframework.test.context.junit.jupiter.SpringExtension;
import com.iqser.red.service.ocr.v1.server.Application;
import com.iqser.red.service.ocr.v1.server.OcrServiceIntegrationTest;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.ColorPt;
import com.pdftron.pdf.ColorSpace;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementBuilder;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.Rect;
import com.pdftron.pdf.RectCollection;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
@ExtendWith(SpringExtension.class)
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
@Import(OcrServiceIntegrationTest.TestConfiguration.class)
class ImagePositionRetrievalServiceTest {
@Autowired
private ImagePositionRetrievalService imagePositionRetrievalService;
@MockBean
protected RabbitTemplate rabbitTemplate;
/**
 * Checks that all expected image bounding boxes (rounded to integers) are detected in the
 * "RotateTestFileWithImages" test file.
 */
@Test
@SneakyThrows
public void testImagePositionRetrievalForRotateTestFileWithImages() {

    int[][] expectedRectCoords = {
            {48, 572, 295, 721},
            {54, 279, 301, 428},
            {360, 173, 509, 419},
            {362, 522, 511, 768},
            {459, 354, 608, 600},
            {145, 404, 392, 553},
            {151, 111, 398, 260},
            {457, 5, 606, 251},
            {395, 480, 545, 726},
            {393, 130, 542, 377},
            {88, 236, 334, 386},
            {82, 530, 328, 679},
            {465, 11, 614, 257},
            {159, 117, 406, 266},
            {467, 360, 617, 607},
            {153, 410, 400, 559}};

    List<int[]> detectedRectCoords = testImagePositionDetection("RotateTestFileWithImages");

    assertThat(detectedRectCoords).contains(expectedRectCoords);
}
/**
 * Checks that all expected image bounding boxes (rounded to integers) are detected in the
 * "RotateTestFileWithImagesExtremeCropbox" test file; some expected coordinates are negative.
 */
@Test
@SneakyThrows
public void testImagePositionRetrievalForRotateTestFileWithImagesExtremeCropbox() {

    int[][] expectedRectCoords = {
            {48, 572, 295, 721},
            {362, 522, 511, 768},
            {360, 173, 509, 419},
            {54, 279, 301, 428},
            {145, 192, 392, 341},
            {459, 142, 608, 388},
            {457, -207, 606, 39},
            {151, -101, 398, 48},
            {-30, 238, 216, 387},
            {283, 188, 433, 434},
            {281, -162, 430, 85},
            {-24, -56, 222, 94},
            {-39, 410, 208, 559},
            {275, 360, 425, 607},
            {273, 11, 422, 257},
            {-33, 117, 214, 266}};

    List<int[]> detectedRectCoords = testImagePositionDetection("RotateTestFileWithImagesExtremeCropbox");

    assertThat(detectedRectCoords).contains(expectedRectCoords);
}
/**
 * Checks that the "merge_images" test file yields the expected merged bounding box (rounded to integers).
 */
@Test
@SneakyThrows
public void testMergeImages() {

    int[] expectedMergedRect = {90, 284, 398, 770};

    List<int[]> detectedRectCoords = testImagePositionDetection("merge_images");

    assertThat(detectedRectCoords).contains(expectedMergedRect);
}
/**
 * Checks that exactly 48 image bounding boxes are detected in the "StitchedImagesMultiPage" test file.
 */
@Test
@SneakyThrows
public void testStitchedImagesMultiPage() {

    List<int[]> detectedRectCoords = testImagePositionDetection("StitchedImagesMultiPage");

    assertThat(detectedRectCoords).hasSize(48);
}
/**
 * Runs the image position retrieval (without Y mirroring) on the test file {@code files/<fileName>.pdf}
 * from the classpath, draws the detected rectangles and a grid onto each affected page and saves the result
 * as {@code <fileName>_IMAGE_BBOX.pdf} in the temporary directory for visual inspection.
 * The input stream and the PDF document are closed even if the detection or the drawing fails.
 *
 * @param fileName the name of the test file without directory and extension
 * @return the coordinates {x1, y1, x2, y2} of all detected rectangles, each rounded to the nearest integer
 * @throws IOException     if the test file cannot be read or the result file cannot be written
 * @throws PDFNetException if PDFTron fails to process the document
 */
private List<int[]> testImagePositionDetection(String fileName) throws IOException, PDFNetException {

    String resultPath = getTemporaryDirectory() + "/" + fileName + "_IMAGE_BBOX.pdf";

    try (InputStream fileStream = new ClassPathResource("files/" + fileName + ".pdf").getInputStream()) {
        PDFDoc pdfDoc = new PDFDoc(fileStream);
        try {
            Map<Integer, RectCollection> pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, false);

            ElementWriter writer = new ElementWriter();
            for (Map.Entry<Integer, RectCollection> entry : pageIdToRectCollection.entrySet()) {
                int pageId = entry.getKey();
                RectCollection rectCollection = entry.getValue();
                Page page = pdfDoc.getPage(pageId);

                writer.begin(page);
                drawRectCollection(writer, rectCollection, page);
                drawGrid(writer, page);
                writer.end();

                StringBuilder zonesString = new StringBuilder();
                for (int j = 0; j < rectCollection.getNumRects(); ++j) {
                    var r = rectCollection.getRectAt(j);
                    zonesString.append(String.format("%d:[lower left (%.1f|%.1f) upper right (%.1f|%.1f)]; ", j, r.getX1(), r.getY1(), r.getX2(), r.getY2()));
                }
                System.out.printf("Page %d: rotation: %d OCR regions %s \n", pageId, page.getRotation(), zonesString);
            }

            // Check visually for red Rectangles to match images in the saved pdf file
            try (var out = new FileOutputStream(resultPath)) {
                out.write(pdfDoc.save(SDFDoc.SaveMode.LINEARIZED, null));
            }
            System.out.println("result file: " + resultPath);

            // round all coords to nearest int to account for inconsistencies with the calculation of the bounding box
            return pageIdToRectCollection.values().stream().map(this::toRoundedCoordinateArrayList).flatMap(List::stream).collect(Collectors.toList());
        } finally {
            pdfDoc.close();
        }
    }
}
/**
 * Converts every Rect of the collection into an int array {x1, y1, x2, y2}, with each coordinate rounded
 * to the nearest integer.
 *
 * @param rectCollection the rectangles to convert
 * @return a new list with one coordinate array per rectangle, in the order of the collection
 */
@SneakyThrows
private List<int[]> toRoundedCoordinateArrayList(RectCollection rectCollection) {

    int rectCount = rectCollection.getNumRects();
    List<int[]> roundedCoords = new ArrayList<>(rectCount);
    for (int idx = 0; idx < rectCount; ++idx) {
        Rect rect = rectCollection.getRectAt(idx);
        int x1 = (int) Math.round(rect.getX1());
        int y1 = (int) Math.round(rect.getY1());
        int x2 = (int) Math.round(rect.getX2());
        int y2 = (int) Math.round(rect.getY2());
        roundedCoords.add(new int[]{x1, y1, x2, y2});
    }
    return roundedCoords;
}
/**
 * Draws a grid of 15 x 15 unit cells covering the page as a visual aid.
 * The cell at (0,0) is stroked and filled in blue, all other cells are only stroked in dark grey.
 *
 * @param writer the writer the grid cells are written to
 * @param page   the page whose width and height determine the number of cells
 */
@SneakyThrows
private void drawGrid(ElementWriter writer, Page page) {

    ElementBuilder builder = new ElementBuilder();
    double cellWidth = 15;
    double cellHeight = 15;
    int rowCount = (int) (page.getPageHeight() / cellHeight) + 1;
    int colCount = (int) (page.getPageWidth() / cellWidth) + 1;

    for (int rowIdx = 0; rowIdx < rowCount; ++rowIdx) {
        for (int colIdx = 0; colIdx < colCount; ++colIdx) {
            Element gridCell = builder.createRect(colIdx * cellWidth, rowIdx * cellHeight, cellWidth, cellHeight);
            gridCell.setPathStroke(true);
            gridCell.getGState().setLineWidth(1);
            gridCell.getGState().setStrokeOpacity(0.1);
            gridCell.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());

            boolean isOriginCell = rowIdx == 0 && colIdx == 0;
            if (!isOriginCell) {
                gridCell.setPathFill(false);
                gridCell.getGState().setStrokeColor(new ColorPt(0.1, 0.1, 0.1));
            } else {
                // highlight the cell at the origin so the orientation of the coordinate system is visible
                gridCell.getGState().setStrokeColor(new ColorPt(0, 0, 1));
                gridCell.setPathFill(true);
                gridCell.getGState().setFillOpacity(0.8);
                gridCell.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
                gridCell.getGState().setFillColor(new ColorPt(0, 0, 1));
            }
            writer.writePlacedElement(gridCell);
        }
    }
}
/**
 * Draws every rectangle of the collection in red, with a stroke of width 5 and a half-transparent fill.
 *
 * @param writer         the writer the rectangles are written to
 * @param rectCollection the rectangles to draw
 * @param page           not used by this method
 */
@SneakyThrows
public static void drawRectCollection(ElementWriter writer, RectCollection rectCollection, Page page) {

    ColorPt red = new ColorPt(1, 0, 0);
    ElementBuilder builder = new ElementBuilder();
    int rectCount = rectCollection.getNumRects();

    for (int idx = 0; idx < rectCount; ++idx) {
        Rect source = rectCollection.getRectAt(idx);
        Element marker = builder.createRect(source.getX1(), source.getY1(), source.getWidth(), source.getHeight());

        // stroke settings
        marker.setPathStroke(true);
        marker.getGState().setLineWidth(5);
        marker.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
        marker.getGState().setStrokeColor(red);

        // fill settings
        marker.setPathFill(true);
        marker.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
        marker.getGState().setFillColor(red);
        marker.getGState().setFillOpacity(0.5);

        writer.writePlacedElement(marker);
    }
}
}

View File

@ -1,4 +1,4 @@
package com.iqser.red.service.ocr.v1.server;
package com.iqser.red.service.ocr.v1.server.service;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
@ -17,7 +17,8 @@ import org.springframework.context.annotation.Import;
import org.springframework.core.io.ClassPathResource;
import org.springframework.test.context.junit.jupiter.SpringExtension;
import com.iqser.red.service.ocr.v1.server.service.InvisibleElementRemovalService;
import com.iqser.red.service.ocr.v1.server.Application;
import com.iqser.red.service.ocr.v1.server.OcrServiceIntegrationTest;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
@ -26,8 +27,7 @@ import com.pdftron.pdf.TextExtractor;
import lombok.SneakyThrows;
@ExtendWith(SpringExtension.class)
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT //
, properties = {"pdftron.ocrmodule.path=/YourOCRModulePath"})
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
@Import(OcrServiceIntegrationTest.TestConfiguration.class)
public class InvisibleElementRemovalServiceTest {

View File

@ -1 +0,0 @@
{"dossierId": "c8553cbd-409f-4e1a-baf4-34b11d49deac", "fileId": "bd6f93ed896dd0e2f641b0568f13ddf1", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": [{"classification": {"label": "other", "probabilities": {"other": 0.9999, "logo": 0.0001, "formula": 0.0, "signature": 0.0}}, "representation": "FFFFFEFBF7EFCFFFFFFFFFFFF", "position": {"x1": -3, "x2": 795, "y1": 0, "y2": 612, "pageNumber": 1}, "geometry": {"width": 798, "height": 612}, "alpha": false, "filters": {"geometry": {"imageSize": {"quotient": 1.0038, "tooLarge": true, "tooSmall": false}, "imageFormat": {"quotient": 1.3039, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "logo", "probabilities": {"logo": 0.9966, "other": 0.0025, "signature": 0.0005, "formula": 0.0003}}, "representation": "FFC33D3C323CCF3390C1F8C72", "position": {"x1": 120, "x2": 131, "y1": 264, "y2": 380, "pageNumber": 1}, "geometry": {"width": 11, "height": 116}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0513, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.0948, "tooTall": true, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "other", "probabilities": {"other": 1.0, "formula": 0.0, "logo": 0.0, "signature": 0.0}}, "representation": "EF8FF6381060800318F0E187", "position": {"x1": 152, "x2": 205, "y1": 115, "y2": 533, "pageNumber": 1}, "geometry": {"width": 53, "height": 418}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.2138, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.1268, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": true}}, {"classification": {"label": "other", "probabilities": {"other": 0.5967, "logo": 0.1756, "signature": 0.1218, "formula": 0.106}}, "representation": "EC9377C9170E1070C3070C30F", "position": {"x1": 
196, "x2": 221, "y1": 121, "y2": 245, "pageNumber": 1}, "geometry": {"width": 25, "height": 124}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.08, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.2016, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": true}}, {"classification": {"label": "logo", "probabilities": {"logo": 1.0, "formula": 0.0, "other": 0.0, "signature": 0.0}}, "representation": "CF1F1F70F140036860F1441B5", "position": {"x1": 707, "x2": 718, "y1": 481, "y2": 531, "pageNumber": 1}, "geometry": {"width": 11, "height": 50}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0337, "tooLarge": false, "tooSmall": true}, "imageFormat": {"quotient": 0.22, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "logo", "probabilities": {"logo": 0.9704, "other": 0.0223, "formula": 0.0044, "signature": 0.0029}}, "representation": "CF0F1C70F1C7090081F7CF073", "position": {"x1": 732, "x2": 744, "y1": 118, "y2": 241, "pageNumber": 1}, "geometry": {"width": 12, "height": 123}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0552, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.0976, "tooTall": true, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}], "dataCV": []}

View File

@ -1 +0,0 @@
{"dossierId": "55547c91-6b0e-4aa6-9009-2e7c4cd90f13", "fileId": "917b9d9c9f548f85fef3679db45ff46c", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": [{"classification": {"label": "signature", "probabilities": {"signature": 0.9927, "logo": 0.0038, "other": 0.0034, "formula": 0.0}}, "representation": "FFF2CF0F7C74FFC1070830FFF", "position": {"x1": -7, "x2": 603, "y1": 0, "y2": 852, "pageNumber": 1}, "geometry": {"width": 610, "height": 852}, "alpha": false, "filters": {"geometry": {"imageSize": {"quotient": 1.0096, "tooLarge": true, "tooSmall": false}, "imageFormat": {"quotient": 0.716, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}], "dataCV": []}

View File

@ -1 +0,0 @@
{"dossierId": "c8553cbd-409f-4e1a-baf4-34b11d49deac", "fileId": "bd6f93ed896dd0e2f641b0568f13ddf1", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": [{"classification": {"label": "other", "probabilities": {"other": 0.9999, "logo": 0.0001, "formula": 0.0, "signature": 0.0}}, "representation": "FFFFFEFBF7EFCFFFFFFFFFFFF", "position": {"x1": -3, "x2": 795, "y1": 0, "y2": 612, "pageNumber": 1}, "geometry": {"width": 798, "height": 612}, "alpha": false, "filters": {"geometry": {"imageSize": {"quotient": 1.0038, "tooLarge": true, "tooSmall": false}, "imageFormat": {"quotient": 1.3039, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "logo", "probabilities": {"logo": 0.9966, "other": 0.0025, "signature": 0.0005, "formula": 0.0003}}, "representation": "FFC33D3C323CCF3390C1F8C72", "position": {"x1": 120, "x2": 131, "y1": 264, "y2": 380, "pageNumber": 1}, "geometry": {"width": 11, "height": 116}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0513, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.0948, "tooTall": true, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "other", "probabilities": {"other": 1.0, "formula": 0.0, "logo": 0.0, "signature": 0.0}}, "representation": "EF8FF6381060800318F0E187", "position": {"x1": 152, "x2": 205, "y1": 115, "y2": 533, "pageNumber": 1}, "geometry": {"width": 53, "height": 418}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.2138, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.1268, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": true}}, {"classification": {"label": "other", "probabilities": {"other": 0.5967, "logo": 0.1756, "signature": 0.1218, "formula": 0.106}}, "representation": "EC9377C9170E1070C3070C30F", "position": {"x1": 
196, "x2": 221, "y1": 121, "y2": 245, "pageNumber": 1}, "geometry": {"width": 25, "height": 124}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.08, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.2016, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": true}}, {"classification": {"label": "logo", "probabilities": {"logo": 1.0, "formula": 0.0, "other": 0.0, "signature": 0.0}}, "representation": "CF1F1F70F140036860F1441B5", "position": {"x1": 707, "x2": 718, "y1": 481, "y2": 531, "pageNumber": 1}, "geometry": {"width": 11, "height": 50}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0337, "tooLarge": false, "tooSmall": true}, "imageFormat": {"quotient": 0.22, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "logo", "probabilities": {"logo": 0.9704, "other": 0.0223, "formula": 0.0044, "signature": 0.0029}}, "representation": "CF0F1C70F1C7090081F7CF073", "position": {"x1": 732, "x2": 744, "y1": 118, "y2": 241, "pageNumber": 1}, "geometry": {"width": 12, "height": 123}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0552, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.0976, "tooTall": true, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}], "dataCV": []}

View File

@ -1 +0,0 @@
{"dossierId": "55547c91-6b0e-4aa6-9009-2e7c4cd90f13", "fileId": "d7f1e0e37cba4e28ebdf894a79d3bd67", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": [{"classification": {"label": "signature", "probabilities": {"signature": 0.9872, "logo": 0.0064, "other": 0.0063, "formula": 0.0001}}, "representation": "FFFCF10608F6F89747BFFC301", "position": {"x1": -9, "x2": 584, "y1": 9, "y2": 849, "pageNumber": 1}, "geometry": {"width": 593, "height": 840}, "alpha": false, "filters": {"geometry": {"imageSize": {"quotient": 0.9992, "tooLarge": true, "tooSmall": false}, "imageFormat": {"quotient": 0.706, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}], "dataCV": []}

View File

@ -1 +0,0 @@
{"dossierId": "55547c91-6b0e-4aa6-9009-2e7c4cd90f13", "fileId": "32b19ec38896f5105c09041def470c90", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": [{"classification": {"label": "logo", "probabilities": {"logo": 0.9999, "signature": 0.0001, "formula": 0.0, "other": 0.0}}, "representation": "307EF8F6E9833CE9D7AF9EFFF", "position": {"x1": 26, "x2": 586, "y1": -2, "y2": 794, "pageNumber": 1}, "geometry": {"width": 560, "height": 796}, "alpha": false, "filters": {"geometry": {"imageSize": {"quotient": 0.959, "tooLarge": true, "tooSmall": false}, "imageFormat": {"quotient": 0.7035, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "logo", "probabilities": {"logo": 1.0, "formula": 0.0, "other": 0.0, "signature": 0.0}}, "representation": "FFF7FFD2000000018F3FFEFFF", "position": {"x1": 90, "x2": 210, "y1": 676, "y2": 720, "pageNumber": 1}, "geometry": {"width": 120, "height": 44}, "alpha": false, "filters": {"geometry": {"imageSize": {"quotient": 0.1044, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 2.7273, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": true}}], "dataCV": []}