Pull request #5: RED-6126
Merge in RED/ocr-service from RED-6126 to master * commit '00cfe9e44948c153857ad59442dbc9349e1d4555': RED-6126: In the OCRService, OCR Text is not applied to Document *reformatted InvisibleElementRemovalService with new Code Style RED-6126: In the OCRService, OCR Text is not applied to Document *updated some comments *very slight refactor RED-6126: In the OCRService, OCR Text is not applied to Document *complete refactor of the OCRService *moved image position retrieval to new class instead of image service *added new tests for image rotation RED-6126: In the OCRService, OCR Text is not applied to Document *removed private configuration RED-6126: In the OCRService, OCR Text is not applied to Document *formatted one line RED-6126: In the OCRService, OCR Text is not applied to Document *reverted application of OCR Text to Document to old state *refactored OCR Service slightly *added meaningful test cases
This commit is contained in:
commit
b0a658213d
@ -0,0 +1,177 @@
|
||||
package com.iqser.red.service.ocr.v1.server.service;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.awt.geom.RectangularShape;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.pdftron.common.Matrix2D;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.ElementReader;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.pdf.Rect;
|
||||
import com.pdftron.pdf.RectCollection;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@Service
|
||||
public class ImagePositionRetrievalService {
|
||||
|
||||
private static final double TOLERANCE = 1e-1;
|
||||
|
||||
|
||||
/**
|
||||
* Iterates over all elements in a PDF Document and retrieves the bounding box for each image,
|
||||
* Then it adjusts the bounding boxes for the page rotation.
|
||||
* If the mirrorY flag is set, the Y Coordinates are mirrored and moved up by the page height. This is required for PDFTrons OCRModule.
|
||||
*
|
||||
* @param pdfDoc a PDF File as PDFTron PDFDoc class
|
||||
* @param mirrorY if this flag is set, all coordinates are calculated with upper left corner as (0,0), else initial user space
|
||||
* @return a map with the page indices as keys and the image bounding boxes on that page as a RectCollection
|
||||
*/
|
||||
@SneakyThrows
|
||||
public Map<Integer, RectCollection> getImagePositionPerPage(PDFDoc pdfDoc, boolean mirrorY) {
|
||||
|
||||
Map<Integer, RectCollection> pageIdToImagePositions = new HashMap<>();
|
||||
ElementReader reader = new ElementReader();
|
||||
for (int pageId = 1; pageId <= pdfDoc.getPageCount(); ++pageId) {
|
||||
RectCollection imagePositions = new RectCollection();
|
||||
reader.begin(pdfDoc.getPage(pageId));
|
||||
|
||||
findImagePositionsOnPage(reader, imagePositions, pdfDoc.getPage(pageId), mirrorY);
|
||||
imagePositions = mergeOverlappingRects(imagePositions);
|
||||
|
||||
reader.end();
|
||||
if (imagePositions.getNumRects() > 0) {
|
||||
pageIdToImagePositions.put(pageId, imagePositions);
|
||||
}
|
||||
}
|
||||
return pageIdToImagePositions;
|
||||
}
|
||||
|
||||
|
||||
private void findImagePositionsOnPage(ElementReader reader, RectCollection imagePositions, Page currentPage, boolean mirrorY) throws PDFNetException {
|
||||
|
||||
Element element;
|
||||
while ((element = reader.next()) != null) {
|
||||
switch (element.getType()) {
|
||||
case Element.e_image, Element.e_inline_image -> imagePositions.addRect(toRotationAdjustedRect(element.getBBox(), currentPage, mirrorY));
|
||||
case Element.e_form -> {
|
||||
reader.formBegin();
|
||||
findImagePositionsOnPage(reader, imagePositions, currentPage, mirrorY);
|
||||
reader.end();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public RectCollection mergeOverlappingRects(RectCollection imagePositions) {
|
||||
|
||||
if (imagePositions.getNumRects() == 1) {
|
||||
return imagePositions;
|
||||
}
|
||||
|
||||
List<Rectangle2D> rectangleList = toSortedRectangleList(imagePositions);
|
||||
|
||||
rectangleList = mergeRectangleListRecursive(rectangleList, 0);
|
||||
return toRectCollection(rectangleList);
|
||||
}
|
||||
|
||||
|
||||
// Sometimes images are split up into stripes, here we merge the positions of aligned and intersecting rectangles into one larger rectangle
|
||||
private List<Rectangle2D> mergeRectangleListRecursive(List<Rectangle2D> rectangleList, int currentIdx) {
|
||||
|
||||
if (rectangleList.size() < currentIdx + 2) {
|
||||
return rectangleList;
|
||||
}
|
||||
|
||||
var rect1 = rectangleList.get(currentIdx);
|
||||
var rect2 = rectangleList.get(currentIdx + 1);
|
||||
|
||||
boolean isAlignedX = Math.abs(rect1.getMinX() - rect2.getMinX()) < TOLERANCE && Math.abs(rect1.getMaxX() - rect2.getMaxX()) < TOLERANCE;
|
||||
boolean isAlignedY = Math.abs(rect1.getMinY() - rect2.getMinY()) < TOLERANCE && Math.abs(rect1.getMaxY() - rect2.getMaxY()) < TOLERANCE;
|
||||
boolean intersects = rect1.intersects(rect2.getMinX() - TOLERANCE, rect2.getMinY() - TOLERANCE, rect2.getWidth() + (2 * TOLERANCE), rect2.getHeight() + (2 * TOLERANCE));
|
||||
|
||||
if (intersects && (isAlignedX || isAlignedY)) {
|
||||
rectangleList.remove(currentIdx + 1);
|
||||
rectangleList.remove(currentIdx);
|
||||
rectangleList.add(currentIdx, rect1.createUnion(rect2));
|
||||
return mergeRectangleListRecursive(rectangleList, currentIdx);
|
||||
} else {
|
||||
return mergeRectangleListRecursive(rectangleList, currentIdx + 1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private Rect toRotationAdjustedRect(Rect bbox, Page page, boolean mirrorY) throws PDFNetException {
|
||||
|
||||
int rotation = page.getRotation();
|
||||
double height = page.getPageHeight();
|
||||
double width = page.getPageWidth();
|
||||
|
||||
// Even though PDFTron almost always has the origin in the lower left corner, for some reason, the OCRModule's addTextZonesForPage() uses the upper left corner as origin...
|
||||
Matrix2D mirrorMatrix;
|
||||
if (mirrorY) {
|
||||
mirrorMatrix = new Matrix2D(1, 0, 0, -1, 0, height);
|
||||
} else {
|
||||
mirrorMatrix = new Matrix2D();
|
||||
}
|
||||
|
||||
// We need to rotate the rects to fit to the page rotation
|
||||
Matrix2D rotationMatrix = switch (rotation) {
|
||||
case 1 -> new Matrix2D(0, -1, 1, 0, 0, height);
|
||||
case 2 -> new Matrix2D(-1, 0, 0, -1, width, height);
|
||||
case 3 -> new Matrix2D(0, 1, -1, 0, width, 0);
|
||||
default -> new Matrix2D();
|
||||
};
|
||||
|
||||
Matrix2D finalMatrix = mirrorMatrix.multiply(rotationMatrix);
|
||||
|
||||
Point2D.Double p1 = finalMatrix.multPoint(bbox.getX1(), bbox.getY1());
|
||||
Point2D.Double p2 = finalMatrix.multPoint(bbox.getX2(), bbox.getY2());
|
||||
|
||||
// PDFTron Rect *needs* lower left and upper right coordinates to calculate width and height correctly, even though the documentation states otherwise
|
||||
Point2D.Double lowerLeft = new Point2D.Double(Math.min(p1.x, p2.x), Math.min(p1.y, p2.y));
|
||||
Point2D.Double upperRight = new Point2D.Double(Math.max(p1.x, p2.x), Math.max(p1.y, p2.y));
|
||||
|
||||
return new Rect(lowerLeft.x, lowerLeft.y, upperRight.x, upperRight.y);
|
||||
}
|
||||
|
||||
|
||||
private RectCollection toRectCollection(List<Rectangle2D> rectangleList) {
|
||||
|
||||
RectCollection rectCollection = new RectCollection();
|
||||
rectangleList.forEach(r -> {
|
||||
try {
|
||||
rectCollection.addRect(new Rect(r.getMinX(), r.getMinY(), r.getMaxX(), r.getMaxY()));
|
||||
} catch (PDFNetException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
});
|
||||
return rectCollection;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private List<Rectangle2D> toSortedRectangleList(RectCollection rectCollection) {
|
||||
|
||||
List<Rectangle2D> list = new LinkedList<>();
|
||||
for (int i = 0; i < rectCollection.getNumRects(); ++i) {
|
||||
Rect r = rectCollection.getRectAt(i);
|
||||
list.add(new Rectangle2D.Double(r.getX1(), r.getY1(), r.getWidth(), r.getHeight()));
|
||||
}
|
||||
list.sort(Comparator.comparingDouble(RectangularShape::getMinY).thenComparing(RectangularShape::getMinX));
|
||||
return list;
|
||||
}
|
||||
|
||||
}
|
||||
@ -423,12 +423,13 @@ public class InvisibleElementRemovalService {
|
||||
|
||||
|
||||
@Builder
|
||||
private record InvisibleElementRemovalContext(boolean delta, //
|
||||
ElementReader reader, //
|
||||
ClippingPathStack clippingPathStack, //
|
||||
List<ElementFeatures> overlappedElements, //
|
||||
List<ElementFeatures> visibleElements, //
|
||||
Set<Long> visitedXObjIds) {
|
||||
private record InvisibleElementRemovalContext(
|
||||
boolean delta,
|
||||
ElementReader reader,
|
||||
ClippingPathStack clippingPathStack,
|
||||
List<ElementFeatures> overlappedElements,
|
||||
List<ElementFeatures> visibleElements,
|
||||
Set<Long> visitedXObjIds) {
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -1,12 +1,7 @@
|
||||
package com.iqser.red.service.ocr.v1.server.service;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||
@ -15,11 +10,8 @@ import org.springframework.stereotype.Service;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse;
|
||||
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
|
||||
import com.iqser.red.service.ocr.v1.server.model.ImagePosition;
|
||||
import com.iqser.red.service.ocr.v1.server.model.image.ImageServiceResponse;
|
||||
import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings;
|
||||
import com.iqser.red.service.redaction.v1.model.Point;
|
||||
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.OCRModule;
|
||||
import com.pdftron.pdf.OCROptions;
|
||||
import com.pdftron.pdf.Optimizer;
|
||||
@ -48,118 +40,96 @@ public class OCRService {
|
||||
|
||||
private final InvisibleElementRemovalService invisibleElementRemovalService;
|
||||
|
||||
private final ImagePositionRetrievalService imagePositionRetrievalService;
|
||||
|
||||
|
||||
/**
|
||||
* First loads the PDF Document from storage.
|
||||
* Then removes all invisible Elements from the PDF, check InvisibleElementRemovalService for details.
|
||||
* Then gets Image Position Information, check ImagePositionRetrievalService for details.
|
||||
* Then runs OCR page by page, exclusively on pages which have images on them. It does so, by creating a new PDFDoc and inserting a single page at a time.
|
||||
* This is because PDFTron OCROptions overlays all regions where OCR should not be run with white images. It does not check for empty pages.
|
||||
* For Documents with many pages but few Images this results in major performance improvements.
|
||||
* It then re-adds the OCRed Pages to the original document and saves it.
|
||||
*
|
||||
* @param dossierId The dossier id
|
||||
* @param fileId The file id
|
||||
* @return the resulting PDF file as an InputStream
|
||||
*/
|
||||
@SneakyThrows
|
||||
public InputStream ocrDocument(String dossierId, String fileId) {
|
||||
public InputStream runOcrOnDocument(String dossierId, String fileId) {
|
||||
|
||||
InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId);
|
||||
ImageServiceResponse imageServiceResponse = fileStorageService.getImageServiceResponse(dossierId, fileId);
|
||||
|
||||
byte[] fileWithoutInvisibleTextStream = invisibleElementRemovalService.removeInvisibleElements(fileStream, false);
|
||||
byte[] fileWithoutInvisibleTextBytes = invisibleElementRemovalService.removeInvisibleElements(fileStream, false);
|
||||
|
||||
byte[] ocrBytes = ocr(fileWithoutInvisibleTextStream, fileId, imageServiceResponse);
|
||||
byte[] ocrBytes = runOcr(fileWithoutInvisibleTextBytes, fileId);
|
||||
|
||||
return new ByteArrayInputStream(ocrBytes);
|
||||
|
||||
}
|
||||
|
||||
|
||||
private byte[] ocr(byte[] file, String fileId, ImageServiceResponse imageServiceResponse) {
|
||||
PDFDoc pdfDoc = null;
|
||||
try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
|
||||
pdfDoc = new PDFDoc(file);
|
||||
|
||||
Map<Integer, List<ImagePosition>> pages = new HashMap<>();
|
||||
|
||||
imageServiceResponse.getData()
|
||||
.forEach(imageMetadata -> pages.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
|
||||
.add(new ImagePosition(new Rectangle(new Point(imageMetadata.getPosition().getX1(), imageMetadata.getPosition().getY1()),
|
||||
imageMetadata.getGeometry().getWidth(),
|
||||
imageMetadata.getGeometry().getHeight(),
|
||||
imageMetadata.getPosition().getPageNumber()), imageMetadata.isAlpha())));
|
||||
|
||||
Map<Integer, PDFDoc> pdfDocMap = Collections.synchronizedMap(new HashMap<>());
|
||||
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
|
||||
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder().fileId(fileId).numberOfPagesToOCR(pages.keySet().size()).build()));
|
||||
|
||||
ocrPages(pdfDoc, fileId, pages, pdfDocMap);
|
||||
|
||||
Optimizer.optimize(pdfDoc);
|
||||
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
|
||||
pdfDoc.close();
|
||||
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
|
||||
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder()
|
||||
.fileId(fileId)
|
||||
.numberOfPagesToOCR(pages.keySet().size())
|
||||
.numberOfOCRedPages(pages.keySet().size())
|
||||
.ocrFinished(true)
|
||||
.build()));
|
||||
|
||||
return out.toByteArray();
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
} finally {
|
||||
if (pdfDoc != null) {
|
||||
try {
|
||||
pdfDoc.close();
|
||||
} catch (Exception e) {
|
||||
log.debug("Failed to close document", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void ocrPages(PDFDoc pdfDoc, String fileId, Map<Integer, List<ImagePosition>> pages, Map<Integer, PDFDoc> pdfDocMap) {
|
||||
private byte[] runOcr(byte[] file, String fileId) {
|
||||
|
||||
int numberOfOCRedPages = 0;
|
||||
for (var pageEntry : pages.entrySet()) {
|
||||
PDFDoc pdfDoc = new PDFDoc(file);
|
||||
|
||||
Map<Integer, RectCollection> pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, true);
|
||||
|
||||
OCROptions options = new OCROptions();
|
||||
PDFDoc ocrPageDoc = new PDFDoc();
|
||||
int numProcessedPages = 0;
|
||||
for (Integer pageId : pageIdToRectCollection.keySet()) {
|
||||
try {
|
||||
RectCollection rectCollection = new RectCollection();
|
||||
|
||||
var page = pageEntry.getKey();
|
||||
|
||||
Page pdfPage = pdfDoc.getPageIterator(page).next();
|
||||
|
||||
pdfPage.setMediaBox(pdfPage.getCropBox());
|
||||
|
||||
for (ImagePosition imagePosition : pageEntry.getValue()) {
|
||||
Rectangle rectangle = imagePosition.getRectangle();
|
||||
|
||||
// Warning coordinate system is different in this call macOs/Linux
|
||||
double y = -rectangle.getTopLeft().getY() + pdfPage.getCropBox().getY2() - rectangle.getHeight();
|
||||
rectCollection.addRect(rectangle.getTopLeft().getX(), y, rectangle.getTopLeft().getX() + rectangle.getWidth(), y + rectangle.getHeight());
|
||||
}
|
||||
|
||||
PDFDoc ocrDoc = new PDFDoc();
|
||||
ocrDoc.pagePushBack(pdfPage);
|
||||
pdfDocMap.put(pageEntry.getKey(), ocrDoc);
|
||||
|
||||
OCROptions options = new OCROptions();
|
||||
options.addTextZonesForPage(rectCollection, 1);
|
||||
// optimization by only scanning pages that contain images
|
||||
Page pdfPage = pdfDoc.getPage(pageId);
|
||||
pdfPage.setMediaBox(pdfPage.getCropBox()); // this line ensures the ocr text is placed correctly by PDFTron
|
||||
ocrPageDoc.pagePushBack(pdfPage);
|
||||
options.addTextZonesForPage(pageIdToRectCollection.get(pageId), 1);
|
||||
options.addLang(ENGLISH);
|
||||
options.addDPI(settings.getOcrDPI());
|
||||
OCRModule.processPDF(ocrDoc, options);
|
||||
|
||||
rectCollection.clear();
|
||||
OCRModule.processPDF(ocrPageDoc, options);
|
||||
++numProcessedPages;
|
||||
|
||||
} catch (Exception e) {
|
||||
log.warn("Failed to process PDF page {}", pageEntry.getKey());
|
||||
StringBuilder zonesString = new StringBuilder();
|
||||
for (int j = 0; j < pageIdToRectCollection.get(pageId).getNumRects(); ++j) {
|
||||
var r = pageIdToRectCollection.get(pageId).getRectAt(j);
|
||||
zonesString.append(String.format("[lower left (%.1f|%.1f) upper right (%.1f|%.1f)]", r.getX1(), r.getY1(), r.getX2(), r.getY2()));
|
||||
}
|
||||
log.info("{}/{} Page {} done, OCR regions {}", numProcessedPages, pageIdToRectCollection.size(), pageId, zonesString);
|
||||
|
||||
// re-adding OCR pages
|
||||
Page ocrPage = ocrPageDoc.getPage(1);
|
||||
pdfDoc.pageInsert(pdfDoc.getPageIterator(pageId), ocrPage);
|
||||
pdfDoc.pageRemove(pdfDoc.getPageIterator(pageId + 1));
|
||||
ocrPageDoc.pageRemove(ocrPageDoc.getPageIterator(1));
|
||||
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
|
||||
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder()
|
||||
.fileId(fileId)
|
||||
.numberOfPagesToOCR(pageIdToRectCollection.size())
|
||||
.numberOfOCRedPages(numProcessedPages)
|
||||
.build()));
|
||||
|
||||
} catch (PDFNetException e) {
|
||||
log.error("failed to process page {}", pageId);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
|
||||
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder()
|
||||
.fileId(fileId)
|
||||
.numberOfPagesToOCR(pages.keySet().size())
|
||||
.numberOfOCRedPages(++numberOfOCRedPages)
|
||||
.build()));
|
||||
|
||||
log.warn("Done page {}", pageEntry);
|
||||
|
||||
}
|
||||
|
||||
ocrPageDoc.close();
|
||||
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
|
||||
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder()
|
||||
.fileId(fileId)
|
||||
.numberOfPagesToOCR(pageIdToRectCollection.size())
|
||||
.numberOfOCRedPages(numProcessedPages)
|
||||
.ocrFinished(true)
|
||||
.build()));
|
||||
|
||||
Optimizer.optimize(pdfDoc);
|
||||
return pdfDoc.save(SDFDoc.SaveMode.LINEARIZED, null);
|
||||
}
|
||||
}
|
||||
|
||||
@ -44,7 +44,7 @@ public class OcrMessageReceiver {
|
||||
fileStorageService.storeUntouchedFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), originalFile);
|
||||
}
|
||||
|
||||
var ocrResult = ocrService.ocrDocument(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
|
||||
var ocrResult = ocrService.runOcrOnDocument(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
|
||||
|
||||
fileStorageService.storeOriginalFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), ocrResult);
|
||||
|
||||
|
||||
@ -3,11 +3,13 @@ package com.iqser.red.service.ocr.v1.server;
|
||||
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.extension.ExtendWith;
|
||||
@ -25,12 +27,17 @@ import org.springframework.core.io.ClassPathResource;
|
||||
import org.springframework.test.context.junit.jupiter.SpringExtension;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService;
|
||||
import com.iqser.red.service.ocr.v1.server.service.OCRService;
|
||||
import com.iqser.red.service.ocr.v1.server.service.FileStorageService;
|
||||
import com.iqser.red.service.ocr.v1.server.service.OCRService;
|
||||
import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService;
|
||||
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType;
|
||||
import com.iqser.red.storage.commons.StorageAutoConfiguration;
|
||||
import com.iqser.red.storage.commons.service.StorageService;
|
||||
import com.pdftron.pdf.OCRModule;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.pdf.PageIterator;
|
||||
import com.pdftron.pdf.TextExtractor;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@ -56,28 +63,96 @@ public class OcrServiceIntegrationTest {
|
||||
private OCRService ocrService;
|
||||
|
||||
|
||||
@BeforeEach
|
||||
@SneakyThrows
|
||||
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
|
||||
public void assertOCRModuleIsLoaded() {
|
||||
|
||||
assert OCRModule.isModuleAvailable();
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
|
||||
public void testOcr() {
|
||||
// check visually for most of the images containing text, the resulting text is kind of nonsense, just ensure it is there
|
||||
String text = testOCR("StitchedImagesMultiPage");
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
|
||||
public void testManyRotatedImages() {
|
||||
// check visually for most of the images containing text, the resulting text is kind of nonsense, just ensure it is there
|
||||
String text = testOCR("manyRotatedImages");
|
||||
assertThat(text).contains("Michela", "Gregori", "DVM", "PhD", "Pathologist");
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
|
||||
public void testMergeImages() {
|
||||
// check visually for most of the images containing text, the resulting text is kind of nonsense, just ensure it is there
|
||||
String text = testOCR("merge_images");
|
||||
assertThat(text).contains("Bodyweight change of dams with live young - group mean values",
|
||||
"Control",
|
||||
"mg/g day",
|
||||
"10 mg/kg/day",
|
||||
"20 mg/kg/",
|
||||
"Days",
|
||||
"50",
|
||||
"-200",
|
||||
"—250",
|
||||
"150",
|
||||
"200",
|
||||
"250",
|
||||
"—150");
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
|
||||
public void testOCRWatermark() {
|
||||
|
||||
assertThat(testOCR("Watermark")).contains("syngenta");
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
|
||||
public void testOCRInvisibleText() {
|
||||
|
||||
String text = testOCR("InvisibleText");
|
||||
assertThat(text).contains("Michela", "Gregori", "DVM", "PhD", "Pathologist", "SIGNATURE PAGE");
|
||||
assertThat(text).doesNotContain("COMPLETION DATE:", "LABORATORY PROJECT ID:", "AUTHOR(S):", "Substance");
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void testOCR() {
|
||||
private String testOCR(String fileName) {
|
||||
|
||||
String fileName = "Watermark";
|
||||
|
||||
ClassPathResource imageInfoResource = new ClassPathResource("files/" + fileName + ".IMAGE_INFO.json");
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf");
|
||||
|
||||
var originId = FileStorageService.getStorageId("dossier", "file", FileType.ORIGIN);
|
||||
storageService.storeObject(originId, pdfFileResource.getInputStream());
|
||||
|
||||
var imageId = FileStorageService.getStorageId("dossier", "file", FileType.IMAGE_INFO);
|
||||
storageService.storeObject(imageId, imageInfoResource.getInputStream());
|
||||
|
||||
var response = ocrService.ocrDocument("dossier", "file");
|
||||
|
||||
var out = FileUtils.openOutputStream(new File(getTemporaryDirectory() + "/" + fileName + ".pdf"));
|
||||
IOUtils.copy(response, out);
|
||||
|
||||
System.out.println("File:" + getTemporaryDirectory() + "/" + fileName + ".pdf");
|
||||
try (InputStream ocrDocument = ocrService.runOcrOnDocument("dossier", "file")) {
|
||||
byte[] ocrDocumentBytes = ocrDocument.readAllBytes();
|
||||
try (var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + ".pdf")) {
|
||||
out.write(ocrDocumentBytes);
|
||||
}
|
||||
TextExtractor extractor = new TextExtractor();
|
||||
List<String> texts = new ArrayList<>();
|
||||
PDFDoc pdfDoc = new PDFDoc(ocrDocumentBytes);
|
||||
PageIterator iterator = pdfDoc.getPageIterator();
|
||||
while (iterator.hasNext()) {
|
||||
Page page = iterator.next();
|
||||
extractor.begin(page);
|
||||
texts.add(extractor.getAsText());
|
||||
}
|
||||
System.out.println("File:" + getTemporaryDirectory() + "/" + fileName + ".pdf");
|
||||
return String.join("\n", texts);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -0,0 +1,228 @@
|
||||
package com.iqser.red.service.ocr.v1.server.service;
|
||||
|
||||
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.extension.ExtendWith;
|
||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.boot.test.context.SpringBootTest;
|
||||
import org.springframework.boot.test.mock.mockito.MockBean;
|
||||
import org.springframework.context.annotation.Import;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
import org.springframework.test.context.junit.jupiter.SpringExtension;
|
||||
|
||||
import com.iqser.red.service.ocr.v1.server.Application;
|
||||
import com.iqser.red.service.ocr.v1.server.OcrServiceIntegrationTest;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.ColorPt;
|
||||
import com.pdftron.pdf.ColorSpace;
|
||||
import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.ElementBuilder;
|
||||
import com.pdftron.pdf.ElementWriter;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.pdf.Rect;
|
||||
import com.pdftron.pdf.RectCollection;
|
||||
import com.pdftron.sdf.SDFDoc;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@ExtendWith(SpringExtension.class)
|
||||
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
|
||||
@Import(OcrServiceIntegrationTest.TestConfiguration.class)
|
||||
class ImagePositionRetrievalServiceTest {
|
||||
|
||||
@Autowired
|
||||
private ImagePositionRetrievalService imagePositionRetrievalService;
|
||||
|
||||
@MockBean
|
||||
protected RabbitTemplate rabbitTemplate;
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testImagePositionRetrievalForRotateTestFileWithImages() {
|
||||
|
||||
String fileName = "RotateTestFileWithImages";
|
||||
|
||||
List<int[]> allRectCoords = testImagePositionDetection(fileName);
|
||||
|
||||
assertThat(allRectCoords).contains(new int[]{48, 572, 295, 721},
|
||||
new int[]{54, 279, 301, 428},
|
||||
new int[]{360, 173, 509, 419},
|
||||
new int[]{362, 522, 511, 768},
|
||||
new int[]{459, 354, 608, 600},
|
||||
new int[]{145, 404, 392, 553},
|
||||
new int[]{151, 111, 398, 260},
|
||||
new int[]{457, 5, 606, 251},
|
||||
new int[]{395, 480, 545, 726},
|
||||
new int[]{393, 130, 542, 377},
|
||||
new int[]{88, 236, 334, 386},
|
||||
new int[]{82, 530, 328, 679},
|
||||
new int[]{465, 11, 614, 257},
|
||||
new int[]{159, 117, 406, 266},
|
||||
new int[]{467, 360, 617, 607},
|
||||
new int[]{153, 410, 400, 559});
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testImagePositionRetrievalForRotateTestFileWithImagesExtremeCropbox() {
|
||||
|
||||
String fileName = "RotateTestFileWithImagesExtremeCropbox";
|
||||
|
||||
List<int[]> allRectCoords = testImagePositionDetection(fileName);
|
||||
assertThat(allRectCoords).contains(new int[]{48, 572, 295, 721},
|
||||
new int[]{362, 522, 511, 768},
|
||||
new int[]{360, 173, 509, 419},
|
||||
new int[]{54, 279, 301, 428},
|
||||
new int[]{145, 192, 392, 341},
|
||||
new int[]{459, 142, 608, 388},
|
||||
new int[]{457, -207, 606, 39},
|
||||
new int[]{151, -101, 398, 48},
|
||||
new int[]{-30, 238, 216, 387},
|
||||
new int[]{283, 188, 433, 434},
|
||||
new int[]{281, -162, 430, 85},
|
||||
new int[]{-24, -56, 222, 94},
|
||||
new int[]{-39, 410, 208, 559},
|
||||
new int[]{275, 360, 425, 607},
|
||||
new int[]{273, 11, 422, 257},
|
||||
new int[]{-33, 117, 214, 266});
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testMergeImages() {
|
||||
|
||||
String fileName = "merge_images";
|
||||
List<int[]> allRectCoords = testImagePositionDetection(fileName);
|
||||
assertThat(allRectCoords).contains(new int[]{90, 284, 398, 770});
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testStitchedImagesMultiPage() {
|
||||
|
||||
String fileName = "StitchedImagesMultiPage";
|
||||
List<int[]> allRectCoords = testImagePositionDetection(fileName);
|
||||
assertThat(allRectCoords.size()).isEqualTo(48);
|
||||
}
|
||||
|
||||
|
||||
private List<int[]> testImagePositionDetection(String fileName) throws IOException, PDFNetException {
|
||||
|
||||
InputStream fileStream = new FileInputStream(new ClassPathResource("files/" + fileName + ".pdf").getFile().getAbsolutePath());
|
||||
PDFDoc pdfDoc = new PDFDoc(fileStream);
|
||||
|
||||
Map<Integer, RectCollection> pageIdToRectCollection = imagePositionRetrievalService.getImagePositionPerPage(pdfDoc, false);
|
||||
|
||||
ElementWriter writer = new ElementWriter();
|
||||
pageIdToRectCollection.forEach((pageId, rectCollection) -> {
|
||||
try {
|
||||
writer.begin(pdfDoc.getPage(pageId));
|
||||
drawRectCollection(writer, rectCollection, pdfDoc.getPage(pageId));
|
||||
drawGrid(writer, pdfDoc.getPage(pageId));
|
||||
writer.end();
|
||||
StringBuilder zonesString = new StringBuilder();
|
||||
for (int j = 0; j < rectCollection.getNumRects(); ++j) {
|
||||
var r = rectCollection.getRectAt(j);
|
||||
zonesString.append(String.format("%d:[lower left (%.1f|%.1f) upper right (%.1f|%.1f)]; ", j, r.getX1(), r.getY1(), r.getX2(), r.getY2()));
|
||||
}
|
||||
System.out.printf("Page %d: rotation: %d OCR regions %s \n", pageId, pdfDoc.getPage(pageId).getRotation(), zonesString);
|
||||
} catch (PDFNetException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
});
|
||||
|
||||
// Check visually for red Rectangles to match images in the saved pdf file
|
||||
try (var out = new FileOutputStream(getTemporaryDirectory() + "/" + fileName + "_IMAGE_BBOX.pdf")) {
|
||||
out.write(pdfDoc.save(SDFDoc.SaveMode.LINEARIZED, null));
|
||||
}
|
||||
System.out.println("result file: " + getTemporaryDirectory() + "/" + fileName + "_IMAGE_BBOX.pdf");
|
||||
fileStream.close();
|
||||
// round all coords to nearest int to account for inconsistencies with the calculation of the bounding box
|
||||
return pageIdToRectCollection.values().stream().map(this::toRoundedCoordinateArrayList).flatMap(List::stream).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private List<int[]> toRoundedCoordinateArrayList(RectCollection rectCollection) {
|
||||
|
||||
List<int[]> coords = new ArrayList<>(rectCollection.getNumRects());
|
||||
for (int i = 0; i < rectCollection.getNumRects(); ++i) {
|
||||
var r = rectCollection.getRectAt(i);
|
||||
coords.add(new int[]{(int) Math.round(r.getX1()), (int) Math.round(r.getY1()), (int) Math.round(r.getX2()), (int) Math.round(r.getY2())});
|
||||
}
|
||||
return coords;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void drawGrid(ElementWriter writer, Page page) {
|
||||
|
||||
ElementBuilder eb = new ElementBuilder();
|
||||
double dX = 15;
|
||||
double dY = 15;
|
||||
int nRows = (int) (page.getPageHeight() / dY) + 1;
|
||||
int nCols = (int) (page.getPageWidth() / dX) + 1;
|
||||
for (int row = 0; row < nRows; ++row) {
|
||||
for (int col = 0; col < nCols; ++col) {
|
||||
Element cell = eb.createRect(col * dX, row * dY, dX, dY);
|
||||
cell.setPathStroke(true);
|
||||
cell.getGState().setLineWidth(1);
|
||||
cell.getGState().setStrokeOpacity(0.1);
|
||||
cell.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
||||
if (row == 0 && col == 0) {
|
||||
cell.getGState().setStrokeColor(new ColorPt(0, 0, 1));
|
||||
cell.setPathFill(true);
|
||||
cell.getGState().setFillOpacity(0.8);
|
||||
cell.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||
cell.getGState().setFillColor(new ColorPt(0, 0, 1));
|
||||
} else {
|
||||
cell.setPathFill(false);
|
||||
cell.getGState().setStrokeColor(new ColorPt(0.1, 0.1, 0.1));
|
||||
}
|
||||
writer.writePlacedElement(cell);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static void drawRectCollection(ElementWriter writer, RectCollection rectCollection, Page page) {
|
||||
|
||||
ColorPt colorPt = new ColorPt(1, 0, 0);
|
||||
ElementBuilder eb = new ElementBuilder();
|
||||
for (int i = 0; i < rectCollection.getNumRects(); ++i) {
|
||||
Rect r = rectCollection.getRectAt(i);
|
||||
Element rect = eb.createRect(r.getX1(), r.getY1(), r.getWidth(), r.getHeight());
|
||||
|
||||
rect.setPathStroke(true);
|
||||
rect.getGState().setLineWidth(5);
|
||||
rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
||||
rect.getGState().setStrokeColor(colorPt);
|
||||
|
||||
rect.setPathFill(true);
|
||||
rect.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||
rect.getGState().setFillColor(colorPt);
|
||||
rect.getGState().setFillOpacity(0.5);
|
||||
|
||||
writer.writePlacedElement(rect);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,4 +1,4 @@
|
||||
package com.iqser.red.service.ocr.v1.server;
|
||||
package com.iqser.red.service.ocr.v1.server.service;
|
||||
|
||||
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
|
||||
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
|
||||
@ -17,7 +17,8 @@ import org.springframework.context.annotation.Import;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
import org.springframework.test.context.junit.jupiter.SpringExtension;
|
||||
|
||||
import com.iqser.red.service.ocr.v1.server.service.InvisibleElementRemovalService;
|
||||
import com.iqser.red.service.ocr.v1.server.Application;
|
||||
import com.iqser.red.service.ocr.v1.server.OcrServiceIntegrationTest;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.pdf.PageIterator;
|
||||
@ -26,8 +27,7 @@ import com.pdftron.pdf.TextExtractor;
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@ExtendWith(SpringExtension.class)
|
||||
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT //
|
||||
, properties = {"pdftron.ocrmodule.path=/YourOCRModulePath"})
|
||||
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
|
||||
@Import(OcrServiceIntegrationTest.TestConfiguration.class)
|
||||
public class InvisibleElementRemovalServiceTest {
|
||||
|
||||
@ -1 +0,0 @@
|
||||
{"dossierId": "c8553cbd-409f-4e1a-baf4-34b11d49deac", "fileId": "bd6f93ed896dd0e2f641b0568f13ddf1", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": [{"classification": {"label": "other", "probabilities": {"other": 0.9999, "logo": 0.0001, "formula": 0.0, "signature": 0.0}}, "representation": "FFFFFEFBF7EFCFFFFFFFFFFFF", "position": {"x1": -3, "x2": 795, "y1": 0, "y2": 612, "pageNumber": 1}, "geometry": {"width": 798, "height": 612}, "alpha": false, "filters": {"geometry": {"imageSize": {"quotient": 1.0038, "tooLarge": true, "tooSmall": false}, "imageFormat": {"quotient": 1.3039, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "logo", "probabilities": {"logo": 0.9966, "other": 0.0025, "signature": 0.0005, "formula": 0.0003}}, "representation": "FFC33D3C323CCF3390C1F8C72", "position": {"x1": 120, "x2": 131, "y1": 264, "y2": 380, "pageNumber": 1}, "geometry": {"width": 11, "height": 116}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0513, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.0948, "tooTall": true, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "other", "probabilities": {"other": 1.0, "formula": 0.0, "logo": 0.0, "signature": 0.0}}, "representation": "EF8FF6381060800318F0E187", "position": {"x1": 152, "x2": 205, "y1": 115, "y2": 533, "pageNumber": 1}, "geometry": {"width": 53, "height": 418}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.2138, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.1268, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": true}}, {"classification": {"label": "other", "probabilities": {"other": 0.5967, "logo": 0.1756, "signature": 0.1218, "formula": 0.106}}, "representation": "EC9377C9170E1070C3070C30F", "position": {"x1": 196, "x2": 221, "y1": 121, "y2": 245, "pageNumber": 1}, "geometry": {"width": 25, "height": 124}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.08, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.2016, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": true}}, {"classification": {"label": "logo", "probabilities": {"logo": 1.0, "formula": 0.0, "other": 0.0, "signature": 0.0}}, "representation": "CF1F1F70F140036860F1441B5", "position": {"x1": 707, "x2": 718, "y1": 481, "y2": 531, "pageNumber": 1}, "geometry": {"width": 11, "height": 50}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0337, "tooLarge": false, "tooSmall": true}, "imageFormat": {"quotient": 0.22, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "logo", "probabilities": {"logo": 0.9704, "other": 0.0223, "formula": 0.0044, "signature": 0.0029}}, "representation": "CF0F1C70F1C7090081F7CF073", "position": {"x1": 732, "x2": 744, "y1": 118, "y2": 241, "pageNumber": 1}, "geometry": {"width": 12, "height": 123}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0552, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.0976, "tooTall": true, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}], "dataCV": []}
|
||||
@ -1 +0,0 @@
|
||||
{"dossierId": "55547c91-6b0e-4aa6-9009-2e7c4cd90f13", "fileId": "917b9d9c9f548f85fef3679db45ff46c", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": [{"classification": {"label": "signature", "probabilities": {"signature": 0.9927, "logo": 0.0038, "other": 0.0034, "formula": 0.0}}, "representation": "FFF2CF0F7C74FFC1070830FFF", "position": {"x1": -7, "x2": 603, "y1": 0, "y2": 852, "pageNumber": 1}, "geometry": {"width": 610, "height": 852}, "alpha": false, "filters": {"geometry": {"imageSize": {"quotient": 1.0096, "tooLarge": true, "tooSmall": false}, "imageFormat": {"quotient": 0.716, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}], "dataCV": []}
|
||||
@ -1 +0,0 @@
|
||||
{"dossierId": "c8553cbd-409f-4e1a-baf4-34b11d49deac", "fileId": "bd6f93ed896dd0e2f641b0568f13ddf1", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": [{"classification": {"label": "other", "probabilities": {"other": 0.9999, "logo": 0.0001, "formula": 0.0, "signature": 0.0}}, "representation": "FFFFFEFBF7EFCFFFFFFFFFFFF", "position": {"x1": -3, "x2": 795, "y1": 0, "y2": 612, "pageNumber": 1}, "geometry": {"width": 798, "height": 612}, "alpha": false, "filters": {"geometry": {"imageSize": {"quotient": 1.0038, "tooLarge": true, "tooSmall": false}, "imageFormat": {"quotient": 1.3039, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "logo", "probabilities": {"logo": 0.9966, "other": 0.0025, "signature": 0.0005, "formula": 0.0003}}, "representation": "FFC33D3C323CCF3390C1F8C72", "position": {"x1": 120, "x2": 131, "y1": 264, "y2": 380, "pageNumber": 1}, "geometry": {"width": 11, "height": 116}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0513, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.0948, "tooTall": true, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "other", "probabilities": {"other": 1.0, "formula": 0.0, "logo": 0.0, "signature": 0.0}}, "representation": "EF8FF6381060800318F0E187", "position": {"x1": 152, "x2": 205, "y1": 115, "y2": 533, "pageNumber": 1}, "geometry": {"width": 53, "height": 418}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.2138, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.1268, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": true}}, {"classification": {"label": "other", "probabilities": {"other": 0.5967, "logo": 0.1756, "signature": 0.1218, "formula": 0.106}}, "representation": "EC9377C9170E1070C3070C30F", "position": {"x1": 196, "x2": 221, "y1": 121, "y2": 245, "pageNumber": 1}, "geometry": {"width": 25, "height": 124}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.08, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.2016, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": true}}, {"classification": {"label": "logo", "probabilities": {"logo": 1.0, "formula": 0.0, "other": 0.0, "signature": 0.0}}, "representation": "CF1F1F70F140036860F1441B5", "position": {"x1": 707, "x2": 718, "y1": 481, "y2": 531, "pageNumber": 1}, "geometry": {"width": 11, "height": 50}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0337, "tooLarge": false, "tooSmall": true}, "imageFormat": {"quotient": 0.22, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "logo", "probabilities": {"logo": 0.9704, "other": 0.0223, "formula": 0.0044, "signature": 0.0029}}, "representation": "CF0F1C70F1C7090081F7CF073", "position": {"x1": 732, "x2": 744, "y1": 118, "y2": 241, "pageNumber": 1}, "geometry": {"width": 12, "height": 123}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0552, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.0976, "tooTall": true, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}], "dataCV": []}
|
||||
@ -1 +0,0 @@
|
||||
{"dossierId": "55547c91-6b0e-4aa6-9009-2e7c4cd90f13", "fileId": "d7f1e0e37cba4e28ebdf894a79d3bd67", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": [{"classification": {"label": "signature", "probabilities": {"signature": 0.9872, "logo": 0.0064, "other": 0.0063, "formula": 0.0001}}, "representation": "FFFCF10608F6F89747BFFC301", "position": {"x1": -9, "x2": 584, "y1": 9, "y2": 849, "pageNumber": 1}, "geometry": {"width": 593, "height": 840}, "alpha": false, "filters": {"geometry": {"imageSize": {"quotient": 0.9992, "tooLarge": true, "tooSmall": false}, "imageFormat": {"quotient": 0.706, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}], "dataCV": []}
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1 +0,0 @@
|
||||
{"dossierId": "55547c91-6b0e-4aa6-9009-2e7c4cd90f13", "fileId": "32b19ec38896f5105c09041def470c90", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": [{"classification": {"label": "logo", "probabilities": {"logo": 0.9999, "signature": 0.0001, "formula": 0.0, "other": 0.0}}, "representation": "307EF8F6E9833CE9D7AF9EFFF", "position": {"x1": 26, "x2": 586, "y1": -2, "y2": 794, "pageNumber": 1}, "geometry": {"width": 560, "height": 796}, "alpha": false, "filters": {"geometry": {"imageSize": {"quotient": 0.959, "tooLarge": true, "tooSmall": false}, "imageFormat": {"quotient": 0.7035, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "logo", "probabilities": {"logo": 1.0, "formula": 0.0, "other": 0.0, "signature": 0.0}}, "representation": "FFF7FFD2000000018F3FFEFFF", "position": {"x1": 90, "x2": 210, "y1": 676, "y2": 720, "pageNumber": 1}, "geometry": {"width": 120, "height": 44}, "alpha": false, "filters": {"geometry": {"imageSize": {"quotient": 0.1044, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 2.7273, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": true}}], "dataCV": []}
|
||||
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user