RED-2440: Integrated image-service-v2

This commit is contained in:
deiflaender 2021-12-22 10:35:13 +01:00
parent db16f8c1da
commit dee1aa1f01
27 changed files with 920 additions and 324 deletions

View File

@ -12,7 +12,7 @@
<artifactId>redaction-service-api-v1</artifactId>
<properties>
<persistence-service.version>0.149.0</persistence-service.version>
<persistence-service.version>0.151.0</persistence-service.version>
</properties>
<dependencies>

View File

@ -7,6 +7,7 @@ import lombok.Data;
import lombok.NonNull;
import lombok.RequiredArgsConstructor;
import java.util.ArrayList;
import java.util.List;
@Data
@ -16,7 +17,7 @@ public class Page {
@NonNull
private List<AbstractTextContainer> textBlocks;
private List<PdfImage> images;
private List<PdfImage> images = new ArrayList<>();
private Rectangle bodyTextFrame;

View File

@ -1,15 +0,0 @@
package com.iqser.red.service.redaction.v1.server.client;
import org.springframework.cloud.openfeign.FeignClient;
import org.springframework.http.MediaType;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.multipart.MultipartFile;
/**
 * Feign client for the remote image classification endpoint.
 * The base URL is resolved from the {@code image-service.url} property.
 */
@FeignClient(name = "ImageClassificationResource", url = "${image-service.url}")
public interface ImageClassificationClient {
/**
 * Posts a single image as multipart form data to {@code /process_full_img}.
 *
 * @param file the image to classify
 * @return the JSON response carrying the category assigned to the image
 */
@PostMapping(value = "/process_full_img", consumes = MediaType.MULTIPART_FORM_DATA_VALUE, produces = MediaType.APPLICATION_JSON_VALUE)
ImageClassificationResponse classify(@RequestBody MultipartFile file);
}

View File

@ -1,13 +0,0 @@
package com.iqser.red.service.redaction.v1.server.client;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Response body returned by {@code ImageClassificationClient#classify}.
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class ImageClassificationResponse {
// Category name; ImageClassificationService passes it to ImageType.valueOf, so it has to match an ImageType constant exactly.
private String category;
}

View File

@ -88,7 +88,7 @@ public class RedactionController implements RedactionResource {
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest
.getDossierId(), redactionRequest.getFileId(), FileType.ORIGIN));
try {
Document classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream);
Document classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream, null);
storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest
.getDossierId(), redactionRequest.getFileId(), FileType.ORIGIN));
@ -116,7 +116,7 @@ public class RedactionController implements RedactionResource {
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest
.getDossierId(), redactionRequest.getFileId(), FileType.ORIGIN));
try {
Document classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream);
Document classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream, null);
storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest
.getDossierId(), redactionRequest.getFileId(), FileType.ORIGIN));
@ -145,7 +145,7 @@ public class RedactionController implements RedactionResource {
try {
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(redactionRequest
.getDossierId(), redactionRequest.getFileId(), FileType.ORIGIN));
classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream, true);
classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream, null);
} catch (Exception e) {
throw new RedactionException(e);
}

View File

@ -183,9 +183,9 @@ public class PDFLinesTextStripper extends PDFTextStripper {
graphicsPath.clear();
break;
case OperatorName.DRAW_OBJECT:
processImageOperation(arguments);
break;
// case OperatorName.DRAW_OBJECT:
// processImageOperation(arguments);
// break;
}

View File

@ -389,8 +389,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine
}
}
characterListMapping.clear();
super.processPage(page);
writePage();
super.processPage(page);writePage();
endPage(page);
}
}

View File

@ -16,6 +16,7 @@ public class PdfImage {
private BufferedImage image;
@NonNull
private RedRectangle2D position;
@NonNull
private ImageType imageType;
private boolean isAppendedToParagraph;
private boolean hasTransparency;

View File

@ -0,0 +1,14 @@
package com.iqser.red.service.redaction.v1.server.redaction.model.image;
import java.util.HashMap;
import java.util.Map;
import lombok.Data;
/**
 * Classification result for a single image as delivered in the image-service response.
 */
@Data
public class Classification {
// Score per candidate label; the bundled sample response uses the keys "logo", "signature", "other" and "formula".
private Map<String, Float> probabilities = new HashMap<>();
// Label selected by the image service; ImageService upper-cases it and resolves it to an ImageType.
private String label;
}

View File

@ -0,0 +1,10 @@
package com.iqser.red.service.redaction.v1.server.redaction.model.image;
import lombok.Data;
/**
 * Geometry-related filter results of one image ("filters.geometry" in the image-service response).
 */
@Data
public class FilterGeometry {
// Outcome of the size check (tooLarge / tooSmall).
private ImageSize imageSize;
// Outcome of the aspect check (tooTall / tooWide).
private ImageFormat imageFormat;
}

View File

@ -0,0 +1,11 @@
package com.iqser.red.service.redaction.v1.server.redaction.model.image;
import lombok.Data;
/**
 * Filter results attached to one image ("filters" in the image-service response).
 */
@Data
public class Filters {
// Size and format checks.
private FilterGeometry geometry;
// Confidence check of the classification.
private Probability probability;
// ImageService only uses the classification label when this flag is true; otherwise the image becomes ImageType.OTHER.
private boolean allPassed;
}

View File

@ -0,0 +1,9 @@
package com.iqser.red.service.redaction.v1.server.redaction.model.image;
import lombok.Data;
/**
 * Extent of an image ("geometry" in the image-service response).
 * ImageService uses both values as width and height of the image's RedRectangle2D.
 */
@Data
public class Geometry {
private float width;
private float height;
}

View File

@ -0,0 +1,12 @@
package com.iqser.red.service.redaction.v1.server.redaction.model.image;
import lombok.Data;
/**
 * Result of the format filter ("filters.geometry.imageFormat" in the image-service response).
 */
@Data
public class ImageFormat {
// In the bundled sample response this equals geometry width / height - presumably the aspect ratio; TODO confirm with the image service.
private float quotient;
private boolean tooTall;
private boolean tooWide;
}

View File

@ -0,0 +1,12 @@
package com.iqser.red.service.redaction.v1.server.redaction.model.image;
import lombok.Data;
/**
 * One entry of the "imageMetadata" array in the image-service response; describes a single image.
 */
@Data
public class ImageMetadata {
// Label and per-label probabilities.
private Classification classification;
// Corner coordinates and page number of the image.
private Position position;
// Width and height of the image.
private Geometry geometry;
// Filter outcomes; ImageService reads filters.allPassed before using the label.
private Filters filters;
}

View File

@ -0,0 +1,15 @@
package com.iqser.red.service.redaction.v1.server.redaction.model.image;
import java.util.ArrayList;
import java.util.List;
import lombok.Data;
/**
 * Root object of the image-service result that ImageService reads from storage and maps to PdfImage instances.
 */
@Data
public class ImageServiceResponse {
private String dossierId;
private String fileId;
// One entry per image; defaults to an empty list when the property is absent from the JSON.
private List<ImageMetadata> imageMetadata = new ArrayList<>();
}

View File

@ -0,0 +1,12 @@
package com.iqser.red.service.redaction.v1.server.redaction.model.image;
import lombok.Data;
/**
 * Result of the size filter ("filters.geometry.imageSize" in the image-service response).
 */
@Data
public class ImageSize {
// NOTE(review): presumably the image size relative to the page (about 1.0 for the full-page images in the sample response) - confirm with the image service.
private float quotient;
private boolean tooLarge;
private boolean tooSmall;
}

View File

@ -0,0 +1,12 @@
package com.iqser.red.service.redaction.v1.server.redaction.model.image;
import lombok.Data;
/**
 * Location of an image ("position" in the image-service response).
 * In the bundled sample response x2 - x1 and y2 - y1 match the geometry width and height.
 */
@Data
public class Position {
// ImageService uses x1 / y1 as the origin of the image's RedRectangle2D.
private float x1;
private float x2;
private float y1;
private float y2;
// Key under which ImageService groups the images; PdfSegmentationService looks images up by its own page number.
private int pageNumber;
}

View File

@ -0,0 +1,10 @@
package com.iqser.red.service.redaction.v1.server.redaction.model.image;
import lombok.Data;
/**
 * Result of the probability filter ("filters.probability" in the image-service response).
 */
@Data
public class Probability {
// NOTE(review): presumably true when the classifier's confidence is too low - confirm with the image service.
private boolean unconfident;
}

View File

@ -26,8 +26,10 @@ import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryIncre
import com.iqser.red.service.redaction.v1.server.redaction.model.DictionaryVersion;
import com.iqser.red.service.redaction.v1.server.redaction.model.Image;
import com.iqser.red.service.redaction.v1.server.redaction.model.PageEntities;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.redaction.model.RedRectangle2D;
import com.iqser.red.service.redaction.v1.server.redaction.utils.EntitySearchUtils;
import com.iqser.red.service.redaction.v1.server.segmentation.ImageService;
import com.iqser.red.service.redaction.v1.server.segmentation.PdfSegmentationService;
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
@ -53,6 +55,7 @@ public class AnalyzeService {
private final SectionTextBuilderService sectionTextBuilderService;
private final SectionGridCreatorService sectionGridCreatorService;
private final NerAnalyserService nerAnalyserService;
private final ImageService imageService;
public void analyzeDocumentStructure(StructureAnalyzeRequest analyzeRequest) {
@ -63,9 +66,15 @@ public class AnalyzeService {
Document classifiedDoc;
try {
var storedObjectStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(analyzeRequest
.getDossierId(), analyzeRequest.getFileId(), FileType.ORIGIN));
classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream);
Map<Integer, List<PdfImage>> pdfImages = null;
if(redactionServiceSettings.isEnableImageClassification()) {
pdfImages = imageService.convertImages(analyzeRequest.getDossierId(), analyzeRequest.getFileId());
}
classifiedDoc = pdfSegmentationService.parseDocument(storedObjectStream, pdfImages);
pageCount = classifiedDoc.getPages().size();
} catch (Exception e) {
throw new RedactionException(e);

View File

@ -1,71 +0,0 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.client.ImageClassificationClient;
import com.iqser.red.service.redaction.v1.server.client.ImageClassificationResponse;
import com.iqser.red.service.redaction.v1.server.client.MockMultipartFile;
import com.iqser.red.service.redaction.v1.server.redaction.model.ImageType;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.settings.RedactionServiceSettings;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import javax.imageio.ImageIO;
import java.io.ByteArrayOutputStream;
/**
 * Determines the {@link ImageType} of every image on a page by sending the image
 * to the remote classification endpoint.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class ImageClassificationService {

    private final ImageClassificationClient imageClassificationClient;
    private final RedactionServiceSettings settings;


    /**
     * Assigns an image type to each image of the page and releases the pixel data afterwards.
     * Images are typed {@code OTHER} when classification is disabled, when the image covers
     * (almost) the whole page, or when the remote call fails. An {@code OTHER} image that
     * contains at least one text block of the page is re-typed as {@code OCR}.
     *
     * @param page the page whose images are classified in place
     */
    public void classifyImages(Page page) {

        for (PdfImage image : page.getImages()) {
            boolean useRemoteClassifier = settings.isEnableImageClassification() && !isEntirePageImage(image, page);
            if (!useRemoteClassifier) {
                image.setImageType(ImageType.OTHER);
            } else {
                long start = System.currentTimeMillis();
                image.setImageType(requestImageType(image));
                log.info("Image classification took: " + (System.currentTimeMillis() - start));
            }

            // The pixel data is not needed after classification; drop it to free memory.
            image.getImage().flush();
            image.setImage(null);

            if (image.getImageType().equals(ImageType.OTHER) && containsTextBlock(image, page)) {
                image.setImageType(ImageType.OCR);
            }
        }
    }


    /**
     * Encodes the image as PNG, sends it to the classification endpoint and maps the returned
     * category to an {@link ImageType}. Any failure is logged and yields {@code OTHER}.
     */
    private ImageType requestImageType(PdfImage image) {

        try (ByteArrayOutputStream pngBytes = new ByteArrayOutputStream()) {
            ImageIO.write(image.getImage(), "png", pngBytes);
            MockMultipartFile upload = new MockMultipartFile("file", "Image.png", "image/png", pngBytes.toByteArray());
            ImageClassificationResponse answer = imageClassificationClient.classify(upload);
            return ImageType.valueOf(answer.getCategory());
        } catch (Exception e) {
            log.error("Could not classify image", e);
            return ImageType.OTHER;
        }
    }


    /**
     * Tells whether the bounding box of the image fully contains at least one text block of the page.
     */
    private boolean containsTextBlock(PdfImage image, Page page) {

        return page.getTextBlocks()
                .stream()
                .anyMatch(block -> image.getPosition().contains(block.getMinX(), block.getMinY(), block.getWidth(), block.getHeight()));
    }


    /**
     * Tells whether the image area reaches the configured share of the page's crop box area.
     */
    private boolean isEntirePageImage(PdfImage image, Page page) {

        double imageArea = image.getPosition().getWidth() * image.getPosition().getHeight();
        boolean coversPage = imageArea / page.getCropBoxArea() >= settings.getMaxImageCropboxRatio();
        if (coversPage) {
            log.info("Skipping image classification because images is almost as large as the entire page");
        }
        return coversPage;
    }

}

View File

@ -1,165 +0,0 @@
package com.iqser.red.service.redaction.v1.server.segmentation;
import java.awt.Graphics;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import java.util.ArrayList;
import java.util.List;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
/**
 * Re-assembles images that were split into several stacked pieces during PDF import
 * and drops images that lie completely inside another image.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class ImageMergeService {

    /**
     * Merges split images and removes every image whose bounding box is fully contained in
     * the bounding box of another image.
     *
     * @param images   the images of one page, in extraction order
     * @param rotation the page rotation in degrees; only the value 90 is treated specially
     * @return the merged images (the passed list itself when it holds fewer than two images)
     */
    public List<PdfImage> mergeImages(List<PdfImage> images, int rotation) {

        List<PdfImage> mergedList = processImages(images, rotation);
        List<PdfImage> imagesInImage = new ArrayList<>();
        for (PdfImage image : mergedList) {
            for (PdfImage inner : mergedList) {
                if (image != inner && image.getPosition()
                        .contains(inner.getPosition().getX(), inner.getPosition().getY(), inner.getPosition().getWidth(), inner.getPosition().getHeight())) {
                    imagesInImage.add(inner);
                }
            }
        }
        mergedList.removeAll(imagesInImage);
        return mergedList;
    }


    /**
     * Merges neighbouring images that were separated during PDF import and returns a new list
     * of images. Lists with fewer than two images are returned unchanged.
     */
    private List<PdfImage> processImages(List<PdfImage> imageList, int rotation) {

        if (imageList.size() <= 1) {
            return imageList;
        }

        List<PdfImage> mergedList = new ArrayList<>();
        int countElementsInList = 0;
        boolean beginImage = true;

        // candidatesList.get(i) == true means imageList[i] and imageList[i + 1] should be merged
        List<Boolean> candidatesList = getCandidatesList(imageList, rotation);

        for (int i = 0; i < candidatesList.size(); i++) {
            if (candidatesList.get(i)) {
                if (beginImage) {
                    // start of a split image: merge the two neighbouring parts of imageList
                    PdfImage mergedImage = mergeTwoImages(imageList.get(i), imageList.get(i + 1), rotation);
                    // only keep the result when the merge succeeded
                    if (mergedImage != null) {
                        mergedList.add(mergedImage);
                        countElementsInList++;
                    }
                } else {
                    // middle of a split image: merge the current piece of mergedList with the next image of imageList
                    PdfImage mergedImage = mergeTwoImages(mergedList.get(countElementsInList - 1), imageList.get(i + 1), rotation);
                    // only replace the piece when the merge succeeded
                    if (mergedImage != null) {
                        mergedList.set(countElementsInList - 1, mergedImage);
                    }
                }
                beginImage = false;
            } else {
                // if the last candidate is false, then both images i and i + 1 must be added
                if (i == candidatesList.size() - 1) {
                    if (countElementsInList > 0 && mergedList.get(countElementsInList - 1) == imageList.get(i)) {
                        mergedList.add(imageList.get(i + 1));
                    } else {
                        mergedList.add(imageList.get(i));
                        mergedList.add(imageList.get(i + 1));
                    }
                } else {
                    if (beginImage) {
                        // first image is not split, add i to the result list
                        mergedList.add(imageList.get(i));
                        countElementsInList++;
                    } else {
                        // i is the end of an image, add the beginning of the next image
                        mergedList.add(imageList.get(i + 1));
                        countElementsInList++;
                    }
                }
            }
        }
        return mergedList;
    }


    /**
     * Draws {@code image2} below {@code image1} into a new image and wraps it in a new
     * {@link PdfImage}.
     *
     * @return the merged image, or {@code null} when drawing or wrapping failed
     */
    private PdfImage mergeTwoImages(PdfImage image1, PdfImage image2, int rotation) {

        // The dimensions reported by getPosition() do not seem to be correct for drawing;
        // using them cuts off parts of the image. They are only used for the merged position.
        double width = image1.getPosition().getWidth();
        double width2 = image2.getPosition().getWidth();
        double height1 = image1.getPosition().getHeight();
        double height2 = image2.getPosition().getHeight();

        // The pixel dimensions stored in the images themselves work for drawing.
        double img1height = image1.getImage().getHeight();
        double img1width = image1.getImage().getWidth();
        double img2height = image2.getImage().getHeight();

        BufferedImage mergedImage = new BufferedImage((int) img1width, (int) (img1height + img2height), BufferedImage.TYPE_INT_RGB);
        Graphics mergedImageGraphics = mergedImage.getGraphics();
        try {
            mergedImageGraphics.drawImage(image1.getImage(), 0, 0, null);
            mergedImageGraphics.drawImage(image2.getImage(), 0, (int) (img1height), null);

            // Position of the merged image: x of image1, y of image2 and the combined extent of both
            Rectangle2D pos = new Rectangle2D.Float();
            pos.setRect(image1.getPosition().getX(), image2.getPosition().getY(), rotation == 90 ? width + width2 : width, rotation == 90 ? height1 : height1 + height2);

            PdfImage newPdfImage = new PdfImage(mergedImage, pos, image1.getPage(), image1.isHasTransparency() || image2.isHasTransparency());

            image1.getImage().flush();
            image2.getImage().flush();
            mergedImage.flush();

            return newPdfImage;
        } catch (Exception e) {
            // failed to merge image
            log.error("Failed to merge image", e);
            return null;
        } finally {
            // The graphics context must be released on the failure path as well.
            mergedImageGraphics.dispose();
        }
    }


    /**
     * Builds the merge flags for all neighbouring pairs: entry {@code i} tells whether
     * {@code imageList[i]} and {@code imageList[i + 1]} are candidates for merging.
     */
    private List<Boolean> getCandidatesList(List<PdfImage> imageList, int rotation) {

        List<Boolean> candidatesList = new ArrayList<>();
        for (int i = 1; i < imageList.size(); i++) {
            candidatesList.add(isCandidateForMerging(imageList.get(i - 1), imageList.get(i), rotation));
        }
        return candidatesList;
    }


    /**
     * Evaluates whether two images are candidates for merging, based on their coordinates,
     * width and height. For a rotation of 90 the x/y and width/height roles are swapped.
     */
    private boolean isCandidateForMerging(PdfImage image1, PdfImage image2, int rotation) {

        double x1 = rotation == 90 ? image1.getPosition().getY() : image1.getPosition().getX();
        double y1 = rotation == 90 ? image1.getPosition().getX() : image1.getPosition().getY();
        double width1 = rotation == 90 ? image1.getPosition().getHeight() : image1.getPosition().getWidth();

        double x2 = rotation == 90 ? image2.getPosition().getY() : image2.getPosition().getX();
        double y2 = rotation == 90 ? image2.getPosition().getX() : image2.getPosition().getY();
        double width2 = rotation == 90 ? image2.getPosition().getHeight() : image2.getPosition().getWidth();
        double height2 = rotation == 90 ? image2.getPosition().getWidth() : image2.getPosition().getHeight();

        // If the x-coordinates and widths are equal and the height of the second image equals the
        // distance between the y-coordinates, both pieces belong to the same picture. Very narrow
        // strips (width not larger than a sixth of the height) are never merged.
        return x1 == x2 && width1 == width2 && Math.ceil(height2) == Math.ceil(rotation == 90 ? y2 - y1 : y1 - y2) && width2 > (height2 / 6);
    }

}

View File

@ -0,0 +1,61 @@
package com.iqser.red.service.redaction.v1.server.segmentation;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.springframework.stereotype.Service;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.redaction.model.ImageType;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.redaction.model.RedRectangle2D;
import com.iqser.red.service.redaction.v1.server.redaction.model.image.ImageServiceResponse;
import com.iqser.red.service.redaction.v1.server.storage.RedactionStorageService;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
/**
 * Converts the stored result of the image service into {@link PdfImage} instances and
 * marks images that contain text blocks as OCR images.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class ImageService {

    private final ObjectMapper objectMapper;
    private final RedactionStorageService redactionStorageService;


    /**
     * Loads the {@code IMAGE_INFO} object of the given file from storage and maps every image
     * entry to a {@link PdfImage}, grouped by the page number reported by the image service.
     * <p>
     * The classification label is only used when all filters of the entry passed. Entries
     * without filters or classification, or with a label that matches no {@link ImageType}
     * constant, are typed {@link ImageType#OTHER} instead of aborting the whole conversion.
     *
     * @param dossierId id of the dossier the file belongs to
     * @param fileId    id of the file
     * @return images per page number; empty when the response contains no image metadata
     */
    @SneakyThrows
    public Map<Integer, List<PdfImage>> convertImages(String dossierId, String fileId) {

        ImageServiceResponse imageServiceResponse;
        // Close the stream explicitly instead of relying on the ObjectMapper's auto-close configuration.
        try (var imageClassificationStream = redactionStorageService.getStoredObject(RedactionStorageService.StorageIdUtils.getStorageId(dossierId, fileId, FileType.IMAGE_INFO))) {
            imageServiceResponse = objectMapper.readValue(imageClassificationStream, ImageServiceResponse.class);
        }

        Map<Integer, List<PdfImage>> images = new HashMap<>();
        // A JSON "null" document or an explicit null list yields no images.
        if (imageServiceResponse == null || imageServiceResponse.getImageMetadata() == null) {
            return images;
        }

        for (var imageMetadata : imageServiceResponse.getImageMetadata()) {
            var filters = imageMetadata.getFilters();
            var classification = imageMetadata.getClassification();
            boolean labelUsable = filters != null && filters.isAllPassed() && classification != null;
            ImageType imageType = labelUsable ? resolveImageType(classification.getLabel()) : ImageType.OTHER;

            var position = imageMetadata.getPosition();
            var geometry = imageMetadata.getGeometry();
            var bounds = new RedRectangle2D(position.getX1(), position.getY1(), geometry.getWidth(), geometry.getHeight());

            images.computeIfAbsent(position.getPageNumber(), pageNumber -> new ArrayList<>())
                    .add(new PdfImage(bounds, imageType, position.getPageNumber()));
        }
        return images;
    }


    /**
     * Re-types every {@link ImageType#OTHER} image of the page as {@link ImageType#OCR} when
     * its bounding box fully contains at least one text block of the page.
     *
     * @param page the page whose images are updated in place
     */
    public void findOcr(Page page) {

        page.getImages().forEach(image -> {
            if (!image.getImageType().equals(ImageType.OTHER)) {
                return;
            }
            boolean containsText = page.getTextBlocks()
                    .stream()
                    .anyMatch(textblock -> image.getPosition()
                            .contains(textblock.getMinX(), textblock.getMinY(), textblock.getWidth(), textblock.getHeight()));
            if (containsText) {
                image.setImageType(ImageType.OCR);
            }
        });
    }


    /**
     * Maps a label of the image service to the {@link ImageType} constant with the same
     * upper-cased name; missing or unknown labels are logged and mapped to {@link ImageType#OTHER}.
     */
    private static ImageType resolveImageType(String label) {

        if (label != null) {
            String constantName = label.toUpperCase(Locale.ROOT);
            for (ImageType candidate : ImageType.values()) {
                if (candidate.name().equals(constantName)) {
                    return candidate;
                }
            }
        }
        log.warn("Unknown image classification label '{}', falling back to {}", label, ImageType.OTHER);
        return ImageType.OTHER;
    }

}

View File

@ -10,6 +10,7 @@ import java.nio.file.attribute.PosixFilePermission;
import java.nio.file.attribute.PosixFilePermissions;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.io.IOUtils;
@ -29,7 +30,6 @@ import com.iqser.red.service.redaction.v1.server.memory.MemoryStats;
import com.iqser.red.service.redaction.v1.server.parsing.PDFLinesTextStripper;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.redaction.service.ImageClassificationService;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService;
@ -50,17 +50,10 @@ public class PdfSegmentationService {
private final BlockificationService blockificationService;
private final ClassificationService classificationService;
private final SectionsBuilderService sectionsBuilderService;
private final ImageClassificationService imageClassificationService;
private final ImageMergeService imageMergeService;
private final ImageService imageService;
public Document parseDocument(InputStream documentInputStream) throws IOException {
return parseDocument(documentInputStream, false);
}
public Document parseDocument(InputStream documentInputStream, boolean ignoreImages) throws IOException {
public Document parseDocument(InputStream documentInputStream, Map<Integer, List<PdfImage>> pdfImages) throws IOException {
PDDocument pdDocument = null;
try {
@ -122,15 +115,14 @@ public class PdfSegmentationService {
page.setLandscape(isLandscape || isRotated);
page.setPageNumber(pageNumber);
List<PdfImage> mergedList = imageMergeService.mergeImages(stripper.getImages(), rotation);
page.setImages(mergedList);
tableExtractionService.extractTables(cleanRulings, page);
buildPageStatistics(page);
increaseDocumentStatistics(page, document);
if (!ignoreImages) {
imageClassificationService.classifyImages(page);
if (pdfImages != null && pdfImages.containsKey(pageNumber)) {
page.setImages(pdfImages.get(pageNumber));
imageService.findOcr(page);
}
pages.add(page);

View File

@ -127,6 +127,8 @@ public class RedactionStorageService {
return dossierId + "/" + fileId + "." + fileType.name() + fileType.getExtension();
}
}
}

View File

@ -75,7 +75,6 @@ import com.iqser.red.service.redaction.v1.model.RedactionResult;
import com.iqser.red.service.redaction.v1.model.StructureAnalyzeRequest;
import com.iqser.red.service.redaction.v1.server.classification.model.SectionText;
import com.iqser.red.service.redaction.v1.server.client.DictionaryClient;
import com.iqser.red.service.redaction.v1.server.client.ImageClassificationClient;
import com.iqser.red.service.redaction.v1.server.client.LegalBasisClient;
import com.iqser.red.service.redaction.v1.server.client.RulesClient;
import com.iqser.red.service.redaction.v1.server.controller.RedactionController;
@ -137,9 +136,6 @@ public class RedactionIntegrationTest {
@MockBean
private DictionaryClient dictionaryClient;
@MockBean
private ImageClassificationClient imageClassificationClient;
@Autowired
private RedactionStorageService redactionStorageService;
@ -899,7 +895,7 @@ public class RedactionIntegrationTest {
@Test
public void redactionTest() throws IOException {
String fileName = "files/new/S416.pdf";
String fileName = "files/new/Single Study - Oral (Gavage) Mouse.pdf";
String outputFileName = OsUtils.getTemporaryDirectory() + "/Annotated.pdf";
long start = System.currentTimeMillis();

View File

@ -1,12 +1,16 @@
package com.iqser.red.service.redaction.v1.server.segmentation;
import com.amazonaws.services.s3.AmazonS3;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.redaction.v1.server.Application;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.service.BlockificationService;
import com.iqser.red.service.redaction.v1.server.client.LegalBasisClient;
import com.iqser.red.service.redaction.v1.server.redaction.model.ImageType;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.redaction.model.RedRectangle2D;
import com.iqser.red.service.redaction.v1.server.redaction.model.image.ImageServiceResponse;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService;
@ -31,13 +35,18 @@ import javax.imageio.ImageIO;
import java.io.ByteArrayOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.stream.Collectors;
import static com.iqser.red.service.redaction.v1.server.redaction.utils.OsUtils.getTemporaryDirectory;
import static org.assertj.core.api.Assertions.assertThat;
import lombok.SneakyThrows;
@RunWith(SpringRunner.class)
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
@ -68,42 +77,29 @@ public class PdfSegmentationServiceTest {
@MockBean
private LegalBasisClient legalBasisClient;
@Autowired
private ObjectMapper objectMapper;
@Configuration
@EnableAutoConfiguration(exclude = { RabbitAutoConfiguration.class})
public static class TestConfiguration {
}
@Test
public void testMergeImages() throws IOException {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/270Rotated.pdf");
Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
assertThat(document.getPages().get(0).getImages().size()).isEqualTo(1);
assertThat(document.getPages().get(1).getImages().size()).isEqualTo(0);
}
@Test
@Ignore
public void testExtractImages() throws IOException {
@SneakyThrows
public void testMapping(){
ClassPathResource responseJson = new ClassPathResource("files/image_response.json");
ImageServiceResponse imageServiceResponse = objectMapper.readValue(responseJson.getInputStream(), ImageServiceResponse.class);
ClassPathResource pdfFileResource = new ClassPathResource("files/new/Single Study - Oral (Gavage) Mouse.pdf");
Map<Integer, List<PdfImage>> images = new HashMap<>();
imageServiceResponse.getImageMetadata().stream().forEach(imageMetadata -> {
images.computeIfAbsent(imageMetadata.getPosition().getPageNumber() ,x -> new ArrayList<>())
.add(new PdfImage(new RedRectangle2D(imageMetadata.getPosition().getX1(), imageMetadata.getPosition().getY1(), imageMetadata.getGeometry().getWidth(), imageMetadata.getGeometry().getHeight()), ImageType.valueOf(imageMetadata.getClassification().getLabel().toUpperCase(Locale.ROOT)), imageMetadata.getPosition().getPageNumber()));
});
Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
int i = 0;
for (Page page : document.getPages()) {
for (PdfImage image : page.getImages()) {
try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
ImageIO.write(image.getImage(), "png", baos);
try (FileOutputStream fileOutputStream = new FileOutputStream(getTemporaryDirectory() + "/Image " + i + ".png")) {
fileOutputStream.write(baos.toByteArray());
}
}
i++;
}
}
System.out.println("object");
}
@ -112,7 +108,7 @@ public class PdfSegmentationServiceTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Spanning Cells.pdf");
Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream(), null);
assertThat(document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
@ -133,7 +129,7 @@ public class PdfSegmentationServiceTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Table.pdf");
Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream(), null);
assertThat(document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
@ -171,7 +167,7 @@ public class PdfSegmentationServiceTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Merge Multi Page Table.pdf");
Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream(), null);
assertThat(document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())
@ -209,7 +205,7 @@ public class PdfSegmentationServiceTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/Rotated Table Headers.pdf");
Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream(), null);
assertThat(document.getParagraphs()
.stream()
.flatMap(paragraph -> paragraph.getTables().stream())

View File

@ -0,0 +1,686 @@
{
"dossierId": "f889853e-4bf8-49a9-aae5-c38605c6ef40",
"fileId": "22ef63e29bb2a27db8497272336f6b32",
"imageMetadata": [
{
"classification": {
"probabilities": {
"logo": 1.0,
"signature": 0.0,
"other": 0.0,
"formula": 0.0
},
"label": "logo"
},
"position": {
"x1": 89.88,
"x2": 274.20000000000005,
"y1": 716.24,
"y2": 770.0,
"pageNumber": 1
},
"geometry": {
"width": 184.32000000000005,
"height": 53.75999999999999
},
"filters": {
"geometry": {
"imageSize": {
"quotient": 0.14298074612038092,
"tooLarge": false,
"tooSmall": false
},
"imageFormat": {
"quotient": 3.42857142857143,
"tooTall": false,
"tooWide": false
}
},
"probability": {
"unconfident": false
},
"allPassed": true
}
},
{
"classification": {
"probabilities": {
"signature": 0.999968,
"logo": 1.6e-05,
"other": 1.6e-05,
"formula": 0.0
},
"label": "signature"
},
"position": {
"x1": -0.10000600000000001,
"x2": 595.099994,
"y1": -0.07998660000000002,
"y2": 842.0800134,
"pageNumber": 3
},
"geometry": {
"width": 595.2,
"height": 842.16
},
"filters": {
"geometry": {
"imageSize": {
"quotient": 1.0000782051152328,
"tooLarge": true,
"tooSmall": false
},
"imageFormat": {
"quotient": 0.706754060986036,
"tooTall": false,
"tooWide": false
}
},
"probability": {
"unconfident": false
},
"allPassed": false
}
},
{
"classification": {
"probabilities": {
"signature": 0.999872,
"other": 7.9e-05,
"logo": 4.8e-05,
"formula": 0.0
},
"label": "signature"
},
"position": {
"x1": -0.10000600000000001,
"x2": 595.099994,
"y1": -0.07998660000000002,
"y2": 842.0800134,
"pageNumber": 7
},
"geometry": {
"width": 595.2,
"height": 842.16
},
"filters": {
"geometry": {
"imageSize": {
"quotient": 1.0000782051152328,
"tooLarge": true,
"tooSmall": false
},
"imageFormat": {
"quotient": 0.706754060986036,
"tooTall": false,
"tooWide": false
}
},
"probability": {
"unconfident": false
},
"allPassed": false
}
},
{
"classification": {
"probabilities": {
"signature": 0.996366,
"other": 0.00,
"logo": 2.3e-05,
"formula": 4e-06
},
"label": "signature"
},
"position": {
"x1": -0.10000600000000001,
"x2": 595.099994,
"y1": -0.07998660000000002,
"y2": 842.0800134,
"pageNumber": 8
},
"geometry": {
"width": 595.2,
"height": 842.16
},
"filters": {
"geometry": {
"imageSize": {
"quotient": 1.0002630764355351,
"tooLarge": true,
"tooSmall": false
},
"imageFormat": {
"quotient": 0.706754060986036,
"tooTall": false,
"tooWide": false
}
},
"probability": {
"unconfident": false
},
"allPassed": false
}
},
{
"classification": {
"probabilities": {
"signature": 0.999772,
"logo": 0.000131,
"other": 9.7e-05,
"formula": 0.0
},
"label": "signature"
},
"position": {
"x1": 82.59443842482001,
"x2": 512.6365568843402,
"y1": 116.943736387567,
"y2": 725.0718450317352,
"pageNumber": 73
},
"geometry": {
"width": 430.04211845952017,
"height": 608.1281086441682
},
"filters": {
"geometry": {
"imageSize": {
"quotient": 0.72236755521117,
"tooLarge": false,
"tooSmall": false
},
"imageFormat": {
"quotient": 0.7071571143427432,
"tooTall": false,
"tooWide": false
}
},
"probability": {
"unconfident": false
},
"allPassed": true
}
},
{
"classification": {
"probabilities": {
"signature": 1.0,
"logo": 0.0,
"other": 0.0,
"formula": 0.0
},
"label": "signature"
},
"position": {
"x1": 328.20483600000006,
"x2": 393.94460940000005,
"y1": 175.1643178,
"y2": 203.92865619999998,
"pageNumber": 81
},
"geometry": {
"width": 65.73977339999999,
"height": 28.764338399999986
},
"filters": {
"geometry": {
"imageSize": {
"quotient": 0.06142518774572455,
"tooLarge": false,
"tooSmall": false
},
"imageFormat": {
"quotient": 2.2854609929078022,
"tooTall": false,
"tooWide": false
}
},
"probability": {
"unconfident": false
},
"allPassed": true
}
},
{
"classification": {
"probabilities": {
"signature": 1.0,
"logo": 0.0,
"other": 0.0,
"formula": 0.0
},
"label": "signature"
},
"position": {
"x1": 136.5955818,
"x2": 224.72461859999999,
"y1": 175.1133172,
"y2": 203.97965680000001,
"pageNumber": 81
},
"geometry": {
"width": 88.1290368,
"height": 28.866339600000003
},
"filters": {
"geometry": {
"imageSize": {
"quotient": 0.07124601312700823,
"tooLarge": false,
"tooSmall": false
},
"imageFormat": {
"quotient": 3.053003533568904,
"tooTall": false,
"tooWide": false
}
},
"probability": {
"unconfident": false
},
"allPassed": true
}
},
{
"classification": {
"probabilities": {
"logo": 1.0,
"signature": 0.0,
"other": 0.0,
"formula": 0.0
},
"label": "logo"
},
"position": {
"x1": 194.99126880000003,
"x2": 399.80967840000005,
"y1": 554.6597824,
"y2": 686.2413304,
"pageNumber": 81
},
"geometry": {
"width": 204.81840960000002,
"height": 131.581548
},
"filters": {
"geometry": {
"imageSize": {
"quotient": 0.23189275858788796,
"tooLarge": false,
"tooSmall": false
},
"imageFormat": {
"quotient": 1.5565891472868219,
"tooTall": false,
"tooWide": false
}
},
"probability": {
"unconfident": false
},
"allPassed": true
}
},
{
"classification": {
"probabilities": {
"other": 1.0,
"signature": 0.0,
"formula": 0.0,
"logo": 0.0
},
"label": "other"
},
"position": {
"x1": 133.9945512,
"x2": 242.52382799999998,
"y1": 411.24609519999996,
"y2": 523.2434128,
"pageNumber": 90
},
"geometry": {
"width": 108.52927679999999,
"height": 111.99731760000003
},
"filters": {
"geometry": {
"imageSize": {
"quotient": 0.15573364968831904,
"tooLarge": false,
"tooSmall": false
},
"imageFormat": {
"quotient": 0.9690346083788703,
"tooTall": false,
"tooWide": false
}
},
"probability": {
"unconfident": false
},
"allPassed": true
}
},
{
"classification": {
"probabilities": {
"other": 1.0,
"signature": 0.0,
"formula": 0.0,
"logo": 0.0
},
"label": "other"
},
"position": {
"x1": 133.5865464,
"x2": 242.3198256,
"y1": 274.972492,
"y2": 387.7858192,
"pageNumber": 90
},
"geometry": {
"width": 108.7332792,
"height": 112.8133272
},
"filters": {
"geometry": {
"imageSize": {
"quotient": 0.15644678522591335,
"tooLarge": false,
"tooSmall": false
},
"imageFormat": {
"quotient": 0.9638336347197106,
"tooTall": false,
"tooWide": false
}
},
"probability": {
"unconfident": false
},
"allPassed": true
}
},
{
"classification": {
"probabilities": {
"other": 1.0,
"signature": 0.0,
"formula": 0.0,
"logo": 0.0
},
"label": "other"
},
"position": {
"x1": 246.19587120000003,
"x2": 356.5611696,
"y1": 400.84197279999995,
"y2": 519.3673672,
"pageNumber": 90
},
"geometry": {
"width": 110.3652984,
"height": 118.52539440000004
},
"filters": {
"geometry": {
"imageSize": {
"quotient": 0.1615575178049721,
"tooLarge": false,
"tooSmall": false
},
"imageFormat": {
"quotient": 0.9311531841652321,
"tooTall": false,
"tooWide": false
}
},
"probability": {
"unconfident": false
},
"allPassed": true
}
},
{
"classification": {
"probabilities": {
"other": 1.0,
"signature": 0.0,
"formula": 0.0,
"logo": 0.0
},
"label": "other"
},
"position": {
"x1": 244.9718568,
"x2": 358.3971912,
"y1": 274.972492,
"y2": 387.7858192,
"pageNumber": 90
},
"geometry": {
"width": 113.4253344,
"height": 112.8133272
},
"filters": {
"geometry": {
"imageSize": {
"quotient": 0.15978662903260646,
"tooLarge": false,
"tooSmall": false
},
"imageFormat": {
"quotient": 1.0054249547920433,
"tooTall": false,
"tooWide": false
}
},
"probability": {
"unconfident": false
},
"allPassed": true
}
},
{
"classification": {
"probabilities": {
"other": 1.0,
"signature": 0.0,
"formula": 0.0,
"logo": 0.0
},
"label": "other"
},
"position": {
"x1": 254.9679744,
"x2": 371.6573472,
"y1": 439.6024288,
"y2": 564.0438928,
"pageNumber": 91
},
"geometry": {
"width": 116.6893728,
"height": 124.441464
},
"filters": {
"geometry": {
"imageSize": {
"quotient": 0.17021718544102565,
"tooLarge": false,
"tooSmall": false
},
"imageFormat": {
"quotient": 0.9377049180327869,
"tooTall": false,
"tooWide": false
}
},
"probability": {
"unconfident": false
},
"allPassed": true
}
},
{
"classification": {
"probabilities": {
"other": 1.0,
"signature": 0.0,
"formula": 0.0,
"logo": 0.0
},
"label": "other"
},
"position": {
"x1": 133.9945512,
"x2": 249.663912,
"y1": 443.07046959999997,
"y2": 687.2613424,
"pageNumber": 91
},
"geometry": {
"width": 115.66936080000002,
"height": 244.19087280000002
},
"filters": {
"geometry": {
"imageSize": {
"quotient": 0.23739910530627284,
"tooLarge": false,
"tooSmall": false
},
"imageFormat": {
"quotient": 0.4736842105263158,
"tooTall": false,
"tooWide": false
}
},
"probability": {
"unconfident": false
},
"allPassed": true
}
},
{
"classification": {
"probabilities": {
"other": 1.0,
"signature": 0.0,
"formula": 0.0,
"logo": 0.0
},
"label": "other"
},
"position": {
"x1": 105.84222,
"x2": 374.870385,
"y1": 526.40545,
"y2": 687.05734,
"pageNumber": 92
},
"geometry": {
"width": 269.028165,
"height": 160.65188999999998
},
"filters": {
"geometry": {
"imageSize": {
"quotient": 0.2936614851112628,
"tooLarge": false,
"tooSmall": false
},
"imageFormat": {
"quotient": 1.6746031746031749,
"tooTall": false,
"tooWide": false
}
},
"probability": {
"unconfident": false
},
"allPassed": true
}
},
{
"classification": {
"probabilities": {
"logo": 0.788068,
"other": 0.152259,
"formula": 0.036883,
"signature": 0.02279
},
"label": "logo"
},
"position": {
"x1": 44.64999049990001,
"x2": 550.5759424999001,
"y1": 63.286004150029996,
"y2": 778.72242095003,
"pageNumber": 94
},
"geometry": {
"width": 505.92595200000005,
"height": 715.4364168000001
},
"filters": {
"geometry": {
"imageSize": {
"quotient": 0.8498341845521462,
"tooLarge": true,
"tooSmall": false
},
"imageFormat": {
"quotient": 0.7071571143427431,
"tooTall": false,
"tooWide": false
}
},
"probability": {
"unconfident": false
},
"allPassed": false
}
},
{
"classification": {
"probabilities": {
"signature": 0.998335,
"logo": 0.000955,
"other": 0.000703,
"formula": 7e-06
},
"label": "signature"
},
"position": {
"x1": 58.954005540029996,
"x2": 536.45979618003,
"y1": 83.94401504006001,
"y2": 758.05854296006,
"pageNumber": 95
},
"geometry": {
"width": 477.50579064000004,
"height": 674.11452792
},
"filters": {
"geometry": {
"imageSize": {
"quotient": 0.8014221863697041,
"tooLarge": true,
"tooSmall": false
},
"imageFormat": {
"quotient": 0.7083452007974936,
"tooTall": false,
"tooWide": false
}
},
"probability": {
"unconfident": false
},
"allPassed": false
}
}
]
}