Pull request #171: RED-1472: Fixed image merging in rotated pages

Merge in RED/redaction-service from RED-1472 to master

* commit 'b8dc0e448d9101db8099aaf638ac929a16c87c3a':
  RED-1472: Fixed image merging in rotated pages
This commit is contained in:
Dominique Eiflaender 2021-06-15 13:11:25 +02:00
commit 925b2a274c
2 changed files with 187 additions and 157 deletions

View File

@ -0,0 +1,165 @@
package com.iqser.red.service.redaction.v1.server.segmentation;
import java.awt.Graphics;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import java.util.ArrayList;
import java.util.List;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
/**
 * Re-assembles images that were split into several consecutive pieces during PDF import
 * and drops images that lie completely inside another image.
 *
 * <p>Pieces are only recognised when they directly follow each other in the input list.
 * Only a page rotation of exactly 90 degrees receives special treatment; every other
 * rotation value is handled like an unrotated page.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class ImageMergeService {

    /** Page rotation (in degrees) for which the x/y axes of the image positions are swapped. */
    private static final int ROTATED_BY_90 = 90;


    /**
     * Merges split images and removes images that are fully contained in another image.
     *
     * @param images   images of one page in extraction order; {@code null} is treated as empty.
     *                 The list itself is never modified.
     * @param rotation page rotation in degrees
     * @return a new, mutable list with the merged images (never {@code null})
     */
    public List<PdfImage> mergeImages(List<PdfImage> images, int rotation) {

        List<PdfImage> mergedList = processImages(images, rotation);

        List<PdfImage> result = new ArrayList<>(mergedList.size());
        for (int index = 0; index < mergedList.size(); index++) {
            if (!isContainedInAnotherImage(mergedList, index)) {
                result.add(mergedList.get(index));
            }
        }
        return result;
    }


    /**
     * Tells whether the image at {@code innerIndex} lies completely inside another image of the list.
     *
     * <p>Two images with identical positions contain each other. In that case only the later one
     * counts as "contained", so that exactly one of them survives instead of both being dropped.
     */
    private boolean isContainedInAnotherImage(List<PdfImage> images, int innerIndex) {

        PdfImage inner = images.get(innerIndex);
        for (int outerIndex = 0; outerIndex < images.size(); outerIndex++) {
            if (outerIndex == innerIndex) {
                continue;
            }
            PdfImage outer = images.get(outerIndex);
            if (!contains(outer, inner)) {
                continue;
            }
            boolean mutual = contains(inner, outer);
            if (!mutual || outerIndex < innerIndex) {
                return true;
            }
        }
        return false;
    }


    /** Returns true if the position of {@code outer} entirely contains the position of {@code inner}. */
    private boolean contains(PdfImage outer, PdfImage inner) {

        return outer.getPosition()
                .contains(inner.getPosition().getX(), inner.getPosition().getY(), inner.getPosition().getWidth(), inner.getPosition().getHeight());
    }


    /**
     * Merges images that were separated during PDF import and returns a new list of images.
     *
     * <p>The list is walked once. As long as two neighbouring input images are merge candidates,
     * the next image is appended to the image accumulated so far. If a merge fails, the pieces
     * are kept as separate images instead of being lost.
     *
     * @return a new list; every input image ends up in it exactly once, either unchanged or
     *         as part of a merged image
     */
    private List<PdfImage> processImages(List<PdfImage> imageList, int rotation) {

        List<PdfImage> mergedList = new ArrayList<>();
        if (imageList == null || imageList.isEmpty()) {
            return mergedList;
        }

        PdfImage current = imageList.get(0);
        for (int i = 1; i < imageList.size(); i++) {
            PdfImage next = imageList.get(i);

            PdfImage merged = null;
            // candidates are always evaluated on the original neighbours, not on the accumulated image
            if (isCandidateForMerging(imageList.get(i - 1), next, rotation)) {
                merged = mergeTwoImages(current, next, rotation);
            }

            if (merged != null) {
                current = merged;
            } else {
                // no candidate or merge failed: close the current image and start a new one
                mergedList.add(current);
                current = next;
            }
        }
        mergedList.add(current);
        return mergedList;
    }


    /**
     * Draws {@code image2} below {@code image1} into a new image and computes the combined position.
     *
     * @return the merged image, or {@code null} if merging failed (the failure is logged)
     */
    private PdfImage mergeTwoImages(PdfImage image1, PdfImage image2, int rotation) {

        Graphics mergedImageGraphics = null;
        try {
            // The values from getPosition do not seem to be usable for the pixel size (parts of the
            // image get cut off), therefore the pixel dimensions stored in the image itself are used.
            int img1height = image1.getImage().getHeight();
            int img1width = image1.getImage().getWidth();
            int img2height = image2.getImage().getHeight();

            BufferedImage mergedImage = new BufferedImage(img1width, img1height + img2height, BufferedImage.TYPE_INT_RGB);
            mergedImageGraphics = mergedImage.getGraphics();
            mergedImageGraphics.drawImage(image1.getImage(), 0, 0, null);
            mergedImageGraphics.drawImage(image2.getImage(), 0, img1height, null);

            double width1 = image1.getPosition().getWidth();
            double width2 = image2.getPosition().getWidth();
            double height1 = image1.getPosition().getHeight();
            double height2 = image2.getPosition().getHeight();

            // Position of the merged image: x of image1 and y of image2. On pages rotated by
            // 90 degrees the pieces lie next to each other, so the widths are summed up;
            // otherwise the heights are.
            boolean rotated = rotation == ROTATED_BY_90;
            Rectangle2D pos = new Rectangle2D.Float();
            pos.setRect(image1.getPosition().getX(), image2.getPosition().getY(), rotated ? width1 + width2 : width1, rotated ? height1 : height1 + height2);

            PdfImage newPdfImage = new PdfImage(mergedImage, pos, image1.getPage());

            // release resources held by the images
            image1.getImage().flush();
            image2.getImage().flush();
            mergedImage.flush();
            return newPdfImage;
        } catch (Exception e) {
            // failed to merge image
            log.error("Failed to merge image", e);
            return null;
        } finally {
            // the graphics context must be disposed, also when drawing failed
            if (mergedImageGraphics != null) {
                mergedImageGraphics.dispose();
            }
        }
    }


    /**
     * Evaluates whether two neighbouring images are pieces of the same picture, based on their
     * coordinates, width and height.
     *
     * <p>For a rotation of 90 degrees the x/y and width/height values are swapped before comparing.
     * The images are candidates if their x-coordinates and widths are equal, the (rounded up)
     * height of {@code image2} equals the (rounded up) distance between the y-coordinates, and
     * {@code image2} is wider than a sixth of its height.
     */
    private boolean isCandidateForMerging(PdfImage image1, PdfImage image2, int rotation) {

        boolean rotated = rotation == ROTATED_BY_90;

        double x1 = rotated ? image1.getPosition().getY() : image1.getPosition().getX();
        double y1 = rotated ? image1.getPosition().getX() : image1.getPosition().getY();
        double width1 = rotated ? image1.getPosition().getHeight() : image1.getPosition().getWidth();

        double x2 = rotated ? image2.getPosition().getY() : image2.getPosition().getX();
        double y2 = rotated ? image2.getPosition().getX() : image2.getPosition().getY();
        double width2 = rotated ? image2.getPosition().getHeight() : image2.getPosition().getWidth();
        double height2 = rotated ? image2.getPosition().getWidth() : image2.getPosition().getHeight();

        // on rotated pages the second piece has the larger coordinate, otherwise the smaller one
        double distance = rotated ? y2 - y1 : y1 - y2;

        return x1 == x2 && width1 == width2 && Math.ceil(height2) == Math.ceil(distance) && width2 > (height2 / 6);
    }

}

View File

@ -1,6 +1,19 @@
package com.iqser.red.service.redaction.v1.server.segmentation;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
@ -15,24 +28,9 @@ import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractT
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
import com.iqser.red.service.redaction.v1.server.tableextraction.service.RulingCleaningService;
import com.iqser.red.service.redaction.v1.server.tableextraction.service.TableExtractionService;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.IOUtils;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.springframework.stereotype.Service;
import java.awt.Graphics;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
@Slf4j
@Service
@ -47,13 +45,17 @@ public class PdfSegmentationService {
private final ClassificationService classificationService;
private final SectionsBuilderService sectionsBuilderService;
private final ImageClassificationService imageClassificationService;
private final ImageMergeService imageMergeService;
public Document parseDocument(InputStream documentInputStream) throws IOException {
return parseDocument(documentInputStream, false);
}
public Document parseDocument(InputStream documentInputStream, boolean ignoreImages) throws IOException {
PDDocument pdDocument = null;
try {
//create tempFile
@ -64,7 +66,6 @@ public class PdfSegmentationService {
Document document = new Document();
List<Page> pages = new ArrayList<>();
pdDocument = reinitializePDDocument(tempFile, null);
long pageCount = pdDocument.getNumberOfPages();
@ -101,32 +102,19 @@ public class PdfSegmentationService {
page.setRotation(rotation);
page.setLandscape(isLandscape || isRotated);
page.setPageNumber(pageNumber);
List<PdfImage> mergedList = processImages(stripper.getImages());
List<PdfImage> imagesInImage = new ArrayList<>();
for(PdfImage image: mergedList){
for (PdfImage inner: mergedList){
if(image != inner && image.getPosition().contains(inner.getPosition().getX(), inner.getPosition().getY(), inner.getPosition().getWidth(), inner.getPosition().getHeight())){
imagesInImage.add(inner);
}
}
}
mergedList.removeAll(imagesInImage);
List<PdfImage> mergedList = imageMergeService.mergeImages(stripper.getImages(), rotation);
page.setImages(mergedList);
tableExtractionService.extractTables(cleanRulings, page);
buildPageStatistics(page);
increaseDocumentStatistics(page, document);
if (!ignoreImages) {
imageClassificationService.classifyImages(page);
}
pages.add(page);
}
document.setPages(pages);
@ -149,7 +137,9 @@ public class PdfSegmentationService {
}
}
private PDDocument reinitializePDDocument(File tempFile, PDDocument pdDocument) throws IOException {
if (pdDocument != null) {
pdDocument.close();
}
@ -164,130 +154,6 @@ public class PdfSegmentationService {
return newPDDocument;
}
// Pre-change version (without rotation handling) that this commit removes from PdfSegmentationService.
// Merges images that were separated during PDF import and returns a new list of PdfImages.
// Lists with fewer than two elements are returned unchanged (same instance).
private List<PdfImage> processImages(List<PdfImage> imageList) {
if (imageList.size() > 1) {
List<PdfImage> mergedList = new ArrayList<>();
// number of entries added to mergedList inside the loop; used to address the image being built
int countElementsInList = 0;
// true until the first merge candidate has been seen; never reset to true afterwards
boolean beginImage = true;
// a list of Boolean, entry i refers to the pair (i, i + 1): true = candidate for merging, false = no merging
List<Boolean> candidatesList = getCandidatesList(imageList);
// loop through the list; if there are candidates for merging (true), merge the images and add the result to mergedList
for (int i = 0; i < candidatesList.size(); i++) {
if (candidatesList.get(i)) {
if (beginImage) {
// begin of an image, merge two parts of imageList
PdfImage mergedImage = mergeTwoImages(imageList.get(i), imageList.get(i + 1));
// image merge successful; on failure (null) neither part is added
if (mergedImage != null) {
mergedList.add(mergedImage);
countElementsInList++;
}
} else {
// middle of an image, merge the current piece of mergedList with the next image of imageList
PdfImage mergedImage = mergeTwoImages(mergedList.get(countElementsInList - 1), imageList.get(i + 1));
// image merge successful
if (mergedImage != null) {
mergedList.set(countElementsInList - 1, mergedImage);
}
}
beginImage = false;
} else {
// if the last candidate is false, then both images i and i+1 must be added
if (i == candidatesList.size() - 1) {
// image i is skipped only if it is already the most recently counted entry of mergedList
// NOTE(review): if the previous candidate was true, image i is already part of a merged image
// and still gets added here; the containment filter at the call site appears to compensate - confirm
if (countElementsInList > 0 && mergedList.get(countElementsInList - 1) == imageList.get(i)) {
mergedList.add(imageList.get(i + 1));
} else {
mergedList.add(imageList.get(i));
mergedList.add(imageList.get(i + 1));
}
} else {
// no merge candidate seen so far and image i is not split, add i to the result list
if (beginImage) {
mergedList.add(imageList.get(i));
countElementsInList++;
} else {
// i is the end of an image, add the begin of a new image
mergedList.add(imageList.get(i + 1));
countElementsInList++;
beginImage = false;
}
}
}
}
return mergedList;
} else {
return imageList;
}
}
// Pre-change version (without rotation handling) that this commit removes from PdfSegmentationService.
// Draws image2 below image1 into a new RGB image and returns it as a new PdfImage on the page of image1.
// Returns null if an exception occurs inside the try block; the exception is logged.
private PdfImage mergeTwoImages(PdfImage image1, PdfImage image2) {
// the values from getPosition do not seem to be correct for the pixel size, using them cuts off parts of the image
double width = image1.getPosition().getWidth();
double height1 = image1.getPosition().getHeight();
double height2 = image2.getPosition().getHeight();
// it works with the values that are stored in the image itself
double img1height = image1.getImage().getHeight();
double img1width = image1.getImage().getWidth();
double img2height = image2.getImage().getHeight();
// the merged image is as wide as image1 and as high as both images together
BufferedImage mergedImage = new BufferedImage((int) img1width, (int) (img1height + img2height), BufferedImage.TYPE_INT_RGB);
Graphics mergedImageGraphics = mergedImage.getGraphics();
try {
mergedImageGraphics.drawImage(image1.getImage(), 0, 0, null);
mergedImageGraphics.drawImage(image2.getImage(), 0, (int) (img1height), null);
// set image, position and page for the merged image
// the position uses x and width of image1, y of image2 and the summed height of both
Rectangle2D pos = new Rectangle2D.Float();
pos.setRect(image1.getPosition().getX(), image2.getPosition().getY(), width, height1 + height2);
PdfImage newPdfImage = new PdfImage(mergedImage, pos, image1.getPage());
// Graphics need to be disposed
// NOTE(review): dispose() is only reached on the success path, not when an exception is thrown above
image1.getImage().flush();
image2.getImage().flush();
mergedImage.flush();
mergedImageGraphics.dispose();
return newPdfImage;
} catch (Exception e) {
// failed to merge image
log.error("Failed to merge image", e);
return null;
}
}
// Pre-change version (without rotation handling) that this commit removes from PdfSegmentationService.
// Builds a list of booleans with one entry per pair of neighbouring images:
// entry k is true if images k and k + 1 are candidates for merging.
// The result therefore has imageList.size() - 1 entries (none for an empty or single-element list).
private List<Boolean> getCandidatesList(List<PdfImage> imageList) {
List<Boolean> candidatesList = new ArrayList<>();
for (int i = 0; i < imageList.size(); i++) {
// the first image has no predecessor to compare with
if (i >= 1) {
candidatesList.add(isCandidateForMerging(imageList.get(i - 1), imageList.get(i)));
}
}
return candidatesList;
}
// Pre-change version (without rotation handling) that this commit removes from PdfSegmentationService.
// Evaluates if two images are candidates for merging, depending on their coordinates, width and height.
private boolean isCandidateForMerging(PdfImage image1, PdfImage image2) {
double x1 = image1.getPosition().getX();
double y1 = image1.getPosition().getY();
double width1 = image1.getPosition().getWidth();
double x2 = image2.getPosition().getX();
double y2 = image2.getPosition().getY();
double width2 = image2.getPosition().getWidth();
double height2 = image2.getPosition().getHeight();
// if the x-coordinates and widths of the images are equal and the (rounded up) height of image2 is equal to the
// (rounded up) difference between the y-coordinates, then it is the same picture and has to be merged -> return true
// additionally image2 must be wider than a sixth of its height
// the x-coordinates and widths are compared with exact floating point equality
return x1 == x2 && width1 == width2 && Math.ceil(height2) == Math.ceil(y1 - y2) && width2 > (height2 / 6);
}
private void increaseDocumentStatistics(Page page, Document document) {
@ -319,5 +185,4 @@ public class PdfSegmentationService {
}
}