Pull request #141: fix: merge images into one

Merge in RED/redaction-service from classimg3 to master

* commit '4d1ad3b5a56915e9e5df9191a535c0e5cd83972c':
  RED-1351: merge images into one
This commit is contained in:
Timo Bejan 2021-04-23 09:36:04 +02:00
commit b83250f161
4 changed files with 167 additions and 1 deletions

View File

@ -8,6 +8,7 @@ import com.iqser.red.service.redaction.v1.server.classification.service.Classifi
import com.iqser.red.service.redaction.v1.server.memory.MemoryStats;
import com.iqser.red.service.redaction.v1.server.parsing.PDFLinesTextStripper;
import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.redaction.service.ImageClassificationService;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
@ -22,6 +23,9 @@ import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.springframework.stereotype.Service;
import java.awt.Graphics;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
@ -92,7 +96,8 @@ public class PdfSegmentationService {
page.setRotation(rotation);
page.setLandscape(isLandscape || isRotated);
page.setPageNumber(pageNumber);
page.setImages(stripper.getImages());
List<PdfImage> mergedList = processImages(stripper.getImages());
page.setImages(mergedList);
tableExtractionService.extractTables(cleanRulings, page);
buildPageStatistics(page);
@ -105,6 +110,7 @@ public class PdfSegmentationService {
pages.add(page);
}
document.setPages(pages);
@ -142,6 +148,110 @@ public class PdfSegmentationService {
return newPDDocument;
}
/**
 * Merges images that were split into several consecutive pieces during PDF import.
 *
 * <p>Neighbouring entries of {@code imageList} are compared pairwise (see
 * {@link #getCandidatesList(List)}); every uninterrupted run of mergeable neighbours is
 * collapsed into a single image, all other images are passed through unchanged and in
 * their original order.
 *
 * @param imageList the images of one page in extraction order
 * @return a new list with merged images, or {@code imageList} itself when it is
 *         {@code null} or holds fewer than two images (nothing to merge)
 */
private List<PdfImage> processImages(List<PdfImage> imageList) {
    if (imageList == null || imageList.size() < 2) {
        return imageList;
    }

    // candidates.get(i) == true means imageList[i] and imageList[i + 1] are pieces of the same picture
    List<Boolean> candidates = getCandidatesList(imageList);
    List<PdfImage> mergedList = new ArrayList<>();

    // 'current' is the image being assembled; it is only written to the result once the
    // run of mergeable pieces ends, so no piece can be emitted twice.
    PdfImage current = imageList.get(0);
    for (int i = 0; i < candidates.size(); i++) {
        PdfImage next = imageList.get(i + 1);
        if (candidates.get(i)) {
            // next piece continues the current picture
            current = mergeTwoImages(current, next);
        } else {
            // current picture is complete, next piece starts a new one
            mergedList.add(current);
            current = next;
        }
    }
    // flush the last picture (merged or standalone)
    mergedList.add(current);
    return mergedList;
}
/**
 * Stacks two image pieces vertically into one new image, {@code image1} on top of
 * {@code image2}.
 *
 * <p>The pixel data is sized from the underlying {@link BufferedImage}s; the sizes reported
 * by {@code getPosition()} were observed to cut off parts of the picture and are therefore
 * only used for the position rectangle of the result.
 *
 * @param image1 upper piece; supplies x-coordinate, width and page of the result
 * @param image2 lower piece; supplies the y-coordinate of the result
 * @return the merged image, assigned to the page of {@code image1}
 */
private PdfImage mergeTwoImages(PdfImage image1, PdfImage image2) {
    int upperPixelHeight = image1.getImage().getHeight();
    int lowerPixelHeight = image2.getImage().getHeight();
    int pixelWidth = image1.getImage().getWidth();

    BufferedImage mergedImage = new BufferedImage(pixelWidth, upperPixelHeight + lowerPixelHeight, BufferedImage.TYPE_INT_RGB);
    Graphics g = mergedImage.getGraphics();
    try {
        g.drawImage(image1.getImage(), 0, 0, null);
        g.drawImage(image2.getImage(), 0, upperPixelHeight, null);
    } finally {
        // dispose the very context that was drawn with; failures propagate to the caller
        // instead of being swallowed and resurfacing as a NullPointerException
        g.dispose();
    }

    // position of the merged image: x and width of image1, y of image2, height of both pieces
    // NOTE(review): taking y from image2 presumes image2 lies directly below image1 in a
    // bottom-up coordinate system, as checked by isCandidateForMerging - confirm
    Rectangle2D pos = new Rectangle2D.Float();
    pos.setRect(image1.getPosition().getX(),
            image2.getPosition().getY(),
            image1.getPosition().getWidth(),
            image1.getPosition().getHeight() + image2.getPosition().getHeight());

    PdfImage newPdfImage = new PdfImage(mergedImage, pos, 0);
    newPdfImage.setPage(image1.getPage());
    return newPdfImage;
}
/**
 * Builds the merge flags for all neighbouring image pairs.
 *
 * @param imageList the images to inspect
 * @return a list with one entry per pair of neighbours: entry {@code k} is the result of
 *         {@code isCandidateForMerging(imageList[k], imageList[k + 1])}; empty when the
 *         list holds fewer than two images
 */
private ArrayList<Boolean> getCandidatesList(List<PdfImage> imageList) {
    ArrayList<Boolean> mergeFlags = new ArrayList<>();
    // start at the second image so that every image is compared with its predecessor
    for (int next = 1; next < imageList.size(); next++) {
        PdfImage predecessor = imageList.get(next - 1);
        PdfImage successor = imageList.get(next);
        mergeFlags.add(isCandidateForMerging(predecessor, successor));
    }
    return mergeFlags;
}
/**
 * Decides from the position rectangles whether two images are pieces of the same picture.
 *
 * <p>All of the following must hold:
 * <ul>
 *   <li>both images have the same x-coordinate and the same width,</li>
 *   <li>the rounded-up height of {@code image2} equals the rounded-up difference
 *       {@code y(image1) - y(image2)},</li>
 *   <li>{@code image2} is wider than a sixth of its height.</li>
 * </ul>
 *
 * @param image1 the first image of the pair
 * @param image2 the image following {@code image1}
 * @return {@code true} if the two images should be merged
 */
private boolean isCandidateForMerging(PdfImage image1, PdfImage image2) {
    double firstX = image1.getPosition().getX();
    double firstY = image1.getPosition().getY();
    double firstWidth = image1.getPosition().getWidth();

    double secondX = image2.getPosition().getX();
    double secondY = image2.getPosition().getY();
    double secondWidth = image2.getPosition().getWidth();
    double secondHeight = image2.getPosition().getHeight();

    boolean sameColumn = firstX == secondX && firstWidth == secondWidth;
    if (!sameColumn) {
        return false;
    }

    boolean verticallyAdjacent = Math.ceil(secondHeight) == Math.ceil(firstY - secondY);
    if (!verticallyAdjacent) {
        return false;
    }

    return secondWidth > (secondHeight / 6);
}
private void increaseDocumentStatistics(Page page, Document document) {

View File

@ -482,6 +482,51 @@ public class RedactionIntegrationTest {
assertThat(result).isNotNull();
}
/**
 * Analyzes a document containing split images, checks that every redaction log id occurs
 * exactly once, writes the annotated document to disk and finally times a reanalysis run.
 */
@Test
public void testMergedImages() throws IOException {

    long testStarted = System.currentTimeMillis();

    AnalyzeRequest request = prepareStorage(new ClassPathResource("files/Minimal Examples/merge_images.pdf").getInputStream());
    reanalyzeService.analyze(request);

    // group the log entries by id; every group must contain exactly one entry
    Map<String, List<RedactionLogEntry>> entriesById = new HashMap<>();
    var redactionLog = redactionStorageService.getRedactionLog(TEST_PROJECT_ID, TEST_FILE_ID);
    redactionLog.getRedactionLogEntry()
            .forEach(logEntry -> entriesById.computeIfAbsent(logEntry.getId(), id -> new ArrayList<>()).add(logEntry));
    for (List<RedactionLogEntry> entriesWithSameId : entriesById.values()) {
        assertThat(entriesWithSameId.size()).isEqualTo(1);
    }

    dictionary.get(AUTHOR).add("Drinking water");
    when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(1L);

    AnnotateRequest annotateRequest = AnnotateRequest.builder()
            .projectId(TEST_PROJECT_ID)
            .fileId(TEST_FILE_ID)
            .build();
    AnnotateResponse annotated = redactionController.annotate(annotateRequest);

    try (FileOutputStream annotatedOut = new FileOutputStream("/tmp/Annotated3.pdf")) {
        annotatedOut.write(annotated.getDocument());
    }

    long reanalysisStarted = System.currentTimeMillis();
    reanalyzeService.reanalyze(request);
    long reanalysisFinished = System.currentTimeMillis();
    System.out.println("reanalysis analysis duration: " + (reanalysisFinished - reanalysisStarted));

    long testFinished = System.currentTimeMillis();
    System.out.println("duration: " + (testFinished - testStarted));
}
@Test
@Ignore

View File

@ -68,6 +68,17 @@ public class PdfSegmentationServiceTest {
}
/**
 * Parses the rotated sample document and checks the number of images left per page
 * after segmentation: one on the first page, none on the second.
 */
@Test
public void testMergeImages() throws IOException {

    ClassPathResource rotatedPdf = new ClassPathResource("files/Minimal Examples/270Rotated.pdf");
    Document parsed = pdfSegmentationService.parseDocument(rotatedPdf.getInputStream());

    int imagesOnFirstPage = parsed.getPages().get(0).getImages().size();
    assertThat(imagesOnFirstPage).isEqualTo(1);

    int imagesOnSecondPage = parsed.getPages().get(1).getImages().size();
    assertThat(imagesOnSecondPage).isEqualTo(0);
}
@Test
@Ignore
public void testExtractImages() throws IOException {