diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java
index be4fa972..5b279051 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationService.java
@@ -8,6 +8,7 @@ import com.iqser.red.service.redaction.v1.server.classification.service.Classifi
 import com.iqser.red.service.redaction.v1.server.memory.MemoryStats;
 import com.iqser.red.service.redaction.v1.server.parsing.PDFLinesTextStripper;
 import com.iqser.red.service.redaction.v1.server.parsing.model.TextPositionSequence;
+import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
 import com.iqser.red.service.redaction.v1.server.redaction.service.ImageClassificationService;
 import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
 import com.iqser.red.service.redaction.v1.server.tableextraction.model.CleanRulings;
@@ -22,6 +23,9 @@ import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.common.PDRectangle;
 import org.springframework.stereotype.Service;
 
+import java.awt.Graphics;
+import java.awt.geom.Rectangle2D;
+import java.awt.image.BufferedImage;
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
@@ -92,7 +96,8 @@ public class PdfSegmentationService {
             page.setRotation(rotation);
             page.setLandscape(isLandscape || isRotated);
             page.setPageNumber(pageNumber);
-            page.setImages(stripper.getImages());
+            List<PdfImage> mergedList = processImages(stripper.getImages());
+            page.setImages(mergedList);
             tableExtractionService.extractTables(cleanRulings, page);
             buildPageStatistics(page);
 
@@ -105,6 +110,7 @@ public class PdfSegmentationService {
 
 
             pages.add(page);
+
         }
 
         document.setPages(pages);
@@ -142,6 +148,96 @@ public class PdfSegmentationService {
         return newPDDocument;
     }
 
+    /**
+     * Merges images that were split into consecutive strips during PDF import.
+     * <p>
+     * Neighbouring entries of {@code imageList} accepted by
+     * {@link #isCandidateForMerging(PdfImage, PdfImage)} are combined into a single image; every other
+     * image is passed through unchanged, in its original order, and exactly once.
+     *
+     * @param imageList images of one page in extraction order
+     * @return a new list with the strips merged, or {@code imageList} itself when it is {@code null} or
+     * holds fewer than two images
+     */
+    private List<PdfImage> processImages(List<PdfImage> imageList) {
+
+        if (imageList == null || imageList.size() < 2) {
+            return imageList;
+        }
+
+        List<PdfImage> mergedList = new ArrayList<>();
+        // Image (possibly already assembled from several strips) that has not been emitted yet.
+        PdfImage current = imageList.get(0);
+        for (int i = 1; i < imageList.size(); i++) {
+            PdfImage next = imageList.get(i);
+            // The decision is made on the two original neighbours, not on the accumulated image.
+            if (isCandidateForMerging(imageList.get(i - 1), next)) {
+                current = mergeTwoImages(current, next);
+            } else {
+                mergedList.add(current);
+                current = next;
+            }
+        }
+        mergedList.add(current);
+        return mergedList;
+    }
+
+    /**
+     * Draws {@code image2} directly below {@code image1} and returns the combined image.
+     *
+     * @param image1 first part; supplies x position, width and page of the result
+     * @param image2 second part; supplies the y position of the result
+     * @return the merged image, assigned to the page of {@code image1}
+     */
+    private PdfImage mergeTwoImages(PdfImage image1, PdfImage image2) {
+
+        // The raster is sized from the images themselves; sizing it from getPosition() cut off parts of
+        // the picture.
+        int firstWidth = image1.getImage().getWidth();
+        int firstHeight = image1.getImage().getHeight();
+        int secondHeight = image2.getImage().getHeight();
+
+        BufferedImage mergedImage = new BufferedImage(firstWidth, firstHeight + secondHeight, BufferedImage.TYPE_INT_RGB);
+        Graphics g = mergedImage.getGraphics();
+        try {
+            g.drawImage(image1.getImage(), 0, 0, null);
+            g.drawImage(image2.getImage(), 0, firstHeight, null);
+        } finally {
+            // Dispose the context that was drawn with; getGraphics() hands out a new one on every call.
+            g.dispose();
+        }
+
+        // Position of the merged image: x and width of image1, y of image2, both heights added up.
+        Rectangle2D pos = new Rectangle2D.Float();
+        pos.setRect(image1.getPosition().getX(), image2.getPosition().getY(), image1.getPosition().getWidth(),
+                image1.getPosition().getHeight() + image2.getPosition().getHeight());
+
+        PdfImage newPdfImage = new PdfImage(mergedImage, pos, 0);
+        newPdfImage.setPage(image1.getPage());
+        return newPdfImage;
+    }
+
+    /**
+     * Evaluates whether two neighbouring images are parts of the same picture, based on their positions.
+     *
+     * @param image1 the preceding image
+     * @param image2 the following image
+     * @return {@code true} if x coordinates and widths are equal, the rounded-up height of {@code image2}
+     * equals the rounded-up difference of the y coordinates, and {@code image2} is wider than a sixth of
+     * its height
+     */
+    private boolean isCandidateForMerging(PdfImage image1, PdfImage image2) {
+
+        double x1 = image1.getPosition().getX();
+        double y1 = image1.getPosition().getY();
+        double width1 = image1.getPosition().getWidth();
+        double x2 = image2.getPosition().getX();
+        double y2 = image2.getPosition().getY();
+        double width2 = image2.getPosition().getWidth();
+        double height2 = image2.getPosition().getHeight();
+        return x1 == x2 && width1 == width2 && Math.ceil(height2) == Math.ceil(y1 - y2) && width2 > (height2 / 6);
+    }
+
 
     private void increaseDocumentStatistics(Page page, Document document) {
 
diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java
index 821e20d6..29852b95 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java
@@ -482,6 +482,51 @@ public class RedactionIntegrationTest {
         assertThat(result).isNotNull();
     }
 
+    @Test
+    public void testMergedImages() throws IOException {
+
+        long start = System.currentTimeMillis();
+        ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/merge_images.pdf");
+
+        AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream());
+        AnalyzeResult result = reanalyzeService.analyze(request);
+
+        Map<String, List<RedactionLogEntry>> duplicates = new HashMap<>();
+
+        var redactionLog = redactionStorageService.getRedactionLog(TEST_PROJECT_ID, TEST_FILE_ID);
+
+        redactionLog.getRedactionLogEntry().forEach(entry -> {
+            duplicates.computeIfAbsent(entry.getId(), v -> new ArrayList<>()).add(entry);
+        });
+
+        duplicates.entrySet().forEach(entry -> {
+            assertThat(entry.getValue().size()).isEqualTo(1);
+        });
+
+        dictionary.get(AUTHOR).add("Drinking water");
+        when(dictionaryClient.getVersion(TEST_RULESET_ID)).thenReturn(1L);
+
+        AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder()
+                .projectId(TEST_PROJECT_ID)
+                .fileId(TEST_FILE_ID)
+                .build());
+
+        try (FileOutputStream fileOutputStream = new FileOutputStream("/tmp/Annotated3.pdf")) {
+            fileOutputStream.write(annotateResponse.getDocument());
+        }
+        long rstart = System.currentTimeMillis();
+        reanalyzeService.reanalyze(request);
+
+        long rend = System.currentTimeMillis();
+        System.out.println("reanalysis analysis duration: " + (rend - rstart));
+
+
+        long end = System.currentTimeMillis();
+
+        System.out.println("duration: " + (end - start));
+
+
+    }
 
     @Test
     @Ignore
diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java
index 398e521f..44842b7d 100644
--- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java
+++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/segmentation/PdfSegmentationServiceTest.java
@@ -68,6 +68,17 @@ public class PdfSegmentationServiceTest {
 
     }
 
+    @Test
+    public void testMergeImages() throws IOException {
+
+        ClassPathResource pdfFileResource = new ClassPathResource("files/Minimal Examples/270Rotated.pdf");
+
+        Document document = pdfSegmentationService.parseDocument(pdfFileResource.getInputStream());
+        assertThat(document.getPages().get(0).getImages().size()).isEqualTo(1);
+        assertThat(document.getPages().get(1).getImages().size()).isEqualTo(0);
+
+    }
+
     @Test
     @Ignore
     public void testExtractImages() throws IOException {
diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/merge_images.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/merge_images.pdf
new file mode 100644
index 00000000..a2decc1a
Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/Minimal Examples/merge_images.pdf differ