diff --git a/redaction-service-v1/redaction-service-server-v1/pom.xml b/redaction-service-v1/redaction-service-server-v1/pom.xml index 7eae3868..837746f8 100644 --- a/redaction-service-v1/redaction-service-server-v1/pom.xml +++ b/redaction-service-v1/redaction-service-server-v1/pom.xml @@ -20,6 +20,11 @@ + + org.openpnp + opencv + 4.5.1-2 + com.iqser.red.commons storage-commons diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RedactionExtractorService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RedactionExtractorService.java new file mode 100644 index 00000000..626a9d55 --- /dev/null +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/RedactionExtractorService.java @@ -0,0 +1,187 @@ +package com.iqser.red.service.redaction.v1.server.redaction.service; + +import static org.opencv.imgproc.Imgproc.COLOR_BGR2GRAY; + +import java.awt.Color; +import java.awt.image.BufferedImage; +import java.awt.image.DataBufferByte; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.UUID; +import java.util.stream.Collectors; + +import javax.imageio.ImageIO; + +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.rendering.ImageType; +import org.apache.pdfbox.rendering.PDFRenderer; +import org.opencv.core.Core; +import org.opencv.core.Mat; +import org.opencv.core.MatOfByte; +import org.opencv.core.MatOfPoint; +import org.opencv.core.MatOfPoint2f; +import org.opencv.core.Rect; +import org.opencv.core.Scalar; +import org.opencv.core.Size; +import org.opencv.imgcodecs.Imgcodecs; +import org.opencv.imgproc.Imgproc; +import org.springframework.beans.BeanUtils; +import org.springframework.stereotype.Service; + +import com.iqser.red.service.redaction.v1.model.ImportedRedaction; +import com.iqser.red.service.redaction.v1.model.ImportedRedactions; +import com.iqser.red.service.redaction.v1.model.Point; +import com.iqser.red.service.redaction.v1.model.Rectangle; + +import lombok.SneakyThrows; + +@Service +public class RedactionExtractorService { + + @SneakyThrows + public ImportedRedactions extractRedaction(InputStream pdfFile) { + + PDDocument pdfDocument = PDDocument.load(pdfFile); + pdfDocument.setAllSecurityToBeRemoved(true); + + long pageCount = pdfDocument.getNumberOfPages(); + + Map> importedRedactionsMap = new HashMap<>(); + for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) { + PDPage page = pdfDocument.getPage(pageNumber - 1); + BufferedImage image = pageConvertToImage(pdfDocument, page, 144, ImageType.GRAY); + + + Mat imageAsMat = bufferedImage2Mat(image); + + Mat imageAsMatINv = new Mat(); + + Mat invertcolormatrix= new Mat(imageAsMat.rows(),imageAsMat.cols(), imageAsMat.type(), new Scalar(255,255,255)); + + Core.subtract(invertcolormatrix, imageAsMat, imageAsMatINv); + + + + Mat blurred = new Mat(); + Imgproc.GaussianBlur(imageAsMatINv,blurred, new Size(5, 5), 1); + Mat thresh = new Mat(); + Imgproc.threshold(blurred, thresh, 250, 255, Imgproc.THRESH_BINARY); + + + Mat hierachy = new Mat(); + List contours = new ArrayList<>(); + Imgproc.findContours(thresh,contours,hierachy, Imgproc.RETR_TREE, Imgproc.CHAIN_APPROX_NONE); + + + System.out.println(contours); + System.out.println(hierachy.size(3)); + + var min_nomralized_area=600; + + hierachy = hierachy.row(0).row(0); + + System.out.println("Cont" +contours.size() + "Hier" + hierachy.cols()); + + +// for(int j = 0; j<= hierachy.cols(); j++){ +// System.out.println("----> Col " + j); +// for (double d :hierachy.get(0,j)){ +// System.out.println( " " + d); +// } +// } + + + + int i = 0; + List filterd = new ArrayList<>(); + for(MatOfPoint contour: contours){ + if(isLikelyRedaction(contour, hierachy.get(0,i), min_nomralized_area)){ + filterd.add(contour); + } + i++; + } + + + Imgproc.drawContours(imageAsMat, filterd, -1, new Scalar(180), 2); + + + List importedRedactions = new ArrayList<>(); + for(MatOfPoint contour : filterd){ + Rect rect = Imgproc.boundingRect(contour); + + importedRedactions.add(new ImportedRedaction(UUID.randomUUID().toString(), List.of( + new Rectangle(new Point((float) (rect.x * page.getCropBox().getWidth() / image.getWidth()), + (float) (page.getCropBox().getHeight() - rect.y * page.getCropBox().getWidth() / image.getWidth())), + rect.width * page.getCropBox().getWidth() / image.getWidth(), + -rect.height * page.getCropBox().getWidth() / image.getWidth(), + 1)))); + } + + + importedRedactionsMap.put(1, importedRedactions); + + +// BufferedImage gray = new BufferedImage(imageAsMat.width(), imageAsMat.height(), BufferedImage.TYPE_BYTE_GRAY); +// byte[] data = ((DataBufferByte) gray.getRaster().getDataBuffer()).getData(); +// imageAsMat.get(0, 0, data); +// +// File outputfile = new File("/tmp/image.jpg"); +// ImageIO.write(gray, "jpg", outputfile); + +// + +// System.out.println(contours); + } + return new ImportedRedactions(importedRedactionsMap); + } + + + + private boolean isFilled(double[] hierachy){ + return hierachy[3] <= 0 && hierachy[2] == -1; + } + + private boolean isBoxy(MatOfPoint contour){ + double epsilon = 0.01 * Imgproc.arcLength(new MatOfPoint2f( contour.toArray()), true); + MatOfPoint2f approx = new MatOfPoint2f(); + Imgproc.approxPolyDP(new MatOfPoint2f( contour.toArray()), approx, epsilon, true); + return approx.toArray().length <= 10; + } + + + private boolean largeEnough(float minArea, MatOfPoint contour){ + return Imgproc.contourArea(contour, false) > minArea; + } + + private boolean isLikelyRedaction(MatOfPoint contour, double[] hierachy , float minArea) { + return isFilled(hierachy) && isBoxy(contour) && largeEnough(minArea, contour); + } + + + + public static Mat bufferedImage2Mat(BufferedImage image) throws IOException { + + ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); + ImageIO.write(image, "jpg", byteArrayOutputStream); + byteArrayOutputStream.flush(); + return Imgcodecs.imdecode(new MatOfByte(byteArrayOutputStream.toByteArray()), Imgcodecs.IMREAD_UNCHANGED); + } + + + public static BufferedImage pageConvertToImage(PDDocument doc, PDPage page, int dpi, + ImageType imageType) throws IOException { + + PDFRenderer renderer = new PDFRenderer(doc); + return renderer.renderImageWithDPI(doc.getPages().indexOf(page), dpi, imageType); + } + +} diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java index 4b0133f0..7c2edc9b 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java +++ b/redaction-service-v1/redaction-service-server-v1/src/test/java/com/iqser/red/service/redaction/v1/server/RedactionIntegrationTest.java @@ -37,6 +37,7 @@ import org.kie.api.builder.KieBuilder; import org.kie.api.builder.KieFileSystem; import org.kie.api.builder.KieModule; import org.kie.api.runtime.KieContainer; +import org.opencv.core.Core; import org.springframework.amqp.rabbit.core.RabbitTemplate; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.autoconfigure.EnableAutoConfiguration; @@ -66,6 +67,7 @@ import com.iqser.red.service.redaction.v1.model.AnalyzeResult; import com.iqser.red.service.redaction.v1.model.AnnotateRequest; import com.iqser.red.service.redaction.v1.model.AnnotateResponse; import com.iqser.red.service.redaction.v1.model.FileAttribute; +import com.iqser.red.service.redaction.v1.model.ImportedRedactions; import com.iqser.red.service.redaction.v1.model.RedactionLogEntry; import com.iqser.red.service.redaction.v1.model.RedactionRequest; import com.iqser.red.service.redaction.v1.model.RedactionResult; @@ -78,6 +80,7 @@ import com.iqser.red.service.redaction.v1.server.controller.RedactionController; import com.iqser.red.service.redaction.v1.server.memory.MemoryStats; import com.iqser.red.service.redaction.v1.server.redaction.service.AnalyzeService; import com.iqser.red.service.redaction.v1.server.redaction.service.ManualRedactionSurroundingTextService; +import com.iqser.red.service.redaction.v1.server.redaction.service.RedactionExtractorService; import com.iqser.red.service.redaction.v1.server.redaction.utils.OsUtils; import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader; import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities; @@ -554,6 +557,44 @@ public class RedactionIntegrationTest { } + @Autowired + private RedactionExtractorService redactionExtractorService; + +// static{ System.loadLibrary(Core.NATIVE_LIBRARY_NAME); } + + @Test + @SneakyThrows + public void testImportRedactions() { + nu.pattern.OpenCV.loadShared(); + + ClassPathResource pdfFileResource = new ClassPathResource("files/redest_man_page.pdf"); + + ImportedRedactions importedRedactions = redactionExtractorService.extractRedaction(pdfFileResource.getInputStream()); + + + String outputFileName = OsUtils.getTemporaryDirectory() + "/Annotated.pdf"; + + + AnalyzeRequest request = prepareStorage(pdfFileResource.getInputStream()); + storageService.storeObject(RedactionStorageService.StorageIdUtils.getStorageId(TEST_DOSSIER_ID, TEST_FILE_ID, FileType.IMPORTED_REDACTIONS), objectMapper.writeValueAsBytes(importedRedactions)); + + analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId())); + AnalyzeResult result = analyzeService.analyze(request); + + var redactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID); + + AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder() + .dossierId(TEST_DOSSIER_ID) + .fileId(TEST_FILE_ID) + .build()); + + try (FileOutputStream fileOutputStream = new FileOutputStream(outputFileName)) { + fileOutputStream.write(annotateResponse.getDocument()); + } + + } + + @Test @Ignore public void testLargeScannedFileOOM() { diff --git a/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/redest_man_page.pdf b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/redest_man_page.pdf new file mode 100644 index 00000000..e7f6f9b5 Binary files /dev/null and b/redaction-service-v1/redaction-service-server-v1/src/test/resources/files/redest_man_page.pdf differ