Compare commits

...

2 Commits

Author SHA1 Message Date
deiflaender
350de7f385 Import redaction fixes 2022-02-08 14:37:03 +01:00
deiflaender
b09f036253 OpenCv First Version 2022-02-04 16:43:47 +01:00
5 changed files with 231 additions and 0 deletions

View File

@ -20,6 +20,11 @@
</properties>
<dependencies>
<dependency>
<groupId>org.openpnp</groupId>
<artifactId>opencv</artifactId>
<version>4.5.1-2</version>
</dependency>
<dependency>
<groupId>com.iqser.red.commons</groupId>
<artifactId>storage-commons</artifactId>

View File

@ -0,0 +1,184 @@
package com.iqser.red.service.redaction.v1.server.redaction.service;
import static org.opencv.imgproc.Imgproc.COLOR_BGR2GRAY;
import java.awt.Color;
import java.awt.image.BufferedImage;
import java.awt.image.DataBufferByte;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.stream.Collectors;
import javax.imageio.ImageIO;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.opencv.core.Core;
import org.opencv.core.Mat;
import org.opencv.core.MatOfByte;
import org.opencv.core.MatOfPoint;
import org.opencv.core.MatOfPoint2f;
import org.opencv.core.Rect;
import org.opencv.core.Scalar;
import org.opencv.core.Size;
import org.opencv.imgcodecs.Imgcodecs;
import org.opencv.imgproc.Imgproc;
import org.springframework.beans.BeanUtils;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.model.ImportedRedaction;
import com.iqser.red.service.redaction.v1.model.ImportedRedactions;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import lombok.SneakyThrows;
/**
 * Recovers redaction rectangles from an already redacted PDF by rendering every page to a
 * grayscale image and searching it with OpenCV for large, solid, box-shaped near-black areas.
 *
 * <p>NOTE(review): nothing in this class loads the OpenCV native library; the integration test
 * calls {@code nu.pattern.OpenCV.loadShared()} before using the service. Confirm that production
 * start-up does the same.
 */
@Service
public class RedactionExtractorService {

    /** Resolution used to rasterize each page before contour detection. */
    private static final int RENDER_DPI = 200;

    /** Inverted-gray value a pixel must exceed to count as "redacted" (i.e. original value below 15). */
    private static final double INVERTED_THRESHOLD = 240;

    /** Minimum contour area, in rendered pixels, for a shape to be considered a redaction. */
    private static final float MIN_CONTOUR_AREA = 800;

    /** Upper bound on polygon vertices after simplification for a contour to count as "boxy". */
    private static final int MAX_BOX_VERTICES = 10;


    /**
     * Scans all pages of the given PDF for redaction boxes.
     *
     * @param pdfFile stream containing the PDF to inspect
     * @return the detected redactions keyed by 1-based page number; a page on which no contour at
     *         all was found has no entry, a page with contours but no match has an empty list
     */
    @SneakyThrows
    public ImportedRedactions extractRedaction(InputStream pdfFile) {

        Map<Integer, List<ImportedRedaction>> importedRedactionsMap = new HashMap<>();

        // try-with-resources: the document is closed even when rendering or OpenCV fails mid-way.
        try (PDDocument pdfDocument = PDDocument.load(pdfFile)) {
            pdfDocument.setAllSecurityToBeRemoved(true);

            // One renderer for the whole document; pages are addressed by their known index.
            PDFRenderer renderer = new PDFRenderer(pdfDocument);
            int pageCount = pdfDocument.getNumberOfPages();

            for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
                PDPage page = pdfDocument.getPage(pageNumber - 1);
                BufferedImage image = renderer.renderImageWithDPI(pageNumber - 1, RENDER_DPI, ImageType.GRAY);
                collectPageRedactions(page, pageNumber, image, importedRedactionsMap);
            }
        }
        return new ImportedRedactions(importedRedactionsMap);
    }


    /**
     * Runs the contour detection on one rendered page and stores the result in {@code target}.
     * Nothing is stored when OpenCV reports no contours for the page.
     */
    private void collectPageRedactions(PDPage page,
                                       int pageNumber,
                                       BufferedImage image,
                                       Map<Integer, List<ImportedRedaction>> target) throws IOException {

        Mat gray = bufferedImage2Mat(image);
        Mat white = new Mat(gray.rows(), gray.cols(), gray.type(), new Scalar(255, 255, 255));
        Mat inverted = new Mat();
        Mat blurred = new Mat();
        Mat thresholded = new Mat();
        Mat hierarchy = new Mat();
        List<MatOfPoint> contours = new ArrayList<>();

        try {
            // Invert so that black redaction boxes become the bright foreground.
            Core.subtract(white, gray, inverted);
            Imgproc.GaussianBlur(inverted, blurred, new Size(5, 5), 1);
            Imgproc.threshold(blurred, thresholded, INVERTED_THRESHOLD, 255, Imgproc.THRESH_BINARY);
            Imgproc.findContours(thresholded, contours, hierarchy, Imgproc.RETR_TREE, Imgproc.CHAIN_APPROX_NONE);

            if (hierarchy.rows() == 0) {
                return;
            }

            // NOTE(review): only the crop box size is used; a crop box with a non-zero origin or a
            // rotated page is not compensated for - confirm whether that matters for the inputs.
            float cropWidth = page.getCropBox().getWidth();
            float cropHeight = page.getCropBox().getHeight();
            int imageWidth = image.getWidth();

            List<ImportedRedaction> importedRedactions = new ArrayList<>();
            for (int i = 0; i < contours.size(); i++) {
                MatOfPoint contour = contours.get(i);
                // hierarchy is a single row with one 4-value entry per contour.
                if (!isLikelyRedaction(contour, hierarchy.get(0, i), MIN_CONTOUR_AREA)) {
                    continue;
                }
                Rect rect = Imgproc.boundingRect(contour);

                // Image origin is top-left, PDF origin is bottom-left: flip y and use a negative height.
                Point topLeft = new Point((float) toPdfUnits(rect.x, cropWidth, imageWidth),
                        (float) (cropHeight - toPdfUnits(rect.y, cropWidth, imageWidth)));
                Rectangle position = new Rectangle(topLeft,
                        toPdfUnits(rect.width, cropWidth, imageWidth),
                        toPdfUnits(-rect.height, cropWidth, imageWidth),
                        pageNumber);
                importedRedactions.add(new ImportedRedaction(UUID.randomUUID().toString(), List.of(position)));
            }
            target.put(pageNumber, importedRedactions);
        } finally {
            // Page-sized native buffers: free them now instead of waiting for garbage collection.
            for (Mat scratch : List.of(gray, white, inverted, blurred, thresholded, hierarchy)) {
                scratch.release();
            }
            contours.forEach(Mat::release);
        }
    }


    /** Scales a pixel measure to PDF units using the ratio of crop box width to image width. */
    private static float toPdfUnits(int pixels, float cropWidth, int imageWidth) {

        return pixels * cropWidth / imageWidth;
    }


    /**
     * Checks the contour's hierarchy entry: it must have no child contour (index 2 is -1) and a
     * parent index (index 3) of at most 0.
     */
    private boolean isFilled(double[] hierachy) {

        return hierachy[3] <= 0 && hierachy[2] == -1;
    }


    /**
     * Simplifies the contour with a tolerance of 1% of its perimeter and accepts it when at most
     * {@value #MAX_BOX_VERTICES} vertices remain.
     */
    private boolean isBoxy(MatOfPoint contour) {

        MatOfPoint2f curve = new MatOfPoint2f(contour.toArray());
        MatOfPoint2f approx = new MatOfPoint2f();
        try {
            double epsilon = 0.01 * Imgproc.arcLength(curve, true);
            Imgproc.approxPolyDP(curve, approx, epsilon, true);
            return approx.toArray().length <= MAX_BOX_VERTICES;
        } finally {
            curve.release();
            approx.release();
        }
    }


    /** Returns whether the contour's area is strictly greater than {@code minArea}. */
    private boolean largeEnough(float minArea, MatOfPoint contour) {

        return Imgproc.contourArea(contour, false) > minArea;
    }


    /** A contour is treated as a redaction when it is filled, boxy and large enough. */
    private boolean isLikelyRedaction(MatOfPoint contour, double[] hierachy, float minArea) {

        return isFilled(hierachy) && isBoxy(contour) && largeEnough(minArea, contour);
    }


    /**
     * Converts a {@link BufferedImage} to an OpenCV {@link Mat} by encoding it as JPEG in memory
     * and decoding the bytes with OpenCV.
     *
     * <p>NOTE(review): JPEG is lossy; a lossless format might give cleaner thresholding - kept as
     * is so detection results do not change.
     *
     * @param image the image to convert
     * @return the decoded matrix, with the channel layout OpenCV derives from the encoded data
     * @throws IOException if the image cannot be encoded
     */
    public static Mat bufferedImage2Mat(BufferedImage image) throws IOException {

        ByteArrayOutputStream encoded = new ByteArrayOutputStream();
        // ImageIO.write returns false (without throwing) when no writer accepts the image.
        if (!ImageIO.write(image, "jpg", encoded)) {
            throw new IOException("No JPEG writer available for BufferedImage type " + image.getType());
        }
        MatOfByte buffer = new MatOfByte(encoded.toByteArray());
        try {
            return Imgcodecs.imdecode(buffer, Imgcodecs.IMREAD_UNCHANGED);
        } finally {
            buffer.release();
        }
    }


    /**
     * Renders a single page of the document to an image.
     *
     * @param doc the document containing the page
     * @param page the page to render; its index is looked up in {@code doc}
     * @param dpi the rendering resolution
     * @param imageType the color model of the resulting image
     * @return the rendered page
     * @throws IOException if rendering fails
     */
    public static BufferedImage pageConvertToImage(PDDocument doc, PDPage page, int dpi,
                                                   ImageType imageType) throws IOException {

        PDFRenderer renderer = new PDFRenderer(doc);
        return renderer.renderImageWithDPI(doc.getPages().indexOf(page), dpi, imageType);
    }

}

View File

@ -37,6 +37,7 @@ import org.kie.api.builder.KieBuilder;
import org.kie.api.builder.KieFileSystem;
import org.kie.api.builder.KieModule;
import org.kie.api.runtime.KieContainer;
import org.opencv.core.Core;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
@ -66,6 +67,7 @@ import com.iqser.red.service.redaction.v1.model.AnalyzeResult;
import com.iqser.red.service.redaction.v1.model.AnnotateRequest;
import com.iqser.red.service.redaction.v1.model.AnnotateResponse;
import com.iqser.red.service.redaction.v1.model.FileAttribute;
import com.iqser.red.service.redaction.v1.model.ImportedRedactions;
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
import com.iqser.red.service.redaction.v1.model.RedactionRequest;
import com.iqser.red.service.redaction.v1.model.RedactionResult;
@ -78,6 +80,7 @@ import com.iqser.red.service.redaction.v1.server.controller.RedactionController;
import com.iqser.red.service.redaction.v1.server.memory.MemoryStats;
import com.iqser.red.service.redaction.v1.server.redaction.service.AnalyzeService;
import com.iqser.red.service.redaction.v1.server.redaction.service.ManualRedactionSurroundingTextService;
import com.iqser.red.service.redaction.v1.server.redaction.service.RedactionExtractorService;
import com.iqser.red.service.redaction.v1.server.redaction.utils.OsUtils;
import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
@ -554,6 +557,45 @@ public class RedactionIntegrationTest {
}
// Service under test for testImportRedactions: extracts ImportedRedactions from a PDF input stream.
@Autowired
private RedactionExtractorService redactionExtractorService;
// static{ System.loadLibrary(Core.NATIVE_LIBRARY_NAME); }
/**
 * Extracts redactions from the redacted variant of a document, stores them as imported
 * redactions for the unredacted variant, runs structure analysis, analysis and annotation, and
 * writes the annotated PDF to the temporary directory. The test makes no assertions; the
 * written file is the only output.
 */
@Test
@SneakyThrows
public void testImportRedactions() {

    // The OpenCV native library has to be loaded before the extractor is called.
    nu.pattern.OpenCV.loadShared();

    ClassPathResource redactedPdf = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06_redacted1.pdf");
    ImportedRedactions extracted = redactionExtractorService.extractRedaction(redactedPdf.getInputStream());

    String annotatedPath = OsUtils.getTemporaryDirectory() + "/Annotated.pdf";

    ClassPathResource originalPdf = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
    AnalyzeRequest analyzeRequest = prepareStorage(originalPdf.getInputStream());

    // Make the extracted redactions available to the analysis as imported redactions.
    String importedRedactionsId = RedactionStorageService.StorageIdUtils.getStorageId(TEST_DOSSIER_ID, TEST_FILE_ID, FileType.IMPORTED_REDACTIONS);
    byte[] serializedRedactions = objectMapper.writeValueAsBytes(extracted);
    storageService.storeObject(importedRedactionsId, serializedRedactions);

    StructureAnalyzeRequest structureRequest = new StructureAnalyzeRequest(analyzeRequest.getDossierId(), analyzeRequest.getFileId());
    analyzeService.analyzeDocumentStructure(structureRequest);
    analyzeService.analyze(analyzeRequest);

    // Return value is not inspected; the lookup itself is kept.
    redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID);

    AnnotateRequest annotateRequest = AnnotateRequest.builder()
            .dossierId(TEST_DOSSIER_ID)
            .fileId(TEST_FILE_ID)
            .build();
    AnnotateResponse annotated = redactionController.annotate(annotateRequest);

    try (FileOutputStream out = new FileOutputStream(annotatedPath)) {
        out.write(annotated.getDocument());
    }
}
@Test
@Ignore
public void testLargeScannedFileOOM() {