Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
350de7f385 | ||
|
|
b09f036253 |
@ -20,6 +20,11 @@
|
|||||||
</properties>
|
</properties>
|
||||||
|
|
||||||
<dependencies>
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.openpnp</groupId>
|
||||||
|
<artifactId>opencv</artifactId>
|
||||||
|
<version>4.5.1-2</version>
|
||||||
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.iqser.red.commons</groupId>
|
<groupId>com.iqser.red.commons</groupId>
|
||||||
<artifactId>storage-commons</artifactId>
|
<artifactId>storage-commons</artifactId>
|
||||||
|
|||||||
@ -0,0 +1,184 @@
|
|||||||
|
package com.iqser.red.service.redaction.v1.server.redaction.service;
|
||||||
|
|
||||||
|
import static org.opencv.imgproc.Imgproc.COLOR_BGR2GRAY;
|
||||||
|
|
||||||
|
import java.awt.Color;
|
||||||
|
import java.awt.image.BufferedImage;
|
||||||
|
import java.awt.image.DataBufferByte;
|
||||||
|
import java.io.ByteArrayOutputStream;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileOutputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.UUID;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import javax.imageio.ImageIO;
|
||||||
|
|
||||||
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDPage;
|
||||||
|
import org.apache.pdfbox.rendering.ImageType;
|
||||||
|
import org.apache.pdfbox.rendering.PDFRenderer;
|
||||||
|
import org.opencv.core.Core;
|
||||||
|
import org.opencv.core.Mat;
|
||||||
|
import org.opencv.core.MatOfByte;
|
||||||
|
import org.opencv.core.MatOfPoint;
|
||||||
|
import org.opencv.core.MatOfPoint2f;
|
||||||
|
import org.opencv.core.Rect;
|
||||||
|
import org.opencv.core.Scalar;
|
||||||
|
import org.opencv.core.Size;
|
||||||
|
import org.opencv.imgcodecs.Imgcodecs;
|
||||||
|
import org.opencv.imgproc.Imgproc;
|
||||||
|
import org.springframework.beans.BeanUtils;
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.iqser.red.service.redaction.v1.model.ImportedRedaction;
|
||||||
|
import com.iqser.red.service.redaction.v1.model.ImportedRedactions;
|
||||||
|
import com.iqser.red.service.redaction.v1.model.Point;
|
||||||
|
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||||
|
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
public class RedactionExtractorService {
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
public ImportedRedactions extractRedaction(InputStream pdfFile) {
|
||||||
|
|
||||||
|
PDDocument pdfDocument = PDDocument.load(pdfFile);
|
||||||
|
pdfDocument.setAllSecurityToBeRemoved(true);
|
||||||
|
|
||||||
|
long pageCount = pdfDocument.getNumberOfPages();
|
||||||
|
|
||||||
|
Map<Integer, List<ImportedRedaction>> importedRedactionsMap = new HashMap<>();
|
||||||
|
for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
|
||||||
|
PDPage page = pdfDocument.getPage(pageNumber - 1);
|
||||||
|
BufferedImage image = pageConvertToImage(pdfDocument, page, 200, ImageType.GRAY);
|
||||||
|
|
||||||
|
|
||||||
|
Mat imageAsMat = bufferedImage2Mat(image);
|
||||||
|
|
||||||
|
Mat imageAsMatINv = new Mat();
|
||||||
|
|
||||||
|
Mat invertcolormatrix= new Mat(imageAsMat.rows(),imageAsMat.cols(), imageAsMat.type(), new Scalar(255,255,255));
|
||||||
|
|
||||||
|
Core.subtract(invertcolormatrix, imageAsMat, imageAsMatINv);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Mat blurred = new Mat();
|
||||||
|
Imgproc.GaussianBlur(imageAsMatINv,blurred, new Size(5, 5), 1);
|
||||||
|
Mat thresh = new Mat();
|
||||||
|
Imgproc.threshold(blurred, thresh, 240, 255, Imgproc.THRESH_BINARY);
|
||||||
|
|
||||||
|
|
||||||
|
Mat hierachy = new Mat();
|
||||||
|
List<MatOfPoint> contours = new ArrayList<>();
|
||||||
|
Imgproc.findContours(thresh,contours,hierachy, Imgproc.RETR_TREE, Imgproc.CHAIN_APPROX_NONE);
|
||||||
|
|
||||||
|
|
||||||
|
System.out.println(contours);
|
||||||
|
System.out.println(hierachy.size(3));
|
||||||
|
|
||||||
|
var min_nomralized_area=800;
|
||||||
|
|
||||||
|
if(hierachy.rows() > 0 && hierachy.row(0).rows() > 0) {
|
||||||
|
hierachy = hierachy.row(0).row(0);
|
||||||
|
|
||||||
|
System.out.println("Cont" + contours.size() + "Hier" + hierachy.cols());
|
||||||
|
|
||||||
|
// for(int j = 0; j<= hierachy.cols(); j++){
|
||||||
|
// System.out.println("----> Col " + j);
|
||||||
|
// for (double d :hierachy.get(0,j)){
|
||||||
|
// System.out.println( " " + d);
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
|
||||||
|
int i = 0;
|
||||||
|
List<MatOfPoint> filterd = new ArrayList<>();
|
||||||
|
for (MatOfPoint contour : contours) {
|
||||||
|
if (isLikelyRedaction(contour, hierachy.get(0, i), min_nomralized_area)) {
|
||||||
|
filterd.add(contour);
|
||||||
|
}
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
|
||||||
|
Imgproc.drawContours(imageAsMat, filterd, -1, new Scalar(180), 2);
|
||||||
|
|
||||||
|
List<ImportedRedaction> importedRedactions = new ArrayList<>();
|
||||||
|
for (MatOfPoint contour : filterd) {
|
||||||
|
Rect rect = Imgproc.boundingRect(contour);
|
||||||
|
|
||||||
|
importedRedactions.add(new ImportedRedaction(UUID.randomUUID().toString(), List.of(new Rectangle(new Point((float) (rect.x * page.getCropBox()
|
||||||
|
.getWidth() / image.getWidth()), (float) (page.getCropBox().getHeight() - rect.y * page.getCropBox()
|
||||||
|
.getWidth() / image.getWidth())), rect.width * page.getCropBox()
|
||||||
|
.getWidth() / image.getWidth(), -rect.height * page.getCropBox()
|
||||||
|
.getWidth() / image.getWidth(), pageNumber))));
|
||||||
|
}
|
||||||
|
|
||||||
|
importedRedactionsMap.put(pageNumber, importedRedactions);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// BufferedImage gray = new BufferedImage(imageAsMat.width(), imageAsMat.height(), BufferedImage.TYPE_BYTE_GRAY);
|
||||||
|
// byte[] data = ((DataBufferByte) gray.getRaster().getDataBuffer()).getData();
|
||||||
|
// imageAsMat.get(0, 0, data);
|
||||||
|
//
|
||||||
|
// File outputfile = new File("/tmp/image.jpg");
|
||||||
|
// ImageIO.write(gray, "jpg", outputfile);
|
||||||
|
|
||||||
|
//
|
||||||
|
|
||||||
|
// System.out.println(contours);
|
||||||
|
}
|
||||||
|
pdfDocument.close();
|
||||||
|
|
||||||
|
return new ImportedRedactions(importedRedactionsMap);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
private boolean isFilled(double[] hierachy){
|
||||||
|
return hierachy[3] <= 0 && hierachy[2] == -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean isBoxy(MatOfPoint contour){
|
||||||
|
double epsilon = 0.01 * Imgproc.arcLength(new MatOfPoint2f( contour.toArray()), true);
|
||||||
|
MatOfPoint2f approx = new MatOfPoint2f();
|
||||||
|
Imgproc.approxPolyDP(new MatOfPoint2f( contour.toArray()), approx, epsilon, true);
|
||||||
|
return approx.toArray().length <= 10;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean largeEnough(float minArea, MatOfPoint contour){
|
||||||
|
return Imgproc.contourArea(contour, false) > minArea;
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean isLikelyRedaction(MatOfPoint contour, double[] hierachy , float minArea) {
|
||||||
|
return isFilled(hierachy) && isBoxy(contour) && largeEnough(minArea, contour);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
public static Mat bufferedImage2Mat(BufferedImage image) throws IOException {
|
||||||
|
|
||||||
|
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
|
||||||
|
ImageIO.write(image, "jpg", byteArrayOutputStream);
|
||||||
|
byteArrayOutputStream.flush();
|
||||||
|
return Imgcodecs.imdecode(new MatOfByte(byteArrayOutputStream.toByteArray()), Imgcodecs.IMREAD_UNCHANGED);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static BufferedImage pageConvertToImage(PDDocument doc, PDPage page, int dpi,
|
||||||
|
ImageType imageType) throws IOException {
|
||||||
|
|
||||||
|
PDFRenderer renderer = new PDFRenderer(doc);
|
||||||
|
return renderer.renderImageWithDPI(doc.getPages().indexOf(page), dpi, imageType);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -37,6 +37,7 @@ import org.kie.api.builder.KieBuilder;
|
|||||||
import org.kie.api.builder.KieFileSystem;
|
import org.kie.api.builder.KieFileSystem;
|
||||||
import org.kie.api.builder.KieModule;
|
import org.kie.api.builder.KieModule;
|
||||||
import org.kie.api.runtime.KieContainer;
|
import org.kie.api.runtime.KieContainer;
|
||||||
|
import org.opencv.core.Core;
|
||||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||||
import org.springframework.beans.factory.annotation.Autowired;
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
|
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
|
||||||
@ -66,6 +67,7 @@ import com.iqser.red.service.redaction.v1.model.AnalyzeResult;
|
|||||||
import com.iqser.red.service.redaction.v1.model.AnnotateRequest;
|
import com.iqser.red.service.redaction.v1.model.AnnotateRequest;
|
||||||
import com.iqser.red.service.redaction.v1.model.AnnotateResponse;
|
import com.iqser.red.service.redaction.v1.model.AnnotateResponse;
|
||||||
import com.iqser.red.service.redaction.v1.model.FileAttribute;
|
import com.iqser.red.service.redaction.v1.model.FileAttribute;
|
||||||
|
import com.iqser.red.service.redaction.v1.model.ImportedRedactions;
|
||||||
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
|
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
|
||||||
import com.iqser.red.service.redaction.v1.model.RedactionRequest;
|
import com.iqser.red.service.redaction.v1.model.RedactionRequest;
|
||||||
import com.iqser.red.service.redaction.v1.model.RedactionResult;
|
import com.iqser.red.service.redaction.v1.model.RedactionResult;
|
||||||
@ -78,6 +80,7 @@ import com.iqser.red.service.redaction.v1.server.controller.RedactionController;
|
|||||||
import com.iqser.red.service.redaction.v1.server.memory.MemoryStats;
|
import com.iqser.red.service.redaction.v1.server.memory.MemoryStats;
|
||||||
import com.iqser.red.service.redaction.v1.server.redaction.service.AnalyzeService;
|
import com.iqser.red.service.redaction.v1.server.redaction.service.AnalyzeService;
|
||||||
import com.iqser.red.service.redaction.v1.server.redaction.service.ManualRedactionSurroundingTextService;
|
import com.iqser.red.service.redaction.v1.server.redaction.service.ManualRedactionSurroundingTextService;
|
||||||
|
import com.iqser.red.service.redaction.v1.server.redaction.service.RedactionExtractorService;
|
||||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.OsUtils;
|
import com.iqser.red.service.redaction.v1.server.redaction.utils.OsUtils;
|
||||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
|
import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
|
||||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||||
@ -554,6 +557,45 @@ public class RedactionIntegrationTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Autowired
|
||||||
|
private RedactionExtractorService redactionExtractorService;
|
||||||
|
|
||||||
|
// static{ System.loadLibrary(Core.NATIVE_LIBRARY_NAME); }
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@SneakyThrows
|
||||||
|
public void testImportRedactions() {
|
||||||
|
nu.pattern.OpenCV.loadShared();
|
||||||
|
|
||||||
|
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06_redacted1.pdf");
|
||||||
|
|
||||||
|
ImportedRedactions importedRedactions = redactionExtractorService.extractRedaction(pdfFileResource.getInputStream());
|
||||||
|
|
||||||
|
|
||||||
|
String outputFileName = OsUtils.getTemporaryDirectory() + "/Annotated.pdf";
|
||||||
|
|
||||||
|
ClassPathResource pdfFileResource2 = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
|
||||||
|
|
||||||
|
AnalyzeRequest request = prepareStorage(pdfFileResource2.getInputStream());
|
||||||
|
storageService.storeObject(RedactionStorageService.StorageIdUtils.getStorageId(TEST_DOSSIER_ID, TEST_FILE_ID, FileType.IMPORTED_REDACTIONS), objectMapper.writeValueAsBytes(importedRedactions));
|
||||||
|
|
||||||
|
analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
|
||||||
|
AnalyzeResult result = analyzeService.analyze(request);
|
||||||
|
|
||||||
|
var redactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID);
|
||||||
|
|
||||||
|
AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder()
|
||||||
|
.dossierId(TEST_DOSSIER_ID)
|
||||||
|
.fileId(TEST_FILE_ID)
|
||||||
|
.build());
|
||||||
|
|
||||||
|
try (FileOutputStream fileOutputStream = new FileOutputStream(outputFileName)) {
|
||||||
|
fileOutputStream.write(annotateResponse.getDocument());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@Ignore
|
@Ignore
|
||||||
public void testLargeScannedFileOOM() {
|
public void testLargeScannedFileOOM() {
|
||||||
|
|||||||
Binary file not shown.
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user