Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
350de7f385 | ||
|
|
b09f036253 |
@ -20,6 +20,11 @@
|
||||
</properties>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.openpnp</groupId>
|
||||
<artifactId>opencv</artifactId>
|
||||
<version>4.5.1-2</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.iqser.red.commons</groupId>
|
||||
<artifactId>storage-commons</artifactId>
|
||||
|
||||
@ -0,0 +1,184 @@
|
||||
package com.iqser.red.service.redaction.v1.server.redaction.service;
|
||||
|
||||
import static org.opencv.imgproc.Imgproc.COLOR_BGR2GRAY;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.awt.image.DataBufferByte;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.UUID;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.rendering.ImageType;
|
||||
import org.apache.pdfbox.rendering.PDFRenderer;
|
||||
import org.opencv.core.Core;
|
||||
import org.opencv.core.Mat;
|
||||
import org.opencv.core.MatOfByte;
|
||||
import org.opencv.core.MatOfPoint;
|
||||
import org.opencv.core.MatOfPoint2f;
|
||||
import org.opencv.core.Rect;
|
||||
import org.opencv.core.Scalar;
|
||||
import org.opencv.core.Size;
|
||||
import org.opencv.imgcodecs.Imgcodecs;
|
||||
import org.opencv.imgproc.Imgproc;
|
||||
import org.springframework.beans.BeanUtils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.model.ImportedRedaction;
|
||||
import com.iqser.red.service.redaction.v1.model.ImportedRedactions;
|
||||
import com.iqser.red.service.redaction.v1.model.Point;
|
||||
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@Service
|
||||
public class RedactionExtractorService {
|
||||
|
||||
@SneakyThrows
|
||||
public ImportedRedactions extractRedaction(InputStream pdfFile) {
|
||||
|
||||
PDDocument pdfDocument = PDDocument.load(pdfFile);
|
||||
pdfDocument.setAllSecurityToBeRemoved(true);
|
||||
|
||||
long pageCount = pdfDocument.getNumberOfPages();
|
||||
|
||||
Map<Integer, List<ImportedRedaction>> importedRedactionsMap = new HashMap<>();
|
||||
for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
|
||||
PDPage page = pdfDocument.getPage(pageNumber - 1);
|
||||
BufferedImage image = pageConvertToImage(pdfDocument, page, 200, ImageType.GRAY);
|
||||
|
||||
|
||||
Mat imageAsMat = bufferedImage2Mat(image);
|
||||
|
||||
Mat imageAsMatINv = new Mat();
|
||||
|
||||
Mat invertcolormatrix= new Mat(imageAsMat.rows(),imageAsMat.cols(), imageAsMat.type(), new Scalar(255,255,255));
|
||||
|
||||
Core.subtract(invertcolormatrix, imageAsMat, imageAsMatINv);
|
||||
|
||||
|
||||
|
||||
Mat blurred = new Mat();
|
||||
Imgproc.GaussianBlur(imageAsMatINv,blurred, new Size(5, 5), 1);
|
||||
Mat thresh = new Mat();
|
||||
Imgproc.threshold(blurred, thresh, 240, 255, Imgproc.THRESH_BINARY);
|
||||
|
||||
|
||||
Mat hierachy = new Mat();
|
||||
List<MatOfPoint> contours = new ArrayList<>();
|
||||
Imgproc.findContours(thresh,contours,hierachy, Imgproc.RETR_TREE, Imgproc.CHAIN_APPROX_NONE);
|
||||
|
||||
|
||||
System.out.println(contours);
|
||||
System.out.println(hierachy.size(3));
|
||||
|
||||
var min_nomralized_area=800;
|
||||
|
||||
if(hierachy.rows() > 0 && hierachy.row(0).rows() > 0) {
|
||||
hierachy = hierachy.row(0).row(0);
|
||||
|
||||
System.out.println("Cont" + contours.size() + "Hier" + hierachy.cols());
|
||||
|
||||
// for(int j = 0; j<= hierachy.cols(); j++){
|
||||
// System.out.println("----> Col " + j);
|
||||
// for (double d :hierachy.get(0,j)){
|
||||
// System.out.println( " " + d);
|
||||
// }
|
||||
// }
|
||||
|
||||
int i = 0;
|
||||
List<MatOfPoint> filterd = new ArrayList<>();
|
||||
for (MatOfPoint contour : contours) {
|
||||
if (isLikelyRedaction(contour, hierachy.get(0, i), min_nomralized_area)) {
|
||||
filterd.add(contour);
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
Imgproc.drawContours(imageAsMat, filterd, -1, new Scalar(180), 2);
|
||||
|
||||
List<ImportedRedaction> importedRedactions = new ArrayList<>();
|
||||
for (MatOfPoint contour : filterd) {
|
||||
Rect rect = Imgproc.boundingRect(contour);
|
||||
|
||||
importedRedactions.add(new ImportedRedaction(UUID.randomUUID().toString(), List.of(new Rectangle(new Point((float) (rect.x * page.getCropBox()
|
||||
.getWidth() / image.getWidth()), (float) (page.getCropBox().getHeight() - rect.y * page.getCropBox()
|
||||
.getWidth() / image.getWidth())), rect.width * page.getCropBox()
|
||||
.getWidth() / image.getWidth(), -rect.height * page.getCropBox()
|
||||
.getWidth() / image.getWidth(), pageNumber))));
|
||||
}
|
||||
|
||||
importedRedactionsMap.put(pageNumber, importedRedactions);
|
||||
|
||||
}
|
||||
|
||||
// BufferedImage gray = new BufferedImage(imageAsMat.width(), imageAsMat.height(), BufferedImage.TYPE_BYTE_GRAY);
|
||||
// byte[] data = ((DataBufferByte) gray.getRaster().getDataBuffer()).getData();
|
||||
// imageAsMat.get(0, 0, data);
|
||||
//
|
||||
// File outputfile = new File("/tmp/image.jpg");
|
||||
// ImageIO.write(gray, "jpg", outputfile);
|
||||
|
||||
//
|
||||
|
||||
// System.out.println(contours);
|
||||
}
|
||||
pdfDocument.close();
|
||||
|
||||
return new ImportedRedactions(importedRedactionsMap);
|
||||
}
|
||||
|
||||
|
||||
|
||||
private boolean isFilled(double[] hierachy){
|
||||
return hierachy[3] <= 0 && hierachy[2] == -1;
|
||||
}
|
||||
|
||||
private boolean isBoxy(MatOfPoint contour){
|
||||
double epsilon = 0.01 * Imgproc.arcLength(new MatOfPoint2f( contour.toArray()), true);
|
||||
MatOfPoint2f approx = new MatOfPoint2f();
|
||||
Imgproc.approxPolyDP(new MatOfPoint2f( contour.toArray()), approx, epsilon, true);
|
||||
return approx.toArray().length <= 10;
|
||||
}
|
||||
|
||||
|
||||
private boolean largeEnough(float minArea, MatOfPoint contour){
|
||||
return Imgproc.contourArea(contour, false) > minArea;
|
||||
}
|
||||
|
||||
private boolean isLikelyRedaction(MatOfPoint contour, double[] hierachy , float minArea) {
|
||||
return isFilled(hierachy) && isBoxy(contour) && largeEnough(minArea, contour);
|
||||
}
|
||||
|
||||
|
||||
|
||||
public static Mat bufferedImage2Mat(BufferedImage image) throws IOException {
|
||||
|
||||
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
|
||||
ImageIO.write(image, "jpg", byteArrayOutputStream);
|
||||
byteArrayOutputStream.flush();
|
||||
return Imgcodecs.imdecode(new MatOfByte(byteArrayOutputStream.toByteArray()), Imgcodecs.IMREAD_UNCHANGED);
|
||||
}
|
||||
|
||||
|
||||
public static BufferedImage pageConvertToImage(PDDocument doc, PDPage page, int dpi,
|
||||
ImageType imageType) throws IOException {
|
||||
|
||||
PDFRenderer renderer = new PDFRenderer(doc);
|
||||
return renderer.renderImageWithDPI(doc.getPages().indexOf(page), dpi, imageType);
|
||||
}
|
||||
|
||||
}
|
||||
@ -37,6 +37,7 @@ import org.kie.api.builder.KieBuilder;
|
||||
import org.kie.api.builder.KieFileSystem;
|
||||
import org.kie.api.builder.KieModule;
|
||||
import org.kie.api.runtime.KieContainer;
|
||||
import org.opencv.core.Core;
|
||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
|
||||
@ -66,6 +67,7 @@ import com.iqser.red.service.redaction.v1.model.AnalyzeResult;
|
||||
import com.iqser.red.service.redaction.v1.model.AnnotateRequest;
|
||||
import com.iqser.red.service.redaction.v1.model.AnnotateResponse;
|
||||
import com.iqser.red.service.redaction.v1.model.FileAttribute;
|
||||
import com.iqser.red.service.redaction.v1.model.ImportedRedactions;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionLogEntry;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionRequest;
|
||||
import com.iqser.red.service.redaction.v1.model.RedactionResult;
|
||||
@ -78,6 +80,7 @@ import com.iqser.red.service.redaction.v1.server.controller.RedactionController;
|
||||
import com.iqser.red.service.redaction.v1.server.memory.MemoryStats;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.service.AnalyzeService;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.service.ManualRedactionSurroundingTextService;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.service.RedactionExtractorService;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.OsUtils;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.ResourceLoader;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.utils.TextNormalizationUtilities;
|
||||
@ -554,6 +557,45 @@ public class RedactionIntegrationTest {
|
||||
}
|
||||
|
||||
|
||||
// Service under test in testImportRedactions; injected via @Autowired.
@Autowired
|
||||
private RedactionExtractorService redactionExtractorService;
|
||||
|
||||
// static{ System.loadLibrary(Core.NATIVE_LIBRARY_NAME); }
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testImportRedactions() {
|
||||
nu.pattern.OpenCV.loadShared();
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06_redacted1.pdf");
|
||||
|
||||
ImportedRedactions importedRedactions = redactionExtractorService.extractRedaction(pdfFileResource.getInputStream());
|
||||
|
||||
|
||||
String outputFileName = OsUtils.getTemporaryDirectory() + "/Annotated.pdf";
|
||||
|
||||
ClassPathResource pdfFileResource2 = new ClassPathResource("files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
|
||||
|
||||
AnalyzeRequest request = prepareStorage(pdfFileResource2.getInputStream());
|
||||
storageService.storeObject(RedactionStorageService.StorageIdUtils.getStorageId(TEST_DOSSIER_ID, TEST_FILE_ID, FileType.IMPORTED_REDACTIONS), objectMapper.writeValueAsBytes(importedRedactions));
|
||||
|
||||
analyzeService.analyzeDocumentStructure(new StructureAnalyzeRequest(request.getDossierId(), request.getFileId()));
|
||||
AnalyzeResult result = analyzeService.analyze(request);
|
||||
|
||||
var redactionLog = redactionStorageService.getRedactionLog(TEST_DOSSIER_ID, TEST_FILE_ID);
|
||||
|
||||
AnnotateResponse annotateResponse = redactionController.annotate(AnnotateRequest.builder()
|
||||
.dossierId(TEST_DOSSIER_ID)
|
||||
.fileId(TEST_FILE_ID)
|
||||
.build());
|
||||
|
||||
try (FileOutputStream fileOutputStream = new FileOutputStream(outputFileName)) {
|
||||
fileOutputStream.write(annotateResponse.getDocument());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@Ignore
|
||||
public void testLargeScannedFileOOM() {
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user