RED-7075: New watermark removal logic
This commit is contained in:
parent
71cdb62229
commit
acd300ebc9
@ -0,0 +1,348 @@
|
||||
package com.iqser.red.pdftronlogic.commons;
|
||||
|
||||
import java.awt.Image;
|
||||
import java.awt.Toolkit;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.File;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.filters.FileDescriptorFilter;
|
||||
import com.pdftron.filters.Filter;
|
||||
import com.pdftron.filters.FilterReader;
|
||||
import com.pdftron.filters.FilterWriter;
|
||||
import com.pdftron.pdf.ColorPt;
|
||||
import com.pdftron.pdf.ColorSpace;
|
||||
import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.ElementBuilder;
|
||||
import com.pdftron.pdf.ElementReader;
|
||||
import com.pdftron.pdf.ElementWriter;
|
||||
import com.pdftron.pdf.Image2RGB;
|
||||
import com.pdftron.pdf.Optimizer;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.pdf.PageIterator;
|
||||
import com.pdftron.sdf.SDFDoc;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
public class WatermarkRemovalService {
|
||||
|
||||
final static double AREA_THRESHOLD = 0.6; // multiplied with page area
|
||||
final static double OCCURING_ON_PAGES_THRESHOLD_FACTOR = 0.4; // multiplied with number of pages
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static void removeWatermarks(InputStream pdfFile, OutputStream out) {
|
||||
|
||||
PDFDoc pdfDoc = new PDFDoc(pdfFile);
|
||||
|
||||
Map<Long, List<ElementFeatures>> formObjectsForPages = findAllFormObjectsAndImages(pdfDoc);
|
||||
|
||||
List<ElementFeatures> watermarkElementFeatures = filterSameFormObjectsOccuringOnMostPages(formObjectsForPages);
|
||||
|
||||
storeWatermarkImageHashValues(watermarkElementFeatures);
|
||||
|
||||
removeAllWatermarks(pdfDoc, watermarkElementFeatures);
|
||||
|
||||
try {
|
||||
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
} finally {
|
||||
pdfDoc.close();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static void storeWatermarkImageHashValues(List<ElementFeatures> watermarkElementFeatures) {
|
||||
for(ElementFeatures elementFeatures : watermarkElementFeatures){
|
||||
if(elementFeatures.getElementType() == Element.e_image || elementFeatures.getElementType() == Element.e_inline_image){
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private static Map<Long, List<ElementFeatures>> findAllFormObjectsAndImages(PDFDoc pdfDoc) {
|
||||
|
||||
List<ElementFeatures> formObjectsOccuringMoreThanOnceOnAPage = new LinkedList<>();
|
||||
Map<Long, List<ElementFeatures>> formObjectsAndImagesForPages = new HashMap<>();
|
||||
Set<Long> visitedXObjIds = new TreeSet<>();
|
||||
|
||||
ElementReader reader = new ElementReader();
|
||||
|
||||
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
|
||||
|
||||
Page page = iterator.next();
|
||||
|
||||
double minAreaCoveringFromPage = AREA_THRESHOLD * page.getPageHeight() * page.getPageWidth();
|
||||
|
||||
LinkedList<ElementFeatures> elementFeaturesLinkedList = new LinkedList<>();
|
||||
|
||||
reader.begin(page);
|
||||
for (Element element = reader.next(); element != null; element = reader.next()) {
|
||||
if(element.getBBox() == null){
|
||||
continue;
|
||||
}
|
||||
if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringFromPage) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (element.getType() == Element.e_form) {
|
||||
//processXObject(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringFromPage);
|
||||
} else if (element.getType() == Element.e_image || element.getType() == Element.e_inline_image) {
|
||||
// causes empty pages so far
|
||||
processImages(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage);
|
||||
}
|
||||
}
|
||||
|
||||
formObjectsAndImagesForPages.put(page.getSDFObj().getObjNum(), elementFeaturesLinkedList);
|
||||
}
|
||||
|
||||
reader.destroy();
|
||||
|
||||
return formObjectsAndImagesForPages;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private static void processImages(Element element,
|
||||
Set<Long> visitedXObjIds,
|
||||
LinkedList<ElementFeatures> elementFeaturesLinkedList,
|
||||
List<ElementFeatures> formObjectsOccuringMoreThanOnceOnAPage) {
|
||||
|
||||
if(element.getType() == Element.e_image) {
|
||||
|
||||
//element.getImageData();
|
||||
|
||||
/*com.pdftron.pdf.Image image = new com.pdftron.pdf.Image(element.getXObject());
|
||||
System.out.println(image.getImageDataSize());
|
||||
//element.getImageData().writeToFile("C:\\Users\\RaphaelArnold\\AppData\\Local\\Temp\\" + "IMAGE1", false);
|
||||
String fname = "C:\\Users\\RaphaelArnold\\AppData\\Local\\Temp\\" + "IMAGE.png";
|
||||
image.exportAsPng(fname);
|
||||
|
||||
Image2RGB img_conv = new Image2RGB(element);
|
||||
FilterReader reader = new com.pdftron.filters.FilterReader(img_conv);
|
||||
byte[] image_data_out = new byte[element.getImageWidth() * element.getImageHeight() * 3];
|
||||
reader.read(image_data_out);
|
||||
System.out.println("he");
|
||||
|
||||
BufferedImage bufferedImage = ImageIO.read(new ByteArrayInputStream(image_data_out));
|
||||
bufferedImage.getScaledInstance(10,10,0);*/
|
||||
|
||||
|
||||
|
||||
//Optimizer.ImageSettings imageSettings = new Optimizer.ImageSettings();
|
||||
|
||||
|
||||
/*Image img = image.getBitmap();
|
||||
|
||||
BufferedImage bufferedImage= new BufferedImage(img.getWidth(null), img.getHeight(null), BufferedImage.TYPE_INT_RGB);
|
||||
img.getGraphics().drawImage(img, 0, 0, null);
|
||||
ImageIO.write(bufferedImage, "jpg", new File("C:\\myImage.jpg"));*/
|
||||
}
|
||||
ElementFeatures elementFeatures = ElementFeatures.extractFeatures(element);
|
||||
elementFeaturesLinkedList.add(elementFeatures);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private static boolean processXObject(Element element,
|
||||
Set<Long> visitedXObjIds,
|
||||
LinkedList<ElementFeatures> elementFeaturesLinkedList,
|
||||
List<ElementFeatures> formObjectsOccuringMoreThanOnceOnAPage,
|
||||
double minAreaCoveringPage) {
|
||||
|
||||
/*for(ElementFeatures elementFeatures1 : formObjectsOccuringMoreThanOnceOnAPage){
|
||||
if(elementFeatures1.almostMatches(element)){
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
for (ElementFeatures elementFeatures1 : elementFeaturesLinkedList) {
|
||||
if (elementFeatures1.almostMatches(element)) {
|
||||
ElementFeatures elementFeatures = ElementFeatures.extractFeatures(element);
|
||||
formObjectsOccuringMoreThanOnceOnAPage.add(elementFeatures);
|
||||
elementFeaturesLinkedList.remove(elementFeatures);
|
||||
return;
|
||||
}
|
||||
}*/
|
||||
|
||||
|
||||
if (visitedXObjIds.add(element.getXObject().getObjNum())) {
|
||||
|
||||
ElementReader xObjectReader = new ElementReader();
|
||||
xObjectReader.begin(element.getXObject());
|
||||
boolean isContainingImageBigEnough = true;
|
||||
for (Element element1 = xObjectReader.next(); element1 != null; element1 = xObjectReader.next()) {
|
||||
if (element1.getType() == Element.e_form) {
|
||||
isContainingImageBigEnough = processXObject(element1, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage);
|
||||
} else if((element1.getType() == Element.e_image || element1.getType() == Element.e_inline_image)) {
|
||||
if(element1.getImageHeight()*element1.getImageWidth() < minAreaCoveringPage){
|
||||
xObjectReader.destroy();
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
if(isContainingImageBigEnough) {
|
||||
elementFeaturesLinkedList.add(ElementFeatures.extractFeatures(element));
|
||||
}
|
||||
xObjectReader.destroy();
|
||||
} else {
|
||||
elementFeaturesLinkedList.add(ElementFeatures.extractFeatures(element));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
parameter
|
||||
*/
|
||||
private static List<ElementFeatures> filterSameFormObjectsOccuringOnMostPages(Map<Long, List<ElementFeatures>> formObjectsPerPage) {
|
||||
|
||||
int pageCount = formObjectsPerPage.keySet().size();
|
||||
int minPagesFilter = (int) (OCCURING_ON_PAGES_THRESHOLD_FACTOR * pageCount);
|
||||
|
||||
return formObjectsPerPage.values()
|
||||
.stream()
|
||||
.flatMap(Collection::stream)
|
||||
.filter(elementFeature -> formObjectsPerPage.values()
|
||||
.stream()
|
||||
.filter(elementFeaturesOnPage -> elementFeaturesOnPage.stream().anyMatch(elementFeature::almostMatches))
|
||||
.count() >= minPagesFilter)
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private static void removeAllWatermarks(PDFDoc pdfDoc, List<ElementFeatures> watermarksElementFeaturesList) {
|
||||
|
||||
ElementReader reader = new ElementReader();
|
||||
ElementWriter writer = new ElementWriter();
|
||||
Set<Long> visitedXObjIds = new TreeSet<>();
|
||||
|
||||
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
|
||||
|
||||
Page page = iterator.next();
|
||||
|
||||
writeAllElementsExceptWatermarks(page, reader, writer, watermarksElementFeaturesList, visitedXObjIds);
|
||||
|
||||
}
|
||||
|
||||
reader.destroy();
|
||||
writer.destroy();
|
||||
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private static void writeAllElementsExceptWatermarks(Page page,
|
||||
ElementReader reader,
|
||||
ElementWriter writer,
|
||||
List<ElementFeatures> watermarksElementFeaturesList,
|
||||
Set<Long> visitedXObjIds) {
|
||||
|
||||
reader.begin(page);
|
||||
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
|
||||
processElements(reader, writer, watermarksElementFeaturesList, visitedXObjIds);
|
||||
writer.end();
|
||||
reader.end();
|
||||
}
|
||||
|
||||
|
||||
private static void processElements(ElementReader reader,
|
||||
ElementWriter writer,
|
||||
List<ElementFeatures> watermarksElementFeaturesList,
|
||||
Set<Long> visitedXObjIds) throws PDFNetException {
|
||||
|
||||
for (Element element = reader.next(); element != null; element = reader.next())
|
||||
switch (element.getType()) {
|
||||
case Element.e_image, Element.e_inline_image -> removeImages(element,reader,writer, watermarksElementFeaturesList);
|
||||
case Element.e_form -> processForms(element, reader, writer, watermarksElementFeaturesList, visitedXObjIds);
|
||||
default -> writer.writeElement(element);
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private static void removeImages(Element element, ElementReader reader, ElementWriter writer, List<ElementFeatures> watermarksElementFeaturesList) {
|
||||
for (ElementFeatures elementFeatures : watermarksElementFeaturesList) {
|
||||
if (elementFeatures.almostMatches(element)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
writer.writeElement(element);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Maybe problem with visitedXObjIds, because, if on same page there are two identical xobjects
|
||||
but one is inside another xObject, the other is directly
|
||||
*/
|
||||
private static void processForms(Element element,
|
||||
ElementReader reader,
|
||||
ElementWriter writer,
|
||||
List<ElementFeatures> watermarksElementFeaturesList,
|
||||
Set<Long> visitedXObjIds) throws PDFNetException {
|
||||
|
||||
for (ElementFeatures elementFeatures : watermarksElementFeaturesList) {
|
||||
if (elementFeatures.almostMatches(element)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
writer.writeElement(element);
|
||||
|
||||
if (!visitedXObjIds.contains(element.getXObject().getObjNum())) {
|
||||
visitedXObjIds.add(element.getXObject().getObjNum());
|
||||
// writer needs to be newly initialized when entering a new content stream
|
||||
// see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
|
||||
ElementWriter formWriter = new ElementWriter();
|
||||
reader.formBegin();
|
||||
formWriter.begin(element.getXObject());
|
||||
|
||||
reader.clearChangeList();
|
||||
formWriter.setDefaultGState(reader);
|
||||
|
||||
processElements(reader, formWriter, watermarksElementFeaturesList, visitedXObjIds);
|
||||
formWriter.end();
|
||||
formWriter.destroy();
|
||||
reader.end();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private static void drawBBox(ElementWriter writer, Rectangle2D r, String hexcolor) {
|
||||
|
||||
ColorPt colorPt = new ColorPt(Integer.valueOf(hexcolor.substring(1, 3), 16) / 255d,
|
||||
Integer.valueOf(hexcolor.substring(3, 5), 16) / 255d,
|
||||
Integer.valueOf(hexcolor.substring(5, 7), 16) / 255d);
|
||||
ElementBuilder eb = new ElementBuilder();
|
||||
Element rect = eb.createRect(r.getX(), r.getY(), r.getWidth(), r.getHeight());
|
||||
rect.setPathStroke(true);
|
||||
rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
||||
rect.getGState().setStrokeColor(colorPt);
|
||||
writer.writePlacedElement(rect);
|
||||
|
||||
colorPt.destroy();
|
||||
eb.destroy();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,62 @@
|
||||
package com.iqser.red.pdftronlogic.commons;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Locale;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.platform.commons.util.StringUtils;
|
||||
|
||||
import com.pdftron.pdf.PDFNet;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
class WatermarkRemovalServiceTest {
|
||||
|
||||
@SneakyThrows
|
||||
@Test
|
||||
void removeWatermarks() {
|
||||
|
||||
PDFNet.addResourceSearchPath("C:/Users/RaphaelArnold/knecon/pdftron/ocrirismodule/Lib");
|
||||
PDFNet.initialize("demo:1650351709282:7bd235e003000000004ec28a6743e1163a085e2115de2536ab6e2cfe5a");
|
||||
|
||||
String filename = "files/18 - EVIDIS - Corrosao Irritacao ocular aguda.pdf";
|
||||
|
||||
String tmpFilename = createTmpFileName(filename, "WATERMARK_REMOVAL");
|
||||
try (var in = this.getClass().getClassLoader().getResourceAsStream(filename); var out = new FileOutputStream(tmpFilename)) {
|
||||
|
||||
{
|
||||
System.out.println(tmpFilename);
|
||||
WatermarkRemovalService.removeWatermarks(in, out);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static boolean isWindows() {
|
||||
|
||||
return System.getProperty("os.name").toLowerCase(Locale.ROOT).contains("windows");
|
||||
}
|
||||
|
||||
|
||||
public static String getTemporaryDirectory() {
|
||||
|
||||
String tmpdir = System.getProperty("java.io.tmpdir");
|
||||
if (isWindows() && StringUtils.isNotBlank(tmpdir)) {
|
||||
return tmpdir;
|
||||
}
|
||||
return "/tmp";
|
||||
}
|
||||
|
||||
|
||||
public static String createTmpFileName(String filename, String suffix) {
|
||||
|
||||
return Path.of(getTemporaryDirectory()).resolve(Path.of(filename).getFileName()).toString().replace(".pdf", "_" + suffix + ".pdf");
|
||||
}
|
||||
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user