RED-7075: New watermark removal logic

This commit is contained in:
RaphaelArnold 2023-07-19 12:49:55 +02:00
parent 71cdb62229
commit acd300ebc9
2 changed files with 410 additions and 0 deletions

View File

@ -0,0 +1,348 @@
package com.iqser.red.pdftronlogic.commons;
import java.awt.Image;
import java.awt.Toolkit;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import javax.imageio.ImageIO;
import com.pdftron.common.PDFNetException;
import com.pdftron.filters.FileDescriptorFilter;
import com.pdftron.filters.Filter;
import com.pdftron.filters.FilterReader;
import com.pdftron.filters.FilterWriter;
import com.pdftron.pdf.ColorPt;
import com.pdftron.pdf.ColorSpace;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementBuilder;
import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.Image2RGB;
import com.pdftron.pdf.Optimizer;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
/**
 * Detects and removes watermarks from PDF documents using the PDFTron SDK.
 *
 * <p>Detection works in two steps: first every image whose bounding box covers at least
 * {@link #AREA_THRESHOLD} of its page is collected as a candidate, then only candidates that
 * recur on enough pages (see {@link #OCCURING_ON_PAGES_THRESHOLD_FACTOR}) are kept. Removal
 * rewrites each page content stream (and nested form XObjects) while skipping every element that
 * matches one of the detected watermark features.
 *
 * <p>Form XObjects are currently not collected as candidates; only images are.
 */
public class WatermarkRemovalService {

    /** Minimum fraction of the page area an element's bounding box must cover to be a candidate. */
    static final double AREA_THRESHOLD = 0.6; // multiplied with page area

    /** Minimum fraction of pages a candidate must (almost) recur on to count as a watermark. */
    static final double OCCURING_ON_PAGES_THRESHOLD_FACTOR = 0.4; // multiplied with number of pages

    /**
     * Lower bound for the page-count threshold. Without it, short documents yield a threshold of
     * 0 or 1, so any large image would be classified as a watermark merely by being present
     * (presumably a candidate matches itself - verify against ElementFeatures.almostMatches).
     */
    private static final int MIN_PAGES_WITH_WATERMARK = 2;

    /**
     * Reads a PDF from {@code pdfFile}, removes all detected watermarks and writes the linearized
     * result to {@code out}.
     *
     * @param pdfFile stream containing the source PDF
     * @param out     stream receiving the cleaned PDF
     * @throws RuntimeException wrapping any failure that occurs while saving; failures during
     *                          detection or removal are rethrown as-is (via {@code @SneakyThrows})
     */
    @SneakyThrows
    public static void removeWatermarks(InputStream pdfFile, OutputStream out) {
        PDFDoc pdfDoc = new PDFDoc(pdfFile);
        // The document is closed even if detection or removal fails, not only when saving fails.
        try {
            Map<Long, List<ElementFeatures>> candidatesPerPage = findAllFormObjectsAndImages(pdfDoc);
            List<ElementFeatures> watermarkElementFeatures = filterSameFormObjectsOccuringOnMostPages(candidatesPerPage);
            storeWatermarkImageHashValues(watermarkElementFeatures);
            removeAllWatermarks(pdfDoc, watermarkElementFeatures);
            try {
                pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        } finally {
            pdfDoc.close();
        }
    }

    /**
     * Placeholder for hashing the image data of detected watermarks; it currently has no effect.
     *
     * @param watermarkElementFeatures features of the detected watermarks
     */
    private static void storeWatermarkImageHashValues(List<ElementFeatures> watermarkElementFeatures) {
        for (ElementFeatures elementFeatures : watermarkElementFeatures) {
            if (elementFeatures.getElementType() == Element.e_image || elementFeatures.getElementType() == Element.e_inline_image) {
                // TODO: image hashing is not implemented yet.
            }
        }
    }

    /**
     * Collects, per page, the features of every image whose bounding box covers at least
     * {@link #AREA_THRESHOLD} of the page area.
     *
     * @param pdfDoc document to scan
     * @return candidate features keyed by the object number of the page's SDF object
     */
    @SneakyThrows
    private static Map<Long, List<ElementFeatures>> findAllFormObjectsAndImages(PDFDoc pdfDoc) {
        Map<Long, List<ElementFeatures>> candidatesPerPage = new HashMap<>();
        ElementReader reader = new ElementReader();
        try {
            for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
                Page page = iterator.next();
                double minAreaCoveringFromPage = AREA_THRESHOLD * page.getPageHeight() * page.getPageWidth();
                List<ElementFeatures> pageCandidates = new LinkedList<>();

                reader.begin(page);
                for (Element element = reader.next(); element != null; element = reader.next()) {
                    var bbox = element.getBBox();
                    if (bbox == null || bbox.getHeight() * bbox.getWidth() < minAreaCoveringFromPage) {
                        continue;
                    }
                    int type = element.getType();
                    if (type == Element.e_image || type == Element.e_inline_image) {
                        processImages(element, pageCandidates);
                    }
                    // TODO: form XObjects (Element.e_form) are intentionally not collected yet;
                    // processXObject(...) holds the draft logic for that case.
                }
                // Every begin() is paired with end() before the reader is reused for the next page.
                reader.end();

                candidatesPerPage.put(page.getSDFObj().getObjNum(), pageCandidates);
            }
        } finally {
            reader.destroy();
        }
        return candidatesPerPage;
    }

    /**
     * Extracts the features of an image element and records them as a candidate for its page.
     *
     * @param element        image or inline-image element
     * @param pageCandidates candidate list of the page currently being scanned
     */
    @SneakyThrows
    private static void processImages(Element element, List<ElementFeatures> pageCandidates) {
        pageCandidates.add(ElementFeatures.extractFeatures(element));
    }

    /**
     * Draft logic for collecting form XObjects as candidates; currently not called.
     *
     * <p>A form seen for the first time is scanned recursively and only recorded if none of the
     * images it (transitively) contains is smaller than {@code minAreaCoveringPage}. A form whose
     * XObject was already visited is recorded without being scanned again.
     *
     * <p>NOTE(review): the image size used here is {@code getImageHeight() * getImageWidth()},
     * while the threshold is derived from page dimensions - confirm both use comparable units.
     *
     * @param element             form element to inspect
     * @param visitedXObjIds      object numbers of the XObjects already scanned
     * @param pageCandidates      candidate list of the page currently being scanned
     * @param minAreaCoveringPage minimum area a contained image must have
     * @return {@code false} if the form (or a nested form) contains an image that is too small
     */
    @SneakyThrows
    private static boolean processXObject(Element element,
                                          Set<Long> visitedXObjIds,
                                          List<ElementFeatures> pageCandidates,
                                          double minAreaCoveringPage) {
        var xObject = element.getXObject();
        if (!visitedXObjIds.add(xObject.getObjNum())) {
            pageCandidates.add(ElementFeatures.extractFeatures(element));
            return true;
        }

        boolean allImagesBigEnough = true;
        ElementReader xObjectReader = new ElementReader();
        try {
            xObjectReader.begin(xObject);
            for (Element child = xObjectReader.next(); child != null; child = xObjectReader.next()) {
                int type = child.getType();
                if (type == Element.e_form) {
                    // Accumulate instead of overwrite, so a later nested form cannot hide an earlier failure.
                    if (!processXObject(child, visitedXObjIds, pageCandidates, minAreaCoveringPage)) {
                        allImagesBigEnough = false;
                    }
                } else if (type == Element.e_image || type == Element.e_inline_image) {
                    // Widen before multiplying so very large images cannot overflow int.
                    if ((double) child.getImageHeight() * child.getImageWidth() < minAreaCoveringPage) {
                        return false;
                    }
                }
            }
        } finally {
            xObjectReader.destroy();
        }

        if (allImagesBigEnough) {
            pageCandidates.add(ElementFeatures.extractFeatures(element));
        }
        return allImagesBigEnough;
    }

    /**
     * Keeps only those candidates that (almost) recur on enough pages to be considered a watermark.
     *
     * <p>The required number of pages is {@link #OCCURING_ON_PAGES_THRESHOLD_FACTOR} times the
     * page count, but never less than {@link #MIN_PAGES_WITH_WATERMARK}. The result may contain
     * one entry per page for the same watermark.
     *
     * @param formObjectsPerPage candidate features per page
     * @return features of all candidates classified as watermarks
     */
    private static List<ElementFeatures> filterSameFormObjectsOccuringOnMostPages(Map<Long, List<ElementFeatures>> formObjectsPerPage) {
        int pageCount = formObjectsPerPage.size();
        int minPagesFilter = Math.max(MIN_PAGES_WITH_WATERMARK, (int) (OCCURING_ON_PAGES_THRESHOLD_FACTOR * pageCount));
        Collection<List<ElementFeatures>> pages = formObjectsPerPage.values();

        return pages.stream()
                .flatMap(Collection::stream)
                .filter(candidate -> countPagesContaining(pages, candidate) >= minPagesFilter)
                .toList();
    }

    /** Counts the pages that hold at least one element almost matching {@code candidate}. */
    private static long countPagesContaining(Collection<List<ElementFeatures>> pages, ElementFeatures candidate) {
        return pages.stream()
                .filter(featuresOnPage -> featuresOnPage.stream().anyMatch(candidate::almostMatches))
                .count();
    }

    /**
     * Rewrites every page of the document without the elements matching the given watermarks.
     *
     * @param pdfDoc                        document to modify in place
     * @param watermarksElementFeaturesList features of the watermarks to drop
     */
    @SneakyThrows
    private static void removeAllWatermarks(PDFDoc pdfDoc, List<ElementFeatures> watermarksElementFeaturesList) {
        ElementReader reader = new ElementReader();
        ElementWriter writer = new ElementWriter();
        Set<Long> visitedXObjIds = new TreeSet<>();
        try {
            for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
                Page page = iterator.next();
                writeAllElementsExceptWatermarks(page, reader, writer, watermarksElementFeaturesList, visitedXObjIds);
            }
        } finally {
            try {
                reader.destroy();
            } finally {
                writer.destroy();
            }
        }
    }

    /**
     * Replaces the content of a single page with a copy that omits all watermark elements.
     *
     * @param page                          page to rewrite
     * @param reader                        shared element reader
     * @param writer                        shared element writer
     * @param watermarksElementFeaturesList features of the watermarks to drop
     * @param visitedXObjIds                object numbers of form XObjects already rewritten
     */
    @SneakyThrows
    private static void writeAllElementsExceptWatermarks(Page page,
                                                         ElementReader reader,
                                                         ElementWriter writer,
                                                         List<ElementFeatures> watermarksElementFeaturesList,
                                                         Set<Long> visitedXObjIds) {
        reader.begin(page);
        writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
        processElements(reader, writer, watermarksElementFeaturesList, visitedXObjIds);
        writer.end();
        reader.end();
    }

    /**
     * Copies all remaining elements from {@code reader} to {@code writer}, filtering images and
     * form XObjects through the watermark check.
     */
    private static void processElements(ElementReader reader,
                                        ElementWriter writer,
                                        List<ElementFeatures> watermarksElementFeaturesList,
                                        Set<Long> visitedXObjIds) throws PDFNetException {
        for (Element element = reader.next(); element != null; element = reader.next()) {
            switch (element.getType()) {
                case Element.e_image, Element.e_inline_image -> removeImages(element, writer, watermarksElementFeaturesList);
                case Element.e_form -> processForms(element, reader, writer, watermarksElementFeaturesList, visitedXObjIds);
                default -> writer.writeElement(element);
            }
        }
    }

    /** Returns whether {@code element} almost matches any of the given watermark features. */
    private static boolean matchesAnyWatermark(Element element, List<ElementFeatures> watermarksElementFeaturesList) throws PDFNetException {
        for (ElementFeatures watermark : watermarksElementFeaturesList) {
            if (watermark.almostMatches(element)) {
                return true;
            }
        }
        return false;
    }

    /** Writes an image element unless it matches a watermark, in which case it is dropped. */
    @SneakyThrows
    private static void removeImages(Element element, ElementWriter writer, List<ElementFeatures> watermarksElementFeaturesList) {
        if (!matchesAnyWatermark(element, watermarksElementFeaturesList)) {
            writer.writeElement(element);
        }
    }

    /**
     * Writes a form XObject reference unless it matches a watermark, then rewrites the form's own
     * content stream once per XObject.
     *
     * <p>NOTE(review): because {@code visitedXObjIds} is shared, a form that is referenced both
     * directly and from inside another form is only rewritten at its first occurrence.
     */
    private static void processForms(Element element,
                                     ElementReader reader,
                                     ElementWriter writer,
                                     List<ElementFeatures> watermarksElementFeaturesList,
                                     Set<Long> visitedXObjIds) throws PDFNetException {
        if (matchesAnyWatermark(element, watermarksElementFeaturesList)) {
            return;
        }
        writer.writeElement(element);

        var formObject = element.getXObject();
        if (!visitedXObjIds.add(formObject.getObjNum())) {
            return;
        }

        // writer needs to be newly initialized when entering a new content stream
        // see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
        ElementWriter formWriter = new ElementWriter();
        try {
            reader.formBegin();
            formWriter.begin(formObject);
            reader.clearChangeList();
            formWriter.setDefaultGState(reader);
            processElements(reader, formWriter, watermarksElementFeaturesList, visitedXObjIds);
            formWriter.end();
        } finally {
            formWriter.destroy();
        }
        reader.end();
    }

    /**
     * Debug helper that strokes the outline of {@code r} in the given colour; currently unused.
     *
     * @param writer   writer of the content stream to draw into
     * @param r        rectangle to outline
     * @param hexcolor colour as a string whose characters 1-6 are the hex digits RRGGBB (e.g. "#ff0000")
     */
    @SneakyThrows
    private static void drawBBox(ElementWriter writer, Rectangle2D r, String hexcolor) {
        ColorPt colorPt = new ColorPt(hexChannel(hexcolor, 1), hexChannel(hexcolor, 3), hexChannel(hexcolor, 5));
        try {
            ElementBuilder eb = new ElementBuilder();
            try {
                Element rect = eb.createRect(r.getX(), r.getY(), r.getWidth(), r.getHeight());
                rect.setPathStroke(true);
                rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
                rect.getGState().setStrokeColor(colorPt);
                writer.writePlacedElement(rect);
            } finally {
                eb.destroy();
            }
        } finally {
            colorPt.destroy();
        }
    }

    /** Parses the two hex digits starting at {@code start} and scales them to the range 0..1. */
    private static double hexChannel(String hexcolor, int start) {
        return Integer.parseInt(hexcolor.substring(start, start + 2), 16) / 255d;
    }
}

View File

@ -0,0 +1,62 @@
package com.iqser.red.pdftronlogic.commons;
import static org.junit.jupiter.api.Assertions.*;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.nio.file.Path;
import java.util.Locale;
import org.junit.jupiter.api.Test;
import org.junit.platform.commons.util.StringUtils;
import com.pdftron.pdf.PDFNet;
import lombok.SneakyThrows;
/**
 * Smoke test for {@link WatermarkRemovalService}: runs watermark removal on a sample PDF from the
 * classpath and writes the result into the temporary directory for manual inspection.
 */
class WatermarkRemovalServiceTest {

    /**
     * Removes the watermarks from the sample document and checks that a non-empty file is written.
     */
    @SneakyThrows
    @Test
    void removeWatermarks() {
        // NOTE(review): the resource path is specific to one developer machine and the license key
        // is embedded in source - consider moving both to configuration.
        PDFNet.addResourceSearchPath("C:/Users/RaphaelArnold/knecon/pdftron/ocrirismodule/Lib");
        PDFNet.initialize("demo:1650351709282:7bd235e003000000004ec28a6743e1163a085e2115de2536ab6e2cfe5a");

        String filename = "files/18 - EVIDIS - Corrosao Irritacao ocular aguda.pdf";
        String tmpFilename = createTmpFileName(filename, "WATERMARK_REMOVAL");

        try (var in = this.getClass().getClassLoader().getResourceAsStream(filename)) {
            // Fail with a clear message instead of handing a null stream to the service.
            assertNotNull(in, "Test resource not found on classpath: " + filename);
            try (var out = new FileOutputStream(tmpFilename)) {
                System.out.println(tmpFilename);
                WatermarkRemovalService.removeWatermarks(in, out);
            }
        }

        assertTrue(new File(tmpFilename).length() > 0, "No output was written to " + tmpFilename);
    }

    /** Returns whether the {@code os.name} system property contains "windows" (case-insensitive). */
    private static boolean isWindows() {
        return System.getProperty("os.name").toLowerCase(Locale.ROOT).contains("windows");
    }

    /**
     * Returns the directory used for test output: the {@code java.io.tmpdir} system property on
     * Windows when it is set and not blank, otherwise {@code /tmp}.
     *
     * @return path of the temporary directory
     */
    public static String getTemporaryDirectory() {
        String tmpdir = System.getProperty("java.io.tmpdir");
        // Plain JDK check instead of JUnit's internal StringUtils helper.
        if (isWindows() && tmpdir != null && !tmpdir.isBlank()) {
            return tmpdir;
        }
        return "/tmp";
    }

    /**
     * Builds the output path for a test file: the file name of {@code filename} placed in the
     * temporary directory, with {@code "_" + suffix} inserted before each {@code ".pdf"}.
     *
     * @param filename source file name, possibly with parent directories
     * @param suffix   marker inserted before the extension
     * @return path of the output file as a string
     */
    public static String createTmpFileName(String filename, String suffix) {
        return Path.of(getTemporaryDirectory()).resolve(Path.of(filename).getFileName()).toString().replace(".pdf", "_" + suffix + ".pdf");
    }
}