diff --git a/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java b/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java
new file mode 100644
index 0000000..609b1d9
--- /dev/null
+++ b/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java
@@ -0,0 +1,371 @@
+package com.iqser.red.pdftronlogic.commons;
+
+import java.awt.Image;
+import java.awt.Toolkit;
+import java.awt.geom.Rectangle2D;
+import java.awt.image.BufferedImage;
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeSet;
+
+import javax.imageio.ImageIO;
+
+import com.pdftron.common.PDFNetException;
+import com.pdftron.filters.FileDescriptorFilter;
+import com.pdftron.filters.Filter;
+import com.pdftron.filters.FilterReader;
+import com.pdftron.filters.FilterWriter;
+import com.pdftron.pdf.ColorPt;
+import com.pdftron.pdf.ColorSpace;
+import com.pdftron.pdf.Element;
+import com.pdftron.pdf.ElementBuilder;
+import com.pdftron.pdf.ElementReader;
+import com.pdftron.pdf.ElementWriter;
+import com.pdftron.pdf.Image2RGB;
+import com.pdftron.pdf.Optimizer;
+import com.pdftron.pdf.PDFDoc;
+import com.pdftron.pdf.Page;
+import com.pdftron.pdf.PageIterator;
+import com.pdftron.pdf.Rect;
+import com.pdftron.sdf.SDFDoc;
+
+import lombok.SneakyThrows;
+
+/**
+ * Detects and removes watermark images from PDF documents.
+ *
+ * <p>An image is a watermark candidate when its bounding box covers at least {@link #AREA_THRESHOLD} of the page
+ * area. A candidate is treated as a watermark when matching candidates, as decided by
+ * {@code ElementFeatures.almostMatches}, are found on enough pages; see
+ * {@link #OCCURING_ON_PAGES_THRESHOLD_FACTOR}.
+ */
+public class WatermarkRemovalService {
+
+    static final double AREA_THRESHOLD = 0.6; // multiplied with page area
+    static final double OCCURING_ON_PAGES_THRESHOLD_FACTOR = 0.4; // multiplied with number of pages
+
+    // An element found on a single page cannot be told apart from regular content, so repetition is required.
+    private static final int MIN_PAGES_WITH_WATERMARK = 2;
+
+
+    /**
+     * Reads a PDF, removes all detected watermarks and saves the linearized result.
+     *
+     * @param pdfFile stream providing the source PDF
+     * @param out     stream the cleaned PDF is saved to
+     * @throws RuntimeException if saving the document fails
+     */
+    @SneakyThrows
+    public static void removeWatermarks(InputStream pdfFile, OutputStream out) {
+
+        PDFDoc pdfDoc = new PDFDoc(pdfFile);
+        // Close the document on every path, including failures during detection or removal.
+        try {
+            Map<Long, List<ElementFeatures>> formObjectsForPages = findAllFormObjectsAndImages(pdfDoc);
+            List<ElementFeatures> watermarkElementFeatures = filterSameFormObjectsOccuringOnMostPages(formObjectsForPages);
+
+            removeAllWatermarks(pdfDoc, watermarkElementFeatures);
+
+            try {
+                pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
+            } catch (Exception e) {
+                throw new RuntimeException(e);
+            }
+        } finally {
+            pdfDoc.close();
+        }
+    }
+
+
+    /**
+     * Collects, per page, the features of every image whose bounding box covers at least
+     * {@link #AREA_THRESHOLD} of the page area.
+     *
+     * <p>Form XObjects are not inspected yet; {@code processXObject} holds the unfinished logic for them.
+     *
+     * @param pdfDoc document to scan
+     * @return candidate features keyed by the object number of the page they were found on
+     */
+    @SneakyThrows
+    private static Map<Long, List<ElementFeatures>> findAllFormObjectsAndImages(PDFDoc pdfDoc) {
+
+        Map<Long, List<ElementFeatures>> formObjectsAndImagesForPages = new HashMap<>();
+        ElementReader reader = new ElementReader();
+
+        try {
+            for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
+
+                Page page = iterator.next();
+                double minAreaCoveringFromPage = AREA_THRESHOLD * page.getPageHeight() * page.getPageWidth();
+                List<ElementFeatures> elementFeaturesOnPage = new ArrayList<>();
+
+                reader.begin(page);
+                try {
+                    for (Element element = reader.next(); element != null; element = reader.next()) {
+                        if (isImage(element) && coversAtLeast(element, minAreaCoveringFromPage)) {
+                            elementFeaturesOnPage.add(ElementFeatures.extractFeatures(element));
+                        }
+                    }
+                } finally {
+                    // Release the page before the reader is pointed at the next one.
+                    reader.end();
+                }
+
+                formObjectsAndImagesForPages.put(page.getSDFObj().getObjNum(), elementFeaturesOnPage);
+            }
+        } finally {
+            reader.destroy();
+        }
+
+        return formObjectsAndImagesForPages;
+    }
+
+
+    /** Tells whether the element is an image XObject or an inline image. */
+    private static boolean isImage(Element element) throws PDFNetException {
+
+        int type = element.getType();
+        return type == Element.e_image || type == Element.e_inline_image;
+    }
+
+
+    /** Tells whether the element has a bounding box with an area of at least {@code minArea}. */
+    private static boolean coversAtLeast(Element element, double minArea) throws PDFNetException {
+
+        Rect bbox = element.getBBox();
+        return bbox != null && bbox.getHeight() * bbox.getWidth() >= minArea;
+    }
+
+
+    /**
+     * Unfinished detection of watermark candidates inside form XObjects; currently not called.
+     *
+     * <p>Records the features of {@code element} unless the form directly contains an image whose pixel area is
+     * below {@code minAreaCoveringPage}. Forms already contained in {@code visitedXObjIds} are recorded without
+     * being read again.
+     *
+     * <p>NOTE(review): the result of a nested form replaces the results of earlier nested forms, and image
+     * dimensions are compared against a page area - confirm both are intended before enabling this.
+     *
+     * @return {@code false} if an image directly inside this form is too small, {@code true} otherwise
+     */
+    @SneakyThrows
+    private static boolean processXObject(Element element,
+                                          Set<Long> visitedXObjIds,
+                                          List<ElementFeatures> elementFeaturesList,
+                                          double minAreaCoveringPage) {
+
+        if (!visitedXObjIds.add(element.getXObject().getObjNum())) {
+            elementFeaturesList.add(ElementFeatures.extractFeatures(element));
+            return true;
+        }
+
+        ElementReader xObjectReader = new ElementReader();
+        try {
+            xObjectReader.begin(element.getXObject());
+            boolean isContainingImageBigEnough = true;
+            for (Element child = xObjectReader.next(); child != null; child = xObjectReader.next()) {
+                if (child.getType() == Element.e_form) {
+                    isContainingImageBigEnough = processXObject(child, visitedXObjIds, elementFeaturesList, minAreaCoveringPage);
+                } else if (isImage(child) && child.getImageHeight() * child.getImageWidth() < minAreaCoveringPage) {
+                    return false;
+                }
+            }
+            if (isContainingImageBigEnough) {
+                elementFeaturesList.add(ElementFeatures.extractFeatures(element));
+            }
+            return true;
+        } finally {
+            // Destroy the reader on every exit, including the early return above.
+            xObjectReader.destroy();
+        }
+    }
+
+
+    /**
+     * Selects the candidates that occur on enough pages to be considered a watermark.
+     *
+     * <p>The required page count is {@link #OCCURING_ON_PAGES_THRESHOLD_FACTOR} times the number of pages, rounded
+     * down, but never less than {@code MIN_PAGES_WITH_WATERMARK}. Without that lower bound the requirement drops to
+     * zero or one page for documents of up to four pages, which any candidate that matches itself fulfils.
+     *
+     * @param formObjectsPerPage candidate features per page
+     * @return every candidate that has matches on enough pages, in the iteration order of the map values
+     */
+    private static List<ElementFeatures> filterSameFormObjectsOccuringOnMostPages(Map<Long, List<ElementFeatures>> formObjectsPerPage) {
+
+        int pageCount = formObjectsPerPage.size();
+        int minPagesFilter = Math.max(MIN_PAGES_WITH_WATERMARK, (int) (OCCURING_ON_PAGES_THRESHOLD_FACTOR * pageCount));
+        Collection<List<ElementFeatures>> featuresOfAllPages = formObjectsPerPage.values();
+
+        return featuresOfAllPages.stream()
+                .flatMap(Collection::stream)
+                .filter(elementFeature -> countPagesContaining(featuresOfAllPages, elementFeature) >= minPagesFilter)
+                .toList();
+    }
+
+
+    /** Counts the pages that hold at least one element matching {@code elementFeature}. */
+    private static long countPagesContaining(Collection<List<ElementFeatures>> featuresOfAllPages, ElementFeatures elementFeature) {
+
+        return featuresOfAllPages.stream()
+                .filter(elementFeaturesOnPage -> elementFeaturesOnPage.stream().anyMatch(elementFeature::almostMatches))
+                .count();
+    }
+
+
+    /**
+     * Rewrites the content of every page, leaving out all elements that match a watermark.
+     *
+     * @param pdfDoc                        document to modify in place
+     * @param watermarksElementFeaturesList features of the elements to remove
+     */
+    @SneakyThrows
+    private static void removeAllWatermarks(PDFDoc pdfDoc, List<ElementFeatures> watermarksElementFeaturesList) {
+
+        Set<Long> visitedXObjIds = new TreeSet<>();
+        ElementReader reader = new ElementReader();
+        try {
+            ElementWriter writer = new ElementWriter();
+            try {
+                for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
+                    writeAllElementsExceptWatermarks(iterator.next(), reader, writer, watermarksElementFeaturesList, visitedXObjIds);
+                }
+            } finally {
+                writer.destroy();
+            }
+        } finally {
+            reader.destroy();
+        }
+    }
+
+
+    /** Replaces the content of {@code page} with a copy that omits the watermark elements. */
+    @SneakyThrows
+    private static void writeAllElementsExceptWatermarks(Page page,
+                                                         ElementReader reader,
+                                                         ElementWriter writer,
+                                                         List<ElementFeatures> watermarksElementFeaturesList,
+                                                         Set<Long> visitedXObjIds) {
+
+        reader.begin(page);
+        writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
+        processElements(reader, writer, watermarksElementFeaturesList, visitedXObjIds);
+        writer.end();
+        reader.end();
+    }
+
+
+    /** Copies the remaining elements of {@code reader} to {@code writer}, filtering images and form XObjects. */
+    private static void processElements(ElementReader reader,
+                                        ElementWriter writer,
+                                        List<ElementFeatures> watermarksElementFeaturesList,
+                                        Set<Long> visitedXObjIds) throws PDFNetException {
+
+        for (Element element = reader.next(); element != null; element = reader.next()) {
+            switch (element.getType()) {
+                case Element.e_image, Element.e_inline_image -> removeImages(element, writer, watermarksElementFeaturesList);
+                case Element.e_form -> processForms(element, reader, writer, watermarksElementFeaturesList, visitedXObjIds);
+                default -> writer.writeElement(element);
+            }
+        }
+    }
+
+
+    /** Tells whether {@code element} matches any of the given watermark features. */
+    private static boolean isWatermark(Element element, List<ElementFeatures> watermarksElementFeaturesList) throws PDFNetException {
+
+        for (ElementFeatures elementFeatures : watermarksElementFeaturesList) {
+            if (elementFeatures.almostMatches(element)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+
+    /** Writes the image to {@code writer} unless it is a watermark. */
+    private static void removeImages(Element element, ElementWriter writer, List<ElementFeatures> watermarksElementFeaturesList) throws PDFNetException {
+
+        if (!isWatermark(element, watermarksElementFeaturesList)) {
+            writer.writeElement(element);
+        }
+    }
+
+
+    /**
+     * Writes the form XObject unless it is a watermark, then filters the content of forms not processed before.
+     *
+     * <p>NOTE(review): there may be a problem with {@code visitedXObjIds} when two identical XObjects occur on the
+     * same page, one referenced directly and the other from inside another XObject.
+     */
+    private static void processForms(Element element,
+                                     ElementReader reader,
+                                     ElementWriter writer,
+                                     List<ElementFeatures> watermarksElementFeaturesList,
+                                     Set<Long> visitedXObjIds) throws PDFNetException {
+
+        if (isWatermark(element, watermarksElementFeaturesList)) {
+            return;
+        }
+
+        writer.writeElement(element);
+
+        if (!visitedXObjIds.add(element.getXObject().getObjNum())) {
+            return;
+        }
+
+        // writer needs to be newly initialized when entering a new content stream
+        // see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
+        ElementWriter formWriter = new ElementWriter();
+        try {
+            reader.formBegin();
+            formWriter.begin(element.getXObject());
+
+            reader.clearChangeList();
+            formWriter.setDefaultGState(reader);
+
+            processElements(reader, formWriter, watermarksElementFeaturesList, visitedXObjIds);
+            formWriter.end();
+            reader.end();
+        } finally {
+            formWriter.destroy();
+        }
+    }
+
+
+    /**
+     * Strokes the outline of {@code r} in the given color; currently not called.
+     *
+     * @param hexcolor color as {@code #RRGGBB}
+     */
+    @SneakyThrows
+    private static void drawBBox(ElementWriter writer, Rectangle2D r, String hexcolor) {
+
+        ColorPt colorPt = new ColorPt(Integer.parseInt(hexcolor.substring(1, 3), 16) / 255d,
+                                      Integer.parseInt(hexcolor.substring(3, 5), 16) / 255d,
+                                      Integer.parseInt(hexcolor.substring(5, 7), 16) / 255d);
+        ElementBuilder eb = new ElementBuilder();
+        try {
+            Element rect = eb.createRect(r.getX(), r.getY(), r.getWidth(), r.getHeight());
+            rect.setPathStroke(true);
+            rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
+            rect.getGState().setStrokeColor(colorPt);
+            writer.writePlacedElement(rect);
+        } finally {
+            colorPt.destroy();
+            eb.destroy();
+        }
+    }
+
+}
diff --git a/src/test/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalServiceTest.java b/src/test/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalServiceTest.java
new file mode 100644
index 0000000..e5ed3fb
--- /dev/null
+++ b/src/test/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalServiceTest.java
@@ -0,0 +1,69 @@
+package com.iqser.red.pdftronlogic.commons;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.nio.file.Path;
+import java.util.Locale;
+
+import org.junit.jupiter.api.Test;
+import org.junit.platform.commons.util.StringUtils;
+
+import com.pdftron.pdf.PDFNet;
+
+import lombok.SneakyThrows;
+
+/** Runs the watermark removal on a sample PDF from the test resources. */
+class WatermarkRemovalServiceTest {
+
+    @SneakyThrows
+    @Test
+    void removeWatermarks() {
+
+        // NOTE(review): the search path is machine-specific and the license key is checked in; both should come
+        // from configuration so the test can run on other machines.
+        PDFNet.addResourceSearchPath("C:/Users/RaphaelArnold/knecon/pdftron/ocrirismodule/Lib");
+        PDFNet.initialize("demo:1650351709282:7bd235e003000000004ec28a6743e1163a085e2115de2536ab6e2cfe5a");
+
+        String filename = "files/18 - EVIDIS - Corrosao Irritacao ocular aguda.pdf";
+        String tmpFilename = createTmpFileName(filename, "WATERMARK_REMOVAL");
+
+        try (var in = this.getClass().getClassLoader().getResourceAsStream(filename)) {
+            // getResourceAsStream returns null instead of throwing when the resource is missing.
+            assertNotNull(in, "test resource not found: " + filename);
+            try (var out = new FileOutputStream(tmpFilename)) {
+                System.out.println(tmpFilename);
+                WatermarkRemovalService.removeWatermarks(in, out);
+            }
+        }
+
+        assertTrue(new File(tmpFilename).length() > 0, "no output written to " + tmpFilename);
+    }
+
+
+    private static boolean isWindows() {
+
+        return System.getProperty("os.name").toLowerCase(Locale.ROOT).contains("windows");
+    }
+
+
+    /** Returns {@code java.io.tmpdir} on Windows when it is not blank, {@code /tmp} otherwise. */
+    public static String getTemporaryDirectory() {
+
+        String tmpdir = System.getProperty("java.io.tmpdir");
+        if (isWindows() && StringUtils.isNotBlank(tmpdir)) {
+            return tmpdir;
+        }
+        return "/tmp";
+    }
+
+
+    /** Builds a path in the temporary directory from the file name, inserting {@code _suffix} before {@code .pdf}. */
+    public static String createTmpFileName(String filename, String suffix) {
+
+        return Path.of(getTemporaryDirectory()).resolve(Path.of(filename).getFileName()).toString().replace(".pdf", "_" + suffix + ".pdf");
+    }
+
+}
\ No newline at end of file