From a6e10ad5b9619bb27d462130a47b8df188fc8d15 Mon Sep 17 00:00:00 2001 From: deiflaender Date: Mon, 14 Aug 2023 13:11:46 +0200 Subject: [PATCH] RED-7080: Remove watermarks that are named as watermarks in OCG --- .../commons/WatermarkRemovalService.java | 106 ++++++++++-------- 1 file changed, 61 insertions(+), 45 deletions(-) diff --git a/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java b/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java index bd40494..7aab0c4 100644 --- a/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java +++ b/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java @@ -1,26 +1,17 @@ package com.iqser.red.pdftronlogic.commons; +import com.pdftron.common.PDFNetException; +import com.pdftron.pdf.*; +import com.pdftron.pdf.ocg.Group; +import com.pdftron.pdf.ocg.OCMD; +import com.pdftron.sdf.Obj; +import com.pdftron.sdf.SDFDoc; +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; + import java.io.InputStream; import java.io.OutputStream; -import java.util.Collection; -import java.util.HashMap; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.TreeSet; - -import com.pdftron.common.PDFNetException; -import com.pdftron.pdf.Element; -import com.pdftron.pdf.ElementReader; -import com.pdftron.pdf.ElementWriter; -import com.pdftron.pdf.PDFDoc; -import com.pdftron.pdf.Page; -import com.pdftron.pdf.PageIterator; -import com.pdftron.sdf.SDFDoc; - -import lombok.SneakyThrows; -import lombok.extern.slf4j.Slf4j; +import java.util.*; @Slf4j public class WatermarkRemovalService { @@ -36,15 +27,16 @@ public class WatermarkRemovalService { * First the possible watermarks (big XObjects or Images) will be detected and then checked if those appear on most pages according to the * OCCURING_ON_PAGES_THRESHOLD_FACTOR by using image hashing for similarity and size and stream size of the 
xobjects. * If so, these detected and confirmed will not be written to the pdf file. + * * @param pdfFile PDFFile to remove watermarks - * @param out The OutputStream the final file will be written to + * @param out The OutputStream the final file will be written to */ @SneakyThrows public void removeWatermarks(InputStream pdfFile, OutputStream out) { PDFDoc pdfDoc = new PDFDoc(pdfFile); - if(pdfDoc.getPageCount() < MIN_PAGES_THRESHOLD){ + if (pdfDoc.getPageCount() < MIN_PAGES_THRESHOLD) { log.info("Document page count {} is below threshold {}", pdfDoc.getPageCount(), MIN_PAGES_THRESHOLD); } else { Map> formObjectsForPages = findAllFormObjectsAndImages(pdfDoc); @@ -79,7 +71,6 @@ public class WatermarkRemovalService { ElementReader reader = new ElementReader(); - for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) { Page page = iterator.next(); @@ -103,10 +94,10 @@ public class WatermarkRemovalService { private void processElement(Element element, - Set visitedXObjIds, - List elementFeaturesLinkedList, - List formObjectsOccuringMoreThanOnceOnAPage, - double minAreaCoveringPage) throws PDFNetException { + Set visitedXObjIds, + List elementFeaturesLinkedList, + List formObjectsOccuringMoreThanOnceOnAPage, + double minAreaCoveringPage) throws PDFNetException { if (element.getBBox() == null) { return; @@ -137,10 +128,10 @@ public class WatermarkRemovalService { @SneakyThrows private void processXObject(Element element, - Set visitedXObjIds, - List elementFeaturesLinkedList, - List formObjectsOccuringMoreThanOnceOnAPage, - double minAreaCoveringPage) { + Set visitedXObjIds, + List elementFeaturesLinkedList, + List formObjectsOccuringMoreThanOnceOnAPage, + double minAreaCoveringPage) { if (visitedXObjIds.add(element.getXObject().getObjNum())) { ElementReader xObjectReader = new ElementReader(); @@ -198,10 +189,10 @@ public class WatermarkRemovalService { @SneakyThrows private void writeAllElementsExceptWatermarks(Page page, - ElementReader reader, 
- ElementWriter writer, - List watermarksElementFeaturesList, - Set visitedXObjIds) { + ElementReader reader, + ElementWriter writer, + List watermarksElementFeaturesList, + Set visitedXObjIds) { reader.begin(page); writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict()); @@ -212,14 +203,18 @@ public class WatermarkRemovalService { private void processElements(Page page, - ElementReader reader, - ElementWriter writer, - List watermarksElementFeaturesList, - Set visitedXObjIds) throws PDFNetException { + ElementReader reader, + ElementWriter writer, + List watermarksElementFeaturesList, + Set visitedXObjIds) throws PDFNetException { double minAreaCoveringFromPage = AREA_THRESHOLD * page.getPageHeight() * page.getPageWidth(); for (Element element = reader.next(); element != null; element = reader.next()) { + if (inOCGWatermark(element)) { + continue; + } + switch (element.getType()) { case Element.e_image, Element.e_inline_image -> { if (element.getBBox() == null) { @@ -232,7 +227,8 @@ public class WatermarkRemovalService { } removeImages(element, writer, watermarksElementFeaturesList); } - case Element.e_form -> processForms(page, element, reader, writer, watermarksElementFeaturesList, visitedXObjIds); + case Element.e_form -> + processForms(page, element, reader, writer, watermarksElementFeaturesList, visitedXObjIds); default -> writer.writeElement(element); } } @@ -240,7 +236,27 @@ public class WatermarkRemovalService { @SneakyThrows - private void removeImages(Element element, ElementWriter writer, List watermarksElementFeaturesList) { + private boolean inOCGWatermark(Element element) { // reports whether the element's XObject has an /OC entry whose optional-content group is named "Watermark" + var xObj = element.getXObject(); + if (xObj != null) { + Obj oc = xObj.findObj("OC"); + if (oc != null) { + OCMD ocmd = new OCMD(oc); + if (ocmd.isValid()) { + Group group = new Group(ocmd.getOCGs()); // NOTE(review): per the PDF spec /OCGs may also be an array of groups; only the single-dictionary form is wrapped here, confirm how Group treats an array + if (group.isValid() && "Watermark".equals(group.getName())) { // constant-first equals, so a group without a name cannot raise a NullPointerException + return true; + } + } + } + } + return false; + } + + + @SneakyThrows + private void 
removeImages(Element element, ElementWriter + writer, List watermarksElementFeaturesList) { String hashValueOfImage = ImageHashFactory.calculate(element); ElementFeatures imageFeatures = ElementFeatureFactory.extractFeaturesWithHash(element, hashValueOfImage); @@ -255,11 +271,11 @@ public class WatermarkRemovalService { private void processForms(Page page, - Element element, - ElementReader reader, - ElementWriter writer, - List watermarksElementFeaturesList, - Set visitedXObjIds) throws PDFNetException { + Element element, + ElementReader reader, + ElementWriter writer, + List watermarksElementFeaturesList, + Set visitedXObjIds) throws PDFNetException { for (ElementFeatures elementFeatures : watermarksElementFeaturesList) { if (elementFeatures.almostMatches(element)) {