From 84e3390f4e390eaa8764b6b875b8ba04cdd0f4a2 Mon Sep 17 00:00:00 2001 From: RaphaelArnold Date: Thu, 31 Aug 2023 14:58:27 +0200 Subject: [PATCH 1/7] RED-7075: Improved watermark removal to recognize smaller images and text --- .../pdftronlogic/commons/ElementFeatures.java | 135 ++++++++++---- .../commons/WatermarkRemovalService.java | 175 +++++++++++++++--- .../commons/WatermarkRemovalServiceTest.java | 14 +- 3 files changed, 254 insertions(+), 70 deletions(-) diff --git a/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java b/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java index 6681ac9..6d621c4 100644 --- a/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java +++ b/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java @@ -3,7 +3,6 @@ package com.iqser.red.pdftronlogic.commons; import static com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService.TOLERANCE; import java.awt.Color; -import java.awt.geom.Area; import java.awt.geom.GeneralPath; import java.awt.geom.Rectangle2D; @@ -34,18 +33,6 @@ public class ElementFeatures { rectsAlmostMatch(element.getBBox()); } - public boolean almostMatches(ElementFeatures elementFeatures){ - return elementFeatures.getElementType() == elementType && - elementFeatures.getBoundingBox() != null && - rectsAlmostMatch(elementFeatures.getBoundingBox()); - } - - - protected boolean almostEqual(double a, double b) { - - return Math.abs(a - b) < TOLERANCE; - } - @SneakyThrows private boolean rectsAlmostMatch(Rect bBox) { @@ -57,6 +44,19 @@ public class ElementFeatures { almostEqual(bBox.getHeight(), boundingBox.getHeight()); } + + protected boolean almostEqual(double a, double b) { + + return Math.abs(a - b) < TOLERANCE; + } + + + public boolean almostMatches(ElementFeatures elementFeatures) { + + return elementFeatures.getElementType() == elementType && elementFeatures.getBoundingBox() != null && rectsAlmostMatch(elementFeatures.getBoundingBox()); + } + + 
@SneakyThrows private boolean rectsAlmostMatch(Rectangle2D bBox) { // To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance @@ -68,6 +68,53 @@ public class ElementFeatures { } + public boolean similarMatches(Element element) throws PDFNetException { + + return element.getType() == elementType && // + element.getBBox() != null && // + rectsSimilarMatch(element.getBBox()); + } + + + @SneakyThrows + private boolean rectsSimilarMatch(Rect rect) { + // To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance + + return similarEqualPosition(rect.getX1(), boundingBox.getX(), rect.getWidth()) && // + similarEqualPosition(rect.getY1(), boundingBox.getY(), rect.getHeight()) && // + similarEqualSize(rect.getWidth(), boundingBox.getWidth()) && // + similarEqualSize(rect.getHeight(), boundingBox.getHeight()); + } + + + protected boolean similarEqualPosition(double a, double b, double boxSize) { + + return Math.abs(a - b) < boxSize * 0.2; + } + + + protected boolean similarEqualSize(double a, double b) { + + return Math.abs(a - b) < a * 0.1; + } + + + public boolean similarMatches(ElementFeatures elementFeatures) { + + return elementFeatures.getElementType() == elementType && elementFeatures.getBoundingBox() != null && rectsSimilarMatch(elementFeatures.getBoundingBox()); + } + + + private boolean rectsSimilarMatch(Rectangle2D rectangle2D) { + // To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance + + return similarEqualPosition(rectangle2D.getX(), boundingBox.getX(), rectangle2D.getWidth()) && // + similarEqualPosition(rectangle2D.getY(), boundingBox.getY(), rectangle2D.getHeight()) && // + similarEqualSize(rectangle2D.getWidth(), boundingBox.getWidth()) && // + similarEqualSize(rectangle2D.getHeight(), boundingBox.getHeight()); + } + + @EqualsAndHashCode(callSuper = true) @Getter @SuperBuilder @@ -148,6 +195,7 @@ public class 
ElementFeatures { int bitsPerComponent; String hashOfImage; + @Override public boolean almostMatches(Element element) throws PDFNetException { @@ -160,22 +208,39 @@ public class ElementFeatures { bitsPerComponent == element.getBitsPerComponent(); } - public boolean almostMatches(ElementFeatures elementFeatures){ - if(elementFeatures.getClass() != this.getClass()){ + + public boolean almostMatches(ElementFeatures elementFeatures) { + + if (elementFeatures.getClass() != this.getClass()) { return false; } - return super.almostMatches(elementFeatures) && - this.dataSize == ((Image) elementFeatures).getDataSize() && - this.height == ((Image) elementFeatures).getHeight() && - this.width == ((Image) elementFeatures).getWidth() && - this.renderingIntent == ((Image) elementFeatures).getRenderingIntent() && - this.componentNum == ((Image) elementFeatures).getComponentNum() && - this.bitsPerComponent == ((Image) elementFeatures).getBitsPerComponent() && - calculateHammingDistance(((Image) elementFeatures).getHashOfImage()) <=4; + return super.almostMatches(elementFeatures) && this.dataSize == ((Image) elementFeatures).getDataSize() && this.height == ((Image) elementFeatures).getHeight() && this.width == ((Image) elementFeatures).getWidth() && this.renderingIntent == ((Image) elementFeatures).getRenderingIntent() && this.componentNum == ((Image) elementFeatures).getComponentNum() && this.bitsPerComponent == ((Image) elementFeatures).getBitsPerComponent() && calculateHammingDistance( + ((Image) elementFeatures).getHashOfImage()) <= 4; } + + public boolean similarMatches(Element element) throws PDFNetException { + + return super.almostMatches(element) && // + dataSize == element.getImageDataSize() && // + height == element.getImageHeight() && // + width == element.getImageWidth() && // + renderingIntent == element.getImageRenderingIntent() && // + componentNum == element.getComponentNum() && // + bitsPerComponent == element.getBitsPerComponent(); + } + + + public boolean 
similarMatches(ElementFeatures elementFeatures) { + + return super.similarMatches(elementFeatures) && this.dataSize == ((Image) elementFeatures).getDataSize() && this.height == ((Image) elementFeatures).getHeight() && this.width == ((Image) elementFeatures).getWidth() && this.renderingIntent == ((Image) elementFeatures).getRenderingIntent() && this.componentNum == ((Image) elementFeatures).getComponentNum() && this.bitsPerComponent == ((Image) elementFeatures).getBitsPerComponent() && calculateHammingDistance( + ((Image) elementFeatures).getHashOfImage()) <= 4; + } + + // Helper method to calculate the Hamming distance between two hexadecimal strings private int calculateHammingDistance(String hash2) { + int distance = 0; int maxLength = Math.max(this.hashOfImage.length(), hash2.length()); for (int i = 0; i < maxLength; i++) { @@ -202,34 +267,32 @@ public class ElementFeatures { @Override public boolean almostMatches(Element element) throws PDFNetException { + return element.getType() == getElementType() && // element.getBBox() != null && // - (super.rectsAlmostMatch(element.getBBox()) || almostRotateMatches(element.getBBox().getRectangle())) && - xObjectType == element.getXObject().getType() && - dictOrArrayOrStreamLength == element.getXObject().getDecodedStream().size(); + (super.rectsAlmostMatch(element.getBBox()) || almostRotateMatches(element.getBBox().getRectangle())) && xObjectType == element.getXObject() + .getType() && dictOrArrayOrStreamLength == element.getXObject().getDecodedStream().size(); } - public boolean almostMatches(ElementFeatures elementFeatures){ - if(elementFeatures.getClass() != this.getClass()){ + + public boolean almostMatches(ElementFeatures elementFeatures) { + + if (elementFeatures.getClass() != this.getClass()) { return false; } - return elementFeatures.getElementType() == getElementType() && - elementFeatures.getBoundingBox() != null && - (super.rectsAlmostMatch(elementFeatures.getBoundingBox()) || 
almostRotateMatches(elementFeatures.getBoundingBox().getBounds2D())) && - xObjectType == ((Form)elementFeatures).getXObjectType() && - dictOrArrayOrStreamLength == ((Form)elementFeatures).getDictOrArrayOrStreamLength(); + return elementFeatures.getElementType() == getElementType() && elementFeatures.getBoundingBox() != null && (super.rectsAlmostMatch(elementFeatures.getBoundingBox()) || almostRotateMatches( + elementFeatures.getBoundingBox() + .getBounds2D())) && xObjectType == ((Form) elementFeatures).getXObjectType() && dictOrArrayOrStreamLength == ((Form) elementFeatures).getDictOrArrayOrStreamLength(); } private boolean almostRotateMatches(Rectangle2D bBox) { + return almostEqual(bBox.getWidth(), getBoundingBox().getHeight()) && // almostEqual(bBox.getHeight(), getBoundingBox().getWidth()); } } - - - } diff --git a/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java b/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java index e1c7e23..209ed88 100644 --- a/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java +++ b/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java @@ -1,23 +1,47 @@ package com.iqser.red.pdftronlogic.commons; -import com.pdftron.common.PDFNetException; -import com.pdftron.pdf.*; -import com.pdftron.sdf.SDFDoc; -import lombok.SneakyThrows; -import lombok.extern.slf4j.Slf4j; - import java.io.InputStream; import java.io.OutputStream; -import java.util.*; +import java.util.Collection; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; + +import com.pdftron.common.PDFNetException; +import com.pdftron.pdf.Element; +import com.pdftron.pdf.ElementReader; +import com.pdftron.pdf.ElementWriter; +import com.pdftron.pdf.PDFDoc; +import com.pdftron.pdf.Page; +import com.pdftron.pdf.PageIterator; +import com.pdftron.sdf.SDFDoc; + +import lombok.SneakyThrows; 
+import lombok.extern.slf4j.Slf4j; @Slf4j public class WatermarkRemovalService { - final static double AREA_THRESHOLD = 0.6; // multiplied with page area + final static double AREA_THRESHOLD = 0.5; // multiplied with page area final static double OCCURING_ON_PAGES_THRESHOLD_FACTOR = 0.75; // multiplied with number of pages final static int MIN_PAGES_THRESHOLD = 3; + final static double IMAGE_POSITION_HEIGHT_THRESHOLD = 0.2; // multiplied with page height + + final static double IMAGE_POSITION_WIDTH_THRESHOLD = 0.125; // multiplied with page width + + final static double TEXT_POSITION_THRESHOLD = 0.15; + + final static double MIN_TEXTWATERMAK_HEIGHT_THRESHOLD = 0.125; // multiplied with page height + + final static int PAGE_NUMBER_TEXT_SEARCH_THRESHOLD = 5; // stop text based search after 5 pages without watermark + final static double ROTATED_TEXT_THRESHOLD = 12.5; //this is in degrees + static boolean foundTextWatermark = true; + /** * The method remove watermark works only for Documents with size greater than MIN_PAGES_THRESHOLD. 
@@ -46,7 +70,7 @@ public class WatermarkRemovalService { log.info("Watermark found and will be removed!"); removeAllWatermarks(pdfDoc, watermarkElementFeatures); } else { - log.info("No watermark found!"); + log.info("No unlabeled watermark found!"); } } @@ -69,7 +93,6 @@ public class WatermarkRemovalService { ElementReader reader = new ElementReader(); - for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) { Page page = iterator.next(); @@ -80,7 +103,7 @@ public class WatermarkRemovalService { reader.begin(page); for (Element element = reader.next(); element != null; element = reader.next()) { - processElement(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringFromPage); + processElement(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringFromPage, page); } formObjectsAndImagesForPages.put(page.getSDFObj().getObjNum(), elementFeaturesLinkedList); @@ -96,22 +119,81 @@ public class WatermarkRemovalService { Set visitedXObjIds, List elementFeaturesLinkedList, List formObjectsOccuringMoreThanOnceOnAPage, - double minAreaCoveringPage) throws PDFNetException { + double minAreaCoveringPage, + Page page) throws PDFNetException { if (element.getBBox() == null) { return; } - if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringPage) { - return; - } if (element.getType() == Element.e_form) { - processXObject(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage); + if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringPage) { + return; + } + processXObject(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage, page); } else if (element.getType() == Element.e_image || element.getType() == Element.e_inline_image) { if (element.getXObject() == null) { return; } + if 
(element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringPage) { + boolean isLocatedNearBorder = element.getBBox().getY1() < page.getVisibleContentBox() + .getY1() + page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD || element.getBBox().getY2() > page.getVisibleContentBox() + .getY2() - page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD || element.getBBox().getX1() < page.getVisibleContentBox() + .getX1() + page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD || element.getBBox().getX2() > page.getVisibleContentBox() + .getX2() - page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD; + + if (isLocatedNearBorder) { + return; + } + } + processImages(element, elementFeaturesLinkedList); + } else if (element.getType() == Element.e_text) { + processText(element, elementFeaturesLinkedList, page); + } + } + + + @SneakyThrows + private void processText(Element element, List elementFeaturesLinkedList, Page page) { + + if (page.getIndex() == PAGE_NUMBER_TEXT_SEARCH_THRESHOLD) { + shouldTextSearchBeContinued(elementFeaturesLinkedList); + } + + if (!foundTextWatermark) { + return; + } + + if (Math.abs(element.getCTM().getB()) < Math.sin(Math.toRadians(ROTATED_TEXT_THRESHOLD)) || Math.abs(element.getCTM() + .getB()) > Math.sin(Math.toRadians(70 - ROTATED_TEXT_THRESHOLD))) { + return; + } + + if (Math.max(element.getBBox().getY1(), element.getBBox().getY2()) < page.getVisibleContentBox().getY1() + page.getPageHeight() * TEXT_POSITION_THRESHOLD) { + return; + } + + boolean isBigEnough = Math.abs(element.getBBox().getY1() - element.getBBox().getY2()) > page.getPageHeight() * MIN_TEXTWATERMAK_HEIGHT_THRESHOLD; + + if (isBigEnough) { + ElementFeatures elementFeatures = ElementFeatureFactory.extractFeatures(element); + elementFeaturesLinkedList.add(elementFeatures); + } + + } + + + private void shouldTextSearchBeContinued(List elementFeaturesLinkedList) { + + int countTextWatermarks = 0; + for (ElementFeatures elementFeatures : elementFeaturesLinkedList) 
{ + if (elementFeatures.getElementType() == Element.e_text) { + countTextWatermarks++; + } + } + if (countTextWatermarks < elementFeaturesLinkedList.size() * OCCURING_ON_PAGES_THRESHOLD_FACTOR) { + foundTextWatermark = false; } } @@ -130,13 +212,14 @@ public class WatermarkRemovalService { Set visitedXObjIds, List elementFeaturesLinkedList, List formObjectsOccuringMoreThanOnceOnAPage, - double minAreaCoveringPage) { + double minAreaCoveringPage, + Page page) { if (visitedXObjIds.add(element.getXObject().getObjNum())) { ElementReader xObjectReader = new ElementReader(); xObjectReader.begin(element.getXObject()); for (Element element1 = xObjectReader.next(); element1 != null; element1 = xObjectReader.next()) { - processElement(element1, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage); + processElement(element1, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage, page); } elementFeaturesLinkedList.add(ElementFeatureFactory.extractFeatures(element)); xObjectReader.destroy(); @@ -159,7 +242,8 @@ public class WatermarkRemovalService { .flatMap(Collection::stream) .filter(elementFeature -> formObjectsPerPage.values() .stream() - .filter(elementFeaturesOnPage -> elementFeaturesOnPage.stream().anyMatch(elementFeature::almostMatches)) + .filter(elementFeaturesOnPage -> elementFeaturesOnPage.stream() + .anyMatch(elementFeature.getElementType() == Element.e_image || elementFeature.getElementType() == Element.e_inline_image ? 
elementFeature::similarMatches : elementFeature::almostMatches)) .count() >= minPagesFilter) .toList(); } @@ -210,21 +294,28 @@ public class WatermarkRemovalService { double minAreaCoveringFromPage = AREA_THRESHOLD * page.getPageHeight() * page.getPageWidth(); for (Element element = reader.next(); element != null; element = reader.next()) { - switch (element.getType()) { case Element.e_image, Element.e_inline_image -> { if (element.getBBox() == null) { writer.writeElement(element); continue; } - if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringFromPage || element.getXObject() == null) { - writer.writeElement(element); - continue; + if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringFromPage) { + boolean isLocatedNearBorder = element.getBBox().getY1() < page.getVisibleContentBox() + .getY1() + page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD || element.getBBox().getY2() > page.getVisibleContentBox() + .getY2() - page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD || element.getBBox().getX1() < page.getVisibleContentBox() + .getX1() + page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD || element.getBBox().getX2() > page.getVisibleContentBox() + .getX2() - page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD; + + if ((isLocatedNearBorder && element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringFromPage) || element.getXObject() == null) { + writer.writeElement(element); + continue; + } } removeImages(element, writer, watermarksElementFeaturesList); } - case Element.e_form -> - processForms(page, element, reader, writer, watermarksElementFeaturesList, visitedXObjIds); + case Element.e_form -> processForms(page, element, reader, writer, watermarksElementFeaturesList, visitedXObjIds); + case Element.e_text -> processText(element, reader, writer, watermarksElementFeaturesList, page); default -> writer.writeElement(element); } } @@ -232,13 +323,41 @@ public class 
WatermarkRemovalService { @SneakyThrows - private void removeImages(Element element, ElementWriter - writer, List watermarksElementFeaturesList) { + private void processText(Element element, ElementReader reader, ElementWriter writer, List watermarksElementFeaturesList, Page page) { + + if (!foundTextWatermark) { + writer.writeElement(element); + return; + } + + if (Math.abs(element.getCTM().getB()) < Math.sin(Math.toRadians(ROTATED_TEXT_THRESHOLD)) || Math.abs(element.getCTM() + .getB()) > Math.sin(Math.toRadians(70 - ROTATED_TEXT_THRESHOLD))) { + writer.writeElement(element); + return; + } + + if (Math.max(element.getBBox().getY1(), element.getBBox().getY2()) < page.getVisibleContentBox().getY1() + page.getPageHeight() * TEXT_POSITION_THRESHOLD) { + writer.writeElement(element); + return; + } + + for (ElementFeatures elementFeatures : watermarksElementFeaturesList) { + if (elementFeatures.almostMatches(element)) { + return; + } + } + writer.writeElement(element); + + } + + + @SneakyThrows + private void removeImages(Element element, ElementWriter writer, List watermarksElementFeaturesList) { String hashValueOfImage = ImageHashFactory.calculate(element); ElementFeatures imageFeatures = ElementFeatureFactory.extractFeaturesWithHash(element, hashValueOfImage); for (ElementFeatures elementFeatures : watermarksElementFeaturesList) { - if (elementFeatures.almostMatches(imageFeatures)) { + if (elementFeatures.similarMatches(imageFeatures)) { return; } } diff --git a/src/test/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalServiceTest.java b/src/test/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalServiceTest.java index 5e91d8d..6755ab1 100644 --- a/src/test/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalServiceTest.java +++ b/src/test/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalServiceTest.java @@ -1,15 +1,17 @@ package com.iqser.red.pdftronlogic.commons; -import com.pdftron.pdf.PDFNet; -import lombok.SneakyThrows; -import 
org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; -import org.junit.platform.commons.util.StringUtils; - import java.io.FileOutputStream; import java.nio.file.Path; import java.util.Locale; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; +import org.junit.platform.commons.util.StringUtils; + +import com.pdftron.pdf.PDFNet; + +import lombok.SneakyThrows; + @Disabled class WatermarkRemovalServiceTest { From 89c2ab02ea5cf33e36fa5110bec9b0dd3d85ddc3 Mon Sep 17 00:00:00 2001 From: RaphaelArnold Date: Thu, 31 Aug 2023 15:04:04 +0200 Subject: [PATCH 2/7] RED-7075: Improved WatermarkLogic including text and image watermarks --- .../red/pdftronlogic/commons/WatermarkRemovalService.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java b/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java index 209ed88..0a42f2f 100644 --- a/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java +++ b/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java @@ -307,7 +307,7 @@ public class WatermarkRemovalService { .getX1() + page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD || element.getBBox().getX2() > page.getVisibleContentBox() .getX2() - page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD; - if ((isLocatedNearBorder && element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringFromPage) || element.getXObject() == null) { + if (isLocatedNearBorder && element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringFromPage || element.getXObject() == null) { writer.writeElement(element); continue; } @@ -315,7 +315,7 @@ public class WatermarkRemovalService { removeImages(element, writer, watermarksElementFeaturesList); } case Element.e_form -> processForms(page, element, reader, writer, watermarksElementFeaturesList, visitedXObjIds); - case 
Element.e_text -> processText(element, reader, writer, watermarksElementFeaturesList, page); + case Element.e_text -> processText(element, writer, watermarksElementFeaturesList, page); default -> writer.writeElement(element); } } @@ -323,7 +323,7 @@ public class WatermarkRemovalService { @SneakyThrows - private void processText(Element element, ElementReader reader, ElementWriter writer, List watermarksElementFeaturesList, Page page) { + private void processText(Element element, ElementWriter writer, List watermarksElementFeaturesList, Page page) { if (!foundTextWatermark) { writer.writeElement(element); From 439499143244b17941cd768e70476174d3333361 Mon Sep 17 00:00:00 2001 From: RaphaelArnold Date: Thu, 31 Aug 2023 15:27:38 +0200 Subject: [PATCH 3/7] RED-7075: Some Styling improvements --- .../commons/WatermarkRemovalService.java | 65 +++++++++---------- 1 file changed, 31 insertions(+), 34 deletions(-) diff --git a/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java b/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java index 0a42f2f..2d8e6a2 100644 --- a/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java +++ b/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java @@ -36,7 +36,7 @@ public class WatermarkRemovalService { final static double TEXT_POSITION_THRESHOLD = 0.15; - final static double MIN_TEXTWATERMAK_HEIGHT_THRESHOLD = 0.125; // multiplied with page height + final static double MIN_TEXTWATERMARK_HEIGHT_THRESHOLD = 0.125; // multiplied with page height final static int PAGE_NUMBER_TEXT_SEARCH_THRESHOLD = 5; // stop text based search after 5 pages without watermark final static double ROTATED_TEXT_THRESHOLD = 12.5; //this is in degrees @@ -126,31 +126,12 @@ public class WatermarkRemovalService { return; } - if (element.getType() == Element.e_form) { - if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringPage) { - return; - } - 
processXObject(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage, page); - } else if (element.getType() == Element.e_image || element.getType() == Element.e_inline_image) { - if (element.getXObject() == null) { - return; - } - if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringPage) { - boolean isLocatedNearBorder = element.getBBox().getY1() < page.getVisibleContentBox() - .getY1() + page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD || element.getBBox().getY2() > page.getVisibleContentBox() - .getY2() - page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD || element.getBBox().getX1() < page.getVisibleContentBox() - .getX1() + page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD || element.getBBox().getX2() > page.getVisibleContentBox() - .getX2() - page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD; - - if (isLocatedNearBorder) { - return; - } - } - - processImages(element, elementFeaturesLinkedList); - } else if (element.getType() == Element.e_text) { - processText(element, elementFeaturesLinkedList, page); + switch (element.getType()) { + case Element.e_form -> processXObject(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage, page); + case Element.e_image, Element.e_inline_image -> processImages(element, elementFeaturesLinkedList, page, minAreaCoveringPage); + case Element.e_text -> processText(element, elementFeaturesLinkedList, page); } + } @@ -174,7 +155,7 @@ public class WatermarkRemovalService { return; } - boolean isBigEnough = Math.abs(element.getBBox().getY1() - element.getBBox().getY2()) > page.getPageHeight() * MIN_TEXTWATERMAK_HEIGHT_THRESHOLD; + boolean isBigEnough = Math.abs(element.getBBox().getY1() - element.getBBox().getY2()) > page.getPageHeight() * MIN_TEXTWATERMARK_HEIGHT_THRESHOLD; if (isBigEnough) { ElementFeatures elementFeatures = ElementFeatureFactory.extractFeatures(element); 
@@ -199,7 +180,14 @@ public class WatermarkRemovalService { @SneakyThrows - private void processImages(Element element, List elementFeaturesLinkedList) { + private void processImages(Element element, List elementFeaturesLinkedList, Page page, double minAreaCoveringPage) { + + if (element.getXObject() == null) { + return; + } + if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringPage && isLocatedNearBorder(element, page)) { + return; + } String hashOfImage = ImageHashFactory.calculate(element); ElementFeatures elementFeatures = ElementFeatureFactory.extractFeaturesWithHash(element, hashOfImage); @@ -207,6 +195,16 @@ public class WatermarkRemovalService { } + @SneakyThrows + private boolean isLocatedNearBorder(Element element, Page page) { + + return element.getBBox().getY1() < page.getVisibleContentBox().getY1() + page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD || element.getBBox() + .getY2() > page.getVisibleContentBox().getY2() - page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD || element.getBBox().getX1() < page.getVisibleContentBox() + .getX1() + page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD || element.getBBox().getX2() > page.getVisibleContentBox() + .getX2() - page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD; + } + + @SneakyThrows private void processXObject(Element element, Set visitedXObjIds, @@ -215,6 +213,10 @@ public class WatermarkRemovalService { double minAreaCoveringPage, Page page) { + if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringPage) { + return; + } + if (visitedXObjIds.add(element.getXObject().getObjNum())) { ElementReader xObjectReader = new ElementReader(); xObjectReader.begin(element.getXObject()); @@ -301,13 +303,8 @@ public class WatermarkRemovalService { continue; } if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringFromPage) { - boolean isLocatedNearBorder = element.getBBox().getY1() < page.getVisibleContentBox() - 
.getY1() + page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD || element.getBBox().getY2() > page.getVisibleContentBox() - .getY2() - page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD || element.getBBox().getX1() < page.getVisibleContentBox() - .getX1() + page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD || element.getBBox().getX2() > page.getVisibleContentBox() - .getX2() - page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD; - - if (isLocatedNearBorder && element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringFromPage || element.getXObject() == null) { + if (isLocatedNearBorder(element, page) && element.getBBox().getHeight() * element.getBBox() + .getWidth() < minAreaCoveringFromPage || element.getXObject() == null) { writer.writeElement(element); continue; } From 42836ae35adccb2830dc9431dfe20a46298c3b68 Mon Sep 17 00:00:00 2001 From: RaphaelArnold Date: Thu, 31 Aug 2023 15:35:11 +0200 Subject: [PATCH 4/7] RED-7075: PMD error resolved --- .../commons/WatermarkRemovalService.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java b/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java index 2d8e6a2..4335323 100644 --- a/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java +++ b/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java @@ -302,12 +302,12 @@ public class WatermarkRemovalService { writer.writeElement(element); continue; } - if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringFromPage) { - if (isLocatedNearBorder(element, page) && element.getBBox().getHeight() * element.getBBox() - .getWidth() < minAreaCoveringFromPage || element.getXObject() == null) { - writer.writeElement(element); - continue; - } + if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringFromPage && 
isLocatedNearBorder(element, page) && element.getBBox() + .getHeight() * element.getBBox().getWidth() < minAreaCoveringFromPage || element.getXObject() == null) { + + writer.writeElement(element); + continue; + } removeImages(element, writer, watermarksElementFeaturesList); } From bf3015161becfc1cca2271cd50b33a7d7db91a25 Mon Sep 17 00:00:00 2001 From: RaphaelArnold Date: Fri, 1 Sep 2023 09:58:16 +0200 Subject: [PATCH 5/7] RED-7075: Styling changes --- .../pdftronlogic/commons/ElementFeatures.java | 71 +++++++------------ .../commons/WatermarkRemovalService.java | 59 ++++++++------- 2 files changed, 57 insertions(+), 73 deletions(-) diff --git a/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java b/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java index 6d621c4..482c46e 100644 --- a/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java +++ b/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java @@ -22,6 +22,9 @@ import lombok.experimental.SuperBuilder; @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) public class ElementFeatures { + final private static double RECT_POSITION_SIMILARITY_THRESHOLD_FACTOR = 0.2; // specify how much the x and y value are allowed to differ + final private static double RECT_SIZE_SIMILARITY_THRESHOLD_FACTOR = 0.1; // the scale the images are allowed to differ + final private static double HAMMING_DISTANCE_THRESHOLD = 4; // defines the similarity of the hash of images int elementType; Rectangle2D boundingBox; @@ -68,53 +71,33 @@ public class ElementFeatures { } - public boolean similarMatches(Element element) throws PDFNetException { + public boolean isSimilarTo(ElementFeatures elementFeatures) { - return element.getType() == elementType && // - element.getBBox() != null && // - rectsSimilarMatch(element.getBBox()); + return elementFeatures.getElementType() == elementType && elementFeatures.getBoundingBox() != null && 
areRectsSimilar(elementFeatures.getBoundingBox()); } - @SneakyThrows - private boolean rectsSimilarMatch(Rect rect) { + private boolean areRectsSimilar(Rectangle2D rectangle2D) { // To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance - return similarEqualPosition(rect.getX1(), boundingBox.getX(), rect.getWidth()) && // - similarEqualPosition(rect.getY1(), boundingBox.getY(), rect.getHeight()) && // - similarEqualSize(rect.getWidth(), boundingBox.getWidth()) && // - similarEqualSize(rect.getHeight(), boundingBox.getHeight()); + return isPositionSimilar(rectangle2D.getX(), boundingBox.getX(), rectangle2D.getWidth()) && // + isPositionSimilar(rectangle2D.getY(), boundingBox.getY(), rectangle2D.getHeight()) && // + isSizeSimilar(rectangle2D.getWidth(), boundingBox.getWidth()) && // + isSizeSimilar(rectangle2D.getHeight(), boundingBox.getHeight()); } - protected boolean similarEqualPosition(double a, double b, double boxSize) { + protected boolean isPositionSimilar(double a, double b, double boxSize) { - return Math.abs(a - b) < boxSize * 0.2; + return Math.abs(a - b) < boxSize * RECT_POSITION_SIMILARITY_THRESHOLD_FACTOR; } - protected boolean similarEqualSize(double a, double b) { + protected boolean isSizeSimilar(double a, double b) { - return Math.abs(a - b) < a * 0.1; + return Math.abs(a - b) < a * RECT_SIZE_SIMILARITY_THRESHOLD_FACTOR; } - - public boolean similarMatches(ElementFeatures elementFeatures) { - - return elementFeatures.getElementType() == elementType && elementFeatures.getBoundingBox() != null && rectsSimilarMatch(elementFeatures.getBoundingBox()); - } - - - private boolean rectsSimilarMatch(Rectangle2D rectangle2D) { - // To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance - - return similarEqualPosition(rectangle2D.getX(), boundingBox.getX(), rectangle2D.getWidth()) && // - similarEqualPosition(rectangle2D.getY(), boundingBox.getY(), 
rectangle2D.getHeight()) && // - similarEqualSize(rectangle2D.getWidth(), boundingBox.getWidth()) && // - similarEqualSize(rectangle2D.getHeight(), boundingBox.getHeight()); - } - - @EqualsAndHashCode(callSuper = true) @Getter @SuperBuilder @@ -215,26 +198,20 @@ public class ElementFeatures { return false; } return super.almostMatches(elementFeatures) && this.dataSize == ((Image) elementFeatures).getDataSize() && this.height == ((Image) elementFeatures).getHeight() && this.width == ((Image) elementFeatures).getWidth() && this.renderingIntent == ((Image) elementFeatures).getRenderingIntent() && this.componentNum == ((Image) elementFeatures).getComponentNum() && this.bitsPerComponent == ((Image) elementFeatures).getBitsPerComponent() && calculateHammingDistance( - ((Image) elementFeatures).getHashOfImage()) <= 4; + ((Image) elementFeatures).getHashOfImage()) <= HAMMING_DISTANCE_THRESHOLD; } - public boolean similarMatches(Element element) throws PDFNetException { + public boolean isSimilarTo(ElementFeatures elementFeatures) { - return super.almostMatches(element) && // - dataSize == element.getImageDataSize() && // - height == element.getImageHeight() && // - width == element.getImageWidth() && // - renderingIntent == element.getImageRenderingIntent() && // - componentNum == element.getComponentNum() && // - bitsPerComponent == element.getBitsPerComponent(); - } - - - public boolean similarMatches(ElementFeatures elementFeatures) { - - return super.similarMatches(elementFeatures) && this.dataSize == ((Image) elementFeatures).getDataSize() && this.height == ((Image) elementFeatures).getHeight() && this.width == ((Image) elementFeatures).getWidth() && this.renderingIntent == ((Image) elementFeatures).getRenderingIntent() && this.componentNum == ((Image) elementFeatures).getComponentNum() && this.bitsPerComponent == ((Image) elementFeatures).getBitsPerComponent() && calculateHammingDistance( - ((Image) elementFeatures).getHashOfImage()) <= 4; + return 
super.isSimilarTo(elementFeatures) && // + //this.dataSize == ((Image) elementFeatures).getDataSize() && // + //this.height == ((Image) elementFeatures).getHeight() && // + //this.width == ((Image) elementFeatures).getWidth() && // + //this.renderingIntent == ((Image) elementFeatures).getRenderingIntent() && // + //this.componentNum == ((Image) elementFeatures).getComponentNum() && // + //this.bitsPerComponent == ((Image) elementFeatures).getBitsPerComponent() && // + calculateHammingDistance(((Image) elementFeatures).getHashOfImage()) <= HAMMING_DISTANCE_THRESHOLD; } diff --git a/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java b/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java index 4335323..d3484ec 100644 --- a/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java +++ b/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java @@ -45,8 +45,10 @@ public class WatermarkRemovalService { /** * The method remove watermark works only for Documents with size greater than MIN_PAGES_THRESHOLD. - * First the possible watermarks (big XObjects or Images) will be detected and then checked if those appear on most pages according to the - * OCCURING_ON_PAGES_THRESHOLD_FACTOR by using image hashing for similarity and size and stream size of the xobjects. + * The following watermarks will be found: big XObjects, big Images, small Images that appear in the middle of the page, and + * text that is rotated and big enough compared to height of page. + * First the possible watermarks will be detected and then checked if those appear on most pages according to the + * OCCURING_ON_PAGES_THRESHOLD_FACTOR. We use image hashing for similarity between pictures and size and stream size of the xobjects. * If so, these detected and confirmed will not be written to the pdf file. 
* * @param pdfFile PDFFile to remove watermarks @@ -142,16 +144,7 @@ public class WatermarkRemovalService { shouldTextSearchBeContinued(elementFeaturesLinkedList); } - if (!foundTextWatermark) { - return; - } - - if (Math.abs(element.getCTM().getB()) < Math.sin(Math.toRadians(ROTATED_TEXT_THRESHOLD)) || Math.abs(element.getCTM() - .getB()) > Math.sin(Math.toRadians(70 - ROTATED_TEXT_THRESHOLD))) { - return; - } - - if (Math.max(element.getBBox().getY1(), element.getBBox().getY2()) < page.getVisibleContentBox().getY1() + page.getPageHeight() * TEXT_POSITION_THRESHOLD) { + if (!couldTextBeAWatermark(element, page)) { return; } @@ -165,6 +158,14 @@ public class WatermarkRemovalService { } + @SneakyThrows + private boolean isTextRotated(Element element) { + + return Math.abs(element.getCTM().getB()) < Math.sin(Math.toRadians(ROTATED_TEXT_THRESHOLD)) || Math.abs(element.getCTM() + .getB()) > Math.sin(Math.toRadians(70 - ROTATED_TEXT_THRESHOLD)); + } + + private void shouldTextSearchBeContinued(List elementFeaturesLinkedList) { int countTextWatermarks = 0; @@ -245,7 +246,7 @@ public class WatermarkRemovalService { .filter(elementFeature -> formObjectsPerPage.values() .stream() .filter(elementFeaturesOnPage -> elementFeaturesOnPage.stream() - .anyMatch(elementFeature.getElementType() == Element.e_image || elementFeature.getElementType() == Element.e_inline_image ? elementFeature::similarMatches : elementFeature::almostMatches)) + .anyMatch(elementFeature.getElementType() == Element.e_image || elementFeature.getElementType() == Element.e_inline_image ? 
elementFeature::isSimilarTo : elementFeature::almostMatches)) .count() >= minPagesFilter) .toList(); } @@ -322,18 +323,7 @@ public class WatermarkRemovalService { @SneakyThrows private void processText(Element element, ElementWriter writer, List watermarksElementFeaturesList, Page page) { - if (!foundTextWatermark) { - writer.writeElement(element); - return; - } - - if (Math.abs(element.getCTM().getB()) < Math.sin(Math.toRadians(ROTATED_TEXT_THRESHOLD)) || Math.abs(element.getCTM() - .getB()) > Math.sin(Math.toRadians(70 - ROTATED_TEXT_THRESHOLD))) { - writer.writeElement(element); - return; - } - - if (Math.max(element.getBBox().getY1(), element.getBBox().getY2()) < page.getVisibleContentBox().getY1() + page.getPageHeight() * TEXT_POSITION_THRESHOLD) { + if (!couldTextBeAWatermark(element, page)) { writer.writeElement(element); return; } @@ -348,13 +338,30 @@ public class WatermarkRemovalService { } + private boolean couldTextBeAWatermark(Element element, Page page) throws PDFNetException { + + if (!foundTextWatermark) { + return false; + } + + if (isTextRotated(element)) { + return false; + } + + if (Math.max(element.getBBox().getY1(), element.getBBox().getY2()) < page.getVisibleContentBox().getY1() + page.getPageHeight() * TEXT_POSITION_THRESHOLD) { + return false; + } + return true; + } + + @SneakyThrows private void removeImages(Element element, ElementWriter writer, List watermarksElementFeaturesList) { String hashValueOfImage = ImageHashFactory.calculate(element); ElementFeatures imageFeatures = ElementFeatureFactory.extractFeaturesWithHash(element, hashValueOfImage); for (ElementFeatures elementFeatures : watermarksElementFeaturesList) { - if (elementFeatures.similarMatches(imageFeatures)) { + if (elementFeatures.isSimilarTo(imageFeatures)) { return; } } From 5338f247259df4841787957e756dd94b164eb71a Mon Sep 17 00:00:00 2001 From: RaphaelArnold Date: Fri, 1 Sep 2023 10:02:14 +0200 Subject: [PATCH 6/7] RED-7075: Styling change --- 
.../iqser/red/pdftronlogic/commons/ElementFeatures.java | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java b/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java index 482c46e..718c4ea 100644 --- a/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java +++ b/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java @@ -98,6 +98,7 @@ public class ElementFeatures { return Math.abs(a - b) < a * RECT_SIZE_SIMILARITY_THRESHOLD_FACTOR; } + @EqualsAndHashCode(callSuper = true) @Getter @SuperBuilder @@ -205,12 +206,6 @@ public class ElementFeatures { public boolean isSimilarTo(ElementFeatures elementFeatures) { return super.isSimilarTo(elementFeatures) && // - //this.dataSize == ((Image) elementFeatures).getDataSize() && // - //this.height == ((Image) elementFeatures).getHeight() && // - //this.width == ((Image) elementFeatures).getWidth() && // - //this.renderingIntent == ((Image) elementFeatures).getRenderingIntent() && // - //this.componentNum == ((Image) elementFeatures).getComponentNum() && // - //this.bitsPerComponent == ((Image) elementFeatures).getBitsPerComponent() && // calculateHammingDistance(((Image) elementFeatures).getHashOfImage()) <= HAMMING_DISTANCE_THRESHOLD; } From 9db8e63f080317e91136c7d3b55e3d88cbd129c9 Mon Sep 17 00:00:00 2001 From: RaphaelArnold Date: Fri, 1 Sep 2023 11:24:55 +0200 Subject: [PATCH 7/7] RED-7075: Style change --- .../iqser/red/pdftronlogic/commons/WatermarkRemovalService.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java b/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java index d3484ec..45694ac 100644 --- a/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java +++ b/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java @@ -196,6 +196,7 @@ public 
class WatermarkRemovalService { } + // Typically company logos on dossier pages are located near the border and should be excluded from the watermark removal @SneakyThrows private boolean isLocatedNearBorder(Element element, Page page) {