Review notes (free text ahead of the first "diff --git" header; not part of any hunk):
  * ElementFeatures.Image#similarMatches(Element) delegates to super.similarMatches(element). It used to call
    super.almostMatches(element), so the relaxed bounding-box tolerance was never applied on that overload.
  * WatermarkRemovalService: the static foundTextWatermark flag is reset before every per-document page scan.
    It was only ever set to false, which disabled text watermark handling for all later documents.
    The flag is still shared mutable state and is not safe for concurrent documents.
  * WatermarkRemovalService, removal pass, images: elements without an XObject are written through regardless of
    their size again, matching the detection pass and the behaviour before this patch.
  * WatermarkRemovalService, removal pass, text: elements without a bounding box are written through unchanged.
  * Line breaks were restored from the +/- markers; blank context lines are placed to agree with the hunk counts.

diff --git a/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java b/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java
index 6681ac9..6d621c4 100644
--- a/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java
+++ b/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java
@@ -3,7 +3,6 @@ package com.iqser.red.pdftronlogic.commons;
 import static com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService.TOLERANCE;
 
 import java.awt.Color;
-import java.awt.geom.Area;
 import java.awt.geom.GeneralPath;
 import java.awt.geom.Rectangle2D;
 
@@ -34,18 +33,6 @@ public class ElementFeatures {
                 rectsAlmostMatch(element.getBBox());
     }
 
-    public boolean almostMatches(ElementFeatures elementFeatures){
-        return elementFeatures.getElementType() == elementType &&
-                elementFeatures.getBoundingBox() != null &&
-                rectsAlmostMatch(elementFeatures.getBoundingBox());
-    }
-
-
-    protected boolean almostEqual(double a, double b) {
-
-        return Math.abs(a - b) < TOLERANCE;
-    }
-
     @SneakyThrows
     private boolean rectsAlmostMatch(Rect bBox) {
 
@@ -57,6 +44,19 @@ public class ElementFeatures {
                 almostEqual(bBox.getHeight(), boundingBox.getHeight());
     }
 
+
+    protected boolean almostEqual(double a, double b) {
+
+        return Math.abs(a - b) < TOLERANCE;
+    }
+
+
+    public boolean almostMatches(ElementFeatures elementFeatures) {
+
+        return elementFeatures.getElementType() == elementType && elementFeatures.getBoundingBox() != null && rectsAlmostMatch(elementFeatures.getBoundingBox());
+    }
+
+
     @SneakyThrows
     private boolean rectsAlmostMatch(Rectangle2D bBox) {
         // To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance
@@ -68,6 +68,53 @@ public class ElementFeatures {
     }
 
 
+    public boolean similarMatches(Element element) throws PDFNetException {
+
+        return element.getType() == elementType && //
+                element.getBBox() != null && //
+                rectsSimilarMatch(element.getBBox());
+    }
+
+
+    @SneakyThrows
+    private boolean rectsSimilarMatch(Rect rect) {
+        // To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance
+
+        return similarEqualPosition(rect.getX1(), boundingBox.getX(), rect.getWidth()) && //
+                similarEqualPosition(rect.getY1(), boundingBox.getY(), rect.getHeight()) && //
+                similarEqualSize(rect.getWidth(), boundingBox.getWidth()) && //
+                similarEqualSize(rect.getHeight(), boundingBox.getHeight());
+    }
+
+
+    protected boolean similarEqualPosition(double a, double b, double boxSize) {
+
+        return Math.abs(a - b) < boxSize * 0.2;
+    }
+
+
+    protected boolean similarEqualSize(double a, double b) {
+
+        return Math.abs(a - b) < a * 0.1;
+    }
+
+
+    public boolean similarMatches(ElementFeatures elementFeatures) {
+
+        return elementFeatures.getElementType() == elementType && elementFeatures.getBoundingBox() != null && rectsSimilarMatch(elementFeatures.getBoundingBox());
+    }
+
+
+    private boolean rectsSimilarMatch(Rectangle2D rectangle2D) {
+        // To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance
+
+        return similarEqualPosition(rectangle2D.getX(), boundingBox.getX(), rectangle2D.getWidth()) && //
+                similarEqualPosition(rectangle2D.getY(), boundingBox.getY(), rectangle2D.getHeight()) && //
+                similarEqualSize(rectangle2D.getWidth(), boundingBox.getWidth()) && //
+                similarEqualSize(rectangle2D.getHeight(), boundingBox.getHeight());
+    }
+
+
     @EqualsAndHashCode(callSuper = true)
     @Getter
     @SuperBuilder
@@ -148,6 +195,7 @@ public class ElementFeatures {
         int bitsPerComponent;
         String hashOfImage;
 
+
         @Override
         public boolean almostMatches(Element element) throws PDFNetException {
 
@@ -160,22 +208,39 @@ public class ElementFeatures {
                     bitsPerComponent == element.getBitsPerComponent();
         }
 
-        public boolean almostMatches(ElementFeatures elementFeatures){
-            if(elementFeatures.getClass() != this.getClass()){
+
+        public boolean almostMatches(ElementFeatures elementFeatures) {
+
+            if (elementFeatures.getClass() != this.getClass()) {
                 return false;
             }
-            return super.almostMatches(elementFeatures) &&
-                    this.dataSize == ((Image) elementFeatures).getDataSize() &&
-                    this.height == ((Image) elementFeatures).getHeight() &&
-                    this.width == ((Image) elementFeatures).getWidth() &&
-                    this.renderingIntent == ((Image) elementFeatures).getRenderingIntent() &&
-                    this.componentNum == ((Image) elementFeatures).getComponentNum() &&
-                    this.bitsPerComponent == ((Image) elementFeatures).getBitsPerComponent() &&
-                    calculateHammingDistance(((Image) elementFeatures).getHashOfImage()) <=4;
+            return super.almostMatches(elementFeatures) && this.dataSize == ((Image) elementFeatures).getDataSize() && this.height == ((Image) elementFeatures).getHeight() && this.width == ((Image) elementFeatures).getWidth() && this.renderingIntent == ((Image) elementFeatures).getRenderingIntent() && this.componentNum == ((Image) elementFeatures).getComponentNum() && this.bitsPerComponent == ((Image) elementFeatures).getBitsPerComponent() && calculateHammingDistance(
+                    ((Image) elementFeatures).getHashOfImage()) <= 4;
         }
 
+
+        public boolean similarMatches(Element element) throws PDFNetException {
+            // Delegate to the relaxed base comparison so the "similar" tolerance applies to the bounding box.
+            return super.similarMatches(element) && //
+                    dataSize == element.getImageDataSize() && //
+                    height == element.getImageHeight() && //
+                    width == element.getImageWidth() && //
+                    renderingIntent == element.getImageRenderingIntent() && //
+                    componentNum == element.getComponentNum() && //
+                    bitsPerComponent == element.getBitsPerComponent();
+        }
+
+
+        public boolean similarMatches(ElementFeatures elementFeatures) {
+
+            return super.similarMatches(elementFeatures) && this.dataSize == ((Image) elementFeatures).getDataSize() && this.height == ((Image) elementFeatures).getHeight() && this.width == ((Image) elementFeatures).getWidth() && this.renderingIntent == ((Image) elementFeatures).getRenderingIntent() && this.componentNum == ((Image) elementFeatures).getComponentNum() && this.bitsPerComponent == ((Image) elementFeatures).getBitsPerComponent() && calculateHammingDistance(
+                    ((Image) elementFeatures).getHashOfImage()) <= 4;
+        }
+
+
         // Helper method to calculate the Hamming distance between two hexadecimal strings
         private int calculateHammingDistance(String hash2) {
+
             int distance = 0;
             int maxLength = Math.max(this.hashOfImage.length(), hash2.length());
             for (int i = 0; i < maxLength; i++) {
@@ -202,34 +267,32 @@ public class ElementFeatures {
 
         @Override
         public boolean almostMatches(Element element) throws PDFNetException {
+
             return element.getType() == getElementType() && //
                     element.getBBox() != null && //
-                    (super.rectsAlmostMatch(element.getBBox()) || almostRotateMatches(element.getBBox().getRectangle())) &&
-                    xObjectType == element.getXObject().getType() &&
-                    dictOrArrayOrStreamLength == element.getXObject().getDecodedStream().size();
+                    (super.rectsAlmostMatch(element.getBBox()) || almostRotateMatches(element.getBBox().getRectangle())) && xObjectType == element.getXObject()
+                            .getType() && dictOrArrayOrStreamLength == element.getXObject().getDecodedStream().size();
         }
 
-        public boolean almostMatches(ElementFeatures elementFeatures){
-            if(elementFeatures.getClass() != this.getClass()){
+
+        public boolean almostMatches(ElementFeatures elementFeatures) {
+
+            if (elementFeatures.getClass() != this.getClass()) {
                 return false;
             }
-            return elementFeatures.getElementType() == getElementType() &&
-                    elementFeatures.getBoundingBox() != null &&
-                    (super.rectsAlmostMatch(elementFeatures.getBoundingBox()) || almostRotateMatches(elementFeatures.getBoundingBox().getBounds2D())) &&
-                    xObjectType == ((Form)elementFeatures).getXObjectType() &&
-                    dictOrArrayOrStreamLength == ((Form)elementFeatures).getDictOrArrayOrStreamLength();
+            return elementFeatures.getElementType() == getElementType() && elementFeatures.getBoundingBox() != null && (super.rectsAlmostMatch(elementFeatures.getBoundingBox()) || almostRotateMatches(
+                    elementFeatures.getBoundingBox()
+                            .getBounds2D())) && xObjectType == ((Form) elementFeatures).getXObjectType() && dictOrArrayOrStreamLength == ((Form) elementFeatures).getDictOrArrayOrStreamLength();
         }
 
 
         private boolean almostRotateMatches(Rectangle2D bBox) {
+
             return almostEqual(bBox.getWidth(), getBoundingBox().getHeight()) && //
                     almostEqual(bBox.getHeight(), getBoundingBox().getWidth());
         }
 
     }
-
-
-
 
 
 }
diff --git a/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java b/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java
index e1c7e23..209ed88 100644
--- a/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java
+++ b/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java
@@ -1,23 +1,47 @@
 package com.iqser.red.pdftronlogic.commons;
 
-import com.pdftron.common.PDFNetException;
-import com.pdftron.pdf.*;
-import com.pdftron.sdf.SDFDoc;
-import lombok.SneakyThrows;
-import lombok.extern.slf4j.Slf4j;
-
 import java.io.InputStream;
 import java.io.OutputStream;
-import java.util.*;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeSet;
+
+import com.pdftron.common.PDFNetException;
+import com.pdftron.pdf.Element;
+import com.pdftron.pdf.ElementReader;
+import com.pdftron.pdf.ElementWriter;
+import com.pdftron.pdf.PDFDoc;
+import com.pdftron.pdf.Page;
+import com.pdftron.pdf.PageIterator;
+import com.pdftron.sdf.SDFDoc;
+
+import lombok.SneakyThrows;
+import lombok.extern.slf4j.Slf4j;
 
 @Slf4j
 public class WatermarkRemovalService {
 
-    final static double AREA_THRESHOLD = 0.6; // multiplied with page area
+    final static double AREA_THRESHOLD = 0.5; // multiplied with page area
     final static double OCCURING_ON_PAGES_THRESHOLD_FACTOR = 0.75; // multiplied with number of pages
 
     final static int MIN_PAGES_THRESHOLD = 3;
 
+    final static double IMAGE_POSITION_HEIGHT_THRESHOLD = 0.2; // multiplied with page height
+
+    final static double IMAGE_POSITION_WIDTH_THRESHOLD = 0.125; // multiplied with page width
+
+    final static double TEXT_POSITION_THRESHOLD = 0.15;
+
+    final static double MIN_TEXTWATERMAK_HEIGHT_THRESHOLD = 0.125; // multiplied with page height
+
+    final static int PAGE_NUMBER_TEXT_SEARCH_THRESHOLD = 5; // stop text based search after 5 pages without watermark
+    final static double ROTATED_TEXT_THRESHOLD = 12.5; //this is in degrees
+    static boolean foundTextWatermark = true; // shared mutable state: reset before each document scan, not safe for concurrent documents
+
 
     /**
      * The method remove watermark works only for Documents with size greater than MIN_PAGES_THRESHOLD.
@@ -46,7 +70,7 @@ public class WatermarkRemovalService {
                 log.info("Watermark found and will be removed!");
                 removeAllWatermarks(pdfDoc, watermarkElementFeatures);
             } else {
-                log.info("No watermark found!");
+                log.info("No unlabeled watermark found!");
             }
         }
 
@@ -69,7 +93,7 @@ public class WatermarkRemovalService {
 
         ElementReader reader = new ElementReader();
 
-
+        foundTextWatermark = true; // start every document with the text search enabled; the static flag would otherwise stay false forever
         for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
             Page page = iterator.next();
 
@@ -80,7 +104,7 @@ public class WatermarkRemovalService {
 
             reader.begin(page);
             for (Element element = reader.next(); element != null; element = reader.next()) {
-                processElement(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringFromPage);
+                processElement(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringFromPage, page);
             }
 
             formObjectsAndImagesForPages.put(page.getSDFObj().getObjNum(), elementFeaturesLinkedList);
@@ -96,22 +120,81 @@ public class WatermarkRemovalService {
                                 Set visitedXObjIds,
                                 List elementFeaturesLinkedList,
                                 List formObjectsOccuringMoreThanOnceOnAPage,
-                                double minAreaCoveringPage) throws PDFNetException {
+                                double minAreaCoveringPage,
+                                Page page) throws PDFNetException {
 
         if (element.getBBox() == null) {
             return;
         }
-        if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringPage) {
-            return;
-        }
 
         if (element.getType() == Element.e_form) {
-            processXObject(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage);
+            if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringPage) {
+                return;
+            }
+            processXObject(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage, page);
         } else if (element.getType() == Element.e_image || element.getType() == Element.e_inline_image) {
             if (element.getXObject() == null) {
                 return;
             }
+            if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringPage) {
+                boolean isLocatedNearBorder = element.getBBox().getY1() < page.getVisibleContentBox()
+                        .getY1() + page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD || element.getBBox().getY2() > page.getVisibleContentBox()
+                        .getY2() - page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD || element.getBBox().getX1() < page.getVisibleContentBox()
+                        .getX1() + page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD || element.getBBox().getX2() > page.getVisibleContentBox()
+                        .getX2() - page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD;
+
+                if (isLocatedNearBorder) {
+                    return;
+                }
+            }
+
             processImages(element, elementFeaturesLinkedList);
+        } else if (element.getType() == Element.e_text) {
+            processText(element, elementFeaturesLinkedList, page);
+        }
+    }
+
+
+    @SneakyThrows
+    private void processText(Element element, List elementFeaturesLinkedList, Page page) {
+
+        if (page.getIndex() == PAGE_NUMBER_TEXT_SEARCH_THRESHOLD) {
+            shouldTextSearchBeContinued(elementFeaturesLinkedList);
+        }
+
+        if (!foundTextWatermark) {
+            return;
+        }
+
+        if (Math.abs(element.getCTM().getB()) < Math.sin(Math.toRadians(ROTATED_TEXT_THRESHOLD)) || Math.abs(element.getCTM()
+                .getB()) > Math.sin(Math.toRadians(70 - ROTATED_TEXT_THRESHOLD))) {
+            return;
+        }
+
+        if (Math.max(element.getBBox().getY1(), element.getBBox().getY2()) < page.getVisibleContentBox().getY1() + page.getPageHeight() * TEXT_POSITION_THRESHOLD) {
+            return;
+        }
+
+        boolean isBigEnough = Math.abs(element.getBBox().getY1() - element.getBBox().getY2()) > page.getPageHeight() * MIN_TEXTWATERMAK_HEIGHT_THRESHOLD;
+
+        if (isBigEnough) {
+            ElementFeatures elementFeatures = ElementFeatureFactory.extractFeatures(element);
+            elementFeaturesLinkedList.add(elementFeatures);
+        }
+
+    }
+
+
+    private void shouldTextSearchBeContinued(List elementFeaturesLinkedList) {
+
+        int countTextWatermarks = 0;
+        for (ElementFeatures elementFeatures : elementFeaturesLinkedList) {
+            if (elementFeatures.getElementType() == Element.e_text) {
+                countTextWatermarks++;
+            }
+        }
+        if (countTextWatermarks < elementFeaturesLinkedList.size() * OCCURING_ON_PAGES_THRESHOLD_FACTOR) {
+            foundTextWatermark = false;
         }
     }
 
@@ -130,13 +213,14 @@ public class WatermarkRemovalService {
                                 Set visitedXObjIds,
                                 List elementFeaturesLinkedList,
                                 List formObjectsOccuringMoreThanOnceOnAPage,
-                                double minAreaCoveringPage) {
+                                double minAreaCoveringPage,
+                                Page page) {
 
         if (visitedXObjIds.add(element.getXObject().getObjNum())) {
             ElementReader xObjectReader = new ElementReader();
             xObjectReader.begin(element.getXObject());
             for (Element element1 = xObjectReader.next(); element1 != null; element1 = xObjectReader.next()) {
-                processElement(element1, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage);
+                processElement(element1, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage, page);
             }
             elementFeaturesLinkedList.add(ElementFeatureFactory.extractFeatures(element));
             xObjectReader.destroy();
@@ -159,7 +243,8 @@ public class WatermarkRemovalService {
                 .flatMap(Collection::stream)
                 .filter(elementFeature -> formObjectsPerPage.values()
                         .stream()
-                        .filter(elementFeaturesOnPage -> elementFeaturesOnPage.stream().anyMatch(elementFeature::almostMatches))
+                        .filter(elementFeaturesOnPage -> elementFeaturesOnPage.stream()
+                                .anyMatch(elementFeature.getElementType() == Element.e_image || elementFeature.getElementType() == Element.e_inline_image ? elementFeature::similarMatches : elementFeature::almostMatches))
                         .count() >= minPagesFilter)
                 .toList();
     }
@@ -210,21 +295,28 @@ public class WatermarkRemovalService {
         double minAreaCoveringFromPage = AREA_THRESHOLD * page.getPageHeight() * page.getPageWidth();
 
         for (Element element = reader.next(); element != null; element = reader.next()) {
-
             switch (element.getType()) {
                 case Element.e_image, Element.e_inline_image -> {
                     if (element.getBBox() == null) {
                         writer.writeElement(element);
                         continue;
                     }
-                    if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringFromPage || element.getXObject() == null) {
-                        writer.writeElement(element);
-                        continue;
+                    boolean isSmall = element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringFromPage;
+                    boolean isLocatedNearBorder = element.getBBox().getY1() < page.getVisibleContentBox()
+                            .getY1() + page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD || element.getBBox().getY2() > page.getVisibleContentBox()
+                            .getY2() - page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD || element.getBBox().getX1() < page.getVisibleContentBox()
+                            .getX1() + page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD || element.getBBox().getX2() > page.getVisibleContentBox()
+                            .getX2() - page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD;
+
+                    // Same filter as the detection pass: keep small images near the border and every image without an XObject.
+                    if ((isSmall && isLocatedNearBorder) || element.getXObject() == null) {
+                        writer.writeElement(element);
+                        continue;
                     }
                     removeImages(element, writer, watermarksElementFeaturesList);
                 }
-                case Element.e_form ->
-                        processForms(page, element, reader, writer, watermarksElementFeaturesList, visitedXObjIds);
+                case Element.e_form -> processForms(page, element, reader, writer, watermarksElementFeaturesList, visitedXObjIds);
+                case Element.e_text -> processText(element, reader, writer, watermarksElementFeaturesList, page);
                 default -> writer.writeElement(element);
             }
         }
@@ -232,13 +324,41 @@ public class WatermarkRemovalService {
 
 
     @SneakyThrows
-    private void removeImages(Element element, ElementWriter
-            writer, List watermarksElementFeaturesList) {
+    private void processText(Element element, ElementReader reader, ElementWriter writer, List watermarksElementFeaturesList, Page page) {
+        // Text without a bounding box cannot be positioned or matched, so it is written through unchanged.
+        if (!foundTextWatermark || element.getBBox() == null) {
+            writer.writeElement(element);
+            return;
+        }
+
+        if (Math.abs(element.getCTM().getB()) < Math.sin(Math.toRadians(ROTATED_TEXT_THRESHOLD)) || Math.abs(element.getCTM()
+                .getB()) > Math.sin(Math.toRadians(70 - ROTATED_TEXT_THRESHOLD))) {
+            writer.writeElement(element);
+            return;
+        }
+
+        if (Math.max(element.getBBox().getY1(), element.getBBox().getY2()) < page.getVisibleContentBox().getY1() + page.getPageHeight() * TEXT_POSITION_THRESHOLD) {
+            writer.writeElement(element);
+            return;
+        }
+
+        for (ElementFeatures elementFeatures : watermarksElementFeaturesList) {
+            if (elementFeatures.almostMatches(element)) {
+                return;
+            }
+        }
+        writer.writeElement(element);
+
+    }
+
+
+    @SneakyThrows
+    private void removeImages(Element element, ElementWriter writer, List watermarksElementFeaturesList) {
 
         String hashValueOfImage = ImageHashFactory.calculate(element);
         ElementFeatures imageFeatures = ElementFeatureFactory.extractFeaturesWithHash(element, hashValueOfImage);
         for (ElementFeatures elementFeatures : watermarksElementFeaturesList) {
-            if (elementFeatures.almostMatches(imageFeatures)) {
+            if (elementFeatures.similarMatches(imageFeatures)) {
                 return;
             }
         }
diff --git a/src/test/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalServiceTest.java b/src/test/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalServiceTest.java
index 5e91d8d..6755ab1 100644
--- a/src/test/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalServiceTest.java
+++ b/src/test/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalServiceTest.java
@@ -1,15 +1,17 @@
 package com.iqser.red.pdftronlogic.commons;
 
-import com.pdftron.pdf.PDFNet;
-import lombok.SneakyThrows;
-import org.junit.jupiter.api.Disabled;
-import org.junit.jupiter.api.Test;
-import org.junit.platform.commons.util.StringUtils;
-
 import java.io.FileOutputStream;
 import java.nio.file.Path;
 import java.util.Locale;
 
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
+import org.junit.platform.commons.util.StringUtils;
+
+import com.pdftron.pdf.PDFNet;
+
+import lombok.SneakyThrows;
+
 @Disabled
 class WatermarkRemovalServiceTest {
 