diff --git a/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java b/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java index 6681ac9..718c4ea 100644 --- a/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java +++ b/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java @@ -3,7 +3,6 @@ package com.iqser.red.pdftronlogic.commons; import static com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService.TOLERANCE; import java.awt.Color; -import java.awt.geom.Area; import java.awt.geom.GeneralPath; import java.awt.geom.Rectangle2D; @@ -23,6 +22,9 @@ import lombok.experimental.SuperBuilder; @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) public class ElementFeatures { + final private static double RECT_POSITION_SIMILARITY_THRESHOLD_FACTOR = 0.2; // specify how much the x and y value are allowed to differ + final private static double RECT_SIZE_SIMILARITY_THRESHOLD_FACTOR = 0.1; // the scale the images are allowed to differ + final private static double HAMMING_DISTANCE_THRESHOLD = 4; // defines the similarity of the hash of images int elementType; Rectangle2D boundingBox; @@ -34,18 +36,6 @@ public class ElementFeatures { rectsAlmostMatch(element.getBBox()); } - public boolean almostMatches(ElementFeatures elementFeatures){ - return elementFeatures.getElementType() == elementType && - elementFeatures.getBoundingBox() != null && - rectsAlmostMatch(elementFeatures.getBoundingBox()); - } - - - protected boolean almostEqual(double a, double b) { - - return Math.abs(a - b) < TOLERANCE; - } - @SneakyThrows private boolean rectsAlmostMatch(Rect bBox) { @@ -57,6 +47,19 @@ public class ElementFeatures { almostEqual(bBox.getHeight(), boundingBox.getHeight()); } + + protected boolean almostEqual(double a, double b) { + + return Math.abs(a - b) < TOLERANCE; + } + + + public boolean almostMatches(ElementFeatures elementFeatures) { + + return elementFeatures.getElementType() == elementType && 
elementFeatures.getBoundingBox() != null && rectsAlmostMatch(elementFeatures.getBoundingBox()); + } + + @SneakyThrows private boolean rectsAlmostMatch(Rectangle2D bBox) { // To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance @@ -68,6 +71,34 @@ public class ElementFeatures { } + public boolean isSimilarTo(ElementFeatures elementFeatures) { + + return elementFeatures.getElementType() == elementType && elementFeatures.getBoundingBox() != null && areRectsSimilar(elementFeatures.getBoundingBox()); + } + + + private boolean areRectsSimilar(Rectangle2D rectangle2D) { + // To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance + + return isPositionSimilar(rectangle2D.getX(), boundingBox.getX(), rectangle2D.getWidth()) && // + isPositionSimilar(rectangle2D.getY(), boundingBox.getY(), rectangle2D.getHeight()) && // + isSizeSimilar(rectangle2D.getWidth(), boundingBox.getWidth()) && // + isSizeSimilar(rectangle2D.getHeight(), boundingBox.getHeight()); + } + + + protected boolean isPositionSimilar(double a, double b, double boxSize) { + + return Math.abs(a - b) < boxSize * RECT_POSITION_SIMILARITY_THRESHOLD_FACTOR; + } + + + protected boolean isSizeSimilar(double a, double b) { + + return Math.abs(a - b) < a * RECT_SIZE_SIMILARITY_THRESHOLD_FACTOR; + } + + @EqualsAndHashCode(callSuper = true) @Getter @SuperBuilder @@ -148,6 +179,7 @@ public class ElementFeatures { int bitsPerComponent; String hashOfImage; + @Override public boolean almostMatches(Element element) throws PDFNetException { @@ -160,22 +192,27 @@ public class ElementFeatures { bitsPerComponent == element.getBitsPerComponent(); } - public boolean almostMatches(ElementFeatures elementFeatures){ - if(elementFeatures.getClass() != this.getClass()){ + + public boolean almostMatches(ElementFeatures elementFeatures) { + + if (elementFeatures.getClass() != this.getClass()) { return false; } - return 
super.almostMatches(elementFeatures) && - this.dataSize == ((Image) elementFeatures).getDataSize() && - this.height == ((Image) elementFeatures).getHeight() && - this.width == ((Image) elementFeatures).getWidth() && - this.renderingIntent == ((Image) elementFeatures).getRenderingIntent() && - this.componentNum == ((Image) elementFeatures).getComponentNum() && - this.bitsPerComponent == ((Image) elementFeatures).getBitsPerComponent() && - calculateHammingDistance(((Image) elementFeatures).getHashOfImage()) <=4; + return super.almostMatches(elementFeatures) && this.dataSize == ((Image) elementFeatures).getDataSize() && this.height == ((Image) elementFeatures).getHeight() && this.width == ((Image) elementFeatures).getWidth() && this.renderingIntent == ((Image) elementFeatures).getRenderingIntent() && this.componentNum == ((Image) elementFeatures).getComponentNum() && this.bitsPerComponent == ((Image) elementFeatures).getBitsPerComponent() && calculateHammingDistance( + ((Image) elementFeatures).getHashOfImage()) <= HAMMING_DISTANCE_THRESHOLD; } + + public boolean isSimilarTo(ElementFeatures elementFeatures) { + + return super.isSimilarTo(elementFeatures) && // + calculateHammingDistance(((Image) elementFeatures).getHashOfImage()) <= HAMMING_DISTANCE_THRESHOLD; + } + + // Helper method to calculate the Hamming distance between two hexadecimal strings private int calculateHammingDistance(String hash2) { + int distance = 0; int maxLength = Math.max(this.hashOfImage.length(), hash2.length()); for (int i = 0; i < maxLength; i++) { @@ -202,34 +239,32 @@ public class ElementFeatures { @Override public boolean almostMatches(Element element) throws PDFNetException { + return element.getType() == getElementType() && // element.getBBox() != null && // - (super.rectsAlmostMatch(element.getBBox()) || almostRotateMatches(element.getBBox().getRectangle())) && - xObjectType == element.getXObject().getType() && - dictOrArrayOrStreamLength == 
element.getXObject().getDecodedStream().size(); + (super.rectsAlmostMatch(element.getBBox()) || almostRotateMatches(element.getBBox().getRectangle())) && xObjectType == element.getXObject() + .getType() && dictOrArrayOrStreamLength == element.getXObject().getDecodedStream().size(); } - public boolean almostMatches(ElementFeatures elementFeatures){ - if(elementFeatures.getClass() != this.getClass()){ + + public boolean almostMatches(ElementFeatures elementFeatures) { + + if (elementFeatures.getClass() != this.getClass()) { return false; } - return elementFeatures.getElementType() == getElementType() && - elementFeatures.getBoundingBox() != null && - (super.rectsAlmostMatch(elementFeatures.getBoundingBox()) || almostRotateMatches(elementFeatures.getBoundingBox().getBounds2D())) && - xObjectType == ((Form)elementFeatures).getXObjectType() && - dictOrArrayOrStreamLength == ((Form)elementFeatures).getDictOrArrayOrStreamLength(); + return elementFeatures.getElementType() == getElementType() && elementFeatures.getBoundingBox() != null && (super.rectsAlmostMatch(elementFeatures.getBoundingBox()) || almostRotateMatches( + elementFeatures.getBoundingBox() + .getBounds2D())) && xObjectType == ((Form) elementFeatures).getXObjectType() && dictOrArrayOrStreamLength == ((Form) elementFeatures).getDictOrArrayOrStreamLength(); } private boolean almostRotateMatches(Rectangle2D bBox) { + return almostEqual(bBox.getWidth(), getBoundingBox().getHeight()) && // almostEqual(bBox.getHeight(), getBoundingBox().getWidth()); } } - - - } diff --git a/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java b/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java index e1c7e23..45694ac 100644 --- a/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java +++ b/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java @@ -1,28 +1,54 @@ package com.iqser.red.pdftronlogic.commons; -import 
com.pdftron.common.PDFNetException; -import com.pdftron.pdf.*; -import com.pdftron.sdf.SDFDoc; -import lombok.SneakyThrows; -import lombok.extern.slf4j.Slf4j; - import java.io.InputStream; import java.io.OutputStream; -import java.util.*; +import java.util.Collection; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; + +import com.pdftron.common.PDFNetException; +import com.pdftron.pdf.Element; +import com.pdftron.pdf.ElementReader; +import com.pdftron.pdf.ElementWriter; +import com.pdftron.pdf.PDFDoc; +import com.pdftron.pdf.Page; +import com.pdftron.pdf.PageIterator; +import com.pdftron.sdf.SDFDoc; + +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; @Slf4j public class WatermarkRemovalService { - final static double AREA_THRESHOLD = 0.6; // multiplied with page area + final static double AREA_THRESHOLD = 0.5; // multiplied with page area final static double OCCURING_ON_PAGES_THRESHOLD_FACTOR = 0.75; // multiplied with number of pages final static int MIN_PAGES_THRESHOLD = 3; + final static double IMAGE_POSITION_HEIGHT_THRESHOLD = 0.2; // multiplied with page height + + final static double IMAGE_POSITION_WIDTH_THRESHOLD = 0.125; // multiplied with page width + + final static double TEXT_POSITION_THRESHOLD = 0.15; + + final static double MIN_TEXTWATERMARK_HEIGHT_THRESHOLD = 0.125; // multiplied with page height + + final static int PAGE_NUMBER_TEXT_SEARCH_THRESHOLD = 5; // stop text based search after 5 pages without watermark + final static double ROTATED_TEXT_THRESHOLD = 12.5; //this is in degrees + static boolean foundTextWatermark = true; + /** * The method remove watermark works only for Documents with size greater than MIN_PAGES_THRESHOLD. 
- * First the possible watermarks (big XObjects or Images) will be detected and then checked if those appear on most pages according to the - * OCCURING_ON_PAGES_THRESHOLD_FACTOR by using image hashing for similarity and size and stream size of the xobjects. + * The following watermarks will be found: big XObjects, big Images, small Images that appear in the middle of the page, and + * text that is rotated and big enough compared to height of page. + * First the possible watermarks will be detected and then checked if those appear on most pages according to the + * OCCURING_ON_PAGES_THRESHOLD_FACTOR. We use image hashing for similarity between pictures and size and stream size of the xobjects. * If so, these detected and confirmed will not be written to the pdf file. * * @param pdfFile PDFFile to remove watermarks @@ -46,7 +72,7 @@ public class WatermarkRemovalService { log.info("Watermark found and will be removed!"); removeAllWatermarks(pdfDoc, watermarkElementFeatures); } else { - log.info("No watermark found!"); + log.info("No unlabeled watermark found!"); } } @@ -69,7 +95,6 @@ public class WatermarkRemovalService { ElementReader reader = new ElementReader(); - for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) { Page page = iterator.next(); @@ -80,7 +105,7 @@ public class WatermarkRemovalService { reader.begin(page); for (Element element = reader.next(); element != null; element = reader.next()) { - processElement(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringFromPage); + processElement(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringFromPage, page); } formObjectsAndImagesForPages.put(page.getSDFObj().getObjNum(), elementFeaturesLinkedList); @@ -96,28 +121,74 @@ public class WatermarkRemovalService { Set visitedXObjIds, List elementFeaturesLinkedList, List formObjectsOccuringMoreThanOnceOnAPage, - double
minAreaCoveringPage) throws PDFNetException { + double minAreaCoveringPage, + Page page) throws PDFNetException { if (element.getBBox() == null) { return; } - if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringPage) { + + switch (element.getType()) { + case Element.e_form -> processXObject(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage, page); + case Element.e_image, Element.e_inline_image -> processImages(element, elementFeaturesLinkedList, page, minAreaCoveringPage); + case Element.e_text -> processText(element, elementFeaturesLinkedList, page); + } + + } + + + @SneakyThrows + private void processText(Element element, List elementFeaturesLinkedList, Page page) { + + if (page.getIndex() == PAGE_NUMBER_TEXT_SEARCH_THRESHOLD) { + shouldTextSearchBeContinued(elementFeaturesLinkedList); + } + + if (!couldTextBeAWatermark(element, page)) { return; } - if (element.getType() == Element.e_form) { - processXObject(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage); - } else if (element.getType() == Element.e_image || element.getType() == Element.e_inline_image) { - if (element.getXObject() == null) { - return; + boolean isBigEnough = Math.abs(element.getBBox().getY1() - element.getBBox().getY2()) > page.getPageHeight() * MIN_TEXTWATERMARK_HEIGHT_THRESHOLD; + + if (isBigEnough) { + ElementFeatures elementFeatures = ElementFeatureFactory.extractFeatures(element); + elementFeaturesLinkedList.add(elementFeatures); + } + + } + + + @SneakyThrows + private boolean isTextRotated(Element element) { + + return Math.abs(element.getCTM().getB()) < Math.sin(Math.toRadians(ROTATED_TEXT_THRESHOLD)) || Math.abs(element.getCTM() + .getB()) > Math.sin(Math.toRadians(70 - ROTATED_TEXT_THRESHOLD)); + } + + + private void shouldTextSearchBeContinued(List elementFeaturesLinkedList) { + + int countTextWatermarks = 0; + for (ElementFeatures 
elementFeatures : elementFeaturesLinkedList) { + if (elementFeatures.getElementType() == Element.e_text) { + countTextWatermarks++; } - processImages(element, elementFeaturesLinkedList); + } + if (countTextWatermarks < elementFeaturesLinkedList.size() * OCCURING_ON_PAGES_THRESHOLD_FACTOR) { + foundTextWatermark = false; } } @SneakyThrows - private void processImages(Element element, List elementFeaturesLinkedList) { + private void processImages(Element element, List elementFeaturesLinkedList, Page page, double minAreaCoveringPage) { + + if (element.getXObject() == null) { + return; + } + if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringPage && isLocatedNearBorder(element, page)) { + return; + } String hashOfImage = ImageHashFactory.calculate(element); ElementFeatures elementFeatures = ElementFeatureFactory.extractFeaturesWithHash(element, hashOfImage); @@ -125,18 +196,34 @@ public class WatermarkRemovalService { } + // Typically company logos on dossier pages are located near the border and should be excluded from the watermark removal + @SneakyThrows + private boolean isLocatedNearBorder(Element element, Page page) { + + return element.getBBox().getY1() < page.getVisibleContentBox().getY1() + page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD || element.getBBox() + .getY2() > page.getVisibleContentBox().getY2() - page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD || element.getBBox().getX1() < page.getVisibleContentBox() + .getX1() + page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD || element.getBBox().getX2() > page.getVisibleContentBox() + .getX2() - page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD; + } + + @SneakyThrows private void processXObject(Element element, Set visitedXObjIds, List elementFeaturesLinkedList, List formObjectsOccuringMoreThanOnceOnAPage, - double minAreaCoveringPage) { + double minAreaCoveringPage, + Page page) { + + if (element.getBBox().getHeight() * element.getBBox().getWidth() < 
minAreaCoveringPage) { + return; + } if (visitedXObjIds.add(element.getXObject().getObjNum())) { ElementReader xObjectReader = new ElementReader(); xObjectReader.begin(element.getXObject()); for (Element element1 = xObjectReader.next(); element1 != null; element1 = xObjectReader.next()) { - processElement(element1, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage); + processElement(element1, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage, page); } elementFeaturesLinkedList.add(ElementFeatureFactory.extractFeatures(element)); xObjectReader.destroy(); @@ -159,7 +246,8 @@ public class WatermarkRemovalService { .flatMap(Collection::stream) .filter(elementFeature -> formObjectsPerPage.values() .stream() - .filter(elementFeaturesOnPage -> elementFeaturesOnPage.stream().anyMatch(elementFeature::almostMatches)) + .filter(elementFeaturesOnPage -> elementFeaturesOnPage.stream() + .anyMatch(elementFeature.getElementType() == Element.e_image || elementFeature.getElementType() == Element.e_inline_image ? 
elementFeature::isSimilarTo : elementFeature::almostMatches)) .count() >= minPagesFilter) .toList(); } @@ -210,21 +298,23 @@ public class WatermarkRemovalService { double minAreaCoveringFromPage = AREA_THRESHOLD * page.getPageHeight() * page.getPageWidth(); for (Element element = reader.next(); element != null; element = reader.next()) { - switch (element.getType()) { case Element.e_image, Element.e_inline_image -> { if (element.getBBox() == null) { writer.writeElement(element); continue; } - if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringFromPage || element.getXObject() == null) { + if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringFromPage && isLocatedNearBorder(element, page) && element.getBBox() + .getHeight() * element.getBBox().getWidth() < minAreaCoveringFromPage || element.getXObject() == null) { + writer.writeElement(element); continue; + } removeImages(element, writer, watermarksElementFeaturesList); } - case Element.e_form -> - processForms(page, element, reader, writer, watermarksElementFeaturesList, visitedXObjIds); + case Element.e_form -> processForms(page, element, reader, writer, watermarksElementFeaturesList, visitedXObjIds); + case Element.e_text -> processText(element, writer, watermarksElementFeaturesList, page); default -> writer.writeElement(element); } } @@ -232,13 +322,47 @@ public class WatermarkRemovalService { @SneakyThrows - private void removeImages(Element element, ElementWriter - writer, List watermarksElementFeaturesList) { + private void processText(Element element, ElementWriter writer, List watermarksElementFeaturesList, Page page) { + + if (!couldTextBeAWatermark(element, page)) { + writer.writeElement(element); + return; + } + + for (ElementFeatures elementFeatures : watermarksElementFeaturesList) { + if (elementFeatures.almostMatches(element)) { + return; + } + } + writer.writeElement(element); + + } + + + private boolean couldTextBeAWatermark(Element element, 
Page page) throws PDFNetException { + + if (!foundTextWatermark) { + return false; + } + + if (isTextRotated(element)) { + return false; + } + + if (Math.max(element.getBBox().getY1(), element.getBBox().getY2()) < page.getVisibleContentBox().getY1() + page.getPageHeight() * TEXT_POSITION_THRESHOLD) { + return false; + } + return true; + } + + + @SneakyThrows + private void removeImages(Element element, ElementWriter writer, List watermarksElementFeaturesList) { String hashValueOfImage = ImageHashFactory.calculate(element); ElementFeatures imageFeatures = ElementFeatureFactory.extractFeaturesWithHash(element, hashValueOfImage); for (ElementFeatures elementFeatures : watermarksElementFeaturesList) { - if (elementFeatures.almostMatches(imageFeatures)) { + if (elementFeatures.isSimilarTo(imageFeatures)) { return; } } diff --git a/src/test/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalServiceTest.java b/src/test/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalServiceTest.java index 5e91d8d..6755ab1 100644 --- a/src/test/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalServiceTest.java +++ b/src/test/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalServiceTest.java @@ -1,15 +1,17 @@ package com.iqser.red.pdftronlogic.commons; -import com.pdftron.pdf.PDFNet; -import lombok.SneakyThrows; -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; -import org.junit.platform.commons.util.StringUtils; - import java.io.FileOutputStream; import java.nio.file.Path; import java.util.Locale; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; +import org.junit.platform.commons.util.StringUtils; + +import com.pdftron.pdf.PDFNet; + +import lombok.SneakyThrows; + @Disabled class WatermarkRemovalServiceTest {