From bf3015161becfc1cca2271cd50b33a7d7db91a25 Mon Sep 17 00:00:00 2001 From: RaphaelArnold Date: Fri, 1 Sep 2023 09:58:16 +0200 Subject: [PATCH] RED-7075: Styling changes --- .../pdftronlogic/commons/ElementFeatures.java | 71 +++++++------------ .../commons/WatermarkRemovalService.java | 59 ++++++++------- 2 files changed, 57 insertions(+), 73 deletions(-) diff --git a/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java b/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java index 6d621c4..482c46e 100644 --- a/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java +++ b/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java @@ -22,6 +22,9 @@ import lombok.experimental.SuperBuilder; @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) public class ElementFeatures { + final private static double RECT_POSITION_SIMILARITY_THRESHOLD_FACTOR = 0.2; // specify how much the x and y value are allowed to differ + final private static double RECT_SIZE_SIMILARITY_THRESHOLD_FACTOR = 0.1; // the scale the images are allowed to differ + final private static double HAMMING_DISTANCE_THRESHOLD = 4; // defines the similarity of the hash of images int elementType; Rectangle2D boundingBox; @@ -68,53 +71,33 @@ public class ElementFeatures { } - public boolean similarMatches(Element element) throws PDFNetException { + public boolean isSimilarTo(ElementFeatures elementFeatures) { - return element.getType() == elementType && // - element.getBBox() != null && // - rectsSimilarMatch(element.getBBox()); + return elementFeatures.getElementType() == elementType && elementFeatures.getBoundingBox() != null && areRectsSimilar(elementFeatures.getBoundingBox()); } - @SneakyThrows - private boolean rectsSimilarMatch(Rect rect) { + private boolean areRectsSimilar(Rectangle2D rectangle2D) { // To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance - return 
similarEqualPosition(rect.getX1(), boundingBox.getX(), rect.getWidth()) && // - similarEqualPosition(rect.getY1(), boundingBox.getY(), rect.getHeight()) && // - similarEqualSize(rect.getWidth(), boundingBox.getWidth()) && // - similarEqualSize(rect.getHeight(), boundingBox.getHeight()); + return isPositionSimilar(rectangle2D.getX(), boundingBox.getX(), rectangle2D.getWidth()) && // + isPositionSimilar(rectangle2D.getY(), boundingBox.getY(), rectangle2D.getHeight()) && // + isSizeSimilar(rectangle2D.getWidth(), boundingBox.getWidth()) && // + isSizeSimilar(rectangle2D.getHeight(), boundingBox.getHeight()); } - protected boolean similarEqualPosition(double a, double b, double boxSize) { + protected boolean isPositionSimilar(double a, double b, double boxSize) { - return Math.abs(a - b) < boxSize * 0.2; + return Math.abs(a - b) < boxSize * RECT_POSITION_SIMILARITY_THRESHOLD_FACTOR; } - protected boolean similarEqualSize(double a, double b) { + protected boolean isSizeSimilar(double a, double b) { - return Math.abs(a - b) < a * 0.1; + return Math.abs(a - b) < a * RECT_SIZE_SIMILARITY_THRESHOLD_FACTOR; } - - public boolean similarMatches(ElementFeatures elementFeatures) { - - return elementFeatures.getElementType() == elementType && elementFeatures.getBoundingBox() != null && rectsSimilarMatch(elementFeatures.getBoundingBox()); - } - - - private boolean rectsSimilarMatch(Rectangle2D rectangle2D) { - // To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance - - return similarEqualPosition(rectangle2D.getX(), boundingBox.getX(), rectangle2D.getWidth()) && // - similarEqualPosition(rectangle2D.getY(), boundingBox.getY(), rectangle2D.getHeight()) && // - similarEqualSize(rectangle2D.getWidth(), boundingBox.getWidth()) && // - similarEqualSize(rectangle2D.getHeight(), boundingBox.getHeight()); - } - - @EqualsAndHashCode(callSuper = true) @Getter @SuperBuilder @@ -215,26 +198,20 @@ public class ElementFeatures { return false; 
} return super.almostMatches(elementFeatures) && this.dataSize == ((Image) elementFeatures).getDataSize() && this.height == ((Image) elementFeatures).getHeight() && this.width == ((Image) elementFeatures).getWidth() && this.renderingIntent == ((Image) elementFeatures).getRenderingIntent() && this.componentNum == ((Image) elementFeatures).getComponentNum() && this.bitsPerComponent == ((Image) elementFeatures).getBitsPerComponent() && calculateHammingDistance( - ((Image) elementFeatures).getHashOfImage()) <= 4; + ((Image) elementFeatures).getHashOfImage()) <= HAMMING_DISTANCE_THRESHOLD; } - public boolean similarMatches(Element element) throws PDFNetException { + public boolean isSimilarTo(ElementFeatures elementFeatures) { - return super.almostMatches(element) && // - dataSize == element.getImageDataSize() && // - height == element.getImageHeight() && // - width == element.getImageWidth() && // - renderingIntent == element.getImageRenderingIntent() && // - componentNum == element.getComponentNum() && // - bitsPerComponent == element.getBitsPerComponent(); - } - - - public boolean similarMatches(ElementFeatures elementFeatures) { - - return super.similarMatches(elementFeatures) && this.dataSize == ((Image) elementFeatures).getDataSize() && this.height == ((Image) elementFeatures).getHeight() && this.width == ((Image) elementFeatures).getWidth() && this.renderingIntent == ((Image) elementFeatures).getRenderingIntent() && this.componentNum == ((Image) elementFeatures).getComponentNum() && this.bitsPerComponent == ((Image) elementFeatures).getBitsPerComponent() && calculateHammingDistance( - ((Image) elementFeatures).getHashOfImage()) <= 4; + return super.isSimilarTo(elementFeatures) && // + //this.dataSize == ((Image) elementFeatures).getDataSize() && // + //this.height == ((Image) elementFeatures).getHeight() && // + //this.width == ((Image) elementFeatures).getWidth() && // + //this.renderingIntent == ((Image) elementFeatures).getRenderingIntent() && // + 
//this.componentNum == ((Image) elementFeatures).getComponentNum() && // + //this.bitsPerComponent == ((Image) elementFeatures).getBitsPerComponent() && // + calculateHammingDistance(((Image) elementFeatures).getHashOfImage()) <= HAMMING_DISTANCE_THRESHOLD; } diff --git a/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java b/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java index 4335323..d3484ec 100644 --- a/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java +++ b/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java @@ -45,8 +45,10 @@ public class WatermarkRemovalService { /** * The method remove watermark works only for Documents with size greater than MIN_PAGES_THRESHOLD. - * First the possible watermarks (big XObjects or Images) will be detected and then checked if those appear on most pages according to the - * OCCURING_ON_PAGES_THRESHOLD_FACTOR by using image hashing for similarity and size and stream size of the xobjects. + * The following watermarks will be found: big XObjects, big Images, small Images that appear in the middle of the page, and + * text that is rotated and big enough compared to the height of the page. + * First the possible watermarks will be detected and then checked if those appear on most pages according to the + * OCCURING_ON_PAGES_THRESHOLD_FACTOR. We use image hashing for similarity between pictures and size and stream size of the xobjects. * If so, these detected and confirmed will not be written to the pdf file. 
* * @param pdfFile PDFFile to remove watermarks @@ -142,16 +144,7 @@ public class WatermarkRemovalService { shouldTextSearchBeContinued(elementFeaturesLinkedList); } - if (!foundTextWatermark) { - return; - } - - if (Math.abs(element.getCTM().getB()) < Math.sin(Math.toRadians(ROTATED_TEXT_THRESHOLD)) || Math.abs(element.getCTM() - .getB()) > Math.sin(Math.toRadians(70 - ROTATED_TEXT_THRESHOLD))) { - return; - } - - if (Math.max(element.getBBox().getY1(), element.getBBox().getY2()) < page.getVisibleContentBox().getY1() + page.getPageHeight() * TEXT_POSITION_THRESHOLD) { + if (!couldTextBeAWatermark(element, page)) { return; } @@ -165,6 +158,14 @@ public class WatermarkRemovalService { } + @SneakyThrows + private boolean isTextRotated(Element element) { + + return Math.abs(element.getCTM().getB()) < Math.sin(Math.toRadians(ROTATED_TEXT_THRESHOLD)) || Math.abs(element.getCTM() + .getB()) > Math.sin(Math.toRadians(70 - ROTATED_TEXT_THRESHOLD)); + } + + private void shouldTextSearchBeContinued(List elementFeaturesLinkedList) { int countTextWatermarks = 0; @@ -245,7 +246,7 @@ public class WatermarkRemovalService { .filter(elementFeature -> formObjectsPerPage.values() .stream() .filter(elementFeaturesOnPage -> elementFeaturesOnPage.stream() - .anyMatch(elementFeature.getElementType() == Element.e_image || elementFeature.getElementType() == Element.e_inline_image ? elementFeature::similarMatches : elementFeature::almostMatches)) + .anyMatch(elementFeature.getElementType() == Element.e_image || elementFeature.getElementType() == Element.e_inline_image ? 
elementFeature::isSimilarTo : elementFeature::almostMatches)) .count() >= minPagesFilter) .toList(); } @@ -322,18 +323,7 @@ public class WatermarkRemovalService { @SneakyThrows private void processText(Element element, ElementWriter writer, List watermarksElementFeaturesList, Page page) { - if (!foundTextWatermark) { - writer.writeElement(element); - return; - } - - if (Math.abs(element.getCTM().getB()) < Math.sin(Math.toRadians(ROTATED_TEXT_THRESHOLD)) || Math.abs(element.getCTM() - .getB()) > Math.sin(Math.toRadians(70 - ROTATED_TEXT_THRESHOLD))) { - writer.writeElement(element); - return; - } - - if (Math.max(element.getBBox().getY1(), element.getBBox().getY2()) < page.getVisibleContentBox().getY1() + page.getPageHeight() * TEXT_POSITION_THRESHOLD) { + if (!couldTextBeAWatermark(element, page)) { writer.writeElement(element); return; } @@ -348,13 +338,30 @@ public class WatermarkRemovalService { } + private boolean couldTextBeAWatermark(Element element, Page page) throws PDFNetException { + + if (!foundTextWatermark) { + return false; + } + + if (isTextRotated(element)) { + return false; + } + + if (Math.max(element.getBBox().getY1(), element.getBBox().getY2()) < page.getVisibleContentBox().getY1() + page.getPageHeight() * TEXT_POSITION_THRESHOLD) { + return false; + } + return true; + } + + @SneakyThrows private void removeImages(Element element, ElementWriter writer, List watermarksElementFeaturesList) { String hashValueOfImage = ImageHashFactory.calculate(element); ElementFeatures imageFeatures = ElementFeatureFactory.extractFeaturesWithHash(element, hashValueOfImage); for (ElementFeatures elementFeatures : watermarksElementFeaturesList) { - if (elementFeatures.similarMatches(imageFeatures)) { + if (elementFeatures.isSimilarTo(imageFeatures)) { return; } }