From 51b6307f91ba4072fdad168d15c15943f336712a Mon Sep 17 00:00:00 2001 From: RaphaelArnold Date: Tue, 1 Aug 2023 17:36:52 +0200 Subject: [PATCH] RED-7075: Watermark Removal finished so far --- pom.xml | 6 + .../commons/ElementFeatureFactory.java | 80 +++++++ .../pdftronlogic/commons/ElementFeatures.java | 79 +++---- .../commons/ImageHashFactory.java | 116 ++++++++++ .../InvisibleElementRemovalService.java | 6 +- .../commons/WatermarkRemovalService.java | 217 ++++++------------ 6 files changed, 310 insertions(+), 194 deletions(-) create mode 100644 src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatureFactory.java create mode 100644 src/main/java/com/iqser/red/pdftronlogic/commons/ImageHashFactory.java diff --git a/pom.xml b/pom.xml index 4d37cab..b50f1d1 100644 --- a/pom.xml +++ b/pom.xml @@ -26,6 +26,12 @@ slf4j-api provided + + org.apache.logging.log4j + log4j-slf4j2-impl + 2.20.0 + test + com.google.guava guava diff --git a/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatureFactory.java b/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatureFactory.java new file mode 100644 index 0000000..6f1407e --- /dev/null +++ b/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatureFactory.java @@ -0,0 +1,80 @@ +package com.iqser.red.pdftronlogic.commons; + +import com.pdftron.common.PDFNetException; +import com.pdftron.pdf.Element; + +public class ElementFeatureFactory { + public static ElementFeatures extractFeatures(Element element) throws PDFNetException { + + return switch (element.getType()) { + case Element.e_path -> buildPath(element); + case Element.e_text -> buildText(element); + case Element.e_image, Element.e_inline_image -> buildImage(element).build(); + case Element.e_form -> buildForm(element); + // This technically should never happen, it's a safetynet + default -> throw new RuntimeException("Feature Extraction is not supported for PDFTron.Element with type: " + element.getType()); + }; + } + + public static ElementFeatures extractFeaturesWithHash(Element element, String hashObject) throws PDFNetException { + return buildImage(element) + .hashOfImage(hashObject) + .build(); + } + + + private static ElementFeatures.Form buildForm(Element element) throws PDFNetException { + + return ElementFeatures.Form.builder() + .elementType(element.getType()) + .boundingBox(Converter.toRectangle2D(element.getBBox())) + .xObjectType(element.getXObject().getType()) + .dictOrArrayOrStreamLength(element.getXObject().getType() == 7 ? element.getXObject().getDecodedStream().size() : 0) + .build(); + } + + + private static ElementFeatures.Image.ImageBuilder buildImage(Element element) throws PDFNetException { + + return ElementFeatures.Image.builder() + .elementType(element.getType()) + .boundingBox(Converter.toRectangle2D(element.getBBox())) + .dataSize(element.getImageDataSize()) + .height(element.getImageHeight()) + .width(element.getImageWidth()) + .renderingIntent(element.getImageRenderingIntent()) + .componentNum(element.getComponentNum()) + .bitsPerComponent(element.getBitsPerComponent()); + } + + + private static ElementFeatures.Text buildText(Element element) throws PDFNetException { + + return ElementFeatures.Text.builder() + .elementType(element.getType()) + .boundingBox(Converter.toRectangle2D(element.getBBox())) + .text(element.getTextString()) + .font(element.getGState().getFont().getType()) + .fontsize(element.getGState().getFontSize()) + .build(); + } + + + private static ElementFeatures.Path buildPath(Element element) throws PDFNetException { + + return ElementFeatures.Path.builder() + .elementType(element.getType()) + .boundingBox(Converter.toRectangle2D(element.getBBox())) + .isClippingPath(element.isClippingPath()) + .isClipWindingFill(element.isClipWindingFill()) + .isStroked(element.isStroked()) + .isFilled(element.isFilled()) + .isWindingFill(element.isWindingFill()) + .fillColor(Converter.convertColor(element.getGState().getFillColorSpace(), element.getGState().getFillColor())) + .strokeColor(Converter.convertColor(element.getGState().getStrokeColorSpace(), element.getGState().getStrokeColor())) + .linePath(Converter.convertToGeneralPathAndTransformToInitialUserSpace(element.getPathData(), element.getCTM())) + .build(); + } + + +} diff --git a/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java b/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java index 12e5733..2107928 100644 --- a/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java +++ b/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java @@ -26,49 +26,6 @@ public class ElementFeatures { int elementType; Rectangle2D boundingBox; - public static ElementFeatures extractFeatures(Element element) throws PDFNetException { - - return switch (element.getType()) { - case Element.e_path -> Path.builder() - .elementType(element.getType()) - .boundingBox(Converter.toRectangle2D(element.getBBox())) - .isClippingPath(element.isClippingPath()) - .isClipWindingFill(element.isClipWindingFill()) - .isStroked(element.isStroked()) - .isFilled(element.isFilled()) - .isWindingFill(element.isWindingFill()) - .fillColor(Converter.convertColor(element.getGState().getFillColorSpace(), element.getGState().getFillColor())) - .strokeColor(Converter.convertColor(element.getGState().getStrokeColorSpace(), element.getGState().getStrokeColor())) - .linePath(Converter.convertToGeneralPathAndTransformToInitialUserSpace(element.getPathData(), element.getCTM())) - .build(); - case Element.e_text -> Text.builder() - .elementType(element.getType()) - .boundingBox(Converter.toRectangle2D(element.getBBox())) - .text(element.getTextString()) - .font(element.getGState().getFont().getType()) - .fontsize(element.getGState().getFontSize()) - .build(); - case Element.e_image, Element.e_inline_image -> Image.builder() - .elementType(element.getType()) - .boundingBox(Converter.toRectangle2D(element.getBBox())) - .dataSize(element.getImageDataSize()) - .height(element.getImageHeight()) - .width(element.getImageWidth()) - .renderingIntent(element.getImageRenderingIntent()) - .componentNum(element.getComponentNum()) - .bitsPerComponent(element.getBitsPerComponent()) - .build(); - case Element.e_form -> Form.builder() - .elementType(element.getType()) - .boundingBox(Converter.toRectangle2D(element.getBBox())) - .xObjectType(element.getXObject().getType()) - .dictOrArrayOrStreamLength(element.getXObject().getType() == 7 ? element.getXObject().getDecodedStream().size() : 0) - .build(); - // This technically should never happen, it's a safetynet - default -> throw new RuntimeException("Feature Extraction is not supported for PDFTron.Element with type: " + element.getType()); - }; - } - public boolean almostMatches(Element element) throws PDFNetException { @@ -115,7 +72,7 @@ public class ElementFeatures { @Getter @SuperBuilder @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) - private static class Text extends ElementFeatures { + public static class Text extends ElementFeatures { String text; int font; @@ -181,7 +138,7 @@ public class ElementFeatures { @Getter @SuperBuilder @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) - private static class Image extends ElementFeatures { + public static class Image extends ElementFeatures { int dataSize; int height; @@ -189,7 +146,7 @@ public class ElementFeatures { int renderingIntent; int componentNum; int bitsPerComponent; - + String hashOfImage; @Override public boolean almostMatches(Element element) throws PDFNetException { @@ -203,13 +160,41 @@ public class ElementFeatures { bitsPerComponent == element.getBitsPerComponent(); } + public boolean almostMatches(ElementFeatures elementFeatures){ + if(elementFeatures.getClass() != this.getClass()){ + return false; + } + return super.almostMatches(elementFeatures) && + this.dataSize == ((Image) elementFeatures).getDataSize() && + this.height == ((Image) elementFeatures).getHeight() && + this.width == ((Image) elementFeatures).getWidth() && + this.renderingIntent == ((Image) elementFeatures).getRenderingIntent() && + this.componentNum == ((Image) elementFeatures).getComponentNum() && + this.bitsPerComponent == ((Image) elementFeatures).getBitsPerComponent() && + calculateHammingDistance(((Image) elementFeatures).getHashOfImage()) <=4; + } + + // Helper method to calculate the Hamming distance between two hexadecimal strings + private int calculateHammingDistance(String hash2) { + int distance = 0; + int maxLength = Math.max(this.hashOfImage.length(), hash2.length()); + for (int i = 0; i < maxLength; i++) { + char char1 = (i < this.hashOfImage.length()) ? this.hashOfImage.charAt(i) : '0'; + char char2 = (i < hash2.length()) ? hash2.charAt(i) : '0'; + if (char1 != char2) { + distance++; + } + } + return distance; + } + } @EqualsAndHashCode(callSuper = true) @Getter @SuperBuilder @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) - private static class Form extends ElementFeatures { + public static class Form extends ElementFeatures { int xObjectType; long dictOrArrayOrStreamLength; diff --git a/src/main/java/com/iqser/red/pdftronlogic/commons/ImageHashFactory.java b/src/main/java/com/iqser/red/pdftronlogic/commons/ImageHashFactory.java new file mode 100644 index 0000000..9e8f701 --- /dev/null +++ b/src/main/java/com/iqser/red/pdftronlogic/commons/ImageHashFactory.java @@ -0,0 +1,116 @@ +package com.iqser.red.pdftronlogic.commons; + +import java.awt.Color; +import java.awt.image.BufferedImage; +import java.io.ByteArrayInputStream; + +import javax.imageio.ImageIO; + +import com.pdftron.filters.FilterWriter; +import com.pdftron.filters.MemoryFilter; +import com.pdftron.pdf.Element; + +import lombok.SneakyThrows; +import lombok.experimental.UtilityClass; + +@UtilityClass +public class ImageHashFactory { + + @SneakyThrows + private byte[] getBytesOfImage(com.pdftron.pdf.Image inputImage) { + // 0 because the memory filter determines the size + var memFilter = new MemoryFilter(0, false); + var filterWriter = new FilterWriter(memFilter); + + inputImage.export(filterWriter); + filterWriter.flushAll(); + byte[] res = memFilter.getBuffer(); + + memFilter.flushAll(); + memFilter.destroy(); + filterWriter.destroy(); + return res; + } + @SneakyThrows + public String calculate(Element element) { + com.pdftron.pdf.Image image = new com.pdftron.pdf.Image(element.getXObject()); + + byte[] imageBytes = getBytesOfImage(image); + ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(imageBytes); + BufferedImage image1 = ImageIO.read(byteArrayInputStream); + + String hash = getSimplePHash(image1); + + return hash; + + } + + public String getSimplePHash(BufferedImage image) { + // Resize the image to a fixed size (e.g., 8x8 pixels) + int targetWidth = 8; + int targetHeight = 8; + BufferedImage resizedImage = new BufferedImage(targetWidth, targetHeight, BufferedImage.TYPE_INT_ARGB); + resizedImage.getGraphics().drawImage(image.getScaledInstance(targetWidth, targetHeight, java.awt.Image.SCALE_SMOOTH), 0, 0, targetWidth, targetHeight, null); + + // Convert the image to grayscale + BufferedImage grayscaleImage = new BufferedImage(targetWidth, targetHeight, BufferedImage.TYPE_BYTE_GRAY); + grayscaleImage.getGraphics().drawImage(resizedImage, 0, 0, null); + + // Calculate the average grayscale pixel value + int average = calculateAverage(grayscaleImage); + + // Create a binary hash based on pixel values + StringBuilder hashBuilder = new StringBuilder(); + for (int y = 0; y < targetHeight; y++) { + for (int x = 0; x < targetWidth; x++) { + int pixelValue = new Color(grayscaleImage.getRGB(x, y)).getRed(); + if (pixelValue > average) { + hashBuilder.append("1"); + } else { + hashBuilder.append("0"); + } + } + } + return hashBuilder.toString(); + } + + // Helper method to calculate the average grayscale pixel value + private int calculateAverage(BufferedImage image) { + int total = 0; + int width = image.getWidth(); + int height = image.getHeight(); + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + total += new Color(image.getRGB(x, y)).getRed(); + } + } + return total / (width * height); + } + + // to hash images either use getDHash or getSimplePHash + public String getDHash(BufferedImage image) throws Exception { + BufferedImage resizedImage = resizeImage(image, 9, 8); // Resize image to 9x8 for dHash + + long hash = 0L; + for (int y = 0; y < 8; y++) { + for (int x = 0; x < 8; x++) { + int leftPixel = resizedImage.getRGB(x, y); + int rightPixel = resizedImage.getRGB(x + 1, y); + hash <<= 1; + hash |= (leftPixel < rightPixel) ? 1 : 0; + } + } + + return Long.toHexString(hash); + } + + // Helper method to resize the image to the desired dimensions + private BufferedImage resizeImage(BufferedImage image, int width, int height) { + BufferedImage resizedImage = new BufferedImage(width, height, BufferedImage.TYPE_INT_ARGB); + resizedImage.getGraphics().drawImage(image, 0, 0, width, height, null); + return resizedImage; + } + + + +} diff --git a/src/main/java/com/iqser/red/pdftronlogic/commons/InvisibleElementRemovalService.java b/src/main/java/com/iqser/red/pdftronlogic/commons/InvisibleElementRemovalService.java index 8841398..fe7ba36 100644 --- a/src/main/java/com/iqser/red/pdftronlogic/commons/InvisibleElementRemovalService.java +++ b/src/main/java/com/iqser/red/pdftronlogic/commons/InvisibleElementRemovalService.java @@ -167,7 +167,7 @@ public class InvisibleElementRemovalService { boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight()); if (!context.delta() && inClippingPath) { - context.visibleElements().add(ElementFeatures.extractFeatures(imageElement)); + context.visibleElements().add(ElementFeatureFactory.extractFeatures(imageElement)); } if (context.delta() ^ inClippingPath) { @@ -192,7 +192,7 @@ public class InvisibleElementRemovalService { boolean isTextVisible = isTextRenderedVisibly(gState, textBBox, context); if (inClippingPath && isTextVisible) { - context.visibleElements().add(ElementFeatures.extractFeatures(textElement)); + context.visibleElements().add(ElementFeatureFactory.extractFeatures(textElement)); } if (!context.delta()) { if (inClippingPath && isTextVisible) { @@ -291,7 +291,7 @@ public class InvisibleElementRemovalService { context.overlappedElements().addAll(currentOverlappedElements); context.visibleElements().removeAll(currentOverlappedElements); } - context.visibleElements().add(ElementFeatures.extractFeatures(pathElement)); + context.visibleElements().add(ElementFeatureFactory.extractFeatures(pathElement)); if (!context.delta()) { writer.writeElement(pathElement); } diff --git a/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java b/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java index 609b1d9..33435fe 100644 --- a/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java +++ b/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java @@ -1,61 +1,52 @@ package com.iqser.red.pdftronlogic.commons; -import java.awt.Image; -import java.awt.Toolkit; -import java.awt.geom.Rectangle2D; -import java.awt.image.BufferedImage; -import java.io.ByteArrayInputStream; -import java.io.File; import java.io.InputStream; import java.io.OutputStream; import java.util.Collection; import java.util.HashMap; import java.util.LinkedList; import java.util.List; -import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.TreeSet; -import javax.imageio.ImageIO; - import com.pdftron.common.PDFNetException; -import com.pdftron.filters.FileDescriptorFilter; -import com.pdftron.filters.Filter; -import com.pdftron.filters.FilterReader; -import com.pdftron.filters.FilterWriter; -import com.pdftron.pdf.ColorPt; -import com.pdftron.pdf.ColorSpace; import com.pdftron.pdf.Element; -import com.pdftron.pdf.ElementBuilder; import com.pdftron.pdf.ElementReader; import com.pdftron.pdf.ElementWriter; -import com.pdftron.pdf.Image2RGB; -import com.pdftron.pdf.Optimizer; import com.pdftron.pdf.PDFDoc; import com.pdftron.pdf.Page; import com.pdftron.pdf.PageIterator; import com.pdftron.sdf.SDFDoc; import lombok.SneakyThrows; +import lombok.experimental.UtilityClass; +import lombok.extern.slf4j.Slf4j; +@UtilityClass +@Slf4j public class WatermarkRemovalService { final static double AREA_THRESHOLD = 0.6; // multiplied with page area - final static double OCCURING_ON_PAGES_THRESHOLD_FACTOR = 0.4; // multiplied with number of pages + final static double OCCURING_ON_PAGES_THRESHOLD_FACTOR = 0.75; // multiplied with number of pages + + final static int MIN_PAGES_THRESHOLD = 3; @SneakyThrows - public static void removeWatermarks(InputStream pdfFile, OutputStream out) { + public void removeWatermarks(InputStream pdfFile, OutputStream out) { PDFDoc pdfDoc = new PDFDoc(pdfFile); + if(pdfDoc.getPageCount() < MIN_PAGES_THRESHOLD){ + log.debug("Document page count {} is below threshold {}", pdfDoc.getPageCount(), MIN_PAGES_THRESHOLD); + return; + } + Map> formObjectsForPages = findAllFormObjectsAndImages(pdfDoc); List watermarkElementFeatures = filterSameFormObjectsOccuringOnMostPages(formObjectsForPages); - storeWatermarkImageHashValues(watermarkElementFeatures); - removeAllWatermarks(pdfDoc, watermarkElementFeatures); try { @@ -68,15 +59,6 @@ public class WatermarkRemovalService { } - private static void storeWatermarkImageHashValues(List watermarkElementFeatures) { - for(ElementFeatures elementFeatures : watermarkElementFeatures){ - if(elementFeatures.getElementType() == Element.e_image || elementFeatures.getElementType() == Element.e_inline_image){ - - } - } - } - - @SneakyThrows private static Map> findAllFormObjectsAndImages(PDFDoc pdfDoc) { @@ -86,6 +68,8 @@ public class WatermarkRemovalService { ElementReader reader = new ElementReader(); + + for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) { Page page = iterator.next(); @@ -96,19 +80,7 @@ public class WatermarkRemovalService { reader.begin(page); for (Element element = reader.next(); element != null; element = reader.next()) { - if(element.getBBox() == null){ - continue; - } - if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringFromPage) { - continue; - } - - if (element.getType() == Element.e_form) { - //processXObject(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringFromPage); - } else if (element.getType() == Element.e_image || element.getType() == Element.e_inline_image) { - // causes empty pages so far - processImages(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage); - } + processElement(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringFromPage); } formObjectsAndImagesForPages.put(page.getSDFObj().getObjNum(), elementFeaturesLinkedList); @@ -120,93 +92,54 @@ public class WatermarkRemovalService { } - @SneakyThrows - private static void processImages(Element element, - Set visitedXObjIds, - LinkedList elementFeaturesLinkedList, - List formObjectsOccuringMoreThanOnceOnAPage) { + private static void processElement(Element element, + Set visitedXObjIds, + List elementFeaturesLinkedList, + List formObjectsOccuringMoreThanOnceOnAPage, + double minAreaCoveringPage) throws PDFNetException { - if(element.getType() == Element.e_image) { - - //element.getImageData(); - - /*com.pdftron.pdf.Image image = new com.pdftron.pdf.Image(element.getXObject()); - System.out.println(image.getImageDataSize()); - //element.getImageData().writeToFile("C:\\Users\\RaphaelArnold\\AppData\\Local\\Temp\\" + "IMAGE1", false); - String fname = "C:\\Users\\RaphaelArnold\\AppData\\Local\\Temp\\" + "IMAGE.png"; - image.exportAsPng(fname); - - Image2RGB img_conv = new Image2RGB(element); - FilterReader reader = new com.pdftron.filters.FilterReader(img_conv); - byte[] image_data_out = new byte[element.getImageWidth() * element.getImageHeight() * 3]; - reader.read(image_data_out); - System.out.println("he"); - - BufferedImage bufferedImage = ImageIO.read(new ByteArrayInputStream(image_data_out)); - bufferedImage.getScaledInstance(10,10,0);*/ - - - - //Optimizer.ImageSettings imageSettings = new Optimizer.ImageSettings(); - - - /*Image img = image.getBitmap(); - - BufferedImage bufferedImage= new BufferedImage(img.getWidth(null), img.getHeight(null), BufferedImage.TYPE_INT_RGB); - img.getGraphics().drawImage(img, 0, 0, null); - ImageIO.write(bufferedImage, "jpg", new File("C:\\myImage.jpg"));*/ + if (element.getBBox() == null) { + return; } - ElementFeatures elementFeatures = ElementFeatures.extractFeatures(element); + if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringPage) { + return; + } + + if (element.getType() == Element.e_form) { + processXObject(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage); + } else if (element.getType() == Element.e_image || element.getType() == Element.e_inline_image) { + processImages(element, elementFeaturesLinkedList); + } + } + + + @SneakyThrows + private static void processImages(Element element, List elementFeaturesLinkedList) { + + String hashOfImage = ImageHashFactory.calculate(element); + ElementFeatures elementFeatures = ElementFeatureFactory.extractFeaturesWithHash(element, hashOfImage); elementFeaturesLinkedList.add(elementFeatures); } @SneakyThrows - private static boolean processXObject(Element element, + private static void processXObject(Element element, Set visitedXObjIds, - LinkedList elementFeaturesLinkedList, + List elementFeaturesLinkedList, List formObjectsOccuringMoreThanOnceOnAPage, - double minAreaCoveringPage) { - - /*for(ElementFeatures elementFeatures1 : formObjectsOccuringMoreThanOnceOnAPage){ - if(elementFeatures1.almostMatches(element)){ - return; - } - } - - for (ElementFeatures elementFeatures1 : elementFeaturesLinkedList) { - if (elementFeatures1.almostMatches(element)) { - ElementFeatures elementFeatures = ElementFeatures.extractFeatures(element); - formObjectsOccuringMoreThanOnceOnAPage.add(elementFeatures); - elementFeaturesLinkedList.remove(elementFeatures); - return; - } - }*/ - + double minAreaCoveringPage) { if (visitedXObjIds.add(element.getXObject().getObjNum())) { - ElementReader xObjectReader = new ElementReader(); xObjectReader.begin(element.getXObject()); - boolean isContainingImageBigEnough = true; for (Element element1 = xObjectReader.next(); element1 != null; element1 = xObjectReader.next()) { - if (element1.getType() == Element.e_form) { - isContainingImageBigEnough = processXObject(element1, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage); - } else if((element1.getType() == Element.e_image || element1.getType() == Element.e_inline_image)) { - if(element1.getImageHeight()*element1.getImageWidth() < minAreaCoveringPage){ - xObjectReader.destroy(); - return false; - } - } - } - if(isContainingImageBigEnough) { - elementFeaturesLinkedList.add(ElementFeatures.extractFeatures(element)); + processElement(element1, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage); } + elementFeaturesLinkedList.add(ElementFeatureFactory.extractFeatures(element)); xObjectReader.destroy(); } else { - elementFeaturesLinkedList.add(ElementFeatures.extractFeatures(element)); + elementFeaturesLinkedList.add(ElementFeatureFactory.extractFeatures(element)); } - return true; } @@ -259,29 +192,46 @@ public class WatermarkRemovalService { reader.begin(page); writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict()); - processElements(reader, writer, watermarksElementFeaturesList, visitedXObjIds); + processElements(page, reader, writer, watermarksElementFeaturesList, visitedXObjIds); writer.end(); reader.end(); } - private static void processElements(ElementReader reader, + private static void processElements(Page page, + ElementReader reader, ElementWriter writer, List watermarksElementFeaturesList, Set visitedXObjIds) throws PDFNetException { - for (Element element = reader.next(); element != null; element = reader.next()) + double minAreaCoveringFromPage = AREA_THRESHOLD * page.getPageHeight() * page.getPageWidth(); + for (Element element = reader.next(); element != null; element = reader.next()) { + switch (element.getType()) { - case Element.e_image, Element.e_inline_image -> removeImages(element,reader,writer, watermarksElementFeaturesList); - case Element.e_form -> processForms(element, reader, writer, watermarksElementFeaturesList, visitedXObjIds); + case Element.e_image, Element.e_inline_image -> { + if (element.getBBox() == null) { + continue; + } + if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringFromPage) { + writer.writeElement(element); + continue; + } + removeImages(element, writer, watermarksElementFeaturesList); + } + case Element.e_form -> processForms(page, element, reader, writer, watermarksElementFeaturesList, visitedXObjIds); default -> writer.writeElement(element); } + } } + @SneakyThrows - private static void removeImages(Element element, ElementReader reader, ElementWriter writer, List watermarksElementFeaturesList) { + private static void removeImages(Element element, ElementWriter writer, List watermarksElementFeaturesList) { + + String hashValueOfImage = ImageHashFactory.calculate(element); + ElementFeatures imageFeatures = ElementFeatureFactory.extractFeaturesWithHash(element, hashValueOfImage); for (ElementFeatures elementFeatures : watermarksElementFeaturesList) { - if (elementFeatures.almostMatches(element)) { + if (elementFeatures.almostMatches(imageFeatures)) { return; } } @@ -290,11 +240,8 @@ public class WatermarkRemovalService { } - /* - Maybe problem with visitedXObjIds, because, if on same page there are two identical xobjects - but one is inside another xObject, the other is directly - */ - private static void processForms(Element element, + private static void processForms(Page page, + Element element, ElementReader reader, ElementWriter writer, List watermarksElementFeaturesList, @@ -319,7 +266,7 @@ public class WatermarkRemovalService { reader.clearChangeList(); formWriter.setDefaultGState(reader); - processElements(reader, formWriter, watermarksElementFeaturesList, visitedXObjIds); + processElements(page, reader, formWriter, watermarksElementFeaturesList, visitedXObjIds); formWriter.end(); formWriter.destroy(); reader.end(); @@ -327,22 +274,4 @@ public class WatermarkRemovalService { } - - @SneakyThrows - private static void drawBBox(ElementWriter writer, Rectangle2D r, String hexcolor) { - - ColorPt colorPt = new ColorPt(Integer.valueOf(hexcolor.substring(1, 3), 16) / 255d, - Integer.valueOf(hexcolor.substring(3, 5), 16) / 255d, - Integer.valueOf(hexcolor.substring(5, 7), 16) / 255d); - ElementBuilder eb = new ElementBuilder(); - Element rect = eb.createRect(r.getX(), r.getY(), r.getWidth(), r.getHeight()); - rect.setPathStroke(true); - rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB()); - rect.getGState().setStrokeColor(colorPt); - writer.writePlacedElement(rect); - - colorPt.destroy(); - eb.destroy(); - } - }