diff --git a/pom.xml b/pom.xml
index 4d37cab..b50f1d1 100644
--- a/pom.xml
+++ b/pom.xml
@@ -26,6 +26,12 @@
slf4j-api
provided
+
+ org.apache.logging.log4j
+ log4j-slf4j2-impl
+ 2.20.0
+ test
+
com.google.guava
guava
diff --git a/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatureFactory.java b/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatureFactory.java
new file mode 100644
index 0000000..6f1407e
--- /dev/null
+++ b/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatureFactory.java
@@ -0,0 +1,80 @@
+package com.iqser.red.pdftronlogic.commons;
+
+import com.pdftron.common.PDFNetException;
+import com.pdftron.pdf.Element;
+
+/**
+ * Static factory that turns a PDFTron {@link Element} into the {@link ElementFeatures} subtype matching its type.
+ */
+public final class ElementFeatureFactory {
+
+    // XObject type code for which the decoded stream size is recorded as the form's content length.
+    // NOTE(review): 7 is presumably com.pdftron.sdf.Obj.e_stream - confirm against the PDFTron API.
+    private static final int STREAM_XOBJECT_TYPE = 7;
+
+
+    private ElementFeatureFactory() {
+        // static helpers only, never instantiated
+    }
+
+
+    /**
+     * Builds the features of a path, text, image, inline-image or form element.
+     *
+     * @param element the element to describe
+     * @return the features matching the element's type; image features are built without an image hash
+     * @throws PDFNetException          if PDFTron fails while reading a property of the element
+     * @throws IllegalArgumentException if the element has any other type
+     */
+    public static ElementFeatures extractFeatures(Element element) throws PDFNetException {
+
+        return switch (element.getType()) {
+            case Element.e_path -> buildPath(element);
+            case Element.e_text -> buildText(element);
+            case Element.e_image, Element.e_inline_image -> buildImage(element).build();
+            case Element.e_form -> buildForm(element);
+            // This technically should never happen, it's a safety net
+            default -> throw new IllegalArgumentException("Feature Extraction is not supported for PDFTron.Element with type: " + element.getType());
+        };
+    }
+
+
+    /**
+     * Builds image features for the element and attaches the given hash to them.
+     * The element type is not checked here; the image properties are read from whatever element is passed in.
+     *
+     * @param element    the element whose image properties are read
+     * @param hashObject the hash stored as {@code hashOfImage} of the resulting features
+     * @return the image features including the hash
+     * @throws PDFNetException if PDFTron fails while reading a property of the element
+     */
+    public static ElementFeatures extractFeaturesWithHash(Element element, String hashObject) throws PDFNetException {
+
+        return buildImage(element)
+                .hashOfImage(hashObject)
+                .build();
+    }
+
+
+    // Describes a form XObject; the content length is only measured for stream objects, otherwise it is 0.
+    private static ElementFeatures.Form buildForm(Element element) throws PDFNetException {
+
+        var xObject = element.getXObject();
+        int xObjectType = xObject.getType();
+        return ElementFeatures.Form.builder()
+                .elementType(element.getType())
+                .boundingBox(Converter.toRectangle2D(element.getBBox()))
+                .xObjectType(xObjectType)
+                .dictOrArrayOrStreamLength(xObjectType == STREAM_XOBJECT_TYPE ? xObject.getDecodedStream().size() : 0)
+                .build();
+    }
+
+
+    // Returns the not yet built builder so that callers can still add the optional image hash.
+    private static ElementFeatures.Image.ImageBuilder<?, ?> buildImage(Element element) throws PDFNetException {
+
+        return ElementFeatures.Image.builder()
+                .elementType(element.getType())
+                .boundingBox(Converter.toRectangle2D(element.getBBox()))
+                .dataSize(element.getImageDataSize())
+                .height(element.getImageHeight())
+                .width(element.getImageWidth())
+                .renderingIntent(element.getImageRenderingIntent())
+                .componentNum(element.getComponentNum())
+                .bitsPerComponent(element.getBitsPerComponent());
+    }
+
+
+    // Describes a text element by its string content, font type and font size.
+    private static ElementFeatures.Text buildText(Element element) throws PDFNetException {
+
+        var gState = element.getGState();
+        return ElementFeatures.Text.builder()
+                .elementType(element.getType())
+                .boundingBox(Converter.toRectangle2D(element.getBBox()))
+                .text(element.getTextString())
+                .font(gState.getFont().getType())
+                .fontsize(gState.getFontSize())
+                .build();
+    }
+
+
+    // Describes a path element by its clipping/fill/stroke flags, its colors and its converted outline.
+    private static ElementFeatures.Path buildPath(Element element) throws PDFNetException {
+
+        return ElementFeatures.Path.builder()
+                .elementType(element.getType())
+                .boundingBox(Converter.toRectangle2D(element.getBBox()))
+                .isClippingPath(element.isClippingPath())
+                .isClipWindingFill(element.isClipWindingFill())
+                .isStroked(element.isStroked())
+                .isFilled(element.isFilled())
+                .isWindingFill(element.isWindingFill())
+                .fillColor(Converter.convertColor(element.getGState().getFillColorSpace(), element.getGState().getFillColor()))
+                .strokeColor(Converter.convertColor(element.getGState().getStrokeColorSpace(), element.getGState().getStrokeColor()))
+                .linePath(Converter.convertToGeneralPathAndTransformToInitialUserSpace(element.getPathData(), element.getCTM()))
+                .build();
+    }
+
+}
diff --git a/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java b/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java
index 12e5733..2107928 100644
--- a/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java
+++ b/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java
@@ -26,49 +26,6 @@ public class ElementFeatures {
int elementType;
Rectangle2D boundingBox;
- public static ElementFeatures extractFeatures(Element element) throws PDFNetException {
-
- return switch (element.getType()) {
- case Element.e_path -> Path.builder()
- .elementType(element.getType())
- .boundingBox(Converter.toRectangle2D(element.getBBox()))
- .isClippingPath(element.isClippingPath())
- .isClipWindingFill(element.isClipWindingFill())
- .isStroked(element.isStroked())
- .isFilled(element.isFilled())
- .isWindingFill(element.isWindingFill())
- .fillColor(Converter.convertColor(element.getGState().getFillColorSpace(), element.getGState().getFillColor()))
- .strokeColor(Converter.convertColor(element.getGState().getStrokeColorSpace(), element.getGState().getStrokeColor()))
- .linePath(Converter.convertToGeneralPathAndTransformToInitialUserSpace(element.getPathData(), element.getCTM()))
- .build();
- case Element.e_text -> Text.builder()
- .elementType(element.getType())
- .boundingBox(Converter.toRectangle2D(element.getBBox()))
- .text(element.getTextString())
- .font(element.getGState().getFont().getType())
- .fontsize(element.getGState().getFontSize())
- .build();
- case Element.e_image, Element.e_inline_image -> Image.builder()
- .elementType(element.getType())
- .boundingBox(Converter.toRectangle2D(element.getBBox()))
- .dataSize(element.getImageDataSize())
- .height(element.getImageHeight())
- .width(element.getImageWidth())
- .renderingIntent(element.getImageRenderingIntent())
- .componentNum(element.getComponentNum())
- .bitsPerComponent(element.getBitsPerComponent())
- .build();
- case Element.e_form -> Form.builder()
- .elementType(element.getType())
- .boundingBox(Converter.toRectangle2D(element.getBBox()))
- .xObjectType(element.getXObject().getType())
- .dictOrArrayOrStreamLength(element.getXObject().getType() == 7 ? element.getXObject().getDecodedStream().size() : 0)
- .build();
- // This technically should never happen, it's a safetynet
- default -> throw new RuntimeException("Feature Extraction is not supported for PDFTron.Element with type: " + element.getType());
- };
- }
-
public boolean almostMatches(Element element) throws PDFNetException {
@@ -115,7 +72,7 @@ public class ElementFeatures {
@Getter
@SuperBuilder
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
- private static class Text extends ElementFeatures {
+ public static class Text extends ElementFeatures {
String text;
int font;
@@ -181,7 +138,7 @@ public class ElementFeatures {
@Getter
@SuperBuilder
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
- private static class Image extends ElementFeatures {
+ public static class Image extends ElementFeatures {
int dataSize;
int height;
@@ -189,7 +146,7 @@ public class ElementFeatures {
int renderingIntent;
int componentNum;
int bitsPerComponent;
-
+ String hashOfImage;
@Override
public boolean almostMatches(Element element) throws PDFNetException {
@@ -203,13 +160,41 @@ public class ElementFeatures {
bitsPerComponent == element.getBitsPerComponent();
}
+        /**
+         * Compares this image with already extracted features instead of a live PDFTron element.
+         *
+         * @param elementFeatures the features to compare against; {@code null} or a different feature class never matches
+         * @return {@code true} if the superclass comparison succeeds, all image properties are equal and the
+         * image hashes differ in at most 4 positions
+         */
+        public boolean almostMatches(ElementFeatures elementFeatures) {
+            if (elementFeatures == null || elementFeatures.getClass() != this.getClass()) {
+                return false;
+            }
+            // safe: the class check above guarantees an Image
+            Image other = (Image) elementFeatures;
+            return super.almostMatches(elementFeatures)
+                    && this.dataSize == other.getDataSize()
+                    && this.height == other.getHeight()
+                    && this.width == other.getWidth()
+                    && this.renderingIntent == other.getRenderingIntent()
+                    && this.componentNum == other.getComponentNum()
+                    && this.bitsPerComponent == other.getBitsPerComponent()
+                    && calculateHammingDistance(other.getHashOfImage()) <= 4;
+        }
+
+        /**
+         * Counts the positions at which this image's hash and the given hash differ, comparing character by
+         * character. If the hashes differ in length, the shorter one is treated as if padded with '0' at its end.
+         *
+         * @param hash2 the hash to compare against, may be {@code null}
+         * @return the number of differing positions; 0 if both hashes are missing and
+         * {@link Integer#MAX_VALUE} if only one of them is missing, because they cannot be compared then
+         */
+        private int calculateHammingDistance(String hash2) {
+            String hash1 = this.hashOfImage;
+            if (hash1 == null || hash2 == null) {
+                // image features built without a hash carry null here
+                return (hash1 == null && hash2 == null) ? 0 : Integer.MAX_VALUE;
+            }
+            int distance = 0;
+            int maxLength = Math.max(hash1.length(), hash2.length());
+            for (int i = 0; i < maxLength; i++) {
+                char char1 = (i < hash1.length()) ? hash1.charAt(i) : '0';
+                char char2 = (i < hash2.length()) ? hash2.charAt(i) : '0';
+                if (char1 != char2) {
+                    distance++;
+                }
+            }
+            return distance;
+        }
+
}
@EqualsAndHashCode(callSuper = true)
@Getter
@SuperBuilder
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
- private static class Form extends ElementFeatures {
+ public static class Form extends ElementFeatures {
int xObjectType;
long dictOrArrayOrStreamLength;
diff --git a/src/main/java/com/iqser/red/pdftronlogic/commons/ImageHashFactory.java b/src/main/java/com/iqser/red/pdftronlogic/commons/ImageHashFactory.java
new file mode 100644
index 0000000..9e8f701
--- /dev/null
+++ b/src/main/java/com/iqser/red/pdftronlogic/commons/ImageHashFactory.java
@@ -0,0 +1,116 @@
+package com.iqser.red.pdftronlogic.commons;
+
+import java.awt.Color;
+import java.awt.image.BufferedImage;
+import java.io.ByteArrayInputStream;
+
+import javax.imageio.ImageIO;
+
+import com.pdftron.filters.FilterWriter;
+import com.pdftron.filters.MemoryFilter;
+import com.pdftron.pdf.Element;
+
+import lombok.SneakyThrows;
+import lombok.experimental.UtilityClass;
+
+/**
+ * Computes small perceptual hashes of images so that images which look (almost) the same can be recognised
+ * by comparing their hashes.
+ */
+@UtilityClass
+public class ImageHashFactory {
+
+    /**
+     * Exports the PDF image into an in-memory buffer.
+     *
+     * @param inputImage the PDF image to export
+     * @return the bytes written by the export
+     */
+    @SneakyThrows
+    private byte[] getBytesOfImage(com.pdftron.pdf.Image inputImage) {
+        // 0 because the memory filter determines the size
+        var memFilter = new MemoryFilter(0, false);
+        var filterWriter = new FilterWriter(memFilter);
+        try {
+            inputImage.export(filterWriter);
+            filterWriter.flushAll();
+            byte[] res = memFilter.getBuffer();
+
+            memFilter.flushAll();
+            return res;
+        } finally {
+            // release the native filters even when the export fails
+            try {
+                memFilter.destroy();
+            } finally {
+                filterWriter.destroy();
+            }
+        }
+    }
+
+
+    /**
+     * Calculates the hash of the image behind the given element using {@link #getSimplePHash(BufferedImage)}.
+     *
+     * @param element the element whose XObject is read as an image
+     * @return the hash of the image
+     * @throws IllegalStateException if the exported image data cannot be decoded by ImageIO
+     */
+    @SneakyThrows
+    public String calculate(Element element) {
+        com.pdftron.pdf.Image pdfImage = new com.pdftron.pdf.Image(element.getXObject());
+
+        byte[] imageBytes = getBytesOfImage(pdfImage);
+        BufferedImage decodedImage = ImageIO.read(new ByteArrayInputStream(imageBytes));
+        if (decodedImage == null) {
+            // ImageIO.read returns null when no registered reader is able to decode the data
+            throw new IllegalStateException("Exported image data of the PDF element could not be decoded by ImageIO");
+        }
+
+        return getSimplePHash(decodedImage);
+    }
+
+
+    /**
+     * Calculates an average hash: the image is scaled to 8x8 pixels, converted to grayscale and every pixel is
+     * compared with the average gray value.
+     *
+     * @param image the image to hash
+     * @return 64 characters, row by row, '1' for a pixel brighter than the average and '0' otherwise
+     */
+    public String getSimplePHash(BufferedImage image) {
+        // Resize the image to a fixed size (e.g., 8x8 pixels)
+        int targetWidth = 8;
+        int targetHeight = 8;
+        BufferedImage resizedImage = new BufferedImage(targetWidth, targetHeight, BufferedImage.TYPE_INT_ARGB);
+        var resizeGraphics = resizedImage.getGraphics();
+        try {
+            resizeGraphics.drawImage(image.getScaledInstance(targetWidth, targetHeight, java.awt.Image.SCALE_SMOOTH), 0, 0, targetWidth, targetHeight, null);
+        } finally {
+            resizeGraphics.dispose();
+        }
+
+        BufferedImage grayscaleImage = toGrayscale(resizedImage);
+
+        // Calculate the average grayscale pixel value
+        int average = calculateAverage(grayscaleImage);
+
+        // Create a binary hash based on pixel values
+        StringBuilder hashBuilder = new StringBuilder(targetWidth * targetHeight);
+        for (int y = 0; y < targetHeight; y++) {
+            for (int x = 0; x < targetWidth; x++) {
+                int pixelValue = new Color(grayscaleImage.getRGB(x, y)).getRed();
+                hashBuilder.append(pixelValue > average ? '1' : '0');
+            }
+        }
+        return hashBuilder.toString();
+    }
+
+
+    // Helper method to calculate the average grayscale pixel value (red channel of every pixel)
+    private int calculateAverage(BufferedImage image) {
+        int total = 0;
+        int width = image.getWidth();
+        int height = image.getHeight();
+        for (int y = 0; y < height; y++) {
+            for (int x = 0; x < width; x++) {
+                total += new Color(image.getRGB(x, y)).getRed();
+            }
+        }
+        return total / (width * height);
+    }
+
+
+    /**
+     * Calculates a difference hash: the image is scaled to 9x8 pixels, converted to grayscale and every pixel
+     * is compared with its right neighbour. To hash images either use getDHash or getSimplePHash.
+     *
+     * @param image the image to hash
+     * @return the 64 bit hash as 16 hexadecimal digits, zero-padded so that hashes can be compared position
+     * by position
+     */
+    public String getDHash(BufferedImage image) {
+        // 9 columns are needed to get 8 left/right comparisons per row
+        BufferedImage grayscaleImage = toGrayscale(resizeImage(image, 9, 8));
+
+        long hash = 0L;
+        for (int y = 0; y < 8; y++) {
+            for (int x = 0; x < 8; x++) {
+                // compare brightness, not the packed ARGB value in which alpha and red dominate
+                int leftPixel = new Color(grayscaleImage.getRGB(x, y)).getRed();
+                int rightPixel = new Color(grayscaleImage.getRGB(x + 1, y)).getRed();
+                hash <<= 1;
+                hash |= (leftPixel < rightPixel) ? 1 : 0;
+            }
+        }
+
+        return String.format("%016x", hash);
+    }
+
+
+    // Helper method to resize the image to the desired dimensions
+    private BufferedImage resizeImage(BufferedImage image, int width, int height) {
+        BufferedImage resizedImage = new BufferedImage(width, height, BufferedImage.TYPE_INT_ARGB);
+        var graphics = resizedImage.getGraphics();
+        try {
+            graphics.drawImage(image, 0, 0, width, height, null);
+        } finally {
+            graphics.dispose();
+        }
+        return resizedImage;
+    }
+
+
+    // Helper method to draw the image into a grayscale image of the same size
+    private BufferedImage toGrayscale(BufferedImage source) {
+        BufferedImage grayscaleImage = new BufferedImage(source.getWidth(), source.getHeight(), BufferedImage.TYPE_BYTE_GRAY);
+        var graphics = grayscaleImage.getGraphics();
+        try {
+            graphics.drawImage(source, 0, 0, null);
+        } finally {
+            graphics.dispose();
+        }
+        return grayscaleImage;
+    }
+
+}
diff --git a/src/main/java/com/iqser/red/pdftronlogic/commons/InvisibleElementRemovalService.java b/src/main/java/com/iqser/red/pdftronlogic/commons/InvisibleElementRemovalService.java
index 8841398..fe7ba36 100644
--- a/src/main/java/com/iqser/red/pdftronlogic/commons/InvisibleElementRemovalService.java
+++ b/src/main/java/com/iqser/red/pdftronlogic/commons/InvisibleElementRemovalService.java
@@ -167,7 +167,7 @@ public class InvisibleElementRemovalService {
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
if (!context.delta() && inClippingPath) {
- context.visibleElements().add(ElementFeatures.extractFeatures(imageElement));
+ context.visibleElements().add(ElementFeatureFactory.extractFeatures(imageElement));
}
if (context.delta() ^ inClippingPath) {
@@ -192,7 +192,7 @@ public class InvisibleElementRemovalService {
boolean isTextVisible = isTextRenderedVisibly(gState, textBBox, context);
if (inClippingPath && isTextVisible) {
- context.visibleElements().add(ElementFeatures.extractFeatures(textElement));
+ context.visibleElements().add(ElementFeatureFactory.extractFeatures(textElement));
}
if (!context.delta()) {
if (inClippingPath && isTextVisible) {
@@ -291,7 +291,7 @@ public class InvisibleElementRemovalService {
context.overlappedElements().addAll(currentOverlappedElements);
context.visibleElements().removeAll(currentOverlappedElements);
}
- context.visibleElements().add(ElementFeatures.extractFeatures(pathElement));
+ context.visibleElements().add(ElementFeatureFactory.extractFeatures(pathElement));
if (!context.delta()) {
writer.writeElement(pathElement);
}
diff --git a/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java b/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java
index 609b1d9..33435fe 100644
--- a/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java
+++ b/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java
@@ -1,61 +1,52 @@
package com.iqser.red.pdftronlogic.commons;
-import java.awt.Image;
-import java.awt.Toolkit;
-import java.awt.geom.Rectangle2D;
-import java.awt.image.BufferedImage;
-import java.io.ByteArrayInputStream;
-import java.io.File;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
-import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
-import javax.imageio.ImageIO;
-
import com.pdftron.common.PDFNetException;
-import com.pdftron.filters.FileDescriptorFilter;
-import com.pdftron.filters.Filter;
-import com.pdftron.filters.FilterReader;
-import com.pdftron.filters.FilterWriter;
-import com.pdftron.pdf.ColorPt;
-import com.pdftron.pdf.ColorSpace;
import com.pdftron.pdf.Element;
-import com.pdftron.pdf.ElementBuilder;
import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.ElementWriter;
-import com.pdftron.pdf.Image2RGB;
-import com.pdftron.pdf.Optimizer;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
+import lombok.experimental.UtilityClass;
+import lombok.extern.slf4j.Slf4j;
+@UtilityClass
+@Slf4j
public class WatermarkRemovalService {
final static double AREA_THRESHOLD = 0.6; // multiplied with page area
- final static double OCCURING_ON_PAGES_THRESHOLD_FACTOR = 0.4; // multiplied with number of pages
+ final static double OCCURING_ON_PAGES_THRESHOLD_FACTOR = 0.75; // multiplied with number of pages
+
+ final static int MIN_PAGES_THRESHOLD = 3;
@SneakyThrows
- public static void removeWatermarks(InputStream pdfFile, OutputStream out) {
+ public void removeWatermarks(InputStream pdfFile, OutputStream out) {
PDFDoc pdfDoc = new PDFDoc(pdfFile);
+ if(pdfDoc.getPageCount() < MIN_PAGES_THRESHOLD){
+ log.debug("Document page count {} is below threshold {}", pdfDoc.getPageCount(), MIN_PAGES_THRESHOLD);
+ return;
+ }
+
Map> formObjectsForPages = findAllFormObjectsAndImages(pdfDoc);
List watermarkElementFeatures = filterSameFormObjectsOccuringOnMostPages(formObjectsForPages);
- storeWatermarkImageHashValues(watermarkElementFeatures);
-
removeAllWatermarks(pdfDoc, watermarkElementFeatures);
try {
@@ -68,15 +59,6 @@ public class WatermarkRemovalService {
}
- private static void storeWatermarkImageHashValues(List watermarkElementFeatures) {
- for(ElementFeatures elementFeatures : watermarkElementFeatures){
- if(elementFeatures.getElementType() == Element.e_image || elementFeatures.getElementType() == Element.e_inline_image){
-
- }
- }
- }
-
-
@SneakyThrows
private static Map> findAllFormObjectsAndImages(PDFDoc pdfDoc) {
@@ -86,6 +68,8 @@ public class WatermarkRemovalService {
ElementReader reader = new ElementReader();
+
+
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
Page page = iterator.next();
@@ -96,19 +80,7 @@ public class WatermarkRemovalService {
reader.begin(page);
for (Element element = reader.next(); element != null; element = reader.next()) {
- if(element.getBBox() == null){
- continue;
- }
- if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringFromPage) {
- continue;
- }
-
- if (element.getType() == Element.e_form) {
- //processXObject(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringFromPage);
- } else if (element.getType() == Element.e_image || element.getType() == Element.e_inline_image) {
- // causes empty pages so far
- processImages(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage);
- }
+ processElement(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringFromPage);
}
formObjectsAndImagesForPages.put(page.getSDFObj().getObjNum(), elementFeaturesLinkedList);
@@ -120,93 +92,54 @@ public class WatermarkRemovalService {
}
- @SneakyThrows
- private static void processImages(Element element,
- Set visitedXObjIds,
- LinkedList elementFeaturesLinkedList,
- List formObjectsOccuringMoreThanOnceOnAPage) {
+ /**
+ * Collects the features of a single element if it is large enough to be considered.
+ * Elements without a bounding box, or whose bounding box area (height * width) is below
+ * minAreaCoveringPage, are ignored. Forms are handed to processXObject, images and inline images
+ * to processImages; all other element types are ignored.
+ *
+ * @param element the element to inspect
+ * @param visitedXObjIds passed through to processXObject
+ * @param elementFeaturesLinkedList list the collected features are added to
+ * @param formObjectsOccuringMoreThanOnceOnAPage passed through to processXObject
+ * @param minAreaCoveringPage minimum bounding box area an element needs to be collected
+ * @throws PDFNetException if PDFTron fails while reading the element
+ */
+ private static void processElement(Element element,
+ Set visitedXObjIds,
+ List elementFeaturesLinkedList,
+ List formObjectsOccuringMoreThanOnceOnAPage,
+ double minAreaCoveringPage) throws PDFNetException {
- if(element.getType() == Element.e_image) {
-
- //element.getImageData();
-
- /*com.pdftron.pdf.Image image = new com.pdftron.pdf.Image(element.getXObject());
- System.out.println(image.getImageDataSize());
- //element.getImageData().writeToFile("C:\\Users\\RaphaelArnold\\AppData\\Local\\Temp\\" + "IMAGE1", false);
- String fname = "C:\\Users\\RaphaelArnold\\AppData\\Local\\Temp\\" + "IMAGE.png";
- image.exportAsPng(fname);
-
- Image2RGB img_conv = new Image2RGB(element);
- FilterReader reader = new com.pdftron.filters.FilterReader(img_conv);
- byte[] image_data_out = new byte[element.getImageWidth() * element.getImageHeight() * 3];
- reader.read(image_data_out);
- System.out.println("he");
-
- BufferedImage bufferedImage = ImageIO.read(new ByteArrayInputStream(image_data_out));
- bufferedImage.getScaledInstance(10,10,0);*/
-
-
-
- //Optimizer.ImageSettings imageSettings = new Optimizer.ImageSettings();
-
-
- /*Image img = image.getBitmap();
-
- BufferedImage bufferedImage= new BufferedImage(img.getWidth(null), img.getHeight(null), BufferedImage.TYPE_INT_RGB);
- img.getGraphics().drawImage(img, 0, 0, null);
- ImageIO.write(bufferedImage, "jpg", new File("C:\\myImage.jpg"));*/
+ // nothing to measure without a bounding box
+ if (element.getBBox() == null) {
+ return;
}
- ElementFeatures elementFeatures = ElementFeatures.extractFeatures(element);
+ // too small to be collected
+ if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringPage) {
+ return;
+ }
+
+ if (element.getType() == Element.e_form) {
+ processXObject(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage);
+ } else if (element.getType() == Element.e_image || element.getType() == Element.e_inline_image) {
+ processImages(element, elementFeaturesLinkedList);
+ }
+ }
+
+
+ /**
+ * Hashes the image of the element with ImageHashFactory.calculate and adds the image features,
+ * including that hash, to the given list.
+ * Checked exceptions are rethrown unchecked via SneakyThrows.
+ *
+ * @param element the image element to describe
+ * @param elementFeaturesLinkedList list the image features are added to
+ */
+ @SneakyThrows
+ private static void processImages(Element element, List elementFeaturesLinkedList) {
+
+ String hashOfImage = ImageHashFactory.calculate(element);
+ ElementFeatures elementFeatures = ElementFeatureFactory.extractFeaturesWithHash(element, hashOfImage);
elementFeaturesLinkedList.add(elementFeatures);
}
+ /**
+ * Collects the features of a form XObject.
+ * If the object number of the XObject is seen for the first time (visitedXObjIds.add returns true),
+ * every element inside the XObject is first passed to processElement, which may call this method
+ * again for nested forms. In both cases the features of the form element itself are added to the list.
+ * Checked exceptions are rethrown unchecked via SneakyThrows.
+ *
+ * @param element the form element to process
+ * @param visitedXObjIds object numbers of the XObjects whose content was already read
+ * @param elementFeaturesLinkedList list the collected features are added to
+ * @param formObjectsOccuringMoreThanOnceOnAPage only passed through to processElement
+ * @param minAreaCoveringPage minimum bounding box area, passed through to processElement
+ */
@SneakyThrows
- private static boolean processXObject(Element element,
+ private static void processXObject(Element element,
Set visitedXObjIds,
- LinkedList elementFeaturesLinkedList,
+ List elementFeaturesLinkedList,
List formObjectsOccuringMoreThanOnceOnAPage,
- double minAreaCoveringPage) {
-
- /*for(ElementFeatures elementFeatures1 : formObjectsOccuringMoreThanOnceOnAPage){
- if(elementFeatures1.almostMatches(element)){
- return;
- }
- }
-
- for (ElementFeatures elementFeatures1 : elementFeaturesLinkedList) {
- if (elementFeatures1.almostMatches(element)) {
- ElementFeatures elementFeatures = ElementFeatures.extractFeatures(element);
- formObjectsOccuringMoreThanOnceOnAPage.add(elementFeatures);
- elementFeaturesLinkedList.remove(elementFeatures);
- return;
- }
- }*/
-
+ double minAreaCoveringPage) {
if (visitedXObjIds.add(element.getXObject().getObjNum())) {
-
ElementReader xObjectReader = new ElementReader();
xObjectReader.begin(element.getXObject());
- boolean isContainingImageBigEnough = true;
for (Element element1 = xObjectReader.next(); element1 != null; element1 = xObjectReader.next()) {
- if (element1.getType() == Element.e_form) {
- isContainingImageBigEnough = processXObject(element1, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage);
- } else if((element1.getType() == Element.e_image || element1.getType() == Element.e_inline_image)) {
- if(element1.getImageHeight()*element1.getImageWidth() < minAreaCoveringPage){
- xObjectReader.destroy();
- return false;
- }
- }
- }
- if(isContainingImageBigEnough) {
- elementFeaturesLinkedList.add(ElementFeatures.extractFeatures(element));
+ processElement(element1, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage);
}
+ elementFeaturesLinkedList.add(ElementFeatureFactory.extractFeatures(element));
xObjectReader.destroy();
} else {
- elementFeaturesLinkedList.add(ElementFeatures.extractFeatures(element));
+ elementFeaturesLinkedList.add(ElementFeatureFactory.extractFeatures(element));
}
- return true;
}
@@ -259,29 +192,46 @@ public class WatermarkRemovalService {
reader.begin(page);
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
- processElements(reader, writer, watermarksElementFeaturesList, visitedXObjIds);
+ processElements(page, reader, writer, watermarksElementFeaturesList, visitedXObjIds);
writer.end();
reader.end();
}
-    private static void processElements(ElementReader reader,
+    /**
+     * Copies the elements delivered by the reader to the writer. Images and inline images that are large
+     * enough are handed to removeImages, which decides whether they are written; forms are handed to
+     * processForms; every other element is written unchanged.
+     *
+     * @param page                          page whose height and width, multiplied with AREA_THRESHOLD, define the
+     *                                      minimum bounding box area an image needs to be checked against the watermarks
+     * @param reader                        source of the elements
+     * @param writer                        target the kept elements are written to
+     * @param watermarksElementFeaturesList features of the watermarks to remove
+     * @param visitedXObjIds                passed through to processForms
+     * @throws PDFNetException if PDFTron fails while reading or writing an element
+     */
+    private static void processElements(Page page,
+                                        ElementReader reader,
                                         ElementWriter writer,
                                         List watermarksElementFeaturesList,
                                         Set visitedXObjIds) throws PDFNetException {
-        for (Element element = reader.next(); element != null; element = reader.next())
+        double minAreaCoveringFromPage = AREA_THRESHOLD * page.getPageHeight() * page.getPageWidth();
+        for (Element element = reader.next(); element != null; element = reader.next()) {
+
             switch (element.getType()) {
-                case Element.e_image, Element.e_inline_image -> removeImages(element,reader,writer, watermarksElementFeaturesList);
-                case Element.e_form -> processForms(element, reader, writer, watermarksElementFeaturesList, visitedXObjIds);
+                case Element.e_image, Element.e_inline_image -> {
+                    var bBox = element.getBBox();
+                    // Images without a bounding box or below the area threshold are never collected as watermark
+                    // candidates (processElement skips both), so they are kept instead of being dropped.
+                    if (bBox == null || bBox.getHeight() * bBox.getWidth() < minAreaCoveringFromPage) {
+                        writer.writeElement(element);
+                        continue;
+                    }
+                    removeImages(element, writer, watermarksElementFeaturesList);
+                }
+                case Element.e_form -> processForms(page, element, reader, writer, watermarksElementFeaturesList, visitedXObjIds);
                 default -> writer.writeElement(element);
             }
+        }
     }
+
@SneakyThrows
- private static void removeImages(Element element, ElementReader reader, ElementWriter writer, List watermarksElementFeaturesList) {
+ private static void removeImages(Element element, ElementWriter writer, List watermarksElementFeaturesList) {
+
+ String hashValueOfImage = ImageHashFactory.calculate(element);
+ ElementFeatures imageFeatures = ElementFeatureFactory.extractFeaturesWithHash(element, hashValueOfImage);
for (ElementFeatures elementFeatures : watermarksElementFeaturesList) {
- if (elementFeatures.almostMatches(element)) {
+ if (elementFeatures.almostMatches(imageFeatures)) {
return;
}
}
@@ -290,11 +240,8 @@ public class WatermarkRemovalService {
}
- /*
- Maybe problem with visitedXObjIds, because, if on same page there are two identical xobjects
- but one is inside another xObject, the other is directly
- */
- private static void processForms(Element element,
+ private static void processForms(Page page,
+ Element element,
ElementReader reader,
ElementWriter writer,
List watermarksElementFeaturesList,
@@ -319,7 +266,7 @@ public class WatermarkRemovalService {
reader.clearChangeList();
formWriter.setDefaultGState(reader);
- processElements(reader, formWriter, watermarksElementFeaturesList, visitedXObjIds);
+ processElements(page, reader, formWriter, watermarksElementFeaturesList, visitedXObjIds);
formWriter.end();
formWriter.destroy();
reader.end();
@@ -327,22 +274,4 @@ public class WatermarkRemovalService {
}
-
- @SneakyThrows
- private static void drawBBox(ElementWriter writer, Rectangle2D r, String hexcolor) {
-
- ColorPt colorPt = new ColorPt(Integer.valueOf(hexcolor.substring(1, 3), 16) / 255d,
- Integer.valueOf(hexcolor.substring(3, 5), 16) / 255d,
- Integer.valueOf(hexcolor.substring(5, 7), 16) / 255d);
- ElementBuilder eb = new ElementBuilder();
- Element rect = eb.createRect(r.getX(), r.getY(), r.getWidth(), r.getHeight());
- rect.setPathStroke(true);
- rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
- rect.getGState().setStrokeColor(colorPt);
- writer.writePlacedElement(rect);
-
- colorPt.destroy();
- eb.destroy();
- }
-
}