RED-7075: Watermark Removal finished so far
This commit is contained in:
parent
967cba820d
commit
51b6307f91
6
pom.xml
6
pom.xml
@ -26,6 +26,12 @@
|
||||
<artifactId>slf4j-api</artifactId>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.logging.log4j</groupId>
|
||||
<artifactId>log4j-slf4j2-impl</artifactId>
|
||||
<version>2.20.0</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.google.guava</groupId>
|
||||
<artifactId>guava</artifactId>
|
||||
|
||||
@ -0,0 +1,80 @@
|
||||
package com.iqser.red.pdftronlogic.commons;
|
||||
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.Element;
|
||||
|
||||
public class ElementFeatureFactory {
|
||||
public static ElementFeatures extractFeatures(Element element) throws PDFNetException {
|
||||
|
||||
return switch (element.getType()) {
|
||||
case Element.e_path -> buildPath(element);
|
||||
case Element.e_text -> buildText(element);
|
||||
case Element.e_image, Element.e_inline_image -> buildImage(element).build();
|
||||
case Element.e_form -> buildForm(element);
|
||||
// This technically should never happen, it's a safetynet
|
||||
default -> throw new RuntimeException("Feature Extraction is not supported for PDFTron.Element with type: " + element.getType());
|
||||
};
|
||||
}
|
||||
|
||||
public static ElementFeatures extractFeaturesWithHash(Element element, String hashObject) throws PDFNetException {
|
||||
return buildImage(element)
|
||||
.hashOfImage(hashObject)
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
private static ElementFeatures.Form buildForm(Element element) throws PDFNetException {
|
||||
|
||||
return ElementFeatures.Form.builder()
|
||||
.elementType(element.getType())
|
||||
.boundingBox(Converter.toRectangle2D(element.getBBox()))
|
||||
.xObjectType(element.getXObject().getType())
|
||||
.dictOrArrayOrStreamLength(element.getXObject().getType() == 7 ? element.getXObject().getDecodedStream().size() : 0)
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
private static ElementFeatures.Image.ImageBuilder<?, ?> buildImage(Element element) throws PDFNetException {
|
||||
|
||||
return ElementFeatures.Image.builder()
|
||||
.elementType(element.getType())
|
||||
.boundingBox(Converter.toRectangle2D(element.getBBox()))
|
||||
.dataSize(element.getImageDataSize())
|
||||
.height(element.getImageHeight())
|
||||
.width(element.getImageWidth())
|
||||
.renderingIntent(element.getImageRenderingIntent())
|
||||
.componentNum(element.getComponentNum())
|
||||
.bitsPerComponent(element.getBitsPerComponent());
|
||||
}
|
||||
|
||||
|
||||
private static ElementFeatures.Text buildText(Element element) throws PDFNetException {
|
||||
|
||||
return ElementFeatures.Text.builder()
|
||||
.elementType(element.getType())
|
||||
.boundingBox(Converter.toRectangle2D(element.getBBox()))
|
||||
.text(element.getTextString())
|
||||
.font(element.getGState().getFont().getType())
|
||||
.fontsize(element.getGState().getFontSize())
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
private static ElementFeatures.Path buildPath(Element element) throws PDFNetException {
|
||||
|
||||
return ElementFeatures.Path.builder()
|
||||
.elementType(element.getType())
|
||||
.boundingBox(Converter.toRectangle2D(element.getBBox()))
|
||||
.isClippingPath(element.isClippingPath())
|
||||
.isClipWindingFill(element.isClipWindingFill())
|
||||
.isStroked(element.isStroked())
|
||||
.isFilled(element.isFilled())
|
||||
.isWindingFill(element.isWindingFill())
|
||||
.fillColor(Converter.convertColor(element.getGState().getFillColorSpace(), element.getGState().getFillColor()))
|
||||
.strokeColor(Converter.convertColor(element.getGState().getStrokeColorSpace(), element.getGState().getStrokeColor()))
|
||||
.linePath(Converter.convertToGeneralPathAndTransformToInitialUserSpace(element.getPathData(), element.getCTM()))
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
@ -26,49 +26,6 @@ public class ElementFeatures {
|
||||
int elementType;
|
||||
Rectangle2D boundingBox;
|
||||
|
||||
public static ElementFeatures extractFeatures(Element element) throws PDFNetException {
|
||||
|
||||
return switch (element.getType()) {
|
||||
case Element.e_path -> Path.builder()
|
||||
.elementType(element.getType())
|
||||
.boundingBox(Converter.toRectangle2D(element.getBBox()))
|
||||
.isClippingPath(element.isClippingPath())
|
||||
.isClipWindingFill(element.isClipWindingFill())
|
||||
.isStroked(element.isStroked())
|
||||
.isFilled(element.isFilled())
|
||||
.isWindingFill(element.isWindingFill())
|
||||
.fillColor(Converter.convertColor(element.getGState().getFillColorSpace(), element.getGState().getFillColor()))
|
||||
.strokeColor(Converter.convertColor(element.getGState().getStrokeColorSpace(), element.getGState().getStrokeColor()))
|
||||
.linePath(Converter.convertToGeneralPathAndTransformToInitialUserSpace(element.getPathData(), element.getCTM()))
|
||||
.build();
|
||||
case Element.e_text -> Text.builder()
|
||||
.elementType(element.getType())
|
||||
.boundingBox(Converter.toRectangle2D(element.getBBox()))
|
||||
.text(element.getTextString())
|
||||
.font(element.getGState().getFont().getType())
|
||||
.fontsize(element.getGState().getFontSize())
|
||||
.build();
|
||||
case Element.e_image, Element.e_inline_image -> Image.builder()
|
||||
.elementType(element.getType())
|
||||
.boundingBox(Converter.toRectangle2D(element.getBBox()))
|
||||
.dataSize(element.getImageDataSize())
|
||||
.height(element.getImageHeight())
|
||||
.width(element.getImageWidth())
|
||||
.renderingIntent(element.getImageRenderingIntent())
|
||||
.componentNum(element.getComponentNum())
|
||||
.bitsPerComponent(element.getBitsPerComponent())
|
||||
.build();
|
||||
case Element.e_form -> Form.builder()
|
||||
.elementType(element.getType())
|
||||
.boundingBox(Converter.toRectangle2D(element.getBBox()))
|
||||
.xObjectType(element.getXObject().getType())
|
||||
.dictOrArrayOrStreamLength(element.getXObject().getType() == 7 ? element.getXObject().getDecodedStream().size() : 0)
|
||||
.build();
|
||||
// This technically should never happen, it's a safetynet
|
||||
default -> throw new RuntimeException("Feature Extraction is not supported for PDFTron.Element with type: " + element.getType());
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
public boolean almostMatches(Element element) throws PDFNetException {
|
||||
|
||||
@ -115,7 +72,7 @@ public class ElementFeatures {
|
||||
@Getter
|
||||
@SuperBuilder
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
private static class Text extends ElementFeatures {
|
||||
public static class Text extends ElementFeatures {
|
||||
|
||||
String text;
|
||||
int font;
|
||||
@ -181,7 +138,7 @@ public class ElementFeatures {
|
||||
@Getter
|
||||
@SuperBuilder
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
private static class Image extends ElementFeatures {
|
||||
public static class Image extends ElementFeatures {
|
||||
|
||||
int dataSize;
|
||||
int height;
|
||||
@ -189,7 +146,7 @@ public class ElementFeatures {
|
||||
int renderingIntent;
|
||||
int componentNum;
|
||||
int bitsPerComponent;
|
||||
|
||||
String hashOfImage;
|
||||
|
||||
@Override
|
||||
public boolean almostMatches(Element element) throws PDFNetException {
|
||||
@ -203,13 +160,41 @@ public class ElementFeatures {
|
||||
bitsPerComponent == element.getBitsPerComponent();
|
||||
}
|
||||
|
||||
public boolean almostMatches(ElementFeatures elementFeatures){
|
||||
if(elementFeatures.getClass() != this.getClass()){
|
||||
return false;
|
||||
}
|
||||
return super.almostMatches(elementFeatures) &&
|
||||
this.dataSize == ((Image) elementFeatures).getDataSize() &&
|
||||
this.height == ((Image) elementFeatures).getHeight() &&
|
||||
this.width == ((Image) elementFeatures).getWidth() &&
|
||||
this.renderingIntent == ((Image) elementFeatures).getRenderingIntent() &&
|
||||
this.componentNum == ((Image) elementFeatures).getComponentNum() &&
|
||||
this.bitsPerComponent == ((Image) elementFeatures).getBitsPerComponent() &&
|
||||
calculateHammingDistance(((Image) elementFeatures).getHashOfImage()) <=4;
|
||||
}
|
||||
|
||||
// Helper method to calculate the Hamming distance between two hexadecimal strings
|
||||
private int calculateHammingDistance(String hash2) {
|
||||
int distance = 0;
|
||||
int maxLength = Math.max(this.hashOfImage.length(), hash2.length());
|
||||
for (int i = 0; i < maxLength; i++) {
|
||||
char char1 = (i < this.hashOfImage.length()) ? this.hashOfImage.charAt(i) : '0';
|
||||
char char2 = (i < hash2.length()) ? hash2.charAt(i) : '0';
|
||||
if (char1 != char2) {
|
||||
distance++;
|
||||
}
|
||||
}
|
||||
return distance;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@Getter
|
||||
@SuperBuilder
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
private static class Form extends ElementFeatures {
|
||||
public static class Form extends ElementFeatures {
|
||||
|
||||
int xObjectType;
|
||||
long dictOrArrayOrStreamLength;
|
||||
|
||||
@ -0,0 +1,116 @@
|
||||
package com.iqser.red.pdftronlogic.commons;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.ByteArrayInputStream;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
|
||||
import com.pdftron.filters.FilterWriter;
|
||||
import com.pdftron.filters.MemoryFilter;
|
||||
import com.pdftron.pdf.Element;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class ImageHashFactory {
|
||||
|
||||
@SneakyThrows
|
||||
private byte[] getBytesOfImage(com.pdftron.pdf.Image inputImage) {
|
||||
// 0 because the memory filter determines the size
|
||||
var memFilter = new MemoryFilter(0, false);
|
||||
var filterWriter = new FilterWriter(memFilter);
|
||||
|
||||
inputImage.export(filterWriter);
|
||||
filterWriter.flushAll();
|
||||
byte[] res = memFilter.getBuffer();
|
||||
|
||||
memFilter.flushAll();
|
||||
memFilter.destroy();
|
||||
filterWriter.destroy();
|
||||
return res;
|
||||
}
|
||||
@SneakyThrows
|
||||
public String calculate(Element element) {
|
||||
com.pdftron.pdf.Image image = new com.pdftron.pdf.Image(element.getXObject());
|
||||
|
||||
byte[] imageBytes = getBytesOfImage(image);
|
||||
ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(imageBytes);
|
||||
BufferedImage image1 = ImageIO.read(byteArrayInputStream);
|
||||
|
||||
String hash = getSimplePHash(image1);
|
||||
|
||||
return hash;
|
||||
|
||||
}
|
||||
|
||||
public String getSimplePHash(BufferedImage image) {
|
||||
// Resize the image to a fixed size (e.g., 8x8 pixels)
|
||||
int targetWidth = 8;
|
||||
int targetHeight = 8;
|
||||
BufferedImage resizedImage = new BufferedImage(targetWidth, targetHeight, BufferedImage.TYPE_INT_ARGB);
|
||||
resizedImage.getGraphics().drawImage(image.getScaledInstance(targetWidth, targetHeight, java.awt.Image.SCALE_SMOOTH), 0, 0, targetWidth, targetHeight, null);
|
||||
|
||||
// Convert the image to grayscale
|
||||
BufferedImage grayscaleImage = new BufferedImage(targetWidth, targetHeight, BufferedImage.TYPE_BYTE_GRAY);
|
||||
grayscaleImage.getGraphics().drawImage(resizedImage, 0, 0, null);
|
||||
|
||||
// Calculate the average grayscale pixel value
|
||||
int average = calculateAverage(grayscaleImage);
|
||||
|
||||
// Create a binary hash based on pixel values
|
||||
StringBuilder hashBuilder = new StringBuilder();
|
||||
for (int y = 0; y < targetHeight; y++) {
|
||||
for (int x = 0; x < targetWidth; x++) {
|
||||
int pixelValue = new Color(grayscaleImage.getRGB(x, y)).getRed();
|
||||
if (pixelValue > average) {
|
||||
hashBuilder.append("1");
|
||||
} else {
|
||||
hashBuilder.append("0");
|
||||
}
|
||||
}
|
||||
}
|
||||
return hashBuilder.toString();
|
||||
}
|
||||
|
||||
// Helper method to calculate the average grayscale pixel value
|
||||
private int calculateAverage(BufferedImage image) {
|
||||
int total = 0;
|
||||
int width = image.getWidth();
|
||||
int height = image.getHeight();
|
||||
for (int y = 0; y < height; y++) {
|
||||
for (int x = 0; x < width; x++) {
|
||||
total += new Color(image.getRGB(x, y)).getRed();
|
||||
}
|
||||
}
|
||||
return total / (width * height);
|
||||
}
|
||||
|
||||
// to hash images either use getDHash or getSimplePHash
|
||||
public String getDHash(BufferedImage image) throws Exception {
|
||||
BufferedImage resizedImage = resizeImage(image, 9, 8); // Resize image to 9x8 for dHash
|
||||
|
||||
long hash = 0L;
|
||||
for (int y = 0; y < 8; y++) {
|
||||
for (int x = 0; x < 8; x++) {
|
||||
int leftPixel = resizedImage.getRGB(x, y);
|
||||
int rightPixel = resizedImage.getRGB(x + 1, y);
|
||||
hash <<= 1;
|
||||
hash |= (leftPixel < rightPixel) ? 1 : 0;
|
||||
}
|
||||
}
|
||||
|
||||
return Long.toHexString(hash);
|
||||
}
|
||||
|
||||
// Helper method to resize the image to the desired dimensions
|
||||
private BufferedImage resizeImage(BufferedImage image, int width, int height) {
|
||||
BufferedImage resizedImage = new BufferedImage(width, height, BufferedImage.TYPE_INT_ARGB);
|
||||
resizedImage.getGraphics().drawImage(image, 0, 0, width, height, null);
|
||||
return resizedImage;
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
@ -167,7 +167,7 @@ public class InvisibleElementRemovalService {
|
||||
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
|
||||
|
||||
if (!context.delta() && inClippingPath) {
|
||||
context.visibleElements().add(ElementFeatures.extractFeatures(imageElement));
|
||||
context.visibleElements().add(ElementFeatureFactory.extractFeatures(imageElement));
|
||||
}
|
||||
|
||||
if (context.delta() ^ inClippingPath) {
|
||||
@ -192,7 +192,7 @@ public class InvisibleElementRemovalService {
|
||||
boolean isTextVisible = isTextRenderedVisibly(gState, textBBox, context);
|
||||
|
||||
if (inClippingPath && isTextVisible) {
|
||||
context.visibleElements().add(ElementFeatures.extractFeatures(textElement));
|
||||
context.visibleElements().add(ElementFeatureFactory.extractFeatures(textElement));
|
||||
}
|
||||
if (!context.delta()) {
|
||||
if (inClippingPath && isTextVisible) {
|
||||
@ -291,7 +291,7 @@ public class InvisibleElementRemovalService {
|
||||
context.overlappedElements().addAll(currentOverlappedElements);
|
||||
context.visibleElements().removeAll(currentOverlappedElements);
|
||||
}
|
||||
context.visibleElements().add(ElementFeatures.extractFeatures(pathElement));
|
||||
context.visibleElements().add(ElementFeatureFactory.extractFeatures(pathElement));
|
||||
if (!context.delta()) {
|
||||
writer.writeElement(pathElement);
|
||||
}
|
||||
|
||||
@ -1,61 +1,52 @@
|
||||
package com.iqser.red.pdftronlogic.commons;
|
||||
|
||||
import java.awt.Image;
|
||||
import java.awt.Toolkit;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.File;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.filters.FileDescriptorFilter;
|
||||
import com.pdftron.filters.Filter;
|
||||
import com.pdftron.filters.FilterReader;
|
||||
import com.pdftron.filters.FilterWriter;
|
||||
import com.pdftron.pdf.ColorPt;
|
||||
import com.pdftron.pdf.ColorSpace;
|
||||
import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.ElementBuilder;
|
||||
import com.pdftron.pdf.ElementReader;
|
||||
import com.pdftron.pdf.ElementWriter;
|
||||
import com.pdftron.pdf.Image2RGB;
|
||||
import com.pdftron.pdf.Optimizer;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.pdf.PageIterator;
|
||||
import com.pdftron.sdf.SDFDoc;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.UtilityClass;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@UtilityClass
|
||||
@Slf4j
|
||||
public class WatermarkRemovalService {
|
||||
|
||||
final static double AREA_THRESHOLD = 0.6; // multiplied with page area
|
||||
final static double OCCURING_ON_PAGES_THRESHOLD_FACTOR = 0.4; // multiplied with number of pages
|
||||
final static double OCCURING_ON_PAGES_THRESHOLD_FACTOR = 0.75; // multiplied with number of pages
|
||||
|
||||
final static int MIN_PAGES_THRESHOLD = 3;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static void removeWatermarks(InputStream pdfFile, OutputStream out) {
|
||||
public void removeWatermarks(InputStream pdfFile, OutputStream out) {
|
||||
|
||||
PDFDoc pdfDoc = new PDFDoc(pdfFile);
|
||||
|
||||
if(pdfDoc.getPageCount() < MIN_PAGES_THRESHOLD){
|
||||
log.debug("Document page count {} is below threshold {}", pdfDoc.getPageCount(), MIN_PAGES_THRESHOLD);
|
||||
return;
|
||||
}
|
||||
|
||||
Map<Long, List<ElementFeatures>> formObjectsForPages = findAllFormObjectsAndImages(pdfDoc);
|
||||
|
||||
List<ElementFeatures> watermarkElementFeatures = filterSameFormObjectsOccuringOnMostPages(formObjectsForPages);
|
||||
|
||||
storeWatermarkImageHashValues(watermarkElementFeatures);
|
||||
|
||||
removeAllWatermarks(pdfDoc, watermarkElementFeatures);
|
||||
|
||||
try {
|
||||
@ -68,15 +59,6 @@ public class WatermarkRemovalService {
|
||||
}
|
||||
|
||||
|
||||
private static void storeWatermarkImageHashValues(List<ElementFeatures> watermarkElementFeatures) {
|
||||
for(ElementFeatures elementFeatures : watermarkElementFeatures){
|
||||
if(elementFeatures.getElementType() == Element.e_image || elementFeatures.getElementType() == Element.e_inline_image){
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private static Map<Long, List<ElementFeatures>> findAllFormObjectsAndImages(PDFDoc pdfDoc) {
|
||||
|
||||
@ -86,6 +68,8 @@ public class WatermarkRemovalService {
|
||||
|
||||
ElementReader reader = new ElementReader();
|
||||
|
||||
|
||||
|
||||
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
|
||||
|
||||
Page page = iterator.next();
|
||||
@ -96,19 +80,7 @@ public class WatermarkRemovalService {
|
||||
|
||||
reader.begin(page);
|
||||
for (Element element = reader.next(); element != null; element = reader.next()) {
|
||||
if(element.getBBox() == null){
|
||||
continue;
|
||||
}
|
||||
if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringFromPage) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (element.getType() == Element.e_form) {
|
||||
//processXObject(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringFromPage);
|
||||
} else if (element.getType() == Element.e_image || element.getType() == Element.e_inline_image) {
|
||||
// causes empty pages so far
|
||||
processImages(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage);
|
||||
}
|
||||
processElement(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringFromPage);
|
||||
}
|
||||
|
||||
formObjectsAndImagesForPages.put(page.getSDFObj().getObjNum(), elementFeaturesLinkedList);
|
||||
@ -120,93 +92,54 @@ public class WatermarkRemovalService {
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private static void processImages(Element element,
|
||||
Set<Long> visitedXObjIds,
|
||||
LinkedList<ElementFeatures> elementFeaturesLinkedList,
|
||||
List<ElementFeatures> formObjectsOccuringMoreThanOnceOnAPage) {
|
||||
private static void processElement(Element element,
|
||||
Set<Long> visitedXObjIds,
|
||||
List<ElementFeatures> elementFeaturesLinkedList,
|
||||
List<ElementFeatures> formObjectsOccuringMoreThanOnceOnAPage,
|
||||
double minAreaCoveringPage) throws PDFNetException {
|
||||
|
||||
if(element.getType() == Element.e_image) {
|
||||
|
||||
//element.getImageData();
|
||||
|
||||
/*com.pdftron.pdf.Image image = new com.pdftron.pdf.Image(element.getXObject());
|
||||
System.out.println(image.getImageDataSize());
|
||||
//element.getImageData().writeToFile("C:\\Users\\RaphaelArnold\\AppData\\Local\\Temp\\" + "IMAGE1", false);
|
||||
String fname = "C:\\Users\\RaphaelArnold\\AppData\\Local\\Temp\\" + "IMAGE.png";
|
||||
image.exportAsPng(fname);
|
||||
|
||||
Image2RGB img_conv = new Image2RGB(element);
|
||||
FilterReader reader = new com.pdftron.filters.FilterReader(img_conv);
|
||||
byte[] image_data_out = new byte[element.getImageWidth() * element.getImageHeight() * 3];
|
||||
reader.read(image_data_out);
|
||||
System.out.println("he");
|
||||
|
||||
BufferedImage bufferedImage = ImageIO.read(new ByteArrayInputStream(image_data_out));
|
||||
bufferedImage.getScaledInstance(10,10,0);*/
|
||||
|
||||
|
||||
|
||||
//Optimizer.ImageSettings imageSettings = new Optimizer.ImageSettings();
|
||||
|
||||
|
||||
/*Image img = image.getBitmap();
|
||||
|
||||
BufferedImage bufferedImage= new BufferedImage(img.getWidth(null), img.getHeight(null), BufferedImage.TYPE_INT_RGB);
|
||||
img.getGraphics().drawImage(img, 0, 0, null);
|
||||
ImageIO.write(bufferedImage, "jpg", new File("C:\\myImage.jpg"));*/
|
||||
if (element.getBBox() == null) {
|
||||
return;
|
||||
}
|
||||
ElementFeatures elementFeatures = ElementFeatures.extractFeatures(element);
|
||||
if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringPage) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (element.getType() == Element.e_form) {
|
||||
processXObject(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage);
|
||||
} else if (element.getType() == Element.e_image || element.getType() == Element.e_inline_image) {
|
||||
processImages(element, elementFeaturesLinkedList);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private static void processImages(Element element, List<ElementFeatures> elementFeaturesLinkedList) {
|
||||
|
||||
String hashOfImage = ImageHashFactory.calculate(element);
|
||||
ElementFeatures elementFeatures = ElementFeatureFactory.extractFeaturesWithHash(element, hashOfImage);
|
||||
elementFeaturesLinkedList.add(elementFeatures);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private static boolean processXObject(Element element,
|
||||
private static void processXObject(Element element,
|
||||
Set<Long> visitedXObjIds,
|
||||
LinkedList<ElementFeatures> elementFeaturesLinkedList,
|
||||
List<ElementFeatures> elementFeaturesLinkedList,
|
||||
List<ElementFeatures> formObjectsOccuringMoreThanOnceOnAPage,
|
||||
double minAreaCoveringPage) {
|
||||
|
||||
/*for(ElementFeatures elementFeatures1 : formObjectsOccuringMoreThanOnceOnAPage){
|
||||
if(elementFeatures1.almostMatches(element)){
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
for (ElementFeatures elementFeatures1 : elementFeaturesLinkedList) {
|
||||
if (elementFeatures1.almostMatches(element)) {
|
||||
ElementFeatures elementFeatures = ElementFeatures.extractFeatures(element);
|
||||
formObjectsOccuringMoreThanOnceOnAPage.add(elementFeatures);
|
||||
elementFeaturesLinkedList.remove(elementFeatures);
|
||||
return;
|
||||
}
|
||||
}*/
|
||||
|
||||
double minAreaCoveringPage) {
|
||||
|
||||
if (visitedXObjIds.add(element.getXObject().getObjNum())) {
|
||||
|
||||
ElementReader xObjectReader = new ElementReader();
|
||||
xObjectReader.begin(element.getXObject());
|
||||
boolean isContainingImageBigEnough = true;
|
||||
for (Element element1 = xObjectReader.next(); element1 != null; element1 = xObjectReader.next()) {
|
||||
if (element1.getType() == Element.e_form) {
|
||||
isContainingImageBigEnough = processXObject(element1, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage);
|
||||
} else if((element1.getType() == Element.e_image || element1.getType() == Element.e_inline_image)) {
|
||||
if(element1.getImageHeight()*element1.getImageWidth() < minAreaCoveringPage){
|
||||
xObjectReader.destroy();
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
if(isContainingImageBigEnough) {
|
||||
elementFeaturesLinkedList.add(ElementFeatures.extractFeatures(element));
|
||||
processElement(element1, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage);
|
||||
}
|
||||
elementFeaturesLinkedList.add(ElementFeatureFactory.extractFeatures(element));
|
||||
xObjectReader.destroy();
|
||||
} else {
|
||||
elementFeaturesLinkedList.add(ElementFeatures.extractFeatures(element));
|
||||
elementFeaturesLinkedList.add(ElementFeatureFactory.extractFeatures(element));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@ -259,29 +192,46 @@ public class WatermarkRemovalService {
|
||||
|
||||
reader.begin(page);
|
||||
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
|
||||
processElements(reader, writer, watermarksElementFeaturesList, visitedXObjIds);
|
||||
processElements(page, reader, writer, watermarksElementFeaturesList, visitedXObjIds);
|
||||
writer.end();
|
||||
reader.end();
|
||||
}
|
||||
|
||||
|
||||
private static void processElements(ElementReader reader,
|
||||
private static void processElements(Page page,
|
||||
ElementReader reader,
|
||||
ElementWriter writer,
|
||||
List<ElementFeatures> watermarksElementFeaturesList,
|
||||
Set<Long> visitedXObjIds) throws PDFNetException {
|
||||
|
||||
for (Element element = reader.next(); element != null; element = reader.next())
|
||||
double minAreaCoveringFromPage = AREA_THRESHOLD * page.getPageHeight() * page.getPageWidth();
|
||||
for (Element element = reader.next(); element != null; element = reader.next()) {
|
||||
|
||||
switch (element.getType()) {
|
||||
case Element.e_image, Element.e_inline_image -> removeImages(element,reader,writer, watermarksElementFeaturesList);
|
||||
case Element.e_form -> processForms(element, reader, writer, watermarksElementFeaturesList, visitedXObjIds);
|
||||
case Element.e_image, Element.e_inline_image -> {
|
||||
if (element.getBBox() == null) {
|
||||
continue;
|
||||
}
|
||||
if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringFromPage) {
|
||||
writer.writeElement(element);
|
||||
continue;
|
||||
}
|
||||
removeImages(element, writer, watermarksElementFeaturesList);
|
||||
}
|
||||
case Element.e_form -> processForms(page, element, reader, writer, watermarksElementFeaturesList, visitedXObjIds);
|
||||
default -> writer.writeElement(element);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private static void removeImages(Element element, ElementReader reader, ElementWriter writer, List<ElementFeatures> watermarksElementFeaturesList) {
|
||||
private static void removeImages(Element element, ElementWriter writer, List<ElementFeatures> watermarksElementFeaturesList) {
|
||||
|
||||
String hashValueOfImage = ImageHashFactory.calculate(element);
|
||||
ElementFeatures imageFeatures = ElementFeatureFactory.extractFeaturesWithHash(element, hashValueOfImage);
|
||||
for (ElementFeatures elementFeatures : watermarksElementFeaturesList) {
|
||||
if (elementFeatures.almostMatches(element)) {
|
||||
if (elementFeatures.almostMatches(imageFeatures)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
@ -290,11 +240,8 @@ public class WatermarkRemovalService {
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Maybe problem with visitedXObjIds, because, if on same page there are two identical xobjects
|
||||
but one is inside another xObject, the other is directly
|
||||
*/
|
||||
private static void processForms(Element element,
|
||||
private static void processForms(Page page,
|
||||
Element element,
|
||||
ElementReader reader,
|
||||
ElementWriter writer,
|
||||
List<ElementFeatures> watermarksElementFeaturesList,
|
||||
@ -319,7 +266,7 @@ public class WatermarkRemovalService {
|
||||
reader.clearChangeList();
|
||||
formWriter.setDefaultGState(reader);
|
||||
|
||||
processElements(reader, formWriter, watermarksElementFeaturesList, visitedXObjIds);
|
||||
processElements(page, reader, formWriter, watermarksElementFeaturesList, visitedXObjIds);
|
||||
formWriter.end();
|
||||
formWriter.destroy();
|
||||
reader.end();
|
||||
@ -327,22 +274,4 @@ public class WatermarkRemovalService {
|
||||
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private static void drawBBox(ElementWriter writer, Rectangle2D r, String hexcolor) {
|
||||
|
||||
ColorPt colorPt = new ColorPt(Integer.valueOf(hexcolor.substring(1, 3), 16) / 255d,
|
||||
Integer.valueOf(hexcolor.substring(3, 5), 16) / 255d,
|
||||
Integer.valueOf(hexcolor.substring(5, 7), 16) / 255d);
|
||||
ElementBuilder eb = new ElementBuilder();
|
||||
Element rect = eb.createRect(r.getX(), r.getY(), r.getWidth(), r.getHeight());
|
||||
rect.setPathStroke(true);
|
||||
rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
||||
rect.getGState().setStrokeColor(colorPt);
|
||||
writer.writePlacedElement(rect);
|
||||
|
||||
colorPt.destroy();
|
||||
eb.destroy();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user