RED-7075: Improved watermark removal to recognize smaller images and text

This commit is contained in:
RaphaelArnold 2023-08-31 14:58:27 +02:00
parent 8d06304723
commit 84e3390f4e
3 changed files with 254 additions and 70 deletions

View File

@ -3,7 +3,6 @@ package com.iqser.red.pdftronlogic.commons;
import static com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService.TOLERANCE;
import java.awt.Color;
import java.awt.geom.Area;
import java.awt.geom.GeneralPath;
import java.awt.geom.Rectangle2D;
@ -34,18 +33,6 @@ public class ElementFeatures {
rectsAlmostMatch(element.getBBox());
}
/**
 * Compares this feature set with another one using the strict bounding-box tolerance.
 *
 * @param elementFeatures the features to compare against
 * @return true when the element types are equal, the other bounding box is present,
 *         and rectsAlmostMatch accepts it
 */
public boolean almostMatches(ElementFeatures elementFeatures) {
    if (elementFeatures.getElementType() != elementType) {
        return false;
    }
    if (elementFeatures.getBoundingBox() == null) {
        return false;
    }
    return rectsAlmostMatch(elementFeatures.getBoundingBox());
}
/**
 * Tolerant equality for coordinates and sizes.
 *
 * @return true when the absolute difference of a and b is strictly below TOLERANCE
 */
protected boolean almostEqual(double a, double b) {
    double difference = Math.abs(a - b);
    return difference < TOLERANCE;
}
@SneakyThrows
private boolean rectsAlmostMatch(Rect bBox) {
@ -57,6 +44,19 @@ public class ElementFeatures {
almostEqual(bBox.getHeight(), boundingBox.getHeight());
}
/**
 * Tolerant equality for coordinates and sizes.
 *
 * @return true when the absolute difference of a and b is strictly below TOLERANCE
 */
protected boolean almostEqual(double a, double b) {
    double difference = Math.abs(a - b);
    return difference < TOLERANCE;
}
/**
 * Compares this feature set with another one using the strict bounding-box tolerance.
 *
 * @param elementFeatures the features to compare against
 * @return true when the element types are equal, the other bounding box is present,
 *         and rectsAlmostMatch accepts it
 */
public boolean almostMatches(ElementFeatures elementFeatures) {
    if (elementFeatures.getElementType() != elementType) {
        return false;
    }
    if (elementFeatures.getBoundingBox() == null) {
        return false;
    }
    return rectsAlmostMatch(elementFeatures.getBoundingBox());
}
@SneakyThrows
private boolean rectsAlmostMatch(Rectangle2D bBox) {
// To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance
@ -68,6 +68,53 @@ public class ElementFeatures {
}
/**
 * Relaxed counterpart of almostMatches for a raw PDF element: the element must have the
 * same type as these features and a bounding box accepted by rectsSimilarMatch.
 *
 * @param element the element read from the page content
 * @return true when type and (relaxed) bounding box comparison both succeed
 * @throws PDFNetException if the element's type or bounding box cannot be read
 */
public boolean similarMatches(Element element) throws PDFNetException {
    if (element.getType() != elementType) {
        return false;
    }
    if (element.getBBox() == null) {
        return false;
    }
    return rectsSimilarMatch(element.getBBox());
}
/**
 * Relaxed rectangle comparison between a PDF rect and the stored bounding box.
 * Bounding boxes are not calculated consistently, so positions and sizes are compared
 * with a tolerance instead of exact equality. Checks run in the order x, y, width, height
 * and stop at the first failure.
 */
@SneakyThrows
private boolean rectsSimilarMatch(Rect rect) {
    if (!similarEqualPosition(rect.getX1(), boundingBox.getX(), rect.getWidth())) {
        return false;
    }
    if (!similarEqualPosition(rect.getY1(), boundingBox.getY(), rect.getHeight())) {
        return false;
    }
    if (!similarEqualSize(rect.getWidth(), boundingBox.getWidth())) {
        return false;
    }
    return similarEqualSize(rect.getHeight(), boundingBox.getHeight());
}
/**
 * Position comparison with a tolerance proportional to the box size.
 *
 * @param a       first coordinate
 * @param b       second coordinate
 * @param boxSize extent of the box along the compared axis
 * @return true when a and b differ by strictly less than 20% of boxSize
 */
protected boolean similarEqualPosition(double a, double b, double boxSize) {
    double allowedOffset = boxSize * 0.2;
    double offset = Math.abs(a - b);
    return offset < allowedOffset;
}
/**
 * Size comparison with a tolerance relative to the first argument.
 *
 * @param a reference size; the tolerance is 10% of this value
 * @param b size to compare with
 * @return true when a and b differ by strictly less than 10% of a
 */
protected boolean similarEqualSize(double a, double b) {
    double allowedDeviation = a * 0.1;
    double deviation = Math.abs(a - b);
    return deviation < allowedDeviation;
}
/**
 * Relaxed counterpart of almostMatches for already extracted features: the other features
 * must have the same element type and a bounding box accepted by rectsSimilarMatch.
 *
 * @param elementFeatures the features to compare against
 * @return true when type and (relaxed) bounding box comparison both succeed
 */
public boolean similarMatches(ElementFeatures elementFeatures) {
    if (elementFeatures.getElementType() != elementType) {
        return false;
    }
    if (elementFeatures.getBoundingBox() == null) {
        return false;
    }
    return rectsSimilarMatch(elementFeatures.getBoundingBox());
}
/**
 * Relaxed rectangle comparison between another bounding box and the stored one.
 * Bounding boxes are not calculated consistently, so positions and sizes are compared
 * with a tolerance instead of exact equality. Checks run in the order x, y, width, height
 * and stop at the first failure.
 */
private boolean rectsSimilarMatch(Rectangle2D rectangle2D) {
    if (!similarEqualPosition(rectangle2D.getX(), boundingBox.getX(), rectangle2D.getWidth())) {
        return false;
    }
    if (!similarEqualPosition(rectangle2D.getY(), boundingBox.getY(), rectangle2D.getHeight())) {
        return false;
    }
    if (!similarEqualSize(rectangle2D.getWidth(), boundingBox.getWidth())) {
        return false;
    }
    return similarEqualSize(rectangle2D.getHeight(), boundingBox.getHeight());
}
@EqualsAndHashCode(callSuper = true)
@Getter
@SuperBuilder
@ -148,6 +195,7 @@ public class ElementFeatures {
int bitsPerComponent;
String hashOfImage;
@Override
public boolean almostMatches(Element element) throws PDFNetException {
@ -160,22 +208,39 @@ public class ElementFeatures {
bitsPerComponent == element.getBitsPerComponent();
}
public boolean almostMatches(ElementFeatures elementFeatures){
if(elementFeatures.getClass() != this.getClass()){
public boolean almostMatches(ElementFeatures elementFeatures) {
if (elementFeatures.getClass() != this.getClass()) {
return false;
}
return super.almostMatches(elementFeatures) &&
this.dataSize == ((Image) elementFeatures).getDataSize() &&
this.height == ((Image) elementFeatures).getHeight() &&
this.width == ((Image) elementFeatures).getWidth() &&
this.renderingIntent == ((Image) elementFeatures).getRenderingIntent() &&
this.componentNum == ((Image) elementFeatures).getComponentNum() &&
this.bitsPerComponent == ((Image) elementFeatures).getBitsPerComponent() &&
calculateHammingDistance(((Image) elementFeatures).getHashOfImage()) <=4;
return super.almostMatches(elementFeatures) && this.dataSize == ((Image) elementFeatures).getDataSize() && this.height == ((Image) elementFeatures).getHeight() && this.width == ((Image) elementFeatures).getWidth() && this.renderingIntent == ((Image) elementFeatures).getRenderingIntent() && this.componentNum == ((Image) elementFeatures).getComponentNum() && this.bitsPerComponent == ((Image) elementFeatures).getBitsPerComponent() && calculateHammingDistance(
((Image) elementFeatures).getHashOfImage()) <= 4;
}
/**
 * Relaxed match of a raw image element against these image features.
 * The bounding box is compared with the relaxed tolerances of the parent's
 * similarMatches, while all image properties must be exactly equal.
 *
 * @param element the image element read from the page content
 * @return true when the relaxed bounding-box check and all image property checks succeed
 * @throws PDFNetException if an element property cannot be read
 */
public boolean similarMatches(Element element) throws PDFNetException {
    // Delegate to the relaxed parent check. Calling super.almostMatches here would apply
    // the strict tolerance and make this method indistinguishable from almostMatches.
    return super.similarMatches(element) && //
            dataSize == element.getImageDataSize() && //
            height == element.getImageHeight() && //
            width == element.getImageWidth() && //
            renderingIntent == element.getImageRenderingIntent() && //
            componentNum == element.getComponentNum() && //
            bitsPerComponent == element.getBitsPerComponent();
}
/**
 * Relaxed match of other extracted features against these image features.
 * The bounding box is compared with the relaxed tolerances of the parent's
 * similarMatches; the image properties must be equal and the image hashes may differ
 * by a Hamming distance of at most 4.
 *
 * @param elementFeatures the features to compare against
 * @return false when the other features are not of this exact class, otherwise the
 *         result of the combined bounding-box, property and hash comparison
 */
public boolean similarMatches(ElementFeatures elementFeatures) {
    // Same guard as in almostMatches: without it the casts below can throw a
    // ClassCastException for features of a different subclass.
    if (elementFeatures.getClass() != this.getClass()) {
        return false;
    }
    Image other = (Image) elementFeatures;
    return super.similarMatches(elementFeatures) && //
            this.dataSize == other.getDataSize() && //
            this.height == other.getHeight() && //
            this.width == other.getWidth() && //
            this.renderingIntent == other.getRenderingIntent() && //
            this.componentNum == other.getComponentNum() && //
            this.bitsPerComponent == other.getBitsPerComponent() && //
            calculateHammingDistance(other.getHashOfImage()) <= 4;
}
// Helper method to calculate the Hamming distance between two hexadecimal strings
private int calculateHammingDistance(String hash2) {
int distance = 0;
int maxLength = Math.max(this.hashOfImage.length(), hash2.length());
for (int i = 0; i < maxLength; i++) {
@ -202,34 +267,32 @@ public class ElementFeatures {
@Override
public boolean almostMatches(Element element) throws PDFNetException {
return element.getType() == getElementType() && //
element.getBBox() != null && //
(super.rectsAlmostMatch(element.getBBox()) || almostRotateMatches(element.getBBox().getRectangle())) &&
xObjectType == element.getXObject().getType() &&
dictOrArrayOrStreamLength == element.getXObject().getDecodedStream().size();
(super.rectsAlmostMatch(element.getBBox()) || almostRotateMatches(element.getBBox().getRectangle())) && xObjectType == element.getXObject()
.getType() && dictOrArrayOrStreamLength == element.getXObject().getDecodedStream().size();
}
public boolean almostMatches(ElementFeatures elementFeatures){
if(elementFeatures.getClass() != this.getClass()){
public boolean almostMatches(ElementFeatures elementFeatures) {
if (elementFeatures.getClass() != this.getClass()) {
return false;
}
return elementFeatures.getElementType() == getElementType() &&
elementFeatures.getBoundingBox() != null &&
(super.rectsAlmostMatch(elementFeatures.getBoundingBox()) || almostRotateMatches(elementFeatures.getBoundingBox().getBounds2D())) &&
xObjectType == ((Form)elementFeatures).getXObjectType() &&
dictOrArrayOrStreamLength == ((Form)elementFeatures).getDictOrArrayOrStreamLength();
return elementFeatures.getElementType() == getElementType() && elementFeatures.getBoundingBox() != null && (super.rectsAlmostMatch(elementFeatures.getBoundingBox()) || almostRotateMatches(
elementFeatures.getBoundingBox()
.getBounds2D())) && xObjectType == ((Form) elementFeatures).getXObjectType() && dictOrArrayOrStreamLength == ((Form) elementFeatures).getDictOrArrayOrStreamLength();
}
/**
 * Checks whether the given box matches the stored bounding box with width and height
 * swapped. Only the dimensions are compared, not the position.
 */
private boolean almostRotateMatches(Rectangle2D bBox) {
    if (!almostEqual(bBox.getWidth(), getBoundingBox().getHeight())) {
        return false;
    }
    return almostEqual(bBox.getHeight(), getBoundingBox().getWidth());
}
}
}

View File

@ -1,23 +1,47 @@
package com.iqser.red.pdftronlogic.commons;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.*;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.*;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Slf4j
public class WatermarkRemovalService {
final static double AREA_THRESHOLD = 0.6; // multiplied with page area
final static double AREA_THRESHOLD = 0.5; // multiplied with page area
// Also used by shouldTextSearchBeContinued as the minimum share of text features in the collected list.
final static double OCCURING_ON_PAGES_THRESHOLD_FACTOR = 0.75; // multiplied with number of pages
final static int MIN_PAGES_THRESHOLD = 3;
// Border margins: images smaller than the area threshold that reach into these margins
// of the visible content box are not treated as watermark candidates.
final static double IMAGE_POSITION_HEIGHT_THRESHOLD = 0.2; // multiplied with page height
final static double IMAGE_POSITION_WIDTH_THRESHOLD = 0.125; // multiplied with page width
// Multiplied with page height: text whose upper edge lies below visibleContentBox.y1 + this margin is ignored.
final static double TEXT_POSITION_THRESHOLD = 0.15;
// Text bounding boxes must be taller than this share of the page height to be collected.
// NOTE(review): identifier is missing an 'R' ("TEXTWATERMAK"); renaming requires touching all usages.
final static double MIN_TEXTWATERMAK_HEIGHT_THRESHOLD = 0.125; // multiplied with page height
// Compared with page.getIndex() using '==' in the detection pass.
final static int PAGE_NUMBER_TEXT_SEARCH_THRESHOLD = 5; // stop text based search after 5 pages without watermark
// Converted with Math.sin(Math.toRadians(..)) and compared against |CTM.b| of text elements.
final static double ROTATED_TEXT_THRESHOLD = 12.5; //this is in degrees
// Mutable static flag shared by the detection and the removal pass; set to false by
// shouldTextSearchBeContinued.
// NOTE(review): no reset to true is visible in this diff, so a false value presumably
// carries over to later documents processed in the same JVM and is not thread-safe — confirm.
static boolean foundTextWatermark = true;
/**
* The method remove watermark works only for Documents with size greater than MIN_PAGES_THRESHOLD.
@ -46,7 +70,7 @@ public class WatermarkRemovalService {
log.info("Watermark found and will be removed!");
removeAllWatermarks(pdfDoc, watermarkElementFeatures);
} else {
log.info("No watermark found!");
log.info("No unlabeled watermark found!");
}
}
@ -69,7 +93,6 @@ public class WatermarkRemovalService {
ElementReader reader = new ElementReader();
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
Page page = iterator.next();
@ -80,7 +103,7 @@ public class WatermarkRemovalService {
reader.begin(page);
for (Element element = reader.next(); element != null; element = reader.next()) {
processElement(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringFromPage);
processElement(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringFromPage, page);
}
formObjectsAndImagesForPages.put(page.getSDFObj().getObjNum(), elementFeaturesLinkedList);
@ -96,22 +119,81 @@ public class WatermarkRemovalService {
Set<Long> visitedXObjIds,
List<ElementFeatures> elementFeaturesLinkedList,
List<ElementFeatures> formObjectsOccuringMoreThanOnceOnAPage,
double minAreaCoveringPage) throws PDFNetException {
double minAreaCoveringPage,
Page page) throws PDFNetException {
if (element.getBBox() == null) {
return;
}
if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringPage) {
return;
}
if (element.getType() == Element.e_form) {
processXObject(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage);
if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringPage) {
return;
}
processXObject(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage, page);
} else if (element.getType() == Element.e_image || element.getType() == Element.e_inline_image) {
if (element.getXObject() == null) {
return;
}
if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringPage) {
boolean isLocatedNearBorder = element.getBBox().getY1() < page.getVisibleContentBox()
.getY1() + page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD || element.getBBox().getY2() > page.getVisibleContentBox()
.getY2() - page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD || element.getBBox().getX1() < page.getVisibleContentBox()
.getX1() + page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD || element.getBBox().getX2() > page.getVisibleContentBox()
.getX2() - page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD;
if (isLocatedNearBorder) {
return;
}
}
processImages(element, elementFeaturesLinkedList);
} else if (element.getType() == Element.e_text) {
processText(element, elementFeaturesLinkedList, page);
}
}
/**
 * Detection pass for text elements: collects the features of a text element when it
 * qualifies as a watermark candidate.
 * A candidate must pass, in this order: the global text-search flag, the rotation window
 * derived from ROTATED_TEXT_THRESHOLD, the vertical position check and the minimum height.
 * On the page whose index equals PAGE_NUMBER_TEXT_SEARCH_THRESHOLD the flag is re-evaluated
 * first, for every text element on that page.
 * NOTE(review): the upper rotation bound uses 70 degrees, not 90 — confirm this is intended.
 *
 * @param element                   the text element; its bounding box is dereferenced without a null check
 * @param elementFeaturesLinkedList list receiving the features of accepted candidates
 * @param page                      the page the element belongs to
 */
@SneakyThrows
private void processText(Element element, List<ElementFeatures> elementFeaturesLinkedList, Page page) {
    if (page.getIndex() == PAGE_NUMBER_TEXT_SEARCH_THRESHOLD) {
        shouldTextSearchBeContinued(elementFeaturesLinkedList);
    }
    if (!foundTextWatermark) {
        return;
    }

    // Rotation window expressed as sine values and compared against |CTM.b|.
    double minSine = Math.sin(Math.toRadians(ROTATED_TEXT_THRESHOLD));
    double maxSine = Math.sin(Math.toRadians(70 - ROTATED_TEXT_THRESHOLD));
    if (Math.abs(element.getCTM().getB()) < minSine) {
        return;
    }
    if (Math.abs(element.getCTM().getB()) > maxSine) {
        return;
    }

    // Skip text that lies completely inside the bottom band of the page.
    double upperEdge = Math.max(element.getBBox().getY1(), element.getBBox().getY2());
    double bottomBandLimit = page.getVisibleContentBox().getY1() + page.getPageHeight() * TEXT_POSITION_THRESHOLD;
    if (upperEdge < bottomBandLimit) {
        return;
    }

    double textHeight = Math.abs(element.getBBox().getY1() - element.getBBox().getY2());
    double requiredHeight = page.getPageHeight() * MIN_TEXTWATERMAK_HEIGHT_THRESHOLD;
    if (textHeight > requiredHeight) {
        elementFeaturesLinkedList.add(ElementFeatureFactory.extractFeatures(element));
    }
}
/**
 * Decides whether the text-based watermark search stays enabled.
 * Counts the text features in the given list and switches foundTextWatermark off when
 * their number is below OCCURING_ON_PAGES_THRESHOLD_FACTOR times the list size.
 * The flag is never switched back on here.
 *
 * @param elementFeaturesLinkedList the features collected so far
 */
private void shouldTextSearchBeContinued(List<ElementFeatures> elementFeaturesLinkedList) {
    long textFeatureCount = elementFeaturesLinkedList.stream()
            .filter(features -> features.getElementType() == Element.e_text)
            .count();
    double requiredTextFeatures = elementFeaturesLinkedList.size() * OCCURING_ON_PAGES_THRESHOLD_FACTOR;
    if (textFeatureCount < requiredTextFeatures) {
        foundTextWatermark = false;
    }
}
@ -130,13 +212,14 @@ public class WatermarkRemovalService {
Set<Long> visitedXObjIds,
List<ElementFeatures> elementFeaturesLinkedList,
List<ElementFeatures> formObjectsOccuringMoreThanOnceOnAPage,
double minAreaCoveringPage) {
double minAreaCoveringPage,
Page page) {
if (visitedXObjIds.add(element.getXObject().getObjNum())) {
ElementReader xObjectReader = new ElementReader();
xObjectReader.begin(element.getXObject());
for (Element element1 = xObjectReader.next(); element1 != null; element1 = xObjectReader.next()) {
processElement(element1, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage);
processElement(element1, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage, page);
}
elementFeaturesLinkedList.add(ElementFeatureFactory.extractFeatures(element));
xObjectReader.destroy();
@ -159,7 +242,8 @@ public class WatermarkRemovalService {
.flatMap(Collection::stream)
.filter(elementFeature -> formObjectsPerPage.values()
.stream()
.filter(elementFeaturesOnPage -> elementFeaturesOnPage.stream().anyMatch(elementFeature::almostMatches))
.filter(elementFeaturesOnPage -> elementFeaturesOnPage.stream()
.anyMatch(elementFeature.getElementType() == Element.e_image || elementFeature.getElementType() == Element.e_inline_image ? elementFeature::similarMatches : elementFeature::almostMatches))
.count() >= minPagesFilter)
.toList();
}
@ -210,21 +294,28 @@ public class WatermarkRemovalService {
double minAreaCoveringFromPage = AREA_THRESHOLD * page.getPageHeight() * page.getPageWidth();
for (Element element = reader.next(); element != null; element = reader.next()) {
switch (element.getType()) {
case Element.e_image, Element.e_inline_image -> {
if (element.getBBox() == null) {
writer.writeElement(element);
continue;
}
if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringFromPage || element.getXObject() == null) {
writer.writeElement(element);
continue;
if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringFromPage) {
boolean isLocatedNearBorder = element.getBBox().getY1() < page.getVisibleContentBox()
.getY1() + page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD || element.getBBox().getY2() > page.getVisibleContentBox()
.getY2() - page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD || element.getBBox().getX1() < page.getVisibleContentBox()
.getX1() + page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD || element.getBBox().getX2() > page.getVisibleContentBox()
.getX2() - page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD;
if ((isLocatedNearBorder && element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringFromPage) || element.getXObject() == null) {
writer.writeElement(element);
continue;
}
}
removeImages(element, writer, watermarksElementFeaturesList);
}
case Element.e_form ->
processForms(page, element, reader, writer, watermarksElementFeaturesList, visitedXObjIds);
case Element.e_form -> processForms(page, element, reader, writer, watermarksElementFeaturesList, visitedXObjIds);
case Element.e_text -> processText(element, reader, writer, watermarksElementFeaturesList, page);
default -> writer.writeElement(element);
}
}
@ -232,13 +323,41 @@ public class WatermarkRemovalService {
@SneakyThrows
private void removeImages(Element element, ElementWriter
writer, List<ElementFeatures> watermarksElementFeaturesList) {
/**
 * Removal pass for text elements: writes the element to the output unless it is
 * identified as a watermark.
 * The element is kept when the text search is disabled, when it has no bounding box,
 * when its rotation is outside the window derived from ROTATED_TEXT_THRESHOLD, when it
 * lies inside the bottom band of the page, or when no watermark feature almost matches it.
 *
 * @param element                       the text element currently read
 * @param reader                        the reader the element came from (not used by this method)
 * @param writer                        receives every element that is kept
 * @param watermarksElementFeaturesList features of the detected watermarks
 * @param page                          the page being rewritten
 */
private void processText(Element element, ElementReader reader, ElementWriter writer, List<ElementFeatures> watermarksElementFeaturesList, Page page) {
    // An element without a bounding box cannot be matched; keep it instead of failing
    // on the dereference below. The detection pass applies the same null check.
    if (!foundTextWatermark || element.getBBox() == null) {
        writer.writeElement(element);
        return;
    }

    // Rotation window expressed as sine values and compared against |CTM.b|.
    double rotationComponent = Math.abs(element.getCTM().getB());
    boolean rotationOutsideWindow = rotationComponent < Math.sin(Math.toRadians(ROTATED_TEXT_THRESHOLD)) //
            || rotationComponent > Math.sin(Math.toRadians(70 - ROTATED_TEXT_THRESHOLD));
    if (rotationOutsideWindow) {
        writer.writeElement(element);
        return;
    }

    // Text that lies completely inside the bottom band of the page is never removed.
    double upperEdge = Math.max(element.getBBox().getY1(), element.getBBox().getY2());
    double bottomBandLimit = page.getVisibleContentBox().getY1() + page.getPageHeight() * TEXT_POSITION_THRESHOLD;
    if (upperEdge < bottomBandLimit) {
        writer.writeElement(element);
        return;
    }

    for (ElementFeatures elementFeatures : watermarksElementFeaturesList) {
        if (elementFeatures.almostMatches(element)) {
            // Watermark: drop the element by not writing it.
            return;
        }
    }
    writer.writeElement(element);
}
@SneakyThrows
private void removeImages(Element element, ElementWriter writer, List<ElementFeatures> watermarksElementFeaturesList) {
String hashValueOfImage = ImageHashFactory.calculate(element);
ElementFeatures imageFeatures = ElementFeatureFactory.extractFeaturesWithHash(element, hashValueOfImage);
for (ElementFeatures elementFeatures : watermarksElementFeaturesList) {
if (elementFeatures.almostMatches(imageFeatures)) {
if (elementFeatures.similarMatches(imageFeatures)) {
return;
}
}

View File

@ -1,15 +1,17 @@
package com.iqser.red.pdftronlogic.commons;
import com.pdftron.pdf.PDFNet;
import lombok.SneakyThrows;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.junit.platform.commons.util.StringUtils;
import java.io.FileOutputStream;
import java.nio.file.Path;
import java.util.Locale;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.junit.platform.commons.util.StringUtils;
import com.pdftron.pdf.PDFNet;
import lombok.SneakyThrows;
@Disabled
class WatermarkRemovalServiceTest {