Merge branch 'RED-7075' into 'master'

RED-7075: Improved watermark removal to recognize smaller images and text

Closes RED-7075

See merge request redactmanager/commons/pdftron-logic-commons!15
This commit is contained in:
Raphael Arnold 2023-09-01 12:06:56 +02:00
commit 612bb5a63a
3 changed files with 235 additions and 74 deletions

View File

@ -3,7 +3,6 @@ package com.iqser.red.pdftronlogic.commons;
import static com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService.TOLERANCE;
import java.awt.Color;
import java.awt.geom.Area;
import java.awt.geom.GeneralPath;
import java.awt.geom.Rectangle2D;
@ -23,6 +22,9 @@ import lombok.experimental.SuperBuilder;
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ElementFeatures {
final private static double RECT_POSITION_SIMILARITY_THRESHOLD_FACTOR = 0.2; // specify how much the x and y value are allowed to differ
final private static double RECT_SIZE_SIMILARITY_THRESHOLD_FACTOR = 0.1; // the scale the images are allowed to differ
final private static double HAMMING_DISTANCE_THRESHOLD = 4; // defines the similarity of the hash of images
int elementType;
Rectangle2D boundingBox;
@ -34,18 +36,6 @@ public class ElementFeatures {
rectsAlmostMatch(element.getBBox());
}
public boolean almostMatches(ElementFeatures elementFeatures){
return elementFeatures.getElementType() == elementType &&
elementFeatures.getBoundingBox() != null &&
rectsAlmostMatch(elementFeatures.getBoundingBox());
}
protected boolean almostEqual(double a, double b) {
return Math.abs(a - b) < TOLERANCE;
}
@SneakyThrows
private boolean rectsAlmostMatch(Rect bBox) {
@ -57,6 +47,19 @@ public class ElementFeatures {
almostEqual(bBox.getHeight(), boundingBox.getHeight());
}
/**
 * Checks whether two values differ by less than {@code TOLERANCE}.
 *
 * @param a first value
 * @param b second value
 * @return {@code true} if the absolute difference is strictly smaller than {@code TOLERANCE}
 */
protected boolean almostEqual(double a, double b) {

    double difference = Math.abs(a - b);
    return difference < TOLERANCE;
}
/**
 * Compares these features with another feature set using the tolerance-based rectangle comparison.
 *
 * @param elementFeatures the features to compare against
 * @return {@code true} if the element types are equal, the other bounding box is not {@code null}
 *         and both bounding boxes almost match
 */
public boolean almostMatches(ElementFeatures elementFeatures) {

    if (elementFeatures.getElementType() != elementType) {
        return false;
    }
    Rectangle2D otherBoundingBox = elementFeatures.getBoundingBox();
    return otherBoundingBox != null && rectsAlmostMatch(otherBoundingBox);
}
@SneakyThrows
private boolean rectsAlmostMatch(Rectangle2D bBox) {
// To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance
@ -68,6 +71,34 @@ public class ElementFeatures {
}
/**
 * Looser comparison than {@code almostMatches}: position and size only have to be similar
 * within the relative thresholds applied by {@code areRectsSimilar}.
 *
 * @param elementFeatures the features to compare against
 * @return {@code true} if the element types are equal, the other bounding box is not {@code null}
 *         and both bounding boxes are similar
 */
public boolean isSimilarTo(ElementFeatures elementFeatures) {

    if (elementFeatures.getElementType() != elementType) {
        return false;
    }
    Rectangle2D otherBoundingBox = elementFeatures.getBoundingBox();
    return otherBoundingBox != null && areRectsSimilar(otherBoundingBox);
}
/**
 * Checks whether the given rectangle is similar to this element's bounding box.
 * The position tolerance is scaled by the width/height of the given rectangle.
 *
 * @param rectangle2D the rectangle to compare with this element's bounding box
 * @return {@code true} if x, y, width and height are all similar
 */
private boolean areRectsSimilar(Rectangle2D rectangle2D) {

    // Bounding boxes are calculated inconsistently, so every component is compared with a tolerance
    if (!isPositionSimilar(rectangle2D.getX(), boundingBox.getX(), rectangle2D.getWidth())) {
        return false;
    }
    if (!isPositionSimilar(rectangle2D.getY(), boundingBox.getY(), rectangle2D.getHeight())) {
        return false;
    }
    if (!isSizeSimilar(rectangle2D.getWidth(), boundingBox.getWidth())) {
        return false;
    }
    return isSizeSimilar(rectangle2D.getHeight(), boundingBox.getHeight());
}
/**
 * Checks whether two coordinates lie close together relative to a box dimension.
 *
 * @param a       first coordinate
 * @param b       second coordinate
 * @param boxSize box dimension that scales the allowed offset
 * @return {@code true} if the coordinates differ by less than
 *         {@code boxSize * RECT_POSITION_SIMILARITY_THRESHOLD_FACTOR}
 */
protected boolean isPositionSimilar(double a, double b, double boxSize) {

    double allowedOffset = boxSize * RECT_POSITION_SIMILARITY_THRESHOLD_FACTOR;
    return Math.abs(a - b) < allowedOffset;
}
/**
 * Checks whether two lengths are similar. The allowed deviation is relative to the first
 * argument, so the check is not symmetric in {@code a} and {@code b}; for {@code a <= 0}
 * the result is always {@code false}.
 *
 * @param a reference length that scales the allowed deviation
 * @param b length compared against the reference
 * @return {@code true} if the lengths differ by less than
 *         {@code a * RECT_SIZE_SIMILARITY_THRESHOLD_FACTOR}
 */
protected boolean isSizeSimilar(double a, double b) {

    double allowedDeviation = a * RECT_SIZE_SIMILARITY_THRESHOLD_FACTOR;
    return Math.abs(a - b) < allowedDeviation;
}
@EqualsAndHashCode(callSuper = true)
@Getter
@SuperBuilder
@ -148,6 +179,7 @@ public class ElementFeatures {
int bitsPerComponent;
String hashOfImage;
@Override
public boolean almostMatches(Element element) throws PDFNetException {
@ -160,22 +192,27 @@ public class ElementFeatures {
bitsPerComponent == element.getBitsPerComponent();
}
public boolean almostMatches(ElementFeatures elementFeatures){
if(elementFeatures.getClass() != this.getClass()){
public boolean almostMatches(ElementFeatures elementFeatures) {
if (elementFeatures.getClass() != this.getClass()) {
return false;
}
return super.almostMatches(elementFeatures) &&
this.dataSize == ((Image) elementFeatures).getDataSize() &&
this.height == ((Image) elementFeatures).getHeight() &&
this.width == ((Image) elementFeatures).getWidth() &&
this.renderingIntent == ((Image) elementFeatures).getRenderingIntent() &&
this.componentNum == ((Image) elementFeatures).getComponentNum() &&
this.bitsPerComponent == ((Image) elementFeatures).getBitsPerComponent() &&
calculateHammingDistance(((Image) elementFeatures).getHashOfImage()) <=4;
return super.almostMatches(elementFeatures) && this.dataSize == ((Image) elementFeatures).getDataSize() && this.height == ((Image) elementFeatures).getHeight() && this.width == ((Image) elementFeatures).getWidth() && this.renderingIntent == ((Image) elementFeatures).getRenderingIntent() && this.componentNum == ((Image) elementFeatures).getComponentNum() && this.bitsPerComponent == ((Image) elementFeatures).getBitsPerComponent() && calculateHammingDistance(
((Image) elementFeatures).getHashOfImage()) <= HAMMING_DISTANCE_THRESHOLD;
}
/**
 * Image-specific similarity check: in addition to the bounding-box similarity of the super
 * class, the hashes of both images must be within {@code HAMMING_DISTANCE_THRESHOLD}.
 *
 * @param elementFeatures the features to compare against
 * @return {@code false} if the argument is not of this class; otherwise {@code true} only if
 *         the super class check passes and the hash distance does not exceed the threshold
 */
@Override
public boolean isSimilarTo(ElementFeatures elementFeatures) {

    // Same guard as in almostMatches(ElementFeatures): the cast below is only safe for the same class
    if (elementFeatures.getClass() != this.getClass()) {
        return false;
    }
    return super.isSimilarTo(elementFeatures) && //
            calculateHammingDistance(((Image) elementFeatures).getHashOfImage()) <= HAMMING_DISTANCE_THRESHOLD;
}
// Helper method to calculate the Hamming distance between two hexadecimal strings
private int calculateHammingDistance(String hash2) {
int distance = 0;
int maxLength = Math.max(this.hashOfImage.length(), hash2.length());
for (int i = 0; i < maxLength; i++) {
@ -202,34 +239,32 @@ public class ElementFeatures {
@Override
public boolean almostMatches(Element element) throws PDFNetException {
return element.getType() == getElementType() && //
element.getBBox() != null && //
(super.rectsAlmostMatch(element.getBBox()) || almostRotateMatches(element.getBBox().getRectangle())) &&
xObjectType == element.getXObject().getType() &&
dictOrArrayOrStreamLength == element.getXObject().getDecodedStream().size();
(super.rectsAlmostMatch(element.getBBox()) || almostRotateMatches(element.getBBox().getRectangle())) && xObjectType == element.getXObject()
.getType() && dictOrArrayOrStreamLength == element.getXObject().getDecodedStream().size();
}
public boolean almostMatches(ElementFeatures elementFeatures){
if(elementFeatures.getClass() != this.getClass()){
public boolean almostMatches(ElementFeatures elementFeatures) {
if (elementFeatures.getClass() != this.getClass()) {
return false;
}
return elementFeatures.getElementType() == getElementType() &&
elementFeatures.getBoundingBox() != null &&
(super.rectsAlmostMatch(elementFeatures.getBoundingBox()) || almostRotateMatches(elementFeatures.getBoundingBox().getBounds2D())) &&
xObjectType == ((Form)elementFeatures).getXObjectType() &&
dictOrArrayOrStreamLength == ((Form)elementFeatures).getDictOrArrayOrStreamLength();
return elementFeatures.getElementType() == getElementType() && elementFeatures.getBoundingBox() != null && (super.rectsAlmostMatch(elementFeatures.getBoundingBox()) || almostRotateMatches(
elementFeatures.getBoundingBox()
.getBounds2D())) && xObjectType == ((Form) elementFeatures).getXObjectType() && dictOrArrayOrStreamLength == ((Form) elementFeatures).getDictOrArrayOrStreamLength();
}
/**
 * Checks whether the given box matches this element's bounding box with width and height swapped.
 *
 * @param bBox the box to compare
 * @return {@code true} if the width of {@code bBox} almost equals this height and its height
 *         almost equals this width
 */
private boolean almostRotateMatches(Rectangle2D bBox) {

    Rectangle2D ownBox = getBoundingBox();
    if (!almostEqual(bBox.getWidth(), ownBox.getHeight())) {
        return false;
    }
    return almostEqual(bBox.getHeight(), ownBox.getWidth());
}
}
}

View File

@ -1,28 +1,54 @@
package com.iqser.red.pdftronlogic.commons;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.*;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.*;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Slf4j
public class WatermarkRemovalService {
final static double AREA_THRESHOLD = 0.6; // multiplied with page area
final static double AREA_THRESHOLD = 0.5; // multiplied with page area
final static double OCCURING_ON_PAGES_THRESHOLD_FACTOR = 0.75; // multiplied with number of pages
final static int MIN_PAGES_THRESHOLD = 3;
final static double IMAGE_POSITION_HEIGHT_THRESHOLD = 0.2; // multiplied with page height
final static double IMAGE_POSITION_WIDTH_THRESHOLD = 0.125; // multiplied with page width
final static double TEXT_POSITION_THRESHOLD = 0.15;
final static double MIN_TEXTWATERMARK_HEIGHT_THRESHOLD = 0.125; // multiplied with page height
final static int PAGE_NUMBER_TEXT_SEARCH_THRESHOLD = 5; // stop text based search after 5 pages without watermark
final static double ROTATED_TEXT_THRESHOLD = 12.5; //this is in degrees
static boolean foundTextWatermark = true;
/**
* The method remove watermark works only for Documents with size greater than MIN_PAGES_THRESHOLD.
* First the possible watermarks (big XObjects or Images) will be detected and then checked if those appear on most pages according to the
* OCCURING_ON_PAGES_THRESHOLD_FACTOR by using image hashing for similarity and size and stream size of the xobjects.
* The following watermarks will be found: big XObjects, big Images, small Images that appear in the middle of the page, and
* text that is rotated and big enough compared to height of page.
* First the possible watermarks will be detected and then checked if those appear on most pages according to the
* OCCURING_ON_PAGES_THRESHOLD_FACTOR. We use image hashing for similarity between pictures, and the size and stream size for the XObjects.
* If so, the detected and confirmed watermarks will not be written to the PDF file.
*
* @param pdfFile PDFFile to remove watermarks
@ -46,7 +72,7 @@ public class WatermarkRemovalService {
log.info("Watermark found and will be removed!");
removeAllWatermarks(pdfDoc, watermarkElementFeatures);
} else {
log.info("No watermark found!");
log.info("No unlabeled watermark found!");
}
}
@ -69,7 +95,6 @@ public class WatermarkRemovalService {
ElementReader reader = new ElementReader();
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
Page page = iterator.next();
@ -80,7 +105,7 @@ public class WatermarkRemovalService {
reader.begin(page);
for (Element element = reader.next(); element != null; element = reader.next()) {
processElement(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringFromPage);
processElement(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringFromPage, page);
}
formObjectsAndImagesForPages.put(page.getSDFObj().getObjNum(), elementFeaturesLinkedList);
@ -96,28 +121,74 @@ public class WatermarkRemovalService {
Set<Long> visitedXObjIds,
List<ElementFeatures> elementFeaturesLinkedList,
List<ElementFeatures> formObjectsOccuringMoreThanOnceOnAPage,
double minAreaCoveringPage) throws PDFNetException {
double minAreaCoveringPage,
Page page) throws PDFNetException {
if (element.getBBox() == null) {
return;
}
if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringPage) {
switch (element.getType()) {
case Element.e_form -> processXObject(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage, page);
case Element.e_image, Element.e_inline_image -> processImages(element, elementFeaturesLinkedList, page, minAreaCoveringPage);
case Element.e_text -> processText(element, elementFeaturesLinkedList, page);
}
}
@SneakyThrows
private void processText(Element element, List<ElementFeatures> elementFeaturesLinkedList, Page page) {
if (page.getIndex() == PAGE_NUMBER_TEXT_SEARCH_THRESHOLD) {
shouldTextSearchBeContinued(elementFeaturesLinkedList);
}
if (!couldTextBeAWatermark(element, page)) {
return;
}
if (element.getType() == Element.e_form) {
processXObject(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage);
} else if (element.getType() == Element.e_image || element.getType() == Element.e_inline_image) {
if (element.getXObject() == null) {
return;
boolean isBigEnough = Math.abs(element.getBBox().getY1() - element.getBBox().getY2()) > page.getPageHeight() * MIN_TEXTWATERMARK_HEIGHT_THRESHOLD;
if (isBigEnough) {
ElementFeatures elementFeatures = ElementFeatureFactory.extractFeatures(element);
elementFeaturesLinkedList.add(elementFeatures);
}
}
/**
 * Returns {@code true} if the absolute value of the {@code b} component of the element's CTM is
 * below sin(ROTATED_TEXT_THRESHOLD degrees) or above sin((70 - ROTATED_TEXT_THRESHOLD) degrees).
 * <p>
 * NOTE(review): a small |b| presumably means (almost) unrotated text, so the method name reads
 * inverted for that branch - confirm the intended meaning with the callers.
 * NOTE(review): the constant 70 is unexplained; confirm whether 90 was intended. The comparison
 * also presumably assumes an unscaled CTM - TODO confirm.
 */
@SneakyThrows
private boolean isTextRotated(Element element) {

    double absoluteB = Math.abs(element.getCTM().getB());
    double lowerBound = Math.sin(Math.toRadians(ROTATED_TEXT_THRESHOLD));
    double upperBound = Math.sin(Math.toRadians(70 - ROTATED_TEXT_THRESHOLD));
    return absoluteB < lowerBound || absoluteB > upperBound;
}
private void shouldTextSearchBeContinued(List<ElementFeatures> elementFeaturesLinkedList) {
int countTextWatermarks = 0;
for (ElementFeatures elementFeatures : elementFeaturesLinkedList) {
if (elementFeatures.getElementType() == Element.e_text) {
countTextWatermarks++;
}
processImages(element, elementFeaturesLinkedList);
}
if (countTextWatermarks < elementFeaturesLinkedList.size() * OCCURING_ON_PAGES_THRESHOLD_FACTOR) {
foundTextWatermark = false;
}
}
@SneakyThrows
private void processImages(Element element, List<ElementFeatures> elementFeaturesLinkedList) {
private void processImages(Element element, List<ElementFeatures> elementFeaturesLinkedList, Page page, double minAreaCoveringPage) {
if (element.getXObject() == null) {
return;
}
if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringPage && isLocatedNearBorder(element, page)) {
return;
}
String hashOfImage = ImageHashFactory.calculate(element);
ElementFeatures elementFeatures = ElementFeatureFactory.extractFeaturesWithHash(element, hashOfImage);
@ -125,18 +196,34 @@ public class WatermarkRemovalService {
}
// Typically company logos on dossier pages are located near the border and should be excluded from the watermark removal.
// An element counts as near the border as soon as one edge of its bounding box lies inside the margin
// (page height/width multiplied with the respective threshold) measured from the visible content box.
@SneakyThrows
private boolean isLocatedNearBorder(Element element, Page page) {

    // Y1 edge within the vertical margin at the Y1 side of the visible content box
    if (element.getBBox().getY1() < page.getVisibleContentBox().getY1() + page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD) {
        return true;
    }
    // Y2 edge within the vertical margin at the Y2 side
    if (element.getBBox().getY2() > page.getVisibleContentBox().getY2() - page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD) {
        return true;
    }
    // X1 edge within the horizontal margin at the X1 side
    if (element.getBBox().getX1() < page.getVisibleContentBox().getX1() + page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD) {
        return true;
    }
    // X2 edge within the horizontal margin at the X2 side
    return element.getBBox().getX2() > page.getVisibleContentBox().getX2() - page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD;
}
@SneakyThrows
private void processXObject(Element element,
Set<Long> visitedXObjIds,
List<ElementFeatures> elementFeaturesLinkedList,
List<ElementFeatures> formObjectsOccuringMoreThanOnceOnAPage,
double minAreaCoveringPage) {
double minAreaCoveringPage,
Page page) {
if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringPage) {
return;
}
if (visitedXObjIds.add(element.getXObject().getObjNum())) {
ElementReader xObjectReader = new ElementReader();
xObjectReader.begin(element.getXObject());
for (Element element1 = xObjectReader.next(); element1 != null; element1 = xObjectReader.next()) {
processElement(element1, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage);
processElement(element1, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage, page);
}
elementFeaturesLinkedList.add(ElementFeatureFactory.extractFeatures(element));
xObjectReader.destroy();
@ -159,7 +246,8 @@ public class WatermarkRemovalService {
.flatMap(Collection::stream)
.filter(elementFeature -> formObjectsPerPage.values()
.stream()
.filter(elementFeaturesOnPage -> elementFeaturesOnPage.stream().anyMatch(elementFeature::almostMatches))
.filter(elementFeaturesOnPage -> elementFeaturesOnPage.stream()
.anyMatch(elementFeature.getElementType() == Element.e_image || elementFeature.getElementType() == Element.e_inline_image ? elementFeature::isSimilarTo : elementFeature::almostMatches))
.count() >= minPagesFilter)
.toList();
}
@ -210,21 +298,23 @@ public class WatermarkRemovalService {
double minAreaCoveringFromPage = AREA_THRESHOLD * page.getPageHeight() * page.getPageWidth();
for (Element element = reader.next(); element != null; element = reader.next()) {
switch (element.getType()) {
case Element.e_image, Element.e_inline_image -> {
if (element.getBBox() == null) {
writer.writeElement(element);
continue;
}
if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringFromPage || element.getXObject() == null) {
if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringFromPage && isLocatedNearBorder(element, page) && element.getBBox()
.getHeight() * element.getBBox().getWidth() < minAreaCoveringFromPage || element.getXObject() == null) {
writer.writeElement(element);
continue;
}
removeImages(element, writer, watermarksElementFeaturesList);
}
case Element.e_form ->
processForms(page, element, reader, writer, watermarksElementFeaturesList, visitedXObjIds);
case Element.e_form -> processForms(page, element, reader, writer, watermarksElementFeaturesList, visitedXObjIds);
case Element.e_text -> processText(element, writer, watermarksElementFeaturesList, page);
default -> writer.writeElement(element);
}
}
@ -232,13 +322,47 @@ public class WatermarkRemovalService {
@SneakyThrows
private void removeImages(Element element, ElementWriter
writer, List<ElementFeatures> watermarksElementFeaturesList) {
// Writes the text element to the writer unless it could be a watermark and
// almost matches one of the given watermark features; in that case it is dropped.
private void processText(Element element, ElementWriter writer, List<ElementFeatures> watermarksElementFeaturesList, Page page) {

    boolean matchesWatermark = false;
    if (couldTextBeAWatermark(element, page)) {
        for (ElementFeatures watermarkFeatures : watermarksElementFeaturesList) {
            if (watermarkFeatures.almostMatches(element)) {
                matchesWatermark = true;
                break;
            }
        }
    }

    if (!matchesWatermark) {
        writer.writeElement(element);
    }
}
// Pre-filter for text watermark candidates. Text is rejected when the text search has been
// switched off (foundTextWatermark == false), when isTextRotated(element) returns true, or when
// the larger of the bounding box's Y1/Y2 lies below the visible content box's Y1 plus
// TEXT_POSITION_THRESHOLD times the page height.
private boolean couldTextBeAWatermark(Element element, Page page) throws PDFNetException {

    if (!foundTextWatermark || isTextRotated(element)) {
        return false;
    }

    double upperEdge = Math.max(element.getBBox().getY1(), element.getBBox().getY2());
    double minimumY = page.getVisibleContentBox().getY1() + page.getPageHeight() * TEXT_POSITION_THRESHOLD;
    return !(upperEdge < minimumY);
}
@SneakyThrows
private void removeImages(Element element, ElementWriter writer, List<ElementFeatures> watermarksElementFeaturesList) {
String hashValueOfImage = ImageHashFactory.calculate(element);
ElementFeatures imageFeatures = ElementFeatureFactory.extractFeaturesWithHash(element, hashValueOfImage);
for (ElementFeatures elementFeatures : watermarksElementFeaturesList) {
if (elementFeatures.almostMatches(imageFeatures)) {
if (elementFeatures.isSimilarTo(imageFeatures)) {
return;
}
}

View File

@ -1,15 +1,17 @@
package com.iqser.red.pdftronlogic.commons;
import com.pdftron.pdf.PDFNet;
import lombok.SneakyThrows;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.junit.platform.commons.util.StringUtils;
import java.io.FileOutputStream;
import java.nio.file.Path;
import java.util.Locale;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.junit.platform.commons.util.StringUtils;
import com.pdftron.pdf.PDFNet;
import lombok.SneakyThrows;
@Disabled
class WatermarkRemovalServiceTest {