RED-7075: Styling changes

This commit is contained in:
RaphaelArnold 2023-09-01 09:58:16 +02:00
parent 42836ae35a
commit bf3015161b
2 changed files with 57 additions and 73 deletions

View File

@ -22,6 +22,9 @@ import lombok.experimental.SuperBuilder;
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ElementFeatures {
final private static double RECT_POSITION_SIMILARITY_THRESHOLD_FACTOR = 0.2; // specify how much the x and y value are allowed to differ
final private static double RECT_SIZE_SIMILARITY_THRESHOLD_FACTOR = 0.1; // the scale the images are allowed to differ
final private static double HAMMING_DISTANCE_THRESHOLD = 4; // defines the similarity of the hash of images
int elementType;
Rectangle2D boundingBox;
@ -68,53 +71,33 @@ public class ElementFeatures {
}
public boolean similarMatches(Element element) throws PDFNetException {
public boolean isSimilarTo(ElementFeatures elementFeatures) {
return element.getType() == elementType && //
element.getBBox() != null && //
rectsSimilarMatch(element.getBBox());
return elementFeatures.getElementType() == elementType && elementFeatures.getBoundingBox() != null && areRectsSimilar(elementFeatures.getBoundingBox());
}
@SneakyThrows
private boolean rectsSimilarMatch(Rect rect) {
private boolean areRectsSimilar(Rectangle2D rectangle2D) {
// To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance
return similarEqualPosition(rect.getX1(), boundingBox.getX(), rect.getWidth()) && //
similarEqualPosition(rect.getY1(), boundingBox.getY(), rect.getHeight()) && //
similarEqualSize(rect.getWidth(), boundingBox.getWidth()) && //
similarEqualSize(rect.getHeight(), boundingBox.getHeight());
return isPositionSimilar(rectangle2D.getX(), boundingBox.getX(), rectangle2D.getWidth()) && //
isPositionSimilar(rectangle2D.getY(), boundingBox.getY(), rectangle2D.getHeight()) && //
isSizeSimilar(rectangle2D.getWidth(), boundingBox.getWidth()) && //
isSizeSimilar(rectangle2D.getHeight(), boundingBox.getHeight());
}
protected boolean similarEqualPosition(double a, double b, double boxSize) {
protected boolean isPositionSimilar(double a, double b, double boxSize) {
return Math.abs(a - b) < boxSize * 0.2;
return Math.abs(a - b) < boxSize * RECT_POSITION_SIMILARITY_THRESHOLD_FACTOR;
}
protected boolean similarEqualSize(double a, double b) {
protected boolean isSizeSimilar(double a, double b) {
return Math.abs(a - b) < a * 0.1;
return Math.abs(a - b) < a * RECT_SIZE_SIMILARITY_THRESHOLD_FACTOR;
}
public boolean similarMatches(ElementFeatures elementFeatures) {
return elementFeatures.getElementType() == elementType && elementFeatures.getBoundingBox() != null && rectsSimilarMatch(elementFeatures.getBoundingBox());
}
private boolean rectsSimilarMatch(Rectangle2D rectangle2D) {
// To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance
return similarEqualPosition(rectangle2D.getX(), boundingBox.getX(), rectangle2D.getWidth()) && //
similarEqualPosition(rectangle2D.getY(), boundingBox.getY(), rectangle2D.getHeight()) && //
similarEqualSize(rectangle2D.getWidth(), boundingBox.getWidth()) && //
similarEqualSize(rectangle2D.getHeight(), boundingBox.getHeight());
}
@EqualsAndHashCode(callSuper = true)
@Getter
@SuperBuilder
@ -215,26 +198,20 @@ public class ElementFeatures {
return false;
}
return super.almostMatches(elementFeatures) && this.dataSize == ((Image) elementFeatures).getDataSize() && this.height == ((Image) elementFeatures).getHeight() && this.width == ((Image) elementFeatures).getWidth() && this.renderingIntent == ((Image) elementFeatures).getRenderingIntent() && this.componentNum == ((Image) elementFeatures).getComponentNum() && this.bitsPerComponent == ((Image) elementFeatures).getBitsPerComponent() && calculateHammingDistance(
((Image) elementFeatures).getHashOfImage()) <= 4;
((Image) elementFeatures).getHashOfImage()) <= HAMMING_DISTANCE_THRESHOLD;
}
public boolean similarMatches(Element element) throws PDFNetException {
public boolean isSimilarTo(ElementFeatures elementFeatures) {
return super.almostMatches(element) && //
dataSize == element.getImageDataSize() && //
height == element.getImageHeight() && //
width == element.getImageWidth() && //
renderingIntent == element.getImageRenderingIntent() && //
componentNum == element.getComponentNum() && //
bitsPerComponent == element.getBitsPerComponent();
}
public boolean similarMatches(ElementFeatures elementFeatures) {
return super.similarMatches(elementFeatures) && this.dataSize == ((Image) elementFeatures).getDataSize() && this.height == ((Image) elementFeatures).getHeight() && this.width == ((Image) elementFeatures).getWidth() && this.renderingIntent == ((Image) elementFeatures).getRenderingIntent() && this.componentNum == ((Image) elementFeatures).getComponentNum() && this.bitsPerComponent == ((Image) elementFeatures).getBitsPerComponent() && calculateHammingDistance(
((Image) elementFeatures).getHashOfImage()) <= 4;
return super.isSimilarTo(elementFeatures) && //
//this.dataSize == ((Image) elementFeatures).getDataSize() && //
//this.height == ((Image) elementFeatures).getHeight() && //
//this.width == ((Image) elementFeatures).getWidth() && //
//this.renderingIntent == ((Image) elementFeatures).getRenderingIntent() && //
//this.componentNum == ((Image) elementFeatures).getComponentNum() && //
//this.bitsPerComponent == ((Image) elementFeatures).getBitsPerComponent() && //
calculateHammingDistance(((Image) elementFeatures).getHashOfImage()) <= HAMMING_DISTANCE_THRESHOLD;
}

View File

@ -45,8 +45,10 @@ public class WatermarkRemovalService {
/**
* The method remove watermark works only for Documents with size greater than MIN_PAGES_THRESHOLD.
* First the possible watermarks (big XObjects or Images) will be detected and then checked if those appear on most pages according to the
* OCCURING_ON_PAGES_THRESHOLD_FACTOR by using image hashing for similarity and size and stream size of the xobjects.
* The following watermarks will be found: big XObjects, big Images, small Images that appear in the middle of the page, and
* text that is rotated and big enough compared to height of page.
* First the possible watermarks will be detected and then checked if those appear on most pages according to the
* OCCURING_ON_PAGES_THRESHOLD_FACTOR. We use image hashing for similarity between pictures, and the size and stream size of the XObjects.
* If so, these detected and confirmed watermarks will not be written to the PDF file.
*
* @param pdfFile PDFFile to remove watermarks
@ -142,16 +144,7 @@ public class WatermarkRemovalService {
shouldTextSearchBeContinued(elementFeaturesLinkedList);
}
if (!foundTextWatermark) {
return;
}
if (Math.abs(element.getCTM().getB()) < Math.sin(Math.toRadians(ROTATED_TEXT_THRESHOLD)) || Math.abs(element.getCTM()
.getB()) > Math.sin(Math.toRadians(70 - ROTATED_TEXT_THRESHOLD))) {
return;
}
if (Math.max(element.getBBox().getY1(), element.getBBox().getY2()) < page.getVisibleContentBox().getY1() + page.getPageHeight() * TEXT_POSITION_THRESHOLD) {
if (!couldTextBeAWatermark(element, page)) {
return;
}
@ -165,6 +158,14 @@ public class WatermarkRemovalService {
}
/**
 * Tests whether the absolute {@code b} component of the element's CTM lies outside the band
 * bounded by {@code sin(ROTATED_TEXT_THRESHOLD)} and {@code sin(70 - ROTATED_TEXT_THRESHOLD)}
 * (both angles in degrees).
 *
 * NOTE(review): despite the name, {@code true} means the value is OUTSIDE that band;
 * couldTextBeAWatermark rejects text for which this returns {@code true}. The literal 70 is
 * taken over unchanged; its rationale is not visible here, TODO confirm.
 *
 * @param element the text element whose transformation matrix is inspected
 * @return {@code true} if {@code |b|} is below the lower bound or above the upper bound
 */
@SneakyThrows
private boolean isTextRotated(Element element) {

    double absoluteB = Math.abs(element.getCTM().getB());
    double lowerBound = Math.sin(Math.toRadians(ROTATED_TEXT_THRESHOLD));
    double upperBound = Math.sin(Math.toRadians(70 - ROTATED_TEXT_THRESHOLD));

    return absoluteB < lowerBound || absoluteB > upperBound;
}
private void shouldTextSearchBeContinued(List<ElementFeatures> elementFeaturesLinkedList) {
int countTextWatermarks = 0;
@ -245,7 +246,7 @@ public class WatermarkRemovalService {
.filter(elementFeature -> formObjectsPerPage.values()
.stream()
.filter(elementFeaturesOnPage -> elementFeaturesOnPage.stream()
.anyMatch(elementFeature.getElementType() == Element.e_image || elementFeature.getElementType() == Element.e_inline_image ? elementFeature::similarMatches : elementFeature::almostMatches))
.anyMatch(elementFeature.getElementType() == Element.e_image || elementFeature.getElementType() == Element.e_inline_image ? elementFeature::isSimilarTo : elementFeature::almostMatches))
.count() >= minPagesFilter)
.toList();
}
@ -322,18 +323,7 @@ public class WatermarkRemovalService {
@SneakyThrows
private void processText(Element element, ElementWriter writer, List<ElementFeatures> watermarksElementFeaturesList, Page page) {
if (!foundTextWatermark) {
writer.writeElement(element);
return;
}
if (Math.abs(element.getCTM().getB()) < Math.sin(Math.toRadians(ROTATED_TEXT_THRESHOLD)) || Math.abs(element.getCTM()
.getB()) > Math.sin(Math.toRadians(70 - ROTATED_TEXT_THRESHOLD))) {
writer.writeElement(element);
return;
}
if (Math.max(element.getBBox().getY1(), element.getBBox().getY2()) < page.getVisibleContentBox().getY1() + page.getPageHeight() * TEXT_POSITION_THRESHOLD) {
if (!couldTextBeAWatermark(element, page)) {
writer.writeElement(element);
return;
}
@ -348,13 +338,30 @@ public class WatermarkRemovalService {
}
/**
 * Pre-filter deciding whether a text element is worth considering as a watermark.
 *
 * The element is rejected when no text watermark has been found (foundTextWatermark is false),
 * when isTextRotated(element) returns true, or when the larger of the two y-coordinates of its
 * bounding box lies below the y1 of the page's visible content box plus
 * TEXT_POSITION_THRESHOLD times the page height.
 *
 * @param element the text element to check
 * @param page    the page the element is located on
 * @return {@code true} if none of the rejection criteria applies
 * @throws PDFNetException declared by this method; presumably raised by the element/page accessors
 */
private boolean couldTextBeAWatermark(Element element, Page page) throws PDFNetException {

    if (!foundTextWatermark || isTextRotated(element)) {
        return false;
    }

    double largerY = Math.max(element.getBBox().getY1(), element.getBBox().getY2());
    double minimumY = page.getVisibleContentBox().getY1() + page.getPageHeight() * TEXT_POSITION_THRESHOLD;

    // Negated '<' rather than '>=' so that a NaN coordinate is treated exactly as before.
    return !(largerY < minimumY);
}
@SneakyThrows
private void removeImages(Element element, ElementWriter writer, List<ElementFeatures> watermarksElementFeaturesList) {
String hashValueOfImage = ImageHashFactory.calculate(element);
ElementFeatures imageFeatures = ElementFeatureFactory.extractFeaturesWithHash(element, hashValueOfImage);
for (ElementFeatures elementFeatures : watermarksElementFeaturesList) {
if (elementFeatures.similarMatches(imageFeatures)) {
if (elementFeatures.isSimilarTo(imageFeatures)) {
return;
}
}