RED-7080: Remove watermarks that are named as watermarks in OCG

This commit is contained in:
deiflaender 2023-08-14 13:11:46 +02:00
parent f39ed2e586
commit a6e10ad5b9

View File

@ -1,26 +1,17 @@
package com.iqser.red.pdftronlogic.commons;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.*;
import com.pdftron.pdf.ocg.Group;
import com.pdftron.pdf.ocg.OCMD;
import com.pdftron.sdf.Obj;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import java.util.*;
@Slf4j
public class WatermarkRemovalService {
@ -36,15 +27,16 @@ public class WatermarkRemovalService {
* First, candidate watermarks (large XObjects or images) are detected. Each candidate is then checked for whether it appears on most pages, as defined by
* OCCURING_ON_PAGES_THRESHOLD_FACTOR, using image hashing for similarity together with the size and stream size of the XObjects.
* Candidates that are detected and confirmed this way are not written to the PDF file.
*
* @param pdfFile PDFFile to remove watermarks
* @param out The OutputStream the final file will be written to
* @param out The OutputStream the final file will be written to
*/
@SneakyThrows
public void removeWatermarks(InputStream pdfFile, OutputStream out) {
PDFDoc pdfDoc = new PDFDoc(pdfFile);
if(pdfDoc.getPageCount() < MIN_PAGES_THRESHOLD){
if (pdfDoc.getPageCount() < MIN_PAGES_THRESHOLD) {
log.info("Document page count {} is below threshold {}", pdfDoc.getPageCount(), MIN_PAGES_THRESHOLD);
} else {
Map<Long, List<ElementFeatures>> formObjectsForPages = findAllFormObjectsAndImages(pdfDoc);
@ -79,7 +71,6 @@ public class WatermarkRemovalService {
ElementReader reader = new ElementReader();
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
Page page = iterator.next();
@ -103,10 +94,10 @@ public class WatermarkRemovalService {
private void processElement(Element element,
Set<Long> visitedXObjIds,
List<ElementFeatures> elementFeaturesLinkedList,
List<ElementFeatures> formObjectsOccuringMoreThanOnceOnAPage,
double minAreaCoveringPage) throws PDFNetException {
Set<Long> visitedXObjIds,
List<ElementFeatures> elementFeaturesLinkedList,
List<ElementFeatures> formObjectsOccuringMoreThanOnceOnAPage,
double minAreaCoveringPage) throws PDFNetException {
if (element.getBBox() == null) {
return;
@ -137,10 +128,10 @@ public class WatermarkRemovalService {
@SneakyThrows
private void processXObject(Element element,
Set<Long> visitedXObjIds,
List<ElementFeatures> elementFeaturesLinkedList,
List<ElementFeatures> formObjectsOccuringMoreThanOnceOnAPage,
double minAreaCoveringPage) {
Set<Long> visitedXObjIds,
List<ElementFeatures> elementFeaturesLinkedList,
List<ElementFeatures> formObjectsOccuringMoreThanOnceOnAPage,
double minAreaCoveringPage) {
if (visitedXObjIds.add(element.getXObject().getObjNum())) {
ElementReader xObjectReader = new ElementReader();
@ -198,10 +189,10 @@ public class WatermarkRemovalService {
@SneakyThrows
private void writeAllElementsExceptWatermarks(Page page,
ElementReader reader,
ElementWriter writer,
List<ElementFeatures> watermarksElementFeaturesList,
Set<Long> visitedXObjIds) {
ElementReader reader,
ElementWriter writer,
List<ElementFeatures> watermarksElementFeaturesList,
Set<Long> visitedXObjIds) {
reader.begin(page);
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
@ -212,14 +203,18 @@ public class WatermarkRemovalService {
private void processElements(Page page,
ElementReader reader,
ElementWriter writer,
List<ElementFeatures> watermarksElementFeaturesList,
Set<Long> visitedXObjIds) throws PDFNetException {
ElementReader reader,
ElementWriter writer,
List<ElementFeatures> watermarksElementFeaturesList,
Set<Long> visitedXObjIds) throws PDFNetException {
double minAreaCoveringFromPage = AREA_THRESHOLD * page.getPageHeight() * page.getPageWidth();
for (Element element = reader.next(); element != null; element = reader.next()) {
if (inOCGWatermark(element)) {
continue;
}
switch (element.getType()) {
case Element.e_image, Element.e_inline_image -> {
if (element.getBBox() == null) {
@ -232,7 +227,8 @@ public class WatermarkRemovalService {
}
removeImages(element, writer, watermarksElementFeaturesList);
}
case Element.e_form -> processForms(page, element, reader, writer, watermarksElementFeaturesList, visitedXObjIds);
case Element.e_form ->
processForms(page, element, reader, writer, watermarksElementFeaturesList, visitedXObjIds);
default -> writer.writeElement(element);
}
}
@ -240,7 +236,27 @@ public class WatermarkRemovalService {
@SneakyThrows
private void removeImages(Element element, ElementWriter writer, List<ElementFeatures> watermarksElementFeaturesList) {
/*
 * Tells whether the given element is an XObject whose optional-content entry ("OC")
 * refers to an optional content group (OCG) named "Watermark".
 *
 * Returns false when the element has no XObject, the XObject has no "OC" entry,
 * the entry is not a valid OCMD, or no referenced group carries that name.
 */
private boolean inOCGWatermark(Element element) {
    Obj xObj = element.getXObject();
    if (xObj == null) {
        return false;
    }

    Obj oc = xObj.findObj("OC");
    if (oc == null) {
        return false;
    }

    OCMD ocmd = new OCMD(oc);
    if (!ocmd.isValid()) {
        return false;
    }

    // Guard against a missing "OCGs" entry before wrapping it in a Group.
    Obj ocgs = ocmd.getOCGs();
    if (ocgs == null) {
        return false;
    }

    // Per the PDF specification the "OCGs" entry may be a single group or an array of groups.
    // NOTE(review): an element is treated as a watermark if ANY listed group is named
    // "Watermark"; the OCMD visibility policy is not evaluated - confirm this is intended.
    if (ocgs.isArray()) {
        long count = ocgs.size();
        for (int i = 0; i < count; i++) {
            if (isWatermarkGroup(ocgs.getAt(i))) {
                return true;
            }
        }
        return false;
    }

    return isWatermarkGroup(ocgs);
}

/**
 * Checks whether a single OCG object is a valid group named "Watermark".
 *
 * @param ocg the candidate optional content group object, may be null
 * @return true if the object is a valid group whose name equals "Watermark"
 */
private boolean isWatermarkGroup(Obj ocg) throws PDFNetException {
    if (ocg == null) {
        return false;
    }
    Group group = new Group(ocg);
    // Constant-first comparison so a group without a name cannot cause a NullPointerException.
    return group.isValid() && "Watermark".equals(group.getName());
}
@SneakyThrows
private void removeImages(Element element, ElementWriter
writer, List<ElementFeatures> watermarksElementFeaturesList) {
String hashValueOfImage = ImageHashFactory.calculate(element);
ElementFeatures imageFeatures = ElementFeatureFactory.extractFeaturesWithHash(element, hashValueOfImage);
@ -255,11 +271,11 @@ public class WatermarkRemovalService {
private void processForms(Page page,
Element element,
ElementReader reader,
ElementWriter writer,
List<ElementFeatures> watermarksElementFeaturesList,
Set<Long> visitedXObjIds) throws PDFNetException {
Element element,
ElementReader reader,
ElementWriter writer,
List<ElementFeatures> watermarksElementFeaturesList,
Set<Long> visitedXObjIds) throws PDFNetException {
for (ElementFeatures elementFeatures : watermarksElementFeaturesList) {
if (elementFeatures.almostMatches(element)) {