RED-7080: Remove watermarks that are named as watermarks in OCG
This commit is contained in:
parent
f39ed2e586
commit
a6e10ad5b9
@ -1,26 +1,17 @@
|
||||
package com.iqser.red.pdftronlogic.commons;
|
||||
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.*;
|
||||
import com.pdftron.pdf.ocg.Group;
|
||||
import com.pdftron.pdf.ocg.OCMD;
|
||||
import com.pdftron.sdf.Obj;
|
||||
import com.pdftron.sdf.SDFDoc;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.ElementReader;
|
||||
import com.pdftron.pdf.ElementWriter;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.pdf.PageIterator;
|
||||
import com.pdftron.sdf.SDFDoc;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import java.util.*;
|
||||
|
||||
@Slf4j
|
||||
public class WatermarkRemovalService {
|
||||
@ -36,15 +27,16 @@ public class WatermarkRemovalService {
|
||||
* First the possible watermarks (big XObjects or Images) will be detected and then checked if those appear on most pages according to the
|
||||
* OCCURING_ON_PAGES_THRESHOLD_FACTOR by using image hashing for similarity and size and stream size of the xobjects.
|
||||
* If so, the detected and confirmed watermarks will not be written to the PDF file.
|
||||
*
|
||||
* @param pdfFile PDFFile to remove watermarks
|
||||
* @param out The OutputStream the final file will be written to
|
||||
* @param out The OutputStream the final file will be written to
|
||||
*/
|
||||
@SneakyThrows
|
||||
public void removeWatermarks(InputStream pdfFile, OutputStream out) {
|
||||
|
||||
PDFDoc pdfDoc = new PDFDoc(pdfFile);
|
||||
|
||||
if(pdfDoc.getPageCount() < MIN_PAGES_THRESHOLD){
|
||||
if (pdfDoc.getPageCount() < MIN_PAGES_THRESHOLD) {
|
||||
log.info("Document page count {} is below threshold {}", pdfDoc.getPageCount(), MIN_PAGES_THRESHOLD);
|
||||
} else {
|
||||
Map<Long, List<ElementFeatures>> formObjectsForPages = findAllFormObjectsAndImages(pdfDoc);
|
||||
@ -79,7 +71,6 @@ public class WatermarkRemovalService {
|
||||
ElementReader reader = new ElementReader();
|
||||
|
||||
|
||||
|
||||
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
|
||||
|
||||
Page page = iterator.next();
|
||||
@ -103,10 +94,10 @@ public class WatermarkRemovalService {
|
||||
|
||||
|
||||
private void processElement(Element element,
|
||||
Set<Long> visitedXObjIds,
|
||||
List<ElementFeatures> elementFeaturesLinkedList,
|
||||
List<ElementFeatures> formObjectsOccuringMoreThanOnceOnAPage,
|
||||
double minAreaCoveringPage) throws PDFNetException {
|
||||
Set<Long> visitedXObjIds,
|
||||
List<ElementFeatures> elementFeaturesLinkedList,
|
||||
List<ElementFeatures> formObjectsOccuringMoreThanOnceOnAPage,
|
||||
double minAreaCoveringPage) throws PDFNetException {
|
||||
|
||||
if (element.getBBox() == null) {
|
||||
return;
|
||||
@ -137,10 +128,10 @@ public class WatermarkRemovalService {
|
||||
|
||||
@SneakyThrows
|
||||
private void processXObject(Element element,
|
||||
Set<Long> visitedXObjIds,
|
||||
List<ElementFeatures> elementFeaturesLinkedList,
|
||||
List<ElementFeatures> formObjectsOccuringMoreThanOnceOnAPage,
|
||||
double minAreaCoveringPage) {
|
||||
Set<Long> visitedXObjIds,
|
||||
List<ElementFeatures> elementFeaturesLinkedList,
|
||||
List<ElementFeatures> formObjectsOccuringMoreThanOnceOnAPage,
|
||||
double minAreaCoveringPage) {
|
||||
|
||||
if (visitedXObjIds.add(element.getXObject().getObjNum())) {
|
||||
ElementReader xObjectReader = new ElementReader();
|
||||
@ -198,10 +189,10 @@ public class WatermarkRemovalService {
|
||||
|
||||
@SneakyThrows
|
||||
private void writeAllElementsExceptWatermarks(Page page,
|
||||
ElementReader reader,
|
||||
ElementWriter writer,
|
||||
List<ElementFeatures> watermarksElementFeaturesList,
|
||||
Set<Long> visitedXObjIds) {
|
||||
ElementReader reader,
|
||||
ElementWriter writer,
|
||||
List<ElementFeatures> watermarksElementFeaturesList,
|
||||
Set<Long> visitedXObjIds) {
|
||||
|
||||
reader.begin(page);
|
||||
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
|
||||
@ -212,14 +203,18 @@ public class WatermarkRemovalService {
|
||||
|
||||
|
||||
private void processElements(Page page,
|
||||
ElementReader reader,
|
||||
ElementWriter writer,
|
||||
List<ElementFeatures> watermarksElementFeaturesList,
|
||||
Set<Long> visitedXObjIds) throws PDFNetException {
|
||||
ElementReader reader,
|
||||
ElementWriter writer,
|
||||
List<ElementFeatures> watermarksElementFeaturesList,
|
||||
Set<Long> visitedXObjIds) throws PDFNetException {
|
||||
|
||||
double minAreaCoveringFromPage = AREA_THRESHOLD * page.getPageHeight() * page.getPageWidth();
|
||||
for (Element element = reader.next(); element != null; element = reader.next()) {
|
||||
|
||||
if (inOCGWatermark(element)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
switch (element.getType()) {
|
||||
case Element.e_image, Element.e_inline_image -> {
|
||||
if (element.getBBox() == null) {
|
||||
@ -232,7 +227,8 @@ public class WatermarkRemovalService {
|
||||
}
|
||||
removeImages(element, writer, watermarksElementFeaturesList);
|
||||
}
|
||||
case Element.e_form -> processForms(page, element, reader, writer, watermarksElementFeaturesList, visitedXObjIds);
|
||||
case Element.e_form ->
|
||||
processForms(page, element, reader, writer, watermarksElementFeaturesList, visitedXObjIds);
|
||||
default -> writer.writeElement(element);
|
||||
}
|
||||
}
|
||||
@ -240,7 +236,27 @@ public class WatermarkRemovalService {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void removeImages(Element element, ElementWriter writer, List<ElementFeatures> watermarksElementFeaturesList) {
|
||||
private boolean inOCGWatermark(Element element) {
|
||||
var xObj = element.getXObject();
|
||||
if (xObj != null) {
|
||||
Obj oc = xObj.findObj("OC");
|
||||
if (oc != null) {
|
||||
OCMD ocmd = new OCMD(oc);
|
||||
if (ocmd.isValid()) {
|
||||
Group group = new Group(ocmd.getOCGs());
|
||||
if (group.isValid() && group.getName().equals("Watermark")) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void removeImages(Element element, ElementWriter
|
||||
writer, List<ElementFeatures> watermarksElementFeaturesList) {
|
||||
|
||||
String hashValueOfImage = ImageHashFactory.calculate(element);
|
||||
ElementFeatures imageFeatures = ElementFeatureFactory.extractFeaturesWithHash(element, hashValueOfImage);
|
||||
@ -255,11 +271,11 @@ public class WatermarkRemovalService {
|
||||
|
||||
|
||||
private void processForms(Page page,
|
||||
Element element,
|
||||
ElementReader reader,
|
||||
ElementWriter writer,
|
||||
List<ElementFeatures> watermarksElementFeaturesList,
|
||||
Set<Long> visitedXObjIds) throws PDFNetException {
|
||||
Element element,
|
||||
ElementReader reader,
|
||||
ElementWriter writer,
|
||||
List<ElementFeatures> watermarksElementFeaturesList,
|
||||
Set<Long> visitedXObjIds) throws PDFNetException {
|
||||
|
||||
for (ElementFeatures elementFeatures : watermarksElementFeaturesList) {
|
||||
if (elementFeatures.almostMatches(element)) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user