memory optimisations

This commit is contained in:
Timo Bejan 2024-03-14 18:50:07 +02:00
parent e926083881
commit 1b4ab8dc88
6 changed files with 283 additions and 199 deletions

View File

@ -1,7 +1,5 @@
package com.iqser.red.pdftronlogic.commons;
import static com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService.TOLERANCE;
import java.awt.geom.Area;
@ -35,6 +33,13 @@ public class ClippingPathStack {
}
/**
 * Narrows the current clipping path to its intersection with the given rectangle.
 *
 * @param path rectangle to intersect the current clipping path with
 */
@SneakyThrows
public void intersectClippingPath(Rectangle2D path) {
// Fetch the current area first, then build the rectangle's Area, keeping the original evaluation order.
var currentClip = getCurrentClippingPath();
currentClip.intersect(new Area(path));
}
public boolean almostIntersects(double x, double y, double width, double height) {
// To address inconsistencies in the calculation of the bounding box we slightly increase the rectangle
// Height or width are zero for straight lines, even though they are being rendered. Therefore, height or width must be at minimum >0.
@ -56,15 +61,16 @@ public class ClippingPathStack {
/**
 * Pushes a copy of the clipping area that is currently on top of the stack.
 */
public void enterNewGState() {
Area current = stack.peek();
// NOTE(review): this span comes from a diff view; the next two lines look like the removed
// variant (copy by adding the current area into an empty Area) ...
Area cloned = new Area();
cloned.add(current);
// ... and this line looks like its replacement (copy via Area.clone()).
Area cloned = (Area) current.clone();
stack.push(cloned);
}
/**
 * Removes the top clipping area from the stack; the added variant also resets the removed area.
 */
public void leaveGState() {
// NOTE(review): this span comes from a diff view; the plain pop below looks like the removed variant.
stack.pop();
// somehow this greatly helps memory management
// NOTE(review): Area.reset() removes all geometry from the popped area; presumably this lets the
// path data be reclaimed earlier — no measurement is given here, confirm with a profiler.
var popped = stack.pop();
popped.reset();
}
}
}

View File

@ -4,6 +4,7 @@ import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.Element;
public class ElementFeatureFactory {
public static ElementFeatures extractFeatures(Element element) throws PDFNetException {
return switch (element.getType()) {
@ -16,7 +17,9 @@ public class ElementFeatureFactory {
};
}
public static ElementFeatures extractFeaturesWithHash(Element element, String hashObject) throws PDFNetException {
return buildImage(element)
.hashOfImage(hashObject)
.build();
@ -25,56 +28,65 @@ public class ElementFeatureFactory {
/**
 * Builds the Form features of an element: element type, bounding box, XObject type and, when the
 * XObject type equals 7, the size of its decoded stream (otherwise 0).
 *
 * @param element element to read the features from
 * @return the built Form features
 * @throws PDFNetException if one of the element accessors fails
 */
private static ElementFeatures.Form buildForm(Element element) throws PDFNetException {
// NOTE(review): this span comes from a diff view; the first return statement looks like the
// removed variant, which never closes the object returned by getBBox().
return ElementFeatures.Form.builder()
.elementType(element.getType())
.boundingBox(Converter.toRectangle2D(element.getBBox()))
.xObjectType(element.getXObject().getType())
.dictOrArrayOrStreamLength(element.getXObject().getType() == 7 ? element.getXObject().getDecodedStream().size() : 0)
.build();
// Replacement variant: the bounding box is closed once the features are built.
// NOTE(review): 7 is presumably the "stream" object type constant — confirm against the PDF library.
try (var bbox = element.getBBox();) {
return ElementFeatures.Form.builder()
.elementType(element.getType())
.boundingBox(Converter.toRectangle2D(bbox))
.xObjectType(element.getXObject().getType())
.dictOrArrayOrStreamLength(element.getXObject().getType() == 7 ? element.getXObject().getDecodedStream().size() : 0)
.build();
}
}
/**
 * Prepares an Image features builder filled with the element's type, bounding box, data size,
 * height, width, rendering intent, component count and bits per component.
 * The builder is returned unbuilt so that a caller can still set further fields before build().
 *
 * @param element image element to read the features from
 * @return the pre-filled, not yet built, image builder
 * @throws PDFNetException if one of the element accessors fails
 */
private static ElementFeatures.Image.ImageBuilder<?, ?> buildImage(Element element) throws PDFNetException {
// NOTE(review): this span comes from a diff view; the first return statement looks like the
// removed variant, which never closes the object returned by getBBox().
return ElementFeatures.Image.builder()
.elementType(element.getType())
.boundingBox(Converter.toRectangle2D(element.getBBox()))
.dataSize(element.getImageDataSize())
.height(element.getImageHeight())
.width(element.getImageWidth())
.renderingIntent(element.getImageRenderingIntent())
.componentNum(element.getComponentNum())
.bitsPerComponent(element.getBitsPerComponent());
// Replacement variant: the bounding box is converted inside the try block and closed on exit.
try (var bbox = element.getBBox();) {
return ElementFeatures.Image.builder()
.elementType(element.getType())
.boundingBox(Converter.toRectangle2D(bbox))
.dataSize(element.getImageDataSize())
.height(element.getImageHeight())
.width(element.getImageWidth())
.renderingIntent(element.getImageRenderingIntent())
.componentNum(element.getComponentNum())
.bitsPerComponent(element.getBitsPerComponent());
}
}
/**
 * Builds the Text features of an element: element type, bounding box, text string, font type
 * and font size (the last two are read from the element's graphics state).
 *
 * @param element text element to read the features from
 * @return the built Text features
 * @throws PDFNetException if one of the element accessors fails
 */
private static ElementFeatures.Text buildText(Element element) throws PDFNetException {
// NOTE(review): this span comes from a diff view; the first return statement looks like the
// removed variant, which never closes the object returned by getBBox().
return ElementFeatures.Text.builder()
.elementType(element.getType())
.boundingBox(Converter.toRectangle2D(element.getBBox()))
.text(element.getTextString())
.font(element.getGState().getFont().getType())
.fontsize(element.getGState().getFontSize())
.build();
// Replacement variant: the bounding box is closed once the features are built.
try (var bbox = element.getBBox();) {
return ElementFeatures.Text.builder()
.elementType(element.getType())
.boundingBox(Converter.toRectangle2D(bbox))
.text(element.getTextString())
.font(element.getGState().getFont().getType())
.fontsize(element.getGState().getFontSize())
.build();
}
}
/**
 * Builds the Path features of an element: element type, bounding box, the clipping/stroke/fill
 * flags, the converted fill and stroke colors, and the path geometry converted with the
 * element's CTM.
 *
 * @param element path element to read the features from
 * @return the built Path features
 * @throws PDFNetException if one of the element accessors fails
 */
private static ElementFeatures.Path buildPath(Element element) throws PDFNetException {
// NOTE(review): this span comes from a diff view; the first return statement looks like the
// removed variant, which closes none of the objects it obtains from the element.
return ElementFeatures.Path.builder()
.elementType(element.getType())
.boundingBox(Converter.toRectangle2D(element.getBBox()))
.isClippingPath(element.isClippingPath())
.isClipWindingFill(element.isClipWindingFill())
.isStroked(element.isStroked())
.isFilled(element.isFilled())
.isWindingFill(element.isWindingFill())
.fillColor(Converter.convertColor(element.getGState().getFillColorSpace(), element.getGState().getFillColor()))
.strokeColor(Converter.convertColor(element.getGState().getStrokeColorSpace(), element.getGState().getStrokeColor()))
.linePath(Converter.convertToGeneralPathAndTransformToInitialUserSpace(element.getPathData(), element.getCTM()))
.build();
// Replacement variant: bounding box, CTM and both color points are declared as resources, so
// all four are closed (in reverse declaration order) after the features are built.
try (var bbox = element.getBBox(); var ctm = element.getCTM();
var fillColor = element.getGState().getFillColor();
var strokeColor = element.getGState().getStrokeColor()) {
return ElementFeatures.Path.builder()
.elementType(element.getType())
.boundingBox(Converter.toRectangle2D(bbox))
.isClippingPath(element.isClippingPath())
.isClipWindingFill(element.isClipWindingFill())
.isStroked(element.isStroked())
.isFilled(element.isFilled())
.isWindingFill(element.isWindingFill())
.fillColor(Converter.convertColor(element.getGState().getFillColorSpace(), fillColor))
.strokeColor(Converter.convertColor(element.getGState().getStrokeColorSpace(), strokeColor))
.linePath(Converter.convertToGeneralPathAndTransformToInitialUserSpace(element.getPathData(), ctm))
.build();
}
}
}

View File

@ -31,9 +31,11 @@ public class ElementFeatures {
/**
 * Checks whether an element has this feature's element type and a bounding box that
 * rectsAlmostMatch accepts.
 *
 * @param element element to compare against
 * @return true when the type is equal, a bounding box exists and rectsAlmostMatch returns true
 * @throws PDFNetException if one of the element accessors fails
 */
public boolean almostMatches(Element element) throws PDFNetException {
// NOTE(review): this span comes from a diff view; the first return statement looks like the
// removed variant, which calls getBBox() twice and closes neither result.
return element.getType() == elementType && //
element.getBBox() != null && //
rectsAlmostMatch(element.getBBox());
// Replacement variant: a single getBBox() call. A null resource is legal in
// try-with-resources (close is simply skipped), so the explicit null check still works.
try (var bbox = element.getBBox()) {
return element.getType() == elementType && //
bbox != null && //
rectsAlmostMatch(bbox);
}
}
@ -42,9 +44,9 @@ public class ElementFeatures {
// To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance
return almostEqual(bBox.getX1(), boundingBox.getX()) && //
almostEqual(bBox.getY1(), boundingBox.getY()) && //
almostEqual(bBox.getWidth(), boundingBox.getWidth()) && //
almostEqual(bBox.getHeight(), boundingBox.getHeight());
almostEqual(bBox.getY1(), boundingBox.getY()) && //
almostEqual(bBox.getWidth(), boundingBox.getWidth()) && //
almostEqual(bBox.getHeight(), boundingBox.getHeight());
}
@ -65,9 +67,9 @@ public class ElementFeatures {
// To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance
return almostEqual(bBox.getX(), boundingBox.getX()) && //
almostEqual(bBox.getY(), boundingBox.getY()) && //
almostEqual(bBox.getWidth(), boundingBox.getWidth()) && //
almostEqual(bBox.getHeight(), boundingBox.getHeight());
almostEqual(bBox.getY(), boundingBox.getY()) && //
almostEqual(bBox.getWidth(), boundingBox.getWidth()) && //
almostEqual(bBox.getHeight(), boundingBox.getHeight());
}
@ -81,9 +83,9 @@ public class ElementFeatures {
// To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance
return isPositionSimilar(rectangle2D.getX(), boundingBox.getX(), rectangle2D.getWidth()) && //
isPositionSimilar(rectangle2D.getY(), boundingBox.getY(), rectangle2D.getHeight()) && //
isSizeSimilar(rectangle2D.getWidth(), boundingBox.getWidth()) && //
isSizeSimilar(rectangle2D.getHeight(), boundingBox.getHeight());
isPositionSimilar(rectangle2D.getY(), boundingBox.getY(), rectangle2D.getHeight()) && //
isSizeSimilar(rectangle2D.getWidth(), boundingBox.getWidth()) && //
isSizeSimilar(rectangle2D.getHeight(), boundingBox.getHeight());
}
@ -115,9 +117,9 @@ public class ElementFeatures {
/**
 * Checks the base-class match and additionally requires an equal text string, an equal font
 * type and a font size accepted by almostEqual.
 * NOTE(review): this span comes from a diff view; the three condition lines appear twice,
 * presumably as the removed and added side of a pure re-indentation.
 *
 * @param element text element to compare against
 * @return true when all of the conditions above hold
 * @throws PDFNetException if one of the element accessors fails
 */
public boolean almostMatches(Element element) throws PDFNetException {
return super.almostMatches(element) && //
text.equals(element.getTextString()) && //
font == element.getGState().getFont().getType() && //
almostEqual(fontsize, element.getGState().getFontSize());
text.equals(element.getTextString()) && //
font == element.getGState().getFont().getType() && //
almostEqual(fontsize, element.getGState().getFontSize());
}
}
@ -142,11 +144,11 @@ public class ElementFeatures {
/**
 * Checks the base-class match and additionally requires that the clipping-path, clip-winding-fill,
 * stroked, filled and winding-fill flags of the element equal the stored ones.
 * NOTE(review): this span comes from a diff view; the five condition lines appear twice,
 * presumably as the removed and added side of a pure re-indentation.
 *
 * @param element path element to compare against
 * @return true when the base match succeeds and all five flags are equal
 * @throws PDFNetException if one of the element accessors fails
 */
public boolean almostMatches(Element element) throws PDFNetException {
return super.almostMatches(element) && //
isClippingPath == element.isClippingPath() && //
isClipWindingFill == element.isClipWindingFill() && //
isStroked == element.isStroked() && //
isFilled == element.isFilled() && //
isWindingFill == element.isWindingFill();
isClippingPath == element.isClippingPath() && //
isClipWindingFill == element.isClipWindingFill() && //
isStroked == element.isStroked() && //
isFilled == element.isFilled() && //
isWindingFill == element.isWindingFill();
}
@ -161,7 +163,7 @@ public class ElementFeatures {
/**
 * Tells whether this path is filled and its bounding box intersects the given rectangle.
 * NOTE(review): this span comes from a diff view; the intersects(...) line appears twice,
 * presumably as the removed and added side of a pure re-indentation.
 *
 * @param area rectangle to test against (its X1/Y1 corner, width and height are used)
 * @return true when the path is filled and the bounding box intersects the rectangle
 */
public boolean isBackground(Rect area) {
return isFilled && //
getBoundingBox().intersects(area.getX1(), area.getY1(), area.getWidth(), area.getHeight());
getBoundingBox().intersects(area.getX1(), area.getY1(), area.getWidth(), area.getHeight());
}
}
@ -185,12 +187,12 @@ public class ElementFeatures {
/**
 * Checks the base-class match and additionally requires equal image data size, height, width,
 * rendering intent, component count and bits per component.
 * NOTE(review): this span comes from a diff view; the six condition lines appear twice,
 * presumably as the removed and added side of a pure re-indentation.
 *
 * @param element image element to compare against
 * @return true when the base match succeeds and all six values are equal
 * @throws PDFNetException if one of the element accessors fails
 */
public boolean almostMatches(Element element) throws PDFNetException {
return super.almostMatches(element) && //
dataSize == element.getImageDataSize() && //
height == element.getImageHeight() && //
width == element.getImageWidth() && //
renderingIntent == element.getImageRenderingIntent() && //
componentNum == element.getComponentNum() && //
bitsPerComponent == element.getBitsPerComponent();
dataSize == element.getImageDataSize() && //
height == element.getImageHeight() && //
width == element.getImageWidth() && //
renderingIntent == element.getImageRenderingIntent() && //
componentNum == element.getComponentNum() && //
bitsPerComponent == element.getBitsPerComponent();
}
@ -199,15 +201,18 @@ public class ElementFeatures {
if (elementFeatures.getClass() != this.getClass()) {
return false;
}
return super.almostMatches(elementFeatures) && this.dataSize == ((Image) elementFeatures).getDataSize() && this.height == ((Image) elementFeatures).getHeight() && this.width == ((Image) elementFeatures).getWidth() && this.renderingIntent == ((Image) elementFeatures).getRenderingIntent() && this.componentNum == ((Image) elementFeatures).getComponentNum() && this.bitsPerComponent == ((Image) elementFeatures).getBitsPerComponent() && calculateHammingDistance(
((Image) elementFeatures).getHashOfImage()) <= HAMMING_DISTANCE_THRESHOLD;
return super.almostMatches(elementFeatures) && this.dataSize == ((Image) elementFeatures).getDataSize() && this.height == ((Image) elementFeatures).getHeight() &&
this.width == ((Image) elementFeatures).getWidth() && this.renderingIntent == ((Image) elementFeatures).getRenderingIntent() &&
this.componentNum == ((Image) elementFeatures).getComponentNum() && this.bitsPerComponent == ((Image) elementFeatures).getBitsPerComponent() &&
calculateHammingDistance(
((Image) elementFeatures).getHashOfImage()) <= HAMMING_DISTANCE_THRESHOLD;
}
/**
 * Requires the base-class similarity and an image-hash Hamming distance of at most
 * HAMMING_DISTANCE_THRESHOLD to the other feature's image hash.
 * NOTE(review): this span comes from a diff view; the Hamming-distance line appears twice,
 * presumably as the removed and added side of a pure re-indentation.
 * NOTE(review): the cast to Image is unchecked here; presumably super.isSimilarTo already
 * rejects non-Image features before the cast is evaluated — confirm.
 *
 * @param elementFeatures features to compare against
 * @return true when both conditions hold
 */
public boolean isSimilarTo(ElementFeatures elementFeatures) {
return super.isSimilarTo(elementFeatures) && //
calculateHammingDistance(((Image) elementFeatures).getHashOfImage()) <= HAMMING_DISTANCE_THRESHOLD;
calculateHammingDistance(((Image) elementFeatures).getHashOfImage()) <= HAMMING_DISTANCE_THRESHOLD;
}
@ -241,10 +246,12 @@ public class ElementFeatures {
/**
 * Matches a form element when the element type is equal, a bounding box exists, the bounding box
 * either passes rectsAlmostMatch or passes almostRotateMatches, the XObject type is equal and the
 * decoded stream size equals the stored length.
 * NOTE(review): the decoded stream size is read unconditionally here, whereas the factory stores 0
 * when the XObject type is not 7 — confirm both sides agree for non-stream XObjects.
 *
 * @param element form element to compare against
 * @return true when all of the conditions above hold
 * @throws PDFNetException if one of the element accessors fails
 */
@Override
public boolean almostMatches(Element element) throws PDFNetException {
// NOTE(review): this span comes from a diff view; the first return statement looks like the
// removed variant, which calls getBBox() three times and closes none of the results.
return element.getType() == getElementType() && //
element.getBBox() != null && //
(super.rectsAlmostMatch(element.getBBox()) || almostRotateMatches(element.getBBox().getRectangle())) && xObjectType == element.getXObject()
.getType() && dictOrArrayOrStreamLength == element.getXObject().getDecodedStream().size();
// Replacement variant: one getBBox() call whose result is closed when the block exits.
try (var bbox = element.getBBox()) {
return element.getType() == getElementType() && //
bbox != null && //
(super.rectsAlmostMatch(bbox) || almostRotateMatches(bbox.getRectangle())) && xObjectType == element.getXObject()
.getType() && dictOrArrayOrStreamLength == element.getXObject().getDecodedStream().size();
}
}
@ -253,9 +260,11 @@ public class ElementFeatures {
if (elementFeatures.getClass() != this.getClass()) {
return false;
}
return elementFeatures.getElementType() == getElementType() && elementFeatures.getBoundingBox() != null && (super.rectsAlmostMatch(elementFeatures.getBoundingBox()) || almostRotateMatches(
elementFeatures.getBoundingBox()
.getBounds2D())) && xObjectType == ((Form) elementFeatures).getXObjectType() && dictOrArrayOrStreamLength == ((Form) elementFeatures).getDictOrArrayOrStreamLength();
return elementFeatures.getElementType() == getElementType() && elementFeatures.getBoundingBox() != null &&
(super.rectsAlmostMatch(elementFeatures.getBoundingBox()) || almostRotateMatches(
elementFeatures.getBoundingBox()
.getBounds2D())) && xObjectType == ((Form) elementFeatures).getXObjectType() &&
dictOrArrayOrStreamLength == ((Form) elementFeatures).getDictOrArrayOrStreamLength();
}
@ -263,7 +272,7 @@ public class ElementFeatures {
/**
 * Compares sizes with the axes swapped: the given width against this bounding box's height and
 * the given height against this bounding box's width, each via almostEqual.
 * NOTE(review): this span comes from a diff view; the second condition line appears twice,
 * presumably as the removed and added side of a pure re-indentation.
 *
 * @param bBox rectangle to compare against this feature's bounding box
 * @return true when both swapped comparisons succeed
 */
private boolean almostRotateMatches(Rectangle2D bBox) {
return almostEqual(bBox.getWidth(), getBoundingBox().getHeight()) && //
almostEqual(bBox.getHeight(), getBoundingBox().getWidth());
almostEqual(bBox.getHeight(), getBoundingBox().getWidth());
}
}

View File

@ -63,15 +63,12 @@ public class InvisibleElementRemovalService {
PDFDoc pdfDoc = new PDFDoc(pdfFile);
execute(pdfDoc, delta, removePaths, markedContentToIgnore);
try {
try (pdfDoc) {
execute(pdfDoc, delta, removePaths, markedContentToIgnore);
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
} catch (Exception e) {
log.error("File could not be saved after invisible element removal");
throw new RuntimeException(e);
} finally {
pdfDoc.close();
}
}
@ -151,8 +148,8 @@ public class InvisibleElementRemovalService {
private void execute(PDFDoc pdfDoc, boolean delta, boolean removePaths, Set<String> markedContentToIgnore) {
log.info("Start removing invisible Elements");
try(ElementWriter writer = new ElementWriter();
ElementReader reader = new ElementReader()) {
try (ElementWriter writer = new ElementWriter();
ElementReader reader = new ElementReader()) {
Set<Long> visitedXObjIds = new TreeSet<>();
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
@ -236,7 +233,7 @@ public class InvisibleElementRemovalService {
private void processImages(Element imageElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
try(Rect rect = imageElement.getBBox()) {
try (Rect rect = imageElement.getBBox()) {
if (rect == null) {
return;
@ -257,7 +254,7 @@ public class InvisibleElementRemovalService {
private void processText(Element textElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
try(Rect textBBox = textElement.getBBox()) {
try (Rect textBBox = textElement.getBBox()) {
if (textBBox == null) {
writer.writeElement(textElement);
@ -289,13 +286,17 @@ public class InvisibleElementRemovalService {
if (!inClippingPath) {
gState.setFillColorSpace(ColorSpace.createDeviceRGB());
// red for elements removed by clipping path
gState.setFillColor(new ColorPt(1, 0, 0));
try (var color = new ColorPt(1, 0, 0)) {
gState.setFillColor(color);
}
writer.writeElement(textElement);
}
if (!isTextVisible) {
gState.setFillColorSpace(ColorSpace.createDeviceRGB());
// blue for elements removed due to transparency or not rendered or same color as background
gState.setFillColor(new ColorPt(0, 0, 1));
try (var color = new ColorPt(0, 0, 1)) {
gState.setFillColor(color);
}
gState.setTextRenderMode(GState.e_fill_text);
gState.setFillOpacity(1);
writer.writeElement(textElement);
@ -314,19 +315,21 @@ public class InvisibleElementRemovalService {
context.visitedXObjIds().add(formObj.getObjNum());
// writer needs to be newly initialized when entering a new content stream
// see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
try(ElementWriter formWriter = new ElementWriter()) {
try (ElementWriter formWriter = new ElementWriter()) {
context.clippingPathStack().enterNewGState();
context.clippingPathStack().intersectClippingPath(new GeneralPath(Converter.toRectangle2D(formElement.getBBox())));
context.reader().formBegin();
formWriter.begin(formObj);
try (var formElementBBOX = formElement.getBBox()) {
context.clippingPathStack().intersectClippingPath(Converter.toRectangle2D(formElementBBOX));
context.reader().formBegin();
formWriter.begin(formObj);
context.reader().clearChangeList();
formWriter.setDefaultGState(context.reader());
context.reader().clearChangeList();
formWriter.setDefaultGState(context.reader());
processElements(formWriter, context);
formWriter.end();
context.reader().end();
context.clippingPathStack().leaveGState();
processElements(formWriter, context);
formWriter.end();
context.reader().end();
context.clippingPathStack().leaveGState();
}
}
}
}
@ -336,52 +339,58 @@ public class InvisibleElementRemovalService {
PathData pathData = pathElement.getPathData();
if (pathData.getOperators().length == 0 && pathData.getPoints().length == 0 || pathElement.getBBox() == null) {
writer.writeElement(pathElement);
return;
}
GeneralPath linePath = Converter.convertToGeneralPathAndTransformToInitialUserSpace(pathData, pathElement.getCTM());
var rect = linePath.getBounds2D();
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX(), rect.getY(), rect.getWidth(), rect.getHeight());
if (pathElement.isClippingPath()) {
if (pathElement.isClipWindingFill()) {
linePath.setWindingRule(GeneralPath.WIND_NON_ZERO);
} else {
linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD);
try (var bbox = pathElement.getBBox()) {
if (pathData.getOperators().length == 0 && pathData.getPoints().length == 0 || bbox == null) {
writer.writeElement(pathElement);
return;
}
context.clippingPathStack().intersectClippingPath(linePath);
pathElement.setPathClip(!context.delta());
writer.writeElement(pathElement);
try (var ctm = pathElement.getCTM()) {
GeneralPath linePath = Converter.convertToGeneralPathAndTransformToInitialUserSpace(pathData, ctm);
} else {
if (pathElement.isWindingFill()) {
linePath.setWindingRule(GeneralPath.WIND_NON_ZERO);
} else {
linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD);
}
var rect = linePath.getBounds2D();
if (inClippingPath) {
if (isFilledAndNonTransparent(pathElement)) {
calculateOverlapsForLinePath(context, linePath);
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX(), rect.getY(), rect.getWidth(), rect.getHeight());
if (pathElement.isClippingPath()) {
if (pathElement.isClipWindingFill()) {
linePath.setWindingRule(GeneralPath.WIND_NON_ZERO);
} else {
linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD);
}
context.clippingPathStack().intersectClippingPath(linePath);
pathElement.setPathClip(!context.delta());
writer.writeElement(pathElement);
} else {
if (pathElement.isWindingFill()) {
linePath.setWindingRule(GeneralPath.WIND_NON_ZERO);
} else {
linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD);
}
if (inClippingPath) {
if (isFilledAndNonTransparent(pathElement)) {
calculateOverlapsForLinePath(context, linePath);
}
context.visibleElements().add(ElementFeatureFactory.extractFeatures(pathElement));
}
if (!context.delta() && (inClippingPath || !context.removePaths())) {
writer.writeElement(pathElement);
}
if (context.delta() && !inClippingPath && context.removePaths()) {
try (var color = new ColorPt(1, 0, 0)) {
pathElement.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
pathElement.getGState().setFillColor(color);
pathElement.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
pathElement.getGState().setStrokeColor(color);
writer.writeElement(pathElement);
}
}
}
context.visibleElements().add(ElementFeatureFactory.extractFeatures(pathElement));
}
if (!context.delta() && (inClippingPath || !context.removePaths())) {
writer.writeElement(pathElement);
}
if (context.delta() && !inClippingPath && context.removePaths()) {
pathElement.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
pathElement.getGState().setFillColor(new ColorPt(1, 0, 0));
pathElement.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
pathElement.getGState().setStrokeColor(new ColorPt(1, 0, 0));
writer.writeElement(pathElement);
}
}
}
@ -480,7 +489,7 @@ public class InvisibleElementRemovalService {
context.visitedXObjIds().add(formObj.getObjNum());
// writer needs to be newly initialized when entering a new content stream
// see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
try(ElementWriter formWriter = new ElementWriter()) {
try (ElementWriter formWriter = new ElementWriter()) {
context.reader().formBegin();
formWriter.begin(formObj);
@ -516,7 +525,9 @@ public class InvisibleElementRemovalService {
/**
 * Tells whether a fill is visible: the fill opacity must be non-zero and
 * differentColorThanBackgroundColor must accept the converted fill color for the given box.
 *
 * @param gState   graphics state providing fill opacity, fill color space and fill color
 * @param textBBox bounding box passed on to the background comparison
 * @param context  removal context passed on to the background comparison
 * @return true when both conditions hold
 * @throws PDFNetException if one of the graphics-state accessors fails
 */
private boolean fillIsVisible(GState gState, Rect textBBox, InvisibleElementRemovalContext context) throws PDFNetException {
// NOTE(review): this span comes from a diff view; the first return statement looks like the
// removed variant, which never closes the object returned by getFillColor().
return gState.getFillOpacity() != 0 && differentColorThanBackgroundColor(Converter.convertColor(gState.getFillColorSpace(), gState.getFillColor()), textBBox, context);
// Replacement variant: the fill color is fetched up front (even when the opacity is 0) and
// closed when the block exits.
try (var color = gState.getFillColor()) {
return gState.getFillOpacity() != 0 && differentColorThanBackgroundColor(Converter.convertColor(gState.getFillColorSpace(), color), textBBox, context);
}
}
@ -541,13 +552,16 @@ public class InvisibleElementRemovalService {
/**
 * Collects, from the context's visible elements, the path features whose fill color is not
 * Color.WHITE and for which isBackground(textBBox) returns true.
 *
 * @param textBBox bounding box handed to isBackground
 * @param context  removal context providing the visible elements
 * @return the matching path features, in iteration order of the visible elements
 */
private static List<ElementFeatures.Path> findVisiblePathElementsThatIntersect(Rect textBBox, InvisibleElementRemovalContext context) {
// NOTE(review): this span comes from a diff view; the stream pipeline below looks like the
// removed variant. Stream.toList() yields an unmodifiable list.
return context.visibleElements()
.stream()
.filter(elementFeatures -> elementFeatures.getElementType() == Element.e_path)
.map(elementFeatures -> (ElementFeatures.Path) elementFeatures)
.filter(elementFeatures -> !elementFeatures.getFillColor().equals(Color.WHITE))
.filter(element -> element.isBackground(textBBox))
.toList();
// Replacement variant: the same three checks in one loop, collected into a mutable ArrayList.
// The type check comes first so the casts are only evaluated for path elements.
var result = new ArrayList<ElementFeatures.Path>();
for (var element : context.visibleElements()) {
if (element.getElementType() == Element.e_path
&& !((ElementFeatures.Path) element).getFillColor().equals(Color.WHITE)
&& ((ElementFeatures.Path) element).isBackground(textBBox)) {
result.add((ElementFeatures.Path) element);
}
}
return result;
}
@ -581,10 +595,10 @@ public class InvisibleElementRemovalService {
@SneakyThrows
private void drawBBox(ElementWriter writer, Rectangle2D r, String hexcolor) {
try(ColorPt colorPt = new ColorPt(Integer.valueOf(hexcolor.substring(1, 3), 16) / 255d,
try (ColorPt colorPt = new ColorPt(Integer.valueOf(hexcolor.substring(1, 3), 16) / 255d,
Integer.valueOf(hexcolor.substring(3, 5), 16) / 255d,
Integer.valueOf(hexcolor.substring(5, 7), 16) / 255d);
ElementBuilder eb = new ElementBuilder()) {
ElementBuilder eb = new ElementBuilder()) {
Element rect = eb.createRect(r.getX(), r.getY(), r.getWidth(), r.getHeight());
rect.setPathStroke(true);
rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
@ -608,4 +622,4 @@ public class InvisibleElementRemovalService {
}
}
}

View File

@ -93,7 +93,7 @@ public class WatermarkRemovalService {
Map<Long, List<ElementFeatures>> formObjectsAndImagesForPages = new HashMap<>();
Set<Long> visitedXObjIds = new TreeSet<>();
try(ElementReader reader = new ElementReader()) {
try (ElementReader reader = new ElementReader()) {
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
@ -123,14 +123,16 @@ public class WatermarkRemovalService {
double minAreaCoveringPage,
Page page) throws PDFNetException {
if (element.getBBox() == null) {
return;
}
try (var bbox = element.getBBox()) {
if (bbox == null) {
return;
}
switch (element.getType()) {
case Element.e_form -> processXObject(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage, page);
case Element.e_image, Element.e_inline_image -> processImages(element, elementFeaturesLinkedList, page, minAreaCoveringPage);
case Element.e_text -> processText(element, elementFeaturesLinkedList, page);
switch (element.getType()) {
case Element.e_form -> processXObject(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage, page);
case Element.e_image, Element.e_inline_image -> processImages(element, elementFeaturesLinkedList, page, minAreaCoveringPage);
case Element.e_text -> processText(element, elementFeaturesLinkedList, page);
}
}
}
@ -147,11 +149,13 @@ public class WatermarkRemovalService {
return;
}
boolean isBigEnough = Math.abs(element.getBBox().getY1() - element.getBBox().getY2()) > page.getPageHeight() * MIN_TEXTWATERMARK_HEIGHT_THRESHOLD;
try (var bbox = element.getBBox()) {
boolean isBigEnough = Math.abs(bbox.getY1() - bbox.getY2()) > page.getPageHeight() * MIN_TEXTWATERMARK_HEIGHT_THRESHOLD;
if (isBigEnough) {
ElementFeatures elementFeatures = ElementFeatureFactory.extractFeatures(element);
elementFeaturesLinkedList.add(elementFeatures);
if (isBigEnough) {
ElementFeatures elementFeatures = ElementFeatureFactory.extractFeatures(element);
elementFeaturesLinkedList.add(elementFeatures);
}
}
}
@ -160,8 +164,10 @@ public class WatermarkRemovalService {
/**
 * Evaluates the absolute value of the B component of the element's CTM: returns true when it is
 * below sin(ROTATED_TEXT_THRESHOLD degrees) or above sin((70 - ROTATED_TEXT_THRESHOLD) degrees).
 * NOTE(review): the first clause is true when |B| is close to zero, which reads like "not
 * rotated", and the literal 70 may have been meant as 90 — confirm against the method name and callers.
 *
 * @param element element whose CTM is inspected
 * @return true when |B| lies outside the band described above
 */
@SneakyThrows
private boolean isTextRotated(Element element) {
// NOTE(review): this span comes from a diff view; the first return statement looks like the
// removed variant, which calls getCTM() twice and closes neither result.
return Math.abs(element.getCTM().getB()) < Math.sin(Math.toRadians(ROTATED_TEXT_THRESHOLD)) || Math.abs(element.getCTM()
.getB()) > Math.sin(Math.toRadians(70 - ROTATED_TEXT_THRESHOLD));
// Replacement variant: one getCTM() call whose result is closed when the block exits.
try (var ctm = element.getCTM()) {
return Math.abs(ctm.getB()) < Math.sin(Math.toRadians(ROTATED_TEXT_THRESHOLD)) || Math.abs(ctm
.getB()) > Math.sin(Math.toRadians(70 - ROTATED_TEXT_THRESHOLD));
}
}
@ -185,13 +191,15 @@ public class WatermarkRemovalService {
if (element.getXObject() == null) {
return;
}
if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringPage && isLocatedNearBorder(element, page)) {
return;
}
try (var bbox = element.getBBox()) {
if (bbox.getHeight() * bbox.getWidth() < minAreaCoveringPage && isLocatedNearBorder(element, page)) {
return;
}
String hashOfImage = ImageHashFactory.calculate(element);
ElementFeatures elementFeatures = ElementFeatureFactory.extractFeaturesWithHash(element, hashOfImage);
elementFeaturesLinkedList.add(elementFeatures);
String hashOfImage = ImageHashFactory.calculate(element);
ElementFeatures elementFeatures = ElementFeatureFactory.extractFeaturesWithHash(element, hashOfImage);
elementFeaturesLinkedList.add(elementFeatures);
}
}
@ -199,10 +207,18 @@ public class WatermarkRemovalService {
/**
 * Tells whether any edge of the element's bounding box reaches into a margin band of the page's
 * visible content box. The vertical band is pageHeight * IMAGE_POSITION_HEIGHT_THRESHOLD, the
 * horizontal band is pageWidth * IMAGE_POSITION_WIDTH_THRESHOLD.
 *
 * @param element element whose bounding box is tested
 * @param page    page providing the visible content box, height and width
 * @return true when Y1 or X1 is below the lower band limit, or Y2 or X2 is above the upper band limit
 */
@SneakyThrows
private boolean isLocatedNearBorder(Element element, Page page) {
// NOTE(review): this span comes from a diff view; the first return statement looks like the
// removed variant, which calls getBBox() and getVisibleContentBox() up to four times each and
// closes none of the results.
return element.getBBox().getY1() < page.getVisibleContentBox().getY1() + page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD || element.getBBox()
.getY2() > page.getVisibleContentBox().getY2() - page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD || element.getBBox().getX1() < page.getVisibleContentBox()
.getX1() + page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD || element.getBBox().getX2() > page.getVisibleContentBox()
.getX2() - page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD;
// Replacement variant: both boxes are fetched once and closed when the block exits.
try (var bbox = element.getBBox(); var contentBox = page.getVisibleContentBox();) {
return bbox.getY1() < contentBox.getY1() + page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD || bbox
.getY2() >
contentBox.getY2() -
page.getPageHeight() *
IMAGE_POSITION_HEIGHT_THRESHOLD ||
bbox.getX1() < contentBox
.getX1() + page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD || bbox.getX2() > contentBox
.getX2() -
page.getPageWidth() *
IMAGE_POSITION_WIDTH_THRESHOLD;
}
}
@ -214,12 +230,14 @@ public class WatermarkRemovalService {
double minAreaCoveringPage,
Page page) {
if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringPage) {
return;
try (var bbox = element.getBBox()) {
if (bbox.getHeight() * bbox.getWidth() < minAreaCoveringPage) {
return;
}
}
if (visitedXObjIds.add(element.getXObject().getObjNum())) {
try(ElementReader xObjectReader = new ElementReader()) {
try (ElementReader xObjectReader = new ElementReader()) {
xObjectReader.begin(element.getXObject());
for (Element element1 = xObjectReader.next(); element1 != null; element1 = xObjectReader.next()) {
processElement(element1, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage, page);
@ -244,10 +262,12 @@ public class WatermarkRemovalService {
.stream()
.flatMap(Collection::stream)
.filter(elementFeature -> formObjectsPerPage.values()
.stream()
.filter(elementFeaturesOnPage -> elementFeaturesOnPage.stream()
.anyMatch(elementFeature.getElementType() == Element.e_image || elementFeature.getElementType() == Element.e_inline_image ? elementFeature::isSimilarTo : elementFeature::almostMatches))
.count() >= minPagesFilter)
.stream()
.filter(elementFeaturesOnPage -> elementFeaturesOnPage.stream()
.anyMatch(
elementFeature.getElementType() == Element.e_image || elementFeature.getElementType() == Element.e_inline_image ?
elementFeature::isSimilarTo : elementFeature::almostMatches))
.count() >= minPagesFilter)
.toList();
}
@ -255,8 +275,8 @@ public class WatermarkRemovalService {
@SneakyThrows
private void removeAllWatermarks(PDFDoc pdfDoc, List<ElementFeatures> watermarksElementFeaturesList) {
try(ElementReader reader = new ElementReader();
ElementWriter writer = new ElementWriter()) {
try (ElementReader reader = new ElementReader();
ElementWriter writer = new ElementWriter()) {
Set<Long> visitedXObjIds = new TreeSet<>();
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
@ -295,16 +315,24 @@ public class WatermarkRemovalService {
switch (element.getType()) {
case Element.e_image, Element.e_inline_image -> {
if (element.getBBox() == null) {
writer.writeElement(element);
continue;
try (var bbox = element.getBBox()) {
if (bbox == null) {
writer.writeElement(element);
continue;
}
}
if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringFromPage && isLocatedNearBorder(element, page) && element.getBBox()
.getHeight() * element.getBBox().getWidth() < minAreaCoveringFromPage || element.getXObject() == null) {
try (var bbox = element.getBBox()) {
if (bbox.getHeight() * bbox.getWidth() < minAreaCoveringFromPage && isLocatedNearBorder(element, page) && bbox
.getHeight() *
bbox
.getWidth() <
minAreaCoveringFromPage ||
element.getXObject() == null) {
writer.writeElement(element);
continue;
writer.writeElement(element);
continue;
}
}
removeImages(element, writer, watermarksElementFeaturesList);
}
@ -344,8 +372,10 @@ public class WatermarkRemovalService {
return false;
}
if (Math.max(element.getBBox().getY1(), element.getBBox().getY2()) < page.getVisibleContentBox().getY1() + page.getPageHeight() * TEXT_POSITION_THRESHOLD) {
return false;
try (var bbox = element.getBBox(); var contents = page.getVisibleContentBox();) {
if (Math.max(bbox.getY1(), bbox.getY2()) < contents.getY1() + page.getPageHeight() * TEXT_POSITION_THRESHOLD) {
return false;
}
}
return true;
}
@ -385,7 +415,7 @@ public class WatermarkRemovalService {
visitedXObjIds.add(element.getXObject().getObjNum());
// writer needs to be newly initialized when entering a new content stream
// see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
try(ElementWriter formWriter = new ElementWriter()) {
try (ElementWriter formWriter = new ElementWriter()) {
reader.formBegin();
formWriter.begin(element.getXObject());

View File

@ -6,6 +6,7 @@ import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import lombok.extern.slf4j.Slf4j;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@ -14,6 +15,7 @@ import com.pdftron.pdf.PDFNet;
import lombok.SneakyThrows;
@SuppressWarnings("PMD")
@Slf4j
class InvisibleElementRemovalServiceTest {
InvisibleElementRemovalService invisibleElementRemovalService;
@ -101,8 +103,19 @@ class InvisibleElementRemovalServiceTest {
String result = PdfTextExtraction.extractAllTextFromDocument(in);
assertThat(result).contains("#1 Dark",
"#13 Yellow",
"Ut wisi enim ad minim veniam, quis nostrud exerci tation ullamcorper suscipit lobortis nisl ut aliquip\n" + "ex ea commodo consequat. Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie\n" + "consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim\n" + "qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi.");
assertThat(result).doesNotContain("Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut\n" + "labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et\n" + "ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem\n" + "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et\n" + "dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea\n" + "rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum\n" + "dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore\n" + "magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet\n" + "clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. ");
"Ut wisi enim ad minim veniam, quis nostrud exerci tation ullamcorper suscipit lobortis nisl ut aliquip\n" +
"ex ea commodo consequat. Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie\n" +
"consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim\n" +
"qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi.");
assertThat(result).doesNotContain("Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut\n" +
"labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et\n" +
"ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem\n" +
"ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et\n" +
"dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea\n" +
"rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum\n" +
"dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore\n" +
"magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet\n" +
"clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. ");
}
}
@ -176,4 +189,4 @@ class InvisibleElementRemovalServiceTest {
}
}
}