memory optimisations
This commit is contained in:
parent
e926083881
commit
1b4ab8dc88
@ -1,7 +1,5 @@
|
||||
package com.iqser.red.pdftronlogic.commons;
|
||||
|
||||
|
||||
|
||||
import static com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService.TOLERANCE;
|
||||
|
||||
import java.awt.geom.Area;
|
||||
@ -35,6 +33,13 @@ public class ClippingPathStack {
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void intersectClippingPath(Rectangle2D path) {
|
||||
|
||||
getCurrentClippingPath().intersect(new Area(path));
|
||||
}
|
||||
|
||||
|
||||
public boolean almostIntersects(double x, double y, double width, double height) {
|
||||
// To address inconsistencies in the calculation of the bounding box we slightly increase the rectangle
|
||||
// Height or width are zero for straight lines, even though they are being rendered. Therefore, height or width must be at minimum >0.
|
||||
@ -56,15 +61,16 @@ public class ClippingPathStack {
|
||||
public void enterNewGState() {
|
||||
|
||||
Area current = stack.peek();
|
||||
Area cloned = new Area();
|
||||
cloned.add(current);
|
||||
Area cloned = (Area) current.clone();
|
||||
stack.push(cloned);
|
||||
}
|
||||
|
||||
|
||||
public void leaveGState() {
|
||||
|
||||
stack.pop();
|
||||
// somehow this greatly helps memory management
|
||||
var popped = stack.pop();
|
||||
popped.reset();
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -4,6 +4,7 @@ import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.Element;
|
||||
|
||||
public class ElementFeatureFactory {
|
||||
|
||||
public static ElementFeatures extractFeatures(Element element) throws PDFNetException {
|
||||
|
||||
return switch (element.getType()) {
|
||||
@ -16,7 +17,9 @@ public class ElementFeatureFactory {
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
public static ElementFeatures extractFeaturesWithHash(Element element, String hashObject) throws PDFNetException {
|
||||
|
||||
return buildImage(element)
|
||||
.hashOfImage(hashObject)
|
||||
.build();
|
||||
@ -25,56 +28,65 @@ public class ElementFeatureFactory {
|
||||
|
||||
private static ElementFeatures.Form buildForm(Element element) throws PDFNetException {
|
||||
|
||||
return ElementFeatures.Form.builder()
|
||||
.elementType(element.getType())
|
||||
.boundingBox(Converter.toRectangle2D(element.getBBox()))
|
||||
.xObjectType(element.getXObject().getType())
|
||||
.dictOrArrayOrStreamLength(element.getXObject().getType() == 7 ? element.getXObject().getDecodedStream().size() : 0)
|
||||
.build();
|
||||
try (var bbox = element.getBBox();) {
|
||||
return ElementFeatures.Form.builder()
|
||||
.elementType(element.getType())
|
||||
.boundingBox(Converter.toRectangle2D(bbox))
|
||||
.xObjectType(element.getXObject().getType())
|
||||
.dictOrArrayOrStreamLength(element.getXObject().getType() == 7 ? element.getXObject().getDecodedStream().size() : 0)
|
||||
.build();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static ElementFeatures.Image.ImageBuilder<?, ?> buildImage(Element element) throws PDFNetException {
|
||||
|
||||
return ElementFeatures.Image.builder()
|
||||
.elementType(element.getType())
|
||||
.boundingBox(Converter.toRectangle2D(element.getBBox()))
|
||||
.dataSize(element.getImageDataSize())
|
||||
.height(element.getImageHeight())
|
||||
.width(element.getImageWidth())
|
||||
.renderingIntent(element.getImageRenderingIntent())
|
||||
.componentNum(element.getComponentNum())
|
||||
.bitsPerComponent(element.getBitsPerComponent());
|
||||
try (var bbox = element.getBBox();) {
|
||||
return ElementFeatures.Image.builder()
|
||||
.elementType(element.getType())
|
||||
.boundingBox(Converter.toRectangle2D(bbox))
|
||||
.dataSize(element.getImageDataSize())
|
||||
.height(element.getImageHeight())
|
||||
.width(element.getImageWidth())
|
||||
.renderingIntent(element.getImageRenderingIntent())
|
||||
.componentNum(element.getComponentNum())
|
||||
.bitsPerComponent(element.getBitsPerComponent());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static ElementFeatures.Text buildText(Element element) throws PDFNetException {
|
||||
|
||||
return ElementFeatures.Text.builder()
|
||||
.elementType(element.getType())
|
||||
.boundingBox(Converter.toRectangle2D(element.getBBox()))
|
||||
.text(element.getTextString())
|
||||
.font(element.getGState().getFont().getType())
|
||||
.fontsize(element.getGState().getFontSize())
|
||||
.build();
|
||||
try (var bbox = element.getBBox();) {
|
||||
return ElementFeatures.Text.builder()
|
||||
.elementType(element.getType())
|
||||
.boundingBox(Converter.toRectangle2D(bbox))
|
||||
.text(element.getTextString())
|
||||
.font(element.getGState().getFont().getType())
|
||||
.fontsize(element.getGState().getFontSize())
|
||||
.build();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static ElementFeatures.Path buildPath(Element element) throws PDFNetException {
|
||||
|
||||
return ElementFeatures.Path.builder()
|
||||
.elementType(element.getType())
|
||||
.boundingBox(Converter.toRectangle2D(element.getBBox()))
|
||||
.isClippingPath(element.isClippingPath())
|
||||
.isClipWindingFill(element.isClipWindingFill())
|
||||
.isStroked(element.isStroked())
|
||||
.isFilled(element.isFilled())
|
||||
.isWindingFill(element.isWindingFill())
|
||||
.fillColor(Converter.convertColor(element.getGState().getFillColorSpace(), element.getGState().getFillColor()))
|
||||
.strokeColor(Converter.convertColor(element.getGState().getStrokeColorSpace(), element.getGState().getStrokeColor()))
|
||||
.linePath(Converter.convertToGeneralPathAndTransformToInitialUserSpace(element.getPathData(), element.getCTM()))
|
||||
.build();
|
||||
try (var bbox = element.getBBox(); var ctm = element.getCTM();
|
||||
var fillColor = element.getGState().getFillColor();
|
||||
var strokeColor = element.getGState().getStrokeColor()) {
|
||||
return ElementFeatures.Path.builder()
|
||||
.elementType(element.getType())
|
||||
.boundingBox(Converter.toRectangle2D(bbox))
|
||||
.isClippingPath(element.isClippingPath())
|
||||
.isClipWindingFill(element.isClipWindingFill())
|
||||
.isStroked(element.isStroked())
|
||||
.isFilled(element.isFilled())
|
||||
.isWindingFill(element.isWindingFill())
|
||||
.fillColor(Converter.convertColor(element.getGState().getFillColorSpace(), fillColor))
|
||||
.strokeColor(Converter.convertColor(element.getGState().getStrokeColorSpace(), strokeColor))
|
||||
.linePath(Converter.convertToGeneralPathAndTransformToInitialUserSpace(element.getPathData(), ctm))
|
||||
.build();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@ -31,9 +31,11 @@ public class ElementFeatures {
|
||||
|
||||
public boolean almostMatches(Element element) throws PDFNetException {
|
||||
|
||||
return element.getType() == elementType && //
|
||||
element.getBBox() != null && //
|
||||
rectsAlmostMatch(element.getBBox());
|
||||
try (var bbox = element.getBBox()) {
|
||||
return element.getType() == elementType && //
|
||||
bbox != null && //
|
||||
rectsAlmostMatch(bbox);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -42,9 +44,9 @@ public class ElementFeatures {
|
||||
// To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance
|
||||
|
||||
return almostEqual(bBox.getX1(), boundingBox.getX()) && //
|
||||
almostEqual(bBox.getY1(), boundingBox.getY()) && //
|
||||
almostEqual(bBox.getWidth(), boundingBox.getWidth()) && //
|
||||
almostEqual(bBox.getHeight(), boundingBox.getHeight());
|
||||
almostEqual(bBox.getY1(), boundingBox.getY()) && //
|
||||
almostEqual(bBox.getWidth(), boundingBox.getWidth()) && //
|
||||
almostEqual(bBox.getHeight(), boundingBox.getHeight());
|
||||
}
|
||||
|
||||
|
||||
@ -65,9 +67,9 @@ public class ElementFeatures {
|
||||
// To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance
|
||||
|
||||
return almostEqual(bBox.getX(), boundingBox.getX()) && //
|
||||
almostEqual(bBox.getY(), boundingBox.getY()) && //
|
||||
almostEqual(bBox.getWidth(), boundingBox.getWidth()) && //
|
||||
almostEqual(bBox.getHeight(), boundingBox.getHeight());
|
||||
almostEqual(bBox.getY(), boundingBox.getY()) && //
|
||||
almostEqual(bBox.getWidth(), boundingBox.getWidth()) && //
|
||||
almostEqual(bBox.getHeight(), boundingBox.getHeight());
|
||||
}
|
||||
|
||||
|
||||
@ -81,9 +83,9 @@ public class ElementFeatures {
|
||||
// To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance
|
||||
|
||||
return isPositionSimilar(rectangle2D.getX(), boundingBox.getX(), rectangle2D.getWidth()) && //
|
||||
isPositionSimilar(rectangle2D.getY(), boundingBox.getY(), rectangle2D.getHeight()) && //
|
||||
isSizeSimilar(rectangle2D.getWidth(), boundingBox.getWidth()) && //
|
||||
isSizeSimilar(rectangle2D.getHeight(), boundingBox.getHeight());
|
||||
isPositionSimilar(rectangle2D.getY(), boundingBox.getY(), rectangle2D.getHeight()) && //
|
||||
isSizeSimilar(rectangle2D.getWidth(), boundingBox.getWidth()) && //
|
||||
isSizeSimilar(rectangle2D.getHeight(), boundingBox.getHeight());
|
||||
}
|
||||
|
||||
|
||||
@ -115,9 +117,9 @@ public class ElementFeatures {
|
||||
public boolean almostMatches(Element element) throws PDFNetException {
|
||||
|
||||
return super.almostMatches(element) && //
|
||||
text.equals(element.getTextString()) && //
|
||||
font == element.getGState().getFont().getType() && //
|
||||
almostEqual(fontsize, element.getGState().getFontSize());
|
||||
text.equals(element.getTextString()) && //
|
||||
font == element.getGState().getFont().getType() && //
|
||||
almostEqual(fontsize, element.getGState().getFontSize());
|
||||
}
|
||||
|
||||
}
|
||||
@ -142,11 +144,11 @@ public class ElementFeatures {
|
||||
public boolean almostMatches(Element element) throws PDFNetException {
|
||||
|
||||
return super.almostMatches(element) && //
|
||||
isClippingPath == element.isClippingPath() && //
|
||||
isClipWindingFill == element.isClipWindingFill() && //
|
||||
isStroked == element.isStroked() && //
|
||||
isFilled == element.isFilled() && //
|
||||
isWindingFill == element.isWindingFill();
|
||||
isClippingPath == element.isClippingPath() && //
|
||||
isClipWindingFill == element.isClipWindingFill() && //
|
||||
isStroked == element.isStroked() && //
|
||||
isFilled == element.isFilled() && //
|
||||
isWindingFill == element.isWindingFill();
|
||||
|
||||
}
|
||||
|
||||
@ -161,7 +163,7 @@ public class ElementFeatures {
|
||||
public boolean isBackground(Rect area) {
|
||||
|
||||
return isFilled && //
|
||||
getBoundingBox().intersects(area.getX1(), area.getY1(), area.getWidth(), area.getHeight());
|
||||
getBoundingBox().intersects(area.getX1(), area.getY1(), area.getWidth(), area.getHeight());
|
||||
}
|
||||
|
||||
}
|
||||
@ -185,12 +187,12 @@ public class ElementFeatures {
|
||||
public boolean almostMatches(Element element) throws PDFNetException {
|
||||
|
||||
return super.almostMatches(element) && //
|
||||
dataSize == element.getImageDataSize() && //
|
||||
height == element.getImageHeight() && //
|
||||
width == element.getImageWidth() && //
|
||||
renderingIntent == element.getImageRenderingIntent() && //
|
||||
componentNum == element.getComponentNum() && //
|
||||
bitsPerComponent == element.getBitsPerComponent();
|
||||
dataSize == element.getImageDataSize() && //
|
||||
height == element.getImageHeight() && //
|
||||
width == element.getImageWidth() && //
|
||||
renderingIntent == element.getImageRenderingIntent() && //
|
||||
componentNum == element.getComponentNum() && //
|
||||
bitsPerComponent == element.getBitsPerComponent();
|
||||
}
|
||||
|
||||
|
||||
@ -199,15 +201,18 @@ public class ElementFeatures {
|
||||
if (elementFeatures.getClass() != this.getClass()) {
|
||||
return false;
|
||||
}
|
||||
return super.almostMatches(elementFeatures) && this.dataSize == ((Image) elementFeatures).getDataSize() && this.height == ((Image) elementFeatures).getHeight() && this.width == ((Image) elementFeatures).getWidth() && this.renderingIntent == ((Image) elementFeatures).getRenderingIntent() && this.componentNum == ((Image) elementFeatures).getComponentNum() && this.bitsPerComponent == ((Image) elementFeatures).getBitsPerComponent() && calculateHammingDistance(
|
||||
((Image) elementFeatures).getHashOfImage()) <= HAMMING_DISTANCE_THRESHOLD;
|
||||
return super.almostMatches(elementFeatures) && this.dataSize == ((Image) elementFeatures).getDataSize() && this.height == ((Image) elementFeatures).getHeight() &&
|
||||
this.width == ((Image) elementFeatures).getWidth() && this.renderingIntent == ((Image) elementFeatures).getRenderingIntent() &&
|
||||
this.componentNum == ((Image) elementFeatures).getComponentNum() && this.bitsPerComponent == ((Image) elementFeatures).getBitsPerComponent() &&
|
||||
calculateHammingDistance(
|
||||
((Image) elementFeatures).getHashOfImage()) <= HAMMING_DISTANCE_THRESHOLD;
|
||||
}
|
||||
|
||||
|
||||
public boolean isSimilarTo(ElementFeatures elementFeatures) {
|
||||
|
||||
return super.isSimilarTo(elementFeatures) && //
|
||||
calculateHammingDistance(((Image) elementFeatures).getHashOfImage()) <= HAMMING_DISTANCE_THRESHOLD;
|
||||
calculateHammingDistance(((Image) elementFeatures).getHashOfImage()) <= HAMMING_DISTANCE_THRESHOLD;
|
||||
}
|
||||
|
||||
|
||||
@ -241,10 +246,12 @@ public class ElementFeatures {
|
||||
@Override
|
||||
public boolean almostMatches(Element element) throws PDFNetException {
|
||||
|
||||
return element.getType() == getElementType() && //
|
||||
element.getBBox() != null && //
|
||||
(super.rectsAlmostMatch(element.getBBox()) || almostRotateMatches(element.getBBox().getRectangle())) && xObjectType == element.getXObject()
|
||||
.getType() && dictOrArrayOrStreamLength == element.getXObject().getDecodedStream().size();
|
||||
try (var bbox = element.getBBox()) {
|
||||
return element.getType() == getElementType() && //
|
||||
bbox != null && //
|
||||
(super.rectsAlmostMatch(bbox) || almostRotateMatches(bbox.getRectangle())) && xObjectType == element.getXObject()
|
||||
.getType() && dictOrArrayOrStreamLength == element.getXObject().getDecodedStream().size();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -253,9 +260,11 @@ public class ElementFeatures {
|
||||
if (elementFeatures.getClass() != this.getClass()) {
|
||||
return false;
|
||||
}
|
||||
return elementFeatures.getElementType() == getElementType() && elementFeatures.getBoundingBox() != null && (super.rectsAlmostMatch(elementFeatures.getBoundingBox()) || almostRotateMatches(
|
||||
elementFeatures.getBoundingBox()
|
||||
.getBounds2D())) && xObjectType == ((Form) elementFeatures).getXObjectType() && dictOrArrayOrStreamLength == ((Form) elementFeatures).getDictOrArrayOrStreamLength();
|
||||
return elementFeatures.getElementType() == getElementType() && elementFeatures.getBoundingBox() != null &&
|
||||
(super.rectsAlmostMatch(elementFeatures.getBoundingBox()) || almostRotateMatches(
|
||||
elementFeatures.getBoundingBox()
|
||||
.getBounds2D())) && xObjectType == ((Form) elementFeatures).getXObjectType() &&
|
||||
dictOrArrayOrStreamLength == ((Form) elementFeatures).getDictOrArrayOrStreamLength();
|
||||
|
||||
}
|
||||
|
||||
@ -263,7 +272,7 @@ public class ElementFeatures {
|
||||
private boolean almostRotateMatches(Rectangle2D bBox) {
|
||||
|
||||
return almostEqual(bBox.getWidth(), getBoundingBox().getHeight()) && //
|
||||
almostEqual(bBox.getHeight(), getBoundingBox().getWidth());
|
||||
almostEqual(bBox.getHeight(), getBoundingBox().getWidth());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -63,15 +63,12 @@ public class InvisibleElementRemovalService {
|
||||
|
||||
PDFDoc pdfDoc = new PDFDoc(pdfFile);
|
||||
|
||||
execute(pdfDoc, delta, removePaths, markedContentToIgnore);
|
||||
|
||||
try {
|
||||
try (pdfDoc) {
|
||||
execute(pdfDoc, delta, removePaths, markedContentToIgnore);
|
||||
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
|
||||
} catch (Exception e) {
|
||||
log.error("File could not be saved after invisible element removal");
|
||||
throw new RuntimeException(e);
|
||||
} finally {
|
||||
pdfDoc.close();
|
||||
}
|
||||
|
||||
}
|
||||
@ -151,8 +148,8 @@ public class InvisibleElementRemovalService {
|
||||
private void execute(PDFDoc pdfDoc, boolean delta, boolean removePaths, Set<String> markedContentToIgnore) {
|
||||
|
||||
log.info("Start removing invisible Elements");
|
||||
try(ElementWriter writer = new ElementWriter();
|
||||
ElementReader reader = new ElementReader()) {
|
||||
try (ElementWriter writer = new ElementWriter();
|
||||
ElementReader reader = new ElementReader()) {
|
||||
Set<Long> visitedXObjIds = new TreeSet<>();
|
||||
|
||||
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
|
||||
@ -236,7 +233,7 @@ public class InvisibleElementRemovalService {
|
||||
|
||||
private void processImages(Element imageElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
||||
|
||||
try(Rect rect = imageElement.getBBox()) {
|
||||
try (Rect rect = imageElement.getBBox()) {
|
||||
|
||||
if (rect == null) {
|
||||
return;
|
||||
@ -257,7 +254,7 @@ public class InvisibleElementRemovalService {
|
||||
|
||||
private void processText(Element textElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
||||
|
||||
try(Rect textBBox = textElement.getBBox()) {
|
||||
try (Rect textBBox = textElement.getBBox()) {
|
||||
|
||||
if (textBBox == null) {
|
||||
writer.writeElement(textElement);
|
||||
@ -289,13 +286,17 @@ public class InvisibleElementRemovalService {
|
||||
if (!inClippingPath) {
|
||||
gState.setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||
// red for elements removed by clipping path
|
||||
gState.setFillColor(new ColorPt(1, 0, 0));
|
||||
try (var color = new ColorPt(1, 0, 0)) {
|
||||
gState.setFillColor(color);
|
||||
}
|
||||
writer.writeElement(textElement);
|
||||
}
|
||||
if (!isTextVisible) {
|
||||
gState.setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||
// blue for elements removed due to transparency or not rendered or same color as background
|
||||
gState.setFillColor(new ColorPt(0, 0, 1));
|
||||
try (var color = new ColorPt(0, 0, 1)) {
|
||||
gState.setFillColor(color);
|
||||
}
|
||||
gState.setTextRenderMode(GState.e_fill_text);
|
||||
gState.setFillOpacity(1);
|
||||
writer.writeElement(textElement);
|
||||
@ -314,19 +315,21 @@ public class InvisibleElementRemovalService {
|
||||
context.visitedXObjIds().add(formObj.getObjNum());
|
||||
// writer needs to be newly initialized when entering a new content stream
|
||||
// see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
|
||||
try(ElementWriter formWriter = new ElementWriter()) {
|
||||
try (ElementWriter formWriter = new ElementWriter()) {
|
||||
context.clippingPathStack().enterNewGState();
|
||||
context.clippingPathStack().intersectClippingPath(new GeneralPath(Converter.toRectangle2D(formElement.getBBox())));
|
||||
context.reader().formBegin();
|
||||
formWriter.begin(formObj);
|
||||
try (var formElementBBOX = formElement.getBBox()) {
|
||||
context.clippingPathStack().intersectClippingPath(Converter.toRectangle2D(formElementBBOX));
|
||||
context.reader().formBegin();
|
||||
formWriter.begin(formObj);
|
||||
|
||||
context.reader().clearChangeList();
|
||||
formWriter.setDefaultGState(context.reader());
|
||||
context.reader().clearChangeList();
|
||||
formWriter.setDefaultGState(context.reader());
|
||||
|
||||
processElements(formWriter, context);
|
||||
formWriter.end();
|
||||
context.reader().end();
|
||||
context.clippingPathStack().leaveGState();
|
||||
processElements(formWriter, context);
|
||||
formWriter.end();
|
||||
context.reader().end();
|
||||
context.clippingPathStack().leaveGState();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -336,52 +339,58 @@ public class InvisibleElementRemovalService {
|
||||
|
||||
PathData pathData = pathElement.getPathData();
|
||||
|
||||
if (pathData.getOperators().length == 0 && pathData.getPoints().length == 0 || pathElement.getBBox() == null) {
|
||||
writer.writeElement(pathElement);
|
||||
return;
|
||||
}
|
||||
|
||||
GeneralPath linePath = Converter.convertToGeneralPathAndTransformToInitialUserSpace(pathData, pathElement.getCTM());
|
||||
|
||||
var rect = linePath.getBounds2D();
|
||||
|
||||
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX(), rect.getY(), rect.getWidth(), rect.getHeight());
|
||||
|
||||
if (pathElement.isClippingPath()) {
|
||||
if (pathElement.isClipWindingFill()) {
|
||||
linePath.setWindingRule(GeneralPath.WIND_NON_ZERO);
|
||||
} else {
|
||||
linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD);
|
||||
try (var bbox = pathElement.getBBox()) {
|
||||
if (pathData.getOperators().length == 0 && pathData.getPoints().length == 0 || bbox == null) {
|
||||
writer.writeElement(pathElement);
|
||||
return;
|
||||
}
|
||||
|
||||
context.clippingPathStack().intersectClippingPath(linePath);
|
||||
pathElement.setPathClip(!context.delta());
|
||||
writer.writeElement(pathElement);
|
||||
try (var ctm = pathElement.getCTM()) {
|
||||
GeneralPath linePath = Converter.convertToGeneralPathAndTransformToInitialUserSpace(pathData, ctm);
|
||||
|
||||
} else {
|
||||
if (pathElement.isWindingFill()) {
|
||||
linePath.setWindingRule(GeneralPath.WIND_NON_ZERO);
|
||||
} else {
|
||||
linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD);
|
||||
}
|
||||
var rect = linePath.getBounds2D();
|
||||
|
||||
if (inClippingPath) {
|
||||
if (isFilledAndNonTransparent(pathElement)) {
|
||||
calculateOverlapsForLinePath(context, linePath);
|
||||
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX(), rect.getY(), rect.getWidth(), rect.getHeight());
|
||||
|
||||
if (pathElement.isClippingPath()) {
|
||||
if (pathElement.isClipWindingFill()) {
|
||||
linePath.setWindingRule(GeneralPath.WIND_NON_ZERO);
|
||||
} else {
|
||||
linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD);
|
||||
}
|
||||
|
||||
context.clippingPathStack().intersectClippingPath(linePath);
|
||||
pathElement.setPathClip(!context.delta());
|
||||
writer.writeElement(pathElement);
|
||||
|
||||
} else {
|
||||
if (pathElement.isWindingFill()) {
|
||||
linePath.setWindingRule(GeneralPath.WIND_NON_ZERO);
|
||||
} else {
|
||||
linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD);
|
||||
}
|
||||
|
||||
if (inClippingPath) {
|
||||
if (isFilledAndNonTransparent(pathElement)) {
|
||||
calculateOverlapsForLinePath(context, linePath);
|
||||
}
|
||||
context.visibleElements().add(ElementFeatureFactory.extractFeatures(pathElement));
|
||||
}
|
||||
|
||||
if (!context.delta() && (inClippingPath || !context.removePaths())) {
|
||||
writer.writeElement(pathElement);
|
||||
}
|
||||
|
||||
if (context.delta() && !inClippingPath && context.removePaths()) {
|
||||
try (var color = new ColorPt(1, 0, 0)) {
|
||||
pathElement.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||
pathElement.getGState().setFillColor(color);
|
||||
pathElement.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
||||
pathElement.getGState().setStrokeColor(color);
|
||||
writer.writeElement(pathElement);
|
||||
}
|
||||
}
|
||||
}
|
||||
context.visibleElements().add(ElementFeatureFactory.extractFeatures(pathElement));
|
||||
}
|
||||
|
||||
if (!context.delta() && (inClippingPath || !context.removePaths())) {
|
||||
writer.writeElement(pathElement);
|
||||
}
|
||||
|
||||
if (context.delta() && !inClippingPath && context.removePaths()) {
|
||||
pathElement.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||
pathElement.getGState().setFillColor(new ColorPt(1, 0, 0));
|
||||
pathElement.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
||||
pathElement.getGState().setStrokeColor(new ColorPt(1, 0, 0));
|
||||
writer.writeElement(pathElement);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -480,7 +489,7 @@ public class InvisibleElementRemovalService {
|
||||
context.visitedXObjIds().add(formObj.getObjNum());
|
||||
// writer needs to be newly initialized when entering a new content stream
|
||||
// see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
|
||||
try(ElementWriter formWriter = new ElementWriter()) {
|
||||
try (ElementWriter formWriter = new ElementWriter()) {
|
||||
context.reader().formBegin();
|
||||
formWriter.begin(formObj);
|
||||
|
||||
@ -516,7 +525,9 @@ public class InvisibleElementRemovalService {
|
||||
|
||||
private boolean fillIsVisible(GState gState, Rect textBBox, InvisibleElementRemovalContext context) throws PDFNetException {
|
||||
|
||||
return gState.getFillOpacity() != 0 && differentColorThanBackgroundColor(Converter.convertColor(gState.getFillColorSpace(), gState.getFillColor()), textBBox, context);
|
||||
try (var color = gState.getFillColor()) {
|
||||
return gState.getFillOpacity() != 0 && differentColorThanBackgroundColor(Converter.convertColor(gState.getFillColorSpace(), color), textBBox, context);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -541,13 +552,16 @@ public class InvisibleElementRemovalService {
|
||||
|
||||
private static List<ElementFeatures.Path> findVisiblePathElementsThatIntersect(Rect textBBox, InvisibleElementRemovalContext context) {
|
||||
|
||||
return context.visibleElements()
|
||||
.stream()
|
||||
.filter(elementFeatures -> elementFeatures.getElementType() == Element.e_path)
|
||||
.map(elementFeatures -> (ElementFeatures.Path) elementFeatures)
|
||||
.filter(elementFeatures -> !elementFeatures.getFillColor().equals(Color.WHITE))
|
||||
.filter(element -> element.isBackground(textBBox))
|
||||
.toList();
|
||||
var result = new ArrayList<ElementFeatures.Path>();
|
||||
for (var element : context.visibleElements()) {
|
||||
if (element.getElementType() == Element.e_path
|
||||
&& !((ElementFeatures.Path) element).getFillColor().equals(Color.WHITE)
|
||||
&& ((ElementFeatures.Path) element).isBackground(textBBox)) {
|
||||
result.add((ElementFeatures.Path) element);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -581,10 +595,10 @@ public class InvisibleElementRemovalService {
|
||||
@SneakyThrows
|
||||
private void drawBBox(ElementWriter writer, Rectangle2D r, String hexcolor) {
|
||||
|
||||
try(ColorPt colorPt = new ColorPt(Integer.valueOf(hexcolor.substring(1, 3), 16) / 255d,
|
||||
try (ColorPt colorPt = new ColorPt(Integer.valueOf(hexcolor.substring(1, 3), 16) / 255d,
|
||||
Integer.valueOf(hexcolor.substring(3, 5), 16) / 255d,
|
||||
Integer.valueOf(hexcolor.substring(5, 7), 16) / 255d);
|
||||
ElementBuilder eb = new ElementBuilder()) {
|
||||
ElementBuilder eb = new ElementBuilder()) {
|
||||
Element rect = eb.createRect(r.getX(), r.getY(), r.getWidth(), r.getHeight());
|
||||
rect.setPathStroke(true);
|
||||
rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
||||
@ -608,4 +622,4 @@ public class InvisibleElementRemovalService {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -93,7 +93,7 @@ public class WatermarkRemovalService {
|
||||
Map<Long, List<ElementFeatures>> formObjectsAndImagesForPages = new HashMap<>();
|
||||
Set<Long> visitedXObjIds = new TreeSet<>();
|
||||
|
||||
try(ElementReader reader = new ElementReader()) {
|
||||
try (ElementReader reader = new ElementReader()) {
|
||||
|
||||
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
|
||||
|
||||
@ -123,14 +123,16 @@ public class WatermarkRemovalService {
|
||||
double minAreaCoveringPage,
|
||||
Page page) throws PDFNetException {
|
||||
|
||||
if (element.getBBox() == null) {
|
||||
return;
|
||||
}
|
||||
try (var bbox = element.getBBox()) {
|
||||
if (bbox == null) {
|
||||
return;
|
||||
}
|
||||
|
||||
switch (element.getType()) {
|
||||
case Element.e_form -> processXObject(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage, page);
|
||||
case Element.e_image, Element.e_inline_image -> processImages(element, elementFeaturesLinkedList, page, minAreaCoveringPage);
|
||||
case Element.e_text -> processText(element, elementFeaturesLinkedList, page);
|
||||
switch (element.getType()) {
|
||||
case Element.e_form -> processXObject(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage, page);
|
||||
case Element.e_image, Element.e_inline_image -> processImages(element, elementFeaturesLinkedList, page, minAreaCoveringPage);
|
||||
case Element.e_text -> processText(element, elementFeaturesLinkedList, page);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -147,11 +149,13 @@ public class WatermarkRemovalService {
|
||||
return;
|
||||
}
|
||||
|
||||
boolean isBigEnough = Math.abs(element.getBBox().getY1() - element.getBBox().getY2()) > page.getPageHeight() * MIN_TEXTWATERMARK_HEIGHT_THRESHOLD;
|
||||
try (var bbox = element.getBBox()) {
|
||||
boolean isBigEnough = Math.abs(bbox.getY1() - bbox.getY2()) > page.getPageHeight() * MIN_TEXTWATERMARK_HEIGHT_THRESHOLD;
|
||||
|
||||
if (isBigEnough) {
|
||||
ElementFeatures elementFeatures = ElementFeatureFactory.extractFeatures(element);
|
||||
elementFeaturesLinkedList.add(elementFeatures);
|
||||
if (isBigEnough) {
|
||||
ElementFeatures elementFeatures = ElementFeatureFactory.extractFeatures(element);
|
||||
elementFeaturesLinkedList.add(elementFeatures);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -160,8 +164,10 @@ public class WatermarkRemovalService {
|
||||
@SneakyThrows
|
||||
private boolean isTextRotated(Element element) {
|
||||
|
||||
return Math.abs(element.getCTM().getB()) < Math.sin(Math.toRadians(ROTATED_TEXT_THRESHOLD)) || Math.abs(element.getCTM()
|
||||
.getB()) > Math.sin(Math.toRadians(70 - ROTATED_TEXT_THRESHOLD));
|
||||
try (var ctm = element.getCTM()) {
|
||||
return Math.abs(ctm.getB()) < Math.sin(Math.toRadians(ROTATED_TEXT_THRESHOLD)) || Math.abs(ctm
|
||||
.getB()) > Math.sin(Math.toRadians(70 - ROTATED_TEXT_THRESHOLD));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -185,13 +191,15 @@ public class WatermarkRemovalService {
|
||||
if (element.getXObject() == null) {
|
||||
return;
|
||||
}
|
||||
if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringPage && isLocatedNearBorder(element, page)) {
|
||||
return;
|
||||
}
|
||||
try (var bbox = element.getBBox()) {
|
||||
if (bbox.getHeight() * bbox.getWidth() < minAreaCoveringPage && isLocatedNearBorder(element, page)) {
|
||||
return;
|
||||
}
|
||||
|
||||
String hashOfImage = ImageHashFactory.calculate(element);
|
||||
ElementFeatures elementFeatures = ElementFeatureFactory.extractFeaturesWithHash(element, hashOfImage);
|
||||
elementFeaturesLinkedList.add(elementFeatures);
|
||||
String hashOfImage = ImageHashFactory.calculate(element);
|
||||
ElementFeatures elementFeatures = ElementFeatureFactory.extractFeaturesWithHash(element, hashOfImage);
|
||||
elementFeaturesLinkedList.add(elementFeatures);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -199,10 +207,18 @@ public class WatermarkRemovalService {
|
||||
@SneakyThrows
|
||||
private boolean isLocatedNearBorder(Element element, Page page) {
|
||||
|
||||
return element.getBBox().getY1() < page.getVisibleContentBox().getY1() + page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD || element.getBBox()
|
||||
.getY2() > page.getVisibleContentBox().getY2() - page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD || element.getBBox().getX1() < page.getVisibleContentBox()
|
||||
.getX1() + page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD || element.getBBox().getX2() > page.getVisibleContentBox()
|
||||
.getX2() - page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD;
|
||||
try (var bbox = element.getBBox(); var contentBox = page.getVisibleContentBox();) {
|
||||
return bbox.getY1() < contentBox.getY1() + page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD || bbox
|
||||
.getY2() >
|
||||
contentBox.getY2() -
|
||||
page.getPageHeight() *
|
||||
IMAGE_POSITION_HEIGHT_THRESHOLD ||
|
||||
bbox.getX1() < contentBox
|
||||
.getX1() + page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD || bbox.getX2() > contentBox
|
||||
.getX2() -
|
||||
page.getPageWidth() *
|
||||
IMAGE_POSITION_WIDTH_THRESHOLD;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -214,12 +230,14 @@ public class WatermarkRemovalService {
|
||||
double minAreaCoveringPage,
|
||||
Page page) {
|
||||
|
||||
if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringPage) {
|
||||
return;
|
||||
try (var bbox = element.getBBox()) {
|
||||
if (bbox.getHeight() * bbox.getWidth() < minAreaCoveringPage) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (visitedXObjIds.add(element.getXObject().getObjNum())) {
|
||||
try(ElementReader xObjectReader = new ElementReader()) {
|
||||
try (ElementReader xObjectReader = new ElementReader()) {
|
||||
xObjectReader.begin(element.getXObject());
|
||||
for (Element element1 = xObjectReader.next(); element1 != null; element1 = xObjectReader.next()) {
|
||||
processElement(element1, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage, page);
|
||||
@ -244,10 +262,12 @@ public class WatermarkRemovalService {
|
||||
.stream()
|
||||
.flatMap(Collection::stream)
|
||||
.filter(elementFeature -> formObjectsPerPage.values()
|
||||
.stream()
|
||||
.filter(elementFeaturesOnPage -> elementFeaturesOnPage.stream()
|
||||
.anyMatch(elementFeature.getElementType() == Element.e_image || elementFeature.getElementType() == Element.e_inline_image ? elementFeature::isSimilarTo : elementFeature::almostMatches))
|
||||
.count() >= minPagesFilter)
|
||||
.stream()
|
||||
.filter(elementFeaturesOnPage -> elementFeaturesOnPage.stream()
|
||||
.anyMatch(
|
||||
elementFeature.getElementType() == Element.e_image || elementFeature.getElementType() == Element.e_inline_image ?
|
||||
elementFeature::isSimilarTo : elementFeature::almostMatches))
|
||||
.count() >= minPagesFilter)
|
||||
.toList();
|
||||
}
|
||||
|
||||
@ -255,8 +275,8 @@ public class WatermarkRemovalService {
|
||||
@SneakyThrows
|
||||
private void removeAllWatermarks(PDFDoc pdfDoc, List<ElementFeatures> watermarksElementFeaturesList) {
|
||||
|
||||
try(ElementReader reader = new ElementReader();
|
||||
ElementWriter writer = new ElementWriter()) {
|
||||
try (ElementReader reader = new ElementReader();
|
||||
ElementWriter writer = new ElementWriter()) {
|
||||
Set<Long> visitedXObjIds = new TreeSet<>();
|
||||
|
||||
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
|
||||
@ -295,16 +315,24 @@ public class WatermarkRemovalService {
|
||||
|
||||
switch (element.getType()) {
|
||||
case Element.e_image, Element.e_inline_image -> {
|
||||
if (element.getBBox() == null) {
|
||||
writer.writeElement(element);
|
||||
continue;
|
||||
try (var bbox = element.getBBox()) {
|
||||
if (bbox == null) {
|
||||
writer.writeElement(element);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringFromPage && isLocatedNearBorder(element, page) && element.getBBox()
|
||||
.getHeight() * element.getBBox().getWidth() < minAreaCoveringFromPage || element.getXObject() == null) {
|
||||
try (var bbox = element.getBBox()) {
|
||||
if (bbox.getHeight() * bbox.getWidth() < minAreaCoveringFromPage && isLocatedNearBorder(element, page) && bbox
|
||||
.getHeight() *
|
||||
bbox
|
||||
.getWidth() <
|
||||
minAreaCoveringFromPage ||
|
||||
element.getXObject() == null) {
|
||||
|
||||
writer.writeElement(element);
|
||||
continue;
|
||||
writer.writeElement(element);
|
||||
continue;
|
||||
|
||||
}
|
||||
}
|
||||
removeImages(element, writer, watermarksElementFeaturesList);
|
||||
}
|
||||
@ -344,8 +372,10 @@ public class WatermarkRemovalService {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (Math.max(element.getBBox().getY1(), element.getBBox().getY2()) < page.getVisibleContentBox().getY1() + page.getPageHeight() * TEXT_POSITION_THRESHOLD) {
|
||||
return false;
|
||||
try (var bbox = element.getBBox(); var contents = page.getVisibleContentBox();) {
|
||||
if (Math.max(bbox.getY1(), bbox.getY2()) < contents.getY1() + page.getPageHeight() * TEXT_POSITION_THRESHOLD) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
@ -385,7 +415,7 @@ public class WatermarkRemovalService {
|
||||
visitedXObjIds.add(element.getXObject().getObjNum());
|
||||
// writer needs to be newly initialized when entering a new content stream
|
||||
// see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
|
||||
try(ElementWriter formWriter = new ElementWriter()) {
|
||||
try (ElementWriter formWriter = new ElementWriter()) {
|
||||
reader.formBegin();
|
||||
formWriter.begin(element.getXObject());
|
||||
|
||||
|
||||
@ -6,6 +6,7 @@ import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
@ -14,6 +15,7 @@ import com.pdftron.pdf.PDFNet;
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@SuppressWarnings("PMD")
|
||||
@Slf4j
|
||||
class InvisibleElementRemovalServiceTest {
|
||||
|
||||
InvisibleElementRemovalService invisibleElementRemovalService;
|
||||
@ -101,8 +103,19 @@ class InvisibleElementRemovalServiceTest {
|
||||
String result = PdfTextExtraction.extractAllTextFromDocument(in);
|
||||
assertThat(result).contains("#1 Dark",
|
||||
"#13 Yellow",
|
||||
"Ut wisi enim ad minim veniam, quis nostrud exerci tation ullamcorper suscipit lobortis nisl ut aliquip\n" + "ex ea commodo consequat. Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie\n" + "consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim\n" + "qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi.");
|
||||
assertThat(result).doesNotContain("Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut\n" + "labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et\n" + "ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem\n" + "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et\n" + "dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea\n" + "rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum\n" + "dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore\n" + "magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet\n" + "clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. ");
|
||||
"Ut wisi enim ad minim veniam, quis nostrud exerci tation ullamcorper suscipit lobortis nisl ut aliquip\n" +
|
||||
"ex ea commodo consequat. Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie\n" +
|
||||
"consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim\n" +
|
||||
"qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi.");
|
||||
assertThat(result).doesNotContain("Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut\n" +
|
||||
"labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et\n" +
|
||||
"ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem\n" +
|
||||
"ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et\n" +
|
||||
"dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea\n" +
|
||||
"rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum\n" +
|
||||
"dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore\n" +
|
||||
"magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet\n" +
|
||||
"clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. ");
|
||||
}
|
||||
|
||||
}
|
||||
@ -176,4 +189,4 @@ class InvisibleElementRemovalServiceTest {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user