diff --git a/src/main/java/com/iqser/red/pdftronlogic/commons/ClippingPathStack.java b/src/main/java/com/iqser/red/pdftronlogic/commons/ClippingPathStack.java index 5d8f8c0..d3e9c87 100644 --- a/src/main/java/com/iqser/red/pdftronlogic/commons/ClippingPathStack.java +++ b/src/main/java/com/iqser/red/pdftronlogic/commons/ClippingPathStack.java @@ -1,7 +1,5 @@ package com.iqser.red.pdftronlogic.commons; - - import static com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService.TOLERANCE; import java.awt.geom.Area; @@ -35,6 +33,13 @@ public class ClippingPathStack { } + @SneakyThrows + public void intersectClippingPath(Rectangle2D path) { + + getCurrentClippingPath().intersect(new Area(path)); + } + + public boolean almostIntersects(double x, double y, double width, double height) { // To address inconsistencies in the calculation of the bounding box we slightly increase the rectangle // Height or width are zero for straight lines, even though they are being rendered. Therefore, height or width must be at minimum >0. @@ -56,15 +61,16 @@ public class ClippingPathStack { public void enterNewGState() { Area current = stack.peek(); - Area cloned = new Area(); - cloned.add(current); + Area cloned = (Area) current.clone(); stack.push(cloned); } public void leaveGState() { - stack.pop(); + // somehow this greatly helps memory management + var popped = stack.pop(); + popped.reset(); } -} \ No newline at end of file +} diff --git a/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatureFactory.java b/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatureFactory.java index 6f1407e..92ea5dd 100644 --- a/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatureFactory.java +++ b/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatureFactory.java @@ -4,6 +4,7 @@ import com.pdftron.common.PDFNetException; import com.pdftron.pdf.Element; public class ElementFeatureFactory { + public static ElementFeatures extractFeatures(Element element) throws PDFNetException { return switch (element.getType()) { @@ -16,7 +17,9 @@ public class ElementFeatureFactory { }; } + public static ElementFeatures extractFeaturesWithHash(Element element, String hashObject) throws PDFNetException { + return buildImage(element) .hashOfImage(hashObject) .build(); @@ -25,56 +28,65 @@ public class ElementFeatureFactory { private static ElementFeatures.Form buildForm(Element element) throws PDFNetException { - return ElementFeatures.Form.builder() - .elementType(element.getType()) - .boundingBox(Converter.toRectangle2D(element.getBBox())) - .xObjectType(element.getXObject().getType()) - .dictOrArrayOrStreamLength(element.getXObject().getType() == 7 ? element.getXObject().getDecodedStream().size() : 0) - .build(); + try (var bbox = element.getBBox();) { + return ElementFeatures.Form.builder() + .elementType(element.getType()) + .boundingBox(Converter.toRectangle2D(bbox)) + .xObjectType(element.getXObject().getType()) + .dictOrArrayOrStreamLength(element.getXObject().getType() == 7 ? element.getXObject().getDecodedStream().size() : 0) + .build(); + } } private static ElementFeatures.Image.ImageBuilder buildImage(Element element) throws PDFNetException { - return ElementFeatures.Image.builder() - .elementType(element.getType()) - .boundingBox(Converter.toRectangle2D(element.getBBox())) - .dataSize(element.getImageDataSize()) - .height(element.getImageHeight()) - .width(element.getImageWidth()) - .renderingIntent(element.getImageRenderingIntent()) - .componentNum(element.getComponentNum()) - .bitsPerComponent(element.getBitsPerComponent()); + try (var bbox = element.getBBox();) { + return ElementFeatures.Image.builder() + .elementType(element.getType()) + .boundingBox(Converter.toRectangle2D(bbox)) + .dataSize(element.getImageDataSize()) + .height(element.getImageHeight()) + .width(element.getImageWidth()) + .renderingIntent(element.getImageRenderingIntent()) + .componentNum(element.getComponentNum()) + .bitsPerComponent(element.getBitsPerComponent()); + } } private static ElementFeatures.Text buildText(Element element) throws PDFNetException { - return ElementFeatures.Text.builder() - .elementType(element.getType()) - .boundingBox(Converter.toRectangle2D(element.getBBox())) - .text(element.getTextString()) - .font(element.getGState().getFont().getType()) - .fontsize(element.getGState().getFontSize()) - .build(); + try (var bbox = element.getBBox();) { + return ElementFeatures.Text.builder() + .elementType(element.getType()) + .boundingBox(Converter.toRectangle2D(bbox)) + .text(element.getTextString()) + .font(element.getGState().getFont().getType()) + .fontsize(element.getGState().getFontSize()) + .build(); + } } private static ElementFeatures.Path buildPath(Element element) throws PDFNetException { - return ElementFeatures.Path.builder() - .elementType(element.getType()) - .boundingBox(Converter.toRectangle2D(element.getBBox())) - .isClippingPath(element.isClippingPath()) - .isClipWindingFill(element.isClipWindingFill()) - .isStroked(element.isStroked()) - .isFilled(element.isFilled()) - .isWindingFill(element.isWindingFill()) - .fillColor(Converter.convertColor(element.getGState().getFillColorSpace(), element.getGState().getFillColor())) - .strokeColor(Converter.convertColor(element.getGState().getStrokeColorSpace(), element.getGState().getStrokeColor())) - .linePath(Converter.convertToGeneralPathAndTransformToInitialUserSpace(element.getPathData(), element.getCTM())) - .build(); + try (var bbox = element.getBBox(); var ctm = element.getCTM(); + var fillColor = element.getGState().getFillColor(); + var strokeColor = element.getGState().getStrokeColor()) { + return ElementFeatures.Path.builder() + .elementType(element.getType()) + .boundingBox(Converter.toRectangle2D(bbox)) + .isClippingPath(element.isClippingPath()) + .isClipWindingFill(element.isClipWindingFill()) + .isStroked(element.isStroked()) + .isFilled(element.isFilled()) + .isWindingFill(element.isWindingFill()) + .fillColor(Converter.convertColor(element.getGState().getFillColorSpace(), fillColor)) + .strokeColor(Converter.convertColor(element.getGState().getStrokeColorSpace(), strokeColor)) + .linePath(Converter.convertToGeneralPathAndTransformToInitialUserSpace(element.getPathData(), ctm)) + .build(); + } } - } diff --git a/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java b/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java index 3ef3ec7..4332532 100644 --- a/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java +++ b/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java @@ -31,9 +31,11 @@ public class ElementFeatures { public boolean almostMatches(Element element) throws PDFNetException { - return element.getType() == elementType && // - element.getBBox() != null && // - rectsAlmostMatch(element.getBBox()); + try (var bbox = element.getBBox()) { + return element.getType() == elementType && // + bbox != null && // + rectsAlmostMatch(bbox); + } } @@ -42,9 +44,9 @@ public class ElementFeatures { // To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance return almostEqual(bBox.getX1(), boundingBox.getX()) && // - almostEqual(bBox.getY1(), boundingBox.getY()) && // - almostEqual(bBox.getWidth(), boundingBox.getWidth()) && // - almostEqual(bBox.getHeight(), boundingBox.getHeight()); + almostEqual(bBox.getY1(), boundingBox.getY()) && // + almostEqual(bBox.getWidth(), boundingBox.getWidth()) && // + almostEqual(bBox.getHeight(), boundingBox.getHeight()); } @@ -65,9 +67,9 @@ public class ElementFeatures { // To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance return almostEqual(bBox.getX(), boundingBox.getX()) && // - almostEqual(bBox.getY(), boundingBox.getY()) && // - almostEqual(bBox.getWidth(), boundingBox.getWidth()) && // - almostEqual(bBox.getHeight(), boundingBox.getHeight()); + almostEqual(bBox.getY(), boundingBox.getY()) && // + almostEqual(bBox.getWidth(), boundingBox.getWidth()) && // + almostEqual(bBox.getHeight(), boundingBox.getHeight()); } @@ -81,9 +83,9 @@ public class ElementFeatures { // To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance return isPositionSimilar(rectangle2D.getX(), boundingBox.getX(), rectangle2D.getWidth()) && // - isPositionSimilar(rectangle2D.getY(), boundingBox.getY(), rectangle2D.getHeight()) && // - isSizeSimilar(rectangle2D.getWidth(), boundingBox.getWidth()) && // - isSizeSimilar(rectangle2D.getHeight(), boundingBox.getHeight()); + isPositionSimilar(rectangle2D.getY(), boundingBox.getY(), rectangle2D.getHeight()) && // + isSizeSimilar(rectangle2D.getWidth(), boundingBox.getWidth()) && // + isSizeSimilar(rectangle2D.getHeight(), boundingBox.getHeight()); } @@ -115,9 +117,9 @@ public class ElementFeatures { public boolean almostMatches(Element element) throws PDFNetException { return super.almostMatches(element) && // - text.equals(element.getTextString()) && // - font == element.getGState().getFont().getType() && // - almostEqual(fontsize, element.getGState().getFontSize()); + text.equals(element.getTextString()) && // + font == element.getGState().getFont().getType() && // + almostEqual(fontsize, element.getGState().getFontSize()); } } @@ -142,11 +144,11 @@ public class ElementFeatures { public boolean almostMatches(Element element) throws PDFNetException { return super.almostMatches(element) && // - isClippingPath == element.isClippingPath() && // - isClipWindingFill == element.isClipWindingFill() && // - isStroked == element.isStroked() && // - isFilled == element.isFilled() && // - isWindingFill == element.isWindingFill(); + isClippingPath == element.isClippingPath() && // + isClipWindingFill == element.isClipWindingFill() && // + isStroked == element.isStroked() && // + isFilled == element.isFilled() && // + isWindingFill == element.isWindingFill(); } @@ -161,7 +163,7 @@ public class ElementFeatures { public boolean isBackground(Rect area) { return isFilled && // - getBoundingBox().intersects(area.getX1(), area.getY1(), area.getWidth(), area.getHeight()); + getBoundingBox().intersects(area.getX1(), area.getY1(), area.getWidth(), area.getHeight()); } } @@ -185,12 +187,12 @@ public class ElementFeatures { public boolean almostMatches(Element element) throws PDFNetException { return super.almostMatches(element) && // - dataSize == element.getImageDataSize() && // - height == element.getImageHeight() && // - width == element.getImageWidth() && // - renderingIntent == element.getImageRenderingIntent() && // - componentNum == element.getComponentNum() && // - bitsPerComponent == element.getBitsPerComponent(); + dataSize == element.getImageDataSize() && // + height == element.getImageHeight() && // + width == element.getImageWidth() && // + renderingIntent == element.getImageRenderingIntent() && // + componentNum == element.getComponentNum() && // + bitsPerComponent == element.getBitsPerComponent(); } @@ -199,15 +201,18 @@ public class ElementFeatures { if (elementFeatures.getClass() != this.getClass()) { return false; } - return super.almostMatches(elementFeatures) && this.dataSize == ((Image) elementFeatures).getDataSize() && this.height == ((Image) elementFeatures).getHeight() && this.width == ((Image) elementFeatures).getWidth() && this.renderingIntent == ((Image) elementFeatures).getRenderingIntent() && this.componentNum == ((Image) elementFeatures).getComponentNum() && this.bitsPerComponent == ((Image) elementFeatures).getBitsPerComponent() && calculateHammingDistance( - ((Image) elementFeatures).getHashOfImage()) <= HAMMING_DISTANCE_THRESHOLD; + return super.almostMatches(elementFeatures) && this.dataSize == ((Image) elementFeatures).getDataSize() && this.height == ((Image) elementFeatures).getHeight() && + this.width == ((Image) elementFeatures).getWidth() && this.renderingIntent == ((Image) elementFeatures).getRenderingIntent() && + this.componentNum == ((Image) elementFeatures).getComponentNum() && this.bitsPerComponent == ((Image) elementFeatures).getBitsPerComponent() && + calculateHammingDistance( + ((Image) elementFeatures).getHashOfImage()) <= HAMMING_DISTANCE_THRESHOLD; } public boolean isSimilarTo(ElementFeatures elementFeatures) { return super.isSimilarTo(elementFeatures) && // - calculateHammingDistance(((Image) elementFeatures).getHashOfImage()) <= HAMMING_DISTANCE_THRESHOLD; + calculateHammingDistance(((Image) elementFeatures).getHashOfImage()) <= HAMMING_DISTANCE_THRESHOLD; } @@ -241,10 +246,12 @@ public class ElementFeatures { @Override public boolean almostMatches(Element element) throws PDFNetException { - return element.getType() == getElementType() && // - element.getBBox() != null && // - (super.rectsAlmostMatch(element.getBBox()) || almostRotateMatches(element.getBBox().getRectangle())) && xObjectType == element.getXObject() - .getType() && dictOrArrayOrStreamLength == element.getXObject().getDecodedStream().size(); + try (var bbox = element.getBBox()) { + return element.getType() == getElementType() && // + bbox != null && // + (super.rectsAlmostMatch(bbox) || almostRotateMatches(bbox.getRectangle())) && xObjectType == element.getXObject() + .getType() && dictOrArrayOrStreamLength == element.getXObject().getDecodedStream().size(); + } } @@ -253,9 +260,11 @@ public class ElementFeatures { if (elementFeatures.getClass() != this.getClass()) { return false; } - return elementFeatures.getElementType() == getElementType() && elementFeatures.getBoundingBox() != null && (super.rectsAlmostMatch(elementFeatures.getBoundingBox()) || almostRotateMatches( - elementFeatures.getBoundingBox() - .getBounds2D())) && xObjectType == ((Form) elementFeatures).getXObjectType() && dictOrArrayOrStreamLength == ((Form) elementFeatures).getDictOrArrayOrStreamLength(); + return elementFeatures.getElementType() == getElementType() && elementFeatures.getBoundingBox() != null && + (super.rectsAlmostMatch(elementFeatures.getBoundingBox()) || almostRotateMatches( + elementFeatures.getBoundingBox() + .getBounds2D())) && xObjectType == ((Form) elementFeatures).getXObjectType() && + dictOrArrayOrStreamLength == ((Form) elementFeatures).getDictOrArrayOrStreamLength(); } @@ -263,7 +272,7 @@ public class ElementFeatures { private boolean almostRotateMatches(Rectangle2D bBox) { return almostEqual(bBox.getWidth(), getBoundingBox().getHeight()) && // - almostEqual(bBox.getHeight(), getBoundingBox().getWidth()); + almostEqual(bBox.getHeight(), getBoundingBox().getWidth()); } } diff --git a/src/main/java/com/iqser/red/pdftronlogic/commons/InvisibleElementRemovalService.java b/src/main/java/com/iqser/red/pdftronlogic/commons/InvisibleElementRemovalService.java index ae60168..5bd92d3 100644 --- a/src/main/java/com/iqser/red/pdftronlogic/commons/InvisibleElementRemovalService.java +++ b/src/main/java/com/iqser/red/pdftronlogic/commons/InvisibleElementRemovalService.java @@ -63,15 +63,12 @@ public class InvisibleElementRemovalService { PDFDoc pdfDoc = new PDFDoc(pdfFile); - execute(pdfDoc, delta, removePaths, markedContentToIgnore); - - try { + try (pdfDoc) { + execute(pdfDoc, delta, removePaths, markedContentToIgnore); pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null); } catch (Exception e) { log.error("File could not be saved after invisible element removal"); throw new RuntimeException(e); - } finally { - pdfDoc.close(); } } @@ -151,8 +148,8 @@ public class InvisibleElementRemovalService { private void execute(PDFDoc pdfDoc, boolean delta, boolean removePaths, Set markedContentToIgnore) { log.info("Start removing invisible Elements"); - try(ElementWriter writer = new ElementWriter(); - ElementReader reader = new ElementReader()) { + try (ElementWriter writer = new ElementWriter(); + ElementReader reader = new ElementReader()) { Set visitedXObjIds = new TreeSet<>(); for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) { @@ -236,7 +233,7 @@ public class InvisibleElementRemovalService { private void processImages(Element imageElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { - try(Rect rect = imageElement.getBBox()) { + try (Rect rect = imageElement.getBBox()) { if (rect == null) { return; @@ -257,7 +254,7 @@ public class InvisibleElementRemovalService { private void processText(Element textElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { - try(Rect textBBox = textElement.getBBox()) { + try (Rect textBBox = textElement.getBBox()) { if (textBBox == null) { writer.writeElement(textElement); @@ -289,13 +286,17 @@ public class InvisibleElementRemovalService { if (!inClippingPath) { gState.setFillColorSpace(ColorSpace.createDeviceRGB()); // red for elements removed by clipping path - gState.setFillColor(new ColorPt(1, 0, 0)); + try (var color = new ColorPt(1, 0, 0)) { + gState.setFillColor(color); + } writer.writeElement(textElement); } if (!isTextVisible) { gState.setFillColorSpace(ColorSpace.createDeviceRGB()); // blue for elements removed due to transparency or not rendered or same color as background - gState.setFillColor(new ColorPt(0, 0, 1)); + try (var color = new ColorPt(0, 0, 1)) { + gState.setFillColor(color); + } gState.setTextRenderMode(GState.e_fill_text); gState.setFillOpacity(1); writer.writeElement(textElement); @@ -314,19 +315,21 @@ public class InvisibleElementRemovalService { context.visitedXObjIds().add(formObj.getObjNum()); // writer needs to be newly initialized when entering a new content stream // see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest) - try(ElementWriter formWriter = new ElementWriter()) { + try (ElementWriter formWriter = new ElementWriter()) { context.clippingPathStack().enterNewGState(); - context.clippingPathStack().intersectClippingPath(new GeneralPath(Converter.toRectangle2D(formElement.getBBox()))); - context.reader().formBegin(); - formWriter.begin(formObj); + try (var formElementBBOX = formElement.getBBox()) { + context.clippingPathStack().intersectClippingPath(Converter.toRectangle2D(formElementBBOX)); + context.reader().formBegin(); + formWriter.begin(formObj); - context.reader().clearChangeList(); - formWriter.setDefaultGState(context.reader()); + context.reader().clearChangeList(); + formWriter.setDefaultGState(context.reader()); - processElements(formWriter, context); - formWriter.end(); - context.reader().end(); - context.clippingPathStack().leaveGState(); + processElements(formWriter, context); + formWriter.end(); + context.reader().end(); + context.clippingPathStack().leaveGState(); + } } } } @@ -336,52 +339,58 @@ public class InvisibleElementRemovalService { PathData pathData = pathElement.getPathData(); - if (pathData.getOperators().length == 0 && pathData.getPoints().length == 0 || pathElement.getBBox() == null) { - writer.writeElement(pathElement); - return; - } - - GeneralPath linePath = Converter.convertToGeneralPathAndTransformToInitialUserSpace(pathData, pathElement.getCTM()); - - var rect = linePath.getBounds2D(); - - boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX(), rect.getY(), rect.getWidth(), rect.getHeight()); - - if (pathElement.isClippingPath()) { - if (pathElement.isClipWindingFill()) { - linePath.setWindingRule(GeneralPath.WIND_NON_ZERO); - } else { - linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD); + try (var bbox = pathElement.getBBox()) { + if (pathData.getOperators().length == 0 && pathData.getPoints().length == 0 || bbox == null) { + writer.writeElement(pathElement); + return; } - context.clippingPathStack().intersectClippingPath(linePath); - pathElement.setPathClip(!context.delta()); - writer.writeElement(pathElement); + try (var ctm = pathElement.getCTM()) { + GeneralPath linePath = Converter.convertToGeneralPathAndTransformToInitialUserSpace(pathData, ctm); - } else { - if (pathElement.isWindingFill()) { - linePath.setWindingRule(GeneralPath.WIND_NON_ZERO); - } else { - linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD); - } + var rect = linePath.getBounds2D(); - if (inClippingPath) { - if (isFilledAndNonTransparent(pathElement)) { - calculateOverlapsForLinePath(context, linePath); + boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX(), rect.getY(), rect.getWidth(), rect.getHeight()); + + if (pathElement.isClippingPath()) { + if (pathElement.isClipWindingFill()) { + linePath.setWindingRule(GeneralPath.WIND_NON_ZERO); + } else { + linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD); + } + + context.clippingPathStack().intersectClippingPath(linePath); + pathElement.setPathClip(!context.delta()); + writer.writeElement(pathElement); + + } else { + if (pathElement.isWindingFill()) { + linePath.setWindingRule(GeneralPath.WIND_NON_ZERO); + } else { + linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD); + } + + if (inClippingPath) { + if (isFilledAndNonTransparent(pathElement)) { + calculateOverlapsForLinePath(context, linePath); + } + context.visibleElements().add(ElementFeatureFactory.extractFeatures(pathElement)); + } + + if (!context.delta() && (inClippingPath || !context.removePaths())) { + writer.writeElement(pathElement); + } + + if (context.delta() && !inClippingPath && context.removePaths()) { + try (var color = new ColorPt(1, 0, 0)) { + pathElement.getGState().setFillColorSpace(ColorSpace.createDeviceRGB()); + pathElement.getGState().setFillColor(color); + pathElement.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB()); + pathElement.getGState().setStrokeColor(color); + writer.writeElement(pathElement); + } + } } - context.visibleElements().add(ElementFeatureFactory.extractFeatures(pathElement)); - } - - if (!context.delta() && (inClippingPath || !context.removePaths())) { - writer.writeElement(pathElement); - } - - if (context.delta() && !inClippingPath && context.removePaths()) { - pathElement.getGState().setFillColorSpace(ColorSpace.createDeviceRGB()); - pathElement.getGState().setFillColor(new ColorPt(1, 0, 0)); - pathElement.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB()); - pathElement.getGState().setStrokeColor(new ColorPt(1, 0, 0)); - writer.writeElement(pathElement); } } } @@ -480,7 +489,7 @@ public class InvisibleElementRemovalService { context.visitedXObjIds().add(formObj.getObjNum()); // writer needs to be newly initialized when entering a new content stream // see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest) - try(ElementWriter formWriter = new ElementWriter()) { + try (ElementWriter formWriter = new ElementWriter()) { context.reader().formBegin(); formWriter.begin(formObj); @@ -516,7 +525,9 @@ public class InvisibleElementRemovalService { private boolean fillIsVisible(GState gState, Rect textBBox, InvisibleElementRemovalContext context) throws PDFNetException { - return gState.getFillOpacity() != 0 && differentColorThanBackgroundColor(Converter.convertColor(gState.getFillColorSpace(), gState.getFillColor()), textBBox, context); + try (var color = gState.getFillColor()) { + return gState.getFillOpacity() != 0 && differentColorThanBackgroundColor(Converter.convertColor(gState.getFillColorSpace(), color), textBBox, context); + } } @@ -541,13 +552,16 @@ public class InvisibleElementRemovalService { private static List findVisiblePathElementsThatIntersect(Rect textBBox, InvisibleElementRemovalContext context) { - return context.visibleElements() - .stream() - .filter(elementFeatures -> elementFeatures.getElementType() == Element.e_path) - .map(elementFeatures -> (ElementFeatures.Path) elementFeatures) - .filter(elementFeatures -> !elementFeatures.getFillColor().equals(Color.WHITE)) - .filter(element -> element.isBackground(textBBox)) - .toList(); + var result = new ArrayList(); + for (var element : context.visibleElements()) { + if (element.getElementType() == Element.e_path + && !((ElementFeatures.Path) element).getFillColor().equals(Color.WHITE) + && ((ElementFeatures.Path) element).isBackground(textBBox)) { + result.add((ElementFeatures.Path) element); + } + } + return result; + } @@ -581,10 +595,10 @@ public class InvisibleElementRemovalService { @SneakyThrows private void drawBBox(ElementWriter writer, Rectangle2D r, String hexcolor) { - try(ColorPt colorPt = new ColorPt(Integer.valueOf(hexcolor.substring(1, 3), 16) / 255d, + try (ColorPt colorPt = new ColorPt(Integer.valueOf(hexcolor.substring(1, 3), 16) / 255d, Integer.valueOf(hexcolor.substring(3, 5), 16) / 255d, Integer.valueOf(hexcolor.substring(5, 7), 16) / 255d); - ElementBuilder eb = new ElementBuilder()) { + ElementBuilder eb = new ElementBuilder()) { Element rect = eb.createRect(r.getX(), r.getY(), r.getWidth(), r.getHeight()); rect.setPathStroke(true); rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB()); @@ -608,4 +622,4 @@ public class InvisibleElementRemovalService { } -} \ No newline at end of file +} diff --git a/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java b/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java index 34e6cbe..7342c08 100644 --- a/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java +++ b/src/main/java/com/iqser/red/pdftronlogic/commons/WatermarkRemovalService.java @@ -93,7 +93,7 @@ public class WatermarkRemovalService { Map> formObjectsAndImagesForPages = new HashMap<>(); Set visitedXObjIds = new TreeSet<>(); - try(ElementReader reader = new ElementReader()) { + try (ElementReader reader = new ElementReader()) { for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) { @@ -123,14 +123,16 @@ public class WatermarkRemovalService { double minAreaCoveringPage, Page page) throws PDFNetException { - if (element.getBBox() == null) { - return; - } + try (var bbox = element.getBBox()) { + if (bbox == null) { + return; + } - switch (element.getType()) { - case Element.e_form -> processXObject(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage, page); - case Element.e_image, Element.e_inline_image -> processImages(element, elementFeaturesLinkedList, page, minAreaCoveringPage); - case Element.e_text -> processText(element, elementFeaturesLinkedList, page); + switch (element.getType()) { + case Element.e_form -> processXObject(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage, page); + case Element.e_image, Element.e_inline_image -> processImages(element, elementFeaturesLinkedList, page, minAreaCoveringPage); + case Element.e_text -> processText(element, elementFeaturesLinkedList, page); + } } } @@ -147,11 +149,13 @@ public class WatermarkRemovalService { return; } - boolean isBigEnough = Math.abs(element.getBBox().getY1() - element.getBBox().getY2()) > page.getPageHeight() * MIN_TEXTWATERMARK_HEIGHT_THRESHOLD; + try (var bbox = element.getBBox()) { + boolean isBigEnough = Math.abs(bbox.getY1() - bbox.getY2()) > page.getPageHeight() * MIN_TEXTWATERMARK_HEIGHT_THRESHOLD; - if (isBigEnough) { - ElementFeatures elementFeatures = ElementFeatureFactory.extractFeatures(element); - elementFeaturesLinkedList.add(elementFeatures); + if (isBigEnough) { + ElementFeatures elementFeatures = ElementFeatureFactory.extractFeatures(element); + elementFeaturesLinkedList.add(elementFeatures); + } } } @@ -160,8 +164,10 @@ public class WatermarkRemovalService { @SneakyThrows private boolean isTextRotated(Element element) { - return Math.abs(element.getCTM().getB()) < Math.sin(Math.toRadians(ROTATED_TEXT_THRESHOLD)) || Math.abs(element.getCTM() - .getB()) > Math.sin(Math.toRadians(70 - ROTATED_TEXT_THRESHOLD)); + try (var ctm = element.getCTM()) { + return Math.abs(ctm.getB()) < Math.sin(Math.toRadians(ROTATED_TEXT_THRESHOLD)) || Math.abs(ctm + .getB()) > Math.sin(Math.toRadians(70 - ROTATED_TEXT_THRESHOLD)); + } } @@ -185,13 +191,15 @@ public class WatermarkRemovalService { if (element.getXObject() == null) { return; } - if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringPage && isLocatedNearBorder(element, page)) { - return; - } + try (var bbox = element.getBBox()) { + if (bbox.getHeight() * bbox.getWidth() < minAreaCoveringPage && isLocatedNearBorder(element, page)) { + return; + } - String hashOfImage = ImageHashFactory.calculate(element); - ElementFeatures elementFeatures = ElementFeatureFactory.extractFeaturesWithHash(element, hashOfImage); - elementFeaturesLinkedList.add(elementFeatures); + String hashOfImage = ImageHashFactory.calculate(element); + ElementFeatures elementFeatures = ElementFeatureFactory.extractFeaturesWithHash(element, hashOfImage); + elementFeaturesLinkedList.add(elementFeatures); + } } @@ -199,10 +207,18 @@ public class WatermarkRemovalService { @SneakyThrows private boolean isLocatedNearBorder(Element element, Page page) { - return element.getBBox().getY1() < page.getVisibleContentBox().getY1() + page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD || element.getBBox() - .getY2() > page.getVisibleContentBox().getY2() - page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD || element.getBBox().getX1() < page.getVisibleContentBox() - .getX1() + page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD || element.getBBox().getX2() > page.getVisibleContentBox() - .getX2() - page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD; + try (var bbox = element.getBBox(); var contentBox = page.getVisibleContentBox();) { + return bbox.getY1() < contentBox.getY1() + page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD || bbox + .getY2() > + contentBox.getY2() - + page.getPageHeight() * + IMAGE_POSITION_HEIGHT_THRESHOLD || + bbox.getX1() < contentBox + .getX1() + page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD || bbox.getX2() > contentBox + .getX2() - + page.getPageWidth() * + IMAGE_POSITION_WIDTH_THRESHOLD; + } } @@ -214,12 +230,14 @@ public class WatermarkRemovalService { double minAreaCoveringPage, Page page) { - if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringPage) { - return; + try (var bbox = element.getBBox()) { + if (bbox.getHeight() * bbox.getWidth() < minAreaCoveringPage) { + return; + } } if (visitedXObjIds.add(element.getXObject().getObjNum())) { - try(ElementReader xObjectReader = new ElementReader()) { + try (ElementReader xObjectReader = new ElementReader()) { xObjectReader.begin(element.getXObject()); for (Element element1 = xObjectReader.next(); element1 != null; element1 = xObjectReader.next()) { processElement(element1, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage, page); @@ -244,10 +262,12 @@ public class WatermarkRemovalService { .stream() .flatMap(Collection::stream) .filter(elementFeature -> formObjectsPerPage.values() - .stream() - .filter(elementFeaturesOnPage -> elementFeaturesOnPage.stream() - .anyMatch(elementFeature.getElementType() == Element.e_image || elementFeature.getElementType() == Element.e_inline_image ? elementFeature::isSimilarTo : elementFeature::almostMatches)) - .count() >= minPagesFilter) + .stream() + .filter(elementFeaturesOnPage -> elementFeaturesOnPage.stream() + .anyMatch( + elementFeature.getElementType() == Element.e_image || elementFeature.getElementType() == Element.e_inline_image ? + elementFeature::isSimilarTo : elementFeature::almostMatches)) + .count() >= minPagesFilter) .toList(); } @@ -255,8 +275,8 @@ public class WatermarkRemovalService { @SneakyThrows private void removeAllWatermarks(PDFDoc pdfDoc, List watermarksElementFeaturesList) { - try(ElementReader reader = new ElementReader(); - ElementWriter writer = new ElementWriter()) { + try (ElementReader reader = new ElementReader(); + ElementWriter writer = new ElementWriter()) { Set visitedXObjIds = new TreeSet<>(); for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) { @@ -295,16 +315,24 @@ public class WatermarkRemovalService { switch (element.getType()) { case Element.e_image, Element.e_inline_image -> { - if (element.getBBox() == null) { - writer.writeElement(element); - continue; + try (var bbox = element.getBBox()) { + if (bbox == null) { + writer.writeElement(element); + continue; + } } - if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringFromPage && isLocatedNearBorder(element, page) && element.getBBox() - .getHeight() * element.getBBox().getWidth() < minAreaCoveringFromPage || element.getXObject() == null) { + try (var bbox = element.getBBox()) { + if (bbox.getHeight() * bbox.getWidth() < minAreaCoveringFromPage && isLocatedNearBorder(element, page) && bbox + .getHeight() * + bbox + .getWidth() < + minAreaCoveringFromPage || + element.getXObject() == null) { - writer.writeElement(element); - continue; + writer.writeElement(element); + continue; + } } removeImages(element, writer, watermarksElementFeaturesList); } @@ -344,8 +372,10 @@ public class WatermarkRemovalService { return false; } - if (Math.max(element.getBBox().getY1(), element.getBBox().getY2()) < page.getVisibleContentBox().getY1() + page.getPageHeight() * TEXT_POSITION_THRESHOLD) { - return false; + try (var bbox = element.getBBox(); var contents = page.getVisibleContentBox();) { + if (Math.max(bbox.getY1(), bbox.getY2()) < contents.getY1() + page.getPageHeight() * TEXT_POSITION_THRESHOLD) { + return false; + } } return true; } @@ -385,7 +415,7 @@ public class WatermarkRemovalService { visitedXObjIds.add(element.getXObject().getObjNum()); // writer needs to be newly initialized when entering a new content stream // see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest) - try(ElementWriter formWriter = new ElementWriter()) { + try (ElementWriter formWriter = new ElementWriter()) { reader.formBegin(); formWriter.begin(element.getXObject()); diff --git a/src/test/java/com/iqser/red/pdftronlogic/commons/InvisibleElementRemovalServiceTest.java b/src/test/java/com/iqser/red/pdftronlogic/commons/InvisibleElementRemovalServiceTest.java index 760d14b..8f2dd0b 100644 --- a/src/test/java/com/iqser/red/pdftronlogic/commons/InvisibleElementRemovalServiceTest.java +++ b/src/test/java/com/iqser/red/pdftronlogic/commons/InvisibleElementRemovalServiceTest.java @@ -6,6 +6,7 @@ import static org.assertj.core.api.AssertionsForClassTypes.assertThat; import java.io.FileInputStream; import java.io.FileOutputStream; +import lombok.extern.slf4j.Slf4j; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -14,6 +15,7 @@ import com.pdftron.pdf.PDFNet; import lombok.SneakyThrows; @SuppressWarnings("PMD") +@Slf4j class InvisibleElementRemovalServiceTest { InvisibleElementRemovalService invisibleElementRemovalService; @@ -101,8 +103,19 @@ class InvisibleElementRemovalServiceTest { String result = PdfTextExtraction.extractAllTextFromDocument(in); assertThat(result).contains("#1 Dark", "#13 Yellow", - "Ut wisi enim ad minim veniam, quis nostrud exerci tation ullamcorper suscipit lobortis nisl ut aliquip\n" + "ex ea commodo consequat. Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie\n" + "consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim\n" + "qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi."); - assertThat(result).doesNotContain("Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut\n" + "labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et\n" + "ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem\n" + "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et\n" + "dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea\n" + "rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum\n" + "dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore\n" + "magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet\n" + "clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. "); + "Ut wisi enim ad minim veniam, quis nostrud exerci tation ullamcorper suscipit lobortis nisl ut aliquip\n" + + "ex ea commodo consequat. Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie\n" + + "consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim\n" + + "qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi."); + assertThat(result).doesNotContain("Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut\n" + + "labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et\n" + + "ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem\n" + + "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et\n" + + "dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea\n" + + "rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum\n" + + "dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore\n" + + "magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet\n" + + "clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. "); } } @@ -176,4 +189,4 @@ class InvisibleElementRemovalServiceTest { } -} \ No newline at end of file +}