RED-9746: Document hardly editable

* revert quadtree lookup, since the lib does not seem to work reliably, also, no significant speed boost
* check each individual glyph instead of only a text run and remember past overlaps in glyph
* added logic to extract all glyphs exactly
This commit is contained in:
Kilian Schuettler 2024-08-15 20:07:59 +02:00
parent f4538648ff
commit aa3823c9db
22 changed files with 886 additions and 231 deletions

View File

@ -28,8 +28,7 @@ dependencies {
api("org.projectlombok:lombok:1.18.30")
api("com.google.guava:guava:33.0.0-jre")
api("com.pdftron:PDFNet:10.11.0")
implementation("org.locationtech.jts:jts-core:1.19.0")
implementation("net.sourceforge.lept4j:lept4j:1.19.1")
testImplementation("net.sourceforge.lept4j:lept4j:1.19.1")
testImplementation("org.junit.jupiter:junit-jupiter:5.10.2")
testImplementation("org.assertj:assertj-core:3.24.2")
testImplementation("org.mockito:mockito-core:5.2.0")

View File

@ -2,28 +2,75 @@ package com.iqser.red.pdftronlogic.commons;
import static com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService.TOLERANCE;
import java.awt.Shape;
import java.awt.geom.Rectangle2D;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
import lombok.experimental.UtilityClass;
@UtilityClass
public class ComparisonUtils {
public static Rectangle2D getPaddedRectangle(ElementFeatures elementFeatures) {
Rectangle2D inner = elementFeatures.getBoundingBox();
public Rectangle2D shrinkRectangle(Rectangle2D inner) {
//To address inconsistencies in the calculation of the bounding box we slightly shrink the inner rectangle
double x_with_tolerance = inner.getX() >= 0 ? inner.getX() + TOLERANCE : inner.getX() - TOLERANCE;
double y_with_tolerance = inner.getY() >= 0 ? inner.getY() + TOLERANCE : inner.getY() - TOLERANCE;
double height_with_tolerance = inner.getHeight() - (2 * TOLERANCE);
double width_with_tolerance = inner.getWidth() - (2 * TOLERANCE);
return new Rectangle2D.Double(x_with_tolerance, y_with_tolerance, width_with_tolerance, height_with_tolerance);
return shrinkRectangle(inner, TOLERANCE);
}
/**
 * Shrinks a rectangle by {@code tolerance} on every side.
 * <p>
 * If shrinking would leave a width or height of at most {@code 0.1}, that dimension is clamped to
 * {@code 0.1} and the clamped extent is centred on the original rectangle, so the result never
 * drifts away from the rectangle it was derived from.
 *
 * @param rect      the rectangle to shrink
 * @param tolerance the amount removed from each side
 * @return a new rectangle; {@code rect} itself is not modified
 */
public Rectangle2D shrinkRectangle(Rectangle2D rect, double tolerance) {

    final double minSize = 1e-1;

    double newX = rect.getX() + tolerance;
    double newY = rect.getY() + tolerance;
    double newWidth = rect.getWidth() - 2 * tolerance;
    double newHeight = rect.getHeight() - 2 * tolerance;

    if (newWidth <= minSize) {
        newWidth = minSize;
        // keep the clamped extent centred on the original rectangle
        newX = rect.getCenterX() - newWidth / 2;
    }
    if (newHeight <= minSize) {
        newHeight = minSize;
        newY = rect.getCenterY() - newHeight / 2;
    }
    return new Rectangle2D.Double(newX, newY, newWidth, newHeight);
}
/**
 * Pads the rectangle on every side by the shared {@code TOLERANCE} constant.
 * Delegates to {@link #padRectangle(Rectangle2D, double)}.
 *
 * @param inner the rectangle to enlarge
 * @return the padded rectangle
 */
public Rectangle2D padRectangle(Rectangle2D inner) {
return padRectangle(inner, TOLERANCE);
}
/**
 * Enlarges a rectangle by {@code tolerance} on every side.
 * <p>
 * A resulting width or height that is not positive (possible with a negative tolerance) is replaced
 * by {@code 0.01}; the origin is still moved by {@code tolerance} in that case.
 *
 * @param rect      the rectangle to enlarge
 * @param tolerance the amount added on each side
 * @return a new rectangle; {@code rect} itself is not modified
 */
public Rectangle2D padRectangle(Rectangle2D rect, double tolerance) {

    double paddedWidth = rect.getWidth() + 2 * tolerance;
    double paddedHeight = rect.getHeight() + 2 * tolerance;

    return new Rectangle2D.Double(rect.getX() - tolerance,
                                  rect.getY() - tolerance,
                                  paddedWidth <= 0 ? 1e-2 : paddedWidth,
                                  paddedHeight <= 0 ? 1e-2 : paddedHeight);
}
/**
 * Containment test with tolerance: the inner rectangle is shrunk via
 * {@code shrinkRectangle(inner)} before {@code outer.contains(...)} is evaluated.
 *
 * @param outer the shape that should enclose the rectangle
 * @param inner the rectangle to test
 * @return {@code true} if {@code outer} contains the shrunken rectangle
 */
public boolean almostContains(Shape outer, Rectangle2D inner) {

    return outer.contains(ComparisonUtils.shrinkRectangle(inner));
}
/**
 * Compares two values with tolerance.
 *
 * @return {@code true} if the absolute difference of {@code a} and {@code b} is strictly smaller than {@code TOLERANCE}
 */
public static boolean almostEqual(double a, double b) {
return Math.abs(a - b) < TOLERANCE;
}
}

View File

@ -3,8 +3,11 @@ package com.iqser.red.pdftronlogic.commons;
import java.awt.Color;
import java.awt.geom.AffineTransform;
import java.awt.geom.GeneralPath;
import java.awt.geom.PathIterator;
import java.awt.geom.Rectangle2D;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import com.google.common.primitives.Bytes;
import com.google.common.primitives.Doubles;
@ -21,16 +24,18 @@ import lombok.experimental.UtilityClass;
@UtilityClass
public class Converter {
public GeneralPath convertToGeneralPath(PathData pathData) throws PDFNetException {
public GeneralPath convertToGeneralPath(PathData pathData) {
GeneralPath linePath = new GeneralPath();
Iterator<Double> points = Doubles.asList(pathData.getPoints()).iterator();
Iterator<Double> points = Doubles.asList(pathData.getPoints())
.iterator();
Iterable<Byte> operators = Bytes.asList(pathData.getOperators());
for (var operator : operators) {
switch (operator) {
case PathData.e_moveto -> linePath.moveTo(points.next(), points.next());
case PathData.e_lineto -> linePath.lineTo(points.next(), points.next());
case PathData.e_cubicto -> linePath.curveTo(points.next(), points.next(), points.next(), points.next(), points.next(), points.next());
case PathData.e_conicto -> linePath.quadTo(points.next(), points.next(), points.next(), points.next());
case PathData.e_closepath -> linePath.closePath();
case PathData.e_rect -> {
double x = points.next();
@ -43,14 +48,67 @@ public class Converter {
linePath.lineTo(x, y + h);
linePath.closePath();
}
default -> throw new PDFNetException("Invalid Element Type", 0, "", "", "");
default -> throw new IllegalArgumentException("Invalid Operator Type " + operator);
}
}
return linePath;
}
public GeneralPath convertToGeneralPathAndTransformToInitialUserSpace(PathData pathData, Matrix2D ctm) throws PDFNetException{
/**
 * Converts a Java2D path into a {@link PathData} by mapping every path segment to the
 * corresponding {@code PathData} operator and appending the segment's coordinates.
 *
 * @param linePath the path to convert
 * @return a {@code PathData} created from the collected operators and points
 * @throws IllegalArgumentException if the path iterator reports an unknown segment type
 */
public PathData convertToPathData(GeneralPath linePath) {

    List<Byte> operators = new LinkedList<>();
    List<Double> points = new LinkedList<>();
    // currentSegment() overwrites the leading entries on every call, so a single buffer is enough
    double[] coordinates = new double[6];

    // the iterator has to be advanced explicitly with next(), otherwise isDone() never becomes true
    for (PathIterator pathIterator = linePath.getPathIterator(null); !pathIterator.isDone(); pathIterator.next()) {
        int type = pathIterator.currentSegment(coordinates);

        byte operator;
        int coordinateCount;
        switch (type) {
            case PathIterator.SEG_MOVETO -> {
                operator = (byte) PathData.e_moveto;
                coordinateCount = 2;
            }
            case PathIterator.SEG_LINETO -> {
                operator = (byte) PathData.e_lineto;
                coordinateCount = 2;
            }
            case PathIterator.SEG_QUADTO -> {
                operator = (byte) PathData.e_conicto;
                coordinateCount = 4;
            }
            case PathIterator.SEG_CUBICTO -> {
                operator = (byte) PathData.e_cubicto;
                coordinateCount = 6;
            }
            case PathIterator.SEG_CLOSE -> {
                operator = (byte) PathData.e_closepath;
                coordinateCount = 0;
            }
            default -> throw new IllegalArgumentException("Invalid Segment Type " + type);
        }

        operators.add(operator);
        for (int i = 0; i < coordinateCount; i++) {
            points.add(coordinates[i]);
        }
    }

    // copy by iteration: index-based get(i) on a linked list would be quadratic
    byte[] operatorArr = new byte[operators.size()];
    int operatorIndex = 0;
    for (byte operator : operators) {
        operatorArr[operatorIndex++] = operator;
    }

    double[] pointArr = new double[points.size()];
    int pointIndex = 0;
    for (double point : points) {
        pointArr[pointIndex++] = point;
    }

    return new PathData(true, operatorArr, pointArr);
}
public GeneralPath convertToGeneralPathAndTransformToInitialUserSpace(PathData pathData, Matrix2D ctm) throws PDFNetException {
GeneralPath linePath = Converter.convertToGeneralPath(pathData);
//transform path to initial user space
@ -59,6 +117,7 @@ public class Converter {
return linePath;
}
@SneakyThrows
public static Color convertColor(ColorSpace colorSpace, ColorPt colorPt) {
@ -74,4 +133,15 @@ public class Converter {
return new Rectangle2D.Double(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
}
/**
 * Converts a {@link Matrix2D} into an equivalent {@link AffineTransform}.
 *
 * @param textMatrix the matrix to convert, may be {@code null}
 * @return the affine transform, or {@code null} if {@code textMatrix} is {@code null}
 */
@SneakyThrows
public static AffineTransform toAffineTransform(Matrix2D textMatrix) {

    if (textMatrix == null) {
        return null;
    }
    // AffineTransform(m00, m10, m01, m11, m02, m12): m02 is the x translation and m12 the y translation,
    // so the horizontal component (h) has to come before the vertical one (v).
    return new AffineTransform(textMatrix.getA(), textMatrix.getB(), textMatrix.getC(), textMatrix.getD(), textMatrix.getH(), textMatrix.getV());
}
}

View File

@ -1,5 +1,8 @@
package com.iqser.red.pdftronlogic.commons;
import static com.iqser.red.pdftronlogic.commons.VisualizationUtils.drawFeature;
import static com.iqser.red.pdftronlogic.commons.VisualizationUtils.drawRect;
import java.awt.Color;
import java.awt.Shape;
import java.awt.geom.Area;
@ -10,13 +13,16 @@ import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.TreeSet;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatureFactory;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
import com.iqser.red.pdftronlogic.commons.features.GlyphInfo;
import com.iqser.red.pdftronlogic.commons.features.ImageFeatures;
import com.iqser.red.pdftronlogic.commons.features.PathFeatures;
import com.iqser.red.pdftronlogic.commons.features.TextFeatures;
import com.iqser.red.pdftronlogic.commons.lookup.ElementFeatureLookup;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.ColorPt;
@ -26,6 +32,7 @@ import com.pdftron.pdf.ElementBuilder;
import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.GState;
import com.pdftron.pdf.Image;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
@ -155,9 +162,9 @@ public class InvisibleElementRemovalService {
log.info("Start removing invisible Elements");
try (ElementWriter writer = new ElementWriter(); ElementReader reader = new ElementReader()) {
Set<Long> visitedXObjIds = new TreeSet<>();
int pageIndex = 1;
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
Page page = iterator.next();
visitedXObjIds.add(page.getSDFObj().getObjNum());
@ -180,10 +187,9 @@ public class InvisibleElementRemovalService {
context.markedContentStack().clear();
removeOverlappedElements(page, writer, context);
pageIndex++;
}
log.info("Finished removing invisible Elements");
}
log.info("Finished removing invisible Elements");
}
@ -245,17 +251,13 @@ public class InvisibleElementRemovalService {
}
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
boolean nonTransparent = imageElement.getGState().getBlendMode() == GState.e_bl_normal
&& imageElement.getGState().getFillOpacity() == 1
&& imageElement.getGState().getStrokeOpacity() == 1
&& imageElement.getGState().getSoftMask() == null;
if (inClippingPath) {
ImageFeatures image = ElementFeatureFactory.buildImage(imageElement);
if (nonTransparent) {
// calculateOverlaps(context, image, writer);
ImageFeatures imageFeatures = ElementFeatureFactory.buildImage(imageElement);
if (!(imageFeatures.isTransparent() || imageFeatures.isImageMask() || imageFeatures.isSoftMask())) {
calculateOverlaps(context, imageFeatures);
}
context.visibleElements().add(image);
context.visibleElements().add(imageFeatures);
}
if (context.delta() ^ inClippingPath) {
@ -281,7 +283,7 @@ public class InvisibleElementRemovalService {
boolean isTextVisible = isTextRenderedVisibly(gState, textBBox, context);
if (inClippingPath && isTextVisible) {
context.visibleElements().add(ElementFeatureFactory.extractFeatures(textElement));
context.visibleElements().add(ElementFeatureFactory.buildText(textElement, context.delta()));
}
if (!context.delta()) {
if (inClippingPath && isTextVisible) {
@ -385,7 +387,7 @@ public class InvisibleElementRemovalService {
if (inClippingPath) {
if (isFilledAndNonTransparent(pathElement)) {
calculateOverlaps(context, pathFeatures, writer);
calculateOverlaps(context, pathFeatures);
}
context.visibleElements().add(ElementFeatureFactory.extractFeatures(pathElement));
}
@ -408,9 +410,9 @@ public class InvisibleElementRemovalService {
}
private void calculateOverlaps(InvisibleElementRemovalContext context, ElementFeatures elementFeatures, ElementWriter writer) {
private void calculateOverlaps(InvisibleElementRemovalContext context, ElementFeatures elementFeatures) {
List<ElementFeatures> currentOverlappedElements = context.visibleElements().findAlmostContained(elementFeatures);
List<ElementFeatures> currentOverlappedElements = context.visibleElements().findOverlapped(elementFeatures);
context.overlappedElements().addAll(currentOverlappedElements);
context.visibleElements().removeAll(currentOverlappedElements);
}
@ -423,7 +425,7 @@ public class InvisibleElementRemovalService {
if (context.delta()) {
// green for element removed due to overlapping
context.overlappedElements()
.forEach(feature -> drawBBox(writer, feature.getBoundingBox(), "#00FF00"));
.forEach(feature -> drawFeature(writer, feature, Color.GREEN));
context.overlappedElements().clear();
}
processOverlappedElements(writer, context);
@ -471,14 +473,19 @@ public class InvisibleElementRemovalService {
private static void removeOverlappedElement(ElementWriter writer, InvisibleElementRemovalContext context, Element element) throws PDFNetException {
if (context.overlappedElements.matchesAny(ElementFeatureFactory.extractFeatures(element))) {
Optional<ElementFeatures> optionalElementMatch = context.overlappedElements()
.anyMatch(ElementFeatureFactory.extractFeatures(element));
if (optionalElementMatch.isPresent()) {
context.overlappedElements().remove(optionalElementMatch.get());
if (element.getType() == 3 && element.hasTextMatrix()) {
/*
PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element.
hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands.
Therefore, the position of a following Tj is affected by not writing the first Element.
This is why, we write only the Tm command:
*/
writer.writeGStateChanges(element);
writer.writeGStateChanges(element);
}
} else {
writer.writeElement(element);
}
@ -552,7 +559,7 @@ public class InvisibleElementRemovalService {
return true;
}
Area backgroundArea = mergeLinePathsToArea(pathElementsByColor);
return !almostContains(backgroundArea, Converter.toRectangle2D(textBBox));
return !ComparisonUtils.almostContains(backgroundArea, Converter.toRectangle2D(textBBox));
}
@ -562,10 +569,7 @@ public class InvisibleElementRemovalService {
var result = new ArrayList<PathFeatures>();
context.visibleElements().findIntersecting(textBBox)
.forEach(element -> {
if (element instanceof PathFeatures pathFeatures
&& pathFeatures.isBackground(textBBox)
&& !pathFeatures.getFillColor().equals(Color.WHITE)
&& pathFeatures.isFilled()) {
if (element instanceof PathFeatures pathFeatures && !pathFeatures.getFillColor().equals(Color.WHITE) && pathFeatures.isFilled()) {
result.add(pathFeatures);
}
});
@ -585,40 +589,12 @@ public class InvisibleElementRemovalService {
}
private boolean almostContains(Shape outer, Rectangle2D inner) {
//To address inconsistencies in the calculation of the bounding box we slightly shrink the inner rectangle
double x_with_tolerance = inner.getX() >= 0 ? inner.getX() + TOLERANCE : inner.getX() - TOLERANCE;
double y_with_tolerance = inner.getY() >= 0 ? inner.getY() + TOLERANCE : inner.getY() - TOLERANCE;
double height_with_tolerance = inner.getHeight() - (2 * TOLERANCE);
double width_with_tolerance = inner.getWidth() - (2 * TOLERANCE);
Rectangle2D innerRect = new Rectangle2D.Double(x_with_tolerance, y_with_tolerance, width_with_tolerance, height_with_tolerance);
return outer.contains(innerRect);
}
private boolean isFilledAndNonTransparent(Element element) throws PDFNetException {
return element.isFilled() && element.getGState().getFillOpacity() == 1;
}
@SneakyThrows
private void drawBBox(ElementWriter writer, Rectangle2D r, String hexcolor) {
try (ColorPt colorPt = new ColorPt(Integer.valueOf(hexcolor.substring(1, 3), 16) / 255d,
Integer.valueOf(hexcolor.substring(3, 5), 16) / 255d,
Integer.valueOf(hexcolor.substring(5, 7), 16) / 255d); ElementBuilder eb = new ElementBuilder()) {
Element rect = eb.createRect(r.getX(), r.getY(), r.getWidth(), r.getHeight());
rect.setPathStroke(true);
rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
rect.getGState().setStrokeColor(colorPt);
writer.writePlacedElement(rect);
}
}
@Builder
private record InvisibleElementRemovalContext(
boolean removePaths,

View File

@ -0,0 +1,71 @@
package com.iqser.red.pdftronlogic.commons;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatureFactory;
import com.iqser.red.pdftronlogic.commons.features.ImageFeatures;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.sdf.Obj;
import lombok.experimental.UtilityClass;
/**
 * Collects {@link ImageFeatures} for the image elements of a PDF, grouped per page.
 */
@UtilityClass
public class PdfImageExtraction {

    /**
     * Opens the PDF from the given stream and gathers the image features of every page.
     *
     * @param fileStream stream containing the PDF document
     * @return one list per page, in page-iterator order, holding the features of the images found on that page
     */
    public List<List<ImageFeatures>> extractImages(InputStream fileStream) throws IOException, PDFNetException {

        try (PDFDoc pdfDoc = new PDFDoc(fileStream); ElementReader reader = new ElementReader()) {
            List<List<ImageFeatures>> imagesPerPage = new ArrayList<>(pdfDoc.getPageCount());

            for (var pages = pdfDoc.getPageIterator(); pages.hasNext(); ) {
                Page page = pages.next();

                // object numbers already entered on this page; seeded with the page object itself
                Set<Long> visitedXObjIds = new HashSet<>();
                visitedXObjIds.add(page.getSDFObj().getObjNum());

                List<ImageFeatures> imagesOnPage = new LinkedList<>();
                reader.begin(page);
                processElements(reader, imagesOnPage, visitedXObjIds);
                reader.end();

                imagesPerPage.add(imagesOnPage);
            }
            return imagesPerPage;
        }
    }


    /**
     * Walks the elements delivered by the reader, recording image and inline-image elements and
     * descending into each form XObject whose object number has not been visited yet.
     */
    private void processElements(ElementReader reader, List<ImageFeatures> imageFeaturesOnPage, Set<Long> visitedXObjIds) throws PDFNetException {

        Element element;
        while ((element = reader.next()) != null) {
            int elementType = element.getType();

            if (elementType == Element.e_image || elementType == Element.e_inline_image) {
                imageFeaturesOnPage.add(ElementFeatureFactory.buildImage(element));
            } else if (elementType == Element.e_form) {
                Obj formObj = element.getXObject();
                // add() reports whether the id was new, which doubles as the "not yet visited" check
                if (visitedXObjIds.add(formObj.getObjNum())) {
                    reader.formBegin();
                    processElements(reader, imageFeaturesOnPage, visitedXObjIds);
                    reader.end();
                }
            }
        }
    }

}

View File

@ -3,21 +3,30 @@ package com.iqser.red.pdftronlogic.commons;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatureFactory;
import com.iqser.red.pdftronlogic.commons.features.TextFeatures;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
import com.pdftron.pdf.TextExtractor;
import com.pdftron.sdf.Obj;
import lombok.experimental.UtilityClass;
@UtilityClass
public class PdfTextExtraction {
private static String execute(PDFDoc pdfDoc) throws PDFNetException{
try(TextExtractor extractor = new TextExtractor()) {
private static String execute(PDFDoc pdfDoc) throws PDFNetException {
try (TextExtractor extractor = new TextExtractor()) {
List<String> texts = new ArrayList<>();
PageIterator iterator = pdfDoc.getPageIterator();
@ -32,13 +41,65 @@ public class PdfTextExtraction {
}
}
/**
 * Reads a PDF from the given stream and returns the text produced by {@code execute} for it.
 * The document opened here is closed again before the method returns.
 *
 * @param fileStream stream containing the PDF document
 * @return the extracted text
 */
public static String extractAllTextFromDocument(InputStream fileStream) throws IOException, PDFNetException {

    // the document is created here, so it has to be released here as well
    try (PDFDoc pdfDoc = new PDFDoc(fileStream)) {
        return execute(pdfDoc);
    }
}
/**
 * Returns the text produced by {@code execute} for an already opened document.
 * The document is not closed by this method.
 *
 * @param pdfDoc the document to read
 * @return the extracted text
 */
public static String extractAllTextFromDocument(PDFDoc pdfDoc) throws IOException, PDFNetException {
return execute(pdfDoc);
}
/**
 * Opens the PDF from the given stream and gathers the text features of every page.
 *
 * @param fileStream      stream containing the PDF document
 * @param includePathData forwarded to {@code ElementFeatureFactory.buildText} for every text element
 * @return one list per page, in page-iterator order, holding the text features found on that page
 */
public static List<List<TextFeatures>> extractAllGlyphsFromDocument(InputStream fileStream, boolean includePathData) throws IOException, PDFNetException {

    try (PDFDoc pdfDoc = new PDFDoc(fileStream); ElementReader reader = new ElementReader()) {
        List<List<TextFeatures>> glyphsPerPages = new ArrayList<>(pdfDoc.getPageCount());

        for (var pages = pdfDoc.getPageIterator(); pages.hasNext(); ) {
            Page page = pages.next();

            // object numbers already entered on this page; seeded with the page object itself
            Set<Long> visitedXObjIds = new HashSet<>();
            visitedXObjIds.add(page.getSDFObj().getObjNum());

            List<TextFeatures> textOnPage = new LinkedList<>();
            reader.begin(page);
            processElements(reader, textOnPage, visitedXObjIds, includePathData);
            reader.end();

            glyphsPerPages.add(textOnPage);
        }
        return glyphsPerPages;
    }
}
/**
 * Walks the elements delivered by the reader, recording text elements and descending into each
 * form XObject whose object number has not been visited yet.
 */
private static void processElements(ElementReader reader, List<TextFeatures> textFeaturesOnPage, Set<Long> visitedXObjIds, boolean includePathData) throws PDFNetException {

    Element element;
    while ((element = reader.next()) != null) {
        int elementType = element.getType();

        if (elementType == Element.e_text) {
            textFeaturesOnPage.add(ElementFeatureFactory.buildText(element, includePathData));
        } else if (elementType == Element.e_form) {
            Obj formObj = element.getXObject();
            // add() reports whether the id was new, which doubles as the "not yet visited" check
            if (visitedXObjIds.add(formObj.getObjNum())) {
                reader.formBegin();
                processElements(reader, textFeaturesOnPage, visitedXObjIds, includePathData);
                reader.end();
            }
        }
    }
}
}

View File

@ -0,0 +1,91 @@
package com.iqser.red.pdftronlogic.commons;
import java.awt.Color;
import java.awt.geom.Rectangle2D;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
import com.iqser.red.pdftronlogic.commons.features.GlyphInfo;
import com.iqser.red.pdftronlogic.commons.features.TextFeatures;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.ColorPt;
import com.pdftron.pdf.ColorSpace;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementBuilder;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.PathData;
import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;
/**
 * Helpers that write coloured marker shapes for element features through an {@link ElementWriter}.
 */
@UtilityClass
public class VisualizationUtils {

    /**
     * Draws the bounding box of a feature as a stroked rectangle. For text features, every glyph
     * that carries path data is additionally drawn as a filled outline before the rectangle.
     *
     * @param writer   the writer receiving the generated elements
     * @param features the feature to visualize
     * @param color    the colour of the generated elements
     */
    @SneakyThrows
    public static void drawFeature(ElementWriter writer, ElementFeatures features, Color color) {

        try (ElementBuilder builder = new ElementBuilder()) {
            if (features instanceof TextFeatures textFeatures) {
                for (GlyphInfo glyph : textFeatures.getGlyphs()) {
                    if (!glyph.getPathData().isPresent()) {
                        continue;
                    }
                    drawPathData(glyph.getPathData().get(), builder, writer, color);
                }
            }
            drawRect(features.getBoundingBox(), builder, writer, color);
        }
    }


    /**
     * Writes the given path data as a filled, unstroked path (winding fill) in the given colour.
     */
    public static void drawPathData(PathData pathData, ElementBuilder builder, ElementWriter writer, Color color) throws PDFNetException {

        Element glyphPath = builder.createPath(pathData.getPoints(), pathData.getOperators());
        glyphPath.setPathFill(true);
        glyphPath.setPathStroke(false);
        glyphPath.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
        try (ColorPt fillColor = toColorPt(color)) {
            glyphPath.getGState().setFillColor(fillColor);
        }
        glyphPath.setWindingFill(true);
        writer.writeElement(glyphPath);
    }


    /**
     * Draws the rectangle as an unfilled outline; same as calling the five-argument overload with {@code fill = false}.
     */
    public static void drawRect(Rectangle2D rectangle2D, ElementBuilder builder, ElementWriter writer, Color color) throws PDFNetException {

        drawRect(rectangle2D, builder, writer, color, false);
    }


    /**
     * Draws a rectangle marker.
     * <p>
     * Without {@code fill}, only an outline in {@code color} with line width 0.5 is written. With
     * {@code fill}, the outline is black with line width 0.1 and a second rectangle, inset by the
     * line width on every side, is written with {@code color} as its fill colour.
     */
    public static void drawRect(Rectangle2D rectangle2D, ElementBuilder builder, ElementWriter writer, Color color, boolean fill) throws PDFNetException {

        Color outlineColor = fill ? Color.BLACK : color;
        double lineWidth = fill ? 0.1 : 0.5;

        Element outline = builder.createRect(rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight());
        outline.setPathFill(false);
        outline.setPathStroke(true);
        outline.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
        try (ColorPt strokeColor = toColorPt(outlineColor)) {
            outline.getGState().setStrokeColor(strokeColor);
        }
        outline.getGState().setLineWidth(lineWidth);
        writer.writeElement(outline);

        if (!fill) {
            return;
        }

        Element interior = builder.createRect(rectangle2D.getX() + lineWidth,
                                              rectangle2D.getY() + lineWidth,
                                              rectangle2D.getWidth() - 2 * lineWidth,
                                              rectangle2D.getHeight() - 2 * lineWidth);
        interior.setPathFill(true);
        interior.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
        try (ColorPt fillColor = toColorPt(color)) {
            interior.getGState().setFillColor(fillColor);
        }
        writer.writeElement(interior);
    }


    /**
     * Creates a {@link ColorPt} from the colour components of an AWT colour; the caller closes it.
     */
    private static ColorPt toColorPt(Color color) throws PDFNetException {

        float[] components = color.getColorComponents(null);
        return new ColorPt(components[0], components[1], components[2]);
    }

}

View File

@ -1,12 +1,27 @@
package com.iqser.red.pdftronlogic.commons.features;
import com.iqser.red.pdftronlogic.commons.Converter;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.Element;
import java.awt.geom.GeneralPath;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import com.iqser.red.pdftronlogic.commons.Converter;
import com.pdftron.common.Matrix2D;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.CharData;
import com.pdftron.pdf.CharIterator;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.Font;
import com.pdftron.pdf.GState;
import com.pdftron.pdf.PathData;
import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;
@UtilityClass
public class ElementFeatureFactory {
public static ElementFeatures extractFeatures(Element element) throws PDFNetException {
public ElementFeatures extractFeatures(Element element) throws PDFNetException {
return switch (element.getType()) {
case Element.e_path -> buildPath(element);
@ -19,19 +34,19 @@ public class ElementFeatureFactory {
}
public static ImageFeatures buildImageWithHash(Element element, String hashObject) throws PDFNetException {
public ImageFeatures buildImageWithHash(Element element, String hashObject) throws PDFNetException {
return buildImageBase(element).hashOfImage(hashObject).build();
}
public static ImageFeatures buildImage(Element element) throws PDFNetException {
public ImageFeatures buildImage(Element element) throws PDFNetException {
return buildImageBase(element).build();
}
public static FormFeatures buildForm(Element element) throws PDFNetException {
public FormFeatures buildForm(Element element) throws PDFNetException {
try (var bbox = element.getBBox();) {
return FormFeatures.builder()
@ -44,9 +59,12 @@ public class ElementFeatureFactory {
}
private static ImageFeatures.ImageFeaturesBuilder<?, ?> buildImageBase(Element element) throws PDFNetException {
private ImageFeatures.ImageFeaturesBuilder<?, ?> buildImageBase(Element element) throws PDFNetException {
try (var bbox = element.getBBox();) {
boolean transparent = element.getGState().getBlendMode() != GState.e_bl_normal
|| element.getGState().getFillOpacity() > 1
|| element.getGState().getStrokeOpacity() > 1;
return ImageFeatures.builder()
.elementType(element.getType())
.boundingBox(Converter.toRectangle2D(bbox))
@ -55,26 +73,40 @@ public class ElementFeatureFactory {
.width(element.getImageWidth())
.renderingIntent(element.getImageRenderingIntent())
.componentNum(element.getComponentNum())
.bitsPerComponent(element.getBitsPerComponent());
.bitsPerComponent(element.getBitsPerComponent())
.imageMask(element.isImageMask())
.softMask(element.getGState().getSoftMask() != null)
.transparent(transparent);
}
}
public static TextFeatures buildText(Element element) throws PDFNetException {
/**
 * Builds the text features of an element without glyph path data.
 * Delegates to {@link #buildText(Element, boolean)} with {@code includePathData = false}.
 */
public TextFeatures buildText(Element element) throws PDFNetException {
return buildText(element, false);
}
/**
 * Builds the text features of an element: element type, bounding box, text string, font type,
 * font size and the glyph information of the element.
 * <p>
 * Use {@code includePathData = true} when trying to draw the glyphs, see GlyphExtractionTest.
 *
 * @param element         the text element to describe
 * @param includePathData whether the extracted glyphs should carry their path data
 */
public TextFeatures buildText(Element element, boolean includePathData) throws PDFNetException {

    try (var bbox = element.getBBox()) {
        var features = TextFeatures.builder();
        features.elementType(element.getType());
        features.boundingBox(Converter.toRectangle2D(bbox));
        features.text(element.getTextString());
        features.font(element.getGState().getFont().getType());
        features.fontsize(element.getGState().getFontSize());
        features.glyphs(extractGlyphInfo(element, includePathData));
        return features.build();
    }
}
public static PathFeatures buildPath(Element element) throws PDFNetException {
public PathFeatures buildPath(Element element) throws PDFNetException {
try (var bbox = element.getBBox(); var ctm = element.getCTM(); var fillColor = element.getGState().getFillColor(); var strokeColor = element.getGState().getStrokeColor()) {
return PathFeatures.builder()
@ -92,4 +124,68 @@ public class ElementFeatureFactory {
}
}
/**
 * Extracts one {@link GlyphInfo} per non-whitespace character of a text element.
 * <p>
 * For every character, the glyph outline is requested from the font, transformed by
 * CTM x text matrix x font matrix, and its bounds are stored as the glyph's bounding box.
 * Returns an empty list when the element has no bounding box or uses a Type 3 font.
 *
 * @param textElement     the text element to analyse
 * @param includePathData whether each glyph should also keep its {@link PathData}
 * @return the extracted glyphs in iteration order
 */
@SneakyThrows
private List<GlyphInfo> extractGlyphInfo(Element textElement, boolean includePathData) {

    assert textElement != null && textElement.getType() == Element.e_text;

    // the bounding box is only needed for the null check; release it right away
    try (var bbox = textElement.getBBox()) {
        if (bbox == null) {
            return Collections.emptyList();
        }
    }

    Font font = textElement.getGState().getFont();

    if (font.getType() == Font.e_Type3) {
        // type 3 fonts seem to be much more difficult, one must use font.getType3GlyphStream and font.getType3FontMatrix instead
        // couldn't find much information except this post https://groups.google.com/g/pdfnet-sdk/c/SvhMflbtQho
        // will implement this when necessary
        return Collections.emptyList();
    }

    List<GlyphInfo> glyphs = new ArrayList<>();

    CharIterator charIterator = textElement.getCharIterator();
    while (charIterator.hasNext()) {
        CharData charData = charIterator.next();
        long charCode = charData.getCharCode();

        String glyphText = new String(font.mapToUnicode(charCode));

        // an empty unicode mapping must not fail the whole extraction; such glyphs are kept
        if (!glyphText.isEmpty() && Character.isWhitespace(glyphText.charAt(0))) {
            continue;
        }

        // every matrix, including the intermediate ones, is closed after the glyph has been processed
        try (Matrix2D fontMatrix = computeFontMatrix(charData, textElement, font); //
                Matrix2D ctm = textElement.getCTM(); //
                Matrix2D textMatrix = textElement.getTextMatrix(); //
                Matrix2D ctmAndTextMatrix = ctm.multiply(textMatrix); //
                Matrix2D glyphMatrix = ctmAndTextMatrix.multiply(fontMatrix)) {

            PathData pathData = font.getGlyphPath(charCode, true, glyphMatrix);

            // NOTE(review): 6 is presumably PathData.e_closepath - confirm and replace the literal
            if (pathData.getOperators().length == 1 && pathData.getOperators()[0] == 6) {
                // This happens for some chinese characters or whitespaces, don't know why...
                continue;
            }

            GeneralPath glyphPath = Converter.convertToGeneralPath(pathData);

            GlyphInfo.GlyphInfoBuilder glyphInfo = GlyphInfo.builder().unicode(glyphText).bbox(glyphPath.getBounds2D());
            if (includePathData) {
                glyphInfo.pathData(pathData);
            }
            glyphs.add(glyphInfo.build());
        }
    }
    return glyphs;
}
/**
 * Builds the matrix that scales a glyph by font size / units-per-em (additionally by the
 * horizontal scale in percent along x), negates the y scale and translates it to the glyph
 * position reported by {@code charData}.
 */
private Matrix2D computeFontMatrix(CharData charData, Element textElement, Font font) throws PDFNetException {

    GState gState = textElement.getGState();
    double verticalScale = gState.getFontSize() / font.getUnitsPerEm();
    double horizontalScale = (gState.getHorizontalScale() / 100) * verticalScale;

    return new Matrix2D(horizontalScale, 0, 0, -verticalScale, charData.getGlyphX(), charData.getGlyphY());
}
}

View File

@ -2,8 +2,11 @@ package com.iqser.red.pdftronlogic.commons.features;
import static com.iqser.red.pdftronlogic.commons.ComparisonUtils.almostEqual;
import java.awt.Shape;
import java.awt.geom.Rectangle2D;
import com.iqser.red.pdftronlogic.commons.ComparisonUtils;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.SneakyThrows;
@ -24,12 +27,12 @@ public class ElementFeatures {
public boolean matches(ElementFeatures elementFeatures) {
return elementFeatures.getElementType() == elementType && elementFeatures.getBoundingBox() != null && rectsAlmostMatch(elementFeatures.getBoundingBox());
return elementFeatures.getElementType() == elementType && elementFeatures.getBoundingBox() != null && bboxMatches(elementFeatures.getBoundingBox());
}
@SneakyThrows
protected boolean rectsAlmostMatch(Rectangle2D bBox) {
protected boolean bboxMatches(Rectangle2D bBox) {
// To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance
return almostEqual(bBox.getX(), boundingBox.getX()) && //
@ -39,6 +42,12 @@ public class ElementFeatures {
}
/**
 * Returns the shape used when testing whether this element covers another one.
 * This implementation returns the bounding box.
 */
public Shape getOverlapShape() {
return boundingBox;
}
public boolean similar(ElementFeatures elementFeatures) {
return elementFeatures.getElementType() == elementType && elementFeatures.getBoundingBox() != null && areRectsSimilar(elementFeatures.getBoundingBox());
@ -67,10 +76,22 @@ public class ElementFeatures {
}
public boolean almostContains(ElementFeatures features) {
public boolean contains(ElementFeatures features) {
Rectangle2D inner = features.getBoundingBox();
return boundingBox.contains(inner);
return features.containedBy(this);
}
/**
 * Tests whether this element is overlapped by the given element.
 * This implementation delegates to {@code containedBy(overlappingElement)}.
 *
 * @param overlappingElement the element that may cover this one
 */
public boolean testOverlapped(ElementFeatures overlappingElement) {
return containedBy(overlappingElement);
}
/**
 * Checks whether the overlap shape of {@code features} contains this element's bounding box
 * after that box has been shrunk via {@code ComparisonUtils.shrinkRectangle}.
 */
private boolean containedBy(ElementFeatures features) {

    return features.getOverlapShape().contains(ComparisonUtils.shrinkRectangle(boundingBox));
}
}

View File

@ -22,21 +22,21 @@ public class FormFeatures extends ElementFeatures {
public boolean matches(ElementFeatures elementFeatures) {
if (elementFeatures.getClass() != this.getClass()) {
return false;
if (elementFeatures instanceof FormFeatures features) {
return elementFeatures.getElementType() == getElementType()
&& elementFeatures.getBoundingBox() != null
&& (super.bboxMatches(elementFeatures.getBoundingBox())
|| rotationMatches(elementFeatures.getBoundingBox()
.getBounds2D()))
&& xObjectType == features.getXObjectType()
&& dictOrArrayOrStreamLength == features.getDictOrArrayOrStreamLength();
}
return elementFeatures.getElementType() == getElementType()
&& elementFeatures.getBoundingBox() != null
&& (super.rectsAlmostMatch(elementFeatures.getBoundingBox())
|| almostRotateMatches(elementFeatures.getBoundingBox()
.getBounds2D()))
&& xObjectType == ((FormFeatures) elementFeatures).getXObjectType()
&& dictOrArrayOrStreamLength == ((FormFeatures) elementFeatures).getDictOrArrayOrStreamLength();
return false;
}
private boolean almostRotateMatches(Rectangle2D bBox) {
private boolean rotationMatches(Rectangle2D bBox) {
return almostEqual(bBox.getWidth(), getBoundingBox().getHeight()) && //
almostEqual(bBox.getHeight(), getBoundingBox().getWidth());

View File

@ -0,0 +1,70 @@
package com.iqser.red.pdftronlogic.commons.features;
import java.awt.Shape;
import java.awt.geom.Rectangle2D;
import java.util.Optional;
import com.iqser.red.pdftronlogic.commons.ComparisonUtils;
import com.pdftron.pdf.PathData;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Describes a single glyph of a text run: its unicode string, its bounding box and, if available, its path data.
 * A glyph additionally remembers whether it has already been found to be covered by another element.
 */
@Builder
@AllArgsConstructor
@RequiredArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class GlyphInfo {

    @Getter
    final String unicode;
    @Getter
    final Rectangle2D bbox;
    // May be null, it is only exposed wrapped in an Optional.
    final PathData pathData;
    // Sticky flag: once set it is never reset.
    boolean overlapped;
    // The element that caused the overlapped flag to be set.
    ElementFeatures overlappingElement;


    /**
     * Tests whether this glyph is covered by the given element and remembers a positive result.
     * Once a glyph has been marked as overlapped, every later call returns true regardless of the argument.
     *
     * @param overlappingElement the element that potentially covers this glyph
     * @return true if this glyph is, or previously was, found to be covered
     */
    public boolean testOverlapped(ElementFeatures overlappingElement) {

        if (!overlapped) {
            Shape coveringShape = overlappingElement.getOverlapShape();
            if (coveringShape.contains(ComparisonUtils.shrinkRectangle(bbox))) {
                overlapped = true;
                this.overlappingElement = overlappingElement;
            }
        }
        return overlapped;
    }


    /**
     * Two glyphs match if they have the same unicode and their bounding boxes intersect in more than 90% of the smaller box's area.
     *
     * @param glyph2 the glyph to compare with
     * @return true if both glyphs are considered the same
     */
    public boolean matches(GlyphInfo glyph2) {

        if (!unicode.equals(glyph2.unicode)) {
            return false;
        }
        double sharedArea = calculateIntersectedArea(glyph2.bbox, bbox);
        double ownArea = bbox.getWidth() * bbox.getHeight();
        double otherArea = glyph2.bbox.getHeight() * glyph2.bbox.getWidth();
        return sharedArea > 0.9 * Math.min(ownArea, otherArea);
    }


    /**
     * @return the path data of this glyph, or an empty Optional if none is set
     */
    public Optional<PathData> getPathData() {

        return Optional.ofNullable(pathData);
    }


    /**
     * Computes the area of the intersection of two rectangles; rectangles that do not intersect yield 0.
     */
    private static double calculateIntersectedArea(Rectangle2D r1, Rectangle2D r2) {

        double left = Math.max(r1.getMinX(), r2.getMinX());
        double right = Math.min(r1.getMaxX(), r2.getMaxX());
        double lower = Math.max(r1.getY(), r2.getY());
        double upper = Math.min(r1.getMaxY(), r2.getMaxY());

        double sharedWidth = Math.max(0, right - left);
        double sharedHeight = Math.max(0, upper - lower);
        return sharedWidth * sharedHeight;
    }

}

View File

@ -19,6 +19,9 @@ public class ImageFeatures extends ElementFeatures {
int renderingIntent;
int componentNum;
int bitsPerComponent;
boolean imageMask;
boolean softMask;
boolean transparent;
String hashOfImage;
@ -33,6 +36,9 @@ public class ImageFeatures extends ElementFeatures {
&& this.renderingIntent == imageFeatures.getRenderingIntent()
&& this.componentNum == imageFeatures.getComponentNum()
&& this.bitsPerComponent == imageFeatures.getBitsPerComponent()
&& this.imageMask == imageFeatures.isImageMask()
&& this.softMask == imageFeatures.isSoftMask()
&& this.transparent == imageFeatures.isTransparent()
&& calculateHammingDistance(imageFeatures.getHashOfImage()) <= HAMMING_DISTANCE_THRESHOLD;
}
return false;

View File

@ -1,24 +1,19 @@
package com.iqser.red.pdftronlogic.commons.features;
import static com.iqser.red.pdftronlogic.commons.ComparisonUtils.getPaddedRectangle;
import java.awt.Color;
import java.awt.Shape;
import java.awt.geom.GeneralPath;
import java.awt.geom.Rectangle2D;
import com.pdftron.pdf.Rect;
import lombok.AccessLevel;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
@EqualsAndHashCode(callSuper = true)
@Getter
@SuperBuilder
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
@EqualsAndHashCode(callSuper = true)
public class PathFeatures extends ElementFeatures {
boolean clippingPath;
@ -47,26 +42,10 @@ public class PathFeatures extends ElementFeatures {
}
public boolean matchesFillColor(Color color) {
return color.equals(fillColor);
}
@SneakyThrows
public boolean isBackground(Rect area) {
return filled && //
getBoundingBox().intersects(area.getX1(), area.getY1(), area.getWidth(), area.getHeight());
}
@Override
public boolean almostContains(ElementFeatures elementFeatures) {
public Shape getOverlapShape() {
Rectangle2D innerRect = getPaddedRectangle(elementFeatures);
return linePath.contains(innerRect);
return linePath;
}
}

View File

@ -2,7 +2,13 @@ package com.iqser.red.pdftronlogic.commons.features;
import static com.iqser.red.pdftronlogic.commons.ComparisonUtils.almostEqual;
import java.util.ArrayList;
import java.util.List;
import com.iqser.red.pdftronlogic.commons.ComparisonUtils;
import lombok.AccessLevel;
import lombok.Builder;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
@ -18,19 +24,45 @@ public class TextFeatures extends ElementFeatures {
String text;
int font;
double fontsize;
@Builder.Default
List<GlyphInfo> glyphs = new ArrayList<>();
@Override
public boolean matches(ElementFeatures element) {
if (element instanceof TextFeatures textFeaturesElement) {
return super.matches(textFeaturesElement) //
return super.matches(textFeaturesElement)//
&& text.equals(textFeaturesElement.getText()) //
&& font == textFeaturesElement.getFont() //
&& font == textFeaturesElement.getFont()//
&& almostEqual(fontsize, textFeaturesElement.getFontsize());
}
return false;
}
/**
 * Compares the glyphs of this text run pairwise, by position, with the glyphs of the given text run.
 *
 * @param textFeaturesElement the text run to compare with
 * @return true if both runs have the same number of glyphs and every glyph matches its counterpart at the same index
 */
private boolean glyphsMatch(TextFeatures textFeaturesElement) {

    List<GlyphInfo> otherGlyphs = textFeaturesElement.getGlyphs();
    int glyphCount = glyphs.size();
    if (glyphCount != otherGlyphs.size()) {
        return false;
    }

    // Advance as long as the glyph pairs match; reaching the end means every pair matched.
    int index = 0;
    while (index < glyphCount && glyphs.get(index).matches(otherGlyphs.get(index))) {
        index++;
    }
    return index == glyphCount;
}
/**
 * Tests whether this text run is covered by the given element.
 * The run counts as covered if its whole bounding box is contained by the element, or if every single glyph is
 * covered. Glyphs remember earlier positive results, so a run may be covered piecewise by several elements
 * that are tested one after another.
 *
 * @param overlappingElement the element that potentially covers this text run
 * @return true if the whole run or each of its glyphs is covered
 */
@Override
public boolean testOverlapped(ElementFeatures overlappingElement) {

    if (super.testOverlapped(overlappingElement)) {
        return true;
    }

    // Every glyph must be tested against the element, even after a glyph turned out not to be covered:
    // the glyphs memoise positive results, and a short-circuiting check (e.g. Stream.allMatch) would skip the
    // remaining glyphs and thereby lose overlaps when the covering elements arrive in a different order than the glyphs.
    boolean allGlyphsOverlapped = !glyphs.isEmpty();
    for (GlyphInfo glyph : glyphs) {
        allGlyphsOverlapped &= glyph.testOverlapped(overlappingElement);
    }
    return allGlyphsOverlapped;
}
}

View File

@ -1,37 +1,43 @@
package com.iqser.red.pdftronlogic.commons.lookup;
import org.locationtech.jts.index.ItemVisitor;
import java.util.Optional;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
@RequiredArgsConstructor
public class AnyMatchVisitor implements ItemVisitor {
public class AnyMatchVisitor implements ElementFeatureVisitor {
private final ElementFeatures queryFeatures;
private boolean anyMatch = false;
@Getter
private ElementFeatures match;
public boolean hasAnyMatch() {
public Optional<ElementFeatures> getAnyMatch() {
return anyMatch;
return Optional.ofNullable(match);
}
@Override
public void visitItem(Object o) {
public void visitItem(ElementFeatures features) {
if (anyMatch) {
if (hasAnyMatch()) {
return;
}
if (o instanceof ElementFeatures features) {
if (queryFeatures.matches(features)) {
anyMatch = true;
}
if (queryFeatures.matches(features)) {
match = features;
}
}
private boolean hasAnyMatch() {
return getAnyMatch().isPresent();
}
}

View File

@ -1,104 +1,101 @@
package com.iqser.red.pdftronlogic.commons.lookup;
import static com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService.TOLERANCE;
import java.awt.geom.Rectangle2D;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.function.Consumer;
import java.util.function.Predicate;
import org.locationtech.jts.geom.Envelope;
import org.locationtech.jts.index.quadtree.Quadtree;
import com.iqser.red.pdftronlogic.commons.Converter;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
import com.pdftron.pdf.Rect;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
@Getter
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ElementFeatureLookup {
/*
This class looks a bit unusual and is built around visitors because it was originally backed by the quadtree implementation by locationtech, which is queried with rectangles and visitors.
Unfortunately, there were always edge cases where the quadtree lost a few elements, making it completely unusable. Furthermore, it did not even speed up the algorithm all that much.
*/
Quadtree quadTree = new Quadtree();
Set<ElementFeatures> allElements = new HashSet<>();
public void add(ElementFeatures elementFeatures) {
quadTree.insert(envelop(elementFeatures), elementFeatures);
allElements.add(elementFeatures);
}
public void remove(ElementFeatures elementFeatures) {
quadTree.remove(envelop(elementFeatures), elementFeatures);
allElements.remove(elementFeatures);
}
public boolean matchesAny(ElementFeatures elementFeatures) {
public Optional<ElementFeatures> anyMatch(ElementFeatures elementFeatures) {
AnyMatchVisitor visitor = new AnyMatchVisitor(elementFeatures);
quadTree.query(queryEnvelop(elementFeatures), visitor);
return visitor.hasAnyMatch();
forEach(visitor::visitItem);
return visitor.getAnyMatch();
}
/**
 * Collects all stored elements that satisfy the given predicate.
 *
 * @param predicate the condition an element has to fulfil
 * @return the matching elements
 */
@SneakyThrows
public List<ElementFeatures> query(Predicate<ElementFeatures> predicate) {

    PredicateItemVisitor matchCollector = new PredicateItemVisitor(predicate);
    forEach(element -> matchCollector.visitItem(element));
    return matchCollector.getMatchingFeatures();
}
/**
 * Finds all stored elements whose bounding box intersects the given rectangle.
 *
 * @param bbox the area to search in
 * @return the elements intersecting the area
 */
@SneakyThrows
public List<ElementFeatures> findIntersecting(Rect bbox) {

    Rectangle2D searchArea = Converter.toRectangle2D(bbox);
    Predicate<ElementFeatures> intersectsSearchArea = candidate -> candidate.getBoundingBox().intersects(searchArea);
    return query(intersectsSearchArea);
}
/**
 * Finds all stored elements that are covered by the given element.
 *
 * @param overlappingElement the element that potentially covers other elements
 * @return the elements reported as overlapped by the overlap visitor
 */
public List<ElementFeatures> findOverlapped(ElementFeatures overlappingElement) {

    OverlapVisitor overlapCollector = new OverlapVisitor(overlappingElement);
    forEach(candidate -> overlapCollector.visitItem(candidate));
    return overlapCollector.getOverlappedElementFeatures();
}
public void forEach(Consumer<ElementFeatures> consumer) {
quadTree.queryAll()
.forEach(consumer);
allElements.forEach(consumer);
}
public void clear() {
forEach(this::remove);
}
public List<ElementFeatures> findAlmostContained(ElementFeatures elementFeatures) {
PredicateItemVisitor visitor = new PredicateItemVisitor(elementFeatures::almostContains);
quadTree.query(queryEnvelop(elementFeatures), visitor);
return visitor.getMatchingFeatures();
}
public List<ElementFeatures> query(ElementFeatures elementFeatures, Predicate<ElementFeatures> predicate) {
PredicateItemVisitor visitor = new PredicateItemVisitor(predicate);
quadTree.query(queryEnvelop(elementFeatures), visitor);
return visitor.getMatchingFeatures();
}
private static Envelope envelop(ElementFeatures elementFeatures) {
Rectangle2D r = elementFeatures.getBoundingBox();
return new Envelope(r.getX(), r.getY(), r.getWidth(), r.getHeight());
}
private static Envelope queryEnvelop(ElementFeatures elementFeatures) {
Rectangle2D r = elementFeatures.getBoundingBox();
return new Envelope(r.getX() - TOLERANCE, r.getY() - TOLERANCE, r.getWidth() + 2 * TOLERANCE, r.getHeight() + 2 * TOLERANCE);
allElements.clear();
}
public boolean isEmpty() {
return quadTree.isEmpty();
return allElements.isEmpty();
}
public int size() {
return quadTree.size();
return allElements.size();
}
@ -111,26 +108,6 @@ public class ElementFeatureLookup {
/**
 * Removes every element of the given list from this lookup.
 *
 * @param currentOverlappedElements the elements to remove
 */
public void removeAll(List<ElementFeatures> currentOverlappedElements) {

    for (ElementFeatures overlappedElement : currentOverlappedElements) {
        remove(overlappedElement);
    }
}
@SneakyThrows
public List<ElementFeatures> query(Rect bbox, Predicate<ElementFeatures> predicate) {
PredicateItemVisitor visitor = new PredicateItemVisitor(predicate);
quadTree.query(new Envelope(bbox.getX1(), bbox.getY1(), bbox.getWidth(), bbox.getHeight()), visitor);
return visitor.getMatchingFeatures();
}
@SneakyThrows
public List<ElementFeatures> findIntersecting(Rect bbox) {
Rectangle2D r = Converter.toRectangle2D(bbox);
PredicateItemVisitor visitor = new PredicateItemVisitor(elementFeatures -> elementFeatures.getBoundingBox().intersects(r));
quadTree.query(new Envelope(r.getX(), r.getY(), r.getWidth(), r.getHeight()), visitor);
return visitor.getMatchingFeatures();
}
}

View File

@ -0,0 +1,9 @@
package com.iqser.red.pdftronlogic.commons.lookup;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
/**
 * Callback that is handed element features one at a time.
 */
public interface ElementFeatureVisitor {
/**
 * Processes a single element.
 *
 * @param features the element to visit
 */
void visitItem(ElementFeatures features);
}

View File

@ -0,0 +1,35 @@
package com.iqser.red.pdftronlogic.commons.lookup;
import java.util.LinkedList;
import java.util.List;
import com.iqser.red.pdftronlogic.commons.ComparisonUtils;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Visitor that collects all visited elements which are covered by a fixed overlapping element.
 */
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class OverlapVisitor implements ElementFeatureVisitor {

    ElementFeatures overlappingElement;
    @Getter
    List<ElementFeatures> overlappedElementFeatures = new LinkedList<>();


    /**
     * Adds the visited element to the result list if it is overlapped by the overlapping element.
     *
     * @param features the element to test
     */
    @Override
    public void visitItem(ElementFeatures features) {

        var candidateArea = ComparisonUtils.padRectangle(features.getBoundingBox());
        var overlappingArea = ComparisonUtils.padRectangle(overlappingElement.getBoundingBox());

        // Cheap pre-check on the padded bounding boxes before the actual overlap test is run.
        if (!candidateArea.intersects(overlappingArea)) {
            return;
        }
        if (features.testOverlapped(overlappingElement)) {
            overlappedElementFeatures.add(features);
        }
    }

}

View File

@ -4,15 +4,13 @@ import java.util.ArrayList;
import java.util.List;
import java.util.function.Predicate;
import org.locationtech.jts.index.ItemVisitor;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
@RequiredArgsConstructor
public class PredicateItemVisitor implements ItemVisitor {
public class PredicateItemVisitor implements ElementFeatureVisitor {
private final Predicate<ElementFeatures> predicate;
@Getter
@ -20,12 +18,10 @@ public class PredicateItemVisitor implements ItemVisitor {
@Override
public void visitItem(Object o) {
public void visitItem(ElementFeatures features) {
if (o instanceof ElementFeatures features) {
if (predicate.test(features)) {
matchingFeatures.add(features);
}
if (predicate.test(features)) {
matchingFeatures.add(features);
}
}

View File

@ -0,0 +1,87 @@
package com.iqser.red.pdftronlogic.commons;
import static com.iqser.red.pdftronlogic.commons.VisualizationUtils.drawPathData;
import static com.iqser.red.pdftronlogic.commons.VisualizationUtils.drawRect;
import java.awt.Color;
import java.io.FileOutputStream;
import java.nio.file.Path;
import java.util.List;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import com.iqser.red.pdftronlogic.commons.features.GlyphInfo;
import com.iqser.red.pdftronlogic.commons.features.ImageFeatures;
import com.iqser.red.pdftronlogic.commons.features.TextFeatures;
import com.pdftron.pdf.ElementBuilder;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.PDFNet;
import com.pdftron.pdf.Page;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
/**
 * Visual debugging aid for the glyph extraction: extracts all glyphs and images of a test document and draws their
 * outlines onto a copy of the document. The test contains no assertions; the result has to be inspected manually.
 */
public class GlyphExtractionTest {

    @BeforeEach
    void createService() {

        PDFNet.initialize(PDFTronConfig.license);
    }


    /**
     * Draws image bounding boxes, text run bounding boxes, glyph outlines and glyph bounding boxes (original and shrunk)
     * as an overlay onto every page and saves the result next to the other temporary test output.
     */
    @Test
    @SneakyThrows
    public void testGlyphExtraction() {

        String file = "files/everyCharIsImage.pdf";

        List<List<TextFeatures>> textsPerPage;
        List<List<ImageFeatures>> imagesPerPage;

        try (var in = this.getClass().getClassLoader().getResourceAsStream(file)) {
            textsPerPage = PdfTextExtraction.extractAllGlyphsFromDocument(in, true);
        }

        try (var in = this.getClass().getClassLoader().getResourceAsStream(file)) {
            imagesPerPage = PdfImageExtraction.extractImages(in);
        }

        // The PDFDoc is part of the try-with-resources so that it is released even if drawing or saving fails.
        try (var in = this.getClass().getClassLoader().getResourceAsStream(file);//
                var out = new FileOutputStream(Path.of("/tmp/").resolve(Path.of(file).getFileName() + "_GLYPHS.pdf").toFile());//
                PDFDoc pdfDoc = new PDFDoc(in)) {

            int pageCount = pdfDoc.getPageCount();
            for (int i = 0; i < pageCount; i++) {
                // Page numbers are 1-based, the extracted lists are 0-based.
                Page page = pdfDoc.getPage(i + 1);
                List<TextFeatures> textFeaturesOnPage = textsPerPage.get(i);
                List<ImageFeatures> imageFeaturesOnPage = imagesPerPage.get(i);

                try (ElementWriter writer = new ElementWriter(); ElementBuilder builder = new ElementBuilder()) {
                    writer.begin(page, ElementWriter.e_overlay, false);

                    for (ImageFeatures imageFeatures : imageFeaturesOnPage) {
                        // Skip images covering (almost) the whole page, they would hide everything else.
                        if (imageFeatures.getBoundingBox().getHeight() * imageFeatures.getBoundingBox().getWidth() >= page.getPageHeight() * page.getPageWidth() * 0.8) {
                            continue;
                        }
                        drawRect(imageFeatures.getBoundingBox(), builder, writer, Color.CYAN, true);
                    }

                    for (TextFeatures textFeatures : textFeaturesOnPage) {
                        drawRect(textFeatures.getBoundingBox(), builder, writer, Color.BLUE);
                        for (GlyphInfo glyph : textFeatures.getGlyphs()) {
                            if (glyph.getPathData().isPresent()) {
                                drawPathData(glyph.getPathData().get(), builder, writer, Color.BLACK);
                            }
                            drawRect(ComparisonUtils.shrinkRectangle(glyph.getBbox()), builder, writer, Color.RED);
                            drawRect(glyph.getBbox(), builder, writer, Color.MAGENTA);
                        }
                    }

                    writer.end();
                }
            }
            pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
        }
    }

}

View File

@ -7,6 +7,7 @@ import java.io.FileInputStream;
import java.io.FileOutputStream;
import lombok.extern.slf4j.Slf4j;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@ -77,9 +78,9 @@ class InvisibleElementRemovalServiceTest {
try (var in = new FileInputStream(deltaResultFileName)) {
String[] text = extractAllTextFromDocument(in).split("\n");
assertThat(text).contains(":Bold S-enantiomer form if two codes are supplied",
"Red : Only observed in laboratory soil studies",
"Green : Observed in both laboratory soil studies and lysimeter leachate",
"Blue : Only observed in lysimeter leachate");
"Red : Only observed in laboratory soil studies",
"Green : Observed in both laboratory soil studies and lysimeter leachate",
"Blue : Only observed in lysimeter leachate");
}
}
@ -102,20 +103,20 @@ class InvisibleElementRemovalServiceTest {
try (var in = new FileInputStream(deltaResultFileName)) {
String result = PdfTextExtraction.extractAllTextFromDocument(in);
assertThat(result).contains("#1 Dark",
"#13 Yellow",
"Ut wisi enim ad minim veniam, quis nostrud exerci tation ullamcorper suscipit lobortis nisl ut aliquip\n" +
"ex ea commodo consequat. Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie\n" +
"consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim\n" +
"qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi.");
assertThat(result).doesNotContain("Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut\n" +
"labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et\n" +
"ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem\n" +
"ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et\n" +
"dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea\n" +
"rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum\n" +
"dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore\n" +
"magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet\n" +
"clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. ");
"#13 Yellow",
"Ut wisi enim ad minim veniam, quis nostrud exerci tation ullamcorper suscipit lobortis nisl ut aliquip\n"
+ "ex ea commodo consequat. Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie\n"
+ "consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim\n"
+ "qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi.");
assertThat(result).doesNotContain("Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut\n"
+ "labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et\n"
+ "ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem\n"
+ "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et\n"
+ "dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea\n"
+ "rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum\n"
+ "dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore\n"
+ "magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet\n"
+ "clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. ");
}
}
@ -189,4 +190,26 @@ class InvisibleElementRemovalServiceTest {
}
/**
 * Runs the invisible element removal on a document in which every character is an image, once with the
 * last flag set to false and once set to true, and expects that the result of the first run contains no text.
 */
@Test
@SneakyThrows
void removeInvisibleElementsWhereEachCharIsImage() {

    String fileName = "files/everyCharIsImage.pdf";
    String resultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL");
    String deltaResultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL_DELTA");

    try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName);
            var out = new FileOutputStream(resultFileName)) {
        invisibleElementRemovalService.removeInvisibleElementsButKeepOcrText(in, out, false);
    }

    // NOTE(review): the delta output is only written, not asserted on; presumably it is meant for manual inspection.
    try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName);
            var out = new FileOutputStream(deltaResultFileName)) {
        invisibleElementRemovalService.removeInvisibleElementsButKeepOcrText(in, out, true);
    }

    try (var in = new FileInputStream(resultFileName)) {
        assertThat(PdfTextExtraction.extractAllTextFromDocument(in)).isBlank();
    }
}
}

View File

@ -15,6 +15,7 @@ import java.util.Optional;
import java.util.concurrent.CompletableFuture;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import com.iqser.red.pdftronlogic.commons.rendering.GhostScriptService;
@ -27,19 +28,21 @@ import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.util.LeptUtils;
@Disabled // requires leptonica and ghostscript to be installed locally
public class VisualEqualityTest {
public static final double SIMILARITY_THRESHOLD = 0.015; // percentage of pixels which differ by more than 10 points in luminance
public static final String LEPTONICA_DIR = "/home/kschuettler/software/leptonica/vcpkg/installed/x64-linux-dynamic/lib/";
GhostScriptService ghostScriptService = new GhostScriptService();
InvisibleElementRemovalService invisibleElementRemovalService = new InvisibleElementRemovalService();
Path stem = Path.of("/tmp/AAA_EQUALITY_TEST/");
Path TEST_OUTPUT_DIR = Path.of("/tmp/AAA_EQUALITY_TEST/");
@BeforeEach
public void setup() {
PDFNet.initialize(PDFTronConfig.license);
System.setProperty("jna.library.path", "/home/kschuettler/software/leptonica/vcpkg/installed/x64-linux-dynamic/lib/");
System.setProperty("jna.library.path", LEPTONICA_DIR);
try (NativeLibrary leptonicaLib = NativeLibrary.getInstance("leptonica")) {
assert leptonicaLib != null;
@ -51,8 +54,8 @@ public class VisualEqualityTest {
@SneakyThrows
public void assertVisualEqualityOfProcessedFile() {
Path file = Path.of("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/ITEM 23_A19022A - Dermal Absorption Human.pdf");
Context context = new Context(stem, new HashMap<>());
Path file = Path.of("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/ITEM 19_A15149AC - Primary Skin Irritation Rabbit.pdf");
Context context = new Context(TEST_OUTPUT_DIR, new HashMap<>());
runForFile(file, context);
@ -65,7 +68,7 @@ public class VisualEqualityTest {
public void assertVisualEqualityOfProcessedFolder() {
Path folder = Path.of("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles");
Context context = new Context(stem, new HashMap<>());
Context context = new Context(TEST_OUTPUT_DIR, new HashMap<>());
Files.walk(folder)
.filter(Files::isRegularFile)