RED-9746: Document hardly editable
* revert quadtree lookup, since the lib does not seem to work reliably, also, no significant speed boost * check each individual glyph instead of only a text run and remember past overlaps in glyph * added logic to extract all glyphs exactly
This commit is contained in:
parent
f4538648ff
commit
aa3823c9db
@ -28,8 +28,7 @@ dependencies {
|
||||
api("org.projectlombok:lombok:1.18.30")
|
||||
api("com.google.guava:guava:33.0.0-jre")
|
||||
api("com.pdftron:PDFNet:10.11.0")
|
||||
implementation("org.locationtech.jts:jts-core:1.19.0")
|
||||
implementation("net.sourceforge.lept4j:lept4j:1.19.1")
|
||||
testImplementation("net.sourceforge.lept4j:lept4j:1.19.1")
|
||||
testImplementation("org.junit.jupiter:junit-jupiter:5.10.2")
|
||||
testImplementation("org.assertj:assertj-core:3.24.2")
|
||||
testImplementation("org.mockito:mockito-core:5.2.0")
|
||||
|
||||
@ -2,28 +2,75 @@ package com.iqser.red.pdftronlogic.commons;
|
||||
|
||||
import static com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService.TOLERANCE;
|
||||
|
||||
import java.awt.Shape;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class ComparisonUtils {
|
||||
public static Rectangle2D getPaddedRectangle(ElementFeatures elementFeatures) {
|
||||
|
||||
Rectangle2D inner = elementFeatures.getBoundingBox();
|
||||
public Rectangle2D shrinkRectangle(Rectangle2D inner) {
|
||||
|
||||
//To address inconsistencies in the calculation of the bounding box we slightly shrink the inner rectangle
|
||||
double x_with_tolerance = inner.getX() >= 0 ? inner.getX() + TOLERANCE : inner.getX() - TOLERANCE;
|
||||
double y_with_tolerance = inner.getY() >= 0 ? inner.getY() + TOLERANCE : inner.getY() - TOLERANCE;
|
||||
double height_with_tolerance = inner.getHeight() - (2 * TOLERANCE);
|
||||
double width_with_tolerance = inner.getWidth() - (2 * TOLERANCE);
|
||||
return new Rectangle2D.Double(x_with_tolerance, y_with_tolerance, width_with_tolerance, height_with_tolerance);
|
||||
return shrinkRectangle(inner, TOLERANCE);
|
||||
}
|
||||
|
||||
|
||||
public Rectangle2D shrinkRectangle(Rectangle2D rect, double tolerance) {
|
||||
|
||||
double newX = rect.getX() + tolerance;
|
||||
double newY = rect.getY() + tolerance;
|
||||
double newWidth = rect.getWidth() - 2 * tolerance;
|
||||
double newHeight = rect.getHeight() - 2 * tolerance;
|
||||
|
||||
if (newWidth <= 1e-1) {
|
||||
newWidth = 1e-1;
|
||||
newX = rect.getX() + newWidth / 2;
|
||||
}
|
||||
if (newHeight <= 1e-1) {
|
||||
newHeight = 1e-1;
|
||||
newY = rect.getY() + newHeight / 2;
|
||||
}
|
||||
|
||||
return new Rectangle2D.Double(newX, newY, newWidth, newHeight);
|
||||
}
|
||||
|
||||
|
||||
public Rectangle2D padRectangle(Rectangle2D inner) {
|
||||
|
||||
return padRectangle(inner, TOLERANCE);
|
||||
}
|
||||
|
||||
|
||||
public Rectangle2D padRectangle(Rectangle2D rect, double tolerance) {
|
||||
|
||||
double newX = rect.getX() - tolerance;
|
||||
double newY = rect.getY() - tolerance;
|
||||
double newWidth = rect.getWidth() + 2 * tolerance;
|
||||
double newHeight = rect.getHeight() + 2 * tolerance;
|
||||
|
||||
if (newWidth <= 0) {
|
||||
newWidth = 1e-2;
|
||||
}
|
||||
if (newHeight <= 0) {
|
||||
newHeight = 1e-2;
|
||||
}
|
||||
|
||||
return new Rectangle2D.Double(newX, newY, newWidth, newHeight);
|
||||
}
|
||||
|
||||
|
||||
public boolean almostContains(Shape outer, Rectangle2D inner) {
|
||||
|
||||
Rectangle2D innerRect = ComparisonUtils.shrinkRectangle(inner);
|
||||
|
||||
return outer.contains(innerRect);
|
||||
}
|
||||
|
||||
|
||||
public static boolean almostEqual(double a, double b) {
|
||||
|
||||
return Math.abs(a - b) < TOLERANCE;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -3,8 +3,11 @@ package com.iqser.red.pdftronlogic.commons;
|
||||
import java.awt.Color;
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.GeneralPath;
|
||||
import java.awt.geom.PathIterator;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import com.google.common.primitives.Bytes;
|
||||
import com.google.common.primitives.Doubles;
|
||||
@ -21,16 +24,18 @@ import lombok.experimental.UtilityClass;
|
||||
@UtilityClass
|
||||
public class Converter {
|
||||
|
||||
public GeneralPath convertToGeneralPath(PathData pathData) throws PDFNetException {
|
||||
public GeneralPath convertToGeneralPath(PathData pathData) {
|
||||
|
||||
GeneralPath linePath = new GeneralPath();
|
||||
Iterator<Double> points = Doubles.asList(pathData.getPoints()).iterator();
|
||||
Iterator<Double> points = Doubles.asList(pathData.getPoints())
|
||||
.iterator();
|
||||
Iterable<Byte> operators = Bytes.asList(pathData.getOperators());
|
||||
for (var operator : operators) {
|
||||
switch (operator) {
|
||||
case PathData.e_moveto -> linePath.moveTo(points.next(), points.next());
|
||||
case PathData.e_lineto -> linePath.lineTo(points.next(), points.next());
|
||||
case PathData.e_cubicto -> linePath.curveTo(points.next(), points.next(), points.next(), points.next(), points.next(), points.next());
|
||||
case PathData.e_conicto -> linePath.quadTo(points.next(), points.next(), points.next(), points.next());
|
||||
case PathData.e_closepath -> linePath.closePath();
|
||||
case PathData.e_rect -> {
|
||||
double x = points.next();
|
||||
@ -43,14 +48,67 @@ public class Converter {
|
||||
linePath.lineTo(x, y + h);
|
||||
linePath.closePath();
|
||||
}
|
||||
default -> throw new PDFNetException("Invalid Element Type", 0, "", "", "");
|
||||
default -> throw new IllegalArgumentException("Invalid Operator Type " + operator);
|
||||
}
|
||||
}
|
||||
return linePath;
|
||||
}
|
||||
|
||||
|
||||
public GeneralPath convertToGeneralPathAndTransformToInitialUserSpace(PathData pathData, Matrix2D ctm) throws PDFNetException{
|
||||
public PathData convertToPathData(GeneralPath linePath) {
|
||||
|
||||
PathIterator pathIterator = linePath.getPathIterator(null);
|
||||
List<Byte> operators = new LinkedList<>();
|
||||
List<Double> points = new LinkedList<>();
|
||||
while (!pathIterator.isDone()) {
|
||||
double[] currentPoints = new double[6];
|
||||
int type = pathIterator.currentSegment(currentPoints);
|
||||
switch (type) {
|
||||
case PathIterator.SEG_MOVETO -> {
|
||||
operators.add((byte) PathData.e_moveto);
|
||||
points.add(currentPoints[0]);
|
||||
points.add(currentPoints[1]);
|
||||
}
|
||||
case PathIterator.SEG_LINETO -> {
|
||||
operators.add((byte) PathData.e_lineto);
|
||||
points.add(currentPoints[0]);
|
||||
points.add(currentPoints[1]);
|
||||
}
|
||||
case PathIterator.SEG_QUADTO -> {
|
||||
operators.add((byte) PathData.e_conicto);
|
||||
points.add(currentPoints[0]);
|
||||
points.add(currentPoints[1]);
|
||||
points.add(currentPoints[2]);
|
||||
points.add(currentPoints[3]);
|
||||
}
|
||||
case PathIterator.SEG_CUBICTO -> {
|
||||
operators.add((byte) PathData.e_cubicto);
|
||||
points.add(currentPoints[0]);
|
||||
points.add(currentPoints[1]);
|
||||
points.add(currentPoints[2]);
|
||||
points.add(currentPoints[3]);
|
||||
points.add(currentPoints[4]);
|
||||
points.add(currentPoints[5]);
|
||||
}
|
||||
case PathIterator.SEG_CLOSE -> {
|
||||
operators.add((byte) PathData.e_closepath);
|
||||
}
|
||||
}
|
||||
}
|
||||
byte[] operatorArr = new byte[operators.size()];
|
||||
for (int i = 0; i < operators.size(); i++) {
|
||||
operatorArr[i] = operators.get(i);
|
||||
}
|
||||
double[] pointArr = new double[points.size()];
|
||||
for (int i = 0; i < points.size(); i++) {
|
||||
pointArr[i] = points.get(i);
|
||||
}
|
||||
return new PathData(true, operatorArr, pointArr);
|
||||
}
|
||||
|
||||
|
||||
public GeneralPath convertToGeneralPathAndTransformToInitialUserSpace(PathData pathData, Matrix2D ctm) throws PDFNetException {
|
||||
|
||||
GeneralPath linePath = Converter.convertToGeneralPath(pathData);
|
||||
|
||||
//transform path to initial user space
|
||||
@ -59,6 +117,7 @@ public class Converter {
|
||||
return linePath;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static Color convertColor(ColorSpace colorSpace, ColorPt colorPt) {
|
||||
|
||||
@ -74,4 +133,15 @@ public class Converter {
|
||||
return new Rectangle2D.Double(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static AffineTransform toAffineTransform(Matrix2D textMatrix) {
|
||||
|
||||
if (textMatrix == null) {
|
||||
return null;
|
||||
}
|
||||
return new AffineTransform(textMatrix.getA(), textMatrix.getB(), textMatrix.getC(), textMatrix.getD(), textMatrix.getV(), textMatrix.getH());
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,5 +1,8 @@
|
||||
package com.iqser.red.pdftronlogic.commons;
|
||||
|
||||
import static com.iqser.red.pdftronlogic.commons.VisualizationUtils.drawFeature;
|
||||
import static com.iqser.red.pdftronlogic.commons.VisualizationUtils.drawRect;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.Shape;
|
||||
import java.awt.geom.Area;
|
||||
@ -10,13 +13,16 @@ import java.io.OutputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.features.ElementFeatureFactory;
|
||||
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
|
||||
import com.iqser.red.pdftronlogic.commons.features.GlyphInfo;
|
||||
import com.iqser.red.pdftronlogic.commons.features.ImageFeatures;
|
||||
import com.iqser.red.pdftronlogic.commons.features.PathFeatures;
|
||||
import com.iqser.red.pdftronlogic.commons.features.TextFeatures;
|
||||
import com.iqser.red.pdftronlogic.commons.lookup.ElementFeatureLookup;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.ColorPt;
|
||||
@ -26,6 +32,7 @@ import com.pdftron.pdf.ElementBuilder;
|
||||
import com.pdftron.pdf.ElementReader;
|
||||
import com.pdftron.pdf.ElementWriter;
|
||||
import com.pdftron.pdf.GState;
|
||||
import com.pdftron.pdf.Image;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.pdf.PageIterator;
|
||||
@ -155,9 +162,9 @@ public class InvisibleElementRemovalService {
|
||||
log.info("Start removing invisible Elements");
|
||||
try (ElementWriter writer = new ElementWriter(); ElementReader reader = new ElementReader()) {
|
||||
Set<Long> visitedXObjIds = new TreeSet<>();
|
||||
int pageIndex = 1;
|
||||
|
||||
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
|
||||
|
||||
Page page = iterator.next();
|
||||
|
||||
visitedXObjIds.add(page.getSDFObj().getObjNum());
|
||||
@ -180,10 +187,9 @@ public class InvisibleElementRemovalService {
|
||||
context.markedContentStack().clear();
|
||||
|
||||
removeOverlappedElements(page, writer, context);
|
||||
pageIndex++;
|
||||
}
|
||||
log.info("Finished removing invisible Elements");
|
||||
}
|
||||
log.info("Finished removing invisible Elements");
|
||||
}
|
||||
|
||||
|
||||
@ -245,17 +251,13 @@ public class InvisibleElementRemovalService {
|
||||
}
|
||||
|
||||
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
|
||||
boolean nonTransparent = imageElement.getGState().getBlendMode() == GState.e_bl_normal
|
||||
&& imageElement.getGState().getFillOpacity() == 1
|
||||
&& imageElement.getGState().getStrokeOpacity() == 1
|
||||
&& imageElement.getGState().getSoftMask() == null;
|
||||
|
||||
if (inClippingPath) {
|
||||
ImageFeatures image = ElementFeatureFactory.buildImage(imageElement);
|
||||
if (nonTransparent) {
|
||||
// calculateOverlaps(context, image, writer);
|
||||
ImageFeatures imageFeatures = ElementFeatureFactory.buildImage(imageElement);
|
||||
if (!(imageFeatures.isTransparent() || imageFeatures.isImageMask() || imageFeatures.isSoftMask())) {
|
||||
calculateOverlaps(context, imageFeatures);
|
||||
}
|
||||
context.visibleElements().add(image);
|
||||
context.visibleElements().add(imageFeatures);
|
||||
}
|
||||
|
||||
if (context.delta() ^ inClippingPath) {
|
||||
@ -281,7 +283,7 @@ public class InvisibleElementRemovalService {
|
||||
boolean isTextVisible = isTextRenderedVisibly(gState, textBBox, context);
|
||||
|
||||
if (inClippingPath && isTextVisible) {
|
||||
context.visibleElements().add(ElementFeatureFactory.extractFeatures(textElement));
|
||||
context.visibleElements().add(ElementFeatureFactory.buildText(textElement, context.delta()));
|
||||
}
|
||||
if (!context.delta()) {
|
||||
if (inClippingPath && isTextVisible) {
|
||||
@ -385,7 +387,7 @@ public class InvisibleElementRemovalService {
|
||||
|
||||
if (inClippingPath) {
|
||||
if (isFilledAndNonTransparent(pathElement)) {
|
||||
calculateOverlaps(context, pathFeatures, writer);
|
||||
calculateOverlaps(context, pathFeatures);
|
||||
}
|
||||
context.visibleElements().add(ElementFeatureFactory.extractFeatures(pathElement));
|
||||
}
|
||||
@ -408,9 +410,9 @@ public class InvisibleElementRemovalService {
|
||||
}
|
||||
|
||||
|
||||
private void calculateOverlaps(InvisibleElementRemovalContext context, ElementFeatures elementFeatures, ElementWriter writer) {
|
||||
private void calculateOverlaps(InvisibleElementRemovalContext context, ElementFeatures elementFeatures) {
|
||||
|
||||
List<ElementFeatures> currentOverlappedElements = context.visibleElements().findAlmostContained(elementFeatures);
|
||||
List<ElementFeatures> currentOverlappedElements = context.visibleElements().findOverlapped(elementFeatures);
|
||||
context.overlappedElements().addAll(currentOverlappedElements);
|
||||
context.visibleElements().removeAll(currentOverlappedElements);
|
||||
}
|
||||
@ -423,7 +425,7 @@ public class InvisibleElementRemovalService {
|
||||
if (context.delta()) {
|
||||
// green for element removed due to overlapping
|
||||
context.overlappedElements()
|
||||
.forEach(feature -> drawBBox(writer, feature.getBoundingBox(), "#00FF00"));
|
||||
.forEach(feature -> drawFeature(writer, feature, Color.GREEN));
|
||||
context.overlappedElements().clear();
|
||||
}
|
||||
processOverlappedElements(writer, context);
|
||||
@ -471,14 +473,19 @@ public class InvisibleElementRemovalService {
|
||||
|
||||
private static void removeOverlappedElement(ElementWriter writer, InvisibleElementRemovalContext context, Element element) throws PDFNetException {
|
||||
|
||||
if (context.overlappedElements.matchesAny(ElementFeatureFactory.extractFeatures(element))) {
|
||||
Optional<ElementFeatures> optionalElementMatch = context.overlappedElements()
|
||||
.anyMatch(ElementFeatureFactory.extractFeatures(element));
|
||||
if (optionalElementMatch.isPresent()) {
|
||||
context.overlappedElements().remove(optionalElementMatch.get());
|
||||
if (element.getType() == 3 && element.hasTextMatrix()) {
|
||||
/*
|
||||
PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element.
|
||||
hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands.
|
||||
Therefore, the position of a following Tj is affected by not writing the first Element.
|
||||
This is why, we write only the Tm command:
|
||||
*/
|
||||
writer.writeGStateChanges(element);
|
||||
writer.writeGStateChanges(element);
|
||||
}
|
||||
} else {
|
||||
writer.writeElement(element);
|
||||
}
|
||||
@ -552,7 +559,7 @@ public class InvisibleElementRemovalService {
|
||||
return true;
|
||||
}
|
||||
Area backgroundArea = mergeLinePathsToArea(pathElementsByColor);
|
||||
return !almostContains(backgroundArea, Converter.toRectangle2D(textBBox));
|
||||
return !ComparisonUtils.almostContains(backgroundArea, Converter.toRectangle2D(textBBox));
|
||||
|
||||
}
|
||||
|
||||
@ -562,10 +569,7 @@ public class InvisibleElementRemovalService {
|
||||
var result = new ArrayList<PathFeatures>();
|
||||
context.visibleElements().findIntersecting(textBBox)
|
||||
.forEach(element -> {
|
||||
if (element instanceof PathFeatures pathFeatures
|
||||
&& pathFeatures.isBackground(textBBox)
|
||||
&& !pathFeatures.getFillColor().equals(Color.WHITE)
|
||||
&& pathFeatures.isFilled()) {
|
||||
if (element instanceof PathFeatures pathFeatures && !pathFeatures.getFillColor().equals(Color.WHITE) && pathFeatures.isFilled()) {
|
||||
result.add(pathFeatures);
|
||||
}
|
||||
});
|
||||
@ -585,40 +589,12 @@ public class InvisibleElementRemovalService {
|
||||
}
|
||||
|
||||
|
||||
private boolean almostContains(Shape outer, Rectangle2D inner) {
|
||||
|
||||
//To address inconsistencies in the calculation of the bounding box we slightly shrink the inner rectangle
|
||||
double x_with_tolerance = inner.getX() >= 0 ? inner.getX() + TOLERANCE : inner.getX() - TOLERANCE;
|
||||
double y_with_tolerance = inner.getY() >= 0 ? inner.getY() + TOLERANCE : inner.getY() - TOLERANCE;
|
||||
double height_with_tolerance = inner.getHeight() - (2 * TOLERANCE);
|
||||
double width_with_tolerance = inner.getWidth() - (2 * TOLERANCE);
|
||||
Rectangle2D innerRect = new Rectangle2D.Double(x_with_tolerance, y_with_tolerance, width_with_tolerance, height_with_tolerance);
|
||||
|
||||
return outer.contains(innerRect);
|
||||
}
|
||||
|
||||
|
||||
private boolean isFilledAndNonTransparent(Element element) throws PDFNetException {
|
||||
|
||||
return element.isFilled() && element.getGState().getFillOpacity() == 1;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void drawBBox(ElementWriter writer, Rectangle2D r, String hexcolor) {
|
||||
|
||||
try (ColorPt colorPt = new ColorPt(Integer.valueOf(hexcolor.substring(1, 3), 16) / 255d,
|
||||
Integer.valueOf(hexcolor.substring(3, 5), 16) / 255d,
|
||||
Integer.valueOf(hexcolor.substring(5, 7), 16) / 255d); ElementBuilder eb = new ElementBuilder()) {
|
||||
Element rect = eb.createRect(r.getX(), r.getY(), r.getWidth(), r.getHeight());
|
||||
rect.setPathStroke(true);
|
||||
rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
||||
rect.getGState().setStrokeColor(colorPt);
|
||||
writer.writePlacedElement(rect);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Builder
|
||||
private record InvisibleElementRemovalContext(
|
||||
boolean removePaths,
|
||||
|
||||
@ -0,0 +1,71 @@
|
||||
package com.iqser.red.pdftronlogic.commons;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.features.ElementFeatureFactory;
|
||||
import com.iqser.red.pdftronlogic.commons.features.ImageFeatures;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.ElementReader;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.sdf.Obj;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class PdfImageExtraction {
|
||||
|
||||
public List<List<ImageFeatures>> extractImages(InputStream fileStream) throws IOException, PDFNetException {
|
||||
|
||||
try (PDFDoc pdfDoc = new PDFDoc(fileStream); ElementReader reader = new ElementReader()) {
|
||||
List<List<ImageFeatures>> imagesPerPage = new ArrayList<>(pdfDoc.getPageCount());
|
||||
|
||||
var iter = pdfDoc.getPageIterator();
|
||||
while (iter.hasNext()) {
|
||||
Page page = iter.next();
|
||||
Set<Long> visitedXObjIds = new HashSet<>();
|
||||
visitedXObjIds.add(page.getSDFObj().getObjNum());
|
||||
List<ImageFeatures> imageFeatures = new LinkedList<>();
|
||||
|
||||
reader.begin(page);
|
||||
|
||||
processElements(reader, imageFeatures, visitedXObjIds);
|
||||
|
||||
reader.end();
|
||||
|
||||
imagesPerPage.add(imageFeatures);
|
||||
}
|
||||
|
||||
return imagesPerPage;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private void processElements(ElementReader reader, List<ImageFeatures> imageFeaturesOnPage, Set<Long> visitedXObjIds) throws PDFNetException {
|
||||
|
||||
for (Element element = reader.next(); element != null; element = reader.next()) {
|
||||
switch (element.getType()) {
|
||||
case Element.e_image, Element.e_inline_image -> imageFeaturesOnPage.add(ElementFeatureFactory.buildImage(element));
|
||||
case Element.e_form -> {
|
||||
Obj formObj = element.getXObject();
|
||||
if (!visitedXObjIds.contains(formObj.getObjNum())) {
|
||||
visitedXObjIds.add(formObj.getObjNum());
|
||||
reader.formBegin();
|
||||
processElements(reader, imageFeaturesOnPage, visitedXObjIds);
|
||||
reader.end();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -3,21 +3,30 @@ package com.iqser.red.pdftronlogic.commons;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.features.ElementFeatureFactory;
|
||||
import com.iqser.red.pdftronlogic.commons.features.TextFeatures;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.ElementReader;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.pdf.PageIterator;
|
||||
import com.pdftron.pdf.TextExtractor;
|
||||
import com.pdftron.sdf.Obj;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class PdfTextExtraction {
|
||||
|
||||
private static String execute(PDFDoc pdfDoc) throws PDFNetException{
|
||||
try(TextExtractor extractor = new TextExtractor()) {
|
||||
private static String execute(PDFDoc pdfDoc) throws PDFNetException {
|
||||
|
||||
try (TextExtractor extractor = new TextExtractor()) {
|
||||
List<String> texts = new ArrayList<>();
|
||||
|
||||
PageIterator iterator = pdfDoc.getPageIterator();
|
||||
@ -32,13 +41,65 @@ public class PdfTextExtraction {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static String extractAllTextFromDocument(InputStream fileStream) throws IOException, PDFNetException {
|
||||
|
||||
PDFDoc pdfDoc = new PDFDoc(fileStream);
|
||||
return execute(pdfDoc);
|
||||
}
|
||||
|
||||
|
||||
public static String extractAllTextFromDocument(PDFDoc pdfDoc) throws IOException, PDFNetException {
|
||||
|
||||
return execute(pdfDoc);
|
||||
}
|
||||
|
||||
|
||||
public static List<List<TextFeatures>> extractAllGlyphsFromDocument(InputStream fileStream, boolean includePathData) throws IOException, PDFNetException {
|
||||
|
||||
try (PDFDoc pdfDoc = new PDFDoc(fileStream); ElementReader reader = new ElementReader()) {
|
||||
List<List<TextFeatures>> glyphsPerPages = new ArrayList<>(pdfDoc.getPageCount());
|
||||
|
||||
var iter = pdfDoc.getPageIterator();
|
||||
while (iter.hasNext()) {
|
||||
Page page = iter.next();
|
||||
Set<Long> visitedXObjIds = new HashSet<>();
|
||||
visitedXObjIds.add(page.getSDFObj().getObjNum());
|
||||
List<TextFeatures> textFeatures = new LinkedList<>();
|
||||
|
||||
reader.begin(page);
|
||||
|
||||
processElements(reader, textFeatures, visitedXObjIds, includePathData);
|
||||
|
||||
reader.end();
|
||||
|
||||
glyphsPerPages.add(textFeatures);
|
||||
}
|
||||
|
||||
return glyphsPerPages;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static void processElements(ElementReader reader, List<TextFeatures> textFeaturesOnPage, Set<Long> visitedXObjIds, boolean includePathData) throws PDFNetException {
|
||||
|
||||
for (Element element = reader.next(); element != null; element = reader.next()) {
|
||||
switch (element.getType()) {
|
||||
case Element.e_text -> textFeaturesOnPage.add(ElementFeatureFactory.buildText(element, includePathData));
|
||||
case Element.e_form -> {
|
||||
Obj formObj = element.getXObject();
|
||||
|
||||
if (!visitedXObjIds.contains(formObj.getObjNum())) {
|
||||
visitedXObjIds.add(formObj.getObjNum());
|
||||
reader.formBegin();
|
||||
processElements(reader, textFeaturesOnPage, visitedXObjIds, includePathData);
|
||||
reader.end();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,91 @@
|
||||
package com.iqser.red.pdftronlogic.commons;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
|
||||
import com.iqser.red.pdftronlogic.commons.features.GlyphInfo;
|
||||
import com.iqser.red.pdftronlogic.commons.features.TextFeatures;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.ColorPt;
|
||||
import com.pdftron.pdf.ColorSpace;
|
||||
import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.ElementBuilder;
|
||||
import com.pdftron.pdf.ElementWriter;
|
||||
import com.pdftron.pdf.PathData;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class VisualizationUtils {
|
||||
|
||||
@SneakyThrows
|
||||
public static void drawFeature(ElementWriter writer, ElementFeatures features, Color color) {
|
||||
|
||||
try (ElementBuilder builder = new ElementBuilder()) {
|
||||
|
||||
if (features instanceof TextFeatures textFeatures) {
|
||||
for (GlyphInfo glyph : textFeatures.getGlyphs()) {
|
||||
if (glyph.getPathData().isPresent()) {
|
||||
drawPathData(glyph.getPathData().get(), builder, writer, color);
|
||||
}
|
||||
}
|
||||
}
|
||||
drawRect(features.getBoundingBox(), builder, writer, color);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
public static void drawPathData(PathData pathData, ElementBuilder builder, ElementWriter writer, Color color) throws PDFNetException {
|
||||
|
||||
Element path = builder.createPath(pathData.getPoints(), pathData.getOperators());
|
||||
path.setPathFill(true);
|
||||
path.setPathStroke(false);
|
||||
path.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||
float[] comp = color.getColorComponents(null);
|
||||
try (ColorPt colorPt = new ColorPt(comp[0], comp[1], comp[2])) {
|
||||
path.getGState().setFillColor(colorPt);
|
||||
}
|
||||
path.setWindingFill(true);
|
||||
writer.writeElement(path);
|
||||
}
|
||||
|
||||
|
||||
public static void drawRect(Rectangle2D rectangle2D, ElementBuilder builder, ElementWriter writer, Color color) throws PDFNetException {
|
||||
|
||||
drawRect(rectangle2D, builder, writer, color, false);
|
||||
}
|
||||
|
||||
|
||||
public static void drawRect(Rectangle2D rectangle2D, ElementBuilder builder, ElementWriter writer, Color color, boolean fill) throws PDFNetException {
|
||||
|
||||
Element rect = builder.createRect(rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight());
|
||||
rect.setPathFill(false);
|
||||
rect.setPathStroke(true);
|
||||
rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
||||
float[] comp = fill ? Color.BLACK.getColorComponents(null) : color.getColorComponents(null);
|
||||
try (ColorPt colorPt = new ColorPt(comp[0], comp[1], comp[2])) {
|
||||
rect.getGState().setStrokeColor(colorPt);
|
||||
}
|
||||
double lineWidth = fill ? 0.1 : 0.5;
|
||||
rect.getGState().setLineWidth(lineWidth);
|
||||
writer.writeElement(rect);
|
||||
|
||||
if (fill) {
|
||||
Element filledRect = builder.createRect(rectangle2D.getX() + lineWidth,
|
||||
rectangle2D.getY() + lineWidth,
|
||||
rectangle2D.getWidth() - 2 * lineWidth,
|
||||
rectangle2D.getHeight() - 2 * lineWidth);
|
||||
filledRect.setPathFill(true);
|
||||
filledRect.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||
float[] comp2 = color.getColorComponents(null);
|
||||
try (ColorPt colorPt = new ColorPt(comp2[0], comp2[1], comp2[2])) {
|
||||
filledRect.getGState().setFillColor(colorPt);
|
||||
}
|
||||
writer.writeElement(filledRect);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,12 +1,27 @@
|
||||
package com.iqser.red.pdftronlogic.commons.features;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.Converter;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.Element;
|
||||
import java.awt.geom.GeneralPath;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.Converter;
|
||||
import com.pdftron.common.Matrix2D;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.CharData;
|
||||
import com.pdftron.pdf.CharIterator;
|
||||
import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.Font;
|
||||
import com.pdftron.pdf.GState;
|
||||
import com.pdftron.pdf.PathData;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class ElementFeatureFactory {
|
||||
|
||||
public static ElementFeatures extractFeatures(Element element) throws PDFNetException {
|
||||
public ElementFeatures extractFeatures(Element element) throws PDFNetException {
|
||||
|
||||
return switch (element.getType()) {
|
||||
case Element.e_path -> buildPath(element);
|
||||
@ -19,19 +34,19 @@ public class ElementFeatureFactory {
|
||||
}
|
||||
|
||||
|
||||
public static ImageFeatures buildImageWithHash(Element element, String hashObject) throws PDFNetException {
|
||||
public ImageFeatures buildImageWithHash(Element element, String hashObject) throws PDFNetException {
|
||||
|
||||
return buildImageBase(element).hashOfImage(hashObject).build();
|
||||
}
|
||||
|
||||
|
||||
public static ImageFeatures buildImage(Element element) throws PDFNetException {
|
||||
public ImageFeatures buildImage(Element element) throws PDFNetException {
|
||||
|
||||
return buildImageBase(element).build();
|
||||
}
|
||||
|
||||
|
||||
public static FormFeatures buildForm(Element element) throws PDFNetException {
|
||||
public FormFeatures buildForm(Element element) throws PDFNetException {
|
||||
|
||||
try (var bbox = element.getBBox();) {
|
||||
return FormFeatures.builder()
|
||||
@ -44,9 +59,12 @@ public class ElementFeatureFactory {
|
||||
}
|
||||
|
||||
|
||||
private static ImageFeatures.ImageFeaturesBuilder<?, ?> buildImageBase(Element element) throws PDFNetException {
|
||||
private ImageFeatures.ImageFeaturesBuilder<?, ?> buildImageBase(Element element) throws PDFNetException {
|
||||
|
||||
try (var bbox = element.getBBox();) {
|
||||
boolean transparent = element.getGState().getBlendMode() != GState.e_bl_normal
|
||||
|| element.getGState().getFillOpacity() > 1
|
||||
|| element.getGState().getStrokeOpacity() > 1;
|
||||
return ImageFeatures.builder()
|
||||
.elementType(element.getType())
|
||||
.boundingBox(Converter.toRectangle2D(bbox))
|
||||
@ -55,26 +73,40 @@ public class ElementFeatureFactory {
|
||||
.width(element.getImageWidth())
|
||||
.renderingIntent(element.getImageRenderingIntent())
|
||||
.componentNum(element.getComponentNum())
|
||||
.bitsPerComponent(element.getBitsPerComponent());
|
||||
.bitsPerComponent(element.getBitsPerComponent())
|
||||
.imageMask(element.isImageMask())
|
||||
.softMask(element.getGState().getSoftMask() != null)
|
||||
.transparent(transparent);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static TextFeatures buildText(Element element) throws PDFNetException {
|
||||
public TextFeatures buildText(Element element) throws PDFNetException {
|
||||
|
||||
return buildText(element, false);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Use includePathData = true, when trying to draw the glyphs, see GlyphExtractionTest
|
||||
*/
|
||||
public TextFeatures buildText(Element element, boolean includePathData) throws PDFNetException {
|
||||
|
||||
try (var bbox = element.getBBox()) {
|
||||
|
||||
return TextFeatures.builder()
|
||||
.elementType(element.getType())
|
||||
.boundingBox(Converter.toRectangle2D(bbox))
|
||||
.text(element.getTextString())
|
||||
.font(element.getGState().getFont().getType())
|
||||
.fontsize(element.getGState().getFontSize())
|
||||
.glyphs(extractGlyphInfo(element, includePathData))
|
||||
.build();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static PathFeatures buildPath(Element element) throws PDFNetException {
|
||||
public PathFeatures buildPath(Element element) throws PDFNetException {
|
||||
|
||||
try (var bbox = element.getBBox(); var ctm = element.getCTM(); var fillColor = element.getGState().getFillColor(); var strokeColor = element.getGState().getStrokeColor()) {
|
||||
return PathFeatures.builder()
|
||||
@ -92,4 +124,68 @@ public class ElementFeatureFactory {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private List<GlyphInfo> extractGlyphInfo(Element textElement, boolean includePathData) {
|
||||
|
||||
assert textElement != null && textElement.getType() == Element.e_text;
|
||||
|
||||
if (textElement.getBBox() == null) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
Font font = textElement.getGState().getFont();
|
||||
|
||||
if (font.getType() == Font.e_Type3) {
|
||||
// type 3 fonts seem to be much more difficult, one must use font.getType3GlyphStream and font.getType3FontMatrix instead
|
||||
// couldn't find much information except this post https://groups.google.com/g/pdfnet-sdk/c/SvhMflbtQho
|
||||
// will implement this when necessary
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
List<GlyphInfo> glyphs = new ArrayList<>();
|
||||
|
||||
CharIterator charIterator = textElement.getCharIterator();
|
||||
while (charIterator.hasNext()) {
|
||||
CharData charData = charIterator.next();
|
||||
long charCode = charData.getCharCode();
|
||||
String glyphText = new String(font.mapToUnicode(charCode));
|
||||
|
||||
if (Character.isWhitespace(glyphText.charAt(0))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
try (Matrix2D fontMatrix = computeFontMatrix(charData, textElement, font); //
|
||||
Matrix2D glyphMatrix = textElement.getCTM()//
|
||||
.multiply(textElement.getTextMatrix())//
|
||||
.multiply(fontMatrix)) {
|
||||
PathData pathData = font.getGlyphPath(charCode, true, glyphMatrix);
|
||||
if (pathData.getOperators().length == 1 && pathData.getOperators()[0] == 6) {
|
||||
// This happens for some chinese characters or whitespaces, don't know why...
|
||||
continue;
|
||||
}
|
||||
GeneralPath glyphPath = Converter.convertToGeneralPath(pathData);
|
||||
GlyphInfo.GlyphInfoBuilder glyphInfo = GlyphInfo.builder().unicode(glyphText).bbox(glyphPath.getBounds2D());
|
||||
|
||||
if (includePathData) {
|
||||
glyphInfo.pathData(pathData);
|
||||
}
|
||||
|
||||
glyphs.add(glyphInfo.build());
|
||||
}
|
||||
}
|
||||
|
||||
return glyphs;
|
||||
|
||||
}
|
||||
|
||||
|
||||
private Matrix2D computeFontMatrix(CharData charData, Element textElement, Font font) throws PDFNetException {
|
||||
|
||||
double yScaleFactor = textElement.getGState().getFontSize() / font.getUnitsPerEm();
|
||||
double xScaleFactor = (textElement.getGState().getHorizontalScale() / 100) * yScaleFactor;
|
||||
|
||||
return new Matrix2D(xScaleFactor, 0, 0, -yScaleFactor, charData.getGlyphX(), charData.getGlyphY());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -2,8 +2,11 @@ package com.iqser.red.pdftronlogic.commons.features;
|
||||
|
||||
import static com.iqser.red.pdftronlogic.commons.ComparisonUtils.almostEqual;
|
||||
|
||||
import java.awt.Shape;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.ComparisonUtils;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.SneakyThrows;
|
||||
@ -24,12 +27,12 @@ public class ElementFeatures {
|
||||
|
||||
public boolean matches(ElementFeatures elementFeatures) {
|
||||
|
||||
return elementFeatures.getElementType() == elementType && elementFeatures.getBoundingBox() != null && rectsAlmostMatch(elementFeatures.getBoundingBox());
|
||||
return elementFeatures.getElementType() == elementType && elementFeatures.getBoundingBox() != null && bboxMatches(elementFeatures.getBoundingBox());
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
protected boolean rectsAlmostMatch(Rectangle2D bBox) {
|
||||
protected boolean bboxMatches(Rectangle2D bBox) {
|
||||
// To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance
|
||||
|
||||
return almostEqual(bBox.getX(), boundingBox.getX()) && //
|
||||
@ -39,6 +42,12 @@ public class ElementFeatures {
|
||||
}
|
||||
|
||||
|
||||
public Shape getOverlapShape() {
|
||||
|
||||
return boundingBox;
|
||||
}
|
||||
|
||||
|
||||
public boolean similar(ElementFeatures elementFeatures) {
|
||||
|
||||
return elementFeatures.getElementType() == elementType && elementFeatures.getBoundingBox() != null && areRectsSimilar(elementFeatures.getBoundingBox());
|
||||
@ -67,10 +76,22 @@ public class ElementFeatures {
|
||||
}
|
||||
|
||||
|
||||
public boolean almostContains(ElementFeatures features) {
|
||||
public boolean contains(ElementFeatures features) {
|
||||
|
||||
Rectangle2D inner = features.getBoundingBox();
|
||||
return boundingBox.contains(inner);
|
||||
return features.containedBy(this);
|
||||
}
|
||||
|
||||
|
||||
public boolean testOverlapped(ElementFeatures overlappingElement) {
|
||||
|
||||
return containedBy(overlappingElement);
|
||||
}
|
||||
|
||||
|
||||
private boolean containedBy(ElementFeatures features) {
|
||||
|
||||
Shape overlapShape = features.getOverlapShape();
|
||||
return overlapShape.contains(ComparisonUtils.shrinkRectangle(boundingBox));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -22,21 +22,21 @@ public class FormFeatures extends ElementFeatures {
|
||||
|
||||
public boolean matches(ElementFeatures elementFeatures) {
|
||||
|
||||
if (elementFeatures.getClass() != this.getClass()) {
|
||||
return false;
|
||||
if (elementFeatures instanceof FormFeatures features) {
|
||||
return elementFeatures.getElementType() == getElementType()
|
||||
&& elementFeatures.getBoundingBox() != null
|
||||
&& (super.bboxMatches(elementFeatures.getBoundingBox())
|
||||
|| rotationMatches(elementFeatures.getBoundingBox()
|
||||
.getBounds2D()))
|
||||
&& xObjectType == features.getXObjectType()
|
||||
&& dictOrArrayOrStreamLength == features.getDictOrArrayOrStreamLength();
|
||||
}
|
||||
return elementFeatures.getElementType() == getElementType()
|
||||
&& elementFeatures.getBoundingBox() != null
|
||||
&& (super.rectsAlmostMatch(elementFeatures.getBoundingBox())
|
||||
|| almostRotateMatches(elementFeatures.getBoundingBox()
|
||||
.getBounds2D()))
|
||||
&& xObjectType == ((FormFeatures) elementFeatures).getXObjectType()
|
||||
&& dictOrArrayOrStreamLength == ((FormFeatures) elementFeatures).getDictOrArrayOrStreamLength();
|
||||
return false;
|
||||
|
||||
}
|
||||
|
||||
|
||||
private boolean almostRotateMatches(Rectangle2D bBox) {
|
||||
private boolean rotationMatches(Rectangle2D bBox) {
|
||||
|
||||
return almostEqual(bBox.getWidth(), getBoundingBox().getHeight()) && //
|
||||
almostEqual(bBox.getHeight(), getBoundingBox().getWidth());
|
||||
|
||||
@ -0,0 +1,70 @@
|
||||
package com.iqser.red.pdftronlogic.commons.features;
|
||||
|
||||
import java.awt.Shape;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Optional;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.ComparisonUtils;
|
||||
import com.pdftron.pdf.PathData;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class GlyphInfo {
|
||||
|
||||
@Getter
|
||||
final String unicode;
|
||||
@Getter
|
||||
final Rectangle2D bbox;
|
||||
final PathData pathData;
|
||||
|
||||
boolean overlapped;
|
||||
ElementFeatures overlappingElement;
|
||||
|
||||
|
||||
public boolean testOverlapped(ElementFeatures overlappingElement) {
|
||||
|
||||
if (overlapped) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (overlappingElement.getOverlapShape().contains(ComparisonUtils.shrinkRectangle(bbox))) {
|
||||
overlapped = true;
|
||||
this.overlappingElement = overlappingElement;
|
||||
}
|
||||
|
||||
return overlapped;
|
||||
|
||||
}
|
||||
|
||||
|
||||
public boolean matches(GlyphInfo glyph2) {
|
||||
|
||||
return unicode.equals(glyph2.unicode)//
|
||||
&& calculateIntersectedArea(glyph2.bbox, bbox) > 0.9 * Math.min(bbox.getWidth() * bbox.getHeight(), glyph2.bbox.getHeight() * glyph2.bbox.getWidth());
|
||||
}
|
||||
|
||||
|
||||
public Optional<PathData> getPathData() {
|
||||
|
||||
return Optional.ofNullable(pathData);
|
||||
}
|
||||
|
||||
|
||||
private static double calculateIntersectedArea(Rectangle2D r1, Rectangle2D r2) {
|
||||
|
||||
double xOverlap = Math.max(0, Math.min(r1.getMaxX(), r2.getMaxX()) - Math.max(r1.getMinX(), r2.getMinX()));
|
||||
double yOverlap = Math.max(0, Math.min(r1.getMaxY(), r2.getMaxY()) - Math.max(r1.getY(), r2.getY()));
|
||||
|
||||
return xOverlap * yOverlap;
|
||||
}
|
||||
|
||||
}
|
||||
@ -19,6 +19,9 @@ public class ImageFeatures extends ElementFeatures {
|
||||
int renderingIntent;
|
||||
int componentNum;
|
||||
int bitsPerComponent;
|
||||
boolean imageMask;
|
||||
boolean softMask;
|
||||
boolean transparent;
|
||||
String hashOfImage;
|
||||
|
||||
|
||||
@ -33,6 +36,9 @@ public class ImageFeatures extends ElementFeatures {
|
||||
&& this.renderingIntent == imageFeatures.getRenderingIntent()
|
||||
&& this.componentNum == imageFeatures.getComponentNum()
|
||||
&& this.bitsPerComponent == imageFeatures.getBitsPerComponent()
|
||||
&& this.imageMask == imageFeatures.isImageMask()
|
||||
&& this.softMask == imageFeatures.isSoftMask()
|
||||
&& this.transparent == imageFeatures.isTransparent()
|
||||
&& calculateHammingDistance(imageFeatures.getHashOfImage()) <= HAMMING_DISTANCE_THRESHOLD;
|
||||
}
|
||||
return false;
|
||||
|
||||
@ -1,24 +1,19 @@
|
||||
package com.iqser.red.pdftronlogic.commons.features;
|
||||
|
||||
import static com.iqser.red.pdftronlogic.commons.ComparisonUtils.getPaddedRectangle;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.Shape;
|
||||
import java.awt.geom.GeneralPath;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import com.pdftron.pdf.Rect;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Getter;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@Getter
|
||||
@SuperBuilder
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
public class PathFeatures extends ElementFeatures {
|
||||
|
||||
boolean clippingPath;
|
||||
@ -47,26 +42,10 @@ public class PathFeatures extends ElementFeatures {
|
||||
}
|
||||
|
||||
|
||||
public boolean matchesFillColor(Color color) {
|
||||
|
||||
return color.equals(fillColor);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public boolean isBackground(Rect area) {
|
||||
|
||||
return filled && //
|
||||
getBoundingBox().intersects(area.getX1(), area.getY1(), area.getWidth(), area.getHeight());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean almostContains(ElementFeatures elementFeatures) {
|
||||
public Shape getOverlapShape() {
|
||||
|
||||
Rectangle2D innerRect = getPaddedRectangle(elementFeatures);
|
||||
|
||||
return linePath.contains(innerRect);
|
||||
return linePath;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -2,7 +2,13 @@ package com.iqser.red.pdftronlogic.commons.features;
|
||||
|
||||
import static com.iqser.red.pdftronlogic.commons.ComparisonUtils.almostEqual;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.ComparisonUtils;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Builder;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Getter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
@ -18,19 +24,45 @@ public class TextFeatures extends ElementFeatures {
|
||||
String text;
|
||||
int font;
|
||||
double fontsize;
|
||||
@Builder.Default
|
||||
List<GlyphInfo> glyphs = new ArrayList<>();
|
||||
|
||||
|
||||
@Override
|
||||
public boolean matches(ElementFeatures element) {
|
||||
|
||||
if (element instanceof TextFeatures textFeaturesElement) {
|
||||
|
||||
return super.matches(textFeaturesElement) //
|
||||
return super.matches(textFeaturesElement)//
|
||||
&& text.equals(textFeaturesElement.getText()) //
|
||||
&& font == textFeaturesElement.getFont() //
|
||||
&& font == textFeaturesElement.getFont()//
|
||||
&& almostEqual(fontsize, textFeaturesElement.getFontsize());
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private boolean glyphsMatch(TextFeatures textFeaturesElement) {
|
||||
|
||||
if (glyphs.size() != textFeaturesElement.getGlyphs().size()) {
|
||||
return false;
|
||||
}
|
||||
for (int i = 0; i < glyphs.size(); i++) {
|
||||
if (!glyphs.get(i).matches(textFeaturesElement.getGlyphs().get(i))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
public boolean testOverlapped(ElementFeatures overlappingElement) {
|
||||
|
||||
if (glyphs.isEmpty()) {
|
||||
return super.testOverlapped(overlappingElement);
|
||||
}
|
||||
|
||||
return super.testOverlapped(overlappingElement) || glyphs.stream()
|
||||
.allMatch(glyph -> glyph.testOverlapped(overlappingElement));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,37 +1,43 @@
|
||||
package com.iqser.red.pdftronlogic.commons.lookup;
|
||||
|
||||
import org.locationtech.jts.index.ItemVisitor;
|
||||
import java.util.Optional;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@RequiredArgsConstructor
|
||||
public class AnyMatchVisitor implements ItemVisitor {
|
||||
public class AnyMatchVisitor implements ElementFeatureVisitor {
|
||||
|
||||
private final ElementFeatures queryFeatures;
|
||||
private boolean anyMatch = false;
|
||||
@Getter
|
||||
private ElementFeatures match;
|
||||
|
||||
|
||||
public boolean hasAnyMatch() {
|
||||
public Optional<ElementFeatures> getAnyMatch() {
|
||||
|
||||
return anyMatch;
|
||||
return Optional.ofNullable(match);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void visitItem(Object o) {
|
||||
public void visitItem(ElementFeatures features) {
|
||||
|
||||
if (anyMatch) {
|
||||
if (hasAnyMatch()) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (o instanceof ElementFeatures features) {
|
||||
if (queryFeatures.matches(features)) {
|
||||
anyMatch = true;
|
||||
}
|
||||
if (queryFeatures.matches(features)) {
|
||||
match = features;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private boolean hasAnyMatch() {
|
||||
|
||||
return getAnyMatch().isPresent();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,104 +1,101 @@
|
||||
package com.iqser.red.pdftronlogic.commons.lookup;
|
||||
|
||||
import static com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService.TOLERANCE;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.function.Consumer;
|
||||
import java.util.function.Predicate;
|
||||
|
||||
import org.locationtech.jts.geom.Envelope;
|
||||
import org.locationtech.jts.index.quadtree.Quadtree;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.Converter;
|
||||
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
|
||||
import com.pdftron.pdf.Rect;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Getter
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class ElementFeatureLookup {
|
||||
/*
|
||||
This class looks a bit weird and uses visitors since I tried to use the quadtree implementation by locationtech, since it uses Rectangles by default to query its data structure.
|
||||
Unfortunately there were always edge cases, where it lost a few elements making it completely unusable. Further, it didn't even speed up the algorithm all that much.
|
||||
*/
|
||||
|
||||
Quadtree quadTree = new Quadtree();
|
||||
Set<ElementFeatures> allElements = new HashSet<>();
|
||||
|
||||
|
||||
public void add(ElementFeatures elementFeatures) {
|
||||
|
||||
quadTree.insert(envelop(elementFeatures), elementFeatures);
|
||||
allElements.add(elementFeatures);
|
||||
}
|
||||
|
||||
|
||||
public void remove(ElementFeatures elementFeatures) {
|
||||
|
||||
quadTree.remove(envelop(elementFeatures), elementFeatures);
|
||||
allElements.remove(elementFeatures);
|
||||
}
|
||||
|
||||
|
||||
public boolean matchesAny(ElementFeatures elementFeatures) {
|
||||
public Optional<ElementFeatures> anyMatch(ElementFeatures elementFeatures) {
|
||||
|
||||
AnyMatchVisitor visitor = new AnyMatchVisitor(elementFeatures);
|
||||
quadTree.query(queryEnvelop(elementFeatures), visitor);
|
||||
return visitor.hasAnyMatch();
|
||||
forEach(visitor::visitItem);
|
||||
return visitor.getAnyMatch();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public List<ElementFeatures> query(Predicate<ElementFeatures> predicate) {
|
||||
|
||||
PredicateItemVisitor visitor = new PredicateItemVisitor(predicate);
|
||||
forEach(visitor::visitItem);
|
||||
return visitor.getMatchingFeatures();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public List<ElementFeatures> findIntersecting(Rect bbox) {
|
||||
|
||||
Rectangle2D r = Converter.toRectangle2D(bbox);
|
||||
return query(elementFeatures -> elementFeatures.getBoundingBox().intersects(r));
|
||||
}
|
||||
|
||||
|
||||
public List<ElementFeatures> findOverlapped(ElementFeatures overlappingElement) {
|
||||
|
||||
OverlapVisitor overlapVisitor = new OverlapVisitor(overlappingElement);
|
||||
forEach(overlapVisitor::visitItem);
|
||||
return overlapVisitor.getOverlappedElementFeatures();
|
||||
}
|
||||
|
||||
|
||||
public void forEach(Consumer<ElementFeatures> consumer) {
|
||||
|
||||
quadTree.queryAll()
|
||||
.forEach(consumer);
|
||||
allElements.forEach(consumer);
|
||||
}
|
||||
|
||||
|
||||
public void clear() {
|
||||
|
||||
forEach(this::remove);
|
||||
}
|
||||
|
||||
|
||||
public List<ElementFeatures> findAlmostContained(ElementFeatures elementFeatures) {
|
||||
|
||||
PredicateItemVisitor visitor = new PredicateItemVisitor(elementFeatures::almostContains);
|
||||
quadTree.query(queryEnvelop(elementFeatures), visitor);
|
||||
return visitor.getMatchingFeatures();
|
||||
}
|
||||
|
||||
|
||||
public List<ElementFeatures> query(ElementFeatures elementFeatures, Predicate<ElementFeatures> predicate) {
|
||||
|
||||
PredicateItemVisitor visitor = new PredicateItemVisitor(predicate);
|
||||
quadTree.query(queryEnvelop(elementFeatures), visitor);
|
||||
return visitor.getMatchingFeatures();
|
||||
}
|
||||
|
||||
|
||||
private static Envelope envelop(ElementFeatures elementFeatures) {
|
||||
|
||||
Rectangle2D r = elementFeatures.getBoundingBox();
|
||||
return new Envelope(r.getX(), r.getY(), r.getWidth(), r.getHeight());
|
||||
}
|
||||
|
||||
|
||||
private static Envelope queryEnvelop(ElementFeatures elementFeatures) {
|
||||
|
||||
Rectangle2D r = elementFeatures.getBoundingBox();
|
||||
return new Envelope(r.getX() - TOLERANCE, r.getY() - TOLERANCE, r.getWidth() + 2 * TOLERANCE, r.getHeight() + 2 * TOLERANCE);
|
||||
allElements.clear();
|
||||
}
|
||||
|
||||
|
||||
public boolean isEmpty() {
|
||||
|
||||
return quadTree.isEmpty();
|
||||
return allElements.isEmpty();
|
||||
}
|
||||
|
||||
|
||||
public int size() {
|
||||
|
||||
return quadTree.size();
|
||||
return allElements.size();
|
||||
}
|
||||
|
||||
|
||||
@ -111,26 +108,6 @@ public class ElementFeatureLookup {
|
||||
public void removeAll(List<ElementFeatures> currentOverlappedElements) {
|
||||
|
||||
currentOverlappedElements.forEach(this::remove);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public List<ElementFeatures> query(Rect bbox, Predicate<ElementFeatures> predicate) {
|
||||
|
||||
PredicateItemVisitor visitor = new PredicateItemVisitor(predicate);
|
||||
quadTree.query(new Envelope(bbox.getX1(), bbox.getY1(), bbox.getWidth(), bbox.getHeight()), visitor);
|
||||
return visitor.getMatchingFeatures();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public List<ElementFeatures> findIntersecting(Rect bbox) {
|
||||
|
||||
Rectangle2D r = Converter.toRectangle2D(bbox);
|
||||
PredicateItemVisitor visitor = new PredicateItemVisitor(elementFeatures -> elementFeatures.getBoundingBox().intersects(r));
|
||||
quadTree.query(new Envelope(r.getX(), r.getY(), r.getWidth(), r.getHeight()), visitor);
|
||||
return visitor.getMatchingFeatures();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,9 @@
|
||||
package com.iqser.red.pdftronlogic.commons.lookup;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
|
||||
|
||||
public interface ElementFeatureVisitor {
|
||||
|
||||
void visitItem(ElementFeatures features);
|
||||
|
||||
}
|
||||
@ -0,0 +1,35 @@
|
||||
package com.iqser.red.pdftronlogic.commons.lookup;
|
||||
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.ComparisonUtils;
|
||||
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class OverlapVisitor implements ElementFeatureVisitor {
|
||||
|
||||
ElementFeatures overlappingElement;
|
||||
|
||||
@Getter
|
||||
List<ElementFeatures> overlappedElementFeatures = new LinkedList<>();
|
||||
|
||||
|
||||
@Override
|
||||
public void visitItem(ElementFeatures features) {
|
||||
|
||||
if (ComparisonUtils.padRectangle(features.getBoundingBox()).intersects(ComparisonUtils.padRectangle(overlappingElement.getBoundingBox()))) {
|
||||
if (features.testOverlapped(overlappingElement)) {
|
||||
overlappedElementFeatures.add(features);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -4,15 +4,13 @@ import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.function.Predicate;
|
||||
|
||||
import org.locationtech.jts.index.ItemVisitor;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@RequiredArgsConstructor
|
||||
public class PredicateItemVisitor implements ItemVisitor {
|
||||
public class PredicateItemVisitor implements ElementFeatureVisitor {
|
||||
|
||||
private final Predicate<ElementFeatures> predicate;
|
||||
@Getter
|
||||
@ -20,12 +18,10 @@ public class PredicateItemVisitor implements ItemVisitor {
|
||||
|
||||
|
||||
@Override
|
||||
public void visitItem(Object o) {
|
||||
public void visitItem(ElementFeatures features) {
|
||||
|
||||
if (o instanceof ElementFeatures features) {
|
||||
if (predicate.test(features)) {
|
||||
matchingFeatures.add(features);
|
||||
}
|
||||
if (predicate.test(features)) {
|
||||
matchingFeatures.add(features);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,87 @@
|
||||
package com.iqser.red.pdftronlogic.commons;
|
||||
|
||||
import static com.iqser.red.pdftronlogic.commons.VisualizationUtils.drawPathData;
|
||||
import static com.iqser.red.pdftronlogic.commons.VisualizationUtils.drawRect;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.io.FileOutputStream;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.features.GlyphInfo;
|
||||
import com.iqser.red.pdftronlogic.commons.features.ImageFeatures;
|
||||
import com.iqser.red.pdftronlogic.commons.features.TextFeatures;
|
||||
import com.pdftron.pdf.ElementBuilder;
|
||||
import com.pdftron.pdf.ElementWriter;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.PDFNet;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.sdf.SDFDoc;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
public class GlyphExtractionTest {
|
||||
|
||||
@BeforeEach
|
||||
void createService() {
|
||||
|
||||
PDFNet.initialize(PDFTronConfig.license);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testGlyphExtraction() {
|
||||
|
||||
String file = "files/everyCharIsImage.pdf";
|
||||
|
||||
List<List<TextFeatures>> textsPerPage;
|
||||
List<List<ImageFeatures>> imagesPerPage;
|
||||
try (var in = this.getClass().getClassLoader().getResourceAsStream(file)) {
|
||||
textsPerPage = PdfTextExtraction.extractAllGlyphsFromDocument(in, true);
|
||||
}
|
||||
try (var in = this.getClass().getClassLoader().getResourceAsStream(file)) {
|
||||
imagesPerPage = PdfImageExtraction.extractImages(in);
|
||||
}
|
||||
try (var in = this.getClass().getClassLoader().getResourceAsStream(file);//
|
||||
var out = new FileOutputStream(Path.of("/tmp/").resolve(Path.of(file).getFileName() + "_GLYPHS.pdf").toFile())) {
|
||||
|
||||
PDFDoc pdfDoc = new PDFDoc(in);
|
||||
for (int i = 0; i < pdfDoc.getPageCount(); i++) {
|
||||
Page page = pdfDoc.getPage(i + 1);
|
||||
List<TextFeatures> textFeaturesOnPage = textsPerPage.get(i);
|
||||
List<ImageFeatures> imageFeaturesOnPage = imagesPerPage.get(i);
|
||||
try (ElementWriter writer = new ElementWriter(); ElementBuilder builder = new ElementBuilder()) {
|
||||
writer.begin(page, ElementWriter.e_overlay, false);
|
||||
|
||||
for (ImageFeatures imageFeatures : imageFeaturesOnPage) {
|
||||
if (imageFeatures.getBoundingBox().getHeight() * imageFeatures.getBoundingBox().getWidth() >= page.getPageHeight() * page.getPageWidth() * 0.8) {
|
||||
continue;
|
||||
}
|
||||
drawRect(imageFeatures.getBoundingBox(), builder, writer, Color.CYAN, true);
|
||||
}
|
||||
for (TextFeatures textFeatures : textFeaturesOnPage) {
|
||||
|
||||
drawRect(textFeatures.getBoundingBox(), builder, writer, Color.BLUE);
|
||||
|
||||
for (GlyphInfo glyph : textFeatures.getGlyphs()) {
|
||||
|
||||
if (glyph.getPathData().isPresent()) {
|
||||
drawPathData(glyph.getPathData().get(), builder, writer, Color.BLACK);
|
||||
}
|
||||
drawRect(ComparisonUtils.shrinkRectangle(glyph.getBbox()), builder, writer, Color.RED);
|
||||
drawRect(glyph.getBbox(), builder, writer, Color.MAGENTA);
|
||||
|
||||
}
|
||||
}
|
||||
writer.end();
|
||||
}
|
||||
}
|
||||
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -7,6 +7,7 @@ import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
@ -77,9 +78,9 @@ class InvisibleElementRemovalServiceTest {
|
||||
try (var in = new FileInputStream(deltaResultFileName)) {
|
||||
String[] text = extractAllTextFromDocument(in).split("\n");
|
||||
assertThat(text).contains(":Bold S-enantiomer form if two codes are supplied",
|
||||
"Red : Only observed in laboratory soil studies",
|
||||
"Green : Observed in both laboratory soil studies and lysimeter leachate",
|
||||
"Blue : Only observed in lysimeter leachate");
|
||||
"Red : Only observed in laboratory soil studies",
|
||||
"Green : Observed in both laboratory soil studies and lysimeter leachate",
|
||||
"Blue : Only observed in lysimeter leachate");
|
||||
}
|
||||
}
|
||||
|
||||
@ -102,20 +103,20 @@ class InvisibleElementRemovalServiceTest {
|
||||
try (var in = new FileInputStream(deltaResultFileName)) {
|
||||
String result = PdfTextExtraction.extractAllTextFromDocument(in);
|
||||
assertThat(result).contains("#1 Dark",
|
||||
"#13 Yellow",
|
||||
"Ut wisi enim ad minim veniam, quis nostrud exerci tation ullamcorper suscipit lobortis nisl ut aliquip\n" +
|
||||
"ex ea commodo consequat. Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie\n" +
|
||||
"consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim\n" +
|
||||
"qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi.");
|
||||
assertThat(result).doesNotContain("Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut\n" +
|
||||
"labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et\n" +
|
||||
"ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem\n" +
|
||||
"ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et\n" +
|
||||
"dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea\n" +
|
||||
"rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum\n" +
|
||||
"dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore\n" +
|
||||
"magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet\n" +
|
||||
"clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. ");
|
||||
"#13 Yellow",
|
||||
"Ut wisi enim ad minim veniam, quis nostrud exerci tation ullamcorper suscipit lobortis nisl ut aliquip\n"
|
||||
+ "ex ea commodo consequat. Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie\n"
|
||||
+ "consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim\n"
|
||||
+ "qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi.");
|
||||
assertThat(result).doesNotContain("Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut\n"
|
||||
+ "labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et\n"
|
||||
+ "ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem\n"
|
||||
+ "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et\n"
|
||||
+ "dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea\n"
|
||||
+ "rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum\n"
|
||||
+ "dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore\n"
|
||||
+ "magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet\n"
|
||||
+ "clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. ");
|
||||
}
|
||||
|
||||
}
|
||||
@ -189,4 +190,26 @@ class InvisibleElementRemovalServiceTest {
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
 * Runs invisible-element removal on the classpath resource {@code files/everyCharIsImage.pdf}
 * and asserts that the text extracted from the processed output is blank.
 *
 * <p>The service is invoked twice, once with the last argument {@code false} and once with
 * {@code true}, each run writing to its own temporary file. Only the output of the
 * {@code false} run is read back and asserted on.
 *
 * <p>NOTE(review): judging by the "INVISIBLE_REMOVAL_DELTA" file name, the boolean presumably
 * toggles a delta output mode - confirm against the service. The delta file is produced but
 * never verified here.
 */
@Test
|
||||
@SneakyThrows
|
||||
void removeInvisibleElementsWhereEachCharIsImage() {
|
||||
|
||||
String fileName = "files/everyCharIsImage.pdf";
|
||||
String resultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL");
|
||||
String deltaResultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL_DELTA");
|
||||
|
||||
// First run: last argument false, output written to resultFileName.
try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(resultFileName)) {
|
||||
invisibleElementRemovalService.removeInvisibleElementsButKeepOcrText(in, out, false);
|
||||
}
|
||||
|
||||
// Second run: last argument true, output written to deltaResultFileName (not asserted on below).
try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(deltaResultFileName)) {
|
||||
invisibleElementRemovalService.removeInvisibleElementsButKeepOcrText(in, out, true);
|
||||
}
|
||||
// Read back the output of the first run and check that no text remains extractable.
try (var in = new FileInputStream(resultFileName)) {
|
||||
String result = PdfTextExtraction.extractAllTextFromDocument(in);
|
||||
assertThat(result).isBlank();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -15,6 +15,7 @@ import java.util.Optional;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.rendering.GhostScriptService;
|
||||
@ -27,19 +28,21 @@ import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.lept4j.util.LeptUtils;
|
||||
|
||||
@Disabled // requires leptonica and ghostscript to be installed locally
|
||||
public class VisualEqualityTest {
|
||||
|
||||
public static final double SIMILARITY_THRESHOLD = 0.015; // percentage of pixels which differ by more than 10 points in luminance
|
||||
public static final String LEPTONICA_DIR = "/home/kschuettler/software/leptonica/vcpkg/installed/x64-linux-dynamic/lib/";
|
||||
GhostScriptService ghostScriptService = new GhostScriptService();
|
||||
InvisibleElementRemovalService invisibleElementRemovalService = new InvisibleElementRemovalService();
|
||||
Path stem = Path.of("/tmp/AAA_EQUALITY_TEST/");
|
||||
Path TEST_OUTPUT_DIR = Path.of("/tmp/AAA_EQUALITY_TEST/");
|
||||
|
||||
|
||||
@BeforeEach
|
||||
public void setup() {
|
||||
|
||||
PDFNet.initialize(PDFTronConfig.license);
|
||||
System.setProperty("jna.library.path", "/home/kschuettler/software/leptonica/vcpkg/installed/x64-linux-dynamic/lib/");
|
||||
System.setProperty("jna.library.path", LEPTONICA_DIR);
|
||||
|
||||
try (NativeLibrary leptonicaLib = NativeLibrary.getInstance("leptonica")) {
|
||||
assert leptonicaLib != null;
|
||||
@ -51,8 +54,8 @@ public class VisualEqualityTest {
|
||||
@SneakyThrows
|
||||
public void assertVisualEqualityOfProcessedFile() {
|
||||
|
||||
Path file = Path.of("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/ITEM 23_A19022A - Dermal Absorption Human.pdf");
|
||||
Context context = new Context(stem, new HashMap<>());
|
||||
Path file = Path.of("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/ITEM 19_A15149AC - Primary Skin Irritation Rabbit.pdf");
|
||||
Context context = new Context(TEST_OUTPUT_DIR, new HashMap<>());
|
||||
|
||||
runForFile(file, context);
|
||||
|
||||
@ -65,7 +68,7 @@ public class VisualEqualityTest {
|
||||
public void assertVisualEqualityOfProcessedFolder() {
|
||||
|
||||
Path folder = Path.of("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles");
|
||||
Context context = new Context(stem, new HashMap<>());
|
||||
Context context = new Context(TEST_OUTPUT_DIR, new HashMap<>());
|
||||
|
||||
Files.walk(folder)
|
||||
.filter(Files::isRegularFile)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user