Compare commits

..

No commits in common. "master" and "2.30.0" have entirely different histories.

13 changed files with 149 additions and 244 deletions

View File

@ -27,7 +27,7 @@ repositories {
dependencies {
api("org.projectlombok:lombok:1.18.30")
api("com.google.guava:guava:33.0.0-jre")
api("com.pdftron:PDFNet:11.0.0")
api("com.pdftron:PDFNet:10.11.0")
testImplementation("net.sourceforge.lept4j:lept4j:1.19.1")
testImplementation("org.junit.jupiter:junit-jupiter:5.10.2")
testImplementation("org.assertj:assertj-core:3.24.2")

View File

@ -163,7 +163,7 @@ public class InvisibleElementRemovalService {
visitedXObjIds.add(page.getSDFObj().getObjNum());
try (InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder()
InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder()
.reader(reader)
.clippingPathStack(new ClippingPathStack(page.getMediaBox()))
.markedContentStack(new MarkedContentStack(pdfDoc))
@ -173,15 +173,14 @@ public class InvisibleElementRemovalService {
.visibleElements(new ElementFeatureLookup())
.visitedXObjIds(visitedXObjIds)
.markedContentToIgnore(markedContentToIgnore)
.build()) {
.build();
removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context);
removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context);
context.visitedXObjIds().clear();
context.markedContentStack().clear();
context.visitedXObjIds().clear();
context.markedContentStack().clear();
removeOverlappedElements(page, writer, context);
}
removeOverlappedElements(page, writer, context);
}
}
@ -249,7 +248,10 @@ public class InvisibleElementRemovalService {
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
if (inClippingPath) {
ImageFeatures imageFeatures = ElementFeatureFactory.buildImage(imageElement);
if (!(context.markedContentStack.contextHasTransparency() || imageFeatures.isTransparent() || imageFeatures.isImageMask() || imageFeatures.isSoftMask())) {
if (!(context.markedContentStack.contextHasTransparency()
|| imageFeatures.isTransparent()
|| imageFeatures.isImageMask()
|| imageFeatures.isSoftMask())) {
calculateOverlaps(context, imageFeatures, imageFeatures.isMasked());
}
context.visibleElements().add(imageFeatures);
@ -278,7 +280,7 @@ public class InvisibleElementRemovalService {
boolean isTextVisible = isTextRenderedVisibly(gState, textBBox, context);
if (inClippingPath && isTextVisible) {
context.visibleElements().add(ElementFeatureFactory.buildText(textElement, true, context.delta()));
context.visibleElements().add(ElementFeatureFactory.buildText(textElement, context.delta()));
}
if (!context.delta()) {
if (inClippingPath && isTextVisible) {
@ -290,8 +292,7 @@ public class InvisibleElementRemovalService {
Therefore, the position of a following Tj is affected by not writing the first Element.
This is why we write only the Tm command:
*/
textElement.setTextData(new byte[]{});
writer.writeElement(textElement);
writer.writeGStateChanges(textElement);
}
} else {
if (!inClippingPath) {
@ -432,7 +433,7 @@ public class InvisibleElementRemovalService {
context.reader().end();
if (!context.overlappedElements().isEmpty()) {
log.debug(context.overlappedElements().size() + " overlapped elements have not been found or removed");
log.warn(context.overlappedElements().size() + " overlapped elements have not been found or removed");
}
}
@ -491,8 +492,7 @@ public class InvisibleElementRemovalService {
Therefore, the position of a following Tj is affected by not writing the first Element.
This is why we write only the Tm command:
*/
element.setTextData(new byte[]{});
writer.writeElement(element);
writer.writeGStateChanges(element);
}
} else {
writer.writeElement(element);
@ -614,14 +614,7 @@ public class InvisibleElementRemovalService {
ElementFeatureLookup visibleElements,
Set<Long> visitedXObjIds,
Set<String> markedContentToIgnore
) implements AutoCloseable {
@Override
public void close() {
overlappedElements.close();
visibleElements.close();
}
) {
}

View File

@ -1,49 +0,0 @@
package com.iqser.red.pdftronlogic.commons;
import java.lang.reflect.Field;
import com.pdftron.pdf.Font;
import lombok.experimental.UtilityClass;
/**
 * Sanity checks for PDFNet objects that are backed by native handles.
 */
@UtilityClass
public class PDFNetUtils {

    /**
     * Verifies that the given font can still be used.
     * <p>
     * Two things are checked: the font's own handle must be non-zero, and, if the font's
     * ref handle (or one of its superclasses) declares a field named {@code impl}, that
     * field must be non-zero as well. If no such field exists, the second check is skipped.
     *
     * @param font the font to check
     * @throws AssertionError if the font handle is 0, if the {@code impl} field of the ref handle is 0,
     *                        or if the {@code impl} field cannot be read reflectively
     */
    @SuppressWarnings("PMD")
    public void requireFontNotClosed(Font font) {

        try {
            if (font.__GetHandle() == 0L) {
                throw new AssertionError("Font is already closed!");
            }

            Object refHandle = font.__GetRefHandle();
            Field implField = findImplField(refHandle.getClass());
            if (implField != null) {
                long implValue = (Long) implField.get(refHandle);
                if (implValue == 0L) {
                    throw new AssertionError("Associated ElementReader of Font is already closed!");
                }
            }
        } catch (IllegalAccessException e) {
            // Keep the reflective failure as the cause so the original stack trace is not lost.
            throw new AssertionError("Font Ref is missing the field impl, should never happen!", e);
        }
    }


    /**
     * Walks up the class hierarchy starting at {@code type} and returns the first declared
     * field named {@code impl}, made accessible, or {@code null} if no class in the chain declares one.
     */
    private static Field findImplField(Class<?> type) {

        for (Class<?> current = type; current != null; current = current.getSuperclass()) {
            try {
                Field implField = current.getDeclaredField("impl");
                implField.setAccessible(true);
                return implField;
            } catch (NoSuchFieldException ignored) {
                // Not declared on this level; continue with the superclass.
            }
        }
        return null;
    }

}

View File

@ -85,7 +85,7 @@ public class PdfTextExtraction {
for (Element element = reader.next(); element != null; element = reader.next()) {
switch (element.getType()) {
case Element.e_text -> textFeaturesOnPage.add(ElementFeatureFactory.buildText(element, includePathData, includePathData));
case Element.e_text -> textFeaturesOnPage.add(ElementFeatureFactory.buildText(element, includePathData));
case Element.e_form -> {
Obj formObj = element.getXObject();

View File

@ -1,5 +1,6 @@
package com.iqser.red.pdftronlogic.commons.features;
import java.awt.geom.GeneralPath;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
@ -13,7 +14,7 @@ import com.pdftron.pdf.Element;
import com.pdftron.pdf.Font;
import com.pdftron.pdf.GState;
import com.pdftron.pdf.Image;
import com.pdftron.sdf.Obj;
import com.pdftron.pdf.PathData;
import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;
@ -25,7 +26,7 @@ public class ElementFeatureFactory {
return switch (element.getType()) {
case Element.e_path -> buildPath(element);
case Element.e_text -> buildText(element, false, false);
case Element.e_text -> buildText(element);
case Element.e_image, Element.e_inline_image -> buildImage(element);
case Element.e_form -> buildForm(element);
// This technically should never happen, it's a safetynet
@ -71,7 +72,7 @@ public class ElementFeatureFactory {
boolean masked = false;
if (element.getType() == Element.e_image) {
Image image = new Image(element.getXObject());
if (image.getMask() != null && image.getMask().getType() == Obj.e_stream) {
if (image.getMask() != null) {
Image imageMask = new Image(image.getMask());
masked = imageMask.isImageMask();
}
@ -93,25 +94,27 @@ public class ElementFeatureFactory {
}
/**
 * Convenience overload that builds the text features of an element by delegating to
 * {@code buildText(element, false)}, i.e. with the boolean flag of the two-argument overload switched off.
 *
 * @param element the element to build features for
 * @return the text features produced by the two-argument overload
 * @throws PDFNetException propagated from the delegated call
 */
public TextFeatures buildText(Element element) throws PDFNetException {
return buildText(element, false);
}
/*
Use includeGlyphs = true and preComputePathData = true, when trying to draw the glyphs, see GlyphExtractionTest
precomputePathData = true is needed, when trying to access the PathData after the PDFDoc/ElementReader has been closed
Use includePathData = true, when trying to draw the glyphs, see GlyphExtractionTest
*/
public TextFeatures buildText(Element element, boolean includeGlyphs, boolean preComputePathData) throws PDFNetException {
public TextFeatures buildText(Element element, boolean includePathData) throws PDFNetException {
try (var bbox = element.getBBox()) {
TextFeatures.TextFeaturesBuilder<?, ?> simpleTextFeatures = TextFeatures.builder()
return TextFeatures.builder()
.elementType(element.getType())
.boundingBox(Converter.toRectangle2D(bbox))
.text(element.getTextString())
.font(element.getGState().getFont().getType())
.fontsize(element.getGState().getFontSize());
if (includeGlyphs) {
simpleTextFeatures.glyphs(extractGlyphInfo(element, preComputePathData));
}
return simpleTextFeatures.build();
.fontsize(element.getGState().getFontSize())
.glyphs(extractGlyphInfo(element, includePathData))
.build();
}
}
@ -136,7 +139,7 @@ public class ElementFeatureFactory {
@SneakyThrows
private List<GlyphInfo> extractGlyphInfo(Element textElement, boolean precomputePathData) {
private List<GlyphInfo> extractGlyphInfo(Element textElement, boolean includePathData) {
assert textElement != null && textElement.getType() == Element.e_text;
@ -154,29 +157,34 @@ public class ElementFeatureFactory {
}
List<GlyphInfo> glyphs = new ArrayList<>();
short unitsPerEm = font.getUnitsPerEm();
try (CharIterator charIterator = textElement.getCharIterator(); Matrix2D ctm = textElement.getCTM().multiply(textElement.getTextMatrix());) {
try (CharIterator charIterator = textElement.getCharIterator()) {
while (charIterator.hasNext()) {
CharData charData = charIterator.next();
long charCode = charData.getCharCode();
String glyphText = new String(font.mapToUnicode(charCode));
try (Matrix2D fontMatrix = computeFontMatrix(charData, textElement, unitsPerEm)) {
if (Character.isWhitespace(glyphText.charAt(0))) {
continue;
}
GlyphInfo glyph = GlyphInfo.builder() //
.charCode(charCode) //
.cachePathData(precomputePathData) //
.glyphMatrix(ctm.multiply(fontMatrix)) //
.font(font) //
.build();
glyphs.add(glyph);
if (precomputePathData) {
// call the functions once to cache all data
glyph.getBoundingBox();
try (Matrix2D fontMatrix = computeFontMatrix(charData, textElement, font); //
Matrix2D glyphMatrix = textElement.getCTM()//
.multiply(textElement.getTextMatrix())//
.multiply(fontMatrix)) {
PathData pathData = font.getGlyphPath(charCode, true, glyphMatrix);
if (pathData.getOperators().length == 1 && pathData.getOperators()[0] == 6) {
// This happens for some chinese characters or whitespaces, don't know why...
continue;
}
GeneralPath glyphPath = Converter.convertToGeneralPath(pathData);
GlyphInfo.GlyphInfoBuilder glyphInfo = GlyphInfo.builder().unicode(glyphText).bbox(glyphPath.getBounds2D());
if (includePathData) {
glyphInfo.pathData(pathData);
}
glyphs.add(glyphInfo.build());
}
}
}
@ -186,9 +194,9 @@ public class ElementFeatureFactory {
}
private Matrix2D computeFontMatrix(CharData charData, Element textElement, short unitsPerEm) throws PDFNetException {
private Matrix2D computeFontMatrix(CharData charData, Element textElement, Font font) throws PDFNetException {
double yScaleFactor = textElement.getGState().getFontSize() / unitsPerEm;
double yScaleFactor = textElement.getGState().getFontSize() / font.getUnitsPerEm();
double xScaleFactor = (textElement.getGState().getHorizontalScale() / 100) * yScaleFactor;
return new Matrix2D(xScaleFactor, 0, 0, -yScaleFactor, charData.getGlyphX(), charData.getGlyphY());

View File

@ -94,9 +94,4 @@ public class ElementFeatures {
return overlapShape.contains(ComparisonUtils.shrinkRectangle(boundingBox));
}
public void destroy() {
// do nothing, except for text
}
}

View File

@ -1,40 +1,30 @@
package com.iqser.red.pdftronlogic.commons.features;
import java.awt.Shape;
import java.awt.geom.Rectangle2D;
import java.util.Optional;
import com.iqser.red.pdftronlogic.commons.ComparisonUtils;
import com.iqser.red.pdftronlogic.commons.Converter;
import com.iqser.red.pdftronlogic.commons.PDFNetUtils;
import com.pdftron.common.Matrix2D;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.Font;
import com.pdftron.pdf.PathData;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Builder
@AllArgsConstructor
@RequiredArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class GlyphInfo {
final Matrix2D glyphMatrix;
final long charCode;
final Font font;
// in order to speed up invisible element removal, we only calculate the pathdata where necessary, as it is the costliest operation.
// It will only work as long as the associated ElementReader is still open, as the Font is bound to the ContentStream being read.
Rectangle2D bbox;
final boolean cachePathData;
PathData pathData;
@Getter
final String unicode;
@Getter
final Rectangle2D bbox;
final PathData pathData;
boolean overlapped;
ElementFeatures overlappingElement;
@ -45,12 +35,8 @@ public class GlyphInfo {
if (overlapped) {
return true;
}
Optional<Rectangle2D> bbox = getBoundingBox();
if (bbox.isEmpty()) {
return true;
}
if (overlappingElement.getOverlapShape().contains(ComparisonUtils.shrinkRectangle(bbox.get()))) {
if (overlappingElement.getOverlapShape().contains(ComparisonUtils.shrinkRectangle(bbox))) {
overlapped = true;
this.overlappingElement = overlappingElement;
}
@ -60,57 +46,25 @@ public class GlyphInfo {
}
public String getUnicode() {
public boolean matches(GlyphInfo glyph2) {
try {
return new String(font.mapToUnicode(charCode));
} catch (PDFNetException e) {
return "";
}
return unicode.equals(glyph2.unicode)//
&& calculateIntersectedArea(glyph2.bbox, bbox) > 0.9 * Math.min(bbox.getWidth() * bbox.getHeight(), glyph2.bbox.getHeight() * glyph2.bbox.getWidth());
}
@SneakyThrows
public Optional<PathData> getPathData() {
if (pathData == null) {
PDFNetUtils.requireFontNotClosed(font);
PathData computedPathData = font.getGlyphPath(charCode, true, glyphMatrix);
if (computedPathData.getOperators().length == 1 && computedPathData.getOperators()[0] == 6) {
// This happens for some chinese characters or whitespaces, don't know why...
return Optional.empty();
}
if (cachePathData) {
pathData = computedPathData;
}
return Optional.of(computedPathData);
}
return Optional.of(pathData);
return Optional.ofNullable(pathData);
}
@SneakyThrows
public Optional<Rectangle2D> getBoundingBox() {
private static double calculateIntersectedArea(Rectangle2D r1, Rectangle2D r2) {
if (bbox == null) {
Optional<PathData> pathData = getPathData();
if (pathData.isEmpty()) {
return Optional.empty();
}
bbox = Converter.convertToGeneralPath(pathData.get()).getBounds2D();
}
return Optional.of(bbox);
}
double xOverlap = Math.max(0, Math.min(r1.getMaxX(), r2.getMaxX()) - Math.max(r1.getMinX(), r2.getMinX()));
double yOverlap = Math.max(0, Math.min(r1.getMaxY(), r2.getMaxY()) - Math.max(r1.getY(), r2.getY()));
@SneakyThrows
public void destroy() {
if (glyphMatrix != null) {
glyphMatrix.close();
}
return xOverlap * yOverlap;
}
}

View File

@ -5,6 +5,8 @@ import static com.iqser.red.pdftronlogic.commons.ComparisonUtils.almostEqual;
import java.util.ArrayList;
import java.util.List;
import com.iqser.red.pdftronlogic.commons.ComparisonUtils;
import lombok.AccessLevel;
import lombok.Builder;
import lombok.EqualsAndHashCode;
@ -22,7 +24,6 @@ public class TextFeatures extends ElementFeatures {
String text;
int font;
double fontsize;
@Builder.Default
List<GlyphInfo> glyphs = new ArrayList<>();
@ -40,6 +41,20 @@ public class TextFeatures extends ElementFeatures {
}
/**
 * Compares the glyphs of this element with the glyphs of another text element, position by position.
 *
 * @param textFeaturesElement the text element whose glyphs are compared against this element's glyphs
 * @return {@code true} if both glyph lists have the same size and every glyph matches the glyph
 * at the same index in the other list; {@code false} otherwise
 */
private boolean glyphsMatch(TextFeatures textFeaturesElement) {

    List<GlyphInfo> otherGlyphs = textFeaturesElement.getGlyphs();
    int glyphCount = glyphs.size();
    if (glyphCount != otherGlyphs.size()) {
        return false;
    }

    // Advance while the pair at the current position matches; stopping early means a mismatch was found.
    int index = 0;
    while (index < glyphCount && glyphs.get(index).matches(otherGlyphs.get(index))) {
        index++;
    }
    return index == glyphCount;
}
public boolean testOverlapped(ElementFeatures overlappingElement) {
if (glyphs.isEmpty()) {
@ -50,11 +65,4 @@ public class TextFeatures extends ElementFeatures {
.allMatch(glyph -> glyph.testOverlapped(overlappingElement));
}
@Override
public void destroy() {
glyphs.forEach(GlyphInfo::destroy);
}
}

View File

@ -1,16 +1,15 @@
package com.iqser.red.pdftronlogic.commons.lookup;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.function.Consumer;
import java.util.function.Predicate;
import com.iqser.red.pdftronlogic.commons.Converter;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.Rect;
import lombok.AccessLevel;
@ -22,13 +21,13 @@ import lombok.experimental.FieldDefaults;
@Getter
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ElementFeatureLookup implements AutoCloseable {
public class ElementFeatureLookup {
/*
This class looks a bit weird and uses visitors since I tried to use the quadtree implementation by locationtech, as it uses Rectangles by default to query its data structure.
Unfortunately there were always edge cases where it lost a few elements making it completely unusable. Further, it didn't even speed up the algorithm all that much.
This class looks a bit weird and uses visitors because I originally tried the quadtree implementation by locationtech, which uses Rectangles by default to query its data structure.
Unfortunately, there were always edge cases where it lost a few elements, making it completely unusable. Further, it didn't even speed up the algorithm all that much.
*/
List<ElementFeatures> allElements = new ArrayList<>();
Set<ElementFeatures> allElements = new HashSet<>();
public void add(ElementFeatures elementFeatures) {
@ -70,23 +69,9 @@ public class ElementFeatureLookup implements AutoCloseable {
public List<ElementFeatures> findOverlapped(ElementFeatures overlappingElement, boolean textOnly) {
List<ElementFeatures> overlappedElementFeatures = new LinkedList<>();
for (int i = 0; i < allElements.size(); i++) {
ElementFeatures features = allElements.get(i);
if (textOnly && features.getElementType() != Element.e_text) {
continue;
}
if (features.getBoundingBox().intersects(overlappingElement.getBoundingBox())) {
if (features.testOverlapped(overlappingElement)) {
overlappedElementFeatures.add(features);
}
}
}
return overlappedElementFeatures;
OverlapVisitor overlapVisitor = new OverlapVisitor(overlappingElement, textOnly);
forEach(overlapVisitor::visitItem);
return overlapVisitor.getOverlappedElementFeatures();
}
@ -116,20 +101,13 @@ public class ElementFeatureLookup implements AutoCloseable {
public void addAll(List<ElementFeatures> currentOverlappedElements) {
allElements.addAll(currentOverlappedElements);
currentOverlappedElements.forEach(this::add);
}
public void removeAll(List<ElementFeatures> currentOverlappedElements) {
allElements.removeAll(currentOverlappedElements);
}
@Override
public void close() {
allElements.forEach(ElementFeatures::destroy);
currentOverlappedElements.forEach(this::remove);
}
}

View File

@ -0,0 +1,42 @@
package com.iqser.red.pdftronlogic.commons.lookup;
import java.util.LinkedList;
import java.util.List;
import com.iqser.red.pdftronlogic.commons.ComparisonUtils;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
import com.pdftron.pdf.Element;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Visitor that gathers every visited element which reports itself as overlapped by a given element.
 * <p>
 * An element is only tested when its padded bounding box intersects the padded bounding box of the
 * overlapping element. When {@code textOnly} is set, non-text elements are ignored entirely.
 */
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class OverlapVisitor implements ElementFeatureVisitor {

    ElementFeatures overlappingElement;
    boolean textOnly;
    @Getter
    List<ElementFeatures> overlappedElementFeatures = new LinkedList<>();


    /**
     * Adds {@code features} to the collected list if it passes the type filter, its padded bounding box
     * intersects the overlapping element's padded bounding box, and {@code testOverlapped} returns true.
     *
     * @param features the element currently being visited
     */
    @Override
    public void visitItem(ElementFeatures features) {

        boolean excludedByTypeFilter = textOnly && features.getElementType() != Element.e_text;
        if (excludedByTypeFilter) {
            return;
        }

        // Cheap bounding-box pre-check before the potentially more expensive overlap test.
        var candidateBox = ComparisonUtils.padRectangle(features.getBoundingBox());
        var overlappingBox = ComparisonUtils.padRectangle(overlappingElement.getBoundingBox());
        if (!candidateBox.intersects(overlappingBox)) {
            return;
        }

        if (features.testOverlapped(overlappingElement)) {
            overlappedElementFeatures.add(features);
        }
    }

}

View File

@ -8,9 +8,7 @@ import java.io.FileOutputStream;
import java.nio.file.Path;
import java.util.List;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import com.iqser.red.pdftronlogic.commons.features.GlyphInfo;
@ -25,11 +23,10 @@ import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
@Disabled // makes no sense to run in pipeline
public class GlyphExtractionTest {
@BeforeAll
static void init() {
@BeforeEach
void createService() {
PDFNet.initialize(PDFTronConfig.license);
}
@ -72,11 +69,11 @@ public class GlyphExtractionTest {
for (GlyphInfo glyph : textFeatures.getGlyphs()) {
if (glyph.getPathData().isPresent() && glyph.getBoundingBox().isPresent()) {
if (glyph.getPathData().isPresent()) {
drawPathData(glyph.getPathData().get(), builder, writer, Color.BLACK);
drawRect(ComparisonUtils.shrinkRectangle(glyph.getBoundingBox().get()), builder, writer, Color.RED);
drawRect(glyph.getBoundingBox().get(), builder, writer, Color.MAGENTA);
}
drawRect(ComparisonUtils.shrinkRectangle(glyph.getBbox()), builder, writer, Color.RED);
drawRect(glyph.getBbox(), builder, writer, Color.MAGENTA);
}
}

View File

@ -3,13 +3,11 @@ package com.iqser.red.pdftronlogic.commons;
import static com.iqser.red.pdftronlogic.commons.PdfTextExtraction.extractAllTextFromDocument;
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import lombok.extern.slf4j.Slf4j;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@ -24,16 +22,10 @@ class InvisibleElementRemovalServiceTest {
InvisibleElementRemovalService invisibleElementRemovalService;
@BeforeAll
static void init() {
@BeforeEach
void createService() {
PDFNet.initialize(PDFTronConfig.license);
}
@BeforeEach
void createServices() {
invisibleElementRemovalService = new InvisibleElementRemovalService();
}
@ -65,19 +57,6 @@ class InvisibleElementRemovalServiceTest {
}
@Test
@SneakyThrows
void page32DoesNotCrash() {
String fileName = "files/Page32.pdf";
try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new ByteArrayOutputStream()) {
invisibleElementRemovalService.removeInvisibleElements(in, out, false);
}
}
@Test
@SneakyThrows
void removeInvisibleTextClippedByFormObjects() {

Binary file not shown.