Compare commits

...

4 Commits

Author SHA1 Message Date
Kilian Schüttler
c9424a5f4b Merge branch 'RED-10365' into 'master'
RED-10365: InvisibleElementRemovalService crashes for specific file

Closes RED-10365

See merge request redactmanager/commons/pdftron-logic-commons!36
2024-11-05 12:23:28 +01:00
Kilian Schuettler
e86e6fba2a RED-10365: InvisibleElementRemovalService crashes for specific file 2024-11-05 12:18:29 +01:00
Kilian Schüttler
ff9fd7bd44 Merge branch 'RED-9864' into 'master'
RED-9864: Ocr not working

Closes RED-9864

See merge request redactmanager/commons/pdftron-logic-commons!35
2024-08-26 14:59:11 +02:00
Kilian Schüttler
e6a1656e18 RED-9864: Ocr not working 2024-08-26 14:59:10 +02:00
13 changed files with 244 additions and 149 deletions

View File

@ -27,7 +27,7 @@ repositories {
dependencies { dependencies {
api("org.projectlombok:lombok:1.18.30") api("org.projectlombok:lombok:1.18.30")
api("com.google.guava:guava:33.0.0-jre") api("com.google.guava:guava:33.0.0-jre")
api("com.pdftron:PDFNet:10.11.0") api("com.pdftron:PDFNet:11.0.0")
testImplementation("net.sourceforge.lept4j:lept4j:1.19.1") testImplementation("net.sourceforge.lept4j:lept4j:1.19.1")
testImplementation("org.junit.jupiter:junit-jupiter:5.10.2") testImplementation("org.junit.jupiter:junit-jupiter:5.10.2")
testImplementation("org.assertj:assertj-core:3.24.2") testImplementation("org.assertj:assertj-core:3.24.2")

View File

@ -163,7 +163,7 @@ public class InvisibleElementRemovalService {
visitedXObjIds.add(page.getSDFObj().getObjNum()); visitedXObjIds.add(page.getSDFObj().getObjNum());
InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder() try (InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder()
.reader(reader) .reader(reader)
.clippingPathStack(new ClippingPathStack(page.getMediaBox())) .clippingPathStack(new ClippingPathStack(page.getMediaBox()))
.markedContentStack(new MarkedContentStack(pdfDoc)) .markedContentStack(new MarkedContentStack(pdfDoc))
@ -173,14 +173,15 @@ public class InvisibleElementRemovalService {
.visibleElements(new ElementFeatureLookup()) .visibleElements(new ElementFeatureLookup())
.visitedXObjIds(visitedXObjIds) .visitedXObjIds(visitedXObjIds)
.markedContentToIgnore(markedContentToIgnore) .markedContentToIgnore(markedContentToIgnore)
.build(); .build()) {
removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context); removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context);
context.visitedXObjIds().clear(); context.visitedXObjIds().clear();
context.markedContentStack().clear(); context.markedContentStack().clear();
removeOverlappedElements(page, writer, context); removeOverlappedElements(page, writer, context);
}
} }
} }
@ -248,10 +249,7 @@ public class InvisibleElementRemovalService {
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight()); boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
if (inClippingPath) { if (inClippingPath) {
ImageFeatures imageFeatures = ElementFeatureFactory.buildImage(imageElement); ImageFeatures imageFeatures = ElementFeatureFactory.buildImage(imageElement);
if (!(context.markedContentStack.contextHasTransparency() if (!(context.markedContentStack.contextHasTransparency() || imageFeatures.isTransparent() || imageFeatures.isImageMask() || imageFeatures.isSoftMask())) {
|| imageFeatures.isTransparent()
|| imageFeatures.isImageMask()
|| imageFeatures.isSoftMask())) {
calculateOverlaps(context, imageFeatures, imageFeatures.isMasked()); calculateOverlaps(context, imageFeatures, imageFeatures.isMasked());
} }
context.visibleElements().add(imageFeatures); context.visibleElements().add(imageFeatures);
@ -280,7 +278,7 @@ public class InvisibleElementRemovalService {
boolean isTextVisible = isTextRenderedVisibly(gState, textBBox, context); boolean isTextVisible = isTextRenderedVisibly(gState, textBBox, context);
if (inClippingPath && isTextVisible) { if (inClippingPath && isTextVisible) {
context.visibleElements().add(ElementFeatureFactory.buildText(textElement, context.delta())); context.visibleElements().add(ElementFeatureFactory.buildText(textElement, true, context.delta()));
} }
if (!context.delta()) { if (!context.delta()) {
if (inClippingPath && isTextVisible) { if (inClippingPath && isTextVisible) {
@ -292,7 +290,8 @@ public class InvisibleElementRemovalService {
Therefore, the position of a following Tj is affected by not writing the first Element. Therefore, the position of a following Tj is affected by not writing the first Element.
This is why, we write only the Tm command: This is why, we write only the Tm command:
*/ */
writer.writeGStateChanges(textElement); textElement.setTextData(new byte[]{});
writer.writeElement(textElement);
} }
} else { } else {
if (!inClippingPath) { if (!inClippingPath) {
@ -433,7 +432,7 @@ public class InvisibleElementRemovalService {
context.reader().end(); context.reader().end();
if (!context.overlappedElements().isEmpty()) { if (!context.overlappedElements().isEmpty()) {
log.warn(context.overlappedElements().size() + " overlapped elements have not been found or removed"); log.debug(context.overlappedElements().size() + " overlapped elements have not been found or removed");
} }
} }
@ -492,7 +491,8 @@ public class InvisibleElementRemovalService {
Therefore, the position of a following Tj is affected by not writing the first Element. Therefore, the position of a following Tj is affected by not writing the first Element.
This is why, we write only the Tm command: This is why, we write only the Tm command:
*/ */
writer.writeGStateChanges(element); element.setTextData(new byte[]{});
writer.writeElement(element);
} }
} else { } else {
writer.writeElement(element); writer.writeElement(element);
@ -614,7 +614,14 @@ public class InvisibleElementRemovalService {
ElementFeatureLookup visibleElements, ElementFeatureLookup visibleElements,
Set<Long> visitedXObjIds, Set<Long> visitedXObjIds,
Set<String> markedContentToIgnore Set<String> markedContentToIgnore
) { ) implements AutoCloseable {
@Override
public void close() {
overlappedElements.close();
visibleElements.close();
}
} }

View File

@ -0,0 +1,49 @@
package com.iqser.red.pdftronlogic.commons;
import java.lang.reflect.Field;
import com.pdftron.pdf.Font;
import lombok.experimental.UtilityClass;
/**
 * Helpers for guarding against use of PDFNet objects whose native resources are gone.
 */
@UtilityClass
public class PDFNetUtils {

    /**
     * Asserts that the given font can still be used.
     *
     * <p>Two checks are performed:
     * <ol>
     *   <li>the font's own native handle ({@code __GetHandle()}) must be non-zero;</li>
     *   <li>if the font's ref handle ({@code __GetRefHandle()}) declares, anywhere in its class
     *       hierarchy, a field named {@code impl}, that field's value must be non-zero.</li>
     * </ol>
     * If no {@code impl} field can be found, the second check is skipped silently.
     *
     * @param font the font to check
     * @throws AssertionError if the font handle is zero, if the {@code impl} value of the ref handle
     *                        is zero, or if the {@code impl} field cannot be read reflectively
     */
    @SuppressWarnings("PMD")
    public void requireFontNotClosed(Font font) {

        if (font.__GetHandle() == 0L) {
            throw new AssertionError("Font is already closed!");
        }

        Object refHandle = font.__GetRefHandle();
        Field implField = findImplField(refHandle.getClass());
        if (implField == null) {
            // Nothing to inspect on this ref handle type; treated as "not closed".
            return;
        }

        long implValue;
        try {
            implValue = (Long) implField.get(refHandle);
        } catch (IllegalAccessException e) {
            // Keep the original exception as cause so the real failure stays visible in the stack trace.
            throw new AssertionError("Font Ref is missing the field impl, should never happen!", e);
        }

        if (implValue == 0L) {
            throw new AssertionError("Associated ElementReader of Font is already closed!");
        }
    }


    /**
     * Walks up the class hierarchy starting at {@code type} and returns the first declared field
     * named {@code impl}, made accessible, or {@code null} if no class in the hierarchy declares one.
     */
    private Field findImplField(Class<?> type) {

        for (Class<?> current = type; current != null; current = current.getSuperclass()) {
            try {
                Field implField = current.getDeclaredField("impl");
                implField.setAccessible(true);
                return implField;
            } catch (NoSuchFieldException ignored) {
                // Not declared on this level, continue with the superclass.
            }
        }
        return null;
    }

}

View File

@ -85,7 +85,7 @@ public class PdfTextExtraction {
for (Element element = reader.next(); element != null; element = reader.next()) { for (Element element = reader.next(); element != null; element = reader.next()) {
switch (element.getType()) { switch (element.getType()) {
case Element.e_text -> textFeaturesOnPage.add(ElementFeatureFactory.buildText(element, includePathData)); case Element.e_text -> textFeaturesOnPage.add(ElementFeatureFactory.buildText(element, includePathData, includePathData));
case Element.e_form -> { case Element.e_form -> {
Obj formObj = element.getXObject(); Obj formObj = element.getXObject();

View File

@ -1,6 +1,5 @@
package com.iqser.red.pdftronlogic.commons.features; package com.iqser.red.pdftronlogic.commons.features;
import java.awt.geom.GeneralPath;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.List; import java.util.List;
@ -14,7 +13,7 @@ import com.pdftron.pdf.Element;
import com.pdftron.pdf.Font; import com.pdftron.pdf.Font;
import com.pdftron.pdf.GState; import com.pdftron.pdf.GState;
import com.pdftron.pdf.Image; import com.pdftron.pdf.Image;
import com.pdftron.pdf.PathData; import com.pdftron.sdf.Obj;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import lombok.experimental.UtilityClass; import lombok.experimental.UtilityClass;
@ -26,7 +25,7 @@ public class ElementFeatureFactory {
return switch (element.getType()) { return switch (element.getType()) {
case Element.e_path -> buildPath(element); case Element.e_path -> buildPath(element);
case Element.e_text -> buildText(element); case Element.e_text -> buildText(element, false, false);
case Element.e_image, Element.e_inline_image -> buildImage(element); case Element.e_image, Element.e_inline_image -> buildImage(element);
case Element.e_form -> buildForm(element); case Element.e_form -> buildForm(element);
// This technically should never happen, it's a safetynet // This technically should never happen, it's a safetynet
@ -72,7 +71,7 @@ public class ElementFeatureFactory {
boolean masked = false; boolean masked = false;
if (element.getType() == Element.e_image) { if (element.getType() == Element.e_image) {
Image image = new Image(element.getXObject()); Image image = new Image(element.getXObject());
if (image.getMask() != null) { if (image.getMask() != null && image.getMask().getType() == Obj.e_stream) {
Image imageMask = new Image(image.getMask()); Image imageMask = new Image(image.getMask());
masked = imageMask.isImageMask(); masked = imageMask.isImageMask();
} }
@ -94,27 +93,25 @@ public class ElementFeatureFactory {
} }
public TextFeatures buildText(Element element) throws PDFNetException {
return buildText(element, false);
}
/* /*
Use includePathData = true, when trying to draw the glyphs, see GlyphExtractionTest Use includeGlyphs = true and preComputePathData = true, when trying to draw the glyphs, see GlyphExtractionTest
precomputePathData = true is needed, when trying to access the PathData after the PDFDoc/ElementReader has been closed
*/ */
public TextFeatures buildText(Element element, boolean includePathData) throws PDFNetException { public TextFeatures buildText(Element element, boolean includeGlyphs, boolean preComputePathData) throws PDFNetException {
try (var bbox = element.getBBox()) { try (var bbox = element.getBBox()) {
return TextFeatures.builder() TextFeatures.TextFeaturesBuilder<?, ?> simpleTextFeatures = TextFeatures.builder()
.elementType(element.getType()) .elementType(element.getType())
.boundingBox(Converter.toRectangle2D(bbox)) .boundingBox(Converter.toRectangle2D(bbox))
.text(element.getTextString()) .text(element.getTextString())
.font(element.getGState().getFont().getType()) .font(element.getGState().getFont().getType())
.fontsize(element.getGState().getFontSize()) .fontsize(element.getGState().getFontSize());
.glyphs(extractGlyphInfo(element, includePathData))
.build(); if (includeGlyphs) {
simpleTextFeatures.glyphs(extractGlyphInfo(element, preComputePathData));
}
return simpleTextFeatures.build();
} }
} }
@ -139,7 +136,7 @@ public class ElementFeatureFactory {
@SneakyThrows @SneakyThrows
private List<GlyphInfo> extractGlyphInfo(Element textElement, boolean includePathData) { private List<GlyphInfo> extractGlyphInfo(Element textElement, boolean precomputePathData) {
assert textElement != null && textElement.getType() == Element.e_text; assert textElement != null && textElement.getType() == Element.e_text;
@ -157,34 +154,29 @@ public class ElementFeatureFactory {
} }
List<GlyphInfo> glyphs = new ArrayList<>(); List<GlyphInfo> glyphs = new ArrayList<>();
short unitsPerEm = font.getUnitsPerEm();
try (CharIterator charIterator = textElement.getCharIterator(); Matrix2D ctm = textElement.getCTM().multiply(textElement.getTextMatrix());) {
try (CharIterator charIterator = textElement.getCharIterator()) {
while (charIterator.hasNext()) { while (charIterator.hasNext()) {
CharData charData = charIterator.next(); CharData charData = charIterator.next();
long charCode = charData.getCharCode(); long charCode = charData.getCharCode();
String glyphText = new String(font.mapToUnicode(charCode));
if (Character.isWhitespace(glyphText.charAt(0))) { try (Matrix2D fontMatrix = computeFontMatrix(charData, textElement, unitsPerEm)) {
continue;
}
try (Matrix2D fontMatrix = computeFontMatrix(charData, textElement, font); // GlyphInfo glyph = GlyphInfo.builder() //
Matrix2D glyphMatrix = textElement.getCTM()// .charCode(charCode) //
.multiply(textElement.getTextMatrix())// .cachePathData(precomputePathData) //
.multiply(fontMatrix)) { .glyphMatrix(ctm.multiply(fontMatrix)) //
PathData pathData = font.getGlyphPath(charCode, true, glyphMatrix); .font(font) //
if (pathData.getOperators().length == 1 && pathData.getOperators()[0] == 6) { .build();
// This happens for some chinese characters or whitespaces, don't know why...
continue; glyphs.add(glyph);
if (precomputePathData) {
// call the functions once to cache all data
glyph.getBoundingBox();
} }
GeneralPath glyphPath = Converter.convertToGeneralPath(pathData);
GlyphInfo.GlyphInfoBuilder glyphInfo = GlyphInfo.builder().unicode(glyphText).bbox(glyphPath.getBounds2D());
if (includePathData) {
glyphInfo.pathData(pathData);
}
glyphs.add(glyphInfo.build());
} }
} }
} }
@ -194,9 +186,9 @@ public class ElementFeatureFactory {
} }
private Matrix2D computeFontMatrix(CharData charData, Element textElement, Font font) throws PDFNetException { private Matrix2D computeFontMatrix(CharData charData, Element textElement, short unitsPerEm) throws PDFNetException {
double yScaleFactor = textElement.getGState().getFontSize() / font.getUnitsPerEm(); double yScaleFactor = textElement.getGState().getFontSize() / unitsPerEm;
double xScaleFactor = (textElement.getGState().getHorizontalScale() / 100) * yScaleFactor; double xScaleFactor = (textElement.getGState().getHorizontalScale() / 100) * yScaleFactor;
return new Matrix2D(xScaleFactor, 0, 0, -yScaleFactor, charData.getGlyphX(), charData.getGlyphY()); return new Matrix2D(xScaleFactor, 0, 0, -yScaleFactor, charData.getGlyphX(), charData.getGlyphY());

View File

@ -94,4 +94,9 @@ public class ElementFeatures {
return overlapShape.contains(ComparisonUtils.shrinkRectangle(boundingBox)); return overlapShape.contains(ComparisonUtils.shrinkRectangle(boundingBox));
} }
public void destroy() {
// do nothing, except for text
}
} }

View File

@ -1,30 +1,40 @@
package com.iqser.red.pdftronlogic.commons.features; package com.iqser.red.pdftronlogic.commons.features;
import java.awt.Shape;
import java.awt.geom.Rectangle2D; import java.awt.geom.Rectangle2D;
import java.util.Optional; import java.util.Optional;
import com.iqser.red.pdftronlogic.commons.ComparisonUtils; import com.iqser.red.pdftronlogic.commons.ComparisonUtils;
import com.iqser.red.pdftronlogic.commons.Converter;
import com.iqser.red.pdftronlogic.commons.PDFNetUtils;
import com.pdftron.common.Matrix2D;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.Font;
import com.pdftron.pdf.PathData; import com.pdftron.pdf.PathData;
import lombok.AccessLevel; import lombok.AccessLevel;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.Builder; import lombok.Builder;
import lombok.Getter;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults; import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Builder @Builder
@AllArgsConstructor @AllArgsConstructor
@RequiredArgsConstructor @RequiredArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE) @FieldDefaults(level = AccessLevel.PRIVATE)
public class GlyphInfo { public class GlyphInfo {
@Getter final Matrix2D glyphMatrix;
final String unicode; final long charCode;
@Getter final Font font;
final Rectangle2D bbox;
final PathData pathData; // in order to speed up invisible element removal, we only calculate the pathdata where necessary, as it is the costliest operation.
// It will only work as long as the associated ElementReader is still open, as the Font is bound to the ContentStream being read.
Rectangle2D bbox;
final boolean cachePathData;
PathData pathData;
boolean overlapped; boolean overlapped;
ElementFeatures overlappingElement; ElementFeatures overlappingElement;
@ -35,8 +45,12 @@ public class GlyphInfo {
if (overlapped) { if (overlapped) {
return true; return true;
} }
Optional<Rectangle2D> bbox = getBoundingBox();
if (bbox.isEmpty()) {
return true;
}
if (overlappingElement.getOverlapShape().contains(ComparisonUtils.shrinkRectangle(bbox))) { if (overlappingElement.getOverlapShape().contains(ComparisonUtils.shrinkRectangle(bbox.get()))) {
overlapped = true; overlapped = true;
this.overlappingElement = overlappingElement; this.overlappingElement = overlappingElement;
} }
@ -46,25 +60,57 @@ public class GlyphInfo {
} }
public boolean matches(GlyphInfo glyph2) { public String getUnicode() {
return unicode.equals(glyph2.unicode)// try {
&& calculateIntersectedArea(glyph2.bbox, bbox) > 0.9 * Math.min(bbox.getWidth() * bbox.getHeight(), glyph2.bbox.getHeight() * glyph2.bbox.getWidth()); return new String(font.mapToUnicode(charCode));
} catch (PDFNetException e) {
return "";
}
} }
@SneakyThrows
public Optional<PathData> getPathData() { public Optional<PathData> getPathData() {
return Optional.ofNullable(pathData); if (pathData == null) {
PDFNetUtils.requireFontNotClosed(font);
PathData computedPathData = font.getGlyphPath(charCode, true, glyphMatrix);
if (computedPathData.getOperators().length == 1 && computedPathData.getOperators()[0] == 6) {
// This happens for some chinese characters or whitespaces, don't know why...
return Optional.empty();
}
if (cachePathData) {
pathData = computedPathData;
}
return Optional.of(computedPathData);
}
return Optional.of(pathData);
} }
private static double calculateIntersectedArea(Rectangle2D r1, Rectangle2D r2) { @SneakyThrows
public Optional<Rectangle2D> getBoundingBox() {
double xOverlap = Math.max(0, Math.min(r1.getMaxX(), r2.getMaxX()) - Math.max(r1.getMinX(), r2.getMinX())); if (bbox == null) {
double yOverlap = Math.max(0, Math.min(r1.getMaxY(), r2.getMaxY()) - Math.max(r1.getY(), r2.getY())); Optional<PathData> pathData = getPathData();
if (pathData.isEmpty()) {
return Optional.empty();
}
bbox = Converter.convertToGeneralPath(pathData.get()).getBounds2D();
}
return Optional.of(bbox);
}
return xOverlap * yOverlap;
@SneakyThrows
public void destroy() {
if (glyphMatrix != null) {
glyphMatrix.close();
}
} }
} }

View File

@ -5,8 +5,6 @@ import static com.iqser.red.pdftronlogic.commons.ComparisonUtils.almostEqual;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import com.iqser.red.pdftronlogic.commons.ComparisonUtils;
import lombok.AccessLevel; import lombok.AccessLevel;
import lombok.Builder; import lombok.Builder;
import lombok.EqualsAndHashCode; import lombok.EqualsAndHashCode;
@ -24,6 +22,7 @@ public class TextFeatures extends ElementFeatures {
String text; String text;
int font; int font;
double fontsize; double fontsize;
@Builder.Default @Builder.Default
List<GlyphInfo> glyphs = new ArrayList<>(); List<GlyphInfo> glyphs = new ArrayList<>();
@ -41,20 +40,6 @@ public class TextFeatures extends ElementFeatures {
} }
private boolean glyphsMatch(TextFeatures textFeaturesElement) {
if (glyphs.size() != textFeaturesElement.getGlyphs().size()) {
return false;
}
for (int i = 0; i < glyphs.size(); i++) {
if (!glyphs.get(i).matches(textFeaturesElement.getGlyphs().get(i))) {
return false;
}
}
return true;
}
public boolean testOverlapped(ElementFeatures overlappingElement) { public boolean testOverlapped(ElementFeatures overlappingElement) {
if (glyphs.isEmpty()) { if (glyphs.isEmpty()) {
@ -65,4 +50,11 @@ public class TextFeatures extends ElementFeatures {
.allMatch(glyph -> glyph.testOverlapped(overlappingElement)); .allMatch(glyph -> glyph.testOverlapped(overlappingElement));
} }
@Override
public void destroy() {
glyphs.forEach(GlyphInfo::destroy);
}
} }

View File

@ -1,15 +1,16 @@
package com.iqser.red.pdftronlogic.commons.lookup; package com.iqser.red.pdftronlogic.commons.lookup;
import java.awt.geom.Rectangle2D; import java.awt.geom.Rectangle2D;
import java.util.HashSet; import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List; import java.util.List;
import java.util.Optional; import java.util.Optional;
import java.util.Set;
import java.util.function.Consumer; import java.util.function.Consumer;
import java.util.function.Predicate; import java.util.function.Predicate;
import com.iqser.red.pdftronlogic.commons.Converter; import com.iqser.red.pdftronlogic.commons.Converter;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures; import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.Rect; import com.pdftron.pdf.Rect;
import lombok.AccessLevel; import lombok.AccessLevel;
@ -21,13 +22,13 @@ import lombok.experimental.FieldDefaults;
@Getter @Getter
@RequiredArgsConstructor @RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ElementFeatureLookup { public class ElementFeatureLookup implements AutoCloseable {
/* /*
This class looks a bit weird and uses visitors since I tried to use the quadtree implementation by locationtech, since it uses Rectangles by default to query its data structure. This class looks a bit weird and uses visitors since I tried to use the quadtree implementation by locationtech, as it uses Rectangles by default to query its data structure.
Unfortunately there were always edge cases, where it lost a few elements making it completely unusable. Further, it didn't even speed up the algorithm all that much. Unfortunately there were always edge cases where it lost a few elements making it completely unusable. Further, it didn't even speed up the algorithm all that much.
*/ */
Set<ElementFeatures> allElements = new HashSet<>(); List<ElementFeatures> allElements = new ArrayList<>();
public void add(ElementFeatures elementFeatures) { public void add(ElementFeatures elementFeatures) {
@ -69,9 +70,23 @@ public class ElementFeatureLookup {
public List<ElementFeatures> findOverlapped(ElementFeatures overlappingElement, boolean textOnly) { public List<ElementFeatures> findOverlapped(ElementFeatures overlappingElement, boolean textOnly) {
OverlapVisitor overlapVisitor = new OverlapVisitor(overlappingElement, textOnly); List<ElementFeatures> overlappedElementFeatures = new LinkedList<>();
forEach(overlapVisitor::visitItem);
return overlapVisitor.getOverlappedElementFeatures(); for (int i = 0; i < allElements.size(); i++) {
ElementFeatures features = allElements.get(i);
if (textOnly && features.getElementType() != Element.e_text) {
continue;
}
if (features.getBoundingBox().intersects(overlappingElement.getBoundingBox())) {
if (features.testOverlapped(overlappingElement)) {
overlappedElementFeatures.add(features);
}
}
}
return overlappedElementFeatures;
} }
@ -101,13 +116,20 @@ public class ElementFeatureLookup {
public void addAll(List<ElementFeatures> currentOverlappedElements) { public void addAll(List<ElementFeatures> currentOverlappedElements) {
currentOverlappedElements.forEach(this::add); allElements.addAll(currentOverlappedElements);
} }
public void removeAll(List<ElementFeatures> currentOverlappedElements) { public void removeAll(List<ElementFeatures> currentOverlappedElements) {
currentOverlappedElements.forEach(this::remove); allElements.removeAll(currentOverlappedElements);
}
@Override
public void close() {
allElements.forEach(ElementFeatures::destroy);
} }
} }

View File

@ -1,42 +0,0 @@
package com.iqser.red.pdftronlogic.commons.lookup;
import java.util.LinkedList;
import java.util.List;
import com.iqser.red.pdftronlogic.commons.ComparisonUtils;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
import com.pdftron.pdf.Element;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Visitor that collects every visited element which is overlapped by a fixed reference element.
 *
 * <p>An element is collected when its padded bounding box intersects the padded bounding box of the
 * reference element and {@code testOverlapped} confirms the overlap. When {@code textOnly} is set,
 * all non-text elements are ignored.
 */
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class OverlapVisitor implements ElementFeatureVisitor {

    // The element that potentially covers the visited ones.
    ElementFeatures overlappingElement;
    // If true, only elements of type Element.e_text are considered.
    boolean textOnly;

    // Result list, filled by visitItem.
    @Getter
    List<ElementFeatures> overlappedElementFeatures = new LinkedList<>();


    /**
     * Tests a single element against the reference element and remembers it if it is overlapped.
     *
     * @param features the element to test
     */
    @Override
    public void visitItem(ElementFeatures features) {

        boolean filteredOut = textOnly && features.getElementType() != Element.e_text;
        if (filteredOut) {
            return;
        }

        var candidateBox = ComparisonUtils.padRectangle(features.getBoundingBox());
        var referenceBox = ComparisonUtils.padRectangle(overlappingElement.getBoundingBox());

        // Cheap bounding box test first, the detailed overlap test only runs on intersection.
        if (candidateBox.intersects(referenceBox) && features.testOverlapped(overlappingElement)) {
            overlappedElementFeatures.add(features);
        }
    }

}

View File

@ -8,7 +8,9 @@ import java.io.FileOutputStream;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.List; import java.util.List;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import com.iqser.red.pdftronlogic.commons.features.GlyphInfo; import com.iqser.red.pdftronlogic.commons.features.GlyphInfo;
@ -23,10 +25,11 @@ import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows; import lombok.SneakyThrows;
@Disabled // makes no sense to run in pipeline
public class GlyphExtractionTest { public class GlyphExtractionTest {
@BeforeEach @BeforeAll
void createService() { static void init() {
PDFNet.initialize(PDFTronConfig.license); PDFNet.initialize(PDFTronConfig.license);
} }
@ -69,11 +72,11 @@ public class GlyphExtractionTest {
for (GlyphInfo glyph : textFeatures.getGlyphs()) { for (GlyphInfo glyph : textFeatures.getGlyphs()) {
if (glyph.getPathData().isPresent()) { if (glyph.getPathData().isPresent() && glyph.getBoundingBox().isPresent()) {
drawPathData(glyph.getPathData().get(), builder, writer, Color.BLACK); drawPathData(glyph.getPathData().get(), builder, writer, Color.BLACK);
drawRect(ComparisonUtils.shrinkRectangle(glyph.getBoundingBox().get()), builder, writer, Color.RED);
drawRect(glyph.getBoundingBox().get(), builder, writer, Color.MAGENTA);
} }
drawRect(ComparisonUtils.shrinkRectangle(glyph.getBbox()), builder, writer, Color.RED);
drawRect(glyph.getBbox(), builder, writer, Color.MAGENTA);
} }
} }

View File

@ -3,11 +3,13 @@ package com.iqser.red.pdftronlogic.commons;
import static com.iqser.red.pdftronlogic.commons.PdfTextExtraction.extractAllTextFromDocument; import static com.iqser.red.pdftronlogic.commons.PdfTextExtraction.extractAllTextFromDocument;
import static org.assertj.core.api.AssertionsForClassTypes.assertThat; import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
import java.io.ByteArrayOutputStream;
import java.io.FileInputStream; import java.io.FileInputStream;
import java.io.FileOutputStream; import java.io.FileOutputStream;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
@ -22,10 +24,16 @@ class InvisibleElementRemovalServiceTest {
InvisibleElementRemovalService invisibleElementRemovalService; InvisibleElementRemovalService invisibleElementRemovalService;
@BeforeEach @BeforeAll
void createService() { static void init() {
PDFNet.initialize(PDFTronConfig.license); PDFNet.initialize(PDFTronConfig.license);
}
@BeforeEach
void createServices() {
invisibleElementRemovalService = new InvisibleElementRemovalService(); invisibleElementRemovalService = new InvisibleElementRemovalService();
} }
@ -57,6 +65,19 @@ class InvisibleElementRemovalServiceTest {
} }
@Test
@SneakyThrows
void page32DoesNotCrash() {
String fileName = "files/Page32.pdf";
try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new ByteArrayOutputStream()) {
invisibleElementRemovalService.removeInvisibleElements(in, out, false);
}
}
@Test @Test
@SneakyThrows @SneakyThrows
void removeInvisibleTextClippedByFormObjects() { void removeInvisibleTextClippedByFormObjects() {

Binary file not shown.