Compare commits

..

4 Commits

Author SHA1 Message Date
Kilian Schüttler
c9424a5f4b Merge branch 'RED-10365' into 'master'
RED-10365: InvisibleElementRemovalService crashes for specific file

Closes RED-10365

See merge request redactmanager/commons/pdftron-logic-commons!36
2024-11-05 12:23:28 +01:00
Kilian Schuettler
e86e6fba2a RED-10365: InvisibleElementRemovalService crashes for specific file 2024-11-05 12:18:29 +01:00
Kilian Schüttler
ff9fd7bd44 Merge branch 'RED-9864' into 'master'
RED-9864: Ocr not working

Closes RED-9864

See merge request redactmanager/commons/pdftron-logic-commons!35
2024-08-26 14:59:11 +02:00
Kilian Schüttler
e6a1656e18 RED-9864: Ocr not working 2024-08-26 14:59:10 +02:00
13 changed files with 244 additions and 149 deletions

View File

@ -27,7 +27,7 @@ repositories {
dependencies {
api("org.projectlombok:lombok:1.18.30")
api("com.google.guava:guava:33.0.0-jre")
api("com.pdftron:PDFNet:10.11.0")
api("com.pdftron:PDFNet:11.0.0")
testImplementation("net.sourceforge.lept4j:lept4j:1.19.1")
testImplementation("org.junit.jupiter:junit-jupiter:5.10.2")
testImplementation("org.assertj:assertj-core:3.24.2")

View File

@ -163,7 +163,7 @@ public class InvisibleElementRemovalService {
visitedXObjIds.add(page.getSDFObj().getObjNum());
InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder()
try (InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder()
.reader(reader)
.clippingPathStack(new ClippingPathStack(page.getMediaBox()))
.markedContentStack(new MarkedContentStack(pdfDoc))
@ -173,14 +173,15 @@ public class InvisibleElementRemovalService {
.visibleElements(new ElementFeatureLookup())
.visitedXObjIds(visitedXObjIds)
.markedContentToIgnore(markedContentToIgnore)
.build();
.build()) {
removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context);
removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context);
context.visitedXObjIds().clear();
context.markedContentStack().clear();
context.visitedXObjIds().clear();
context.markedContentStack().clear();
removeOverlappedElements(page, writer, context);
removeOverlappedElements(page, writer, context);
}
}
}
@ -248,10 +249,7 @@ public class InvisibleElementRemovalService {
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
if (inClippingPath) {
ImageFeatures imageFeatures = ElementFeatureFactory.buildImage(imageElement);
if (!(context.markedContentStack.contextHasTransparency()
|| imageFeatures.isTransparent()
|| imageFeatures.isImageMask()
|| imageFeatures.isSoftMask())) {
if (!(context.markedContentStack.contextHasTransparency() || imageFeatures.isTransparent() || imageFeatures.isImageMask() || imageFeatures.isSoftMask())) {
calculateOverlaps(context, imageFeatures, imageFeatures.isMasked());
}
context.visibleElements().add(imageFeatures);
@ -280,7 +278,7 @@ public class InvisibleElementRemovalService {
boolean isTextVisible = isTextRenderedVisibly(gState, textBBox, context);
if (inClippingPath && isTextVisible) {
context.visibleElements().add(ElementFeatureFactory.buildText(textElement, context.delta()));
context.visibleElements().add(ElementFeatureFactory.buildText(textElement, true, context.delta()));
}
if (!context.delta()) {
if (inClippingPath && isTextVisible) {
@ -292,7 +290,8 @@ public class InvisibleElementRemovalService {
Therefore, the position of a following Tj is affected by not writing the first Element.
This is why we write only the Tm command:
*/
writer.writeGStateChanges(textElement);
textElement.setTextData(new byte[]{});
writer.writeElement(textElement);
}
} else {
if (!inClippingPath) {
@ -433,7 +432,7 @@ public class InvisibleElementRemovalService {
context.reader().end();
if (!context.overlappedElements().isEmpty()) {
log.warn(context.overlappedElements().size() + " overlapped elements have not been found or removed");
log.debug(context.overlappedElements().size() + " overlapped elements have not been found or removed");
}
}
@ -492,7 +491,8 @@ public class InvisibleElementRemovalService {
Therefore, the position of a following Tj is affected by not writing the first Element.
This is why we write only the Tm command:
*/
writer.writeGStateChanges(element);
element.setTextData(new byte[]{});
writer.writeElement(element);
}
} else {
writer.writeElement(element);
@ -614,7 +614,14 @@ public class InvisibleElementRemovalService {
ElementFeatureLookup visibleElements,
Set<Long> visitedXObjIds,
Set<String> markedContentToIgnore
) {
) implements AutoCloseable {
/**
 * Closes both element lookups held by this context.
 * <p>
 * The second lookup is closed in a {@code finally} block so that it is still released
 * when closing the first one throws.
 */
@Override
public void close() {

    try {
        overlappedElements.close();
    } finally {
        visibleElements.close();
    }
}
}

View File

@ -0,0 +1,49 @@
package com.iqser.red.pdftronlogic.commons;
import java.lang.reflect.Field;
import com.pdftron.pdf.Font;
import lombok.experimental.UtilityClass;
/**
 * Reflection-based sanity checks for PDFNet objects.
 */
@UtilityClass
public class PDFNetUtils {

    /**
     * Verifies that the given font has not been closed.
     * <p>
     * Two checks are performed: the font's own handle must be non-zero, and, if the font's reference handle
     * exposes a field named {@code impl} anywhere in its class hierarchy, that field must be non-zero as well.
     * When no {@code impl} field can be found, the second check is skipped.
     *
     * @param font the font to check
     * @throws AssertionError if the font handle is zero, if the {@code impl} value of the reference handle is zero,
     *                        or if the {@code impl} field cannot be read reflectively
     */
    @SuppressWarnings("PMD")
    public void requireFontNotClosed(Font font) {

        try {
            if (font.__GetHandle() == 0L) {
                throw new AssertionError("Font is already closed!");
            }

            Object refHandle = font.__GetRefHandle();

            // Walk up the class hierarchy until a class declaring the field "impl" is found.
            Class<?> clazz = refHandle.getClass();
            Field implField = null;
            while (clazz != null) {
                try {
                    implField = clazz.getDeclaredField("impl");
                    implField.setAccessible(true);
                    break;
                } catch (NoSuchFieldException e) {
                    // not declared on this level, continue with the superclass
                    clazz = clazz.getSuperclass();
                }
            }

            if (implField != null) {
                long implValue = (Long) implField.get(refHandle);
                if (implValue == 0L) {
                    throw new AssertionError("Associated ElementReader of Font is already closed!");
                }
            }
        } catch (IllegalAccessException e) {
            // keep the reflective failure as the cause so the root problem stays visible in the stack trace
            throw new AssertionError("Font Ref is missing the field impl, should never happen!", e);
        }
    }

}

View File

@ -85,7 +85,7 @@ public class PdfTextExtraction {
for (Element element = reader.next(); element != null; element = reader.next()) {
switch (element.getType()) {
case Element.e_text -> textFeaturesOnPage.add(ElementFeatureFactory.buildText(element, includePathData));
case Element.e_text -> textFeaturesOnPage.add(ElementFeatureFactory.buildText(element, includePathData, includePathData));
case Element.e_form -> {
Obj formObj = element.getXObject();

View File

@ -1,6 +1,5 @@
package com.iqser.red.pdftronlogic.commons.features;
import java.awt.geom.GeneralPath;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
@ -14,7 +13,7 @@ import com.pdftron.pdf.Element;
import com.pdftron.pdf.Font;
import com.pdftron.pdf.GState;
import com.pdftron.pdf.Image;
import com.pdftron.pdf.PathData;
import com.pdftron.sdf.Obj;
import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;
@ -26,7 +25,7 @@ public class ElementFeatureFactory {
return switch (element.getType()) {
case Element.e_path -> buildPath(element);
case Element.e_text -> buildText(element);
case Element.e_text -> buildText(element, false, false);
case Element.e_image, Element.e_inline_image -> buildImage(element);
case Element.e_form -> buildForm(element);
// This technically should never happen; it's a safety net
@ -72,7 +71,7 @@ public class ElementFeatureFactory {
boolean masked = false;
if (element.getType() == Element.e_image) {
Image image = new Image(element.getXObject());
if (image.getMask() != null) {
if (image.getMask() != null && image.getMask().getType() == Obj.e_stream) {
Image imageMask = new Image(image.getMask());
masked = imageMask.isImageMask();
}
@ -94,27 +93,25 @@ public class ElementFeatureFactory {
}
public TextFeatures buildText(Element element) throws PDFNetException {
return buildText(element, false);
}
/*
Use includePathData = true, when trying to draw the glyphs, see GlyphExtractionTest
Use includeGlyphs = true and preComputePathData = true when trying to draw the glyphs; see GlyphExtractionTest.
preComputePathData = true is needed when trying to access the PathData after the PDFDoc/ElementReader has been closed.
*/
public TextFeatures buildText(Element element, boolean includePathData) throws PDFNetException {
public TextFeatures buildText(Element element, boolean includeGlyphs, boolean preComputePathData) throws PDFNetException {
try (var bbox = element.getBBox()) {
return TextFeatures.builder()
TextFeatures.TextFeaturesBuilder<?, ?> simpleTextFeatures = TextFeatures.builder()
.elementType(element.getType())
.boundingBox(Converter.toRectangle2D(bbox))
.text(element.getTextString())
.font(element.getGState().getFont().getType())
.fontsize(element.getGState().getFontSize())
.glyphs(extractGlyphInfo(element, includePathData))
.build();
.fontsize(element.getGState().getFontSize());
if (includeGlyphs) {
simpleTextFeatures.glyphs(extractGlyphInfo(element, preComputePathData));
}
return simpleTextFeatures.build();
}
}
@ -139,7 +136,7 @@ public class ElementFeatureFactory {
@SneakyThrows
private List<GlyphInfo> extractGlyphInfo(Element textElement, boolean includePathData) {
private List<GlyphInfo> extractGlyphInfo(Element textElement, boolean precomputePathData) {
assert textElement != null && textElement.getType() == Element.e_text;
@ -157,34 +154,29 @@ public class ElementFeatureFactory {
}
List<GlyphInfo> glyphs = new ArrayList<>();
short unitsPerEm = font.getUnitsPerEm();
try (CharIterator charIterator = textElement.getCharIterator(); Matrix2D ctm = textElement.getCTM().multiply(textElement.getTextMatrix());) {
try (CharIterator charIterator = textElement.getCharIterator()) {
while (charIterator.hasNext()) {
CharData charData = charIterator.next();
long charCode = charData.getCharCode();
String glyphText = new String(font.mapToUnicode(charCode));
if (Character.isWhitespace(glyphText.charAt(0))) {
continue;
}
try (Matrix2D fontMatrix = computeFontMatrix(charData, textElement, unitsPerEm)) {
try (Matrix2D fontMatrix = computeFontMatrix(charData, textElement, font); //
Matrix2D glyphMatrix = textElement.getCTM()//
.multiply(textElement.getTextMatrix())//
.multiply(fontMatrix)) {
PathData pathData = font.getGlyphPath(charCode, true, glyphMatrix);
if (pathData.getOperators().length == 1 && pathData.getOperators()[0] == 6) {
// This happens for some chinese characters or whitespaces, don't know why...
continue;
GlyphInfo glyph = GlyphInfo.builder() //
.charCode(charCode) //
.cachePathData(precomputePathData) //
.glyphMatrix(ctm.multiply(fontMatrix)) //
.font(font) //
.build();
glyphs.add(glyph);
if (precomputePathData) {
// call the functions once to cache all data
glyph.getBoundingBox();
}
GeneralPath glyphPath = Converter.convertToGeneralPath(pathData);
GlyphInfo.GlyphInfoBuilder glyphInfo = GlyphInfo.builder().unicode(glyphText).bbox(glyphPath.getBounds2D());
if (includePathData) {
glyphInfo.pathData(pathData);
}
glyphs.add(glyphInfo.build());
}
}
}
@ -194,9 +186,9 @@ public class ElementFeatureFactory {
}
private Matrix2D computeFontMatrix(CharData charData, Element textElement, Font font) throws PDFNetException {
private Matrix2D computeFontMatrix(CharData charData, Element textElement, short unitsPerEm) throws PDFNetException {
double yScaleFactor = textElement.getGState().getFontSize() / font.getUnitsPerEm();
double yScaleFactor = textElement.getGState().getFontSize() / unitsPerEm;
double xScaleFactor = (textElement.getGState().getHorizontalScale() / 100) * yScaleFactor;
return new Matrix2D(xScaleFactor, 0, 0, -yScaleFactor, charData.getGlyphX(), charData.getGlyphY());

View File

@ -94,4 +94,9 @@ public class ElementFeatures {
return overlapShape.contains(ComparisonUtils.shrinkRectangle(boundingBox));
}
/**
 * Hook for releasing resources held by this element; this implementation does nothing.
 */
public void destroy() {
// do nothing, except for text
}
}

View File

@ -1,30 +1,40 @@
package com.iqser.red.pdftronlogic.commons.features;
import java.awt.Shape;
import java.awt.geom.Rectangle2D;
import java.util.Optional;
import com.iqser.red.pdftronlogic.commons.ComparisonUtils;
import com.iqser.red.pdftronlogic.commons.Converter;
import com.iqser.red.pdftronlogic.commons.PDFNetUtils;
import com.pdftron.common.Matrix2D;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.Font;
import com.pdftron.pdf.PathData;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Builder
@AllArgsConstructor
@RequiredArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class GlyphInfo {
@Getter
final String unicode;
@Getter
final Rectangle2D bbox;
final PathData pathData;
final Matrix2D glyphMatrix;
final long charCode;
final Font font;
// In order to speed up invisible element removal, we only calculate the PathData where necessary, as it is the costliest operation.
// It will only work as long as the associated ElementReader is still open, as the Font is bound to the ContentStream being read.
Rectangle2D bbox;
final boolean cachePathData;
PathData pathData;
boolean overlapped;
ElementFeatures overlappingElement;
@ -35,8 +45,12 @@ public class GlyphInfo {
if (overlapped) {
return true;
}
Optional<Rectangle2D> bbox = getBoundingBox();
if (bbox.isEmpty()) {
return true;
}
if (overlappingElement.getOverlapShape().contains(ComparisonUtils.shrinkRectangle(bbox))) {
if (overlappingElement.getOverlapShape().contains(ComparisonUtils.shrinkRectangle(bbox.get()))) {
overlapped = true;
this.overlappingElement = overlappingElement;
}
@ -46,25 +60,57 @@ public class GlyphInfo {
}
public boolean matches(GlyphInfo glyph2) {
public String getUnicode() {
return unicode.equals(glyph2.unicode)//
&& calculateIntersectedArea(glyph2.bbox, bbox) > 0.9 * Math.min(bbox.getWidth() * bbox.getHeight(), glyph2.bbox.getHeight() * glyph2.bbox.getWidth());
try {
return new String(font.mapToUnicode(charCode));
} catch (PDFNetException e) {
return "";
}
}
@SneakyThrows
public Optional<PathData> getPathData() {
return Optional.ofNullable(pathData);
if (pathData == null) {
PDFNetUtils.requireFontNotClosed(font);
PathData computedPathData = font.getGlyphPath(charCode, true, glyphMatrix);
if (computedPathData.getOperators().length == 1 && computedPathData.getOperators()[0] == 6) {
// This happens for some Chinese characters or whitespace; the reason is unknown.
return Optional.empty();
}
if (cachePathData) {
pathData = computedPathData;
}
return Optional.of(computedPathData);
}
return Optional.of(pathData);
}
private static double calculateIntersectedArea(Rectangle2D r1, Rectangle2D r2) {
@SneakyThrows
public Optional<Rectangle2D> getBoundingBox() {
double xOverlap = Math.max(0, Math.min(r1.getMaxX(), r2.getMaxX()) - Math.max(r1.getMinX(), r2.getMinX()));
double yOverlap = Math.max(0, Math.min(r1.getMaxY(), r2.getMaxY()) - Math.max(r1.getY(), r2.getY()));
if (bbox == null) {
Optional<PathData> pathData = getPathData();
if (pathData.isEmpty()) {
return Optional.empty();
}
bbox = Converter.convertToGeneralPath(pathData.get()).getBounds2D();
}
return Optional.of(bbox);
}
return xOverlap * yOverlap;
/**
 * Closes the glyph matrix of this glyph, if one is set.
 */
@SneakyThrows
public void destroy() {

    if (glyphMatrix == null) {
        return;
    }
    glyphMatrix.close();
}
}

View File

@ -5,8 +5,6 @@ import static com.iqser.red.pdftronlogic.commons.ComparisonUtils.almostEqual;
import java.util.ArrayList;
import java.util.List;
import com.iqser.red.pdftronlogic.commons.ComparisonUtils;
import lombok.AccessLevel;
import lombok.Builder;
import lombok.EqualsAndHashCode;
@ -24,6 +22,7 @@ public class TextFeatures extends ElementFeatures {
String text;
int font;
double fontsize;
@Builder.Default
List<GlyphInfo> glyphs = new ArrayList<>();
@ -41,20 +40,6 @@ public class TextFeatures extends ElementFeatures {
}
private boolean glyphsMatch(TextFeatures textFeaturesElement) {
if (glyphs.size() != textFeaturesElement.getGlyphs().size()) {
return false;
}
for (int i = 0; i < glyphs.size(); i++) {
if (!glyphs.get(i).matches(textFeaturesElement.getGlyphs().get(i))) {
return false;
}
}
return true;
}
public boolean testOverlapped(ElementFeatures overlappingElement) {
if (glyphs.isEmpty()) {
@ -65,4 +50,11 @@ public class TextFeatures extends ElementFeatures {
.allMatch(glyph -> glyph.testOverlapped(overlappingElement));
}
/**
 * Destroys every glyph of this text element.
 */
@Override
public void destroy() {

    for (GlyphInfo glyph : glyphs) {
        glyph.destroy();
    }
}
}

View File

@ -1,15 +1,16 @@
package com.iqser.red.pdftronlogic.commons.lookup;
import java.awt.geom.Rectangle2D;
import java.util.HashSet;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.function.Consumer;
import java.util.function.Predicate;
import com.iqser.red.pdftronlogic.commons.Converter;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.Rect;
import lombok.AccessLevel;
@ -21,13 +22,13 @@ import lombok.experimental.FieldDefaults;
@Getter
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ElementFeatureLookup {
public class ElementFeatureLookup implements AutoCloseable {
/*
This class looks a bit weird and uses visitors since I tried to use the quadtree implementation by locationtech, since it uses Rectangles by default to query its data structure.
Unfortunately there were always edge cases, where it lost a few elements making it completely unusable. Further, it didn't even speed up the algorithm all that much.
This class looks a bit weird and uses visitors since I tried to use the quadtree implementation by locationtech, as it uses Rectangles by default to query its data structure.
Unfortunately there were always edge cases where it lost a few elements making it completely unusable. Further, it didn't even speed up the algorithm all that much.
*/
Set<ElementFeatures> allElements = new HashSet<>();
List<ElementFeatures> allElements = new ArrayList<>();
public void add(ElementFeatures elementFeatures) {
@ -69,9 +70,23 @@ public class ElementFeatureLookup {
public List<ElementFeatures> findOverlapped(ElementFeatures overlappingElement, boolean textOnly) {
OverlapVisitor overlapVisitor = new OverlapVisitor(overlappingElement, textOnly);
forEach(overlapVisitor::visitItem);
return overlapVisitor.getOverlappedElementFeatures();
List<ElementFeatures> overlappedElementFeatures = new LinkedList<>();
for (int i = 0; i < allElements.size(); i++) {
ElementFeatures features = allElements.get(i);
if (textOnly && features.getElementType() != Element.e_text) {
continue;
}
if (features.getBoundingBox().intersects(overlappingElement.getBoundingBox())) {
if (features.testOverlapped(overlappingElement)) {
overlappedElementFeatures.add(features);
}
}
}
return overlappedElementFeatures;
}
@ -101,13 +116,20 @@ public class ElementFeatureLookup {
public void addAll(List<ElementFeatures> currentOverlappedElements) {
currentOverlappedElements.forEach(this::add);
allElements.addAll(currentOverlappedElements);
}
public void removeAll(List<ElementFeatures> currentOverlappedElements) {
currentOverlappedElements.forEach(this::remove);
allElements.removeAll(currentOverlappedElements);
}
/**
 * Destroys every element stored in this lookup.
 */
@Override
public void close() {

    for (ElementFeatures element : allElements) {
        element.destroy();
    }
}
}

View File

@ -1,42 +0,0 @@
package com.iqser.red.pdftronlogic.commons.lookup;
import java.util.LinkedList;
import java.util.List;
import com.iqser.red.pdftronlogic.commons.ComparisonUtils;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
import com.pdftron.pdf.Element;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults;
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class OverlapVisitor implements ElementFeatureVisitor {
ElementFeatures overlappingElement;
boolean textOnly;
@Getter
List<ElementFeatures> overlappedElementFeatures = new LinkedList<>();
@Override
public void visitItem(ElementFeatures features) {
if (textOnly && features.getElementType() != Element.e_text) {
return;
}
if (ComparisonUtils.padRectangle(features.getBoundingBox()).intersects(ComparisonUtils.padRectangle(overlappingElement.getBoundingBox()))) {
if (features.testOverlapped(overlappingElement)) {
overlappedElementFeatures.add(features);
}
}
}
}

View File

@ -8,7 +8,9 @@ import java.io.FileOutputStream;
import java.nio.file.Path;
import java.util.List;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import com.iqser.red.pdftronlogic.commons.features.GlyphInfo;
@ -23,10 +25,11 @@ import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
@Disabled // makes no sense to run in pipeline
public class GlyphExtractionTest {
@BeforeEach
void createService() {
@BeforeAll
static void init() {
PDFNet.initialize(PDFTronConfig.license);
}
@ -69,11 +72,11 @@ public class GlyphExtractionTest {
for (GlyphInfo glyph : textFeatures.getGlyphs()) {
if (glyph.getPathData().isPresent()) {
if (glyph.getPathData().isPresent() && glyph.getBoundingBox().isPresent()) {
drawPathData(glyph.getPathData().get(), builder, writer, Color.BLACK);
drawRect(ComparisonUtils.shrinkRectangle(glyph.getBoundingBox().get()), builder, writer, Color.RED);
drawRect(glyph.getBoundingBox().get(), builder, writer, Color.MAGENTA);
}
drawRect(ComparisonUtils.shrinkRectangle(glyph.getBbox()), builder, writer, Color.RED);
drawRect(glyph.getBbox(), builder, writer, Color.MAGENTA);
}
}

View File

@ -3,11 +3,13 @@ package com.iqser.red.pdftronlogic.commons;
import static com.iqser.red.pdftronlogic.commons.PdfTextExtraction.extractAllTextFromDocument;
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import lombok.extern.slf4j.Slf4j;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@ -22,10 +24,16 @@ class InvisibleElementRemovalServiceTest {
InvisibleElementRemovalService invisibleElementRemovalService;
@BeforeEach
void createService() {
@BeforeAll
static void init() {
PDFNet.initialize(PDFTronConfig.license);
}
@BeforeEach
void createServices() {
invisibleElementRemovalService = new InvisibleElementRemovalService();
}
@ -57,6 +65,19 @@ class InvisibleElementRemovalServiceTest {
}
/**
 * Regression test: running the invisible element removal on the Page32 test file must complete without throwing.
 */
@Test
@SneakyThrows
void page32DoesNotCrash() {

    String fileName = "files/Page32.pdf";

    try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new ByteArrayOutputStream()) {
        // getResourceAsStream returns null for a missing resource; fail with a clear message instead of passing null on
        assertThat(in).isNotNull();
        invisibleElementRemovalService.removeInvisibleElements(in, out, false);
    }
}
@Test
@SneakyThrows
void removeInvisibleTextClippedByFormObjects() {

Binary file not shown.