RED-9746: Document hardly editable

* revert the quadtree lookup: the library does not seem to work reliably, and it gave no significant speed boost
* check each individual glyph instead of only a text run and remember past overlaps in glyph
* added logic to extract all glyphs exactly
* check for optional content or transparency in form objects and marked content
This commit is contained in:
Kilian Schuettler 2024-08-15 21:10:38 +02:00
parent aa3823c9db
commit 01d1b35220
3 changed files with 67 additions and 7 deletions

View File

@ -38,6 +38,7 @@ import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
import com.pdftron.pdf.PathData;
import com.pdftron.pdf.Rect;
import com.pdftron.pdf.ocg.Group;
import com.pdftron.sdf.Obj;
import com.pdftron.sdf.SDFDoc;
@ -172,7 +173,7 @@ public class InvisibleElementRemovalService {
InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder()
.reader(reader)
.clippingPathStack(new ClippingPathStack(page.getMediaBox()))
.markedContentStack(new MarkedContentStack())
.markedContentStack(new MarkedContentStack(pdfDoc))
.removePaths(removePaths)
.delta(delta)
.overlappedElements(new ElementFeatureLookup())
@ -251,10 +252,9 @@ public class InvisibleElementRemovalService {
}
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
if (inClippingPath) {
ImageFeatures imageFeatures = ElementFeatureFactory.buildImage(imageElement);
if (!(imageFeatures.isTransparent() || imageFeatures.isImageMask() || imageFeatures.isSoftMask())) {
if (!(context.markedContentStack.contextHasTransparency() || imageFeatures.isTransparent() || imageFeatures.isImageMask() || imageFeatures.isSoftMask())) {
calculateOverlaps(context, imageFeatures);
}
context.visibleElements().add(imageFeatures);
@ -328,9 +328,11 @@ public class InvisibleElementRemovalService {
if (!context.visitedXObjIds().contains(formObj.getObjNum())) {
context.visitedXObjIds().add(formObj.getObjNum());
// writer needs to be newly initialized when entering a new content stream
// see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
try (ElementWriter formWriter = new ElementWriter()) {
context.markedContentStack.enterForm(formElement);
context.clippingPathStack().enterNewGState();
try (var formElementBBOX = formElement.getBBox()) {
context.clippingPathStack().intersectClippingPath(Converter.toRectangle2D(formElementBBOX));
@ -344,6 +346,7 @@ public class InvisibleElementRemovalService {
formWriter.end();
context.reader().end();
context.clippingPathStack().leaveGState();
context.markedContentStack.leaveForm();
}
}
}
@ -386,7 +389,7 @@ public class InvisibleElementRemovalService {
}
if (inClippingPath) {
if (isFilledAndNonTransparent(pathElement)) {
if (!context.markedContentStack.contextHasTransparency() && isFilledAndNonTransparent(pathElement)) {
calculateOverlaps(context, pathFeatures);
}
context.visibleElements().add(ElementFeatureFactory.extractFeatures(pathElement));
@ -473,8 +476,16 @@ public class InvisibleElementRemovalService {
private static void removeOverlappedElement(ElementWriter writer, InvisibleElementRemovalContext context, Element element) throws PDFNetException {
try (Rect bbox = element.getBBox()) {
if (bbox == null) {
writer.writeElement(element);
return;
}
}
Optional<ElementFeatures> optionalElementMatch = context.overlappedElements()
.anyMatch(ElementFeatureFactory.extractFeatures(element));
if (optionalElementMatch.isPresent()) {
context.overlappedElements().remove(optionalElementMatch.get());
if (element.getType() == 3 && element.hasTextMatrix()) {

View File

@ -3,16 +3,45 @@ package com.iqser.red.pdftronlogic.commons;
import java.util.Deque;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Optional;
import java.util.Set;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.ocg.Group;
import com.pdftron.sdf.Obj;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
@RequiredArgsConstructor
public class MarkedContentStack {
private final PDFDoc pdfDoc;
Deque<MarkedContent> stack = new LinkedList<>();
Deque<Form> formStack = new LinkedList<>();
/**
 * Pushes an entry for the marked-content tag {@code name} onto the marked-content stack.
 *
 * NOTE(review): this diff hunk shows two push lines. The first (one-argument) line looks like
 * the pre-change version and the second its replacement, which additionally flags tags whose
 * name starts with "OC" as optional content - confirm against the actual file.
 *
 * @param name the marked-content tag name
 */
public void enterMarkedContent(String name) {
stack.push(new MarkedContent(name));
stack.push(new MarkedContent(name, name.startsWith("OC")));
}
/**
 * Registers a form XObject on the form stack.
 *
 * The pushed entry records the XObject's object number, whether its dictionary has an
 * "OC" entry, and whether it has a "Group" entry whose "S" value is the name "Transparency".
 *
 * @param formElement the form element being entered; its XObject is inspected
 */
@SneakyThrows
public void enterForm(Element formElement) {
    Obj xObject = formElement.getXObject();
    Obj optionalContentEntry = xObject.findObj("OC");
    Obj groupEntry = xObject.findObj("Group");

    // Only a group whose subtype "S" is the name "Transparency" counts as transparent.
    Obj subType = groupEntry == null ? null : groupEntry.findObj("S");
    boolean isTransparencyGroup = subType != null && subType.isName() && subType.getName().equals("Transparency");

    formStack.push(new Form(xObject.getObjNum(), optionalContentEntry != null, isTransparencyGroup));
}
@ -66,7 +95,27 @@ public class MarkedContentStack {
}
private record MarkedContent(String name) {
/**
 * Reports whether any currently open form or marked-content section is flagged.
 *
 * @return {@code true} if any form on the form stack is flagged as optional content or as
 *         transparency, or if any entry on the marked-content stack is flagged as optional
 *         content; {@code false} otherwise
 */
public boolean contextHasTransparency() {
    // Forms are checked first, then marked content; the first flagged entry decides.
    for (Form enclosingForm : formStack) {
        if (enclosingForm.optionalContent || enclosingForm.transparency) {
            return true;
        }
    }
    for (MarkedContent enclosingContent : stack) {
        if (enclosingContent.optionalContent()) {
            return true;
        }
    }
    return false;
}
/**
 * Removes the most recently entered form from the form stack.
 *
 * @throws java.util.NoSuchElementException if the form stack is empty
 */
public void leaveForm() {
    // The head of the deque is the form pushed last by enterForm.
    formStack.removeFirst();
}
/**
 * Entry of the marked-content stack.
 *
 * @param name            the marked-content tag name
 * @param optionalContent whether the entry is treated as optional content
 */
private record MarkedContent(String name, boolean optionalContent) {
}
/**
 * Entry of the form stack describing one entered form XObject.
 *
 * @param ref             object number of the form XObject
 * @param optionalContent whether the form XObject has an "OC" entry
 * @param transparency    whether the form XObject has a "Group" entry whose "S" value is the name "Transparency"
 */
private record Form(long ref, boolean optionalContent, boolean transparency) {
}

View File

@ -54,7 +54,7 @@ public class VisualEqualityTest {
@SneakyThrows
public void assertVisualEqualityOfProcessedFile() {
Path file = Path.of("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/ITEM 19_A15149AC - Primary Skin Irritation Rabbit.pdf");
Path file = Path.of("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/SOLICITA_VICTRATO-GOLD-II_Item 20_Sensibilizacao_02.pdf");
Context context = new Context(TEST_OUTPUT_DIR, new HashMap<>());
runForFile(file, context);