RED-9746: Document hardly editable
* revert quadtree lookup, since the lib does not seem to work reliably, also, no significant speed boost * check each individual glyph instead of only a text run and remember past overlaps in glyph * added logic to extract all glyphs exactly * check for optional content or transparency in form objects and marked content
This commit is contained in:
parent
01d1b35220
commit
dc970a64f3
@ -4,10 +4,8 @@ import static com.iqser.red.pdftronlogic.commons.VisualizationUtils.drawFeature;
|
||||
import static com.iqser.red.pdftronlogic.commons.VisualizationUtils.drawRect;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.Shape;
|
||||
import java.awt.geom.Area;
|
||||
import java.awt.geom.GeneralPath;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.ArrayList;
|
||||
@ -19,26 +17,21 @@ import java.util.TreeSet;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.features.ElementFeatureFactory;
|
||||
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
|
||||
import com.iqser.red.pdftronlogic.commons.features.GlyphInfo;
|
||||
import com.iqser.red.pdftronlogic.commons.features.ImageFeatures;
|
||||
import com.iqser.red.pdftronlogic.commons.features.PathFeatures;
|
||||
import com.iqser.red.pdftronlogic.commons.features.TextFeatures;
|
||||
import com.iqser.red.pdftronlogic.commons.lookup.ElementFeatureLookup;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.ColorPt;
|
||||
import com.pdftron.pdf.ColorSpace;
|
||||
import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.ElementBuilder;
|
||||
import com.pdftron.pdf.ElementReader;
|
||||
import com.pdftron.pdf.ElementWriter;
|
||||
import com.pdftron.pdf.GState;
|
||||
import com.pdftron.pdf.Image;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.pdf.PageIterator;
|
||||
import com.pdftron.pdf.PathData;
|
||||
import com.pdftron.pdf.Rect;
|
||||
import com.pdftron.pdf.ocg.Group;
|
||||
import com.pdftron.sdf.Obj;
|
||||
import com.pdftron.sdf.SDFDoc;
|
||||
|
||||
@ -254,7 +247,11 @@ public class InvisibleElementRemovalService {
|
||||
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
|
||||
if (inClippingPath) {
|
||||
ImageFeatures imageFeatures = ElementFeatureFactory.buildImage(imageElement);
|
||||
if (!(context.markedContentStack.contextHasTransparency() || imageFeatures.isTransparent() || imageFeatures.isImageMask() || imageFeatures.isSoftMask())) {
|
||||
if (!(context.markedContentStack.contextHasTransparency()
|
||||
|| imageFeatures.isTransparent()
|
||||
|| imageFeatures.isImageMask()
|
||||
|| imageFeatures.isSoftMask()
|
||||
|| imageFeatures.isMasked())) {
|
||||
calculateOverlaps(context, imageFeatures);
|
||||
}
|
||||
context.visibleElements().add(imageFeatures);
|
||||
|
||||
@ -13,6 +13,7 @@ import com.pdftron.pdf.CharIterator;
|
||||
import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.Font;
|
||||
import com.pdftron.pdf.GState;
|
||||
import com.pdftron.pdf.Image;
|
||||
import com.pdftron.pdf.PathData;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
@ -61,10 +62,21 @@ public class ElementFeatureFactory {
|
||||
|
||||
private ImageFeatures.ImageFeaturesBuilder<?, ?> buildImageBase(Element element) throws PDFNetException {
|
||||
|
||||
assert element.getType() == Element.e_image || element.getType() == Element.e_inline_image;
|
||||
try (var bbox = element.getBBox();) {
|
||||
boolean transparent = element.getGState().getBlendMode() != GState.e_bl_normal
|
||||
|| element.getGState().getFillOpacity() > 1
|
||||
|| element.getGState().getStrokeOpacity() > 1;
|
||||
|
||||
// see spec: 8.9.6.3 Explicit masking
|
||||
boolean masked = false;
|
||||
if (element.getType() == Element.e_image) {
|
||||
Image image = new Image(element.getXObject());
|
||||
if (image.getMask() != null) {
|
||||
Image imageMask = new Image(image.getMask());
|
||||
masked = imageMask.isImageMask();
|
||||
}
|
||||
}
|
||||
return ImageFeatures.builder()
|
||||
.elementType(element.getType())
|
||||
.boundingBox(Converter.toRectangle2D(bbox))
|
||||
@ -76,6 +88,7 @@ public class ElementFeatureFactory {
|
||||
.bitsPerComponent(element.getBitsPerComponent())
|
||||
.imageMask(element.isImageMask())
|
||||
.softMask(element.getGState().getSoftMask() != null)
|
||||
.masked(masked)
|
||||
.transparent(transparent);
|
||||
}
|
||||
}
|
||||
|
||||
@ -21,6 +21,7 @@ public class ImageFeatures extends ElementFeatures {
|
||||
int bitsPerComponent;
|
||||
boolean imageMask;
|
||||
boolean softMask;
|
||||
boolean masked;
|
||||
boolean transparent;
|
||||
String hashOfImage;
|
||||
|
||||
|
||||
@ -1,12 +1,13 @@
|
||||
package com.iqser.red.pdftronlogic.commons;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.nio.IntBuffer;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
@ -20,10 +21,18 @@ import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.rendering.GhostScriptService;
|
||||
import com.iqser.red.pdftronlogic.commons.rendering.ImageFile;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.PDFNet;
|
||||
import com.pdftron.sdf.SDFDoc;
|
||||
import com.sun.jna.Memory;
|
||||
import com.sun.jna.Native;
|
||||
import com.sun.jna.NativeLibrary;
|
||||
import com.sun.jna.Pointer;
|
||||
import com.sun.jna.ptr.PointerByReference;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import net.sourceforge.lept4j.Box;
|
||||
import net.sourceforge.lept4j.Boxa;
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.lept4j.util.LeptUtils;
|
||||
@ -31,11 +40,22 @@ import net.sourceforge.lept4j.util.LeptUtils;
|
||||
@Disabled // requires leptonica and ghostscript to be installed locally
|
||||
public class VisualEqualityTest {
|
||||
|
||||
public static final double SIMILARITY_THRESHOLD = 0.015; // percentage of pixels which differ by more than 10 points in luminance
|
||||
public static final String LEPTONICA_DIR = "/home/kschuettler/software/leptonica/vcpkg/installed/x64-linux-dynamic/lib/";
|
||||
/*
|
||||
We render both the original and the processed file and then compute a diff per page; we then threshold and invert the diff.
|
||||
This means, a visual difference of luminance greater than the threshold value shows up as a black pixel.
|
||||
We then use Heckbert's Seed Fill Algorithm to detect connected black regions by recursively flooding connected pixels.
|
||||
We then filter these error regions, keeping only those whose bounding-box area is at least ERROR_REGION_AREA_THRESHOLD.
|
||||
We do this because single-pixel errors are frequent but cannot be perceived by a human; they are most likely caused by floating-point inaccuracies.
|
||||
If there are any error regions left, we count the test as failed.
|
||||
*/
|
||||
private static final int ERROR_REGION_AREA_THRESHOLD = 10;
|
||||
public static final int LUMINANCE_DIFFERENCE_THRESHOLD = 170;
|
||||
|
||||
private static final Path TEST_OUTPUT_DIR = Path.of("/tmp/AAA_EQUALITY_TEST/");
|
||||
private static final String LEPTONICA_DIR = "/home/kschuettler/software/leptonica/vcpkg/installed/x64-linux-dynamic/lib/";
|
||||
|
||||
GhostScriptService ghostScriptService = new GhostScriptService();
|
||||
InvisibleElementRemovalService invisibleElementRemovalService = new InvisibleElementRemovalService();
|
||||
Path TEST_OUTPUT_DIR = Path.of("/tmp/AAA_EQUALITY_TEST/");
|
||||
|
||||
|
||||
@BeforeEach
|
||||
@ -54,20 +74,22 @@ public class VisualEqualityTest {
|
||||
@SneakyThrows
|
||||
public void assertVisualEqualityOfProcessedFile() {
|
||||
|
||||
Path file = Path.of("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/SOLICITA_VICTRATO-GOLD-II_Item 20_Sensibilizacao_02.pdf");
|
||||
Path file = Path.of("/home/kschuettler/Dokumente/TestFiles/full_syn_dm_testfiles/3977411_Final_Thiamethoxam_SL_MNLY.pdf");
|
||||
Context context = new Context(TEST_OUTPUT_DIR, new HashMap<>());
|
||||
|
||||
runForFile(file, context);
|
||||
|
||||
System.out.println(context);
|
||||
|
||||
assert context.failedFiles.isEmpty();
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void assertVisualEqualityOfProcessedFolder() {
|
||||
|
||||
Path folder = Path.of("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles");
|
||||
Path folder = Path.of("/home/kschuettler/Dokumente/TestFiles/full_syn_dm_testfiles");
|
||||
Context context = new Context(TEST_OUTPUT_DIR, new HashMap<>());
|
||||
|
||||
Files.walk(folder)
|
||||
@ -78,7 +100,6 @@ public class VisualEqualityTest {
|
||||
.peek(file -> runForFile(file, context))
|
||||
.forEach(f -> System.out.println(context));
|
||||
|
||||
|
||||
assert context.failedFiles.isEmpty();
|
||||
}
|
||||
|
||||
@ -91,17 +112,20 @@ public class VisualEqualityTest {
|
||||
Files.createDirectories(fileFolder);
|
||||
Path processedFile = fileFolder.resolve("processed.pdf");
|
||||
Path deltaFile = fileFolder.resolve("delta.pdf");
|
||||
Path copiedOriginFile = fileFolder.resolve("origin.pdf");
|
||||
Files.copy(originFile, copiedOriginFile, StandardCopyOption.REPLACE_EXISTING);
|
||||
Path savedOriginFile = fileFolder.resolve("origin.pdf");
|
||||
try (var in = new FileInputStream(originFile.toFile()); var out = new FileOutputStream(savedOriginFile.toFile())) {
|
||||
PDFDoc pdfDoc = new PDFDoc(in);
|
||||
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
|
||||
}
|
||||
|
||||
try (var in = new FileInputStream(copiedOriginFile.toFile()); var out = new FileOutputStream(processedFile.toFile())) {
|
||||
try (var in = new FileInputStream(originFile.toFile()); var out = new FileOutputStream(processedFile.toFile())) {
|
||||
invisibleElementRemovalService.removeInvisibleElements(in, out, false);
|
||||
}
|
||||
try (var in = new FileInputStream(copiedOriginFile.toFile()); var out = new FileOutputStream(deltaFile.toFile())) {
|
||||
try (var in = new FileInputStream(originFile.toFile()); var out = new FileOutputStream(deltaFile.toFile())) {
|
||||
invisibleElementRemovalService.removeInvisibleElements(in, out, true);
|
||||
}
|
||||
System.out.println("removed invisible elements");
|
||||
assertVisualEquality(originFile, processedFile, context);
|
||||
assertVisualEquality(savedOriginFile, processedFile, context);
|
||||
System.out.println("finished visual equality check");
|
||||
}
|
||||
|
||||
@ -147,10 +171,10 @@ public class VisualEqualityTest {
|
||||
}
|
||||
|
||||
String errorFile = context.getErrorFolder(originFile).resolve(originalPage.pageNumber() + ".tiff").toFile().toString();
|
||||
double diffRatio = detectErrors(originalPagePix, processedPagePix, errorFile);
|
||||
List<Rectangle2D> errorRegions = detectErrors(originalPagePix, processedPagePix, errorFile);
|
||||
|
||||
if (diffRatio > SIMILARITY_THRESHOLD) {
|
||||
context.getFailedFile(originFile).addErrorMessage("Page " + originalPage.pageNumber() + " differs by " + formatPercentage(diffRatio) + "%!");
|
||||
if (!errorRegions.isEmpty()) {
|
||||
context.getFailedFile(originFile).addErrorMessage("Page " + originalPage.pageNumber() + " has " + errorRegions.size() + " errors!");
|
||||
}
|
||||
|
||||
synchronized (VisualEqualityTest.class) {
|
||||
@ -161,37 +185,66 @@ public class VisualEqualityTest {
|
||||
}
|
||||
|
||||
|
||||
private static String formatPercentage(double diffRatio) {
|
||||
synchronized public List<Rectangle2D> detectErrors(Pix pix1, Pix pix2, String errorFile) {
|
||||
|
||||
return String.format("%.2f", diffRatio * 100);
|
||||
Pix pixDiff = Leptonica1.pixAbsDifference(pix1, pix2);
|
||||
|
||||
Pix pixThresh = Leptonica1.pixThresholdToBinary(pixDiff, LUMINANCE_DIFFERENCE_THRESHOLD);
|
||||
Leptonica1.pixInvert(pixThresh, pixThresh);
|
||||
// checks for connected black regions and outputs them as a list of boxes, a boxa
|
||||
Boxa boxa = Leptonica1.pixConnComp(pixThresh, null, 8);
|
||||
|
||||
List<Rectangle2D> errorRegions = readRectsFromBoxa(boxa).stream()
|
||||
.filter(box -> box.getWidth() * box.getHeight() >= ERROR_REGION_AREA_THRESHOLD)
|
||||
.toList();
|
||||
|
||||
if (!errorRegions.isEmpty()) {
|
||||
System.out.println("Found error(s) on page " + Path.of(errorFile).getFileName().toString().replace(".tiff", "") + ", writing error file.");
|
||||
// Boxa errorRegionsBoxa = pushRectsIntoBoxa(errorRegions); // this does not work
|
||||
// Pix errorPix = Leptonica1.pixDrawBoxa(pixThresh, errorRegionsBoxa, 2, -1); // somehow this runs forever
|
||||
Leptonica1.pixWrite(errorFile, pixThresh, 4);
|
||||
// LeptUtils.disposePix(errorPix);
|
||||
// LeptUtils.dispose(errorRegionsBoxa);
|
||||
}
|
||||
|
||||
LeptUtils.dispose(boxa);
|
||||
LeptUtils.disposePix(pixDiff);
|
||||
LeptUtils.disposePix(pixThresh);
|
||||
return errorRegions;
|
||||
}
|
||||
|
||||
|
||||
public double detectErrors(Pix pix1, Pix pix2, String errorFile) {
|
||||
// First, check if dimensions are the same
|
||||
if (pix1.w != pix2.w || pix1.h != pix2.h || pix1.d != pix2.d) {
|
||||
return 1;
|
||||
/**
 * Copies the boxes referenced by a Leptonica {@link Boxa} into a list of {@link Rectangle2D}.
 *
 * <p>Reads {@code boxa.n} native pointers from {@code boxa.box}, wraps each one in a {@link Box}
 * and copies its {@code x}, {@code y}, {@code w} and {@code h} fields into a
 * {@link Rectangle2D.Double}.
 *
 * @param boxa the box array to read; its {@code n} field determines how many entries are read
 * @return a new list with one rectangle per box, in the order the pointers are stored
 */
private static List<Rectangle2D> readRectsFromBoxa(Boxa boxa) {
|
||||
|
||||
// Fetch the native pointers of all boxes in one call, starting at offset 0.
Pointer[] pointers = boxa.box.getPointer().getPointerArray(0, boxa.n);
|
||||
List<Rectangle2D> boxes = new ArrayList<>(boxa.n);
|
||||
for (int i = 0; i < boxa.n; i++) {
|
||||
Box box = new Box(pointers[i]);
|
||||
boxes.add(new Rectangle2D.Double(box.x, box.y, box.w, box.h));
|
||||
// NOTE(review): each Box is disposed here, and detectErrors disposes the whole Boxa afterwards —
// confirm against the Leptonica/lept4j ownership rules that the same native box is not released twice.
LeptUtils.dispose(box);
|
||||
}
|
||||
return boxes;
|
||||
}
|
||||
|
||||
|
||||
private static Boxa pushRectsIntoBoxa(List<Rectangle2D> rects) {
|
||||
|
||||
Boxa boxa = new Boxa();
|
||||
boxa.n = rects.size();
|
||||
boxa.nalloc = rects.size();
|
||||
|
||||
Memory boxMemory = new Memory((long) Native.POINTER_SIZE * rects.size());
|
||||
|
||||
for (int i = 0; i < rects.size(); i++) {
|
||||
Rectangle2D rect = rects.get(i);
|
||||
Box box = new Box((int) rect.getX(), (int) rect.getY(), (int) rect.getWidth(), (int) rect.getHeight(), 0);
|
||||
|
||||
boxMemory.setPointer((long) i * Native.POINTER_SIZE, box.getPointer());
|
||||
}
|
||||
|
||||
// Create a new Pix for the absolute difference
|
||||
Pix pixDiff = Leptonica1.pixAbsDifference(pix1, pix2);
|
||||
boxa.box = new PointerByReference(boxMemory);
|
||||
|
||||
// Set a threshold for pixel difference (e.g., 10 out of 255)
|
||||
int threshold = 10;
|
||||
Pix pixThresh = Leptonica1.pixThresholdToBinary(pixDiff, threshold);
|
||||
|
||||
IntBuffer pCount = IntBuffer.allocate(1);
|
||||
Leptonica1.pixCountPixels(pixThresh, pCount, null);
|
||||
long totalPixels = (long) pix1.w * pix1.h;
|
||||
long samePixels = pCount.get();
|
||||
double percentDifference = 1 - (double) samePixels / totalPixels;
|
||||
if (percentDifference > SIMILARITY_THRESHOLD) {
|
||||
Leptonica1.pixWrite(errorFile, pixThresh, 5);
|
||||
}
|
||||
|
||||
LeptUtils.disposePix(pixDiff);
|
||||
LeptUtils.disposePix(pixThresh);
|
||||
return percentDifference;
|
||||
return boxa;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -30,7 +30,7 @@ public class GhostScriptService {
|
||||
int BATCH_SIZE = 256;
|
||||
String FORMAT = ".tiff";
|
||||
String DEVICE = "tiffgray";
|
||||
int DPI = 125;
|
||||
int DPI = 100;
|
||||
int PROCESS_COUNT = 1;
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user