Compare commits

..

20 Commits

Author SHA1 Message Date
Kilian Schüttler
c9424a5f4b Merge branch 'RED-10365' into 'master'
RED-10365: InvisibleElementRemovalService crashes for specific file

Closes RED-10365

See merge request redactmanager/commons/pdftron-logic-commons!36
2024-11-05 12:23:28 +01:00
Kilian Schuettler
e86e6fba2a RED-10365: InvisibleElementRemovalService crashes for specific file 2024-11-05 12:18:29 +01:00
Kilian Schüttler
ff9fd7bd44 Merge branch 'RED-9864' into 'master'
RED-9864: Ocr not working

Closes RED-9864

See merge request redactmanager/commons/pdftron-logic-commons!35
2024-08-26 14:59:11 +02:00
Kilian Schüttler
e6a1656e18 RED-9864: Ocr not working 2024-08-26 14:59:10 +02:00
Kilian Schüttler
b42bb29e5e Merge branch 'RED-9746' into 'master'
RED-9746: Document hardly editable

Closes RED-9746

See merge request redactmanager/commons/pdftron-logic-commons!34
2024-08-19 10:50:55 +02:00
Kilian Schüttler
de7e58d897 RED-9746: Document hardly editable 2024-08-19 10:50:55 +02:00
Corina Olariu
0b19f2d04c Merge branch 'RED-8701' into 'master'
RED-8701 - Move files to customer data repositories

Closes RED-8701

See merge request redactmanager/commons/pdftron-logic-commons!33
2024-04-24 11:56:01 +02:00
Corina Olariu
666c247f6a RED-8701 - Move files to customer data repositories
- update syngenta submodule
2024-04-23 14:59:36 +03:00
Corina Olariu
7b35b53a54 RED-8701 - Move files to customer data repositories
- update syngenta submodule
2024-04-23 10:50:11 +03:00
Corina Olariu
90470b41a0 RED-8701 - Move files to customer data repositories
- update unit tests with the new path to submodules for customer files
- remove customer files from project
2024-04-22 14:17:02 +03:00
Corina Olariu
bf7d374b0a RED-8701 - Move files to customer data repositories
- use git lfs to store customer files
2024-04-22 00:58:21 +03:00
Timo Bejan
16ef5afe90 Merge branch 'memory-optimisations' into 'master'
Memory optimisations

See merge request redactmanager/commons/pdftron-logic-commons!32
2024-03-15 08:55:34 +01:00
Timo Bejan
1b4ab8dc88 memory optimisations 2024-03-15 09:47:34 +02:00
Andrei Isvoran
e926083881 Merge branch 'RED-8359-deploy' into 'master'
RED-8359 - Fix gradle deploy

Closes RED-8359

See merge request redactmanager/commons/pdftron-logic-commons!25
2024-02-08 11:55:24 +01:00
Andrei Isvoran
d9677f37f7 RED-8359 - Fix gradle deploy 2024-02-08 12:48:09 +02:00
Andrei Isvoran
73d77624e9 Merge branch 'RED-8359' into 'master'
RED-8359 - Migrate to gradle

Closes RED-8359

See merge request redactmanager/commons/pdftron-logic-commons!23
2024-02-08 10:11:09 +01:00
Andrei Isvoran
4b0a06b8ba RED-8359 - Migrate to gradle 2024-02-08 10:11:08 +01:00
Kilian Schüttler
3bbd13daca Merge branch 'RED-8385' into 'master'
RED-8385: add functionality to ignore specific marked contents

Closes RED-8385

See merge request redactmanager/commons/pdftron-logic-commons!22
2024-02-01 12:26:06 +01:00
Kilian Schuettler
09d8ac4e9c RED-8385: add functionality to ignore specific marked contents 2024-02-01 12:22:20 +01:00
Kilian Schuettler
4888ac1608 RED-8385: add functionality to ignore specific marked contents 2024-02-01 12:21:19 +01:00
49 changed files with 1246914 additions and 821 deletions

8
.gitignore vendored
View File

@ -27,3 +27,11 @@
**/classpath-data.json
**/dependencies-and-licenses-overview.txt
git.tag
gradle.properties
gradlew
gradlew.bat
gradle/
**/.gradle
**/build

View File

@ -1,4 +1,25 @@
variables:
# SONAR_PROJECT_KEY: 'com.iqser.red.commons:pdftron-logic-commons'
GIT_SUBMODULE_STRATEGY: recursive
GIT_SUBMODULE_FORCE_HTTPS: 'true'
include:
- project: 'gitlab/gitlab'
ref: 'main'
file: 'ci-templates/maven_deps.yml'
file: 'ci-templates/gradle_java.yml'
deploy:
stage: deploy
tags:
- dind
script:
- echo "Building with gradle version ${BUILDVERSION}"
- gradle -Pversion=${BUILDVERSION} publish
- echo "BUILDVERSION=$BUILDVERSION" >> version.env
artifacts:
reports:
dotenv: version.env
rules:
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
- if: $CI_COMMIT_BRANCH =~ /^release/
- if: $CI_COMMIT_TAG

6
.gitmodules vendored Normal file
View File

@ -0,0 +1,6 @@
[submodule "src/test/resources/files/syngenta"]
path = src/test/resources/files/syngenta
url = https://gitlab.knecon.com/fforesight/documents/syngenta.git
[submodule "src/test/resources/files/basf"]
path = src/test/resources/files/basf
url = https://gitlab.knecon.com/fforesight/documents/basf.git

107
build.gradle.kts Normal file
View File

@ -0,0 +1,107 @@
plugins {
`java-library`
`maven-publish`
`kotlin-dsl`
pmd
checkstyle
jacoco
id("io.freefair.lombok") version "8.4"
id("org.sonarqube") version "4.0.0.2929"
}
repositories {
mavenLocal()
maven {
url = uri("https://pdftron.com/maven/release")
}
maven {
url = uri("https://nexus.knecon.com/repository/gindev/");
credentials {
username = providers.gradleProperty("mavenUser").getOrNull();
password = providers.gradleProperty("mavenPassword").getOrNull();
}
}
mavenCentral()
}
dependencies {
api("org.projectlombok:lombok:1.18.30")
api("com.google.guava:guava:33.0.0-jre")
api("com.pdftron:PDFNet:11.0.0")
testImplementation("net.sourceforge.lept4j:lept4j:1.19.1")
testImplementation("org.junit.jupiter:junit-jupiter:5.10.2")
testImplementation("org.assertj:assertj-core:3.24.2")
testImplementation("org.mockito:mockito-core:5.2.0")
testImplementation("org.apache.logging.log4j:log4j-slf4j2-impl:2.22.1")
compileOnly("org.slf4j:slf4j-api:2.0.11")
}
group = "com.iqser.red.commons"
description = "pdftron-logic-commons"
java.sourceCompatibility = JavaVersion.VERSION_17
java.targetCompatibility = JavaVersion.VERSION_17
publishing {
publications {
create<MavenPublication>("mavenJava") {
from(components["java"])
}
}
repositories {
maven {
url = uri("https://nexus.knecon.com/repository/red-platform-releases/")
credentials {
username = providers.gradleProperty("mavenUser").getOrNull();
password = providers.gradleProperty("mavenPassword").getOrNull();
}
}
}
}
tasks.withType<PublishToMavenRepository> {
onlyIf { publication.name == "mavenJava" }
}
pmd {
isConsoleOutput = true
}
// Configure the rule set on each PMD task itself. Writing `pmd.ruleSetFiles` inside these blocks
// would assign the shared project-level `pmd` extension instead, so the main and test
// configurations would overwrite each other.
tasks.pmdMain {
    ruleSetFiles = files("${rootDir}/config/pmd/pmd.xml")
}
// Test sources are checked against their own ruleset file.
tasks.pmdTest {
    ruleSetFiles = files("${rootDir}/config/pmd/test_pmd.xml")
}
tasks.named<Test>("test") {
useJUnitPlatform()
reports {
junitXml.outputLocation.set(layout.buildDirectory.dir("reports/junit"))
}
}
sonarqube {
properties {
property("sonar.login", providers.gradleProperty("sonarToken").getOrNull())
property("sonar.host.url", "https://sonarqube.knecon.com")
}
}
tasks.test {
finalizedBy(tasks.jacocoTestReport)
}
tasks.jacocoTestReport {
dependsOn(tasks.test)
reports {
xml.required.set(true)
csv.required.set(false)
html.outputLocation.set(layout.buildDirectory.dir("jacocoHtml"))
}
}
java {
withJavadocJar()
}

View File

@ -0,0 +1,39 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE module PUBLIC "-//Puppy Crawl//DTD Check Configuration 1.3//EN"
"http://www.puppycrawl.com/dtds/configuration_1_3.dtd">
<module name="Checker">
<property
name="severity"
value="error"/>
<module name="TreeWalker">
<module name="SuppressWarningsHolder"/>
<module name="MissingDeprecated"/>
<module name="MissingOverride"/>
<module name="AnnotationLocation"/>
<module name="JavadocStyle"/>
<module name="NonEmptyAtclauseDescription"/>
<module name="IllegalImport"/>
<module name="RedundantImport"/>
<module name="RedundantModifier"/>
<module name="EmptyBlock"/>
<module name="DefaultComesLast"/>
<module name="EmptyStatement"/>
<module name="EqualsHashCode"/>
<module name="ExplicitInitialization"/>
<module name="IllegalInstantiation"/>
<module name="ModifiedControlVariable"/>
<module name="MultipleVariableDeclarations"/>
<module name="PackageDeclaration"/>
<module name="ParameterAssignment"/>
<module name="SimplifyBooleanExpression"/>
<module name="SimplifyBooleanReturn"/>
<module name="StringLiteralEquality"/>
<module name="OneStatementPerLine"/>
<module name="FinalClass"/>
<module name="ArrayTypeStyle"/>
<module name="UpperEll"/>
<module name="OuterTypeFilename"/>
</module>
<module name="FileTabCharacter"/>
<module name="SuppressWarningsFilter"/>
</module>

20
config/pmd/pmd.xml Normal file
View File

@ -0,0 +1,20 @@
<?xml version="1.0"?>
<ruleset name="Custom ruleset"
xmlns="http://pmd.sourceforge.net/ruleset/2.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://pmd.sourceforge.net/ruleset/2.0.0 http://pmd.sourceforge.net/ruleset_2_0_0.xsd">
<description>
Knecon ruleset checks the code for bad stuff
</description>
<rule ref="category/java/errorprone.xml">
<exclude name="MissingSerialVersionUID"/>
<exclude name="AvoidLiteralsInIfCondition"/>
<exclude name="AvoidDuplicateLiterals"/>
<exclude name="NullAssignment"/>
<exclude name="AssignmentInOperand"/>
<exclude name="BeanMembersShouldSerialize"/>
</rule>
</ruleset>

22
config/pmd/test_pmd.xml Normal file
View File

@ -0,0 +1,22 @@
<?xml version="1.0"?>
<ruleset name="Custom ruleset"
xmlns="http://pmd.sourceforge.net/ruleset/2.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://pmd.sourceforge.net/ruleset/2.0.0 http://pmd.sourceforge.net/ruleset_2_0_0.xsd">
<description>
Knecon test ruleset checks the code for bad stuff
</description>
<rule ref="category/java/errorprone.xml">
<exclude name="MissingSerialVersionUID"/>
<exclude name="AvoidLiteralsInIfCondition"/>
<exclude name="AvoidDuplicateLiterals"/>
<exclude name="NullAssignment"/>
<exclude name="AssignmentInOperand"/>
<exclude name="TestClassWithoutTestCases"/>
<exclude name="BeanMembersShouldSerialize"/>
</rule>
</ruleset>

1
gradle.properties.kts Normal file
View File

@ -0,0 +1 @@
version = 2.0-SNAPSHOT

94
pom.xml
View File

@ -1,94 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>platform-dependency</artifactId>
<groupId>com.iqser.red</groupId>
<version>2.2.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<artifactId>pdftron-logic-commons</artifactId>
<groupId>com.iqser.red.commons</groupId>
<version>2.0-SNAPSHOT</version>
<dependencies>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-slf4j2-impl</artifactId>
<version>2.20.0</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
</dependency>
<dependency>
<groupId>com.pdftron</groupId>
<artifactId>PDFNet</artifactId>
<version>10.3.0</version>
<scope>provided</scope>
</dependency>
<!-- Test Dependencies -->
</dependencies>
<build>
<plugins>
<plugin>
<!-- create a test jar for the api classes to be used by other modules -->
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<executions>
<execution>
<goals>
<goal>test-jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.jacoco</groupId>
<artifactId>jacoco-maven-plugin</artifactId>
<executions>
<execution>
<id>prepare-agent</id>
<goals>
<goal>prepare-agent</goal>
</goals>
</execution>
<execution>
<id>report</id>
<goals>
<goal>report</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
<repositories>
<repository>
<id>pdftron</id>
<name>PDFNet Maven</name>
<url>https://pdftron.com/maven/release</url>
</repository>
</repositories>
</project>

1
settings.gradle.kts Normal file
View File

@ -0,0 +1 @@
rootProject.name = "pdftron-logic-commons"

View File

@ -1,7 +1,5 @@
package com.iqser.red.pdftronlogic.commons;
import static com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService.TOLERANCE;
import java.awt.geom.Area;
@ -35,6 +33,13 @@ public class ClippingPathStack {
}
@SneakyThrows
public void intersectClippingPath(Rectangle2D path) {
getCurrentClippingPath().intersect(new Area(path));
}
public boolean almostIntersects(double x, double y, double width, double height) {
// To address inconsistencies in the calculation of the bounding box we slightly increase the rectangle
// Height or width are zero for straight lines, even though they are being rendered. Therefore, height or width must be at minimum >0.
@ -56,15 +61,16 @@ public class ClippingPathStack {
public void enterNewGState() {
Area current = stack.peek();
Area cloned = new Area();
cloned.add(current);
Area cloned = (Area) current.clone();
stack.push(cloned);
}
public void leaveGState() {
stack.pop();
// somehow this greatly helps memory management
var popped = stack.pop();
popped.reset();
}
}
}

View File

@ -0,0 +1,76 @@
package com.iqser.red.pdftronlogic.commons;
import static com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService.TOLERANCE;
import java.awt.Shape;
import java.awt.geom.Rectangle2D;
import lombok.experimental.UtilityClass;
/**
 * Geometry helpers for tolerance-based comparisons of rectangles and coordinates.
 * The single-argument overloads use {@code InvisibleElementRemovalService.TOLERANCE}.
 */
@UtilityClass
public class ComparisonUtils {

    /**
     * Shrinks the rectangle on every side by the default {@code TOLERANCE}.
     *
     * @param inner rectangle to shrink
     * @return a new, smaller rectangle
     */
    public Rectangle2D shrinkRectangle(Rectangle2D inner) {

        return shrinkRectangle(inner, TOLERANCE);
    }


    /**
     * Shrinks the rectangle on every side by {@code tolerance}. A dimension that would become
     * 0.1 or smaller is clamped to 0.1.
     *
     * @param rect      rectangle to shrink
     * @param tolerance amount removed from each side
     * @return a new rectangle; the input is not modified
     */
    public Rectangle2D shrinkRectangle(Rectangle2D rect, double tolerance) {

        final double minExtent = 1e-1;

        double shrunkWidth = rect.getWidth() - 2 * tolerance;
        double shrunkHeight = rect.getHeight() - 2 * tolerance;
        boolean widthCollapsed = shrunkWidth <= minExtent;
        boolean heightCollapsed = shrunkHeight <= minExtent;

        // NOTE(review): for a collapsed dimension the origin is placed half the minimum extent
        // past the original origin, not centred on the original rectangle - confirm this is intended.
        double x = widthCollapsed ? rect.getX() + minExtent / 2 : rect.getX() + tolerance;
        double y = heightCollapsed ? rect.getY() + minExtent / 2 : rect.getY() + tolerance;
        double width = widthCollapsed ? minExtent : shrunkWidth;
        double height = heightCollapsed ? minExtent : shrunkHeight;

        return new Rectangle2D.Double(x, y, width, height);
    }


    /**
     * Grows the rectangle on every side by the default {@code TOLERANCE}.
     *
     * @param inner rectangle to pad
     * @return a new, larger rectangle
     */
    public Rectangle2D padRectangle(Rectangle2D inner) {

        return padRectangle(inner, TOLERANCE);
    }


    /**
     * Grows the rectangle on every side by {@code tolerance}. A resulting width or height that is
     * not positive is replaced by 0.01.
     *
     * @param rect      rectangle to pad
     * @param tolerance amount added to each side
     * @return a new rectangle; the input is not modified
     */
    public Rectangle2D padRectangle(Rectangle2D rect, double tolerance) {

        final double fallbackExtent = 1e-2;

        double paddedWidth = rect.getWidth() + 2 * tolerance;
        double paddedHeight = rect.getHeight() + 2 * tolerance;

        double width = paddedWidth <= 0 ? fallbackExtent : paddedWidth;
        double height = paddedHeight <= 0 ? fallbackExtent : paddedHeight;

        return new Rectangle2D.Double(rect.getX() - tolerance, rect.getY() - tolerance, width, height);
    }


    /**
     * Checks whether {@code outer} contains {@code inner} after {@code inner} has been shrunk by
     * the default tolerance.
     *
     * @param outer containing shape
     * @param inner rectangle expected to lie (almost) inside {@code outer}
     * @return {@code true} if the shrunken rectangle lies entirely inside {@code outer}
     */
    public boolean almostContains(Shape outer, Rectangle2D inner) {

        return outer.contains(ComparisonUtils.shrinkRectangle(inner));
    }


    /**
     * Checks whether two values differ by less than {@code TOLERANCE}.
     *
     * @param a first value
     * @param b second value
     * @return {@code true} if the absolute difference is strictly below the tolerance
     */
    public static boolean almostEqual(double a, double b) {

        double difference = Math.abs(a - b);
        return difference < TOLERANCE;
    }

}

View File

@ -3,8 +3,11 @@ package com.iqser.red.pdftronlogic.commons;
import java.awt.Color;
import java.awt.geom.AffineTransform;
import java.awt.geom.GeneralPath;
import java.awt.geom.PathIterator;
import java.awt.geom.Rectangle2D;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import com.google.common.primitives.Bytes;
import com.google.common.primitives.Doubles;
@ -21,16 +24,18 @@ import lombok.experimental.UtilityClass;
@UtilityClass
public class Converter {
public GeneralPath convertToGeneralPath(PathData pathData) throws PDFNetException {
public GeneralPath convertToGeneralPath(PathData pathData) {
GeneralPath linePath = new GeneralPath();
Iterator<Double> points = Doubles.asList(pathData.getPoints()).iterator();
Iterator<Double> points = Doubles.asList(pathData.getPoints())
.iterator();
Iterable<Byte> operators = Bytes.asList(pathData.getOperators());
for (var operator : operators) {
switch (operator) {
case PathData.e_moveto -> linePath.moveTo(points.next(), points.next());
case PathData.e_lineto -> linePath.lineTo(points.next(), points.next());
case PathData.e_cubicto -> linePath.curveTo(points.next(), points.next(), points.next(), points.next(), points.next(), points.next());
case PathData.e_conicto -> linePath.quadTo(points.next(), points.next(), points.next(), points.next());
case PathData.e_closepath -> linePath.closePath();
case PathData.e_rect -> {
double x = points.next();
@ -43,14 +48,67 @@ public class Converter {
linePath.lineTo(x, y + h);
linePath.closePath();
}
default -> throw new PDFNetException("Invalid Element Type", 0, "", "", "");
default -> throw new IllegalArgumentException("Invalid Operator Type " + operator);
}
}
return linePath;
}
public GeneralPath convertToGeneralPathAndTransformToInitialUserSpace(PathData pathData, Matrix2D ctm) throws PDFNetException{
/**
 * Converts a {@link GeneralPath} into a PDFTron {@link PathData} by translating every path
 * segment into the matching PathData operator and its coordinates.
 *
 * @param linePath path to convert; iterated without an additional transform
 * @return the PathData holding the collected operators and points
 * @throws IllegalArgumentException if the path iterator reports an unknown segment type
 */
public PathData convertToPathData(GeneralPath linePath) {

    PathIterator pathIterator = linePath.getPathIterator(null);
    List<Byte> operators = new LinkedList<>();
    List<Double> points = new LinkedList<>();

    // currentSegment overwrites this buffer on every call and the values are boxed when added
    // to the list, so a single buffer can be reused for all segments.
    double[] currentPoints = new double[6];

    while (!pathIterator.isDone()) {
        int type = pathIterator.currentSegment(currentPoints);

        // Record the operator and determine how many coordinates of the buffer belong to it.
        int coordinateCount = switch (type) {
            case PathIterator.SEG_MOVETO -> {
                operators.add((byte) PathData.e_moveto);
                yield 2;
            }
            case PathIterator.SEG_LINETO -> {
                operators.add((byte) PathData.e_lineto);
                yield 2;
            }
            case PathIterator.SEG_QUADTO -> {
                operators.add((byte) PathData.e_conicto);
                yield 4;
            }
            case PathIterator.SEG_CUBICTO -> {
                operators.add((byte) PathData.e_cubicto);
                yield 6;
            }
            case PathIterator.SEG_CLOSE -> {
                operators.add((byte) PathData.e_closepath);
                yield 0;
            }
            default -> throw new IllegalArgumentException("Invalid Segment Type " + type);
        };

        for (int i = 0; i < coordinateCount; i++) {
            points.add(currentPoints[i]);
        }

        // Advance to the next segment; without this isDone() never becomes true.
        pathIterator.next();
    }

    // Copy via iteration instead of indexed access, which is quadratic on a LinkedList.
    return new PathData(true, Bytes.toArray(operators), Doubles.toArray(points));
}
public GeneralPath convertToGeneralPathAndTransformToInitialUserSpace(PathData pathData, Matrix2D ctm) throws PDFNetException {
GeneralPath linePath = Converter.convertToGeneralPath(pathData);
//transform path to initial user space
@ -59,13 +117,13 @@ public class Converter {
return linePath;
}
@SneakyThrows
public static Color convertColor(ColorSpace colorSpace, ColorPt colorPt) {
ColorPt rgbColor = colorSpace.convert2RGB(colorPt);
Color color = new Color((float) rgbColor.get(0), (float) rgbColor.get(1), (float) rgbColor.get(2));
rgbColor.destroy();
return color;
try (ColorPt rgbColor = colorSpace.convert2RGB(colorPt)) {
return new Color((float) rgbColor.get(0), (float) rgbColor.get(1), (float) rgbColor.get(2));
}
}
@ -75,4 +133,15 @@ public class Converter {
return new Rectangle2D.Double(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
}
/**
 * Converts a PDFTron {@link Matrix2D} into a {@link AffineTransform}.
 *
 * @param textMatrix matrix to convert, may be {@code null}
 * @return the equivalent transform, or {@code null} if {@code textMatrix} is {@code null}
 */
@SneakyThrows
public static AffineTransform toAffineTransform(Matrix2D textMatrix) {

    if (textMatrix == null) {
        return null;
    }
    // AffineTransform takes (m00, m10, m01, m11, m02, m12): m02 is the x translation and m12 the
    // y translation, so the horizontal component (H) must come before the vertical one (V).
    return new AffineTransform(textMatrix.getA(), textMatrix.getB(), textMatrix.getC(), textMatrix.getD(), textMatrix.getH(), textMatrix.getV());
}
}

View File

@ -1,80 +0,0 @@
package com.iqser.red.pdftronlogic.commons;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.Element;
/**
 * Builds {@link ElementFeatures} snapshots from PDFTron {@link Element}s.
 */
public class ElementFeatureFactory {

    /**
     * Extracts the features of a path, text, image, inline image or form element.
     *
     * @param element element to describe
     * @return the features matching the element's type
     * @throws PDFNetException  if reading from the element fails
     * @throws RuntimeException if the element has any other type
     */
    public static ElementFeatures extractFeatures(Element element) throws PDFNetException {

        int type = element.getType();
        switch (type) {
            case Element.e_path:
                return buildPath(element);
            case Element.e_text:
                return buildText(element);
            case Element.e_image:
            case Element.e_inline_image:
                return buildImage(element).build();
            case Element.e_form:
                return buildForm(element);
            default:
                // Safety net; callers are expected to pass supported element types only.
                throw new RuntimeException("Feature Extraction is not supported for PDFTron.Element with type: " + type);
        }
    }


    /**
     * Extracts image features and attaches the given image hash. The element type is not
     * checked here; the element is always read as an image.
     *
     * @param element    image element to describe
     * @param hashObject hash stored as {@code hashOfImage}
     * @return the image features including the hash
     * @throws PDFNetException if reading from the element fails
     */
    public static ElementFeatures extractFeaturesWithHash(Element element, String hashObject) throws PDFNetException {

        var imageBuilder = buildImage(element);
        imageBuilder.hashOfImage(hashObject);
        return imageBuilder.build();
    }


    // Form features: type, bounding box, XObject type and the decoded stream size.
    private static ElementFeatures.Form buildForm(Element element) throws PDFNetException {

        var xObject = element.getXObject();
        int xObjectType = xObject.getType();
        // NOTE(review): 7 is presumably the PDFTron object type for streams - confirm against Obj constants.
        long length = xObjectType == 7 ? xObject.getDecodedStream().size() : 0;

        return ElementFeatures.Form.builder()
                .elementType(element.getType())
                .boundingBox(Converter.toRectangle2D(element.getBBox()))
                .xObjectType(xObjectType)
                .dictOrArrayOrStreamLength(length)
                .build();
    }


    // Image features without a hash; returned as builder so that callers may add one.
    private static ElementFeatures.Image.ImageBuilder<?, ?> buildImage(Element element) throws PDFNetException {

        return ElementFeatures.Image.builder()
                .elementType(element.getType())
                .boundingBox(Converter.toRectangle2D(element.getBBox()))
                .dataSize(element.getImageDataSize())
                .height(element.getImageHeight())
                .width(element.getImageWidth())
                .renderingIntent(element.getImageRenderingIntent())
                .componentNum(element.getComponentNum())
                .bitsPerComponent(element.getBitsPerComponent());
    }


    // Text features: type, bounding box, text string, font type and font size.
    private static ElementFeatures.Text buildText(Element element) throws PDFNetException {

        var gState = element.getGState();

        return ElementFeatures.Text.builder()
                .elementType(element.getType())
                .boundingBox(Converter.toRectangle2D(element.getBBox()))
                .text(element.getTextString())
                .font(gState.getFont().getType())
                .fontsize(gState.getFontSize())
                .build();
    }


    // Path features: painting flags, fill/stroke colours and the path geometry transformed by the CTM.
    private static ElementFeatures.Path buildPath(Element element) throws PDFNetException {

        var gState = element.getGState();

        return ElementFeatures.Path.builder()
                .elementType(element.getType())
                .boundingBox(Converter.toRectangle2D(element.getBBox()))
                .isClippingPath(element.isClippingPath())
                .isClipWindingFill(element.isClipWindingFill())
                .isStroked(element.isStroked())
                .isFilled(element.isFilled())
                .isWindingFill(element.isWindingFill())
                .fillColor(Converter.convertColor(gState.getFillColorSpace(), gState.getFillColor()))
                .strokeColor(Converter.convertColor(gState.getStrokeColorSpace(), gState.getStrokeColor()))
                .linePath(Converter.convertToGeneralPathAndTransformToInitialUserSpace(element.getPathData(), element.getCTM()))
                .build();
    }

}

View File

@ -1,270 +0,0 @@
package com.iqser.red.pdftronlogic.commons;
import static com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService.TOLERANCE;
import java.awt.Color;
import java.awt.geom.GeneralPath;
import java.awt.geom.Rectangle2D;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.Rect;
import lombok.AccessLevel;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
/**
 * Comparable description of a PDFTron {@link Element}: its type and bounding box. The nested
 * subclasses add the properties specific to text, path, image and form elements.
 * <p>
 * NOTE(review): the subclasses use {@code @EqualsAndHashCode(callSuper = true)} while this base
 * class declares no equals/hashCode of its own - confirm the resulting equality is intended.
 */
@Getter
@SuperBuilder
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ElementFeatures {

    // x and y may differ by less than this fraction of the compared rectangle's width/height
    private static final double RECT_POSITION_SIMILARITY_THRESHOLD_FACTOR = 0.2;
    // width and height may differ by less than this fraction of the compared rectangle's size
    private static final double RECT_SIZE_SIMILARITY_THRESHOLD_FACTOR = 0.1;
    // maximum number of differing hash characters for two image hashes to count as similar
    private static final double HAMMING_DISTANCE_THRESHOLD = 4;

    int elementType;
    Rectangle2D boundingBox;


    /**
     * Checks whether the element has the same type and an almost equal bounding box.
     *
     * @param element element to compare against
     * @return {@code true} if type and bounding box match within {@code TOLERANCE}
     * @throws PDFNetException if reading from the element fails
     */
    public boolean almostMatches(Element element) throws PDFNetException {

        return element.getType() == elementType && //
               element.getBBox() != null && //
               rectsAlmostMatch(element.getBBox());
    }


    @SneakyThrows
    private boolean rectsAlmostMatch(Rect bBox) {
        // To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance
        return almostEqual(bBox.getX1(), boundingBox.getX()) && //
               almostEqual(bBox.getY1(), boundingBox.getY()) && //
               almostEqual(bBox.getWidth(), boundingBox.getWidth()) && //
               almostEqual(bBox.getHeight(), boundingBox.getHeight());
    }


    /** Returns {@code true} if both values differ by less than {@code TOLERANCE}. */
    protected boolean almostEqual(double a, double b) {

        return Math.abs(a - b) < TOLERANCE;
    }


    /**
     * Checks whether the other features have the same type and an almost equal bounding box.
     *
     * @param elementFeatures features to compare against
     * @return {@code true} if type and bounding box match within {@code TOLERANCE}
     */
    public boolean almostMatches(ElementFeatures elementFeatures) {

        return elementFeatures.getElementType() == elementType && elementFeatures.getBoundingBox() != null && rectsAlmostMatch(elementFeatures.getBoundingBox());
    }


    @SneakyThrows
    private boolean rectsAlmostMatch(Rectangle2D bBox) {
        // To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance
        return almostEqual(bBox.getX(), boundingBox.getX()) && //
               almostEqual(bBox.getY(), boundingBox.getY()) && //
               almostEqual(bBox.getWidth(), boundingBox.getWidth()) && //
               almostEqual(bBox.getHeight(), boundingBox.getHeight());
    }


    /**
     * Looser variant of {@link #almostMatches(ElementFeatures)}: position and size may differ
     * by a fraction of the compared bounding box instead of the fixed tolerance.
     *
     * @param elementFeatures features to compare against
     * @return {@code true} if the type matches and the bounding boxes are similar
     */
    public boolean isSimilarTo(ElementFeatures elementFeatures) {

        return elementFeatures.getElementType() == elementType && elementFeatures.getBoundingBox() != null && areRectsSimilar(elementFeatures.getBoundingBox());
    }


    private boolean areRectsSimilar(Rectangle2D rectangle2D) {
        // Position thresholds are relative to the size of the rectangle passed in, size thresholds to its width/height
        return isPositionSimilar(rectangle2D.getX(), boundingBox.getX(), rectangle2D.getWidth()) && //
               isPositionSimilar(rectangle2D.getY(), boundingBox.getY(), rectangle2D.getHeight()) && //
               isSizeSimilar(rectangle2D.getWidth(), boundingBox.getWidth()) && //
               isSizeSimilar(rectangle2D.getHeight(), boundingBox.getHeight());
    }


    /** Returns {@code true} if the coordinates differ by less than the position factor times {@code boxSize}. */
    protected boolean isPositionSimilar(double a, double b, double boxSize) {

        return Math.abs(a - b) < boxSize * RECT_POSITION_SIMILARITY_THRESHOLD_FACTOR;
    }


    /** Returns {@code true} if the sizes differ by less than the size factor times {@code a}; not symmetric in its arguments. */
    protected boolean isSizeSimilar(double a, double b) {

        return Math.abs(a - b) < a * RECT_SIZE_SIMILARITY_THRESHOLD_FACTOR;
    }


    /** Features of a text element: text string, font type and font size. */
    @EqualsAndHashCode(callSuper = true)
    @Getter
    @SuperBuilder
    @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
    public static class Text extends ElementFeatures {

        String text;
        int font;
        double fontsize;


        @Override
        public boolean almostMatches(Element element) throws PDFNetException {

            return super.almostMatches(element) && //
                   text.equals(element.getTextString()) && //
                   font == element.getGState().getFont().getType() && //
                   almostEqual(fontsize, element.getGState().getFontSize());
        }

    }

    /** Features of a path element: painting flags, colours and the path geometry. */
    @EqualsAndHashCode(callSuper = true)
    @Getter
    @SuperBuilder
    @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
    public static class Path extends ElementFeatures {

        boolean isClippingPath;
        boolean isClipWindingFill;
        boolean isStroked;
        boolean isFilled;
        boolean isWindingFill;
        Color strokeColor;
        Color fillColor;
        GeneralPath linePath;


        // Colours and geometry are not part of this comparison, only the painting flags.
        @Override
        public boolean almostMatches(Element element) throws PDFNetException {

            return super.almostMatches(element) && //
                   isClippingPath == element.isClippingPath() && //
                   isClipWindingFill == element.isClipWindingFill() && //
                   isStroked == element.isStroked() && //
                   isFilled == element.isFilled() && //
                   isWindingFill == element.isWindingFill();
        }


        /** Returns {@code true} if {@code color} equals the stored fill colour. */
        public boolean matchesFillColor(Color color) {

            return color.equals(fillColor);
        }


        /** Returns {@code true} if the path is filled and its bounding box intersects {@code area}. */
        @SneakyThrows
        public boolean isBackground(Rect area) {

            return isFilled && //
                   getBoundingBox().intersects(area.getX1(), area.getY1(), area.getWidth(), area.getHeight());
        }

    }

    /**
     * Features of an image element: raw image properties plus an optional hash of the image.
     * NOTE(review): the hash-based comparisons dereference both hashes and therefore fail with a
     * NullPointerException when either side was built without {@code hashOfImage} - confirm that
     * callers only compare hashed instances.
     */
    @EqualsAndHashCode(callSuper = true)
    @Getter
    @SuperBuilder
    @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
    public static class Image extends ElementFeatures {

        int dataSize;
        int height;
        int width;
        int renderingIntent;
        int componentNum;
        int bitsPerComponent;
        String hashOfImage;


        @Override
        public boolean almostMatches(Element element) throws PDFNetException {

            return super.almostMatches(element) && //
                   dataSize == element.getImageDataSize() && //
                   height == element.getImageHeight() && //
                   width == element.getImageWidth() && //
                   renderingIntent == element.getImageRenderingIntent() && //
                   componentNum == element.getComponentNum() && //
                   bitsPerComponent == element.getBitsPerComponent();
        }


        /**
         * Matches only other {@link Image} features with identical image properties, an almost
         * equal bounding box and a hash within the Hamming distance threshold.
         */
        @Override
        public boolean almostMatches(ElementFeatures elementFeatures) {

            if (elementFeatures.getClass() != this.getClass()) {
                return false;
            }
            Image other = (Image) elementFeatures;

            return super.almostMatches(elementFeatures) && //
                   this.dataSize == other.getDataSize() && //
                   this.height == other.getHeight() && //
                   this.width == other.getWidth() && //
                   this.renderingIntent == other.getRenderingIntent() && //
                   this.componentNum == other.getComponentNum() && //
                   this.bitsPerComponent == other.getBitsPerComponent() && //
                   calculateHammingDistance(other.getHashOfImage()) <= HAMMING_DISTANCE_THRESHOLD;
        }


        /**
         * Similar bounding box plus a hash within the Hamming distance threshold. Features that
         * are not {@link Image} features are never similar; they carry no image hash to compare.
         */
        @Override
        public boolean isSimilarTo(ElementFeatures elementFeatures) {

            // Guard the cast: any non-image features would otherwise cause a ClassCastException.
            if (!(elementFeatures instanceof Image other)) {
                return false;
            }
            return super.isSimilarTo(elementFeatures) && //
                   calculateHammingDistance(other.getHashOfImage()) <= HAMMING_DISTANCE_THRESHOLD;
        }


        // Counts the positions at which the two hash strings differ character by character;
        // the shorter string is treated as if padded with '0' at the end.
        private int calculateHammingDistance(String hash2) {

            int distance = 0;
            int maxLength = Math.max(this.hashOfImage.length(), hash2.length());

            for (int i = 0; i < maxLength; i++) {
                char char1 = i < this.hashOfImage.length() ? this.hashOfImage.charAt(i) : '0';
                char char2 = i < hash2.length() ? hash2.charAt(i) : '0';
                if (char1 != char2) {
                    distance++;
                }
            }
            return distance;
        }

    }

    /** Features of a form element: XObject type and the length recorded for it. */
    @EqualsAndHashCode(callSuper = true)
    @Getter
    @SuperBuilder
    @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
    public static class Form extends ElementFeatures {

        int xObjectType;
        long dictOrArrayOrStreamLength;


        // A form also matches when its bounding box has width and height swapped (see almostRotateMatches).
        @Override
        public boolean almostMatches(Element element) throws PDFNetException {

            return element.getType() == getElementType() && //
                   element.getBBox() != null && //
                   (super.rectsAlmostMatch(element.getBBox()) || almostRotateMatches(element.getBBox().getRectangle())) && xObjectType == element.getXObject()
                    .getType() && dictOrArrayOrStreamLength == element.getXObject().getDecodedStream().size();
        }


        @Override
        public boolean almostMatches(ElementFeatures elementFeatures) {

            if (elementFeatures.getClass() != this.getClass()) {
                return false;
            }

            return elementFeatures.getElementType() == getElementType() && elementFeatures.getBoundingBox() != null && (super.rectsAlmostMatch(elementFeatures.getBoundingBox()) || almostRotateMatches(
                    elementFeatures.getBoundingBox()
                            .getBounds2D())) && xObjectType == ((Form) elementFeatures).getXObjectType() && dictOrArrayOrStreamLength == ((Form) elementFeatures).getDictOrArrayOrStreamLength();
        }


        // Only the dimensions are compared here, with width and height swapped; the position is ignored.
        private boolean almostRotateMatches(Rectangle2D bBox) {

            return almostEqual(bBox.getWidth(), getBoundingBox().getHeight()) && //
                   almostEqual(bBox.getHeight(), getBoundingBox().getWidth());
        }

    }

}

View File

@ -35,17 +35,16 @@ public class ImageHashFactory {
@SneakyThrows
private byte[] getBytesOfImage(com.pdftron.pdf.Image inputImage) {
// 0 because the memory filter determines the size
var memFilter = new MemoryFilter(0, false);
var filterWriter = new FilterWriter(memFilter);
try(var memFilter = new MemoryFilter(0, false);
var filterWriter = new FilterWriter(memFilter)) {
inputImage.export(filterWriter);
filterWriter.flushAll();
byte[] res = memFilter.getBuffer();
inputImage.export(filterWriter);
filterWriter.flushAll();
byte[] res = memFilter.getBuffer();
memFilter.flushAll();
memFilter.destroy();
filterWriter.destroy();
return res;
memFilter.flushAll();
return res;
}
}

View File

@ -1,23 +1,29 @@
package com.iqser.red.pdftronlogic.commons;
import static com.iqser.red.pdftronlogic.commons.VisualizationUtils.drawFeature;
import static com.iqser.red.pdftronlogic.commons.VisualizationUtils.drawRect;
import java.awt.Color;
import java.awt.Shape;
import java.awt.geom.Area;
import java.awt.geom.GeneralPath;
import java.awt.geom.Rectangle2D;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.TreeSet;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatureFactory;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
import com.iqser.red.pdftronlogic.commons.features.ImageFeatures;
import com.iqser.red.pdftronlogic.commons.features.PathFeatures;
import com.iqser.red.pdftronlogic.commons.lookup.ElementFeatureLookup;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.ColorPt;
import com.pdftron.pdf.ColorSpace;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementBuilder;
import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.GState;
@ -37,6 +43,7 @@ import lombok.extern.slf4j.Slf4j;
public class InvisibleElementRemovalService {
static public final double TOLERANCE = 1;
public static final String KNECON_OCR = "KNECON_OCR";
/**
@ -62,15 +69,12 @@ public class InvisibleElementRemovalService {
PDFDoc pdfDoc = new PDFDoc(pdfFile);
execute(pdfDoc, delta, removePaths, markedContentToIgnore);
try {
try (pdfDoc) {
execute(pdfDoc, delta, removePaths, markedContentToIgnore);
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
} catch (Exception e) {
log.error("File could not be saved after invisible element removal");
throw new RuntimeException(e);
} finally {
pdfDoc.close();
}
}
@ -86,12 +90,22 @@ public class InvisibleElementRemovalService {
}
/**
* This method is equal to {@link #removeInvisibleElements(InputStream, OutputStream, boolean, boolean, Set)}, with removePaths == true and markedContentsToIgnore = Set.of("KNECON_OCR").
*/
public void removeInvisibleElementsButKeepOcrText(InputStream pdfFile, OutputStream out, boolean delta) {
removeInvisibleElements(pdfFile, out, delta, true, Set.of("KNECON_OCR"));
removeInvisibleElements(pdfFile, out, delta, true, Set.of(KNECON_OCR));
}
/**
* This method is equal to {@link #removeInvisibleElements(PDFDoc, boolean, boolean, Set)}, with removePaths == true and markedContentsToIgnore = Set.of(KNECON_OCR).
* In contrast to the stream based variant it takes an already opened {@link PDFDoc} instead of an input and an output stream.
*/
public void removeInvisibleElementsButKeepOcrText(PDFDoc pdfFile, boolean delta) {
removeInvisibleElements(pdfFile, delta, true, Set.of(KNECON_OCR));
}
@ -140,39 +154,37 @@ public class InvisibleElementRemovalService {
private void execute(PDFDoc pdfDoc, boolean delta, boolean removePaths, Set<String> markedContentToIgnore) {
log.info("Start removing invisible Elements");
ElementWriter writer = new ElementWriter();
ElementReader reader = new ElementReader();
Set<Long> visitedXObjIds = new TreeSet<>();
try (PageIterator iterator = pdfDoc.getPageIterator(); ElementWriter writer = new ElementWriter(); ElementReader reader = new ElementReader()) {
Set<Long> visitedXObjIds = new TreeSet<>();
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
while (iterator.hasNext()) {
Page page = iterator.next();
Page page = iterator.next();
visitedXObjIds.add(page.getSDFObj().getObjNum());
visitedXObjIds.add(page.getSDFObj().getObjNum());
InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder()
.reader(reader)
.clippingPathStack(new ClippingPathStack(page.getMediaBox()))
.markedContentStack(new MarkedContentStack())
.removePaths(removePaths)
.delta(delta)
.overlappedElements(new ArrayList<>())
.visibleElements(new ArrayList<>())
.visitedXObjIds(visitedXObjIds)
.markedContentToIgnore(markedContentToIgnore)
.build();
try (InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder()
.reader(reader)
.clippingPathStack(new ClippingPathStack(page.getMediaBox()))
.markedContentStack(new MarkedContentStack(pdfDoc))
.removePaths(removePaths)
.delta(delta)
.overlappedElements(new ElementFeatureLookup())
.visibleElements(new ElementFeatureLookup())
.visitedXObjIds(visitedXObjIds)
.markedContentToIgnore(markedContentToIgnore)
.build()) {
removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context);
removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context);
context.visitedXObjIds().clear();
context.markedContentStack().clear();
context.visitedXObjIds().clear();
context.markedContentStack().clear();
removeOverlappedElements(page, writer, context);
removeOverlappedElements(page, writer, context);
}
}
}
writer.destroy();
reader.destroy();
log.info("Finished removing invisible Elements");
}
@ -228,68 +240,78 @@ public class InvisibleElementRemovalService {
private void processImages(Element imageElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
Rect rect = imageElement.getBBox();
try (Rect rect = imageElement.getBBox()) {
if (rect == null) {
return;
}
if (rect == null) {
return;
}
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
if (inClippingPath) {
ImageFeatures imageFeatures = ElementFeatureFactory.buildImage(imageElement);
if (!(context.markedContentStack.contextHasTransparency() || imageFeatures.isTransparent() || imageFeatures.isImageMask() || imageFeatures.isSoftMask())) {
calculateOverlaps(context, imageFeatures, imageFeatures.isMasked());
}
context.visibleElements().add(imageFeatures);
}
if (!context.delta() && inClippingPath) {
context.visibleElements().add(ElementFeatureFactory.extractFeatures(imageElement));
}
if (context.delta() ^ inClippingPath) {
writer.writeElement(imageElement);
if (context.delta() ^ inClippingPath) {
writer.writeElement(imageElement);
}
}
}
private void processText(Element textElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
Rect textBBox = textElement.getBBox();
try (Rect textBBox = textElement.getBBox()) {
if (textBBox == null) {
writer.writeElement(textElement);
return;
}
GState gState = textElement.getGState();
boolean inClippingPath = context.clippingPathStack().almostIntersects(textBBox.getX1(), textBBox.getY1(), textBBox.getWidth(), textBBox.getHeight());
boolean isTextVisible = isTextRenderedVisibly(gState, textBBox, context);
if (inClippingPath && isTextVisible) {
context.visibleElements().add(ElementFeatureFactory.extractFeatures(textElement));
}
if (!context.delta()) {
if (inClippingPath && isTextVisible) {
if (textBBox == null) {
writer.writeElement(textElement);
} else if (textElement.hasTextMatrix()) {
return;
}
GState gState = textElement.getGState();
boolean inClippingPath = context.clippingPathStack().almostIntersects(textBBox.getX1(), textBBox.getY1(), textBBox.getWidth(), textBBox.getHeight());
boolean isTextVisible = isTextRenderedVisibly(gState, textBBox, context);
if (inClippingPath && isTextVisible) {
context.visibleElements().add(ElementFeatureFactory.buildText(textElement, true, context.delta()));
}
if (!context.delta()) {
if (inClippingPath && isTextVisible) {
writer.writeElement(textElement);
} else if (textElement.hasTextMatrix()) {
/*
PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element.
hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands.
Therefore, the position of a following Tj is affected by not writing the first Element.
This is why, we write only the Tm command:
*/
writer.writeGStateChanges(textElement);
}
} else {
if (!inClippingPath) {
gState.setFillColorSpace(ColorSpace.createDeviceRGB());
// red for elements removed by clipping path
gState.setFillColor(new ColorPt(1, 0, 0));
writer.writeElement(textElement);
}
if (!isTextVisible) {
gState.setFillColorSpace(ColorSpace.createDeviceRGB());
// blue for elements removed due to transparency or not rendered or same color as background
gState.setFillColor(new ColorPt(0, 0, 1));
gState.setTextRenderMode(GState.e_fill_text);
gState.setFillOpacity(1);
writer.writeElement(textElement);
textElement.setTextData(new byte[]{});
writer.writeElement(textElement);
}
} else {
if (!inClippingPath) {
gState.setFillColorSpace(ColorSpace.createDeviceRGB());
// red for elements removed by clipping path
try (var color = new ColorPt(1, 0, 0)) {
gState.setFillColor(color);
}
writer.writeElement(textElement);
}
if (!isTextVisible) {
gState.setFillColorSpace(ColorSpace.createDeviceRGB());
// blue for elements removed due to transparency or not rendered or same color as background
try (var color = new ColorPt(0, 0, 1)) {
gState.setFillColor(color);
}
gState.setTextRenderMode(GState.e_fill_text);
gState.setFillOpacity(1);
writer.writeElement(textElement);
}
}
}
}
@ -302,22 +324,27 @@ public class InvisibleElementRemovalService {
if (!context.visitedXObjIds().contains(formObj.getObjNum())) {
context.visitedXObjIds().add(formObj.getObjNum());
// writer needs to be newly initialized when entering a new content stream
// see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
ElementWriter formWriter = new ElementWriter();
context.clippingPathStack().enterNewGState();
context.clippingPathStack().intersectClippingPath(new GeneralPath(Converter.toRectangle2D(formElement.getBBox())));
context.reader().formBegin();
formWriter.begin(formObj);
try (ElementWriter formWriter = new ElementWriter()) {
context.markedContentStack.enterForm(formElement);
context.clippingPathStack().enterNewGState();
try (var formElementBBOX = formElement.getBBox()) {
context.clippingPathStack().intersectClippingPath(Converter.toRectangle2D(formElementBBOX));
context.reader().formBegin();
formWriter.begin(formObj);
context.reader().clearChangeList();
formWriter.setDefaultGState(context.reader());
context.reader().clearChangeList();
formWriter.setDefaultGState(context.reader());
processElements(formWriter, context);
formWriter.end();
formWriter.destroy();
context.reader().end();
context.clippingPathStack().leaveGState();
processElements(formWriter, context);
formWriter.end();
context.reader().end();
context.clippingPathStack().leaveGState();
context.markedContentStack.leaveForm();
}
}
}
}
@ -326,60 +353,65 @@ public class InvisibleElementRemovalService {
PathData pathData = pathElement.getPathData();
if (pathData.getOperators().length == 0 && pathData.getPoints().length == 0 || pathElement.getBBox() == null) {
writer.writeElement(pathElement);
return;
}
GeneralPath linePath = Converter.convertToGeneralPathAndTransformToInitialUserSpace(pathData, pathElement.getCTM());
var rect = linePath.getBounds2D();
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX(), rect.getY(), rect.getWidth(), rect.getHeight());
if (pathElement.isClippingPath()) {
if (pathElement.isClipWindingFill()) {
linePath.setWindingRule(GeneralPath.WIND_NON_ZERO);
} else {
linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD);
try (var bbox = pathElement.getBBox()) {
if (pathData.getOperators().length == 0 && pathData.getPoints().length == 0 || bbox == null) {
writer.writeElement(pathElement);
return;
}
context.clippingPathStack().intersectClippingPath(linePath);
pathElement.setPathClip(!context.delta());
writer.writeElement(pathElement);
PathFeatures pathFeatures = ElementFeatureFactory.buildPath(pathElement);
GeneralPath linePath = pathFeatures.getLinePath();
} else {
if (pathElement.isWindingFill()) {
linePath.setWindingRule(GeneralPath.WIND_NON_ZERO);
} else {
linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD);
}
var rect = linePath.getBounds2D();
if (inClippingPath) {
if (isFilledAndNonTransparent(pathElement)) {
calculateOverlapsForLinePath(context, linePath);
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX(), rect.getY(), rect.getWidth(), rect.getHeight());
if (pathElement.isClippingPath()) {
if (pathElement.isClipWindingFill()) {
linePath.setWindingRule(GeneralPath.WIND_NON_ZERO);
} else {
linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD);
}
context.visibleElements().add(ElementFeatureFactory.extractFeatures(pathElement));
}
if (!context.delta() && (inClippingPath || !context.removePaths())) {
context.clippingPathStack().intersectClippingPath(linePath);
pathElement.setPathClip(!context.delta());
writer.writeElement(pathElement);
}
if (context.delta() && !inClippingPath && context.removePaths()) {
pathElement.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
pathElement.getGState().setFillColor(new ColorPt(1, 0, 0));
pathElement.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
pathElement.getGState().setStrokeColor(new ColorPt(1, 0, 0));
writer.writeElement(pathElement);
} else {
if (pathElement.isWindingFill()) {
linePath.setWindingRule(GeneralPath.WIND_NON_ZERO);
} else {
linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD);
}
if (inClippingPath) {
if (!context.markedContentStack.contextHasTransparency() && isFilledAndNonTransparent(pathElement)) {
calculateOverlaps(context, pathFeatures, false);
}
context.visibleElements().add(ElementFeatureFactory.extractFeatures(pathElement));
}
if (!context.delta() && (inClippingPath || !context.removePaths())) {
writer.writeElement(pathElement);
}
if (context.delta() && !inClippingPath && context.removePaths()) {
try (var color = new ColorPt(1, 0, 0)) {
pathElement.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
pathElement.getGState().setFillColor(color);
pathElement.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
pathElement.getGState().setStrokeColor(color);
writer.writeElement(pathElement);
}
}
}
}
}
private void calculateOverlapsForLinePath(InvisibleElementRemovalContext context, GeneralPath linePath) {
private void calculateOverlaps(InvisibleElementRemovalContext context, ElementFeatures elementFeatures, boolean textOnly) {
List<ElementFeatures> currentOverlappedElements = context.visibleElements().stream().filter(features -> almostContains(linePath, features.getBoundingBox())).toList();
List<ElementFeatures> currentOverlappedElements = context.visibleElements().findOverlapped(elementFeatures, textOnly);
context.overlappedElements().addAll(currentOverlappedElements);
context.visibleElements().removeAll(currentOverlappedElements);
}
@ -391,7 +423,8 @@ public class InvisibleElementRemovalService {
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
if (context.delta()) {
// green for element removed due to overlapping
context.overlappedElements().forEach(feature -> drawBBox(writer, feature.getBoundingBox(), "#00FF00"));
context.overlappedElements()
.forEach(feature -> drawFeature(writer, feature, Color.GREEN));
context.overlappedElements().clear();
}
processOverlappedElements(writer, context);
@ -399,7 +432,7 @@ public class InvisibleElementRemovalService {
context.reader().end();
if (!context.overlappedElements().isEmpty()) {
log.warn(context.overlappedElements().size() + " overlapped elements have not been found or removed");
log.debug(context.overlappedElements().size() + " overlapped elements have not been found or removed");
}
}
@ -439,24 +472,30 @@ public class InvisibleElementRemovalService {
private static void removeOverlappedElement(ElementWriter writer, InvisibleElementRemovalContext context, Element element) throws PDFNetException {
boolean anyMatch = false;
for (ElementFeatures elementToRemove : context.overlappedElements()) {
if (elementToRemove.almostMatches(element)) {
context.overlappedElements().remove(elementToRemove);
anyMatch = true;
break;
try (Rect bbox = element.getBBox()) {
if (bbox == null) {
writer.writeElement(element);
return;
}
}
if (!anyMatch) {
Optional<ElementFeatures> optionalElementMatch = context.overlappedElements()
.anyMatch(ElementFeatureFactory.extractFeatures(element));
if (optionalElementMatch.isPresent()) {
context.overlappedElements().remove(optionalElementMatch.get());
if (element.getType() == 3 && element.hasTextMatrix()) {
/*
PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element.
hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands.
Therefore, the position of a following Tj is affected by not writing the first Element.
This is why, we write only the Tm command:
*/
element.setTextData(new byte[]{});
writer.writeElement(element);
}
} else {
writer.writeElement(element);
} else if (element.getType() == 3 && element.hasTextMatrix()) {
/*
PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element.
hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands.
Therefore, the position of a following Tj is affected by not writing the first Element.
This is why, we write only the Tm command:
*/
writer.writeGStateChanges(element);
}
}
@ -470,17 +509,17 @@ public class InvisibleElementRemovalService {
context.visitedXObjIds().add(formObj.getObjNum());
// writer needs to be newly initialized when entering a new content stream
// see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
ElementWriter formWriter = new ElementWriter();
context.reader().formBegin();
formWriter.begin(formObj);
try (ElementWriter formWriter = new ElementWriter()) {
context.reader().formBegin();
formWriter.begin(formObj);
context.reader().clearChangeList();
formWriter.setDefaultGState(context.reader());
context.reader().clearChangeList();
formWriter.setDefaultGState(context.reader());
processOverlappedElements(formWriter, context);
formWriter.end();
formWriter.destroy();
context.reader().end();
processOverlappedElements(formWriter, context);
formWriter.end();
context.reader().end();
}
}
}
@ -499,93 +538,71 @@ public class InvisibleElementRemovalService {
private boolean strokeIsVisible(GState gState, Rect textBBox, InvisibleElementRemovalContext context) throws PDFNetException {
return gState.getStrokeOpacity() != 0 && differentColorThanBackgroundColor(Converter.convertColor(gState.getStrokeColorSpace(), gState.getStrokeColor()),
textBBox,
context);
textBBox,
context);
}
private boolean fillIsVisible(GState gState, Rect textBBox, InvisibleElementRemovalContext context) throws PDFNetException {
return gState.getFillOpacity() != 0 && differentColorThanBackgroundColor(Converter.convertColor(gState.getFillColorSpace(), gState.getFillColor()), textBBox, context);
try (var color = gState.getFillColor()) {
return gState.getFillOpacity() != 0 && differentColorThanBackgroundColor(Converter.convertColor(gState.getFillColorSpace(), color), textBBox, context);
}
}
@SneakyThrows
private boolean differentColorThanBackgroundColor(Color fillColor, Rect textBBox, InvisibleElementRemovalContext context) {
List<ElementFeatures.Path> backgroundElements = findVisiblePathElementsThatIntersect(textBBox, context);
List<PathFeatures> backgroundElements = findVisiblePathElementsThatIntersect(textBBox, context);
if (backgroundElements.isEmpty()) {
return !fillColor.equals(Color.WHITE);
}
List<ElementFeatures.Path> pathElementsByColor = backgroundElements.stream().filter(path -> path.getFillColor().equals(fillColor)).toList();
List<PathFeatures> pathElementsByColor = backgroundElements.stream()
.filter(path -> path.getFillColor().equals(fillColor))
.toList();
if (pathElementsByColor.isEmpty()) {
return true;
}
Area backgroundArea = mergeLinePathsToArea(pathElementsByColor);
return !almostContains(backgroundArea, Converter.toRectangle2D(textBBox));
return !ComparisonUtils.almostContains(backgroundArea, Converter.toRectangle2D(textBBox));
}
private static List<ElementFeatures.Path> findVisiblePathElementsThatIntersect(Rect textBBox, InvisibleElementRemovalContext context) {
private static List<PathFeatures> findVisiblePathElementsThatIntersect(Rect textBBox, InvisibleElementRemovalContext context) {
var result = new ArrayList<PathFeatures>();
context.visibleElements().findIntersecting(textBBox)
.forEach(element -> {
if (element instanceof PathFeatures pathFeatures && !pathFeatures.getFillColor().equals(Color.WHITE) && pathFeatures.isFilled()) {
result.add(pathFeatures);
}
});
return result;
return context.visibleElements()
.stream()
.filter(elementFeatures -> elementFeatures.getElementType() == Element.e_path)
.map(elementFeatures -> (ElementFeatures.Path) elementFeatures)
.filter(elementFeatures -> !elementFeatures.getFillColor().equals(Color.WHITE))
.filter(element -> element.isBackground(textBBox))
.toList();
}
private static Area mergeLinePathsToArea(List<ElementFeatures.Path> pathElementsWithSameColor) {
private static Area mergeLinePathsToArea(List<PathFeatures> pathElementsWithSameColor) {
Area backgroundArea = new Area();
pathElementsWithSameColor.stream().map(ElementFeatures.Path::getLinePath).map(Area::new).forEach(backgroundArea::add);
pathElementsWithSameColor.stream()
.map(PathFeatures::getLinePath)
.map(Area::new)
.forEach(backgroundArea::add);
return backgroundArea;
}
private boolean almostContains(Shape outer, Rectangle2D inner) {
//To address inconsistencies in the calculation of the bounding box we slightly shrink the inner rectangle
double x_with_tolerance = inner.getX() >= 0 ? inner.getX() + TOLERANCE : inner.getX() - TOLERANCE;
double y_with_tolerance = inner.getY() >= 0 ? inner.getY() + TOLERANCE : inner.getY() - TOLERANCE;
double height_with_tolerance = inner.getHeight() - (2 * TOLERANCE);
double width_with_tolerance = inner.getWidth() - (2 * TOLERANCE);
Rectangle2D innerRect = new Rectangle2D.Double(x_with_tolerance, y_with_tolerance, width_with_tolerance, height_with_tolerance);
return outer.contains(innerRect);
}
/**
* Checks whether the given element is filled and its fill is fully opaque.
*
* @param element the element to inspect
* @return true only if the element reports being filled and the fill opacity of its graphics state is exactly 1
* @throws PDFNetException if the element or its graphics state cannot be read
*/
private boolean isFilledAndNonTransparent(Element element) throws PDFNetException {
return element.isFilled() && element.getGState().getFillOpacity() == 1;
}
@SneakyThrows
private void drawBBox(ElementWriter writer, Rectangle2D r, String hexcolor) {
ColorPt colorPt = new ColorPt(Integer.valueOf(hexcolor.substring(1, 3), 16) / 255d,
Integer.valueOf(hexcolor.substring(3, 5), 16) / 255d,
Integer.valueOf(hexcolor.substring(5, 7), 16) / 255d);
ElementBuilder eb = new ElementBuilder();
Element rect = eb.createRect(r.getX(), r.getY(), r.getWidth(), r.getHeight());
rect.setPathStroke(true);
rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
rect.getGState().setStrokeColor(colorPt);
writer.writePlacedElement(rect);
colorPt.destroy();
eb.destroy();
}
@Builder
private record InvisibleElementRemovalContext(
boolean removePaths,
@ -593,11 +610,19 @@ public class InvisibleElementRemovalService {
ElementReader reader,
ClippingPathStack clippingPathStack,
MarkedContentStack markedContentStack,
List<ElementFeatures> overlappedElements,
List<ElementFeatures> visibleElements,
ElementFeatureLookup overlappedElements,
ElementFeatureLookup visibleElements,
Set<Long> visitedXObjIds,
Set<String> markedContentToIgnore) {
Set<String> markedContentToIgnore
) implements AutoCloseable {
@Override
public void close() {
overlappedElements.close();
visibleElements.close();
}
}
}
}

View File

@ -3,37 +3,66 @@ package com.iqser.red.pdftronlogic.commons;
import java.util.Deque;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Optional;
import java.util.Set;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.ocg.Group;
import com.pdftron.sdf.Obj;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
@RequiredArgsConstructor
public class MarkedContentStack {
Deque<MarkedContent> markedContentStack = new LinkedList<>();
private final PDFDoc pdfDoc;
Deque<MarkedContent> stack = new LinkedList<>();
Deque<Form> formStack = new LinkedList<>();
public void enterMarkedContent(String name) {
markedContentStack.push(new MarkedContent(name));
stack.push(new MarkedContent(name, name.startsWith("OC")));
}
/**
 * Pushes a new entry for the given form element onto the form stack.
 * The entry stores the object number of the form XObject, whether its dictionary has an "OC" entry
 * and whether its "Group" entry has a name-valued "S" entry equal to "Transparency".
 *
 * @param formElement the form element that is about to be processed
 */
@SneakyThrows
public void enterForm(Element formElement) {

    Obj xObject = formElement.getXObject();
    boolean hasOptionalContentEntry = xObject.findObj("OC") != null;

    Obj groupDict = xObject.findObj("Group");
    Obj groupSubType = groupDict == null ? null : groupDict.findObj("S");
    boolean isTransparencyGroup = groupSubType != null && groupSubType.isName() && groupSubType.getName().equals("Transparency");

    formStack.push(new Form(xObject.getObjNum(), hasOptionalContentEntry, isTransparencyGroup));
}
public void leaveMarkedContent() {
markedContentStack.pop();
stack.pop();
}
public String currentMarkedContent() {
if (markedContentStack.isEmpty()) {
if (stack.isEmpty()) {
return "";
}
return markedContentStack.peek().name();
return stack.peek().name();
}
public boolean currentMarkedContentContains(String name) {
Iterator<MarkedContent> markedContentIterator = markedContentStack.descendingIterator();
Iterator<MarkedContent> markedContentIterator = stack.descendingIterator();
while (markedContentIterator.hasNext()) {
var markedContent = markedContentIterator.next();
if (markedContent.name().equals(name)) {
@ -46,10 +75,10 @@ public class MarkedContentStack {
public boolean currentMarkedContentContainsAny(Set<String> names) {
if (markedContentStack.isEmpty()) {
if (stack.isEmpty()) {
return false;
}
Iterator<MarkedContent> markedContentIterator = markedContentStack.descendingIterator();
Iterator<MarkedContent> markedContentIterator = stack.descendingIterator();
while (markedContentIterator.hasNext()) {
var markedContent = markedContentIterator.next();
if (names.contains(markedContent.name())) {
@ -62,11 +91,31 @@ public class MarkedContentStack {
public void clear() {
markedContentStack.clear();
stack.clear();
}
private record MarkedContent(String name) {
/**
 * Reports whether the current nesting context may be rendered non-opaquely.
 *
 * @return true if any form on the form stack is flagged as optional content or as a transparency group,
 * or if any marked content on the marked content stack is flagged as optional content
 */
public boolean contextHasTransparency() {

    for (Form form : formStack) {
        if (form.optionalContent() || form.transparency()) {
            return true;
        }
    }
    for (MarkedContent markedContent : stack) {
        if (markedContent.optionalContent()) {
            return true;
        }
    }
    return false;
}
/**
* Removes the most recently pushed form from the form stack.
* Deque#pop throws a NoSuchElementException if the form stack is empty.
*/
public void leaveForm() {
formStack.pop();
}
private record MarkedContent(String name, boolean optionalContent) {
}
/**
* Entry of the form stack.
* ref is the object number of the form XObject, optionalContent tells whether the form has an "OC" entry,
* transparency tells whether the form's "Group" entry declares the subtype "Transparency".
*/
private record Form(long ref, boolean optionalContent, boolean transparency) {
}

View File

@ -5,6 +5,7 @@ import com.pdftron.pdf.*;
import com.pdftron.pdf.ocg.Group;
import com.pdftron.pdf.ocg.OCMD;
import com.pdftron.sdf.Obj;
import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j;
@ -27,8 +28,9 @@ public class OCGWatermarkRemovalService {
@SneakyThrows
private boolean hasOCGWatermarks(PDFDoc pdfDoc) {
Obj ocgs = pdfDoc.getOCGs();
if(ocgs != null) {
if (ocgs != null) {
for (int i = 0; i < ocgs.size(); i++) {
Group group = new Group(ocgs.getAt(i));
if (group.isValid() && group.getName().equals("Watermark")) {
@ -43,26 +45,21 @@ public class OCGWatermarkRemovalService {
@SneakyThrows
private void removeOCGWatermarks(PDFDoc pdfDoc) {
ElementReader reader = new ElementReader();
ElementWriter writer = new ElementWriter();
Set<Long> visitedXObjIds = new TreeSet<>();
try (PageIterator iterator = pdfDoc.getPageIterator(); ElementReader reader = new ElementReader(); ElementWriter writer = new ElementWriter()) {
Set<Long> visitedXObjIds = new TreeSet<>();
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
while (iterator.hasNext()) {
Page page = iterator.next();
writeAllElementsExceptWatermarks(page, reader, writer, visitedXObjIds);
}
Page page = iterator.next();
writeAllElementsExceptWatermarks(page, reader, writer, visitedXObjIds);
}
reader.destroy();
writer.destroy();
}
@SneakyThrows
private void writeAllElementsExceptWatermarks(Page page,
ElementReader reader,
ElementWriter writer,
Set<Long> visitedXObjIds) {
private void writeAllElementsExceptWatermarks(Page page, ElementReader reader, ElementWriter writer, Set<Long> visitedXObjIds) {
reader.begin(page);
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
@ -72,10 +69,7 @@ public class OCGWatermarkRemovalService {
}
private void processElements(Page page,
ElementReader reader,
ElementWriter writer,
Set<Long> visitedXObjIds) throws PDFNetException {
private void processElements(Page page, ElementReader reader, ElementWriter writer, Set<Long> visitedXObjIds) throws PDFNetException {
for (Element element = reader.next(); element != null; element = reader.next()) {
@ -93,6 +87,7 @@ public class OCGWatermarkRemovalService {
@SneakyThrows
private boolean inOCGWatermark(Element element) {
var xObj = element.getXObject();
if (xObj != null) {
Obj oc = xObj.findObj("OC");
@ -111,12 +106,7 @@ public class OCGWatermarkRemovalService {
@SneakyThrows
private void processForms(Page page,
Element element,
ElementReader reader,
ElementWriter writer,
Set<Long> visitedXObjIds) {
private void processForms(Page page, Element element, ElementReader reader, ElementWriter writer, Set<Long> visitedXObjIds) {
writer.writeElement(element);
@ -124,17 +114,17 @@ public class OCGWatermarkRemovalService {
visitedXObjIds.add(element.getXObject().getObjNum());
// writer needs to be newly initialized when entering a new content stream
// see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
ElementWriter formWriter = new ElementWriter();
reader.formBegin();
formWriter.begin(element.getXObject());
try (ElementWriter formWriter = new ElementWriter()) {
reader.formBegin();
formWriter.begin(element.getXObject());
reader.clearChangeList();
formWriter.setDefaultGState(reader);
reader.clearChangeList();
formWriter.setDefaultGState(reader);
processElements(page, reader, formWriter, visitedXObjIds);
formWriter.end();
formWriter.destroy();
reader.end();
processElements(page, reader, formWriter, visitedXObjIds);
formWriter.end();
reader.end();
}
}
}

View File

@ -0,0 +1,49 @@
package com.iqser.red.pdftronlogic.commons;
import java.lang.reflect.Field;
import com.pdftron.pdf.Font;
import lombok.experimental.UtilityClass;
/**
 * Sanity checks for PDFNet objects that wrap native handles.
 */
@UtilityClass
public class PDFNetUtils {

    /**
     * Asserts that the given font still refers to live native state.
     * <p>
     * Two things are checked: the font's own handle must be non-zero, and, if the font's ref handle (or one of
     * its superclasses) declares a field named {@code impl}, that field must be non-zero as well. Going by the
     * error message, a zero {@code impl} is taken to mean that the ElementReader the font came from is closed.
     * If no {@code impl} field can be found the second check is skipped silently.
     *
     * @param font the font to verify
     * @throws AssertionError if the font or its ref handle is already closed, or if the {@code impl} field cannot be read
     */
    @SuppressWarnings("PMD")
    public void requireFontNotClosed(Font font) {

        try {
            if (font.__GetHandle() == 0L) {
                throw new AssertionError("Font is already closed!");
            }

            Object refHandle = font.__GetRefHandle();
            Field implField = findImplField(refHandle.getClass());

            if (implField != null) {
                long implValue = (Long) implField.get(refHandle);
                if (implValue == 0L) {
                    throw new AssertionError("Associated ElementReader of Font is already closed!");
                }
            }
        } catch (IllegalAccessException e) {
            // keep the original exception as cause so the failing reflective access can be diagnosed
            throw new AssertionError("Font Ref is missing the field impl, should never happen!", e);
        }
    }


    /**
     * Walks up the class hierarchy and returns the first declared field named {@code impl}, made accessible,
     * or {@code null} if neither the class nor any of its superclasses declares one.
     */
    private Field findImplField(Class<?> type) {

        for (Class<?> clazz = type; clazz != null; clazz = clazz.getSuperclass()) {
            try {
                Field implField = clazz.getDeclaredField("impl");
                implField.setAccessible(true);
                return implField;
            } catch (NoSuchFieldException ignored) {
                // not declared on this level, continue with the superclass
            }
        }
        return null;
    }

}

View File

@ -0,0 +1,71 @@
package com.iqser.red.pdftronlogic.commons;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatureFactory;
import com.iqser.red.pdftronlogic.commons.features.ImageFeatures;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.sdf.Obj;
import lombok.experimental.UtilityClass;
/**
 * Collects the image features of a PDF document, page by page.
 */
@UtilityClass
public class PdfImageExtraction {

    /**
     * Reads the given PDF and builds the image features of all image and inline image elements, including those
     * nested in form XObjects.
     *
     * @param fileStream stream containing the PDF document
     * @return one list per page, in page iteration order, holding the image features found on that page
     * @throws IOException     if the document cannot be read from the stream
     * @throws PDFNetException if PDFNet fails to parse or traverse the document
     */
    public List<List<ImageFeatures>> extractImages(InputStream fileStream) throws IOException, PDFNetException {

        // The page iterator is declared last, so it is closed first, before the reader and the document.
        try (PDFDoc pdfDoc = new PDFDoc(fileStream); ElementReader reader = new ElementReader(); var iter = pdfDoc.getPageIterator()) {
            List<List<ImageFeatures>> imagesPerPage = new ArrayList<>(pdfDoc.getPageCount());

            while (iter.hasNext()) {
                Page page = iter.next();

                // visited ids are tracked per page, the page's own object counts as visited
                Set<Long> visitedXObjIds = new HashSet<>();
                visitedXObjIds.add(page.getSDFObj().getObjNum());

                List<ImageFeatures> imageFeatures = new LinkedList<>();
                reader.begin(page);
                processElements(reader, imageFeatures, visitedXObjIds);
                reader.end();
                imagesPerPage.add(imageFeatures);
            }
            return imagesPerPage;
        }
    }


    /**
     * Consumes all remaining elements of the reader's current content stream, adds the features of every image
     * to {@code imageFeaturesOnPage} and recurses into form XObjects that have not been visited yet.
     */
    private void processElements(ElementReader reader, List<ImageFeatures> imageFeaturesOnPage, Set<Long> visitedXObjIds) throws PDFNetException {

        for (Element element = reader.next(); element != null; element = reader.next()) {
            switch (element.getType()) {
                case Element.e_image, Element.e_inline_image -> imageFeaturesOnPage.add(ElementFeatureFactory.buildImage(element));
                case Element.e_form -> {
                    Obj formObj = element.getXObject();
                    // Set#add returns false for known ids, so each form is descended into at most once per page
                    if (visitedXObjIds.add(formObj.getObjNum())) {
                        reader.formBegin();
                        processElements(reader, imageFeaturesOnPage, visitedXObjIds);
                        reader.end();
                    }
                }
                default -> {
                    // all other element types are skipped
                }
            }
        }
    }

}

View File

@ -3,42 +3,102 @@ package com.iqser.red.pdftronlogic.commons;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatureFactory;
import com.iqser.red.pdftronlogic.commons.features.TextFeatures;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
import com.pdftron.pdf.TextExtractor;
import com.pdftron.sdf.Obj;
import lombok.experimental.UtilityClass;
@UtilityClass
public class PdfTextExtraction {
private static String execute(PDFDoc pdfDoc) throws IOException, PDFNetException{
TextExtractor extractor = new TextExtractor();
List<String> texts = new ArrayList<>();
private static String execute(PDFDoc pdfDoc) throws PDFNetException {
PageIterator iterator = pdfDoc.getPageIterator();
while (iterator.hasNext()) {
Page page = iterator.next();
extractor.begin(page);
texts.add(extractor.getAsText());
try (PageIterator iterator = pdfDoc.getPageIterator(); TextExtractor extractor = new TextExtractor()) {
List<String> texts = new ArrayList<>();
while (iterator.hasNext()) {
Page page = iterator.next();
extractor.begin(page);
texts.add(extractor.getAsText());
}
pdfDoc.close();
return String.join("\n", texts);
}
extractor.destroy();
pdfDoc.close();
return String.join("\n", texts);
}
/**
 * Opens the PDF contained in the given stream and returns its text as produced by {@code execute}.
 *
 * @param fileStream stream holding the PDF document
 * @return the extracted text of the document
 * @throws IOException     declared for callers
 * @throws PDFNetException if PDFTron fails to open or read the document
 */
public static String extractAllTextFromDocument(InputStream fileStream) throws IOException, PDFNetException {

    return execute(new PDFDoc(fileStream));
}
/**
 * Returns the text of an already opened document by delegating to {@code execute}.
 * NOTE(review): the visible versions of {@code execute} call {@code pdfDoc.close()}, so the passed
 * document appears to be closed by this call and should not be reused afterwards - confirm.
 *
 * @param pdfDoc the opened document to read
 * @return the extracted text of the document
 * @throws IOException     declared for callers
 * @throws PDFNetException if PDFTron fails while reading the document
 */
public static String extractAllTextFromDocument(PDFDoc pdfDoc) throws IOException, PDFNetException {
return execute(pdfDoc);
}
/**
 * Reads a PDF from the given stream and extracts the features of all text elements, page by page.
 *
 * @param fileStream      stream holding the PDF document
 * @param includePathData forwarded to the element walk, where it controls both whether glyphs are
 *                        collected and whether their path data is precomputed
 * @return one list per page, in the order delivered by the document's page iterator
 * @throws IOException     declared for callers; propagated if reading the stream fails
 * @throws PDFNetException if PDFTron fails to open or read the document
 */
public static List<List<TextFeatures>> extractAllGlyphsFromDocument(InputStream fileStream, boolean includePathData) throws IOException, PDFNetException {

    // The page iterator is a native resource too; close it together with the document and the reader.
    try (PDFDoc pdfDoc = new PDFDoc(fileStream); ElementReader reader = new ElementReader(); PageIterator pageIterator = pdfDoc.getPageIterator()) {
        List<List<TextFeatures>> glyphsPerPages = new ArrayList<>(pdfDoc.getPageCount());
        while (pageIterator.hasNext()) {
            Page page = pageIterator.next();

            // Fresh set per page: a form XObject is entered at most once per page.
            Set<Long> visitedXObjIds = new HashSet<>();
            visitedXObjIds.add(page.getSDFObj().getObjNum());

            List<TextFeatures> textFeatures = new ArrayList<>();
            reader.begin(page);
            processElements(reader, textFeatures, visitedXObjIds, includePathData);
            reader.end();

            glyphsPerPages.add(textFeatures);
        }
        return glyphsPerPages;
    }
}
/**
 * Walks the content stream the reader is currently positioned on and appends the features of every
 * text element to {@code textFeaturesOnPage}, descending into form XObjects that were not entered yet.
 *
 * @param reader             reader that has already begun a page or form
 * @param textFeaturesOnPage accumulator for the text elements found
 * @param visitedXObjIds     object numbers already entered; a form is only descended into once
 * @param includePathData    passed to {@code buildText} as both the include-glyphs and the precompute-path-data flag
 * @throws PDFNetException if PDFTron fails while reading elements
 */
private static void processElements(ElementReader reader, List<TextFeatures> textFeaturesOnPage, Set<Long> visitedXObjIds, boolean includePathData) throws PDFNetException {

    Element current;
    while ((current = reader.next()) != null) {
        int type = current.getType();
        if (type == Element.e_text) {
            textFeaturesOnPage.add(ElementFeatureFactory.buildText(current, includePathData, includePathData));
        } else if (type == Element.e_form) {
            long formId = current.getXObject().getObjNum();
            // add() reports whether the id is new, which replaces the separate contains/add pair
            boolean firstVisit = visitedXObjIds.add(formId);
            if (firstVisit) {
                reader.formBegin();
                processElements(reader, textFeaturesOnPage, visitedXObjIds, includePathData);
                reader.end();
            }
        }
    }
}
}

View File

@ -0,0 +1,91 @@
package com.iqser.red.pdftronlogic.commons;
import java.awt.Color;
import java.awt.geom.Rectangle2D;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
import com.iqser.red.pdftronlogic.commons.features.GlyphInfo;
import com.iqser.red.pdftronlogic.commons.features.TextFeatures;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.ColorPt;
import com.pdftron.pdf.ColorSpace;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementBuilder;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.PathData;
import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;
/**
 * Drawing helpers that write outlines of extracted features into a PDF content stream.
 */
@UtilityClass
public class VisualizationUtils {

    /**
     * Draws the bounding box of a feature and, for text features carrying glyphs, the glyph outlines.
     *
     * @param writer   writer of the content stream to draw into
     * @param features feature to visualize
     * @param color    color used for glyph fills and the bounding box stroke
     */
    @SneakyThrows
    public static void drawFeature(ElementWriter writer, ElementFeatures features, Color color) {

        try (ElementBuilder builder = new ElementBuilder()) {
            if (features instanceof TextFeatures textFeatures) {
                var glyphs = textFeatures.getGlyphs();
                // Text features built without glyph extraction may carry no glyph list at all.
                if (glyphs != null) {
                    for (GlyphInfo glyph : glyphs) {
                        // Resolve the path data once: when it is not cached, every call recomputes the glyph outline.
                        var pathData = glyph.getPathData();
                        if (pathData.isPresent()) {
                            drawPathData(pathData.get(), builder, writer, color);
                        }
                    }
                }
            }
            drawRect(features.getBoundingBox(), builder, writer, color);
        }
    }


    /**
     * Writes the given path as a filled, unstroked shape in the given color.
     *
     * @param pathData points and operators of the path
     * @param builder  builder used to create the path element
     * @param writer   writer of the content stream to draw into
     * @param color    fill color
     * @throws PDFNetException if PDFTron fails to build or write the element
     */
    public static void drawPathData(PathData pathData, ElementBuilder builder, ElementWriter writer, Color color) throws PDFNetException {

        Element path = builder.createPath(pathData.getPoints(), pathData.getOperators());
        path.setPathFill(true);
        path.setPathStroke(false);
        path.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
        try (ColorPt colorPt = toColorPt(color)) {
            path.getGState().setFillColor(colorPt);
        }
        path.setWindingFill(true);
        writer.writeElement(path);
    }


    /**
     * Strokes the outline of the rectangle in the given color without filling it.
     *
     * @param rectangle2D rectangle to draw
     * @param builder     builder used to create the rectangle element
     * @param writer      writer of the content stream to draw into
     * @param color       stroke color
     * @throws PDFNetException if PDFTron fails to build or write the element
     */
    public static void drawRect(Rectangle2D rectangle2D, ElementBuilder builder, ElementWriter writer, Color color) throws PDFNetException {

        drawRect(rectangle2D, builder, writer, color, false);
    }


    /**
     * Draws a rectangle. Without {@code fill} the outline is stroked in {@code color} with line width 0.5.
     * With {@code fill} the outline is stroked in black with line width 0.1 and a second rectangle, inset by
     * that line width on every side, is filled with {@code color}.
     *
     * @param rectangle2D rectangle to draw
     * @param builder     builder used to create the rectangle elements
     * @param writer      writer of the content stream to draw into
     * @param color       stroke color (unfilled) or fill color (filled)
     * @param fill        whether to fill the rectangle
     * @throws PDFNetException if PDFTron fails to build or write an element
     */
    public static void drawRect(Rectangle2D rectangle2D, ElementBuilder builder, ElementWriter writer, Color color, boolean fill) throws PDFNetException {

        Element rect = builder.createRect(rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight());
        rect.setPathFill(false);
        rect.setPathStroke(true);
        rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
        try (ColorPt colorPt = toColorPt(fill ? Color.BLACK : color)) {
            rect.getGState().setStrokeColor(colorPt);
        }
        double lineWidth = fill ? 0.1 : 0.5;
        rect.getGState().setLineWidth(lineWidth);
        writer.writeElement(rect);

        if (fill) {
            Element filledRect = builder.createRect(rectangle2D.getX() + lineWidth,
                                                    rectangle2D.getY() + lineWidth,
                                                    rectangle2D.getWidth() - 2 * lineWidth,
                                                    rectangle2D.getHeight() - 2 * lineWidth);
            filledRect.setPathFill(true);
            filledRect.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
            try (ColorPt colorPt = toColorPt(color)) {
                filledRect.getGState().setFillColor(colorPt);
            }
            writer.writeElement(filledRect);
        }
    }


    /** Converts the first three color components of an AWT color into a PDFTron color point; the caller closes it. */
    private static ColorPt toColorPt(Color color) throws PDFNetException {

        float[] comp = color.getColorComponents(null);
        return new ColorPt(comp[0], comp[1], comp[2]);
    }

}

View File

@ -10,6 +10,8 @@ import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatureFactory;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementReader;
@ -93,27 +95,25 @@ public class WatermarkRemovalService {
Map<Long, List<ElementFeatures>> formObjectsAndImagesForPages = new HashMap<>();
Set<Long> visitedXObjIds = new TreeSet<>();
ElementReader reader = new ElementReader();
try (ElementReader reader = new ElementReader(); PageIterator iterator = pdfDoc.getPageIterator()) {
while (iterator.hasNext()) {
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
Page page = iterator.next();
Page page = iterator.next();
double minAreaCoveringFromPage = AREA_THRESHOLD * page.getPageHeight() * page.getPageWidth();
double minAreaCoveringFromPage = AREA_THRESHOLD * page.getPageHeight() * page.getPageWidth();
LinkedList<ElementFeatures> elementFeaturesLinkedList = new LinkedList<>();
LinkedList<ElementFeatures> elementFeaturesLinkedList = new LinkedList<>();
reader.begin(page);
for (Element element = reader.next(); element != null; element = reader.next()) {
processElement(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringFromPage, page);
}
reader.begin(page);
for (Element element = reader.next(); element != null; element = reader.next()) {
processElement(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringFromPage, page);
formObjectsAndImagesForPages.put(page.getSDFObj().getObjNum(), elementFeaturesLinkedList);
}
formObjectsAndImagesForPages.put(page.getSDFObj().getObjNum(), elementFeaturesLinkedList);
return formObjectsAndImagesForPages;
}
reader.destroy();
return formObjectsAndImagesForPages;
}
@ -124,14 +124,16 @@ public class WatermarkRemovalService {
double minAreaCoveringPage,
Page page) throws PDFNetException {
if (element.getBBox() == null) {
return;
}
try (var bbox = element.getBBox()) {
if (bbox == null) {
return;
}
switch (element.getType()) {
case Element.e_form -> processXObject(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage, page);
case Element.e_image, Element.e_inline_image -> processImages(element, elementFeaturesLinkedList, page, minAreaCoveringPage);
case Element.e_text -> processText(element, elementFeaturesLinkedList, page);
switch (element.getType()) {
case Element.e_form -> processXObject(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage, page);
case Element.e_image, Element.e_inline_image -> processImages(element, elementFeaturesLinkedList, page, minAreaCoveringPage);
case Element.e_text -> processText(element, elementFeaturesLinkedList, page);
}
}
}
@ -148,11 +150,13 @@ public class WatermarkRemovalService {
return;
}
boolean isBigEnough = Math.abs(element.getBBox().getY1() - element.getBBox().getY2()) > page.getPageHeight() * MIN_TEXTWATERMARK_HEIGHT_THRESHOLD;
try (var bbox = element.getBBox()) {
boolean isBigEnough = Math.abs(bbox.getY1() - bbox.getY2()) > page.getPageHeight() * MIN_TEXTWATERMARK_HEIGHT_THRESHOLD;
if (isBigEnough) {
ElementFeatures elementFeatures = ElementFeatureFactory.extractFeatures(element);
elementFeaturesLinkedList.add(elementFeatures);
if (isBigEnough) {
ElementFeatures elementFeatures = ElementFeatureFactory.extractFeatures(element);
elementFeaturesLinkedList.add(elementFeatures);
}
}
}
@ -161,8 +165,9 @@ public class WatermarkRemovalService {
@SneakyThrows
private boolean isTextRotated(Element element) {
return Math.abs(element.getCTM().getB()) < Math.sin(Math.toRadians(ROTATED_TEXT_THRESHOLD)) || Math.abs(element.getCTM()
.getB()) > Math.sin(Math.toRadians(70 - ROTATED_TEXT_THRESHOLD));
try (var ctm = element.getCTM()) {
return Math.abs(ctm.getB()) < Math.sin(Math.toRadians(ROTATED_TEXT_THRESHOLD)) || Math.abs(ctm.getB()) > Math.sin(Math.toRadians(70 - ROTATED_TEXT_THRESHOLD));
}
}
@ -186,13 +191,15 @@ public class WatermarkRemovalService {
if (element.getXObject() == null) {
return;
}
if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringPage && isLocatedNearBorder(element, page)) {
return;
}
try (var bbox = element.getBBox()) {
if (bbox.getHeight() * bbox.getWidth() < minAreaCoveringPage && isLocatedNearBorder(element, page)) {
return;
}
String hashOfImage = ImageHashFactory.calculate(element);
ElementFeatures elementFeatures = ElementFeatureFactory.extractFeaturesWithHash(element, hashOfImage);
elementFeaturesLinkedList.add(elementFeatures);
String hashOfImage = ImageHashFactory.calculate(element);
ElementFeatures elementFeatures = ElementFeatureFactory.buildImageWithHash(element, hashOfImage);
elementFeaturesLinkedList.add(elementFeatures);
}
}
@ -200,10 +207,12 @@ public class WatermarkRemovalService {
@SneakyThrows
private boolean isLocatedNearBorder(Element element, Page page) {
return element.getBBox().getY1() < page.getVisibleContentBox().getY1() + page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD || element.getBBox()
.getY2() > page.getVisibleContentBox().getY2() - page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD || element.getBBox().getX1() < page.getVisibleContentBox()
.getX1() + page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD || element.getBBox().getX2() > page.getVisibleContentBox()
.getX2() - page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD;
try (var bbox = element.getBBox(); var contentBox = page.getVisibleContentBox();) {
return bbox.getY1() < contentBox.getY1() + page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD
|| bbox.getY2() > contentBox.getY2() - page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD
|| bbox.getX1() < contentBox.getX1() + page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD
|| bbox.getX2() > contentBox.getX2() - page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD;
}
}
@ -215,18 +224,20 @@ public class WatermarkRemovalService {
double minAreaCoveringPage,
Page page) {
if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringPage) {
return;
try (var bbox = element.getBBox()) {
if (bbox.getHeight() * bbox.getWidth() < minAreaCoveringPage) {
return;
}
}
if (visitedXObjIds.add(element.getXObject().getObjNum())) {
ElementReader xObjectReader = new ElementReader();
xObjectReader.begin(element.getXObject());
for (Element element1 = xObjectReader.next(); element1 != null; element1 = xObjectReader.next()) {
processElement(element1, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage, page);
try (ElementReader xObjectReader = new ElementReader()) {
xObjectReader.begin(element.getXObject());
for (Element element1 = xObjectReader.next(); element1 != null; element1 = xObjectReader.next()) {
processElement(element1, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage, page);
}
elementFeaturesLinkedList.add(ElementFeatureFactory.extractFeatures(element));
}
elementFeaturesLinkedList.add(ElementFeatureFactory.extractFeatures(element));
xObjectReader.destroy();
} else {
elementFeaturesLinkedList.add(ElementFeatureFactory.extractFeatures(element));
}
@ -245,10 +256,12 @@ public class WatermarkRemovalService {
.stream()
.flatMap(Collection::stream)
.filter(elementFeature -> formObjectsPerPage.values()
.stream()
.filter(elementFeaturesOnPage -> elementFeaturesOnPage.stream()
.anyMatch(elementFeature.getElementType() == Element.e_image || elementFeature.getElementType() == Element.e_inline_image ? elementFeature::isSimilarTo : elementFeature::almostMatches))
.count() >= minPagesFilter)
.stream()
.filter(elementFeaturesOnPage -> elementFeaturesOnPage.stream()
.anyMatch(elementFeature.getElementType() == Element.e_image
|| elementFeature.getElementType()
== Element.e_inline_image ? elementFeature::similar : elementFeature::matches))
.count() >= minPagesFilter)
.toList();
}
@ -256,21 +269,16 @@ public class WatermarkRemovalService {
@SneakyThrows
private void removeAllWatermarks(PDFDoc pdfDoc, List<ElementFeatures> watermarksElementFeaturesList) {
ElementReader reader = new ElementReader();
ElementWriter writer = new ElementWriter();
Set<Long> visitedXObjIds = new TreeSet<>();
try (PageIterator iterator = pdfDoc.getPageIterator(); ElementReader reader = new ElementReader(); ElementWriter writer = new ElementWriter()) {
Set<Long> visitedXObjIds = new TreeSet<>();
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
while (iterator.hasNext()) {
Page page = iterator.next();
writeAllElementsExceptWatermarks(page, reader, writer, watermarksElementFeaturesList, visitedXObjIds);
Page page = iterator.next();
writeAllElementsExceptWatermarks(page, reader, writer, watermarksElementFeaturesList, visitedXObjIds);
}
}
reader.destroy();
writer.destroy();
}
@ -300,16 +308,21 @@ public class WatermarkRemovalService {
switch (element.getType()) {
case Element.e_image, Element.e_inline_image -> {
if (element.getBBox() == null) {
writer.writeElement(element);
continue;
try (var bbox = element.getBBox()) {
if (bbox == null) {
writer.writeElement(element);
continue;
}
}
if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringFromPage && isLocatedNearBorder(element, page) && element.getBBox()
.getHeight() * element.getBBox().getWidth() < minAreaCoveringFromPage || element.getXObject() == null) {
try (var bbox = element.getBBox()) {
if (bbox.getHeight() * bbox.getWidth() < minAreaCoveringFromPage
&& isLocatedNearBorder(element, page)
&& bbox.getHeight() * bbox.getWidth() < minAreaCoveringFromPage || element.getXObject() == null) {
writer.writeElement(element);
continue;
writer.writeElement(element);
continue;
}
}
removeImages(element, writer, watermarksElementFeaturesList);
}
@ -330,7 +343,7 @@ public class WatermarkRemovalService {
}
for (ElementFeatures elementFeatures : watermarksElementFeaturesList) {
if (elementFeatures.almostMatches(element)) {
if (elementFeatures.matches(ElementFeatureFactory.extractFeatures(element))) {
return;
}
}
@ -349,8 +362,10 @@ public class WatermarkRemovalService {
return false;
}
if (Math.max(element.getBBox().getY1(), element.getBBox().getY2()) < page.getVisibleContentBox().getY1() + page.getPageHeight() * TEXT_POSITION_THRESHOLD) {
return false;
try (var bbox = element.getBBox(); var contents = page.getVisibleContentBox();) {
if (Math.max(bbox.getY1(), bbox.getY2()) < contents.getY1() + page.getPageHeight() * TEXT_POSITION_THRESHOLD) {
return false;
}
}
return true;
}
@ -360,9 +375,9 @@ public class WatermarkRemovalService {
private void removeImages(Element element, ElementWriter writer, List<ElementFeatures> watermarksElementFeaturesList) {
String hashValueOfImage = ImageHashFactory.calculate(element);
ElementFeatures imageFeatures = ElementFeatureFactory.extractFeaturesWithHash(element, hashValueOfImage);
ElementFeatures imageFeatures = ElementFeatureFactory.buildImageWithHash(element, hashValueOfImage);
for (ElementFeatures elementFeatures : watermarksElementFeaturesList) {
if (elementFeatures.isSimilarTo(imageFeatures)) {
if (elementFeatures.similar(imageFeatures)) {
return;
}
}
@ -379,7 +394,7 @@ public class WatermarkRemovalService {
Set<Long> visitedXObjIds) throws PDFNetException {
for (ElementFeatures elementFeatures : watermarksElementFeaturesList) {
if (elementFeatures.almostMatches(element)) {
if (elementFeatures.matches(ElementFeatureFactory.extractFeatures(element))) {
return;
}
}
@ -390,17 +405,17 @@ public class WatermarkRemovalService {
visitedXObjIds.add(element.getXObject().getObjNum());
// writer needs to be newly initialized when entering a new content stream
// see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
ElementWriter formWriter = new ElementWriter();
reader.formBegin();
formWriter.begin(element.getXObject());
try (ElementWriter formWriter = new ElementWriter()) {
reader.formBegin();
formWriter.begin(element.getXObject());
reader.clearChangeList();
formWriter.setDefaultGState(reader);
reader.clearChangeList();
formWriter.setDefaultGState(reader);
processElements(page, reader, formWriter, watermarksElementFeaturesList, visitedXObjIds);
formWriter.end();
formWriter.destroy();
reader.end();
processElements(page, reader, formWriter, watermarksElementFeaturesList, visitedXObjIds);
formWriter.end();
reader.end();
}
}
}

View File

@ -0,0 +1,197 @@
package com.iqser.red.pdftronlogic.commons.features;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import com.iqser.red.pdftronlogic.commons.Converter;
import com.pdftron.common.Matrix2D;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.CharData;
import com.pdftron.pdf.CharIterator;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.Font;
import com.pdftron.pdf.GState;
import com.pdftron.pdf.Image;
import com.pdftron.sdf.Obj;
import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;
/**
 * Builds the feature objects ({@link PathFeatures}, {@link TextFeatures}, {@link ImageFeatures},
 * {@link FormFeatures}) that describe a PDFTron {@link Element}.
 */
@UtilityClass
public class ElementFeatureFactory {

    /**
     * Builds the feature object matching the type of the given element.
     * Text features are built without glyph information.
     *
     * @param element path, text, image, inline image or form element
     * @return the features of the element
     * @throws PDFNetException  if PDFTron fails while reading the element
     * @throws RuntimeException if the element type is not one of the supported types
     */
    public ElementFeatures extractFeatures(Element element) throws PDFNetException {

        return switch (element.getType()) {
            case Element.e_path -> buildPath(element);
            case Element.e_text -> buildText(element, false, false);
            case Element.e_image, Element.e_inline_image -> buildImage(element);
            case Element.e_form -> buildForm(element);
            // This technically should never happen, it's a safetynet
            default -> throw new RuntimeException("Feature Extraction is not supported for PDFTron.Element with type: " + element.getType());
        };
    }


    /**
     * Builds image features and attaches a precomputed hash of the image.
     *
     * @param element    image or inline image element
     * @param hashObject hash of the image content
     * @return the image features including the hash
     * @throws PDFNetException if PDFTron fails while reading the element
     */
    public ImageFeatures buildImageWithHash(Element element, String hashObject) throws PDFNetException {

        return buildImageBase(element).hashOfImage(hashObject).build();
    }


    /**
     * Builds image features without an image hash.
     *
     * @param element image or inline image element
     * @return the image features
     * @throws PDFNetException if PDFTron fails while reading the element
     */
    public ImageFeatures buildImage(Element element) throws PDFNetException {

        return buildImageBase(element).build();
    }


    /**
     * Builds the features of a form XObject element: its bounding box, the type of the XObject and,
     * if the XObject is a stream, the size of its decoded stream (0 otherwise).
     *
     * @param element form element
     * @return the form features
     * @throws PDFNetException if PDFTron fails while reading the element
     */
    public FormFeatures buildForm(Element element) throws PDFNetException {

        try (var bbox = element.getBBox()) {
            Obj xObject = element.getXObject();
            int xObjectType = xObject.getType();
            return FormFeatures.builder()
                    .elementType(element.getType())
                    .boundingBox(Converter.toRectangle2D(bbox))
                    .xObjectType(xObjectType)
                    // named constant instead of the bare type code 7
                    .dictOrArrayOrStreamLength(xObjectType == Obj.e_stream ? xObject.getDecodedStream().size() : 0)
                    .build();
        }
    }


    /**
     * Collects everything the image features have in common, leaving the hash to the caller.
     */
    private ImageFeatures.ImageFeaturesBuilder<?, ?> buildImageBase(Element element) throws PDFNetException {

        assert element.getType() == Element.e_image || element.getType() == Element.e_inline_image;

        try (var bbox = element.getBBox()) {
            GState gState = element.getGState();

            // Opacities lie in [0, 1]; anything below 1 lets the backdrop shine through.
            // (The former "> 1" comparison could never be true for a valid opacity.)
            boolean transparent = gState.getBlendMode() != GState.e_bl_normal
                    || gState.getFillOpacity() < 1
                    || gState.getStrokeOpacity() < 1;

            // see spec: 8.9.6.3 Explicit masking
            boolean masked = false;
            if (element.getType() == Element.e_image) {
                Image image = new Image(element.getXObject());
                Obj mask = image.getMask();
                if (mask != null && mask.getType() == Obj.e_stream) {
                    Image imageMask = new Image(mask);
                    masked = imageMask.isImageMask();
                }
            }

            return ImageFeatures.builder()
                    .elementType(element.getType())
                    .boundingBox(Converter.toRectangle2D(bbox))
                    .dataSize(element.getImageDataSize())
                    .height(element.getImageHeight())
                    .width(element.getImageWidth())
                    .renderingIntent(element.getImageRenderingIntent())
                    .componentNum(element.getComponentNum())
                    .bitsPerComponent(element.getBitsPerComponent())
                    .imageMask(element.isImageMask())
                    .softMask(gState.getSoftMask() != null)
                    .masked(masked)
                    .transparent(transparent);
        }
    }


    /**
     * Builds the features of a text element.
     * <p>
     * Use includeGlyphs = true and preComputePathData = true, when trying to draw the glyphs, see GlyphExtractionTest.
     * preComputePathData = true is needed, when trying to access the PathData after the PDFDoc/ElementReader has been closed.
     *
     * @param element            text element
     * @param includeGlyphs      whether per-glyph information is attached to the features
     * @param preComputePathData whether the path data of each glyph is computed and cached right away
     * @return the text features
     * @throws PDFNetException if PDFTron fails while reading the element
     */
    public TextFeatures buildText(Element element, boolean includeGlyphs, boolean preComputePathData) throws PDFNetException {

        try (var bbox = element.getBBox()) {
            TextFeatures.TextFeaturesBuilder<?, ?> simpleTextFeatures = TextFeatures.builder()
                    .elementType(element.getType())
                    .boundingBox(Converter.toRectangle2D(bbox))
                    .text(element.getTextString())
                    .font(element.getGState().getFont().getType())
                    .fontsize(element.getGState().getFontSize());

            if (includeGlyphs) {
                simpleTextFeatures.glyphs(extractGlyphInfo(element, preComputePathData));
            }

            return simpleTextFeatures.build();
        }
    }


    /**
     * Builds the features of a path element: bounding box, clipping/stroke/fill flags, fill and stroke
     * colors and the path geometry transformed with the element's CTM.
     *
     * @param element path element
     * @return the path features
     * @throws PDFNetException if PDFTron fails while reading the element
     */
    public PathFeatures buildPath(Element element) throws PDFNetException {

        try (var bbox = element.getBBox(); var ctm = element.getCTM(); var fillColor = element.getGState().getFillColor(); var strokeColor = element.getGState().getStrokeColor()) {
            return PathFeatures.builder()
                    .elementType(element.getType())
                    .boundingBox(Converter.toRectangle2D(bbox))
                    .clippingPath(element.isClippingPath())
                    .clipWindingFill(element.isClipWindingFill())
                    .stroked(element.isStroked())
                    .filled(element.isFilled())
                    .windingFill(element.isWindingFill())
                    .fillColor(Converter.convertColor(element.getGState().getFillColorSpace(), fillColor))
                    .strokeColor(Converter.convertColor(element.getGState().getStrokeColorSpace(), strokeColor))
                    .linePath(Converter.convertToGeneralPathAndTransformToInitialUserSpace(element.getPathData(), ctm))
                    .build();
        }
    }


    /**
     * Creates one {@link GlyphInfo} per character of the text element. Returns an empty list for
     * elements without a bounding box and for Type 3 fonts.
     */
    @SneakyThrows
    private List<GlyphInfo> extractGlyphInfo(Element textElement, boolean precomputePathData) {

        assert textElement != null && textElement.getType() == Element.e_text;

        // The bounding box is only needed for the null check; release the native rect right away.
        try (var bbox = textElement.getBBox()) {
            if (bbox == null) {
                return Collections.emptyList();
            }
        }

        Font font = textElement.getGState().getFont();

        if (font.getType() == Font.e_Type3) {
            // type 3 fonts seem to be much more difficult, one must use font.getType3GlyphStream and font.getType3FontMatrix instead
            // couldn't find much information except this post https://groups.google.com/g/pdfnet-sdk/c/SvhMflbtQho
            // will implement this when necessary
            return Collections.emptyList();
        }

        List<GlyphInfo> glyphs = new ArrayList<>();
        short unitsPerEm = font.getUnitsPerEm();

        // The element CTM and the text matrix are only inputs of the product, so they are closed with it
        // instead of being left to the garbage collector.
        try (CharIterator charIterator = textElement.getCharIterator();
                Matrix2D elementCtm = textElement.getCTM();
                Matrix2D textMatrix = textElement.getTextMatrix();
                Matrix2D ctm = elementCtm.multiply(textMatrix)) {
            while (charIterator.hasNext()) {
                CharData charData = charIterator.next();
                long charCode = charData.getCharCode();

                try (Matrix2D fontMatrix = computeFontMatrix(charData, textElement, unitsPerEm)) {
                    // The product matrix is owned by the glyph and released through GlyphInfo#destroy.
                    GlyphInfo glyph = GlyphInfo.builder() //
                            .charCode(charCode) //
                            .cachePathData(precomputePathData) //
                            .glyphMatrix(ctm.multiply(fontMatrix)) //
                            .font(font) //
                            .build();

                    glyphs.add(glyph);

                    if (precomputePathData) {
                        // call the functions once to cache all data
                        glyph.getBoundingBox();
                    }
                }
            }
        }
        return glyphs;
    }


    /**
     * Builds the matrix that scales a glyph from font units to text space (font size per em, horizontal
     * scale applied to x, y axis flipped) and moves it to the glyph position.
     */
    private Matrix2D computeFontMatrix(CharData charData, Element textElement, short unitsPerEm) throws PDFNetException {

        double yScaleFactor = textElement.getGState().getFontSize() / unitsPerEm;
        double xScaleFactor = (textElement.getGState().getHorizontalScale() / 100) * yScaleFactor;

        return new Matrix2D(xScaleFactor, 0, 0, -yScaleFactor, charData.getGlyphX(), charData.getGlyphY());
    }

}

View File

@ -0,0 +1,102 @@
package com.iqser.red.pdftronlogic.commons.features;
import static com.iqser.red.pdftronlogic.commons.ComparisonUtils.almostEqual;
import java.awt.Shape;
import java.awt.geom.Rectangle2D;
import com.iqser.red.pdftronlogic.commons.ComparisonUtils;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
/**
 * Base description of a PDF content element: its PDFTron element type and its bounding box,
 * together with tolerant comparisons ("matches", "similar") and containment checks.
 */
@Getter
@SuperBuilder
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ElementFeatures {

    // how far x and y may differ, as a fraction of the compared box's width / height
    private static final double RECT_POSITION_SIMILARITY_THRESHOLD_FACTOR = 0.2;
    // how far width and height may differ, as a fraction of the compared box's size
    private static final double RECT_SIZE_SIMILARITY_THRESHOLD_FACTOR = 0.1;

    int elementType;
    Rectangle2D boundingBox;


    /**
     * @return true if the other features have the same element type and a bounding box that is
     *         almost equal to this one
     */
    public boolean matches(ElementFeatures elementFeatures) {

        if (elementFeatures.getElementType() != elementType) {
            return false;
        }
        Rectangle2D otherBox = elementFeatures.getBoundingBox();
        return otherBox != null && bboxMatches(otherBox);
    }


    /**
     * Compares x, y, width and height of the given box with this bounding box using a tolerance,
     * since the bounding box calculation is not fully consistent.
     */
    @SneakyThrows
    protected boolean bboxMatches(Rectangle2D bBox) {

        boolean sameX = almostEqual(bBox.getX(), boundingBox.getX());
        if (!sameX) {
            return false;
        }
        boolean sameY = almostEqual(bBox.getY(), boundingBox.getY());
        if (!sameY) {
            return false;
        }
        boolean sameWidth = almostEqual(bBox.getWidth(), boundingBox.getWidth());
        if (!sameWidth) {
            return false;
        }
        return almostEqual(bBox.getHeight(), boundingBox.getHeight());
    }


    /**
     * @return the shape used when testing whether this element covers another one; the bounding box here
     */
    public Shape getOverlapShape() {

        return boundingBox;
    }


    /**
     * @return true if the other features have the same element type and a bounding box whose position
     *         and size lie within the similarity thresholds of this one
     */
    public boolean similar(ElementFeatures elementFeatures) {

        if (elementFeatures.getElementType() != elementType) {
            return false;
        }
        Rectangle2D otherBox = elementFeatures.getBoundingBox();
        return otherBox != null && areRectsSimilar(otherBox);
    }


    /**
     * Looser variant of the bounding box comparison: positions and sizes may differ by a fraction
     * of the given rectangle's dimensions.
     */
    protected boolean areRectsSimilar(Rectangle2D rectangle2D) {

        if (!isPositionSimilar(rectangle2D.getX(), boundingBox.getX(), rectangle2D.getWidth())) {
            return false;
        }
        if (!isPositionSimilar(rectangle2D.getY(), boundingBox.getY(), rectangle2D.getHeight())) {
            return false;
        }
        if (!isSizeSimilar(rectangle2D.getWidth(), boundingBox.getWidth())) {
            return false;
        }
        return isSizeSimilar(rectangle2D.getHeight(), boundingBox.getHeight());
    }


    /** @return true if a and b are closer than the position threshold fraction of boxSize */
    protected boolean isPositionSimilar(double a, double b, double boxSize) {

        double tolerance = boxSize * RECT_POSITION_SIMILARITY_THRESHOLD_FACTOR;
        return Math.abs(a - b) < tolerance;
    }


    /** @return true if a and b are closer than the size threshold fraction of a */
    protected boolean isSizeSimilar(double a, double b) {

        double tolerance = a * RECT_SIZE_SIMILARITY_THRESHOLD_FACTOR;
        return Math.abs(a - b) < tolerance;
    }


    /** @return true if the overlap shape of this element contains the shrunken bounding box of the given features */
    public boolean contains(ElementFeatures features) {

        return features.containedBy(this);
    }


    /** @return true if the overlap shape of the given element contains the shrunken bounding box of this element */
    public boolean testOverlapped(ElementFeatures overlappingElement) {

        return containedBy(overlappingElement);
    }


    private boolean containedBy(ElementFeatures features) {

        return features.getOverlapShape().contains(ComparisonUtils.shrinkRectangle(boundingBox));
    }


    /** Releases resources held by the features; nothing to release in this base class. */
    public void destroy() {
        // do nothing, except for text
    }

}

View File

@ -0,0 +1,45 @@
package com.iqser.red.pdftronlogic.commons.features;
import static com.iqser.red.pdftronlogic.commons.ComparisonUtils.almostEqual;
import java.awt.geom.Rectangle2D;
import lombok.AccessLevel;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
@EqualsAndHashCode(callSuper = true)
@Getter
@SuperBuilder
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class FormFeatures extends ElementFeatures {
int xObjectType;
long dictOrArrayOrStreamLength;
public boolean matches(ElementFeatures elementFeatures) {
if (elementFeatures instanceof FormFeatures features) {
return elementFeatures.getElementType() == getElementType()
&& elementFeatures.getBoundingBox() != null
&& (super.bboxMatches(elementFeatures.getBoundingBox())
|| rotationMatches(elementFeatures.getBoundingBox()
.getBounds2D()))
&& xObjectType == features.getXObjectType()
&& dictOrArrayOrStreamLength == features.getDictOrArrayOrStreamLength();
}
return false;
}
private boolean rotationMatches(Rectangle2D bBox) {
return almostEqual(bBox.getWidth(), getBoundingBox().getHeight()) && //
almostEqual(bBox.getHeight(), getBoundingBox().getWidth());
}
}

View File

@ -0,0 +1,116 @@
package com.iqser.red.pdftronlogic.commons.features;
import java.awt.geom.Rectangle2D;
import java.util.Optional;
import com.iqser.red.pdftronlogic.commons.ComparisonUtils;
import com.iqser.red.pdftronlogic.commons.Converter;
import com.iqser.red.pdftronlogic.commons.PDFNetUtils;
import com.pdftron.common.Matrix2D;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.Font;
import com.pdftron.pdf.PathData;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
/**
 * A single glyph of a text element: its character code, font and placement matrix, with lazily
 * computed outline (path data) and bounding box, plus bookkeeping for overlap detection.
 */
@Slf4j
@Builder
@AllArgsConstructor
@RequiredArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class GlyphInfo {

    final Matrix2D glyphMatrix;
    final long charCode;
    final Font font;

    // in order to speed up invisible element removal, we only calculate the pathdata where necessary, as it is the costliest operation.
    // It will only work as long as the associated ElementReader is still open, as the Font is bound to the ContentStream being read.
    Rectangle2D bbox;
    final boolean cachePathData;
    PathData pathData;

    boolean overlapped;
    ElementFeatures overlappingElement;


    /**
     * Checks whether the overlap shape of the given element contains this glyph's (shrunken) bounding box
     * and remembers the first element for which that is the case.
     *
     * @param overlappingElement element that may cover this glyph
     * @return true if the glyph was already marked as overlapped, if it has no bounding box, or if the
     *         given element covers it
     */
    public boolean testOverlapped(ElementFeatures overlappingElement) {

        if (!overlapped) {
            Optional<Rectangle2D> glyphBounds = getBoundingBox();
            if (glyphBounds.isEmpty()) {
                // nothing to compare against; reported as overlapped without recording an element
                return true;
            }
            boolean covered = overlappingElement.getOverlapShape().contains(ComparisonUtils.shrinkRectangle(glyphBounds.get()));
            if (covered) {
                this.overlapped = true;
                this.overlappingElement = overlappingElement;
            }
        }
        return overlapped;
    }


    /**
     * @return the unicode string the font maps the character code to, or an empty string if the mapping fails
     */
    public String getUnicode() {

        String unicode;
        try {
            unicode = new String(font.mapToUnicode(charCode));
        } catch (PDFNetException e) {
            unicode = "";
        }
        return unicode;
    }


    /**
     * Returns the outline of the glyph, computing it from the font when it is not cached yet. The result
     * is only stored when {@code cachePathData} is set.
     *
     * @return the path data, or empty if the font yields a path consisting of the single operator 6
     */
    @SneakyThrows
    public Optional<PathData> getPathData() {

        if (pathData != null) {
            return Optional.of(pathData);
        }

        PDFNetUtils.requireFontNotClosed(font);
        PathData freshPathData = font.getGlyphPath(charCode, true, glyphMatrix);

        var operators = freshPathData.getOperators();
        boolean onlyOperatorSix = operators.length == 1 && operators[0] == 6;
        if (onlyOperatorSix) {
            // This happens for some chinese characters or whitespaces, don't know why...
            return Optional.empty();
        }

        if (cachePathData) {
            pathData = freshPathData;
        }
        return Optional.of(freshPathData);
    }


    /**
     * Returns the bounds of the glyph outline, computing and storing them on first use.
     *
     * @return the bounding box, or empty if no path data is available
     */
    @SneakyThrows
    public Optional<Rectangle2D> getBoundingBox() {

        if (bbox != null) {
            return Optional.of(bbox);
        }

        Optional<PathData> glyphPath = getPathData();
        if (glyphPath.isEmpty()) {
            return Optional.empty();
        }

        bbox = Converter.convertToGeneralPath(glyphPath.get()).getBounds2D();
        return Optional.of(bbox);
    }


    /** Closes the glyph matrix, if there is one. */
    @SneakyThrows
    public void destroy() {

        if (glyphMatrix == null) {
            return;
        }
        glyphMatrix.close();
    }

}

View File

@ -0,0 +1,75 @@
package com.iqser.red.pdftronlogic.commons.features;
import lombok.AccessLevel;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
/**
 * Features of an image element, used to decide whether two image elements are the same image.
 */
@EqualsAndHashCode(callSuper = true)
@Getter
@SuperBuilder
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ImageFeatures extends ElementFeatures {

    private static final double HAMMING_DISTANCE_THRESHOLD = 4; // defines the similarity of the hash of images

    int dataSize;
    int height;
    int width;
    int renderingIntent;
    int componentNum;
    int bitsPerComponent;
    boolean imageMask;
    boolean softMask;
    boolean masked;
    boolean transparent;
    String hashOfImage;


    /**
     * Two images match if the generic element features match, all compared image properties are equal
     * and the image hashes differ in at most {@code HAMMING_DISTANCE_THRESHOLD} positions.
     *
     * @param elementFeatures the element to compare with
     * @return {@code true} if the given element is an image that matches this one
     */
    @Override
    public boolean matches(ElementFeatures elementFeatures) {

        if (!(elementFeatures instanceof ImageFeatures imageFeatures)) {
            return false;
        }
        // NOTE(review): 'masked' is the only image property that is not compared here - confirm this is intended.
        return super.matches(elementFeatures)
               && this.dataSize == imageFeatures.getDataSize()
               && this.height == imageFeatures.getHeight()
               && this.width == imageFeatures.getWidth()
               && this.renderingIntent == imageFeatures.getRenderingIntent()
               && this.componentNum == imageFeatures.getComponentNum()
               && this.bitsPerComponent == imageFeatures.getBitsPerComponent()
               && this.imageMask == imageFeatures.isImageMask()
               && this.softMask == imageFeatures.isSoftMask()
               && this.transparent == imageFeatures.isTransparent()
               && calculateHammingDistance(imageFeatures.getHashOfImage()) <= HAMMING_DISTANCE_THRESHOLD;
    }


    /**
     * Weaker comparison than {@link #matches(ElementFeatures)}: only the generic similarity check of the
     * super class and the image hash are taken into account.
     *
     * @param elementFeatures the element to compare with
     * @return {@code true} if the given element is a similar image; {@code false} for non-image elements
     */
    @Override
    public boolean similar(ElementFeatures elementFeatures) {

        // The type check replaces a blind cast, so a non-image element yields false instead of a ClassCastException.
        return super.similar(elementFeatures)
               && elementFeatures instanceof ImageFeatures imageFeatures
               && calculateHammingDistance(imageFeatures.getHashOfImage()) <= HAMMING_DISTANCE_THRESHOLD;
    }


    /**
     * Counts the positions at which this image's hash and the given hash differ, character by character.
     * The shorter hash is padded with '0'. If either hash is missing, the distance is 0, i.e. the hash
     * does not discriminate.
     */
    private int calculateHammingDistance(String hash2) {

        String hash1 = this.hashOfImage;
        if (hash1 == null || hash2 == null) {
            return 0;
        }

        int distance = 0;
        int maxLength = Math.max(hash1.length(), hash2.length());
        for (int i = 0; i < maxLength; i++) {
            char char1 = i < hash1.length() ? hash1.charAt(i) : '0';
            char char2 = i < hash2.length() ? hash2.charAt(i) : '0';
            if (char1 != char2) {
                distance++;
            }
        }
        return distance;
    }

}

View File

@ -0,0 +1,51 @@
package com.iqser.red.pdftronlogic.commons.features;
import java.awt.Color;
import java.awt.Shape;
import java.awt.geom.GeneralPath;
import lombok.AccessLevel;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
/**
 * Features of a path element: its clipping and painting flags, its colors and the path geometry.
 */
@Getter
@SuperBuilder
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
@EqualsAndHashCode(callSuper = true)
public class PathFeatures extends ElementFeatures {

    boolean clippingPath;
    boolean clipWindingFill;
    boolean stroked;
    boolean filled;
    boolean windingFill;
    Color strokeColor;
    Color fillColor;
    GeneralPath linePath;


    /**
     * A path matches another path if the generic element features match and all clipping and painting
     * flags are equal. Colors and geometry are not part of this comparison.
     */
    @Override
    public boolean matches(ElementFeatures element) {

        if (!(element instanceof PathFeatures other)) {
            return false;
        }
        if (!super.matches(element)) {
            return false;
        }

        boolean sameClipping = clippingPath == other.isClippingPath() && clipWindingFill == other.isClipWindingFill();
        boolean samePainting = stroked == other.isStroked() && filled == other.isFilled() && windingFill == other.isWindingFill();
        return sameClipping && samePainting;
    }


    /**
     * @return the path geometry, which serves as the overlap shape of this element
     */
    @Override
    public Shape getOverlapShape() {

        return linePath;
    }

}

View File

@ -0,0 +1,60 @@
package com.iqser.red.pdftronlogic.commons.features;
import static com.iqser.red.pdftronlogic.commons.ComparisonUtils.almostEqual;
import java.util.ArrayList;
import java.util.List;
import lombok.AccessLevel;
import lombok.Builder;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
/**
 * Features of a text element, including the individual glyphs it consists of.
 */
@EqualsAndHashCode(callSuper = true)
@Getter
@SuperBuilder
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
@SuppressWarnings("PMD")
public class TextFeatures extends ElementFeatures {

    String text;
    int font;
    double fontsize;

    @Builder.Default
    List<GlyphInfo> glyphs = new ArrayList<>();


    /**
     * A text matches another text if the generic element features match, the text and font are equal
     * and the font sizes are almost equal.
     */
    @Override
    public boolean matches(ElementFeatures element) {

        if (!(element instanceof TextFeatures textFeaturesElement)) {
            return false;
        }
        return super.matches(textFeaturesElement)//
               // null-safe: the builder does not enforce that a text is set
               && java.util.Objects.equals(text, textFeaturesElement.getText()) //
               && font == textFeaturesElement.getFont()//
               && almostEqual(fontsize, textFeaturesElement.getFontsize());
    }


    /**
     * A text is overlapped if the super class considers it overlapped or, when glyphs are present,
     * if every glyph reports being overlapped.
     * <p>
     * NOTE(review): {@code allMatch} stops at the first glyph that is not overlapped, so later glyphs are
     * not tested against (and cannot be marked by) this element - confirm this is intended.
     *
     * @param overlappingElement the element that potentially covers this text
     * @return {@code true} if this text counts as overlapped
     */
    @Override
    public boolean testOverlapped(ElementFeatures overlappingElement) {

        if (super.testOverlapped(overlappingElement)) {
            return true;
        }
        if (glyphs.isEmpty()) {
            // allMatch on an empty stream would be true, so without glyphs only the super result counts
            return false;
        }
        return glyphs.stream()
                .allMatch(glyph -> glyph.testOverlapped(overlappingElement));
    }


    /**
     * Destroys all glyphs of this text.
     */
    @Override
    public void destroy() {

        glyphs.forEach(GlyphInfo::destroy);
    }

}

View File

@ -0,0 +1,43 @@
package com.iqser.red.pdftronlogic.commons.lookup;
import java.util.Optional;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
/**
 * Visitor that remembers the first visited element the query features match.
 * All elements visited after the first hit are ignored.
 */
@RequiredArgsConstructor
public class AnyMatchVisitor implements ElementFeatureVisitor {

    private final ElementFeatures queryFeatures;
    @Getter
    private ElementFeatures match;


    /**
     * @return the first matching element, or empty if none has been found so far
     */
    public Optional<ElementFeatures> getAnyMatch() {

        return Optional.ofNullable(match);
    }


    @Override
    public void visitItem(ElementFeatures features) {

        if (!hasAnyMatch() && queryFeatures.matches(features)) {
            match = features;
        }
    }


    private boolean hasAnyMatch() {

        return match != null;
    }

}

View File

@ -0,0 +1,135 @@
package com.iqser.red.pdftronlogic.commons.lookup;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Optional;
import java.util.function.Consumer;
import java.util.function.Predicate;
import com.iqser.red.pdftronlogic.commons.Converter;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.Rect;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
/**
 * Plain list-backed collection of {@link ElementFeatures} with a few query helpers.
 * Closing the lookup destroys all contained elements.
 */
@Getter
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ElementFeatureLookup implements AutoCloseable {

    /*
    The visitor based design is a leftover of an attempt to use the quadtree implementation by locationtech,
    which is queried with rectangles by default. It always lost a few elements in edge cases, making it unusable,
    and it did not speed up the algorithm all that much either.
     */

    List<ElementFeatures> allElements = new ArrayList<>();


    public void add(ElementFeatures elementFeatures) {

        allElements.add(elementFeatures);
    }


    public void remove(ElementFeatures elementFeatures) {

        allElements.remove(elementFeatures);
    }


    /**
     * @return the first contained element the given features match, if any
     */
    public Optional<ElementFeatures> anyMatch(ElementFeatures elementFeatures) {

        AnyMatchVisitor matchVisitor = new AnyMatchVisitor(elementFeatures);
        forEach(candidate -> matchVisitor.visitItem(candidate));
        return matchVisitor.getAnyMatch();
    }


    /**
     * @return all contained elements accepted by the predicate, in insertion order
     */
    @SneakyThrows
    public List<ElementFeatures> query(Predicate<ElementFeatures> predicate) {

        PredicateItemVisitor collector = new PredicateItemVisitor(predicate);
        forEach(candidate -> collector.visitItem(candidate));
        return collector.getMatchingFeatures();
    }


    /**
     * @return all contained elements whose bounding box intersects the given rectangle
     */
    @SneakyThrows
    public List<ElementFeatures> findIntersecting(Rect bbox) {

        Rectangle2D searchArea = Converter.toRectangle2D(bbox);
        return query(candidate -> candidate.getBoundingBox().intersects(searchArea));
    }


    /**
     * Collects the contained elements that are overlapped by the given element. The overlap test is only
     * run for candidates whose bounding box intersects the bounding box of the overlapping element.
     *
     * @param overlappingElement the element that potentially covers others
     * @param textOnly           if set, only text elements are considered
     * @return the overlapped elements
     */
    public List<ElementFeatures> findOverlapped(ElementFeatures overlappingElement, boolean textOnly) {

        List<ElementFeatures> overlappedElementFeatures = new LinkedList<>();
        for (ElementFeatures candidate : allElements) {
            boolean skippedByType = textOnly && candidate.getElementType() != Element.e_text;
            if (skippedByType) {
                continue;
            }
            if (candidate.getBoundingBox().intersects(overlappingElement.getBoundingBox()) && candidate.testOverlapped(overlappingElement)) {
                overlappedElementFeatures.add(candidate);
            }
        }
        return overlappedElementFeatures;
    }


    public void forEach(Consumer<ElementFeatures> consumer) {

        allElements.forEach(consumer);
    }


    public void clear() {

        allElements.clear();
    }


    public boolean isEmpty() {

        return allElements.isEmpty();
    }


    public int size() {

        return allElements.size();
    }


    public void addAll(List<ElementFeatures> currentOverlappedElements) {

        allElements.addAll(currentOverlappedElements);
    }


    public void removeAll(List<ElementFeatures> currentOverlappedElements) {

        allElements.removeAll(currentOverlappedElements);
    }


    /**
     * Destroys every contained element. The elements remain in the lookup.
     */
    @Override
    public void close() {

        for (ElementFeatures element : allElements) {
            element.destroy();
        }
    }

}

View File

@ -0,0 +1,9 @@
package com.iqser.red.pdftronlogic.commons.lookup;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
/**
 * Callback that is handed {@link ElementFeatures} one element at a time.
 */
public interface ElementFeatureVisitor {
/**
 * Processes a single element.
 *
 * @param features the element currently being visited
 */
void visitItem(ElementFeatures features);
}

View File

@ -0,0 +1,29 @@
package com.iqser.red.pdftronlogic.commons.lookup;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Predicate;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
/**
 * Visitor that collects every visited element accepted by a predicate, in visiting order.
 */
@RequiredArgsConstructor
public class PredicateItemVisitor implements ElementFeatureVisitor {

    private final Predicate<ElementFeatures> predicate;
    @Getter
    private final List<ElementFeatures> matchingFeatures = new ArrayList<>();


    @Override
    public void visitItem(ElementFeatures features) {

        boolean accepted = predicate.test(features);
        if (!accepted) {
            return;
        }
        matchingFeatures.add(features);
    }

}

View File

@ -0,0 +1,91 @@
package com.iqser.red.pdftronlogic.commons;
import static com.iqser.red.pdftronlogic.commons.VisualizationUtils.drawPathData;
import static com.iqser.red.pdftronlogic.commons.VisualizationUtils.drawRect;
import java.awt.Color;
import java.io.FileOutputStream;
import java.nio.file.Path;
import java.util.List;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import com.iqser.red.pdftronlogic.commons.features.GlyphInfo;
import com.iqser.red.pdftronlogic.commons.features.ImageFeatures;
import com.iqser.red.pdftronlogic.commons.features.TextFeatures;
import com.pdftron.pdf.ElementBuilder;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.PDFNet;
import com.pdftron.pdf.Page;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
/**
 * Manual visualization helper: extracts all glyphs and images of a document and draws their outlines and
 * bounding boxes as an overlay into a copy of the document written to /tmp.
 */
@Disabled // makes no sense to run in pipeline
public class GlyphExtractionTest {

    @BeforeAll
    static void init() {

        PDFNet.initialize(PDFTronConfig.license);
    }


    @Test
    @SneakyThrows
    public void testGlyphExtraction() {

        String file = "files/everyCharIsImage.pdf";
        ClassLoader resources = Thread.currentThread().getContextClassLoader();

        List<List<TextFeatures>> textsPerPage;
        try (var in = resources.getResourceAsStream(file)) {
            textsPerPage = PdfTextExtraction.extractAllGlyphsFromDocument(in, true);
        }

        List<List<ImageFeatures>> imagesPerPage;
        try (var in = resources.getResourceAsStream(file)) {
            imagesPerPage = PdfImageExtraction.extractImages(in);
        }

        try (var in = resources.getResourceAsStream(file);//
                var out = new FileOutputStream(Path.of("/tmp/").resolve(Path.of(file).getFileName() + "_GLYPHS.pdf").toFile());
                PDFDoc pdfDoc = new PDFDoc(in)) {

            // the extraction results are indexed from 0, PDF pages are numbered from 1
            for (int pageIndex = 0; pageIndex < pdfDoc.getPageCount(); pageIndex++) {
                Page page = pdfDoc.getPage(pageIndex + 1);
                List<TextFeatures> textFeaturesOnPage = textsPerPage.get(pageIndex);
                List<ImageFeatures> imageFeaturesOnPage = imagesPerPage.get(pageIndex);

                try (ElementWriter writer = new ElementWriter(); ElementBuilder builder = new ElementBuilder()) {
                    writer.begin(page, ElementWriter.e_overlay, false);

                    for (ImageFeatures imageFeatures : imageFeaturesOnPage) {
                        // images covering at least 80% of the page area are not drawn
                        boolean coversMostOfPage = imageFeatures.getBoundingBox().getHeight() * imageFeatures.getBoundingBox().getWidth()
                                                   >= page.getPageHeight() * page.getPageWidth() * 0.8;
                        if (!coversMostOfPage) {
                            drawRect(imageFeatures.getBoundingBox(), builder, writer, Color.CYAN, true);
                        }
                    }

                    for (TextFeatures textFeatures : textFeaturesOnPage) {
                        drawRect(textFeatures.getBoundingBox(), builder, writer, Color.BLUE);
                        for (GlyphInfo glyph : textFeatures.getGlyphs()) {
                            if (glyph.getPathData().isEmpty() || glyph.getBoundingBox().isEmpty()) {
                                continue;
                            }
                            // outline in black, shrunken box in red, full bounding box in magenta
                            drawPathData(glyph.getPathData().get(), builder, writer, Color.BLACK);
                            drawRect(ComparisonUtils.shrinkRectangle(glyph.getBoundingBox().get()), builder, writer, Color.RED);
                            drawRect(glyph.getBoundingBox().get(), builder, writer, Color.MAGENTA);
                        }
                    }

                    writer.end();
                }
            }
            pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
        }
    }

}

View File

@ -3,9 +3,13 @@ package com.iqser.red.pdftronlogic.commons;
import static com.iqser.red.pdftronlogic.commons.PdfTextExtraction.extractAllTextFromDocument;
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import lombok.extern.slf4j.Slf4j;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@ -13,15 +17,23 @@ import com.pdftron.pdf.PDFNet;
import lombok.SneakyThrows;
@SuppressWarnings("PMD")
@Slf4j
class InvisibleElementRemovalServiceTest {
InvisibleElementRemovalService invisibleElementRemovalService;
@BeforeEach
void createService() {
@BeforeAll
static void init() {
PDFNet.initialize(PDFTronConfig.license);
}
@BeforeEach
void createServices() {
invisibleElementRemovalService = new InvisibleElementRemovalService();
}
@ -53,6 +65,19 @@ class InvisibleElementRemovalServiceTest {
}
// Smoke test: runs the invisible element removal on files/Page32.pdf and writes the result into memory.
// There are no assertions on the output; the test passes as long as no exception is thrown.
@Test
@SneakyThrows
void page32DoesNotCrash() {
String fileName = "files/Page32.pdf";
try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new ByteArrayOutputStream()) {
invisibleElementRemovalService.removeInvisibleElements(in, out, false);
}
}
@Test
@SneakyThrows
void removeInvisibleTextClippedByFormObjects() {
@ -74,9 +99,9 @@ class InvisibleElementRemovalServiceTest {
try (var in = new FileInputStream(deltaResultFileName)) {
String[] text = extractAllTextFromDocument(in).split("\n");
assertThat(text).contains(":Bold S-enantiomer form if two codes are supplied",
"Red : Only observed in laboratory soil studies",
"Green : Observed in both laboratory soil studies and lysimeter leachate",
"Blue : Only observed in lysimeter leachate");
"Red : Only observed in laboratory soil studies",
"Green : Observed in both laboratory soil studies and lysimeter leachate",
"Blue : Only observed in lysimeter leachate");
}
}
@ -99,9 +124,20 @@ class InvisibleElementRemovalServiceTest {
try (var in = new FileInputStream(deltaResultFileName)) {
String result = PdfTextExtraction.extractAllTextFromDocument(in);
assertThat(result).contains("#1 Dark",
"#13 Yellow",
"Ut wisi enim ad minim veniam, quis nostrud exerci tation ullamcorper suscipit lobortis nisl ut aliquip\n" + "ex ea commodo consequat. Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie\n" + "consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim\n" + "qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi.");
assertThat(result).doesNotContain("Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut\n" + "labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et\n" + "ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem\n" + "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et\n" + "dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea\n" + "rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum\n" + "dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore\n" + "magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet\n" + "clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. ");
"#13 Yellow",
"Ut wisi enim ad minim veniam, quis nostrud exerci tation ullamcorper suscipit lobortis nisl ut aliquip\n"
+ "ex ea commodo consequat. Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie\n"
+ "consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim\n"
+ "qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi.");
assertThat(result).doesNotContain("Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut\n"
+ "labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et\n"
+ "ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem\n"
+ "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et\n"
+ "dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea\n"
+ "rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum\n"
+ "dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore\n"
+ "magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet\n"
+ "clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. ");
}
}
@ -175,4 +211,26 @@ class InvisibleElementRemovalServiceTest {
}
}
// Processes files/everyCharIsImage.pdf twice with removeInvisibleElementsButKeepOcrText, once with the last
// argument false (regular result) and once with true (presumably the delta view - confirm against the service).
// Only the regular result is asserted: no text may be extractable from it. The delta file is written for
// manual inspection and is not checked.
@Test
@SneakyThrows
void removeInvisibleElementsWhereEachCharIsImage() {
String fileName = "files/everyCharIsImage.pdf";
String resultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL");
String deltaResultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL_DELTA");
try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(resultFileName)) {
invisibleElementRemovalService.removeInvisibleElementsButKeepOcrText(in, out, false);
}
try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(deltaResultFileName)) {
invisibleElementRemovalService.removeInvisibleElementsButKeepOcrText(in, out, true);
}
try (var in = new FileInputStream(resultFileName)) {
String result = PdfTextExtraction.extractAllTextFromDocument(in);
assertThat(result).isBlank();
}
}
}

View File

@ -0,0 +1,314 @@
package com.iqser.red.pdftronlogic.commons;
import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.CompletableFuture;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import com.iqser.red.pdftronlogic.commons.rendering.GhostScriptService;
import com.iqser.red.pdftronlogic.commons.rendering.ImageFile;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.PDFNet;
import com.pdftron.sdf.SDFDoc;
import com.sun.jna.NativeLibrary;
import com.sun.jna.Pointer;
import lombok.SneakyThrows;
import net.sourceforge.lept4j.Box;
import net.sourceforge.lept4j.Boxa;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.util.LeptUtils;
@Disabled // requires leptonica and ghostscript to be installed locally
public class VisualEqualityTest {
/*
We render both the origin and the processed file and then compute a diff per page. We then threshold and invert the diff.
This means a visual difference of luminance greater than the threshold value shows up as a black pixel.
We then use Heckbert's Seed Fill Algorithm to detect connected black regions by recursively flooding connected pixels.
We then filter these error regions, keeping only those whose area is at least the area threshold.
We do this since single-pixel errors are frequent but cannot be perceived by a human; they are most likely float inaccuracies.
If there are any error regions left, we count the test as failed.
*/
// Minimum area (width * height of the bounding box) a connected difference region needs to be counted as an error.
private static final int ERROR_REGION_AREA_THRESHOLD = 10;
// Threshold handed to Leptonica when binarizing the absolute difference of two rendered pages.
public static final int LUMINANCE_DIFFERENCE_THRESHOLD = 170;
// Root folder for all files written by this test; each tested file gets its own sub folder.
private static final Path TEST_OUTPUT_DIR = Path.of("/tmp/AAA_EQUALITY_TEST/");
// Developer machine specific location of the leptonica library, used as jna.library.path in setup().
private static final String LEPTONICA_DIR = "/home/kschuettler/software/leptonica/vcpkg/installed/x64-linux-dynamic/lib/";
GhostScriptService ghostScriptService = new GhostScriptService();
InvisibleElementRemovalService invisibleElementRemovalService = new InvisibleElementRemovalService();
// Initializes PDFNet with the configured license, points JNA at LEPTONICA_DIR and checks that a
// library named "leptonica" can be obtained from there.
@BeforeEach
public void setup() {
PDFNet.initialize(PDFTronConfig.license);
System.setProperty("jna.library.path", LEPTONICA_DIR);
try (NativeLibrary leptonicaLib = NativeLibrary.getInstance("leptonica")) {
assert leptonicaLib != null;
}
}
/**
 * Runs the visual equality check for a single, developer machine specific file and fails if any
 * visual difference has been recorded.
 */
@Test
@SneakyThrows
public void assertVisualEqualityOfProcessedFile() {

    Context report = new Context(TEST_OUTPUT_DIR, new HashMap<>());
    Path pdf = Path.of("/home/kschuettler/Dokumente/TestFiles/full_syn_dm_testfiles/3977411_Final_Thiamethoxam_SL_MNLY.pdf");

    runForFile(pdf, report);

    System.out.println(report);
    assert report.failedFiles().isEmpty();
}
/**
 * Runs the visual equality check for every PDF below a developer machine specific folder, printing the
 * accumulated report after each file, and fails if any visual difference has been recorded.
 */
@Test
@SneakyThrows
public void assertVisualEqualityOfProcessedFolder() {

    Path folder = Path.of("/home/kschuettler/Dokumente/TestFiles/full_syn_dm_testfiles");
    Context context = new Context(TEST_OUTPUT_DIR, new HashMap<>());

    // Files.walk keeps directory handles open until the stream is closed, hence try-with-resources
    try (var files = Files.walk(folder)) {
        files.filter(Files::isRegularFile)
                .filter(file -> file.toString().endsWith(".pdf"))
                .forEach(file -> {
                    runForFile(file, context);
                    System.out.println(context);
                });
    }

    assert context.failedFiles.isEmpty();
}
/**
 * Writes three documents into the output folder of the given file - a re-saved copy of the origin, the
 * processed file and the delta file - and then compares the re-saved origin with the processed file.
 */
@SneakyThrows
private void runForFile(Path originFile, Context context) {

    File source = originFile.toFile();
    System.out.println(source);

    Path fileFolder = context.getFileFolder(originFile);
    Files.createDirectories(fileFolder);
    Path savedOriginFile = fileFolder.resolve("origin.pdf");
    Path processedFile = fileFolder.resolve("processed.pdf");
    Path deltaFile = fileFolder.resolve("delta.pdf");

    // re-save the origin with the same save mode, so both documents went through PDFDoc.save
    try (var in = new FileInputStream(source); PDFDoc pdfDoc = new PDFDoc(in); var out = new FileOutputStream(savedOriginFile.toFile())) {
        pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
    }

    try (var in = new FileInputStream(source); var out = new FileOutputStream(processedFile.toFile())) {
        invisibleElementRemovalService.removeInvisibleElements(in, out, false);
    }

    try (var in = new FileInputStream(source); var out = new FileOutputStream(deltaFile.toFile())) {
        invisibleElementRemovalService.removeInvisibleElements(in, out, true);
    }
    System.out.println("removed invisible elements");

    assertVisualEquality(savedOriginFile, processedFile, context);
    System.out.println("finished visual equality check");
}
/**
 * Renders both documents page by page, compares the rendered pages and records every problem
 * (differing page count, missing page, pages with error regions) in the context.
 * <p>
 * NOTE(review): all folders and the failed-file entry are derived from {@code originFile}. The only caller
 * passes the re-saved "origin.pdf", so the image and error folders resolve to the same location for every
 * tested document and the report prints "origin.pdf" for all of them - confirm whether that is intended.
 *
 * @param originFile    the reference document
 * @param processedFile the document that has to look the same as the reference
 * @param context       receives the error messages
 */
@SneakyThrows
private void assertVisualEquality(Path originFile, Path processedFile, Context context) {

    Path imageDir = context.getFileFolder(originFile).resolve("images");

    Path originDir = imageDir.resolve("origin");
    Files.createDirectories(originDir);
    CompletableFuture<List<ImageFile>> originalPagesFuture = ghostScriptService.renderDocument(originFile, originDir);

    Path processedDir = imageDir.resolve("processed");
    Files.createDirectories(processedDir);
    CompletableFuture<List<ImageFile>> processedPagesFuture = ghostScriptService.renderDocument(processedFile, processedDir);

    // remove error images of previous runs; the stream returned by Files.walk has to be closed
    try (var staleErrorFiles = Files.walk(context.getErrorFolder(originFile))) {
        staleErrorFiles.map(Path::toFile)
                .filter(File::isFile)
                .forEach(File::delete);
    }

    List<ImageFile> originalPages = originalPagesFuture.join();
    List<ImageFile> processedPages = processedPagesFuture.join();

    if (originalPages.size() != processedPages.size()) {
        context.getFailedFile(originFile).addErrorMessage("Differing page counts!");
        return;
    }

    for (ImageFile originalPage : originalPages) {

        Optional<ImageFile> samePage = processedPages.stream()
                .filter(p -> p.pageNumber() == originalPage.pageNumber())
                .findFirst();
        if (samePage.isEmpty()) {
            context.getFailedFile(originFile).addErrorMessage("Page " + originalPage.pageNumber() + " missing!");
            return;
        }
        ImageFile processedPage = samePage.get();

        Pix originalPagePix;
        Pix processedPagePix;
        synchronized (VisualEqualityTest.class) {
            originalPagePix = originalPage.readPix();
            processedPagePix = processedPage.readPix();
        }

        try {
            String errorFile = context.getErrorFolder(originFile).resolve(originalPage.pageNumber() + ".tiff").toFile().toString();
            List<Rectangle2D> errorRegions = detectErrors(originalPagePix, processedPagePix, errorFile);
            if (!errorRegions.isEmpty()) {
                context.getFailedFile(originFile).addErrorMessage("Page " + originalPage.pageNumber() + " has " + errorRegions.size() + " errors!");
            }
        } finally {
            // the Pix objects wrap native memory, release them even if the comparison fails
            synchronized (VisualEqualityTest.class) {
                LeptUtils.disposePix(originalPagePix);
                LeptUtils.disposePix(processedPagePix);
            }
        }
    }
}
/**
 * Compares two rendered pages and returns the bounding boxes of all connected difference regions whose
 * area is at least {@code ERROR_REGION_AREA_THRESHOLD}. If there are any, the thresholded and inverted
 * difference image is written to {@code errorFile}.
 *
 * @param pix1      first rendered page
 * @param pix2      second rendered page
 * @param errorFile path the difference image is written to in case of errors
 * @return the error regions, empty if the pages are considered visually equal
 */
public synchronized List<Rectangle2D> detectErrors(Pix pix1, Pix pix2, String errorFile) {

    Pix difference = Leptonica1.pixAbsDifference(pix1, pix2);
    Pix binarized = Leptonica1.pixThresholdToBinary(difference, LUMINANCE_DIFFERENCE_THRESHOLD);
    Leptonica1.pixInvert(binarized, binarized);

    // checks for connected black regions and outputs them as a list of boxes, a boxa
    Boxa components = Leptonica1.pixConnComp(binarized, null, 8);

    List<Rectangle2D> errorRegions = readRectsFromBoxa(components).stream()
            .filter(region -> region.getWidth() * region.getHeight() >= ERROR_REGION_AREA_THRESHOLD)
            .toList();

    boolean hasErrors = !errorRegions.isEmpty();
    if (hasErrors) {
        String pageLabel = Path.of(errorFile).getFileName().toString().replace(".tiff", "");
        System.out.println("Found error(s) on page " + pageLabel + ", writing error file.");
        // Drawing only the filtered regions into the image was attempted before (pushRectsIntoBoxa and
        // Leptonica1.pixDrawBoxa), but building the boxa did not work and pixDrawBoxa never returned.
        Leptonica1.pixWrite(errorFile, binarized, 4);
    }

    LeptUtils.dispose(components);
    LeptUtils.disposePix(difference);
    LeptUtils.disposePix(binarized);

    return errorRegions;
}
/**
 * Copies the {@code boxa.n} boxes referenced by the given boxa into rectangles and disposes each box
 * after it has been read.
 */
private static List<Rectangle2D> readRectsFromBoxa(Boxa boxa) {

    Pointer[] boxPointers = boxa.box.getPointer().getPointerArray(0, boxa.n);
    List<Rectangle2D> rectangles = new ArrayList<>(boxa.n);

    int index = 0;
    while (index < boxa.n) {
        Box current = new Box(boxPointers[index]);
        Rectangle2D rectangle = new Rectangle2D.Double(current.x, current.y, current.w, current.h);
        rectangles.add(rectangle);
        LeptUtils.dispose(current);
        index++;
    }
    return rectangles;
}
/*
private static Boxa pushRectsIntoBoxa(List<Rectangle2D> rects) {
Boxa boxa = new Boxa();
boxa.n = rects.size();
boxa.nalloc = rects.size();
Memory boxMemory = new Memory((long) Native.POINTER_SIZE * rects.size());
for (int i = 0; i < rects.size(); i++) {
Rectangle2D rect = rects.get(i);
Box box = new Box((int) rect.getX(), (int) rect.getY(), (int) rect.getWidth(), (int) rect.getHeight(), 0);
boxMemory.setPointer((long) i * Native.POINTER_SIZE, box.getPointer());
}
boxa.box = new PointerByReference(boxMemory);
return boxa;
}
*/
/**
 * Collects the failures of a test run and resolves the output folders of the tested files.
 *
 * @param outFolder   root folder below which every file gets its own folder
 * @param failedFiles failures per file, filled on demand
 */
private record Context(Path outFolder, Map<Path, FailedFile> failedFiles) {

    /**
     * @return the failure entry of the given file, created on first access
     */
    public FailedFile getFailedFile(Path path) {

        return failedFiles.computeIfAbsent(path, ignored -> FailedFile.init());
    }


    /**
     * @return the folder inside the output folder that is named like the given file
     */
    public Path getFileFolder(Path file) {

        Path fileName = file.getFileName();
        return outFolder.resolve(fileName);
    }


    /**
     * @return one line per failed file, or a success message if nothing failed
     */
    @Override
    public String toString() {

        if (failedFiles.isEmpty()) {
            return "All files visually equal!";
        }

        StringBuilder report = new StringBuilder();
        for (Map.Entry<Path, FailedFile> entry : failedFiles.entrySet()) {
            report.append(entry.getKey().getFileName().toFile());
            report.append(": ");
            report.append(entry.getValue().toString());
            report.append("\n");
        }
        return report.toString();
    }


    /**
     * @return the "error" folder inside the file's folder; it is created if it does not exist
     */
    @SneakyThrows
    public Path getErrorFolder(Path originFile) {

        Path errorDir = getFileFolder(originFile).resolve("error");
        Files.createDirectories(errorDir);
        return errorDir;
    }

}
/**
 * Failures recorded for a single file.
 *
 * @param failedPages recorded locations per rendered page
 * @param errors      error messages in the order they were added
 */
private record FailedFile(Map<ImageFile, FailedPage> failedPages, List<String> errors) {

    /**
     * @return a new entry without any failures
     */
    public static FailedFile init() {

        Map<ImageFile, FailedPage> noPages = new HashMap<>();
        List<String> noErrors = new LinkedList<>();
        return new FailedFile(noPages, noErrors);
    }


    public void addErrorMessage(String s) {

        errors.add(s);
    }


    /**
     * Adds a location to the given page, creating the page entry on first use.
     */
    public void addFailedPage(ImageFile imageFile, double location) {

        FailedPage page = failedPages.computeIfAbsent(imageFile, ignored -> new FailedPage(new LinkedList<>()));
        page.locations().add(location);
    }


    /**
     * @return all error messages, separated by ", "
     */
    @Override
    public String toString() {

        return String.join(", ", errors);
    }

}
// Locations recorded for a single failed page; entries are added through FailedFile.addFailedPage.
private record FailedPage(List<Double> locations) {
}
}

View File

@ -13,6 +13,7 @@ import com.pdftron.pdf.PDFNet;
import lombok.SneakyThrows;
@Disabled
@SuppressWarnings("PMD")
class WatermarkRemovalServiceTest {
@SneakyThrows
@ -23,7 +24,7 @@ class WatermarkRemovalServiceTest {
WatermarkRemovalService watermarkRemovalService = new WatermarkRemovalService();
String filename = "files/1.A16148F - Toxicidade oral aguda (1).pdf";
String filename = "files/syngenta/CustomerFiles/1.A16148F - Toxicidade oral aguda (1).pdf";
String tmpFilename = createTmpFileName(filename, "WATERMARK_REMOVAL");
try (var in = this.getClass().getClassLoader().getResourceAsStream(filename); var out = new FileOutputStream(tmpFilename)) {

View File

@ -0,0 +1,145 @@
package com.iqser.red.pdftronlogic.commons.rendering;
import java.io.BufferedReader;
import java.io.File;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Map;
import java.util.function.Consumer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@RequiredArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class GhostScriptOutputHandler extends Thread {
static Pattern pageFinishedPattern = Pattern.compile("Page (\\d+)");
// If the stdError or stdOut buffer of a thread is not being emptied it might lock the process in case of errors, so we need to empty both streams to prevent a deadlock.
// Since both need to read simultaneously we need to implement the readers as separate threads.
final InputStream is;
final String processName;
final Type type;
final Map<Integer, ImageFile> pagesToProcess;
final Consumer<ImageFile> outputHandler;
final Consumer<String> errorHandler;
int currentPageNumber;
public static GhostScriptOutputHandler stdError(InputStream is, Consumer<String> errorHandler) {
return new GhostScriptOutputHandler(is, "GS", Type.ERROR, null, null, errorHandler);
}
public static GhostScriptOutputHandler stdOut(InputStream is, Map<Integer, ImageFile> pagesToProcess, Consumer<ImageFile> imageFileOutput, Consumer<String> errorHandler) {
return new GhostScriptOutputHandler(is, "GS", Type.STD_OUT, pagesToProcess, imageFileOutput, errorHandler);
}
@SneakyThrows
public void run() {
try (InputStreamReader isr = new InputStreamReader(is); BufferedReader br = new BufferedReader(isr)) {
String line;
while (true) {
line = br.readLine();
if (line == null) {
break;
}
if (type.equals(Type.ERROR)) {
log.error("{}_{}>{}", processName, type.name(), line);
} else {
log.debug("{}_{}>{}", processName, type.name(), line);
addProcessedImageToQueue(line);
}
}
}
is.close();
if (type.equals(Type.STD_OUT)) {
queueFinishedPage(currentPageNumber);
if (!pagesToProcess.isEmpty()) {
errorHandler.accept(String.format("Ghostscript finished for batch, but pages %s remain unprocessed.", formatPagesToProcess()));
}
}
}
private String formatPagesToProcess() {
if (pagesToProcess.isEmpty()) {
return "-";
}
if (pagesToProcess.size() == 1) {
return pagesToProcess.keySet()
.iterator().next().toString();
}
return pagesToProcess.keySet()
.stream()
.mapToInt(Integer::intValue)
.min()
.orElse(0) + "-" + pagesToProcess.keySet()
.stream()
.mapToInt(Integer::intValue)
.max()
.orElse(0);
}
/**
 * Inspects one output line for a page number.
 * <p>
 * Ghostscript announces the page it is starting to work on, so a page only counts as finished
 * when the next announcement arrives. The previously announced page is queued at that moment.
 */
private void addProcessedImageToQueue(String line) {
    Matcher announcement = pageFinishedPattern.matcher(line);
    if (!announcement.find()) {
        return;
    }
    int announcedPage = Integer.parseInt(announcement.group(1));
    // 0 marks "no page announced yet", in which case there is nothing to queue.
    boolean previousPagePending = currentPageNumber != 0;
    if (previousPagePending) {
        queueFinishedPage(currentPageNumber);
    }
    currentPageNumber = announcedPage;
}
/**
 * Removes the given page from the pending pages and hands its image file to the output handler.
 * <p>
 * An unknown page number is reported to the error handler and nothing is handed over.
 * A missing image file on disk is reported as well, but the image file is still handed over.
 *
 * @param pageNumber number of the page that is considered finished
 */
private void queueFinishedPage(int pageNumber) {
    var imageFile = this.pagesToProcess.remove(pageNumber);
    if (imageFile == null) {
        errorHandler.accept(String.format("Page number %d does not exist in this thread. It only has pagenumbers %s", pageNumber, pagesToProcess.keySet()));
        // There is no image to deliver; passing null on would only make the consumer fail.
        return;
    }
    if (!new File(imageFile.absoluteFilePath()).exists()) {
        errorHandler.accept(String.format("Rendered page with number %d does not exist!", pageNumber));
    }
    outputHandler.accept(imageFile);
}
// Identifies which output channel a handler reads.
public enum Type {
// Lines are logged at error level.
ERROR,
// Lines are logged at debug level and scanned for page progress.
STD_OUT
}
}

View File

@ -0,0 +1,192 @@
package com.iqser.red.pdftronlogic.commons.rendering;
import java.io.InputStream;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CompletableFuture;
import java.util.function.Consumer;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.PDFDoc;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
/**
 * Renders the pages of a PDF document to image files by running the {@code gs} command line tool.
 * <p>
 * Pages are split into batches because a single {@code gs} call only accepts a limited number of page numbers.
 * The output of each process is consumed by two {@link GhostScriptOutputHandler}s, which report finished pages
 * and problems through the supplied callbacks.
 */
@Slf4j
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
@SuppressWarnings("PMD") // can't figure out how to safely close the stdOut and stdError streams in line 142/144
public class GhostScriptService {

    // Upper bound of pages handed to a single gs process per batch.
    int BATCH_SIZE = 256;
    // File extension of the rendered images.
    String FORMAT = ".tiff";
    // Value passed to gs via -sDEVICE.
    String DEVICE = "tiffgray";
    // Value passed to gs via -r.
    int DPI = 100;
    // Number of gs processes started per batch.
    int PROCESS_COUNT = 1;


    /**
     * Renders every page of the document into {@code imageDir}.
     * <p>
     * The gs processes are run and awaited inside this call; the returned future only waits for the
     * remaining page notifications and then yields the collected image files.
     *
     * @param documentFile PDF document to render
     * @param imageDir     directory the image files are written to
     * @return future yielding the rendered image files
     */
    @SneakyThrows
    public CompletableFuture<List<ImageFile>> renderDocument(Path documentFile, Path imageDir) {

        int pageCount = getPageCount(documentFile);
        List<Integer> allPages = IntStream.range(1, pageCount + 1).boxed()
                .toList();
        ImageSupervisorImpl supervisor = new ImageSupervisorImpl(allPages);
        renderPagesBatched(allPages, documentFile.toFile().toString(), imageDir, supervisor, supervisor.successHandler(), supervisor.errorHandler());
        return CompletableFuture.supplyAsync(() -> awaitImageFiles(supervisor));
    }


    /**
     * Blocks until the supervisor has seen every page and returns the collected image files.
     */
    @SneakyThrows
    private static List<ImageFile> awaitImageFiles(ImageSupervisorImpl supervisor) {

        supervisor.awaitAll();
        return supervisor.getRenderedImages();
    }


    /**
     * Opens the document only to read its page count and closes it again.
     */
    private static int getPageCount(Path documentFile) throws PDFNetException {

        try (PDFDoc doc = new PDFDoc(documentFile.toFile().toString())) {
            return doc.getPageCount();
        }
    }


    /**
     * Renders the given pages batch by batch and waits for the gs processes of each batch to exit.
     * <p>
     * Before each batch the supervisor is asked whether errors occurred so far.
     * If the calling thread is interrupted while waiting, the processes of the current batch are destroyed
     * before the interruption is propagated.
     *
     * @param pagesToProcess       page numbers to render
     * @param documentAbsolutePath path of the PDF document handed to gs
     * @param tmpImageDir          directory the image files are written to
     * @param supervisor           consulted before every batch via {@link ImageSupervisor#requireNoErrors()}
     * @param successHandler       receives the image file of each finished page
     * @param errorHandler         receives messages about rendering problems
     */
    @SneakyThrows
    public void renderPagesBatched(List<Integer> pagesToProcess,
                                   String documentAbsolutePath,
                                   Path tmpImageDir,
                                   ImageSupervisor supervisor,
                                   Consumer<ImageFile> successHandler,
                                   Consumer<String> errorHandler) {

        List<List<ProcessInfo>> processInfoBatches = buildSubListForEachProcess(pagesToProcess,
                                                                                PROCESS_COUNT,
                                                                                BATCH_SIZE
                                                                                * PROCESS_COUNT); // GS has a limit on how many pageIndices per call are possible, so we limit it to 256 pages per process
        for (int batchIdx = 0; batchIdx < processInfoBatches.size(); batchIdx++) {

            supervisor.requireNoErrors();

            List<ProcessInfo> processInfos = processInfoBatches.get(batchIdx);

            log.info("Batch {}: Running {} gs processes with ({}) pages each",
                     batchIdx,
                     processInfos.size(),
                     processInfos.stream()
                             .map(info -> info.pageNumbers().size())
                             .map(String::valueOf)
                             .collect(Collectors.joining(", ")));

            int finalBatchIdx = batchIdx;
            List<Process> processes = processInfos.stream()
                    .parallel()
                    .map(info -> buildCmdArgs(info.processIdx(), finalBatchIdx, info.pageNumbers(), tmpImageDir, documentAbsolutePath))
                    .peek(s -> log.debug(String.join(" ", s.cmdArgs())))
                    .map(processInfo -> executeProcess(processInfo, successHandler, errorHandler))
                    .toList();

            List<Integer> processExitCodes = new LinkedList<>();
            try {
                for (Process process : processes) {
                    processExitCodes.add(process.waitFor());
                }
            } catch (InterruptedException e) {
                // Do not leave gs processes running in the background when rendering is abandoned.
                processes.forEach(Process::destroy);
                throw e;
            }
            log.info("Batch {}: Ghostscript processes finished with exit codes {}", batchIdx, processExitCodes);
        }
    }


    /**
     * Distributes the sorted page numbers over batches and, within each batch, over processes.
     *
     * @return one list per batch, each holding one {@link ProcessInfo} per process
     */
    private List<List<ProcessInfo>> buildSubListForEachProcess(List<Integer> stitchedPageNumbers, int processCount, int batchSize) {

        // GhostScript command line can only handle so many page numbers at once, so we split it into batches
        int batchCount = (int) Math.ceil((double) stitchedPageNumbers.size() / batchSize);

        log.info("Splitting {} page renderings across {} process(es) in {} batch(es) with size {}", stitchedPageNumbers.size(), processCount, batchCount, batchSize);

        List<List<ProcessInfo>> processInfoBatches = new ArrayList<>(batchCount);
        List<List<List<Integer>>> batchedBalancedSublist = ListSplittingUtils.buildBatchedBalancedSublist(stitchedPageNumbers.stream()
                                                                                                                  .sorted()
                                                                                                                  .toList(), processCount, batchCount);

        for (var batch : batchedBalancedSublist) {
            List<ProcessInfo> processInfos = new ArrayList<>(processCount);
            for (int threadIdx = 0; threadIdx < batch.size(); threadIdx++) {
                List<Integer> balancedPageNumbersSubList = batch.get(threadIdx);
                processInfos.add(new ProcessInfo(threadIdx, balancedPageNumbersSubList));
            }
            processInfoBatches.add(processInfos);
        }
        return processInfoBatches;
    }


    /**
     * Builds the gs command line for one process of one batch together with the image files it is expected to write.
     * <p>
     * The expected file of the i-th page in the list is the output pattern filled with {@code i + 1}.
     */
    @SneakyThrows
    private ProcessCmdsAndRenderedImageFiles buildCmdArgs(Integer processIdx,
                                                          Integer batchIdx,
                                                          List<Integer> stitchedImagePageIndices,
                                                          Path outputDir,
                                                          String documentAbsolutePath) {

        String imagePathFormat = outputDir.resolve("output_" + processIdx + "_" + batchIdx + ".%04d" + FORMAT).toFile().toString();

        Map<Integer, ImageFile> fullPageImages = new HashMap<>();
        for (int i = 0; i < stitchedImagePageIndices.size(); i++) {
            Integer pageNumber = stitchedImagePageIndices.get(i);
            fullPageImages.put(pageNumber, new ImageFile(pageNumber, String.format(imagePathFormat, i + 1)));
        }

        String[] cmdArgs = buildCmdArgs(stitchedImagePageIndices, documentAbsolutePath, imagePathFormat);

        return new ProcessCmdsAndRenderedImageFiles(cmdArgs, fullPageImages);
    }


    /**
     * Assembles the argument array for gs; the page numbers are passed as one comma separated list.
     */
    private String[] buildCmdArgs(List<Integer> pageNumbers, String documentAbsolutePath, String imagePathFormat) {

        String pageList = pageNumbers.stream()
                .map(String::valueOf)
                .collect(Collectors.joining(","));

        return new String[]{"gs", "-dNOPAUSE", "-sDEVICE=" + DEVICE, "-r" + DPI, "-sPageList=" + pageList, "-sOutputFile=" + imagePathFormat, documentAbsolutePath, "-c", "quit"};
    }


    /**
     * Starts the gs process and one output handler for each of its two output streams.
     *
     * @return the started process; the caller is responsible for waiting on it
     */
    @SneakyThrows
    private Process executeProcess(ProcessCmdsAndRenderedImageFiles processInfo, Consumer<ImageFile> successHandler, Consumer<String> errorHandler) {

        Process p = Runtime.getRuntime().exec(processInfo.cmdArgs());
        InputStream stdOut = p.getInputStream();
        GhostScriptOutputHandler stdOutLogger = GhostScriptOutputHandler.stdOut(stdOut, processInfo.renderedPageImageFiles(), successHandler, errorHandler);
        InputStream stdError = p.getErrorStream();
        GhostScriptOutputHandler stdErrorLogger = GhostScriptOutputHandler.stdError(stdError, errorHandler);

        stdOutLogger.start();
        stdErrorLogger.start();
        return p;
    }


    // Command line of one gs call plus the image files it is expected to produce, keyed by page number.
    private record ProcessCmdsAndRenderedImageFiles(String[] cmdArgs, Map<Integer, ImageFile> renderedPageImageFiles) {

    }

    // Pages assigned to one process within a batch.
    private record ProcessInfo(Integer processIdx, List<Integer> pageNumbers) {

    }

}

View File

@ -0,0 +1,13 @@
package com.iqser.red.pdftronlogic.commons.rendering;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
/**
 * A rendered page image on disk together with the number of the page it belongs to.
 *
 * @param pageNumber       number of the page
 * @param absoluteFilePath location of the image file
 */
public record ImageFile(int pageNumber, String absoluteFilePath) {

    /**
     * Reads the image file through Leptonica.
     * NOTE(review): presumably the result is backed by native memory that the caller has to release,
     * and may be null when the file cannot be read; confirm against the lept4j documentation.
     *
     * @return whatever {@code Leptonica1.pixRead} yields for the file path
     */
    public Pix readPix() {

        final String path = absoluteFilePath();
        return Leptonica1.pixRead(path);
    }

}

View File

@ -0,0 +1,7 @@
package com.iqser.red.pdftronlogic.commons.rendering;
/**
 * Lets the rendering loop check whether it should continue.
 * GhostScriptService calls it before starting each batch.
 */
public interface ImageSupervisor {
/**
 * Verifies that no errors have been recorded so far.
 * ImageSupervisorImpl throws an IllegalStateException listing the errors and returns normally otherwise.
 */
void requireNoErrors();
}

View File

@ -0,0 +1,114 @@
package com.iqser.red.pdftronlogic.commons.rendering;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CountDownLatch;
import java.util.function.Consumer;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@RequiredArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class ImageSupervisorImpl implements ImageSupervisor {
final Map<Integer, CountDownLatch> pageLatches;
final Map<Integer, ImageFile> images;
final List<String> errors;
final ImageFile[] finishedPages;
public ImageSupervisorImpl(List<Integer> pageNumbers) {
this.pageLatches = Collections.synchronizedMap(new HashMap<>());
this.images = Collections.synchronizedMap(new HashMap<>());
this.errors = Collections.synchronizedList(new ArrayList<>());
this.finishedPages = new ImageFile[pageNumbers.size()];
for (Integer pageNumber : pageNumbers) {
pageLatches.put(pageNumber, new CountDownLatch(1));
}
}
public List<ImageFile> getRenderedImages() {
return new ArrayList<>(images.values());
}
public void markPageFinished(ImageFile imageFile) {
log.debug("finished page: {}", imageFile.pageNumber());
getPageLatch(imageFile.pageNumber()).countDown();
images.put(imageFile.pageNumber(), imageFile);
finishedPages[imageFile.pageNumber() - 1] = imageFile;
}
public Consumer<ImageFile> successHandler() {
return this::markPageFinished;
}
public Consumer<String> errorHandler() {
return this::markError;
}
private CountDownLatch getPageLatch(Integer pageNumber) {
if (pageNumber == null || !pageLatches.containsKey(pageNumber)) {
throw new IllegalArgumentException("awaiting non-existent page " + pageNumber);
}
return pageLatches.get(pageNumber);
}
public ImageFile awaitProcessedPage(Integer pageNumber) throws InterruptedException {
if (hasErrors()) {
return null;
}
getPageLatch(pageNumber).await();
return images.get(pageNumber);
}
private boolean hasErrors() {
return errors.isEmpty();
}
public void markError(String errorMessage) {
this.errors.add(errorMessage);
}
public void awaitAll() throws InterruptedException {
for (CountDownLatch countDownLatch : pageLatches.values()) {
countDownLatch.await();
}
}
public void requireNoErrors() {
// GS will log
if (this.errors.isEmpty()) {
return;
}
throw new IllegalStateException("Error(s) occurred during image processing: " + String.join("\n", errors));
}
}

View File

@ -0,0 +1,106 @@
package com.iqser.red.pdftronlogic.commons.rendering;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.stream.IntStream;
import lombok.experimental.UtilityClass;
/**
 * Helpers for splitting lists into evenly sized, contiguous parts and for formatting number intervals.
 */
@UtilityClass
public class ListSplittingUtils {

    /**
     * Splits the numbers {@code 1..totalNumberOfEntries} into {@code threadCount} contiguous parts
     * whose sizes differ by at most one.
     */
    public static List<List<Integer>> buildBalancedContinuousSublist(Integer totalNumberOfEntries, int threadCount) {

        return buildBalancedSublist(IntStream.rangeClosed(1, totalNumberOfEntries).boxed()
                                            .toList(), threadCount);
    }


    /**
     * Splits the entries into {@code threadCount} contiguous parts whose sizes differ by at most one.
     * The parts are views created with {@link List#subList(int, int)}, not copies.
     *
     * @throws IllegalArgumentException if {@code threadCount} is negative, or zero while entries exist
     */
    public static <T> List<List<T>> buildBalancedSublist(List<T> entries, int threadCount) {

        List<Integer> balancedEntryCounts = buildBalancedEntryCounts(entries.size(), threadCount);
        List<List<T>> balancedSublist = new ArrayList<>(threadCount);
        int startIdx = 0;
        for (int partSize : balancedEntryCounts) {
            balancedSublist.add(entries.subList(startIdx, startIdx + partSize));
            startIdx += partSize;
        }
        return balancedSublist;
    }


    /**
     * Splits the entries first across threads and then each thread's share across batches.
     * <p>
     * Despite its name, {@code batchSize} is the NUMBER of batches each thread's share is split into,
     * not the number of entries per batch.
     *
     * @return lists nested as batches -> threads -> entries
     * @throws IllegalArgumentException if a count is negative, or zero while entries have to be distributed
     */
    public static <T> List<List<List<T>>> buildBatchedBalancedSublist(List<T> entries, int threadCount, int batchSize) {

        // batches -> threads -> entries
        List<List<List<T>>> batchedBalancedSubList = new LinkedList<>();
        List<List<List<T>>> threadsWithBatches = buildBalancedSublist(entries, threadCount).stream()
                .map(list -> buildBalancedSublist(list, batchSize))
                .toList();
        // swap first two dimensions
        for (int batchIdx = 0; batchIdx < batchSize; batchIdx++) {
            List<List<T>> threadEntriesPerBatch = new ArrayList<>(threadCount);
            for (int threadIdx = 0; threadIdx < threadCount; threadIdx++) {
                threadEntriesPerBatch.add(threadsWithBatches.get(threadIdx).get(batchIdx));
            }
            batchedBalancedSubList.add(threadEntriesPerBatch);
        }
        return batchedBalancedSubList;
    }


    /**
     * Computes how many of {@code totalNumberOfEntries} each of {@code threadCount} threads receives
     * when entries are dealt out one by one in turn: every thread gets the integer quotient and the
     * first {@code total % threadCount} threads get one more.
     *
     * @return a mutable list with one count per thread; empty when {@code threadCount} is zero and nothing is to be distributed
     * @throws IllegalArgumentException if {@code threadCount} is negative, or zero while entries exist
     */
    public static List<Integer> buildBalancedEntryCounts(int totalNumberOfEntries, int threadCount) {

        if (threadCount < 0) {
            throw new IllegalArgumentException("threadCount must not be negative: " + threadCount);
        }
        // A non-positive total distributes nothing, every thread gets zero.
        int total = Math.max(totalNumberOfEntries, 0);
        if (threadCount == 0 && total > 0) {
            throw new IllegalArgumentException("cannot distribute " + total + " entries across 0 threads");
        }

        List<Integer> numberOfPagesPerThread = new ArrayList<>(threadCount);
        if (threadCount == 0) {
            return numberOfPagesPerThread;
        }
        int evenShare = total / threadCount;
        int leftOver = total % threadCount;
        for (int threadIdx = 0; threadIdx < threadCount; threadIdx++) {
            numberOfPagesPerThread.add(threadIdx < leftOver ? evenShare + 1 : evenShare);
        }
        return numberOfPagesPerThread;
    }


    /**
     * Collapses runs of consecutive numbers into "start-end" and keeps isolated numbers as they are.
     *
     * @param sortedList numbers in ascending order
     * @return the formatted intervals in input order; empty for an empty input
     */
    public static List<String> formatIntervals(List<Integer> sortedList) {

        List<String> intervals = new ArrayList<>();

        if (sortedList.isEmpty()) {
            return intervals;
        }

        int start = sortedList.get(0);
        int end = start;

        for (int i = 1; i < sortedList.size(); i++) {
            int current = sortedList.get(i);
            if (current == end + 1) {
                end = current;
            } else {
                intervals.add(formatInterval(start, end));
                start = current;
                end = start;
            }
        }
        intervals.add(formatInterval(start, end));

        return intervals;
    }


    private static String formatInterval(int start, int end) {

        if (start == end) {
            return String.valueOf(start);
        } else {
            return start + "-" + end;
        }
    }

}

Binary file not shown.

@ -0,0 +1 @@
Subproject commit 9dc6c2337dea32e63aef53271dba0692537c6605

File diff suppressed because one or more lines are too long

@ -0,0 +1 @@
Subproject commit 21fefb64bf27ca2b3329a6c69d90a27450b17930