Compare commits

..

No commits in common. "master" and "RED-4875_1" have entirely different histories.

63 changed files with 799 additions and 1248059 deletions

8
.gitignore vendored
View File

@ -27,11 +27,3 @@
**/classpath-data.json
**/dependencies-and-licenses-overview.txt
git.tag
gradle.properties
gradlew
gradlew.bat
gradle/
**/.gradle
**/build

View File

@ -1,25 +0,0 @@
variables:
# SONAR_PROJECT_KEY: 'com.iqser.red.commons:pdftron-logic-commons'
GIT_SUBMODULE_STRATEGY: recursive
GIT_SUBMODULE_FORCE_HTTPS: 'true'
include:
- project: 'gitlab/gitlab'
ref: 'main'
file: 'ci-templates/gradle_java.yml'
deploy:
stage: deploy
tags:
- dind
script:
- echo "Building with gradle version ${BUILDVERSION}"
- gradle -Pversion=${BUILDVERSION} publish
- echo "BUILDVERSION=$BUILDVERSION" >> version.env
artifacts:
reports:
dotenv: version.env
rules:
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
- if: $CI_COMMIT_BRANCH =~ /^release/
- if: $CI_COMMIT_TAG

6
.gitmodules vendored
View File

@ -1,6 +0,0 @@
[submodule "src/test/resources/files/syngenta"]
path = src/test/resources/files/syngenta
url = https://gitlab.knecon.com/fforesight/documents/syngenta.git
[submodule "src/test/resources/files/basf"]
path = src/test/resources/files/basf
url = https://gitlab.knecon.com/fforesight/documents/basf.git

36
bamboo-specs/pom.xml Normal file
View File

@ -0,0 +1,36 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.atlassian.bamboo</groupId>
<artifactId>bamboo-specs-parent</artifactId>
<version>8.1.3</version>
<relativePath/>
</parent>
<artifactId>bamboo-specs</artifactId>
<version>1.0.0-SNAPSHOT</version>
<packaging>jar</packaging>
<dependencies>
<dependency>
<groupId>com.atlassian.bamboo</groupId>
<artifactId>bamboo-specs-api</artifactId>
</dependency>
<dependency>
<groupId>com.atlassian.bamboo</groupId>
<artifactId>bamboo-specs</artifactId>
</dependency>
<!-- Test dependencies -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
<!-- run 'mvn test' to perform offline validation of the plan -->
<!-- run 'mvn -Ppublish-specs' to upload the plan to your Bamboo server -->
</project>

View File

@ -0,0 +1,127 @@
package buildjob;
import static com.atlassian.bamboo.specs.builders.task.TestParserTask.createJUnitParserTask;
import com.atlassian.bamboo.specs.api.BambooSpec;
import com.atlassian.bamboo.specs.api.builders.BambooKey;
import com.atlassian.bamboo.specs.api.builders.docker.DockerConfiguration;
import com.atlassian.bamboo.specs.api.builders.permission.PermissionType;
import com.atlassian.bamboo.specs.api.builders.permission.Permissions;
import com.atlassian.bamboo.specs.api.builders.permission.PlanPermissions;
import com.atlassian.bamboo.specs.api.builders.plan.Job;
import com.atlassian.bamboo.specs.api.builders.plan.Plan;
import com.atlassian.bamboo.specs.api.builders.plan.PlanIdentifier;
import com.atlassian.bamboo.specs.api.builders.plan.Stage;
import com.atlassian.bamboo.specs.api.builders.plan.branches.BranchCleanup;
import com.atlassian.bamboo.specs.api.builders.plan.branches.PlanBranchManagement;
import com.atlassian.bamboo.specs.api.builders.project.Project;
import com.atlassian.bamboo.specs.api.builders.Variable;
import com.atlassian.bamboo.specs.builders.task.CheckoutItem;
import com.atlassian.bamboo.specs.builders.task.InjectVariablesTask;
import com.atlassian.bamboo.specs.builders.task.ScriptTask;
import com.atlassian.bamboo.specs.builders.task.VcsCheckoutTask;
import com.atlassian.bamboo.specs.builders.task.VcsTagTask;
import com.atlassian.bamboo.specs.builders.trigger.BitbucketServerTrigger;
import com.atlassian.bamboo.specs.model.task.InjectVariablesScope;
import com.atlassian.bamboo.specs.util.BambooServer;
import com.atlassian.bamboo.specs.model.task.ScriptTaskProperties.Location;
/**
 * Plan configuration for Bamboo.
 * Learn more on: <a href="https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs">https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs</a>
 */
@BambooSpec
public class PlanSpec {

    private static final String SERVICE_NAME = "pdftron-logic-commons";

    // Plan key: the service name upper-cased with the dashes removed.
    // Locale.ROOT keeps the key independent of the JVM default locale (a Turkish default locale
    // would otherwise upper-case 'i' to a dotted capital I); replace() avoids a needless regex.
    private static final String SERVICE_KEY = SERVICE_NAME.toUpperCase(java.util.Locale.ROOT).replace("-", "");


    /**
     * Run main to publish plan on Bamboo
     */
    public static void main(final String[] args) throws Exception {
        //By default credentials are read from the '.credentials' file.
        BambooServer bambooServer = new BambooServer("http://localhost:8085");

        // One spec instance is enough for both the plan and its permissions.
        PlanSpec planSpec = new PlanSpec();

        Plan plan = planSpec.createPlan();
        bambooServer.publish(plan);

        PlanPermissions planPermission = planSpec.createPlanPermission(plan.getIdentifier());
        bambooServer.publish(planPermission);
    }


    /**
     * Builds the permission set that is published together with the plan.
     *
     * @param planIdentifier identifier of the plan the permissions are attached to
     * @return the permissions for the project/plan key pair of {@code planIdentifier}
     */
    private PlanPermissions createPlanPermission(PlanIdentifier planIdentifier) {

        Permissions permission = new Permissions()
                .userPermissions("atlbamboo", PermissionType.EDIT, PermissionType.VIEW, PermissionType.ADMIN, PermissionType.CLONE, PermissionType.BUILD)
                .groupPermissions("development", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
                .groupPermissions("devplant", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
                .loggedInUserPermissions(PermissionType.VIEW)
                .anonymousUserPermissionView();
        return new PlanPermissions(planIdentifier.getProjectKey(), planIdentifier.getPlanKey()).permissions(permission);
    }


    /**
     * @return the Bamboo project ("RED") the plan belongs to
     */
    private Project project() {

        return new Project()
                .name("RED")
                .key(new BambooKey("RED"));
    }


    /**
     * Creates the plan definition: a single stage with a single job that cleans the working
     * directory, checks out the default repository, runs the build and Sonar scripts, parses the
     * JUnit results, injects the variables from {@code git.tag} and tags the repository.
     *
     * @return the plan for {@code SERVICE_NAME}
     */
    public Plan createPlan() {

        return new Plan(
                project(),
                SERVICE_NAME, new BambooKey(SERVICE_KEY))
                .description("Plan created from (enter repository url of your plan)")
                .variables(new Variable("maven_add_param", ""))
                .stages(new Stage("Default Stage")
                        .jobs(new Job("Default Job",
                                new BambooKey("JOB1"))
                                .tasks(
                                        new ScriptTask()
                                                .description("Clean")
                                                .inlineBody("#!/bin/bash\n" +
                                                        "set -e\n" +
                                                        "rm -rf ./*"),
                                        new VcsCheckoutTask()
                                                .description("Checkout Default Repository")
                                                .checkoutItems(new CheckoutItem().defaultRepository()),
                                        new ScriptTask()
                                                .description("Build")
                                                .location(Location.FILE)
                                                .fileFromPath("bamboo-specs/src/main/resources/scripts/build-java.sh")
                                                .argument(SERVICE_NAME),
                                        createJUnitParserTask()
                                                .description("Resultparser")
                                                .resultDirectories("**/test-reports/*.xml, **/target/surefire-reports/*.xml, **/target/failsafe-reports/*.xml")
                                                .enabled(true),
                                        new ScriptTask()
                                                .description("Sonar")
                                                .location(Location.FILE)
                                                .fileFromPath("bamboo-specs/src/main/resources/scripts/sonar-java.sh")
                                                .argument(SERVICE_NAME),
                                        // git.tag is injected under the "g" namespace, hence ${bamboo.g.gitTag} below
                                        new InjectVariablesTask()
                                                .description("Inject git Tag")
                                                .path("git.tag")
                                                .namespace("g")
                                                .scope(InjectVariablesScope.LOCAL),
                                        new VcsTagTask()
                                                .description("${bamboo.g.gitTag}")
                                                .tagName("${bamboo.g.gitTag}")
                                                .defaultRepository())
                                .dockerConfiguration(
                                        new DockerConfiguration()
                                                .image("nexus.iqser.com:5001/infra/maven:3.8.4-openjdk-17-slim")
                                                .volume("/etc/maven/settings.xml", "/usr/share/maven/conf/settings.xml")
                                                .dockerRunArguments("--net=host")
                                                .volume("/var/run/docker.sock", "/var/run/docker.sock")
                                )
                        )
                )
                .linkedRepositories("RED / " + SERVICE_NAME)
                .triggers(new BitbucketServerTrigger())
                .planBranchManagement(new PlanBranchManagement()
                        .createForVcsBranch()
                        .delete(new BranchCleanup()
                                .whenInactiveInRepositoryAfterDays(14))
                        .notificationForCommitters());
    }

}

View File

@ -0,0 +1,52 @@
#!/bin/bash
# Bamboo build script.
# Usage: build-java.sh <service-name>
#
# Chooses the version to build from the branch name:
#   master    -> latest git tag matching the pom's SNAPSHOT version prefix, minor incremented
#   release*  -> latest git tag matching the release/<major>.<minor>.x branch, patch incremented
#   otherwise -> ${bamboo_version_tag} unless it is "dev"; "dev" runs a plain
#                "clean install" and exits without deploying
# In every case the chosen tag is written to git.tag as "gitTag=<value>".
# NOTE(review): relies on a `semver` CLI on the PATH (presumably node-semver) - confirm.
set -e

SERVICE_NAME=$1

if [[ "$bamboo_planRepository_branchName" == "master" ]]
then
    branchVersion=$(cat pom.xml | grep -Eo " <version>.*-SNAPSHOT</version>" | sed -s 's|<version>\(.*\)\..*\(-*.*\)</version>|\1|' | tr -d ' ')
    latestVersion=$( semver $(git tag -l "${branchVersion}.*" ) | tail -n1 )
    newVersion="$(semver $latestVersion -p -i minor)"
    echo "new release on master with version $newVersion"
elif [[ "$bamboo_planRepository_branchName" == release* ]]
then
    branchVersion=$(echo $bamboo_planRepository_branchName | sed -s 's|release\/\([0-9]\+\.[0-9]\+\)\.x|\1|')
    latestVersion=$( semver $(git tag -l "${branchVersion}.*" ) | tail -n1 )
    newVersion="$(semver $latestVersion -p -i patch)"
    echo "new release on $bamboo_planRepository_branchName with version $newVersion"
elif [[ "${bamboo_version_tag}" != "dev" ]]
then
    newVersion="${bamboo_version_tag}"
    echo "new special version build with $newVersion"
else
    # Dev build: build and test only, nothing is deployed.
    # The entropy source was previously "file:/dev/./urandomelse" (a stray "else" fused onto
    # the path), which does not exist.
    mvn -f ${bamboo_build_working_directory}/pom.xml \
        --no-transfer-progress \
        ${bamboo_maven_add_param} \
        clean install \
        -Djava.security.egd=file:/dev/./urandom
    echo "dev build with tag ${bamboo_planRepository_1_branch}_${bamboo_buildNumber}"
    echo "gitTag=${bamboo_planRepository_1_branch}_${bamboo_buildNumber}" > git.tag
    exit 0
fi

echo "gitTag=${newVersion}" > git.tag

# Stamp the computed version into the pom(s) before building the release.
mvn --no-transfer-progress \
    -f ${bamboo_build_working_directory}/pom.xml \
    ${bamboo_maven_add_param} \
    versions:set \
    -DnewVersion=${newVersion}

# Build and deploy the release artifacts to the release repository.
mvn -f ${bamboo_build_working_directory}/pom.xml \
    --no-transfer-progress \
    clean deploy \
    ${bamboo_maven_add_param} \
    -e \
    -DdeployAtEnd=true \
    -Dmaven.wagon.http.ssl.insecure=true \
    -Dmaven.wagon.http.ssl.allowall=true \
    -Dmaven.wagon.http.ssl.ignore.validity.dates=true \
    -DaltDeploymentRepository=iqser_release::default::https://nexus.iqser.com/repository/red-platform-releases

View File

@ -0,0 +1,37 @@
#!/bin/bash
# Bamboo analysis script.
# Usage: sonar-java.sh <service-name>
#
# Runs the OWASP dependency-check aggregate goal, then a SonarQube scan that picks up the
# dependency-check reports from target/. The scan is submitted as a branch analysis when
# bamboo_repository_pr_key is empty, otherwise as a pull-request analysis.
set -e
# Service name; used below to build the Sonar project key RED_<service-name>.
SERVICE_NAME=$1
echo "dependency-check:aggregate"
mvn --no-transfer-progress \
org.owasp:dependency-check-maven:aggregate \
-DknownExploitedEnabled=false
# -z: no pull-request key set, so analyse the branch itself.
if [[ -z "${bamboo_repository_pr_key}" ]]
then
echo "Sonar Scan for branch: ${bamboo_planRepository_1_branch}"
mvn --no-transfer-progress \
sonar:sonar \
-Dsonar.projectKey=RED_$SERVICE_NAME \
-Dsonar.host.url=https://sonarqube.iqser.com \
-Dsonar.login=${bamboo_sonarqube_api_token_secret} \
-Dsonar.branch.name=${bamboo_planRepository_1_branch} \
-Dsonar.dependencyCheck.jsonReportPath=target/dependency-check-report.json \
-Dsonar.dependencyCheck.xmlReportPath=target/dependency-check-report.xml \
-Dsonar.dependencyCheck.htmlReportPath=target/dependency-check-report.html
else
# Pull-request analysis: key, source branch and target branch come from the Bamboo PR variables.
echo "Sonar Scan for PR with key1: ${bamboo_repository_pr_key}"
mvn --no-transfer-progress \
sonar:sonar \
-Dsonar.projectKey=RED_$SERVICE_NAME \
-Dsonar.host.url=https://sonarqube.iqser.com \
-Dsonar.login=${bamboo_sonarqube_api_token_secret} \
-Dsonar.pullrequest.key=${bamboo_repository_pr_key} \
-Dsonar.pullrequest.branch=${bamboo_repository_pr_sourceBranch} \
-Dsonar.pullrequest.base=${bamboo_repository_pr_targetBranch} \
-Dsonar.dependencyCheck.jsonReportPath=target/dependency-check-report.json \
-Dsonar.dependencyCheck.xmlReportPath=target/dependency-check-report.xml \
-Dsonar.dependencyCheck.htmlReportPath=target/dependency-check-report.html
fi

View File

@ -0,0 +1,16 @@
package buildjob;
import com.atlassian.bamboo.specs.api.builders.plan.Plan;
import com.atlassian.bamboo.specs.api.exceptions.PropertiesValidationException;
import com.atlassian.bamboo.specs.api.util.EntityPropertiesBuilders;
import org.junit.Test;
/**
 * Offline check of the Bamboo plan produced by {@link PlanSpec}.
 */
public class PlanSpecTest {

    /**
     * Creates the plan and passes it to {@link EntityPropertiesBuilders#build}; the test fails
     * if that call throws.
     */
    @Test
    public void checkYourPlanOffline() throws PropertiesValidationException {

        final Plan planUnderTest = new PlanSpec().createPlan();

        EntityPropertiesBuilders.build(planUnderTest);
    }

}

View File

@ -1,107 +0,0 @@
plugins {
`java-library`
`maven-publish`
`kotlin-dsl`
pmd
checkstyle
jacoco
id("io.freefair.lombok") version "8.4"
id("org.sonarqube") version "4.0.0.2929"
}
repositories {
mavenLocal()
maven {
url = uri("https://pdftron.com/maven/release")
}
maven {
url = uri("https://nexus.knecon.com/repository/gindev/");
credentials {
username = providers.gradleProperty("mavenUser").getOrNull();
password = providers.gradleProperty("mavenPassword").getOrNull();
}
}
mavenCentral()
}
dependencies {
api("org.projectlombok:lombok:1.18.30")
api("com.google.guava:guava:33.0.0-jre")
api("com.pdftron:PDFNet:11.0.0")
testImplementation("net.sourceforge.lept4j:lept4j:1.19.1")
testImplementation("org.junit.jupiter:junit-jupiter:5.10.2")
testImplementation("org.assertj:assertj-core:3.24.2")
testImplementation("org.mockito:mockito-core:5.2.0")
testImplementation("org.apache.logging.log4j:log4j-slf4j2-impl:2.22.1")
compileOnly("org.slf4j:slf4j-api:2.0.11")
}
group = "com.iqser.red.commons"
description = "pdftron-logic-commons"
java.sourceCompatibility = JavaVersion.VERSION_17
java.targetCompatibility = JavaVersion.VERSION_17
publishing {
publications {
create<MavenPublication>("mavenJava") {
from(components["java"])
}
}
repositories {
maven {
url = uri("https://nexus.knecon.com/repository/red-platform-releases/")
credentials {
username = providers.gradleProperty("mavenUser").getOrNull();
password = providers.gradleProperty("mavenPassword").getOrNull();
}
}
}
}
tasks.withType<PublishToMavenRepository> {
onlyIf { publication.name == "mavenJava" }
}
pmd {
isConsoleOutput = true
}
tasks.pmdMain {
pmd.ruleSetFiles = files("${rootDir}/config/pmd/pmd.xml")
}
tasks.pmdTest {
pmd.ruleSetFiles = files("${rootDir}/config/pmd/test_pmd.xml")
}
tasks.named<Test>("test") {
useJUnitPlatform()
reports {
junitXml.outputLocation.set(layout.buildDirectory.dir("reports/junit"))
}
}
sonarqube {
properties {
property("sonar.login", providers.gradleProperty("sonarToken").getOrNull())
property("sonar.host.url", "https://sonarqube.knecon.com")
}
}
tasks.test {
finalizedBy(tasks.jacocoTestReport)
}
tasks.jacocoTestReport {
dependsOn(tasks.test)
reports {
xml.required.set(true)
csv.required.set(false)
html.outputLocation.set(layout.buildDirectory.dir("jacocoHtml"))
}
}
java {
withJavadocJar()
}

View File

@ -1,39 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE module PUBLIC "-//Puppy Crawl//DTD Check Configuration 1.3//EN"
"http://www.puppycrawl.com/dtds/configuration_1_3.dtd">
<module name="Checker">
<property
name="severity"
value="error"/>
<module name="TreeWalker">
<module name="SuppressWarningsHolder"/>
<module name="MissingDeprecated"/>
<module name="MissingOverride"/>
<module name="AnnotationLocation"/>
<module name="JavadocStyle"/>
<module name="NonEmptyAtclauseDescription"/>
<module name="IllegalImport"/>
<module name="RedundantImport"/>
<module name="RedundantModifier"/>
<module name="EmptyBlock"/>
<module name="DefaultComesLast"/>
<module name="EmptyStatement"/>
<module name="EqualsHashCode"/>
<module name="ExplicitInitialization"/>
<module name="IllegalInstantiation"/>
<module name="ModifiedControlVariable"/>
<module name="MultipleVariableDeclarations"/>
<module name="PackageDeclaration"/>
<module name="ParameterAssignment"/>
<module name="SimplifyBooleanExpression"/>
<module name="SimplifyBooleanReturn"/>
<module name="StringLiteralEquality"/>
<module name="OneStatementPerLine"/>
<module name="FinalClass"/>
<module name="ArrayTypeStyle"/>
<module name="UpperEll"/>
<module name="OuterTypeFilename"/>
</module>
<module name="FileTabCharacter"/>
<module name="SuppressWarningsFilter"/>
</module>

View File

@ -1,20 +0,0 @@
<?xml version="1.0"?>
<ruleset name="Custom ruleset"
xmlns="http://pmd.sourceforge.net/ruleset/2.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://pmd.sourceforge.net/ruleset/2.0.0 http://pmd.sourceforge.net/ruleset_2_0_0.xsd">
<description>
Knecon ruleset checks the code for bad stuff
</description>
<rule ref="category/java/errorprone.xml">
<exclude name="MissingSerialVersionUID"/>
<exclude name="AvoidLiteralsInIfCondition"/>
<exclude name="AvoidDuplicateLiterals"/>
<exclude name="NullAssignment"/>
<exclude name="AssignmentInOperand"/>
<exclude name="BeanMembersShouldSerialize"/>
</rule>
</ruleset>

View File

@ -1,22 +0,0 @@
<?xml version="1.0"?>
<ruleset name="Custom ruleset"
xmlns="http://pmd.sourceforge.net/ruleset/2.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://pmd.sourceforge.net/ruleset/2.0.0 http://pmd.sourceforge.net/ruleset_2_0_0.xsd">
<description>
Knecon test ruleset checks the code for bad stuff
</description>
<rule ref="category/java/errorprone.xml">
<exclude name="MissingSerialVersionUID"/>
<exclude name="AvoidLiteralsInIfCondition"/>
<exclude name="AvoidDuplicateLiterals"/>
<exclude name="NullAssignment"/>
<exclude name="AssignmentInOperand"/>
<exclude name="TestClassWithoutTestCases"/>
<exclude name="BeanMembersShouldSerialize"/>
</rule>
</ruleset>

View File

@ -1 +0,0 @@
version = 2.0-SNAPSHOT

105
pom.xml Normal file
View File

@ -0,0 +1,105 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>platform-dependency</artifactId>
<groupId>com.iqser.red</groupId>
<version>1.17.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<artifactId>pdftron-logic-commons</artifactId>
<groupId>com.iqser.red.commons</groupId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-configuration-processor</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.pdftron</groupId>
<artifactId>PDFNet</artifactId>
<version>9.4.0</version>
<scope>provided</scope>
</dependency>
<!-- Test Dependencies -->
</dependencies>
<build>
<plugins>
<plugin>
<!-- create a test jar for the api classes to be used by other modules -->
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<executions>
<execution>
<goals>
<goal>test-jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.jacoco</groupId>
<artifactId>jacoco-maven-plugin</artifactId>
<executions>
<execution>
<id>prepare-agent</id>
<goals>
<goal>prepare-agent</goal>
</goals>
</execution>
<execution>
<id>report</id>
<goals>
<goal>report</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
<repositories>
<repository>
<id>pdftron</id>
<name>PDFNet Maven</name>
<url>https://pdftron.com/maven/release</url>
</repository>
</repositories>
</project>

View File

@ -1,6 +0,0 @@
{
"$schema": "https://docs.renovatebot.com/renovate-schema.json",
"extends": [
"config:base"
]
}

View File

@ -1 +0,0 @@
rootProject.name = "pdftron-logic-commons"

View File

@ -1,5 +1,7 @@
package com.iqser.red.pdftronlogic.commons;
import static com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService.TOLERANCE;
import java.awt.geom.Area;
@ -33,13 +35,6 @@ public class ClippingPathStack {
}
@SneakyThrows
public void intersectClippingPath(Rectangle2D path) {
getCurrentClippingPath().intersect(new Area(path));
}
public boolean almostIntersects(double x, double y, double width, double height) {
// To address inconsistencies in the calculation of the bounding box we slightly increase the rectangle
// Height or width are zero for straight lines, even though they are being rendered. Therefore, height or width must be at minimum >0.
@ -61,16 +56,15 @@ public class ClippingPathStack {
public void enterNewGState() {
Area current = stack.peek();
Area cloned = (Area) current.clone();
Area cloned = new Area();
cloned.add(current);
stack.push(cloned);
}
public void leaveGState() {
// somehow this greatly helps memory management
var popped = stack.pop();
popped.reset();
stack.pop();
}
}
}

View File

@ -1,76 +0,0 @@
package com.iqser.red.pdftronlogic.commons;
import static com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService.TOLERANCE;
import java.awt.Shape;
import java.awt.geom.Rectangle2D;
import lombok.experimental.UtilityClass;
/**
 * Tolerance-based helpers for comparing rectangles and coordinates.
 * The single-argument overloads use {@code InvisibleElementRemovalService.TOLERANCE}.
 */
@UtilityClass
public class ComparisonUtils {

    /**
     * Shrinks {@code inner} by {@code TOLERANCE} on every side.
     *
     * @see #shrinkRectangle(Rectangle2D, double)
     */
    public Rectangle2D shrinkRectangle(Rectangle2D inner) {

        return shrinkRectangle(inner, TOLERANCE);
    }


    /**
     * Returns a copy of {@code rect} moved inwards by {@code tolerance} on every side.
     * A dimension that would end up at or below 0.1 is clamped to 0.1, and its origin is then
     * placed at the original coordinate plus half of that clamped size.
     *
     * @param rect      the rectangle to shrink
     * @param tolerance the inset applied to each side
     * @return a new rectangle; {@code rect} is not modified
     */
    public Rectangle2D shrinkRectangle(Rectangle2D rect, double tolerance) {

        final double minimumExtent = 1e-1;

        double insetWidth = rect.getWidth() - 2 * tolerance;
        double insetHeight = rect.getHeight() - 2 * tolerance;
        boolean widthCollapsed = insetWidth <= minimumExtent;
        boolean heightCollapsed = insetHeight <= minimumExtent;

        double x = widthCollapsed ? rect.getX() + minimumExtent / 2 : rect.getX() + tolerance;
        double y = heightCollapsed ? rect.getY() + minimumExtent / 2 : rect.getY() + tolerance;
        double width = widthCollapsed ? minimumExtent : insetWidth;
        double height = heightCollapsed ? minimumExtent : insetHeight;

        return new Rectangle2D.Double(x, y, width, height);
    }


    /**
     * Grows {@code inner} by {@code TOLERANCE} on every side.
     *
     * @see #padRectangle(Rectangle2D, double)
     */
    public Rectangle2D padRectangle(Rectangle2D inner) {

        return padRectangle(inner, TOLERANCE);
    }


    /**
     * Returns a copy of {@code rect} moved outwards by {@code tolerance} on every side.
     * A dimension that is still not positive after padding is replaced by 0.01.
     *
     * @param rect      the rectangle to pad
     * @param tolerance the margin added to each side
     * @return a new rectangle; {@code rect} is not modified
     */
    public Rectangle2D padRectangle(Rectangle2D rect, double tolerance) {

        final double fallbackExtent = 1e-2;

        double paddedWidth = rect.getWidth() + 2 * tolerance;
        double paddedHeight = rect.getHeight() + 2 * tolerance;

        return new Rectangle2D.Double(rect.getX() - tolerance,
                                      rect.getY() - tolerance,
                                      paddedWidth <= 0 ? fallbackExtent : paddedWidth,
                                      paddedHeight <= 0 ? fallbackExtent : paddedHeight);
    }


    /**
     * Checks whether {@code outer} contains {@code inner} after {@code inner} has been shrunk
     * by {@code TOLERANCE}.
     */
    public boolean almostContains(Shape outer, Rectangle2D inner) {

        return outer.contains(ComparisonUtils.shrinkRectangle(inner));
    }


    /**
     * @return {@code true} if {@code a} and {@code b} differ by less than {@code TOLERANCE}
     */
    public static boolean almostEqual(double a, double b) {

        double distance = Math.abs(a - b);
        return distance < TOLERANCE;
    }

}

View File

@ -1,147 +0,0 @@
package com.iqser.red.pdftronlogic.commons;
import java.awt.Color;
import java.awt.geom.AffineTransform;
import java.awt.geom.GeneralPath;
import java.awt.geom.PathIterator;
import java.awt.geom.Rectangle2D;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import com.google.common.primitives.Bytes;
import com.google.common.primitives.Doubles;
import com.pdftron.common.Matrix2D;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.ColorPt;
import com.pdftron.pdf.ColorSpace;
import com.pdftron.pdf.PathData;
import com.pdftron.pdf.Rect;
import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;
/**
 * Conversions between PDFTron types ({@link PathData}, {@link Matrix2D}, {@link Rect},
 * {@link ColorPt}) and their {@code java.awt} counterparts.
 */
@UtilityClass
public class Converter {

    /**
     * Replays the operators of {@code pathData} onto a new {@link GeneralPath}.
     * Points are consumed from {@link PathData#getPoints()} in order, as many per operator as
     * the operator needs; an {@code e_rect} is emitted as a closed four-segment outline.
     *
     * @param pathData the PDFTron path to convert
     * @return the equivalent AWT path
     * @throws IllegalArgumentException if an operator is none of the handled {@code PathData} operator types
     */
    public GeneralPath convertToGeneralPath(PathData pathData) {

        GeneralPath linePath = new GeneralPath();
        Iterator<Double> points = Doubles.asList(pathData.getPoints())
                .iterator();

        for (byte operator : pathData.getOperators()) {
            switch (operator) {
                case PathData.e_moveto -> linePath.moveTo(points.next(), points.next());
                case PathData.e_lineto -> linePath.lineTo(points.next(), points.next());
                case PathData.e_cubicto -> linePath.curveTo(points.next(), points.next(), points.next(), points.next(), points.next(), points.next());
                case PathData.e_conicto -> linePath.quadTo(points.next(), points.next(), points.next(), points.next());
                case PathData.e_closepath -> linePath.closePath();
                case PathData.e_rect -> {
                    double x = points.next();
                    double y = points.next();
                    double w = points.next();
                    double h = points.next();
                    linePath.moveTo(x, y);
                    linePath.lineTo(x + w, y);
                    linePath.lineTo(x + w, y + h);
                    linePath.lineTo(x, y + h);
                    linePath.closePath();
                }
                default -> throw new IllegalArgumentException("Invalid Operator Type " + operator);
            }
        }

        return linePath;
    }


    /**
     * Converts an AWT path into PDFTron {@link PathData}, mapping each path segment to the
     * corresponding {@code PathData} operator and copying its coordinates.
     *
     * @param linePath the AWT path to convert
     * @return a new {@code PathData} holding the collected operators and points
     */
    public PathData convertToPathData(GeneralPath linePath) {

        PathIterator pathIterator = linePath.getPathIterator(null);
        List<Byte> operators = new LinkedList<>();
        List<Double> points = new LinkedList<>();
        // The values are copied out after every currentSegment() call, so one buffer is enough.
        double[] currentPoints = new double[6];

        while (!pathIterator.isDone()) {
            int type = pathIterator.currentSegment(currentPoints);
            switch (type) {
                case PathIterator.SEG_MOVETO -> {
                    operators.add((byte) PathData.e_moveto);
                    addCoordinates(points, currentPoints, 2);
                }
                case PathIterator.SEG_LINETO -> {
                    operators.add((byte) PathData.e_lineto);
                    addCoordinates(points, currentPoints, 2);
                }
                case PathIterator.SEG_QUADTO -> {
                    operators.add((byte) PathData.e_conicto);
                    addCoordinates(points, currentPoints, 4);
                }
                case PathIterator.SEG_CUBICTO -> {
                    operators.add((byte) PathData.e_cubicto);
                    addCoordinates(points, currentPoints, 6);
                }
                case PathIterator.SEG_CLOSE -> operators.add((byte) PathData.e_closepath);
            }
            // Advance to the next segment; without this the loop never terminates for a non-empty path.
            pathIterator.next();
        }

        // Bulk conversion instead of index-based get(i) on a linked list.
        return new PathData(true, Bytes.toArray(operators), Doubles.toArray(points));
    }


    /** Appends the first {@code count} values of {@code coordinates} to {@code target}. */
    private void addCoordinates(List<Double> target, double[] coordinates, int count) {

        for (int i = 0; i < count; i++) {
            target.add(coordinates[i]);
        }
    }


    /**
     * Converts {@code pathData} to a {@link GeneralPath} and applies {@code ctm} to it.
     *
     * @param pathData the PDFTron path to convert
     * @param ctm      the matrix applied to the converted path
     * @return the transformed path
     * @throws PDFNetException if reading the matrix coefficients fails
     */
    public GeneralPath convertToGeneralPathAndTransformToInitialUserSpace(PathData pathData, Matrix2D ctm) throws PDFNetException {

        GeneralPath linePath = Converter.convertToGeneralPath(pathData);

        //transform path to initial user space
        var affineTransform = new AffineTransform(ctm.getA(), ctm.getB(), ctm.getC(), ctm.getD(), ctm.getH(), ctm.getV());
        linePath.transform(affineTransform);
        return linePath;
    }


    /**
     * Converts {@code colorPt} to RGB through {@code colorSpace} and wraps the three components
     * in an AWT {@link Color}. The intermediate RGB {@code ColorPt} is closed before returning.
     */
    @SneakyThrows
    public static Color convertColor(ColorSpace colorSpace, ColorPt colorPt) {

        try (ColorPt rgbColor = colorSpace.convert2RGB(colorPt)) {
            return new Color((float) rgbColor.get(0), (float) rgbColor.get(1), (float) rgbColor.get(2));
        }
    }


    /**
     * @return a {@link Rectangle2D} with the x1/y1 corner, width and height of {@code rect}
     */
    @SneakyThrows
    public static Rectangle2D toRectangle2D(Rect rect) {

        return new Rectangle2D.Double(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
    }


    /**
     * Converts a PDFTron matrix to an {@link AffineTransform}.
     *
     * @param textMatrix the matrix to convert, may be {@code null}
     * @return the equivalent transform, or {@code null} if {@code textMatrix} is {@code null}
     */
    @SneakyThrows
    public static AffineTransform toAffineTransform(Matrix2D textMatrix) {

        if (textMatrix == null) {
            return null;
        }
        // AffineTransform takes the x translation before the y translation, i.e. H before V -
        // the same order convertToGeneralPathAndTransformToInitialUserSpace uses.
        return new AffineTransform(textMatrix.getA(), textMatrix.getB(), textMatrix.getC(), textMatrix.getD(), textMatrix.getH(), textMatrix.getV());
    }

}

View File

@ -0,0 +1,170 @@
package com.iqser.red.pdftronlogic.commons;
import static com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService.TOLERANCE;
import java.awt.geom.Rectangle2D;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.Rect;
import lombok.AccessLevel;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
/**
 * Snapshot of the features of a PDFTron {@link Element} (type and bounding box, plus
 * type-specific features in the nested subclasses) that can later be compared against another
 * element with a tolerance of {@code InvisibleElementRemovalService.TOLERANCE}.
 */
@Getter
@SuperBuilder
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ElementFeatures {

    int elementType;
    Rectangle2D boundingBox;


    /**
     * Captures the features of {@code element}. Paths, texts and (inline) images are supported.
     *
     * @param element the element to read the features from
     * @return a features object of the subclass matching the element type
     * @throws RuntimeException if the element type is none of the supported ones
     */
    public static ElementFeatures extractFeatures(Element element) throws PDFNetException {

        return switch (element.getType()) {
            case Element.e_path -> pathFeaturesOf(element);
            case Element.e_text -> textFeaturesOf(element);
            case Element.e_image, Element.e_inline_image -> imageFeaturesOf(element);
            // This technically should never happen, it's a safetynet
            default -> throw new RuntimeException("Feature Extraction is not supported for PDFTron.Element with type: " + element.getType());
        };
    }


    /** Reads the path-specific features of {@code element}. */
    private static Path pathFeaturesOf(Element element) throws PDFNetException {

        return Path.builder()
                .elementType(element.getType())
                .boundingBox(toRectangle2D(element.getBBox()))
                .isClippingPath(element.isClippingPath())
                .isClipWindingFill(element.isClipWindingFill())
                .isStroked(element.isStroked())
                .isFilled(element.isFilled())
                .isWindingFill(element.isWindingFill())
                .build();
    }


    /** Reads the text-specific features of {@code element}. */
    private static Text textFeaturesOf(Element element) throws PDFNetException {

        return Text.builder()
                .elementType(element.getType())
                .boundingBox(toRectangle2D(element.getBBox()))
                .text(element.getTextString())
                .font(element.getGState().getFont().getType())
                .fontsize(element.getGState().getFontSize())
                .build();
    }


    /** Reads the image-specific features of {@code element}. */
    private static Image imageFeaturesOf(Element element) throws PDFNetException {

        return Image.builder()
                .elementType(element.getType())
                .boundingBox(toRectangle2D(element.getBBox()))
                .dataSize(element.getImageDataSize())
                .height(element.getImageHeight())
                .width(element.getImageWidth())
                .renderingIntent(element.getImageRenderingIntent())
                .componentNum(element.getComponentNum())
                .bitsPerComponent(element.getBitsPerComponent())
                .build();
    }


    private static Rectangle2D toRectangle2D(Rect rect) throws PDFNetException {

        return new Rectangle2D.Double(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
    }


    /**
     * Checks whether {@code element} has the recorded type and a bounding box that matches the
     * recorded one within the tolerance.
     *
     * @param element the element to compare against
     * @return {@code true} if type and bounding box match
     */
    public boolean almostMatches(Element element) throws PDFNetException {

        if (element.getType() != elementType) {
            return false;
        }
        if (element.getBBox() == null) {
            return false;
        }
        return rectsAlmostMatch(element.getBBox());
    }


    /**
     * @return {@code true} if {@code a} and {@code b} differ by less than {@code TOLERANCE}
     */
    protected boolean almostEqual(double a, double b) {

        double difference = Math.abs(a - b);
        return difference < TOLERANCE;
    }


    @SneakyThrows
    private boolean rectsAlmostMatch(Rect bBox) {
        // To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance
        if (!almostEqual(bBox.getX1(), boundingBox.getX())) {
            return false;
        }
        if (!almostEqual(bBox.getY1(), boundingBox.getY())) {
            return false;
        }
        if (!almostEqual(bBox.getWidth(), boundingBox.getWidth())) {
            return false;
        }
        return almostEqual(bBox.getHeight(), boundingBox.getHeight());
    }


    /** Features of a text element: the text string, the font type and the font size. */
    @EqualsAndHashCode(callSuper = true)
    @Getter
    @SuperBuilder
    @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
    private static class Text extends ElementFeatures {

        String text;
        int font;
        double fontsize;


        @Override
        public boolean almostMatches(Element element) throws PDFNetException {

            if (!super.almostMatches(element)) {
                return false;
            }
            if (!text.equals(element.getTextString())) {
                return false;
            }
            if (font != element.getGState().getFont().getType()) {
                return false;
            }
            return almostEqual(fontsize, element.getGState().getFontSize());
        }

    }

    /** Features of a path element: its clipping, stroke and fill flags. */
    @EqualsAndHashCode(callSuper = true)
    @Getter
    @SuperBuilder
    @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
    private static class Path extends ElementFeatures {

        boolean isClippingPath;
        boolean isClipWindingFill;
        boolean isStroked;
        boolean isFilled;
        boolean isWindingFill;


        @Override
        public boolean almostMatches(Element element) throws PDFNetException {

            if (!super.almostMatches(element)) {
                return false;
            }
            if (isClippingPath != element.isClippingPath()) {
                return false;
            }
            if (isClipWindingFill != element.isClipWindingFill()) {
                return false;
            }
            if (isStroked != element.isStroked()) {
                return false;
            }
            if (isFilled != element.isFilled()) {
                return false;
            }
            return isWindingFill == element.isWindingFill();
        }

    }

    /** Features of an image element: data size, dimensions, rendering intent and component layout. */
    @EqualsAndHashCode(callSuper = true)
    @Getter
    @SuperBuilder
    @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
    private static class Image extends ElementFeatures {

        int dataSize;
        int height;
        int width;
        int renderingIntent;
        int componentNum;
        int bitsPerComponent;


        @Override
        public boolean almostMatches(Element element) throws PDFNetException {

            if (!super.almostMatches(element)) {
                return false;
            }
            if (dataSize != element.getImageDataSize()) {
                return false;
            }
            if (height != element.getImageHeight()) {
                return false;
            }
            if (width != element.getImageWidth()) {
                return false;
            }
            if (renderingIntent != element.getImageRenderingIntent()) {
                return false;
            }
            if (componentNum != element.getComponentNum()) {
                return false;
            }
            return bitsPerComponent == element.getBitsPerComponent();
        }

    }

}

View File

@ -1,95 +0,0 @@
package com.iqser.red.pdftronlogic.commons;
import java.awt.Color;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import javax.imageio.ImageIO;
import com.pdftron.filters.FilterWriter;
import com.pdftron.filters.MemoryFilter;
import com.pdftron.pdf.Element;
import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;
/**
 * Computes a simple 64-character hash string for images embedded in a PDF.
 */
@UtilityClass
public class ImageHashFactory {

    /**
     * Exports the image behind {@code element}, decodes it and returns its hash.
     *
     * @param element an element whose XObject is an image
     * @return the hash as produced by {@link #getSimplePHash(BufferedImage)}
     * @throws IllegalArgumentException if the exported bytes cannot be decoded by {@link ImageIO}
     */
    @SneakyThrows
    public String calculate(Element element) {

        com.pdftron.pdf.Image image = new com.pdftron.pdf.Image(element.getXObject());
        byte[] imageBytes = getBytesOfImage(image);

        // ImageIO.read signals "no suitable reader" by returning null rather than throwing;
        // fail with a clear message instead of an NPE further down.
        BufferedImage decodedImage = ImageIO.read(new ByteArrayInputStream(imageBytes));
        if (decodedImage == null) {
            throw new IllegalArgumentException("Exported image data could not be decoded, no hash can be calculated");
        }
        return getSimplePHash(decodedImage);
    }


    /** Exports {@code inputImage} into an in-memory filter and returns the resulting buffer. */
    @SneakyThrows
    private byte[] getBytesOfImage(com.pdftron.pdf.Image inputImage) {
        // 0 because the memory filter determines the size
        try (var memFilter = new MemoryFilter(0, false);
             var filterWriter = new FilterWriter(memFilter)) {
            inputImage.export(filterWriter);
            filterWriter.flushAll();
            byte[] res = memFilter.getBuffer();
            memFilter.flushAll();
            return res;
        }
    }


    /**
     * Scales {@code image} to 8x8 pixels, converts it to grayscale and emits one character per
     * pixel, row by row: {@code "1"} if the pixel is brighter than the average, else {@code "0"}.
     *
     * @param image the image to hash
     * @return a string of 64 {@code '0'}/{@code '1'} characters
     */
    public String getSimplePHash(BufferedImage image) {
        // Resize the image to a fixed size (e.g., 8x8 pixels)
        int targetWidth = 8;
        int targetHeight = 8;
        BufferedImage resizedImage = new BufferedImage(targetWidth, targetHeight, BufferedImage.TYPE_INT_ARGB);
        var resizeGraphics = resizedImage.getGraphics();
        try {
            resizeGraphics.drawImage(image.getScaledInstance(targetWidth, targetHeight, java.awt.Image.SCALE_SMOOTH), 0, 0, targetWidth, targetHeight, null);
        } finally {
            // release the graphics context instead of leaving it to finalization
            resizeGraphics.dispose();
        }

        // Convert the image to grayscale
        BufferedImage grayscaleImage = new BufferedImage(targetWidth, targetHeight, BufferedImage.TYPE_BYTE_GRAY);
        var grayscaleGraphics = grayscaleImage.getGraphics();
        try {
            grayscaleGraphics.drawImage(resizedImage, 0, 0, null);
        } finally {
            grayscaleGraphics.dispose();
        }

        // Calculate the average grayscale pixel value
        int average = calculateAverage(grayscaleImage);

        // Create a binary hash based on pixel values
        StringBuilder hashBuilder = new StringBuilder(targetWidth * targetHeight);
        for (int y = 0; y < targetHeight; y++) {
            for (int x = 0; x < targetWidth; x++) {
                int pixelValue = new Color(grayscaleImage.getRGB(x, y)).getRed();
                hashBuilder.append(pixelValue > average ? "1" : "0");
            }
        }

        return hashBuilder.toString();
    }


    // Helper method to calculate the average grayscale pixel value
    private int calculateAverage(BufferedImage image) {

        int total = 0;
        int width = image.getWidth();
        int height = image.getHeight();

        for (int y = 0; y < height; y++) {
            for (int x = 0; x < width; x++) {
                total += new Color(image.getRGB(x, y)).getRed();
            }
        }

        return total / (width * height);
    }

}

View File

@ -1,29 +1,27 @@
package com.iqser.red.pdftronlogic.commons;
import static com.iqser.red.pdftronlogic.commons.VisualizationUtils.drawFeature;
import static com.iqser.red.pdftronlogic.commons.VisualizationUtils.drawRect;
import java.awt.Color;
import java.awt.geom.Area;
import java.awt.Shape;
import java.awt.geom.AffineTransform;
import java.awt.geom.GeneralPath;
import java.awt.geom.Rectangle2D;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.TreeSet;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatureFactory;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
import com.iqser.red.pdftronlogic.commons.features.ImageFeatures;
import com.iqser.red.pdftronlogic.commons.features.PathFeatures;
import com.iqser.red.pdftronlogic.commons.lookup.ElementFeatureLookup;
import org.springframework.stereotype.Service;
import com.google.common.primitives.Bytes;
import com.google.common.primitives.Doubles;
import com.pdftron.common.Matrix2D;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.ColorPt;
import com.pdftron.pdf.ColorSpace;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementBuilder;
import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.GState;
@ -40,10 +38,10 @@ import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
public class InvisibleElementRemovalService {
static public final double TOLERANCE = 1;
public static final String KNECON_OCR = "KNECON_OCR";
static public final double TOLERANCE = 1e-3;
/**
@ -51,141 +49,59 @@ public class InvisibleElementRemovalService {
* handled cases:
* -Text which is transparent or is set to not render
* -Elements outside of clipping path
* -Elements outside of Form XObjects
* -Elements that have been painted over by visible and filled Paths
* -Elements with the same color as background
* unhandled cases:
* -Elements covered by widely stroked path
* -Elements with the same color as background
* -Any Text set to clipping with its many interactions with other elements
*
* @param pdfFile The PDF file to process
* @param removePaths If this flag is set, invisible path elements will be removed
* @param delta If this flag is set only the removed Elements will be written to the output file.
* The Elements are red if they are removed by clipping path, blue for transparency, and a green bounding box for overlap.
* @param out OutputStream to write the resulting file to
* @param pdfFile The PDF file to process
* @param delta If this flag is set only the removed Elements will be written to the output file.
* The Elements are red if they are removed by clipping path, blue for transparency, and a green bounding box for overlap.
* @param out OutputStream to write the resulting file to
**/
@SneakyThrows
public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta, boolean removePaths, Set<String> markedContentToIgnore) {
public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta) {
PDFDoc pdfDoc = new PDFDoc(pdfFile);
try (pdfDoc) {
execute(pdfDoc, delta, removePaths, markedContentToIgnore);
ElementWriter writer = new ElementWriter();
ElementReader reader = new ElementReader();
Set<Long> visitedXObjIds = new TreeSet<>();
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
Page page = iterator.next();
visitedXObjIds.add(page.getSDFObj().getObjNum());
InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder()
.reader(reader)
.clippingPathStack(new ClippingPathStack(page.getMediaBox()))
.delta(delta)
.overlappedElements(new ArrayList<>())
.visibleElements(new ArrayList<>())
.visitedXObjIds(visitedXObjIds)
.build();
removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context);
context.visitedXObjIds().clear();
removeOverlappedElements(page, writer, context);
}
try {
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
} catch (Exception e) {
log.error("File could not be saved after invisible element removal");
throw new RuntimeException(e);
}
}
/**
* This method is equal to {@link #removeInvisibleElements(InputStream, OutputStream, boolean, boolean, Set)}, with removePaths == true and markedContentsToIgnore == emptySet().
*/
@SneakyThrows
public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta) {

    // Defaults of this overload: paths are subject to removal and no marked content is exempt.
    boolean removePaths = true;
    Set<String> markedContentToIgnore = Collections.emptySet();
    removeInvisibleElements(pdfFile, out, delta, removePaths, markedContentToIgnore);
}
/**
* This method is equal to {@link #removeInvisibleElements(InputStream, OutputStream, boolean, boolean, Set)}, with removePaths == true and markedContentsToIgnore = Set.of("KNECON_OCR").
*/
public void removeInvisibleElementsButKeepOcrText(InputStream pdfFile, OutputStream out, boolean delta) {

    // Content inside marked content tagged KNECON_OCR is passed on as "to ignore".
    Set<String> ocrMarkedContent = Set.of(KNECON_OCR);
    removeInvisibleElements(pdfFile, out, delta, true, ocrMarkedContent);
}
/**
* This method is equal to {@link #removeInvisibleElements(PDFDoc, boolean, boolean, Set)}, with removePaths == true and markedContentToIgnore == Set.of("KNECON_OCR").
*/
public void removeInvisibleElementsButKeepOcrText(PDFDoc pdfFile, boolean delta) {

    // Content inside marked content tagged KNECON_OCR is passed on as "to ignore".
    Set<String> ocrMarkedContent = Set.of(KNECON_OCR);
    removeInvisibleElements(pdfFile, delta, true, ocrMarkedContent);
}
/**
* This method is equal to {@link #removeInvisibleElements(InputStream, OutputStream, boolean, boolean, Set)}, with markedContentsToIgnore == emptySet().
*/
@SneakyThrows
public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta, boolean removePaths) {

    // No marked content is exempt in this overload.
    Set<String> markedContentToIgnore = Collections.emptySet();
    removeInvisibleElements(pdfFile, out, delta, removePaths, markedContentToIgnore);
}
/**
* This method is similar to {@link #removeInvisibleElements(InputStream, OutputStream, boolean, boolean, Set)}, just with a PDFDoc.
*/
@SneakyThrows
public void removeInvisibleElements(PDFDoc pdfDoc, boolean delta, boolean removePaths, Set<String> markedContentToIgnore) {
// Works on the caller's PDFDoc; this method itself neither saves nor closes the document, it only forwards all arguments to execute.
execute(pdfDoc, delta, removePaths, markedContentToIgnore);
}
/**
* This method is similar to {@link #removeInvisibleElements(InputStream, OutputStream, boolean, boolean)}, just with a PDFDoc.
*/
@SneakyThrows
public void removeInvisibleElements(PDFDoc pdfDoc, boolean delta, boolean removePaths) {

    // No marked content is exempt in this overload.
    Set<String> markedContentToIgnore = Collections.emptySet();
    execute(pdfDoc, delta, removePaths, markedContentToIgnore);
}
/**
* This method is equal to {@link #removeInvisibleElements(InputStream, OutputStream, boolean)}, just with a PDFDoc.
*/
@SneakyThrows
public void removeInvisibleElements(PDFDoc pdfDoc, boolean delta) {

    // Defaults of this overload: paths are subject to removal and no marked content is exempt.
    boolean removePaths = true;
    Set<String> markedContentToIgnore = Collections.emptySet();
    execute(pdfDoc, delta, removePaths, markedContentToIgnore);
}
@SneakyThrows
private void execute(PDFDoc pdfDoc, boolean delta, boolean removePaths, Set<String> markedContentToIgnore) {
log.info("Start removing invisible Elements");
try (PageIterator iterator = pdfDoc.getPageIterator(); ElementWriter writer = new ElementWriter(); ElementReader reader = new ElementReader()) {
Set<Long> visitedXObjIds = new TreeSet<>();
while (iterator.hasNext()) {
Page page = iterator.next();
visitedXObjIds.add(page.getSDFObj().getObjNum());
try (InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder()
.reader(reader)
.clippingPathStack(new ClippingPathStack(page.getMediaBox()))
.markedContentStack(new MarkedContentStack(pdfDoc))
.removePaths(removePaths)
.delta(delta)
.overlappedElements(new ElementFeatureLookup())
.visibleElements(new ElementFeatureLookup())
.visitedXObjIds(visitedXObjIds)
.markedContentToIgnore(markedContentToIgnore)
.build()) {
removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context);
context.visitedXObjIds().clear();
context.markedContentStack().clear();
removeOverlappedElements(page, writer, context);
}
}
}
log.info("Finished removing invisible Elements");
writer.destroy();
reader.destroy();
pdfDoc.close();
}
@ -194,7 +110,6 @@ public class InvisibleElementRemovalService {
InvisibleElementRemovalContext context) throws PDFNetException {
context.reader().begin(page);
context.markedContentStack().clear();
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
processElements(writer, context);
writer.end();
@ -204,13 +119,7 @@ public class InvisibleElementRemovalService {
private void processElements(ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
for (Element element = context.reader().next(); element != null; element = context.reader().next()) {
if (context.markedContentStack().currentMarkedContentContainsAny(context.markedContentToIgnore()) && element.getType() != Element.e_marked_content_end) {
writer.writeElement(element);
continue;
}
for (Element element = context.reader().next(); element != null; element = context.reader().next())
switch (element.getType()) {
case Element.e_image, Element.e_inline_image -> processImages(element, writer, context);
case Element.e_text -> processText(element, writer, context);
@ -224,94 +133,75 @@ public class InvisibleElementRemovalService {
context.clippingPathStack().leaveGState();
writer.writeElement(element);
}
case Element.e_marked_content_begin -> {
context.markedContentStack().enterMarkedContent(element.getMCTag().getName());
writer.writeElement(element);
}
case Element.e_marked_content_end -> {
context.markedContentStack().leaveMarkedContent();
writer.writeElement(element);
}
default -> writer.writeElement(element);
}
}
}
private void processImages(Element imageElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
try (Rect rect = imageElement.getBBox()) {
Rect rect = imageElement.getBBox();
if (rect == null) {
return;
}
if (rect == null) {
return;
}
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
if (inClippingPath) {
ImageFeatures imageFeatures = ElementFeatureFactory.buildImage(imageElement);
if (!(context.markedContentStack.contextHasTransparency() || imageFeatures.isTransparent() || imageFeatures.isImageMask() || imageFeatures.isSoftMask())) {
calculateOverlaps(context, imageFeatures, imageFeatures.isMasked());
}
context.visibleElements().add(imageFeatures);
}
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
if (context.delta() ^ inClippingPath) {
writer.writeElement(imageElement);
}
if (!context.delta() && inClippingPath) {
context.visibleElements().add(ElementFeatures.extractFeatures(imageElement));
}
if (context.delta() ^ inClippingPath) {
writer.writeElement(imageElement);
}
}
private void processText(Element textElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
try (Rect textBBox = textElement.getBBox()) {
Rect rect = textElement.getBBox();
if (textBBox == null) {
writer.writeElement(textElement);
return;
}
if (rect == null) {
writer.writeElement(textElement);
return;
}
GState gState = textElement.getGState();
GState gState = textElement.getGState();
boolean inClippingPath = context.clippingPathStack().almostIntersects(textBBox.getX1(), textBBox.getY1(), textBBox.getWidth(), textBBox.getHeight());
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
boolean isTextVisible = isTextRenderedVisibly(gState, textBBox, context);
boolean isTextVisible = isTextRenderedVisibly(gState);
if (inClippingPath && isTextVisible) {
context.visibleElements().add(ElementFeatures.extractFeatures(textElement));
}
if (!context.delta()) {
if (inClippingPath && isTextVisible) {
context.visibleElements().add(ElementFeatureFactory.buildText(textElement, true, context.delta()));
}
if (!context.delta()) {
if (inClippingPath && isTextVisible) {
writer.writeElement(textElement);
} else if (textElement.hasTextMatrix()) {
writer.writeElement(textElement);
} else if (textElement.hasTextMatrix()) {
/*
PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element.
hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands.
Therefore, the position of a following Tj is affected by not writing the first Element.
This is why, we write only the Tm command:
*/
textElement.setTextData(new byte[]{});
writer.writeElement(textElement);
}
} else {
if (!inClippingPath) {
gState.setFillColorSpace(ColorSpace.createDeviceRGB());
// red for elements removed by clipping path
try (var color = new ColorPt(1, 0, 0)) {
gState.setFillColor(color);
}
writer.writeElement(textElement);
}
if (!isTextVisible) {
gState.setFillColorSpace(ColorSpace.createDeviceRGB());
// blue for elements removed due to transparency or not rendered or same color as background
try (var color = new ColorPt(0, 0, 1)) {
gState.setFillColor(color);
}
gState.setTextRenderMode(GState.e_fill_text);
gState.setFillOpacity(1);
writer.writeElement(textElement);
}
writer.writeGStateChanges(textElement);
}
} else {
if (!inClippingPath) {
gState.setFillColorSpace(ColorSpace.createDeviceRGB());
// red for elements removed by clipping path
gState.setFillColor(new ColorPt(1, 0, 0));
writer.writeElement(textElement);
}
if (!isTextVisible) {
gState.setFillColorSpace(ColorSpace.createDeviceRGB());
// blue for elements removed due to transparency or not rendered
gState.setFillColor(new ColorPt(0, 0, 1));
gState.setTextRenderMode(GState.e_fill_text);
gState.setFillOpacity(1);
writer.writeElement(textElement);
}
}
}
@ -324,27 +214,19 @@ public class InvisibleElementRemovalService {
if (!context.visitedXObjIds().contains(formObj.getObjNum())) {
context.visitedXObjIds().add(formObj.getObjNum());
// writer needs to be newly initialized when entering a new content stream
// see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
try (ElementWriter formWriter = new ElementWriter()) {
context.markedContentStack.enterForm(formElement);
context.clippingPathStack().enterNewGState();
try (var formElementBBOX = formElement.getBBox()) {
context.clippingPathStack().intersectClippingPath(Converter.toRectangle2D(formElementBBOX));
context.reader().formBegin();
formWriter.begin(formObj);
ElementWriter formWriter = new ElementWriter();
context.reader().formBegin();
formWriter.begin(formObj);
context.reader().clearChangeList();
formWriter.setDefaultGState(context.reader());
context.reader().clearChangeList();
formWriter.setDefaultGState(context.reader());
processElements(formWriter, context);
formWriter.end();
context.reader().end();
context.clippingPathStack().leaveGState();
context.markedContentStack.leaveForm();
}
}
processElements(formWriter, context);
formWriter.end();
formWriter.destroy();
context.reader().end();
}
}
@ -353,86 +235,80 @@ public class InvisibleElementRemovalService {
PathData pathData = pathElement.getPathData();
try (var bbox = pathElement.getBBox()) {
if (pathData.getOperators().length == 0 && pathData.getPoints().length == 0 || bbox == null) {
writer.writeElement(pathElement);
return;
if (pathData.getOperators().length == 0 && pathData.getPoints().length == 0) {
writer.writeGStateChanges(pathElement);
return;
}
GeneralPath linePath = convertToGeneralPath(pathData);
//transform path to initial user space
var ctm = pathElement.getCTM();
var affineTransform = toAffineTransform(ctm);
linePath.transform(affineTransform);
var rect = linePath.getBounds2D();
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX(), rect.getY(), rect.getWidth(), rect.getHeight());
if (pathElement.isClippingPath()) {
if (pathElement.isClipWindingFill()) {
linePath.setWindingRule(GeneralPath.WIND_NON_ZERO);
} else {
linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD);
}
PathFeatures pathFeatures = ElementFeatureFactory.buildPath(pathElement);
GeneralPath linePath = pathFeatures.getLinePath();
var rect = linePath.getBounds2D();
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX(), rect.getY(), rect.getWidth(), rect.getHeight());
if (pathElement.isClippingPath()) {
if (pathElement.isClipWindingFill()) {
linePath.setWindingRule(GeneralPath.WIND_NON_ZERO);
} else {
linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD);
}
context.clippingPathStack().intersectClippingPath(linePath);
pathElement.setPathClip(!context.delta());
writer.writeElement(pathElement);
context.clippingPathStack().intersectClippingPath(linePath);
pathElement.setPathClip(!context.delta());
writer.writeElement(pathElement);
} else {
if (pathElement.isWindingFill()) {
linePath.setWindingRule(GeneralPath.WIND_NON_ZERO);
} else {
if (pathElement.isWindingFill()) {
linePath.setWindingRule(GeneralPath.WIND_NON_ZERO);
} else {
linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD);
}
linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD);
}
if (inClippingPath) {
if (!context.markedContentStack.contextHasTransparency() && isFilledAndNonTransparent(pathElement)) {
calculateOverlaps(context, pathFeatures, false);
}
context.visibleElements().add(ElementFeatureFactory.extractFeatures(pathElement));
if (inClippingPath) {
if (isFilledAndNonTransparent(pathElement)) {
List<ElementFeatures> currentOverlappedElements = context.visibleElements()
.stream()
.filter(features -> almostContains(linePath, features.getBoundingBox()))
.toList();
context.overlappedElements().addAll(currentOverlappedElements);
context.visibleElements().removeAll(currentOverlappedElements);
}
if (!context.delta() && (inClippingPath || !context.removePaths())) {
context.visibleElements().add(ElementFeatures.extractFeatures(pathElement));
if (!context.delta()) {
writer.writeElement(pathElement);
}
if (context.delta() && !inClippingPath && context.removePaths()) {
try (var color = new ColorPt(1, 0, 0)) {
pathElement.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
pathElement.getGState().setFillColor(color);
pathElement.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
pathElement.getGState().setStrokeColor(color);
writer.writeElement(pathElement);
}
}
}
if (context.delta() && !inClippingPath) {
pathElement.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
pathElement.getGState().setFillColor(new ColorPt(1, 0, 0));
pathElement.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
pathElement.getGState().setStrokeColor(new ColorPt(1, 0, 0));
writer.writeElement(pathElement);
}
}
}
/**
 * Moves every element that the lookup reports as overlapped by {@code elementFeatures} from the context's
 * visible elements to its overlapped elements.
 *
 * @param context         the per-page removal context holding both lookups
 * @param elementFeatures the features of the element to test the visible elements against
 * @param textOnly        forwarded unchanged to the lookup's {@code findOverlapped}
 */
private void calculateOverlaps(InvisibleElementRemovalContext context, ElementFeatures elementFeatures, boolean textOnly) {

    List<ElementFeatures> newlyOverlapped = context.visibleElements().findOverlapped(elementFeatures, textOnly);
    context.overlappedElements().addAll(newlyOverlapped);
    context.visibleElements().removeAll(newlyOverlapped);
}
private void removeOverlappedElements(Page page, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
context.reader().begin(page);
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
if (context.delta()) {
// green for element removed due to overlapping
context.overlappedElements()
.forEach(feature -> drawFeature(writer, feature, Color.GREEN));
context.overlappedElements().forEach(feature -> drawBBox(writer, feature.getBoundingBox(), "#00FF00"));
context.overlappedElements().clear();
}
processOverlappedElements(writer, context);
writer.end();
context.reader().end();
if (!context.overlappedElements().isEmpty()) {
log.debug(context.overlappedElements().size() + " overlapped elements have not been found or removed");
if (context.overlappedElements().size() > 0) {
log.warn(context.overlappedElements().size() + " overlapped elements have not been found or removed");
}
}
@ -440,29 +316,28 @@ public class InvisibleElementRemovalService {
private void processOverlappedElements(ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
for (Element element = context.reader().next(); element != null; element = context.reader().next()) {
if (context.markedContentStack().currentMarkedContentContainsAny(context.markedContentToIgnore()) && element.getType() != Element.e_marked_content_end) {
writer.writeElement(element);
continue;
}
switch (element.getType()) {
case Element.e_form -> processFormOverlappedElements(writer, element, context);
case Element.e_image, Element.e_inline_image, Element.e_text -> removeOverlappedElement(writer, context, element);
case Element.e_path -> {
if (context.removePaths()) {
removeOverlappedElement(writer, context, element);
} else {
writer.writeElement(element);
case Element.e_path, Element.e_image, Element.e_inline_image, Element.e_text -> {
boolean anyMatch = false;
for (ElementFeatures elementToRemove : context.overlappedElements()) {
if (elementToRemove.almostMatches(element)) {
context.overlappedElements().remove(elementToRemove);
anyMatch = true;
break;
}
}
if (!anyMatch) {
writer.writeElement(element);
} else if (element.getType() == 3 && element.hasTextMatrix()) {
/*
PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element.
hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands.
Therefore, the position of a following Tj is affected by not writing the first Element.
This is why, we write only the Tm command:
*/
writer.writeGStateChanges(element);
}
}
case Element.e_marked_content_begin -> {
context.markedContentStack().enterMarkedContent(element.getMCTag().getName());
writer.writeElement(element);
}
case Element.e_marked_content_end -> {
context.markedContentStack().leaveMarkedContent();
writer.writeElement(element);
}
default -> writer.writeElement(element);
}
@ -470,36 +345,6 @@ public class InvisibleElementRemovalService {
}
/**
 * Writes {@code element} unless it matches one of the elements remembered as overlapped in the context.
 * <p>
 * Elements without a bounding box are always written. A matched element is removed from the context's
 * overlapped elements and dropped from the output, except for the text-matrix case explained below.
 *
 * @param writer  the writer for the content stream currently being rewritten
 * @param context the per-page removal context holding the overlapped elements
 * @param element the element read from the content stream
 * @throws PDFNetException if PDFTron fails to read from or write the element
 */
private static void removeOverlappedElement(ElementWriter writer, InvisibleElementRemovalContext context, Element element) throws PDFNetException {

    try (Rect bbox = element.getBBox()) {
        if (bbox == null) {
            writer.writeElement(element);
            return;
        }
    }

    Optional<ElementFeatures> optionalElementMatch = context.overlappedElements()
            .anyMatch(ElementFeatureFactory.extractFeatures(element));

    if (optionalElementMatch.isEmpty()) {
        // Not overlapped: keep the element as it is.
        writer.writeElement(element);
        return;
    }

    context.overlappedElements().remove(optionalElementMatch.get());

    if (element.getType() == Element.e_text && element.hasTextMatrix()) {
        /*
        PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element.
        hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands.
        Therefore, the position of a following Tj is affected by not writing the first Element.
        This is why, we write only the Tm command:
         */
        element.setTextData(new byte[0]);
        writer.writeElement(element);
    }
}
private void processFormOverlappedElements(ElementWriter writer, Element formElement, InvisibleElementRemovalContext context) throws PDFNetException {
writer.writeElement(formElement);
@ -509,91 +354,69 @@ public class InvisibleElementRemovalService {
context.visitedXObjIds().add(formObj.getObjNum());
// writer needs to be newly initialized when entering a new content stream
// see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
try (ElementWriter formWriter = new ElementWriter()) {
context.reader().formBegin();
formWriter.begin(formObj);
ElementWriter formWriter = new ElementWriter();
context.reader().formBegin();
formWriter.begin(formObj);
context.reader().clearChangeList();
formWriter.setDefaultGState(context.reader());
context.reader().clearChangeList();
formWriter.setDefaultGState(context.reader());
processOverlappedElements(formWriter, context);
formWriter.end();
context.reader().end();
processOverlappedElements(formWriter, context);
formWriter.end();
formWriter.destroy();
context.reader().end();
}
}
/**
 * Decides from the graphics state alone whether text would leave visible marks.
 * <p>
 * Text counts as invisible when its render mode is "invisible", or when every paint operation of its render
 * mode (fill, stroke, or both) has an opacity of 0. All other render modes are treated as visible.
 *
 * @param gState the graphics state of the text element
 * @return {@code false} if the text is certainly not rendered visibly, {@code true} otherwise
 * @throws PDFNetException if the graphics state cannot be read
 */
private boolean isTextRenderedVisibly(GState gState) throws PDFNetException {

    int renderMode = gState.getTextRenderMode();
    return switch (renderMode) {
        case GState.e_invisible_text -> false;
        case GState.e_fill_text -> gState.getFillOpacity() != 0;
        case GState.e_stroke_text -> gState.getStrokeOpacity() != 0;
        // filled and stroked text disappears only when both paints are fully transparent
        case GState.e_fill_stroke_text -> gState.getFillOpacity() != 0 || gState.getStrokeOpacity() != 0;
        default -> true;
    };
}
/**
 * Replays the operators of a PDFTron {@link PathData} onto a new AWT {@link GeneralPath}.
 * <p>
 * Coordinates are consumed from the point array in order, as many per operator as that operator needs.
 * The resulting path is in the same coordinate space as the path data; no transformation is applied here.
 *
 * @param pathData operators and points of a path element
 * @return the equivalent AWT path
 * @throws PDFNetException if the path data cannot be read or contains an operator that is not handled here
 */
private GeneralPath convertToGeneralPath(PathData pathData) throws PDFNetException {

    GeneralPath outline = new GeneralPath();
    Iterator<Double> coordinates = Doubles.asList(pathData.getPoints()).iterator();

    for (byte operator : pathData.getOperators()) {
        switch (operator) {
            case PathData.e_moveto:
                outline.moveTo(coordinates.next(), coordinates.next());
                break;
            case PathData.e_lineto:
                outline.lineTo(coordinates.next(), coordinates.next());
                break;
            case PathData.e_cubicto:
                outline.curveTo(coordinates.next(), coordinates.next(), coordinates.next(), coordinates.next(), coordinates.next(), coordinates.next());
                break;
            case PathData.e_closepath:
                outline.closePath();
                break;
            case PathData.e_rect: {
                // a rectangle is given as origin plus width/height and is expanded into a closed four-corner subpath
                double left = coordinates.next();
                double bottom = coordinates.next();
                double width = coordinates.next();
                double height = coordinates.next();
                outline.moveTo(left, bottom);
                outline.lineTo(left + width, bottom);
                outline.lineTo(left + width, bottom + height);
                outline.lineTo(left, bottom + height);
                outline.closePath();
                break;
            }
            default:
                throw new PDFNetException("Invalid Element Type", 0, "", "", "");
        }
    }
    return outline;
}
private boolean isTextRenderedVisibly(GState gState, Rect textBBox, InvisibleElementRemovalContext context) throws PDFNetException {
private boolean almostContains(Shape outer, Rectangle2D inner) {
//To address inconsistencies in the calculation of the bounding box we slightly shrink the inner rectangle
return switch (gState.getTextRenderMode()) {
case GState.e_invisible_text -> false;
case GState.e_fill_text -> fillIsVisible(gState, textBBox, context);
case GState.e_stroke_text -> strokeIsVisible(gState, textBBox, context);
default -> fillIsVisible(gState, textBBox, context) || strokeIsVisible(gState, textBBox, context);
};
}
double x_with_tolerance = inner.getX() >= 0 ? inner.getX() + TOLERANCE : inner.getX() - TOLERANCE;
double y_with_tolerance = inner.getY() >= 0 ? inner.getY() + TOLERANCE : inner.getY() - TOLERANCE;
double height_with_tolerance = inner.getHeight() - (2 * TOLERANCE);
double width_with_tolerance = inner.getWidth() - (2 * TOLERANCE);
Rectangle2D innerRect = new Rectangle2D.Double(x_with_tolerance, y_with_tolerance, width_with_tolerance, height_with_tolerance);
/**
 * Checks whether the stroke of a text element is visible.
 * <p>
 * The stroke is visible when its opacity is not 0 and its color differs from the background behind the text.
 *
 * @param gState   the graphics state of the text element
 * @param textBBox the bounding box of the text element
 * @param context  the per-page removal context, used to look up the background
 * @return {@code true} if the stroke is visible
 * @throws PDFNetException if the graphics state cannot be read
 */
private boolean strokeIsVisible(GState gState, Rect textBBox, InvisibleElementRemovalContext context) throws PDFNetException {

    if (gState.getStrokeOpacity() == 0) {
        return false;
    }
    // The ColorPt is released after use, the same way fillIsVisible treats the fill color.
    try (var color = gState.getStrokeColor()) {
        return differentColorThanBackgroundColor(Converter.convertColor(gState.getStrokeColorSpace(), color), textBBox, context);
    }
}
/**
 * Checks whether the fill of a text element is visible.
 * <p>
 * The fill is visible when its opacity is not 0 and its color differs from the background behind the text.
 *
 * @param gState   the graphics state of the text element
 * @param textBBox the bounding box of the text element
 * @param context  the per-page removal context, used to look up the background
 * @return {@code true} if the fill is visible
 * @throws PDFNetException if the graphics state cannot be read
 */
private boolean fillIsVisible(GState gState, Rect textBBox, InvisibleElementRemovalContext context) throws PDFNetException {

    try (var fillColorPt = gState.getFillColor()) {
        if (gState.getFillOpacity() == 0) {
            return false;
        }
        Color fillColor = Converter.convertColor(gState.getFillColorSpace(), fillColorPt);
        return differentColorThanBackgroundColor(fillColor, textBBox, context);
    }
}
/**
 * Decides whether text painted in {@code fillColor} stands out from what has been painted behind it.
 * <p>
 * Without any filled, non-white path intersecting the text, the background is taken to be white. Otherwise the
 * text only counts as same-colored if the paths sharing its exact color, merged together, contain the text's
 * bounding box.
 *
 * @param fillColor the color the text is painted in
 * @param textBBox  the bounding box of the text element
 * @param context   the per-page removal context holding the visible elements
 * @return {@code true} if the text color differs from its background
 */
@SneakyThrows
private boolean differentColorThanBackgroundColor(Color fillColor, Rect textBBox, InvisibleElementRemovalContext context) {

    List<PathFeatures> background = findVisiblePathElementsThatIntersect(textBBox, context);
    if (background.isEmpty()) {
        return !fillColor.equals(Color.WHITE);
    }

    List<PathFeatures> sameColoredPaths = new ArrayList<>();
    for (PathFeatures backgroundPath : background) {
        if (backgroundPath.getFillColor().equals(fillColor)) {
            sameColoredPaths.add(backgroundPath);
        }
    }
    if (sameColoredPaths.isEmpty()) {
        return true;
    }

    Area sameColoredArea = mergeLinePathsToArea(sameColoredPaths);
    boolean textFullyCovered = ComparisonUtils.almostContains(sameColoredArea, Converter.toRectangle2D(textBBox));
    return !textFullyCovered;
}
/**
 * Collects the visible path elements that can act as a colored background for the given text box.
 *
 * @param textBBox the bounding box of the text element
 * @param context  the per-page removal context holding the visible elements
 * @return all visible elements reported as intersecting {@code textBBox} that are paths, are filled and are not
 * filled white
 */
private static List<PathFeatures> findVisiblePathElementsThatIntersect(Rect textBBox, InvisibleElementRemovalContext context) {

    List<PathFeatures> backgroundPaths = new ArrayList<>();
    context.visibleElements().findIntersecting(textBBox)
            .forEach(candidate -> {
                if (!(candidate instanceof PathFeatures path)) {
                    return;
                }
                // white paths are indistinguishable from the assumed white page background
                if (path.getFillColor().equals(Color.WHITE)) {
                    return;
                }
                if (path.isFilled()) {
                    backgroundPaths.add(path);
                }
            });
    return backgroundPaths;
}
private static Area mergeLinePathsToArea(List<PathFeatures> pathElementsWithSameColor) {
Area backgroundArea = new Area();
pathElementsWithSameColor.stream()
.map(PathFeatures::getLinePath)
.map(Area::new)
.forEach(backgroundArea::add);
return backgroundArea;
return outer.contains(innerRect);
}
@ -603,26 +426,39 @@ public class InvisibleElementRemovalService {
}
/**
 * Draws the outline of a rectangle onto the content stream behind {@code writer}.
 *
 * @param writer   the writer to place the rectangle with
 * @param r        position and size of the rectangle
 * @param hexcolor stroke color in the form {@code #RRGGBB}
 */
@SneakyThrows
private void drawBBox(ElementWriter writer, Rectangle2D r, String hexcolor) {

    ColorPt colorPt = new ColorPt(Integer.parseInt(hexcolor.substring(1, 3), 16) / 255d,
                                  Integer.parseInt(hexcolor.substring(3, 5), 16) / 255d,
                                  Integer.parseInt(hexcolor.substring(5, 7), 16) / 255d);
    // Both PDFTron objects are destroyed in finally blocks so they are released even if drawing fails.
    try {
        ElementBuilder eb = new ElementBuilder();
        try {
            Element rect = eb.createRect(r.getX(), r.getY(), r.getWidth(), r.getHeight());
            rect.setPathStroke(true);
            rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
            rect.getGState().setStrokeColor(colorPt);

            writer.writePlacedElement(rect);
        } finally {
            eb.destroy();
        }
    } finally {
        colorPt.destroy();
    }
}
/**
 * Converts a PDFTron {@link Matrix2D} into an AWT {@link AffineTransform}.
 *
 * @param ctm the matrix to convert
 * @return a transform built from the matrix entries a, b, c, d, h and v, in that order
 * @throws PDFNetException if a matrix entry cannot be read
 */
private static AffineTransform toAffineTransform(Matrix2D ctm) throws PDFNetException {

    // named after the AffineTransform constructor parameters they are passed as
    double m00 = ctm.getA();
    double m10 = ctm.getB();
    double m01 = ctm.getC();
    double m11 = ctm.getD();
    double m02 = ctm.getH();
    double m12 = ctm.getV();
    return new AffineTransform(m00, m10, m01, m11, m02, m12);
}
@Builder
private record InvisibleElementRemovalContext(
boolean removePaths,
boolean delta,
ElementReader reader,
ClippingPathStack clippingPathStack,
MarkedContentStack markedContentStack,
ElementFeatureLookup overlappedElements,
ElementFeatureLookup visibleElements,
Set<Long> visitedXObjIds,
Set<String> markedContentToIgnore
) implements AutoCloseable {
@Override
public void close() {
overlappedElements.close();
visibleElements.close();
}
List<ElementFeatures> overlappedElements,
List<ElementFeatures> visibleElements,
Set<Long> visitedXObjIds) {
}
}
}

View File

@ -1,122 +0,0 @@
package com.iqser.red.pdftronlogic.commons;
import java.util.Deque;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Optional;
import java.util.Set;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.ocg.Group;
import com.pdftron.sdf.Obj;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
/**
 * Tracks which marked-content sections and form XObjects are currently open while a content stream is traversed.
 * <p>
 * Two independent stacks are kept: one for marked-content tags and one for form XObjects. Both are consulted by
 * {@link #contextHasTransparency()}.
 */
@RequiredArgsConstructor
public class MarkedContentStack {

    private final PDFDoc pdfDoc;
    final Deque<MarkedContent> stack = new LinkedList<>();
    final Deque<Form> formStack = new LinkedList<>();


    /**
     * Opens a marked-content section.
     *
     * @param name the tag of the section; any tag starting with "OC" is recorded as optional content
     */
    public void enterMarkedContent(String name) {

        // NOTE(review): the prefix check also matches tags that merely start with "OC" - confirm this is intended
        stack.push(new MarkedContent(name, name.startsWith("OC")));
    }


    /**
     * Opens a form XObject and records whether its dictionary has an "OC" entry and whether its "Group" entry
     * has the subtype "Transparency".
     *
     * @param formElement the form element being entered
     */
    @SneakyThrows
    public void enterForm(Element formElement) {

        Obj xObject = formElement.getXObject();
        Obj oc = xObject.findObj("OC");
        Obj group = xObject.findObj("Group");

        boolean transparency = false;
        if (group != null) {
            Obj groupSubType = group.findObj("S");
            transparency = groupSubType != null && groupSubType.isName() && groupSubType.getName().equals("Transparency");
        }
        formStack.push(new Form(xObject.getObjNum(), oc != null, transparency));
    }


    /**
     * Closes the innermost marked-content section. Does nothing if no section is open, so that a content stream
     * with an unbalanced end-of-marked-content operator does not abort processing.
     */
    public void leaveMarkedContent() {

        stack.poll();
    }


    /**
     * @return the tag of the innermost open marked-content section, or an empty string if none is open
     */
    public String currentMarkedContent() {

        MarkedContent innermost = stack.peek();
        return innermost == null ? "" : innermost.name();
    }


    /**
     * @param name the tag to look for
     * @return {@code true} if any open marked-content section, at any nesting depth, has exactly this tag
     */
    public boolean currentMarkedContentContains(String name) {

        return stack.stream()
                .anyMatch(markedContent -> markedContent.name().equals(name));
    }


    /**
     * @param names the tags to look for
     * @return {@code true} if any open marked-content section, at any nesting depth, has one of these tags;
     * {@code false} if no section is open
     */
    public boolean currentMarkedContentContainsAny(Set<String> names) {

        // with no open section the stream is empty and names is never touched
        return stack.stream()
                .anyMatch(markedContent -> names.contains(markedContent.name()));
    }


    /**
     * Forgets all open marked-content sections. The form stack is left untouched.
     */
    public void clear() {

        stack.clear();
    }


    /**
     * @return {@code true} if any open form is optional content or a transparency group, or if any open
     * marked-content section is optional content
     */
    public boolean contextHasTransparency() {

        return formStack.stream()
                .anyMatch(form -> form.optionalContent || form.transparency) //
                || stack.stream()
                .anyMatch(MarkedContent::optionalContent);
    }


    /**
     * Closes the innermost form.
     *
     * @throws java.util.NoSuchElementException if no form is open
     */
    public void leaveForm() {

        formStack.pop();
    }


    private record MarkedContent(String name, boolean optionalContent) {

    }

    private record Form(long ref, boolean optionalContent, boolean transparency) {

    }

}

View File

@ -1,132 +0,0 @@
package com.iqser.red.pdftronlogic.commons;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.*;
import com.pdftron.pdf.ocg.Group;
import com.pdftron.pdf.ocg.OCMD;
import com.pdftron.sdf.Obj;
import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j;
import java.util.Set;
import java.util.TreeSet;
/**
 * Removes content that belongs to an optional content group named "Watermark": every page (and nested form) is
 * rewritten and elements whose XObject references such a group through its "OC" entry are left out.
 */
@Slf4j
@UtilityClass
public class OCGWatermarkRemovalService {

    /**
     * Rewrites the document without OCG watermarks; does nothing when no group named "Watermark" exists.
     */
    @SneakyThrows
    public void removeWatermarks(PDFDoc pdfDoc) {

        if (!hasOCGWatermarks(pdfDoc)) {
            return;
        }
        removeOCGWatermarks(pdfDoc);
    }


    /**
     * @return true when the document's OCG list contains a valid group whose name is "Watermark"
     */
    @SneakyThrows
    private boolean hasOCGWatermarks(PDFDoc pdfDoc) {

        Obj ocgs = pdfDoc.getOCGs();
        if (ocgs == null) {
            return false;
        }
        int index = 0;
        while (index < ocgs.size()) {
            Group candidate = new Group(ocgs.getAt(index));
            if (candidate.isValid() && candidate.getName().equals("Watermark")) {
                return true;
            }
            index++;
        }
        return false;
    }


    /**
     * Rewrites every page, sharing one reader, one writer and one set of already processed form ids.
     */
    @SneakyThrows
    private void removeOCGWatermarks(PDFDoc pdfDoc) {

        try (PageIterator pages = pdfDoc.getPageIterator(); ElementReader reader = new ElementReader(); ElementWriter writer = new ElementWriter()) {
            Set<Long> visitedXObjIds = new TreeSet<>();
            while (pages.hasNext()) {
                writeAllElementsExceptWatermarks(pages.next(), reader, writer, visitedXObjIds);
            }
        }
    }


    /**
     * Replaces the content of one page with its elements minus the watermark elements.
     */
    @SneakyThrows
    private void writeAllElementsExceptWatermarks(Page page, ElementReader reader, ElementWriter writer, Set<Long> visitedXObjIds) {

        reader.begin(page);
        writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
        processElements(page, reader, writer, visitedXObjIds);
        writer.end();
        reader.end();
    }


    /**
     * Copies the reader's remaining elements to the writer, dropping watermark elements and descending into forms.
     */
    private void processElements(Page page, ElementReader reader, ElementWriter writer, Set<Long> visitedXObjIds) throws PDFNetException {

        Element element;
        while ((element = reader.next()) != null) {
            if (inOCGWatermark(element)) {
                continue;
            }
            if (element.getType() == Element.e_form) {
                processForms(page, element, reader, writer, visitedXObjIds);
            } else {
                writer.writeElement(element);
            }
        }
    }


    /**
     * @return true when the element's XObject has an "OC" entry that is a valid OCMD whose OCGs form a valid group
     * named "Watermark"
     */
    @SneakyThrows
    private boolean inOCGWatermark(Element element) {

        var xObj = element.getXObject();
        if (xObj == null) {
            return false;
        }
        Obj oc = xObj.findObj("OC");
        if (oc == null) {
            return false;
        }
        OCMD ocmd = new OCMD(oc);
        if (!ocmd.isValid()) {
            return false;
        }
        Group group = new Group(ocmd.getOCGs());
        return group.isValid() && group.getName().equals("Watermark");
    }


    /**
     * Writes the form element itself and, the first time a form object is seen, rewrites the form's own content
     * stream without watermark elements.
     */
    @SneakyThrows
    private void processForms(Page page, Element element, ElementReader reader, ElementWriter writer, Set<Long> visitedXObjIds) {

        writer.writeElement(element);
        boolean firstVisit = visitedXObjIds.add(element.getXObject().getObjNum());
        if (!firstVisit) {
            return;
        }
        // writer needs to be newly initialized when entering a new content stream
        // see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
        try (ElementWriter formWriter = new ElementWriter()) {
            reader.formBegin();
            formWriter.begin(element.getXObject());
            reader.clearChangeList();
            formWriter.setDefaultGState(reader);
            processElements(page, reader, formWriter, visitedXObjIds);
            formWriter.end();
            reader.end();
        }
    }

}

View File

@ -1,49 +0,0 @@
package com.iqser.red.pdftronlogic.commons;
import java.lang.reflect.Field;
import com.pdftron.pdf.Font;
import lombok.experimental.UtilityClass;
/**
 * Helpers for checking the state of native PDFNet handles.
 */
@UtilityClass
public class PDFNetUtils {

    /**
     * Asserts that the given font can still be used.
     * <p>
     * Fails when the font's own handle is 0, or when its reference handle has a field named {@code impl}
     * (looked up reflectively through the class hierarchy) whose value is 0. When no such field exists the
     * second check is skipped.
     *
     * @param font font to check
     * @throws AssertionError when one of the handles is 0 or the {@code impl} field cannot be read
     */
    @SuppressWarnings("PMD")
    public void requireFontNotClosed(Font font) {

        try {
            if (font.__GetHandle() == 0L) {
                throw new AssertionError("Font is already closed!");
            }
            Object refHandle = font.__GetRefHandle();
            Field implField = findImplField(refHandle.getClass());
            if (implField != null) {
                long implValue = (Long) implField.get(refHandle);
                if (implValue == 0L) {
                    throw new AssertionError("Associated ElementReader of Font is already closed!");
                }
            }
        } catch (IllegalAccessException e) {
            // keep the reflective failure as cause so the real reason is visible in the stack trace
            throw new AssertionError("Font Ref is missing the field impl, should never happen!", e);
        }
    }


    /**
     * Searches {@code start} and its superclasses for a declared field named {@code impl} and makes it accessible.
     *
     * @return the field, or null when no class in the hierarchy declares it
     */
    private Field findImplField(Class<?> start) {

        for (Class<?> clazz = start; clazz != null; clazz = clazz.getSuperclass()) {
            try {
                Field implField = clazz.getDeclaredField("impl");
                implField.setAccessible(true);
                return implField;
            } catch (NoSuchFieldException ignored) {
                // not declared on this level, continue with the superclass
            }
        }
        return null;
    }

}

View File

@ -1,71 +0,0 @@
package com.iqser.red.pdftronlogic.commons;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatureFactory;
import com.iqser.red.pdftronlogic.commons.features.ImageFeatures;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.sdf.Obj;
import lombok.experimental.UtilityClass;
/**
 * Collects the features of all images of a PDF, page by page.
 */
@UtilityClass
public class PdfImageExtraction {

    /**
     * Reads the PDF from the stream and builds {@link ImageFeatures} for every image and inline image, including
     * those nested in form XObjects.
     *
     * @param fileStream PDF content
     * @return one list per page, in page-iterator order
     * @throws IOException     declared for reading the stream
     * @throws PDFNetException when PDFNet fails to open or read the document
     */
    public List<List<ImageFeatures>> extractImages(InputStream fileStream) throws IOException, PDFNetException {

        // the page iterator is closed together with the document and the reader
        try (PDFDoc pdfDoc = new PDFDoc(fileStream); ElementReader reader = new ElementReader(); var iter = pdfDoc.getPageIterator()) {
            List<List<ImageFeatures>> imagesPerPage = new ArrayList<>(pdfDoc.getPageCount());

            while (iter.hasNext()) {
                Page page = iter.next();

                // the visited set is per page; it is seeded with the page's own object number
                Set<Long> visitedXObjIds = new HashSet<>();
                visitedXObjIds.add(page.getSDFObj().getObjNum());

                List<ImageFeatures> imageFeatures = new LinkedList<>();
                reader.begin(page);
                processElements(reader, imageFeatures, visitedXObjIds);
                reader.end();

                imagesPerPage.add(imageFeatures);
            }
            return imagesPerPage;
        }
    }


    /**
     * Consumes the reader's remaining elements: images are recorded, forms are entered once per object number,
     * everything else is ignored.
     */
    private void processElements(ElementReader reader, List<ImageFeatures> imageFeaturesOnPage, Set<Long> visitedXObjIds) throws PDFNetException {

        for (Element element = reader.next(); element != null; element = reader.next()) {
            switch (element.getType()) {
                case Element.e_image, Element.e_inline_image -> imageFeaturesOnPage.add(ElementFeatureFactory.buildImage(element));
                case Element.e_form -> {
                    Obj formObj = element.getXObject();
                    // add() returns false for an already visited form, which stops self-referencing forms from looping
                    if (visitedXObjIds.add(formObj.getObjNum())) {
                        reader.formBegin();
                        processElements(reader, imageFeaturesOnPage, visitedXObjIds);
                        reader.end();
                    }
                }
            }
        }
    }

}

View File

@ -1,104 +0,0 @@
package com.iqser.red.pdftronlogic.commons;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatureFactory;
import com.iqser.red.pdftronlogic.commons.features.TextFeatures;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
import com.pdftron.pdf.TextExtractor;
import com.pdftron.sdf.Obj;
import lombok.experimental.UtilityClass;
/**
 * Extracts plain text or per-glyph text features from a PDF.
 */
@UtilityClass
public class PdfTextExtraction {

    /**
     * Extracts the text of every page and joins the pages with a newline.
     * <p>
     * The document is always closed by this method — also when extraction fails — and it is closed only after
     * the page iterator and the text extractor working on it have been closed.
     */
    private static String execute(PDFDoc pdfDoc) throws PDFNetException {

        // resources are closed in reverse order: extractor, iterator, then the document
        try (PDFDoc doc = pdfDoc; PageIterator iterator = doc.getPageIterator(); TextExtractor extractor = new TextExtractor()) {
            List<String> texts = new ArrayList<>();
            while (iterator.hasNext()) {
                Page page = iterator.next();
                extractor.begin(page);
                texts.add(extractor.getAsText());
            }
            return String.join("\n", texts);
        }
    }


    /**
     * Opens the PDF from the stream and returns the text of all pages, joined by newlines.
     *
     * @throws IOException     declared for reading the stream
     * @throws PDFNetException when PDFNet fails to open or read the document
     */
    public static String extractAllTextFromDocument(InputStream fileStream) throws IOException, PDFNetException {

        PDFDoc pdfDoc = new PDFDoc(fileStream);
        return execute(pdfDoc);
    }


    /**
     * Returns the text of all pages, joined by newlines. Note that the passed document is closed afterwards.
     *
     * @throws IOException     declared for signature compatibility
     * @throws PDFNetException when PDFNet fails to read the document
     */
    public static String extractAllTextFromDocument(PDFDoc pdfDoc) throws IOException, PDFNetException {

        return execute(pdfDoc);
    }


    /**
     * Builds {@link TextFeatures} for every text element of the PDF, including text nested in form XObjects.
     *
     * @param fileStream      PDF content
     * @param includePathData passed to the feature factory both as "include glyphs" and as "precompute path data"
     * @return one list per page, in page-iterator order
     */
    public static List<List<TextFeatures>> extractAllGlyphsFromDocument(InputStream fileStream, boolean includePathData) throws IOException, PDFNetException {

        // the page iterator is closed together with the document and the reader
        try (PDFDoc pdfDoc = new PDFDoc(fileStream); ElementReader reader = new ElementReader(); PageIterator iter = pdfDoc.getPageIterator()) {
            List<List<TextFeatures>> glyphsPerPages = new ArrayList<>(pdfDoc.getPageCount());

            while (iter.hasNext()) {
                Page page = iter.next();

                // the visited set is per page; it is seeded with the page's own object number
                Set<Long> visitedXObjIds = new HashSet<>();
                visitedXObjIds.add(page.getSDFObj().getObjNum());

                List<TextFeatures> textFeatures = new LinkedList<>();
                reader.begin(page);
                processElements(reader, textFeatures, visitedXObjIds, includePathData);
                reader.end();

                glyphsPerPages.add(textFeatures);
            }
            return glyphsPerPages;
        }
    }


    /**
     * Consumes the reader's remaining elements: text is recorded, forms are entered once per object number,
     * everything else is ignored.
     */
    private static void processElements(ElementReader reader, List<TextFeatures> textFeaturesOnPage, Set<Long> visitedXObjIds, boolean includePathData) throws PDFNetException {

        for (Element element = reader.next(); element != null; element = reader.next()) {
            switch (element.getType()) {
                case Element.e_text -> textFeaturesOnPage.add(ElementFeatureFactory.buildText(element, includePathData, includePathData));
                case Element.e_form -> {
                    Obj formObj = element.getXObject();
                    // add() returns false for an already visited form, which stops self-referencing forms from looping
                    if (visitedXObjIds.add(formObj.getObjNum())) {
                        reader.formBegin();
                        processElements(reader, textFeaturesOnPage, visitedXObjIds, includePathData);
                        reader.end();
                    }
                }
            }
        }
    }

}

View File

@ -1,91 +0,0 @@
package com.iqser.red.pdftronlogic.commons;
import java.awt.Color;
import java.awt.geom.Rectangle2D;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
import com.iqser.red.pdftronlogic.commons.features.GlyphInfo;
import com.iqser.red.pdftronlogic.commons.features.TextFeatures;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.ColorPt;
import com.pdftron.pdf.ColorSpace;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementBuilder;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.PathData;
import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;
/**
 * Drawing helpers that write colored paths and rectangles through an {@link ElementWriter}.
 */
@UtilityClass
public class VisualizationUtils {

    /**
     * Draws the bounding box of the features; for text features the path data of every glyph that has some is
     * drawn first.
     */
    @SneakyThrows
    public static void drawFeature(ElementWriter writer, ElementFeatures features, Color color) {

        try (ElementBuilder builder = new ElementBuilder()) {
            if (features instanceof TextFeatures textFeatures) {
                for (GlyphInfo glyph : textFeatures.getGlyphs()) {
                    var glyphPath = glyph.getPathData();
                    if (glyphPath.isPresent()) {
                        drawPathData(glyphPath.get(), builder, writer, color);
                    }
                }
            }
            drawRect(features.getBoundingBox(), builder, writer, color);
        }
    }


    /**
     * Writes the path data as a filled, unstroked path (winding fill) in the given RGB color.
     */
    public static void drawPathData(PathData pathData, ElementBuilder builder, ElementWriter writer, Color color) throws PDFNetException {

        Element glyphPath = builder.createPath(pathData.getPoints(), pathData.getOperators());
        glyphPath.setPathFill(true);
        glyphPath.setPathStroke(false);
        glyphPath.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
        float[] rgb = color.getColorComponents(null);
        try (ColorPt fillColor = new ColorPt(rgb[0], rgb[1], rgb[2])) {
            glyphPath.getGState().setFillColor(fillColor);
        }
        glyphPath.setWindingFill(true);
        writer.writeElement(glyphPath);
    }


    /**
     * Draws the outline of the rectangle in the given color, without filling it.
     */
    public static void drawRect(Rectangle2D rectangle2D, ElementBuilder builder, ElementWriter writer, Color color) throws PDFNetException {

        drawRect(rectangle2D, builder, writer, color, false);
    }


    /**
     * Draws a rectangle. Without fill the outline is stroked in {@code color} with line width 0.5. With fill the
     * outline is stroked in black with line width 0.1 and a second rectangle, inset by the line width on every
     * side, is filled with {@code color}.
     */
    public static void drawRect(Rectangle2D rectangle2D, ElementBuilder builder, ElementWriter writer, Color color, boolean fill) throws PDFNetException {

        Element outline = builder.createRect(rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight());
        outline.setPathFill(false);
        outline.setPathStroke(true);
        outline.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());

        Color outlineColor = fill ? Color.BLACK : color;
        float[] strokeRgb = outlineColor.getColorComponents(null);
        try (ColorPt strokeColor = new ColorPt(strokeRgb[0], strokeRgb[1], strokeRgb[2])) {
            outline.getGState().setStrokeColor(strokeColor);
        }

        double lineWidth = fill ? 0.1 : 0.5;
        outline.getGState().setLineWidth(lineWidth);
        writer.writeElement(outline);

        if (!fill) {
            return;
        }

        Element interior = builder.createRect(rectangle2D.getX() + lineWidth,
                                              rectangle2D.getY() + lineWidth,
                                              rectangle2D.getWidth() - 2 * lineWidth,
                                              rectangle2D.getHeight() - 2 * lineWidth);
        interior.setPathFill(true);
        interior.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
        float[] fillRgb = color.getColorComponents(null);
        try (ColorPt fillColor = new ColorPt(fillRgb[0], fillRgb[1], fillRgb[2])) {
            interior.getGState().setFillColor(fillColor);
        }
        writer.writeElement(interior);
    }

}

View File

@ -1,423 +0,0 @@
package com.iqser.red.pdftronlogic.commons;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatureFactory;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Slf4j
public class WatermarkRemovalService {
final static double AREA_THRESHOLD = 0.5; // multiplied with page area
final static double OCCURING_ON_PAGES_THRESHOLD_FACTOR = 0.75; // multiplied with number of pages
final static int MIN_PAGES_THRESHOLD = 3;
final static double IMAGE_POSITION_HEIGHT_THRESHOLD = 0.2; // multiplied with page height
final static double IMAGE_POSITION_WIDTH_THRESHOLD = 0.125; // multiplied with page width
final static double TEXT_POSITION_THRESHOLD = 0.15;
final static double MIN_TEXTWATERMARK_HEIGHT_THRESHOLD = 0.125; // multiplied with page height
final static int PAGE_NUMBER_TEXT_SEARCH_THRESHOLD = 5; // stop text based search after 5 pages without watermark
final static double ROTATED_TEXT_THRESHOLD = 12.5; //this is in degrees
static boolean foundTextWatermark = true;
/**
 * Removes watermarks from a PDF and writes the result to {@code out}.
 * <p>
 * Watermarks labelled through an optional content group are always removed. The search for unlabelled watermarks
 * only runs for documents with at least MIN_PAGES_THRESHOLD pages.
 * The following unlabelled watermarks will be found: big XObjects, big images, small images that appear in the
 * middle of the page, and text that passes the rotation check and is big enough compared to the height of the page.
 * First the possible watermarks are detected, then it is checked whether they appear on most pages according to
 * OCCURING_ON_PAGES_THRESHOLD_FACTOR. We use image hashing for the similarity between pictures, and the size and
 * stream size of the XObjects.
 * Detected and confirmed watermarks are not written to the PDF file.
 *
 * @param pdfFile PDF file to remove watermarks from
 * @param out     The OutputStream the final file will be written to
 */
@SneakyThrows
public void removeWatermarks(InputStream pdfFile, OutputStream out) {

    // the flag is static and only ever switched off during detection; without this reset one document
    // without text watermarks would disable text watermark detection for every following document
    foundTextWatermark = true;

    PDFDoc pdfDoc = new PDFDoc(pdfFile);
    try {
        OCGWatermarkRemovalService.removeWatermarks(pdfDoc);

        if (pdfDoc.getPageCount() < MIN_PAGES_THRESHOLD) {
            log.info("Document page count {} is below threshold {}", pdfDoc.getPageCount(), MIN_PAGES_THRESHOLD);
        } else {
            Map<Long, List<ElementFeatures>> formObjectsForPages = findAllFormObjectsAndImages(pdfDoc);
            List<ElementFeatures> watermarkElementFeatures = filterSameFormObjectsOccuringOnMostPages(formObjectsForPages);

            if (!watermarkElementFeatures.isEmpty()) {
                log.info("Watermark found and will be removed!");
                removeAllWatermarks(pdfDoc, watermarkElementFeatures);
            } else {
                log.info("No unlabeled watermark found!");
            }
        }

        try {
            pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
        } catch (Exception e) {
            // save failures have always surfaced as RuntimeException; keep that for callers
            throw new RuntimeException(e);
        }
    } finally {
        // close the document on every path, not only after a save attempt
        pdfDoc.close();
    }
}
/**
 * Collects the watermark candidates of every page.
 *
 * @return candidate features keyed by the object number of the page they were found on
 */
@SneakyThrows
private Map<Long, List<ElementFeatures>> findAllFormObjectsAndImages(PDFDoc pdfDoc) {

    List<ElementFeatures> formObjectsOccuringMoreThanOnceOnAPage = new LinkedList<>();
    Map<Long, List<ElementFeatures>> formObjectsAndImagesForPages = new HashMap<>();
    // shared across pages, so a form's content is only descended into the first time the form is seen
    Set<Long> visitedXObjIds = new TreeSet<>();

    try (ElementReader reader = new ElementReader(); PageIterator iterator = pdfDoc.getPageIterator()) {
        while (iterator.hasNext()) {
            Page page = iterator.next();
            double minAreaCoveringFromPage = AREA_THRESHOLD * page.getPageHeight() * page.getPageWidth();
            List<ElementFeatures> elementFeaturesOnPage = new LinkedList<>();

            reader.begin(page);
            try {
                for (Element element = reader.next(); element != null; element = reader.next()) {
                    processElement(element, visitedXObjIds, elementFeaturesOnPage, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringFromPage, page);
                }
            } finally {
                // finish the page before the reader is started on the next one, as the other readers in this code do
                reader.end();
            }
            formObjectsAndImagesForPages.put(page.getSDFObj().getObjNum(), elementFeaturesOnPage);
        }
    }
    return formObjectsAndImagesForPages;
}
/**
 * Routes one element to the candidate detection for its type (form, image/inline image or text).
 * Elements without a bounding box and elements of any other type are ignored.
 */
private void processElement(Element element,
                            Set<Long> visitedXObjIds,
                            List<ElementFeatures> elementFeaturesLinkedList,
                            List<ElementFeatures> formObjectsOccuringMoreThanOnceOnAPage,
                            double minAreaCoveringPage,
                            Page page) throws PDFNetException {

    try (var bbox = element.getBBox()) {
        if (bbox != null) {
            int type = element.getType();
            if (type == Element.e_form) {
                processXObject(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage, page);
            } else if (type == Element.e_image || type == Element.e_inline_image) {
                processImages(element, elementFeaturesLinkedList, page, minAreaCoveringPage);
            } else if (type == Element.e_text) {
                processText(element, elementFeaturesLinkedList, page);
            }
        }
    }
}
/**
 * Records a text element as watermark candidate when it passes {@code couldTextBeAWatermark} and its bounding
 * box is taller than MIN_TEXTWATERMARK_HEIGHT_THRESHOLD times the page height.
 * On the page whose index equals PAGE_NUMBER_TEXT_SEARCH_THRESHOLD the "continue text search" check runs first.
 */
@SneakyThrows
private void processText(Element element, List<ElementFeatures> elementFeaturesLinkedList, Page page) {

    if (page.getIndex() == PAGE_NUMBER_TEXT_SEARCH_THRESHOLD) {
        shouldTextSearchBeContinued(elementFeaturesLinkedList);
    }

    if (couldTextBeAWatermark(element, page)) {
        try (var bbox = element.getBBox()) {
            double textHeight = Math.abs(bbox.getY1() - bbox.getY2());
            double requiredHeight = page.getPageHeight() * MIN_TEXTWATERMARK_HEIGHT_THRESHOLD;
            if (textHeight > requiredHeight) {
                elementFeaturesLinkedList.add(ElementFeatureFactory.extractFeatures(element));
            }
        }
    }
}
/**
 * Compares the absolute b component of the element's CTM against two bounds.
 *
 * @return true when |b| is below sin(ROTATED_TEXT_THRESHOLD°) or above sin((70 - ROTATED_TEXT_THRESHOLD)°)
 */
// NOTE(review): as written this is true for text with b close to 0 (which would be unrotated text for an
// unscaled matrix) — the name suggests the opposite; also 70 looks like it might have been meant as 90. Confirm.
@SneakyThrows
private boolean isTextRotated(Element element) {

    try (var ctm = element.getCTM()) {
        double lowerBound = Math.sin(Math.toRadians(ROTATED_TEXT_THRESHOLD));
        if (Math.abs(ctm.getB()) < lowerBound) {
            return true;
        }
        double upperBound = Math.sin(Math.toRadians(70 - ROTATED_TEXT_THRESHOLD));
        return Math.abs(ctm.getB()) > upperBound;
    }
}
/**
 * Switches the text watermark search off when fewer than OCCURING_ON_PAGES_THRESHOLD_FACTOR of the given
 * candidates are text elements. The flag is never switched back on here.
 */
private void shouldTextSearchBeContinued(List<ElementFeatures> elementFeaturesLinkedList) {

    long textCandidates = elementFeaturesLinkedList.stream()
            .filter(candidate -> candidate.getElementType() == Element.e_text)
            .count();

    if (textCandidates < elementFeaturesLinkedList.size() * OCCURING_ON_PAGES_THRESHOLD_FACTOR) {
        foundTextWatermark = false;
    }
}
/**
 * Records an image as watermark candidate, together with its image hash.
 * Images without an XObject are skipped, as are images that are both smaller than the minimum area and located
 * near the page border.
 */
@SneakyThrows
private void processImages(Element element, List<ElementFeatures> elementFeaturesLinkedList, Page page, double minAreaCoveringPage) {

    if (element.getXObject() == null) {
        return;
    }

    try (var bbox = element.getBBox()) {
        boolean smallerThanMinArea = bbox.getHeight() * bbox.getWidth() < minAreaCoveringPage;
        if (smallerThanMinArea && isLocatedNearBorder(element, page)) {
            return;
        }
        String imageHash = ImageHashFactory.calculate(element);
        elementFeaturesLinkedList.add(ElementFeatureFactory.buildImageWithHash(element, imageHash));
    }
}
/**
 * Typically company logos on dossier pages are located near the border and should be excluded from the
 * watermark removal.
 *
 * @return true when any edge of the element's bounding box lies within the border margin of the page's visible
 * content box; the margin is IMAGE_POSITION_HEIGHT_THRESHOLD times the page height vertically and
 * IMAGE_POSITION_WIDTH_THRESHOLD times the page width horizontally
 */
@SneakyThrows
private boolean isLocatedNearBorder(Element element, Page page) {

    try (var bbox = element.getBBox(); var contentBox = page.getVisibleContentBox()) {
        if (bbox.getY1() < contentBox.getY1() + page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD) {
            return true;
        }
        if (bbox.getY2() > contentBox.getY2() - page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD) {
            return true;
        }
        if (bbox.getX1() < contentBox.getX1() + page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD) {
            return true;
        }
        return bbox.getX2() > contentBox.getX2() - page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD;
    }
}
/**
 * Records a form XObject as watermark candidate when its bounding box covers at least the minimum area.
 * The first time a form object is seen its content is scanned for nested candidates before the form itself
 * is recorded; later occurrences only record the form.
 */
@SneakyThrows
private void processXObject(Element element,
                            Set<Long> visitedXObjIds,
                            List<ElementFeatures> elementFeaturesLinkedList,
                            List<ElementFeatures> formObjectsOccuringMoreThanOnceOnAPage,
                            double minAreaCoveringPage,
                            Page page) {

    try (var bbox = element.getBBox()) {
        double area = bbox.getHeight() * bbox.getWidth();
        if (area < minAreaCoveringPage) {
            return;
        }
    }

    boolean firstVisit = visitedXObjIds.add(element.getXObject().getObjNum());
    if (!firstVisit) {
        elementFeaturesLinkedList.add(ElementFeatureFactory.extractFeatures(element));
        return;
    }

    try (ElementReader xObjectReader = new ElementReader()) {
        xObjectReader.begin(element.getXObject());
        Element nested;
        while ((nested = xObjectReader.next()) != null) {
            processElement(nested, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage, page);
        }
        elementFeaturesLinkedList.add(ElementFeatureFactory.extractFeatures(element));
    }
}
/**
 * Keeps the candidates that have a counterpart on enough pages.
 * <p>
 * A candidate is kept when the number of pages containing at least one element it matches is at least
 * OCCURING_ON_PAGES_THRESHOLD_FACTOR times the page count (rounded down). Images and inline images are compared
 * with {@code similar}, everything else with {@code matches}. Candidates are not de-duplicated.
 *
 * @param formObjectsPerPage candidate features per page
 * @return unmodifiable list of the confirmed candidates
 */
private List<ElementFeatures> filterSameFormObjectsOccuringOnMostPages(Map<Long, List<ElementFeatures>> formObjectsPerPage) {

    int pageCount = formObjectsPerPage.keySet().size();
    int minPagesFilter = (int) (OCCURING_ON_PAGES_THRESHOLD_FACTOR * pageCount);

    List<ElementFeatures> confirmed = new LinkedList<>();
    for (List<ElementFeatures> candidatesOnPage : formObjectsPerPage.values()) {
        for (ElementFeatures candidate : candidatesOnPage) {
            long pagesWithCounterpart = 0;
            for (List<ElementFeatures> elementFeaturesOnPage : formObjectsPerPage.values()) {
                boolean compareAsImage = candidate.getElementType() == Element.e_image || candidate.getElementType() == Element.e_inline_image;
                for (ElementFeatures other : elementFeaturesOnPage) {
                    boolean hit = compareAsImage ? candidate.similar(other) : candidate.matches(other);
                    if (hit) {
                        // one counterpart per page is enough
                        pagesWithCounterpart++;
                        break;
                    }
                }
            }
            if (pagesWithCounterpart >= minPagesFilter) {
                confirmed.add(candidate);
            }
        }
    }
    return List.copyOf(confirmed);
}
/**
 * Rewrites every page of the document without the confirmed watermark elements, sharing one reader, one writer
 * and one set of already processed form ids across all pages.
 */
@SneakyThrows
private void removeAllWatermarks(PDFDoc pdfDoc, List<ElementFeatures> watermarksElementFeaturesList) {

    try (PageIterator pages = pdfDoc.getPageIterator(); ElementReader reader = new ElementReader(); ElementWriter writer = new ElementWriter()) {
        Set<Long> visitedXObjIds = new TreeSet<>();
        while (pages.hasNext()) {
            Page currentPage = pages.next();
            writeAllElementsExceptWatermarks(currentPage, reader, writer, watermarksElementFeaturesList, visitedXObjIds);
        }
    }
}
/**
 * Replaces the content of one page with its elements minus the watermark elements.
 * The writer is started in replacement mode with the page's own resource dictionary.
 */
@SneakyThrows
private void writeAllElementsExceptWatermarks(Page page,
                                              ElementReader reader,
                                              ElementWriter writer,
                                              List<ElementFeatures> watermarksElementFeaturesList,
                                              Set<Long> visitedXObjIds) {

    var pageResources = page.getResourceDict();

    reader.begin(page);
    writer.begin(page, ElementWriter.e_replacement, false, true, pageResources);

    processElements(page, reader, writer, watermarksElementFeaturesList, visitedXObjIds);

    writer.end();
    reader.end();
}
/**
 * Copies the reader's remaining elements to the writer, leaving out confirmed watermarks.
 * <p>
 * Images are written unchecked when they have no bounding box, when they are smaller than the minimum area and
 * near the border, or when they have no XObject; all other images go through the watermark comparison.
 * Forms and text have their own handling, every other element type is written as is.
 */
private void processElements(Page page,
                             ElementReader reader,
                             ElementWriter writer,
                             List<ElementFeatures> watermarksElementFeaturesList,
                             Set<Long> visitedXObjIds) throws PDFNetException {

    double minAreaCoveringFromPage = AREA_THRESHOLD * page.getPageHeight() * page.getPageWidth();

    Element element;
    while ((element = reader.next()) != null) {
        switch (element.getType()) {
            case Element.e_image, Element.e_inline_image -> {
                boolean writeUnchecked;
                try (var bbox = element.getBBox()) {
                    if (bbox == null) {
                        writeUnchecked = true;
                    } else {
                        boolean smallerThanMinArea = bbox.getHeight() * bbox.getWidth() < minAreaCoveringFromPage;
                        writeUnchecked = smallerThanMinArea && isLocatedNearBorder(element, page) || element.getXObject() == null;
                    }
                }
                if (writeUnchecked) {
                    writer.writeElement(element);
                } else {
                    removeImages(element, writer, watermarksElementFeaturesList);
                }
            }
            case Element.e_form -> processForms(page, element, reader, writer, watermarksElementFeaturesList, visitedXObjIds);
            case Element.e_text -> processText(element, writer, watermarksElementFeaturesList, page);
            default -> writer.writeElement(element);
        }
    }
}
/**
 * Writes a text element unless it matches one of the confirmed watermarks.
 * Text that cannot be a watermark according to {@code couldTextBeAWatermark} is written without comparison.
 *
 * @param element                       text element read from the content stream
 * @param writer                        writer receiving the element when it is kept
 * @param watermarksElementFeaturesList confirmed watermark features
 * @param page                          page the element belongs to
 */
@SneakyThrows
private void processText(Element element, ElementWriter writer, List<ElementFeatures> watermarksElementFeaturesList, Page page) {

    if (couldTextBeAWatermark(element, page) && !watermarksElementFeaturesList.isEmpty()) {
        // extract once instead of once per watermark; the features of the element do not change inside the loop
        ElementFeatures textFeatures = ElementFeatureFactory.extractFeatures(element);
        for (ElementFeatures watermark : watermarksElementFeaturesList) {
            if (watermark.matches(textFeatures)) {
                return;
            }
        }
    }
    writer.writeElement(element);
}
/**
 * Cheap pre-check for text elements.
 *
 * @return false when the text search has been switched off, when {@code isTextRotated} is true for the element,
 * or when the upper edge of the element's bounding box lies below the visible content's lower edge plus
 * TEXT_POSITION_THRESHOLD times the page height; true otherwise
 */
private boolean couldTextBeAWatermark(Element element, Page page) throws PDFNetException {

    if (!foundTextWatermark || isTextRotated(element)) {
        return false;
    }

    try (var bbox = element.getBBox(); var contents = page.getVisibleContentBox()) {
        double upperEdge = Math.max(bbox.getY1(), bbox.getY2());
        double lowestAllowed = contents.getY1() + page.getPageHeight() * TEXT_POSITION_THRESHOLD;
        boolean tooCloseToBottom = upperEdge < lowestAllowed;
        return !tooCloseToBottom;
    }
}
/**
 * Writes an image unless it is similar to one of the confirmed watermarks; the comparison uses the image's
 * features including its image hash.
 */
@SneakyThrows
private void removeImages(Element element, ElementWriter writer, List<ElementFeatures> watermarksElementFeaturesList) {

    String imageHash = ImageHashFactory.calculate(element);
    ElementFeatures candidate = ElementFeatureFactory.buildImageWithHash(element, imageHash);

    boolean isWatermark = watermarksElementFeaturesList.stream()
            .anyMatch(watermark -> watermark.similar(candidate));

    if (!isWatermark) {
        writer.writeElement(element);
    }
}
/**
 * Handles a form XObject while rewriting a content stream.
 * <p>
 * A form matching one of the confirmed watermarks is dropped. Any other form is written, and the first time its
 * object is seen its own content stream is rewritten without watermarks as well.
 *
 * @param page                          page currently being rewritten
 * @param element                       form element read from the content stream
 * @param reader                        reader positioned on the form element
 * @param writer                        writer of the enclosing content stream
 * @param watermarksElementFeaturesList confirmed watermark features
 * @param visitedXObjIds                object numbers of forms whose content has already been rewritten
 * @throws PDFNetException when PDFNet fails to read or write an element
 */
private void processForms(Page page,
                          Element element,
                          ElementReader reader,
                          ElementWriter writer,
                          List<ElementFeatures> watermarksElementFeaturesList,
                          Set<Long> visitedXObjIds) throws PDFNetException {

    if (!watermarksElementFeaturesList.isEmpty()) {
        // extract once instead of once per watermark; the features of the element do not change inside the loop
        ElementFeatures formFeatures = ElementFeatureFactory.extractFeatures(element);
        for (ElementFeatures watermark : watermarksElementFeaturesList) {
            if (watermark.matches(formFeatures)) {
                return;
            }
        }
    }

    writer.writeElement(element);

    if (visitedXObjIds.add(element.getXObject().getObjNum())) {
        // writer needs to be newly initialized when entering a new content stream
        // see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
        try (ElementWriter formWriter = new ElementWriter()) {
            reader.formBegin();
            formWriter.begin(element.getXObject());

            reader.clearChangeList();
            formWriter.setDefaultGState(reader);

            processElements(page, reader, formWriter, watermarksElementFeaturesList, visitedXObjIds);
            formWriter.end();
            reader.end();
        }
    }
}
}

View File

@ -1,197 +0,0 @@
package com.iqser.red.pdftronlogic.commons.features;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import com.iqser.red.pdftronlogic.commons.Converter;
import com.pdftron.common.Matrix2D;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.CharData;
import com.pdftron.pdf.CharIterator;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.Font;
import com.pdftron.pdf.GState;
import com.pdftron.pdf.Image;
import com.pdftron.sdf.Obj;
import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;
@UtilityClass
public class ElementFeatureFactory {
/**
 * Builds the feature object for a path, text, image, inline image or form element.
 * Text features are built without glyph information.
 *
 * @throws RuntimeException for any other element type
 */
public ElementFeatures extractFeatures(Element element) throws PDFNetException {

    int type = element.getType();
    if (type == Element.e_path) {
        return buildPath(element);
    }
    if (type == Element.e_text) {
        return buildText(element, false, false);
    }
    if (type == Element.e_image || type == Element.e_inline_image) {
        return buildImage(element);
    }
    if (type == Element.e_form) {
        return buildForm(element);
    }
    // This technically should never happen, it's a safetynet
    throw new RuntimeException("Feature Extraction is not supported for PDFTron.Element with type: " + element.getType());
}
/**
 * Builds image features like {@code buildImage} and additionally stores the given image hash.
 */
public ImageFeatures buildImageWithHash(Element element, String hashObject) throws PDFNetException {

    var hashedBuilder = buildImageBase(element).hashOfImage(hashObject);
    return hashedBuilder.build();
}
/**
 * Builds the features of an image or inline image element, without an image hash.
 */
public ImageFeatures buildImage(Element element) throws PDFNetException {

    var imageBuilder = buildImageBase(element);
    return imageBuilder.build();
}
/**
 * Builds the features of a form element: element type, bounding box, the type of its XObject and, for
 * XObjects of type 7, the size of the decoded stream (0 otherwise).
 */
public FormFeatures buildForm(Element element) throws PDFNetException {

    try (var bbox = element.getBBox()) {
        var formBuilder = FormFeatures.builder()
                .elementType(element.getType())
                .boundingBox(Converter.toRectangle2D(bbox))
                .xObjectType(element.getXObject().getType());

        // 7 is presumably Obj.e_stream — TODO confirm and replace the literal
        boolean hasStream = element.getXObject().getType() == 7;
        return formBuilder.dictOrArrayOrStreamLength(hasStream ? element.getXObject().getDecodedStream().size() : 0)
                .build();
    }
}
/**
 * Prepares a builder carrying everything known about an image or inline image element except the image hash.
 * <p>
 * {@code transparent} is set when the graphics state uses a non-normal blend mode or a fill/stroke opacity
 * below 1. {@code masked} is set for image XObjects whose mask is a stream that is itself an image mask.
 *
 * @param element image or inline image element
 * @return builder the caller can extend and build
 * @throws PDFNetException when PDFNet fails to read the element
 */
private ImageFeatures.ImageFeaturesBuilder<?, ?> buildImageBase(Element element) throws PDFNetException {

    assert element.getType() == Element.e_image || element.getType() == Element.e_inline_image;

    try (var bbox = element.getBBox()) {
        // opacity ranges from 0 to 1, so "not fully opaque" means below 1; the previous "> 1" could never be true
        boolean transparent = element.getGState().getBlendMode() != GState.e_bl_normal
                              || element.getGState().getFillOpacity() < 1
                              || element.getGState().getStrokeOpacity() < 1;

        // see spec: 8.9.6.3 Explicit masking
        boolean masked = false;
        if (element.getType() == Element.e_image) {
            Image image = new Image(element.getXObject());
            if (image.getMask() != null && image.getMask().getType() == Obj.e_stream) {
                Image imageMask = new Image(image.getMask());
                masked = imageMask.isImageMask();
            }
        }

        return ImageFeatures.builder()
                .elementType(element.getType())
                .boundingBox(Converter.toRectangle2D(bbox))
                .dataSize(element.getImageDataSize())
                .height(element.getImageHeight())
                .width(element.getImageWidth())
                .renderingIntent(element.getImageRenderingIntent())
                .componentNum(element.getComponentNum())
                .bitsPerComponent(element.getBitsPerComponent())
                .imageMask(element.isImageMask())
                .softMask(element.getGState().getSoftMask() != null)
                .masked(masked)
                .transparent(transparent);
    }
}
/**
 * Builds the features of a text element: element type, bounding box, text, font type and font size.
 * <p>
 * Use includeGlyphs = true and preComputePathData = true when trying to draw the glyphs, see GlyphExtractionTest.
 * preComputePathData = true is needed when the PathData is accessed after the PDFDoc/ElementReader has been closed.
 *
 * @param includeGlyphs      also extract per-glyph information
 * @param preComputePathData passed on to the glyph extraction; only relevant together with includeGlyphs
 */
public TextFeatures buildText(Element element, boolean includeGlyphs, boolean preComputePathData) throws PDFNetException {

    try (var bbox = element.getBBox()) {
        TextFeatures.TextFeaturesBuilder<?, ?> textBuilder = TextFeatures.builder()
                .elementType(element.getType())
                .boundingBox(Converter.toRectangle2D(bbox))
                .text(element.getTextString())
                .font(element.getGState().getFont().getType())
                .fontsize(element.getGState().getFontSize());

        if (!includeGlyphs) {
            return textBuilder.build();
        }
        textBuilder.glyphs(extractGlyphInfo(element, preComputePathData));
        return textBuilder.build();
    }
}
/**
 * Builds the features of a path element: bounding box, clipping/stroke/fill flags, converted fill and stroke
 * colors, and the path geometry converted with the element's CTM.
 */
public PathFeatures buildPath(Element element) throws PDFNetException {

    try (var bbox = element.getBBox();
         var ctm = element.getCTM();
         var fillColor = element.getGState().getFillColor();
         var strokeColor = element.getGState().getStrokeColor()) {

        var pathBuilder = PathFeatures.builder()
                .elementType(element.getType())
                .boundingBox(Converter.toRectangle2D(bbox));

        // flags describing how the path is painted
        pathBuilder.clippingPath(element.isClippingPath())
                .clipWindingFill(element.isClipWindingFill())
                .stroked(element.isStroked())
                .filled(element.isFilled())
                .windingFill(element.isWindingFill());

        // colors and geometry
        pathBuilder.fillColor(Converter.convertColor(element.getGState().getFillColorSpace(), fillColor))
                .strokeColor(Converter.convertColor(element.getGState().getStrokeColorSpace(), strokeColor))
                .linePath(Converter.convertToGeneralPathAndTransformToInitialUserSpace(element.getPathData(), ctm));

        return pathBuilder.build();
    }
}
/**
 * Builds one {@link GlyphInfo} per character of a text element.
 * <p>
 * Each glyph gets the product of the element's CTM, its text matrix and a per-character font matrix.
 * Elements without a bounding box and elements using a Type 3 font yield an empty list.
 *
 * @param textElement        text element to read the characters from
 * @param precomputePathData when true, the bounding box of every glyph is requested right away so its data is
 *                           cached while the element is still readable
 * @return glyph information in character order
 */
@SneakyThrows
private List<GlyphInfo> extractGlyphInfo(Element textElement, boolean precomputePathData) {

    assert textElement != null && textElement.getType() == Element.e_text;

    // the box is only needed for the null check; release it right away
    try (var bbox = textElement.getBBox()) {
        if (bbox == null) {
            return Collections.emptyList();
        }
    }

    Font font = textElement.getGState().getFont();

    if (font.getType() == Font.e_Type3) {
        // type 3 fonts seem to be much more difficult, one must use font.getType3GlyphStream and font.getType3FontMatrix instead
        // couldn't find much information except this post https://groups.google.com/g/pdfnet-sdk/c/SvhMflbtQho
        // will implement this when necessary
        return Collections.emptyList();
    }

    List<GlyphInfo> glyphs = new ArrayList<>();
    short unitsPerEm = font.getUnitsPerEm();

    // the two factor matrices are only needed to compute the product, so they are closed with it
    try (Matrix2D elementCtm = textElement.getCTM();
         Matrix2D textMatrix = textElement.getTextMatrix();
         Matrix2D ctm = elementCtm.multiply(textMatrix);
         CharIterator charIterator = textElement.getCharIterator()) {

        while (charIterator.hasNext()) {
            CharData charData = charIterator.next();
            long charCode = charData.getCharCode();

            try (Matrix2D fontMatrix = computeFontMatrix(charData, textElement, unitsPerEm)) {
                GlyphInfo glyph = GlyphInfo.builder() //
                        .charCode(charCode) //
                        .cachePathData(precomputePathData) //
                        .glyphMatrix(ctm.multiply(fontMatrix)) //
                        .font(font) //
                        .build();
                glyphs.add(glyph);

                if (precomputePathData) {
                    // call the functions once to cache all data
                    glyph.getBoundingBox();
                }
            }
        }
    }
    return glyphs;
}
/**
 * Creates the matrix that scales a glyph from font units to the element's font size and
 * translates it to the glyph position reported by {@code charData}. The y axis is negated.
 *
 * @param charData    supplies the glyph's x/y position
 * @param textElement supplies font size and horizontal scale (in percent) via its graphics state
 * @param unitsPerEm  the font's units per em
 * @return a new matrix; the caller is responsible for closing it
 * @throws PDFNetException if reading from the element fails
 */
private Matrix2D computeFontMatrix(CharData charData, Element textElement, short unitsPerEm) throws PDFNetException {
    var graphicsState = textElement.getGState();
    double verticalScale = graphicsState.getFontSize() / unitsPerEm;
    double horizontalScale = graphicsState.getHorizontalScale() / 100 * verticalScale;
    double translateX = charData.getGlyphX();
    double translateY = charData.getGlyphY();
    return new Matrix2D(horizontalScale, 0, 0, -verticalScale, translateX, translateY);
}
}

View File

@ -1,102 +0,0 @@
package com.iqser.red.pdftronlogic.commons.features;
import static com.iqser.red.pdftronlogic.commons.ComparisonUtils.almostEqual;
import java.awt.Shape;
import java.awt.geom.Rectangle2D;
import com.iqser.red.pdftronlogic.commons.ComparisonUtils;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
/**
 * Base description of a page element: its element type and its bounding box.
 * Subclasses add type-specific attributes and refine {@link #matches}, {@link #similar},
 * {@link #getOverlapShape()} and {@link #destroy()}.
 */
@Getter
@SuperBuilder
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ElementFeatures {

    // specify how much the x and y value are allowed to differ, relative to the box size
    private static final double RECT_POSITION_SIMILARITY_THRESHOLD_FACTOR = 0.2;
    // the scale the sizes are allowed to differ
    private static final double RECT_SIZE_SIMILARITY_THRESHOLD_FACTOR = 0.1;

    int elementType;
    Rectangle2D boundingBox;


    /**
     * @return true if the other element has the same type and a non-null bounding box that is
     * almost equal to this one
     */
    public boolean matches(ElementFeatures elementFeatures) {

        if (elementFeatures.getElementType() != elementType) {
            return false;
        }
        Rectangle2D otherBox = elementFeatures.getBoundingBox();
        return otherBox != null && bboxMatches(otherBox);
    }


    /**
     * Compares position and size of the given box with this element's box using
     * {@code ComparisonUtils.almostEqual}, to address the inconsistencies in the calculation
     * of the bounding box.
     */
    @SneakyThrows
    protected boolean bboxMatches(Rectangle2D bBox) {

        boolean sameOrigin = almostEqual(bBox.getX(), boundingBox.getX()) && almostEqual(bBox.getY(), boundingBox.getY());
        return sameOrigin //
                && almostEqual(bBox.getWidth(), boundingBox.getWidth()) //
                && almostEqual(bBox.getHeight(), boundingBox.getHeight());
    }


    /**
     * @return the shape used to decide whether this element covers another one; the bounding box here
     */
    public Shape getOverlapShape() {

        return boundingBox;
    }


    /**
     * Looser variant of {@link #matches}: same type and a non-null bounding box whose position
     * and size lie within the similarity thresholds.
     */
    public boolean similar(ElementFeatures elementFeatures) {

        if (elementFeatures.getElementType() != elementType) {
            return false;
        }
        Rectangle2D otherBox = elementFeatures.getBoundingBox();
        return otherBox != null && areRectsSimilar(otherBox);
    }


    /**
     * Checks x, y, width and height of the given rectangle against this element's box with a
     * tolerance. Position tolerances are relative to the given rectangle's width/height.
     */
    protected boolean areRectsSimilar(Rectangle2D rectangle2D) {

        boolean positionSimilar = isPositionSimilar(rectangle2D.getX(), boundingBox.getX(), rectangle2D.getWidth()) //
                && isPositionSimilar(rectangle2D.getY(), boundingBox.getY(), rectangle2D.getHeight());
        return positionSimilar //
                && isSizeSimilar(rectangle2D.getWidth(), boundingBox.getWidth()) //
                && isSizeSimilar(rectangle2D.getHeight(), boundingBox.getHeight());
    }


    /**
     * @return true if {@code a} and {@code b} differ by less than the position threshold factor times {@code boxSize}
     */
    protected boolean isPositionSimilar(double a, double b, double boxSize) {

        double tolerance = boxSize * RECT_POSITION_SIMILARITY_THRESHOLD_FACTOR;
        return Math.abs(a - b) < tolerance;
    }


    /**
     * @return true if {@code a} and {@code b} differ by less than the size threshold factor times {@code a}
     */
    protected boolean isSizeSimilar(double a, double b) {

        double tolerance = a * RECT_SIZE_SIMILARITY_THRESHOLD_FACTOR;
        return Math.abs(a - b) < tolerance;
    }


    /**
     * @return true if the given element's shrunken bounding box lies inside this element's overlap shape
     */
    public boolean contains(ElementFeatures features) {

        return features.containedBy(this);
    }


    /**
     * @return true if this element's shrunken bounding box lies inside the overlap shape of {@code overlappingElement}
     */
    public boolean testOverlapped(ElementFeatures overlappingElement) {

        return containedBy(overlappingElement);
    }


    private boolean containedBy(ElementFeatures features) {

        return features.getOverlapShape().contains(ComparisonUtils.shrinkRectangle(boundingBox));
    }


    /**
     * Releases resources held by this element. Nothing to release here; text features override this.
     */
    public void destroy() {
        // do nothing, except for text
    }

}

View File

@ -1,45 +0,0 @@
package com.iqser.red.pdftronlogic.commons.features;
import static com.iqser.red.pdftronlogic.commons.ComparisonUtils.almostEqual;
import java.awt.geom.Rectangle2D;
import lombok.AccessLevel;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
@EqualsAndHashCode(callSuper = true)
@Getter
@SuperBuilder
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class FormFeatures extends ElementFeatures {
int xObjectType;
long dictOrArrayOrStreamLength;
public boolean matches(ElementFeatures elementFeatures) {
if (elementFeatures instanceof FormFeatures features) {
return elementFeatures.getElementType() == getElementType()
&& elementFeatures.getBoundingBox() != null
&& (super.bboxMatches(elementFeatures.getBoundingBox())
|| rotationMatches(elementFeatures.getBoundingBox()
.getBounds2D()))
&& xObjectType == features.getXObjectType()
&& dictOrArrayOrStreamLength == features.getDictOrArrayOrStreamLength();
}
return false;
}
private boolean rotationMatches(Rectangle2D bBox) {
return almostEqual(bBox.getWidth(), getBoundingBox().getHeight()) && //
almostEqual(bBox.getHeight(), getBoundingBox().getWidth());
}
}

View File

@ -1,116 +0,0 @@
package com.iqser.red.pdftronlogic.commons.features;
import java.awt.geom.Rectangle2D;
import java.util.Optional;
import com.iqser.red.pdftronlogic.commons.ComparisonUtils;
import com.iqser.red.pdftronlogic.commons.Converter;
import com.iqser.red.pdftronlogic.commons.PDFNetUtils;
import com.pdftron.common.Matrix2D;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.Font;
import com.pdftron.pdf.PathData;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
/**
 * A single glyph of a text element: its character code, its font, and the matrix that places
 * the glyph outline on the page. Path data and bounding box are computed lazily.
 */
@Slf4j
@Builder
@AllArgsConstructor
@RequiredArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class GlyphInfo {

    final Matrix2D glyphMatrix;
    final long charCode;
    final Font font;

    // in order to speed up invisible element removal, we only calculate the pathdata where necessary, as it is the costliest operation.
    // It will only work as long as the associated ElementReader is still open, as the Font is bound to the ContentStream being read.
    Rectangle2D bbox;
    final boolean cachePathData;
    PathData pathData;

    boolean overlapped;
    ElementFeatures overlappingElement;


    /**
     * Tests whether this glyph is covered by the given element and remembers a positive result.
     * A glyph without a bounding box (no usable path data) is reported as overlapped, but that
     * result is not remembered.
     *
     * @param overlappingElement the element that may cover this glyph
     * @return true if the glyph was already marked as overlapped, has no bounding box, or its
     * shrunken bounding box lies inside the element's overlap shape
     */
    public boolean testOverlapped(ElementFeatures overlappingElement) {

        if (overlapped) {
            return true;
        }
        Optional<Rectangle2D> glyphBox = getBoundingBox();
        if (glyphBox.isEmpty()) {
            return true;
        }
        if (overlappingElement.getOverlapShape().contains(ComparisonUtils.shrinkRectangle(glyphBox.get()))) {
            overlapped = true;
            this.overlappingElement = overlappingElement;
        }
        return overlapped;
    }


    /**
     * @return the unicode string the font maps this glyph's char code to, or an empty string
     * if the mapping fails
     */
    public String getUnicode() {

        try {
            return new String(font.mapToUnicode(charCode));
        } catch (PDFNetException e) {
            // best effort: callers get an empty string, but the failure is no longer invisible
            log.debug("Could not map char code {} to unicode", charCode, e);
            return "";
        }
    }


    /**
     * Returns the glyph outline transformed by the glyph matrix. The outline is computed from
     * the font on demand and kept only when {@code cachePathData} is set.
     *
     * @return the path data, or empty if the outline consists of a single operator with value 6
     */
    @SneakyThrows
    public Optional<PathData> getPathData() {

        if (pathData != null) {
            return Optional.of(pathData);
        }

        PDFNetUtils.requireFontNotClosed(font);
        PathData computedPathData = font.getGlyphPath(charCode, true, glyphMatrix);

        var operators = computedPathData.getOperators();
        // NOTE(review): 6 is presumably the close-path operator code - confirm against PathData
        if (operators.length == 1 && operators[0] == 6) {
            // This happens for some chinese characters or whitespaces, don't know why...
            return Optional.empty();
        }

        if (cachePathData) {
            pathData = computedPathData;
        }
        return Optional.of(computedPathData);
    }


    /**
     * Returns the bounds of the glyph outline. Once computed, the box is always kept,
     * independent of {@code cachePathData}.
     *
     * @return the bounding box, or empty if no path data is available
     */
    @SneakyThrows
    public Optional<Rectangle2D> getBoundingBox() {

        if (bbox == null) {
            Optional<PathData> glyphPath = getPathData();
            if (glyphPath.isEmpty()) {
                return Optional.empty();
            }
            bbox = Converter.convertToGeneralPath(glyphPath.get()).getBounds2D();
        }
        return Optional.of(bbox);
    }


    /**
     * Closes the glyph matrix, if there is one.
     */
    @SneakyThrows
    public void destroy() {

        if (glyphMatrix != null) {
            glyphMatrix.close();
        }
    }

}

View File

@ -1,75 +0,0 @@
package com.iqser.red.pdftronlogic.commons.features;
import lombok.AccessLevel;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
/**
 * Features of an image element: the base features plus pixel dimensions, data size, color
 * configuration, mask flags and a hash of the image content.
 */
@EqualsAndHashCode(callSuper = true)
@Getter
@SuperBuilder
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ImageFeatures extends ElementFeatures {

    // defines the similarity of the hash of images: maximum number of differing hash characters
    private static final double HAMMING_DISTANCE_THRESHOLD = 4;

    int dataSize;
    int height;
    int width;
    int renderingIntent;
    int componentNum;
    int bitsPerComponent;
    boolean imageMask;
    boolean softMask;
    boolean masked;
    boolean transparent;
    String hashOfImage;


    /**
     * Two images match when the base features match, all image attributes compared below are
     * identical and the image hashes differ in at most {@code HAMMING_DISTANCE_THRESHOLD}
     * positions. Anything that is not an {@link ImageFeatures} never matches.
     */
    @Override
    public boolean matches(ElementFeatures elementFeatures) {

        if (elementFeatures instanceof ImageFeatures imageFeatures) {
            // NOTE(review): 'masked' is not part of this comparison - confirm that is intended
            return super.matches(elementFeatures)
                    && this.dataSize == imageFeatures.getDataSize()
                    && this.height == imageFeatures.getHeight()
                    && this.width == imageFeatures.getWidth()
                    && this.renderingIntent == imageFeatures.getRenderingIntent()
                    && this.componentNum == imageFeatures.getComponentNum()
                    && this.bitsPerComponent == imageFeatures.getBitsPerComponent()
                    && this.imageMask == imageFeatures.isImageMask()
                    && this.softMask == imageFeatures.isSoftMask()
                    && this.transparent == imageFeatures.isTransparent()
                    && calculateHammingDistance(imageFeatures.getHashOfImage()) <= HAMMING_DISTANCE_THRESHOLD;
        }
        return false;
    }


    /**
     * Looser variant of {@link #matches}: the base similarity check plus the hash comparison.
     * Anything that is not an {@link ImageFeatures} is never similar (instead of failing with a
     * ClassCastException).
     */
    @Override
    public boolean similar(ElementFeatures elementFeatures) {

        return elementFeatures instanceof ImageFeatures imageFeatures //
                && super.similar(elementFeatures) //
                && calculateHammingDistance(imageFeatures.getHashOfImage()) <= HAMMING_DISTANCE_THRESHOLD;
    }


    /**
     * Counts the positions at which this image's hash string and the given one differ; the
     * shorter string is treated as padded with '0'. If either hash is missing, the distance is
     * 0, i.e. a missing hash never prevents a match.
     */
    private int calculateHammingDistance(String hash2) {

        if (this.hashOfImage == null || hash2 == null) {
            return 0;
        }

        int distance = 0;
        int maxLength = Math.max(this.hashOfImage.length(), hash2.length());

        for (int i = 0; i < maxLength; i++) {
            char char1 = i < this.hashOfImage.length() ? this.hashOfImage.charAt(i) : '0';
            char char2 = i < hash2.length() ? hash2.charAt(i) : '0';
            if (char1 != char2) {
                distance++;
            }
        }
        return distance;
    }

}

View File

@ -1,51 +0,0 @@
package com.iqser.red.pdftronlogic.commons.features;
import java.awt.Color;
import java.awt.Shape;
import java.awt.geom.GeneralPath;
import lombok.AccessLevel;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
/**
 * Features of a path element: the base features plus clipping/fill/stroke flags, colors and
 * the path geometry.
 */
@Getter
@SuperBuilder
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
@EqualsAndHashCode(callSuper = true)
public class PathFeatures extends ElementFeatures {

    boolean clippingPath;
    boolean clipWindingFill;
    boolean stroked;
    boolean filled;
    boolean windingFill;
    Color strokeColor;
    Color fillColor;
    GeneralPath linePath;


    /**
     * Two paths match when the base features match and all five boolean flags are identical.
     * Colors and geometry are not compared. Anything that is not a {@link PathFeatures} never matches.
     */
    @Override
    public boolean matches(ElementFeatures element) {

        if (!(element instanceof PathFeatures pathFeaturesElement)) {
            return false;
        }
        if (!super.matches(element)) {
            return false;
        }
        boolean sameClipping = clippingPath == pathFeaturesElement.isClippingPath() //
                && clipWindingFill == pathFeaturesElement.isClipWindingFill();
        return sameClipping //
                && stroked == pathFeaturesElement.isStroked() //
                && filled == pathFeaturesElement.isFilled() //
                && windingFill == pathFeaturesElement.isWindingFill();
    }


    /**
     * @return the path geometry, used instead of the bounding box when testing what this path covers
     */
    @Override
    public Shape getOverlapShape() {

        return linePath;
    }

}

View File

@ -1,60 +0,0 @@
package com.iqser.red.pdftronlogic.commons.features;
import static com.iqser.red.pdftronlogic.commons.ComparisonUtils.almostEqual;
import java.util.ArrayList;
import java.util.List;
import lombok.AccessLevel;
import lombok.Builder;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
/**
 * Features of a text element: the base features plus the text string, font type, font size
 * and, optionally, the individual glyphs.
 */
@EqualsAndHashCode(callSuper = true)
@Getter
@SuperBuilder
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
@SuppressWarnings("PMD")
public class TextFeatures extends ElementFeatures {

    String text;
    int font;
    double fontsize;

    @Builder.Default
    List<GlyphInfo> glyphs = new ArrayList<>();


    /**
     * Two texts match when the base features match, the text and font type are identical and
     * the font sizes are almost equal. Anything that is not a {@link TextFeatures} never matches.
     */
    @Override
    public boolean matches(ElementFeatures element) {

        if (!(element instanceof TextFeatures textFeaturesElement)) {
            return false;
        }
        if (!super.matches(textFeaturesElement)) {
            return false;
        }
        boolean sameText = text.equals(textFeaturesElement.getText());
        return sameText //
                && font == textFeaturesElement.getFont() //
                && almostEqual(fontsize, textFeaturesElement.getFontsize());
    }


    /**
     * The text counts as overlapped when its bounding box is covered, or - if glyphs are
     * available - when every single glyph is covered.
     */
    @Override
    public boolean testOverlapped(ElementFeatures overlappingElement) {

        if (super.testOverlapped(overlappingElement)) {
            return true;
        }
        // allMatch is vacuously true on an empty stream, so an empty glyph list must not count
        if (glyphs.isEmpty()) {
            return false;
        }
        return glyphs.stream()
                .allMatch(glyph -> glyph.testOverlapped(overlappingElement));
    }


    /**
     * Releases the resources held by every glyph.
     */
    @Override
    public void destroy() {

        for (GlyphInfo glyph : glyphs) {
            glyph.destroy();
        }
    }

}

View File

@ -1,43 +0,0 @@
package com.iqser.red.pdftronlogic.commons.lookup;
import java.util.Optional;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
/**
 * Visitor that remembers the first visited element the query features match.
 * Elements visited after a match has been found are ignored.
 */
@RequiredArgsConstructor
public class AnyMatchVisitor implements ElementFeatureVisitor {

    private final ElementFeatures queryFeatures;
    @Getter
    private ElementFeatures match;


    /**
     * @return the first matching element seen so far, or empty if there was none
     */
    public Optional<ElementFeatures> getAnyMatch() {

        return Optional.ofNullable(match);
    }


    @Override
    public void visitItem(ElementFeatures features) {

        // only the first match is kept; later candidates are not even tested
        if (match == null && queryFeatures.matches(features)) {
            match = features;
        }
    }

}

View File

@ -1,135 +0,0 @@
package com.iqser.red.pdftronlogic.commons.lookup;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Optional;
import java.util.function.Consumer;
import java.util.function.Predicate;
import com.iqser.red.pdftronlogic.commons.Converter;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.Rect;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
/**
 * Flat, list-backed collection of {@link ElementFeatures} with match, predicate and overlap
 * queries. Closing the lookup destroys every element it still holds.
 */
@Getter
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ElementFeatureLookup implements AutoCloseable {

    /*
    This class looks a bit weird and uses visitors because it was originally built around the quadtree implementation by locationtech, which queries its data structure with rectangles by default.
    Unfortunately there were always edge cases where the quadtree lost a few elements, making it completely unusable. Further, it didn't even speed up the algorithm all that much.
     */

    List<ElementFeatures> allElements = new ArrayList<>();


    public void add(ElementFeatures elementFeatures) {

        allElements.add(elementFeatures);
    }


    public void remove(ElementFeatures elementFeatures) {

        allElements.remove(elementFeatures);
    }


    /**
     * @return the first stored element that the given features match, if any
     */
    public Optional<ElementFeatures> anyMatch(ElementFeatures elementFeatures) {

        AnyMatchVisitor matchFinder = new AnyMatchVisitor(elementFeatures);
        forEach(matchFinder::visitItem);
        return matchFinder.getAnyMatch();
    }


    /**
     * @return all stored elements accepted by the predicate, in storage order
     */
    @SneakyThrows
    public List<ElementFeatures> query(Predicate<ElementFeatures> predicate) {

        PredicateItemVisitor collector = new PredicateItemVisitor(predicate);
        forEach(collector::visitItem);
        return collector.getMatchingFeatures();
    }


    /**
     * @return all stored elements whose bounding box intersects the given rectangle
     */
    @SneakyThrows
    public List<ElementFeatures> findIntersecting(Rect bbox) {

        Rectangle2D queryArea = Converter.toRectangle2D(bbox);
        return query(candidate -> candidate.getBoundingBox().intersects(queryArea));
    }


    /**
     * Finds the stored elements that are overlapped by the given element.
     *
     * @param overlappingElement the element that may cover others
     * @param textOnly           if true, only elements of type {@code Element.e_text} are considered
     * @return the elements whose bounding box intersects the given element's box and whose
     * own overlap test succeeds
     */
    public List<ElementFeatures> findOverlapped(ElementFeatures overlappingElement, boolean textOnly) {

        List<ElementFeatures> overlapped = new LinkedList<>();

        for (ElementFeatures candidate : allElements) {
            boolean skippedByType = textOnly && candidate.getElementType() != Element.e_text;
            if (skippedByType) {
                continue;
            }
            // cheap bounding box test first, the element-specific overlap test only when needed
            if (candidate.getBoundingBox().intersects(overlappingElement.getBoundingBox()) && candidate.testOverlapped(overlappingElement)) {
                overlapped.add(candidate);
            }
        }
        return overlapped;
    }


    public void forEach(Consumer<ElementFeatures> consumer) {

        allElements.forEach(consumer);
    }


    public void clear() {

        allElements.clear();
    }


    public boolean isEmpty() {

        return allElements.isEmpty();
    }


    public int size() {

        return allElements.size();
    }


    public void addAll(List<ElementFeatures> currentOverlappedElements) {

        allElements.addAll(currentOverlappedElements);
    }


    public void removeAll(List<ElementFeatures> currentOverlappedElements) {

        allElements.removeAll(currentOverlappedElements);
    }


    /**
     * Destroys every element currently stored; the list itself is left as it is.
     */
    @Override
    public void close() {

        for (ElementFeatures stored : allElements) {
            stored.destroy();
        }
    }

}

View File

@ -1,9 +0,0 @@
package com.iqser.red.pdftronlogic.commons.lookup;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
/**
 * Callback that is handed element features one at a time; implementations in this package
 * collect the first match or all elements accepted by a predicate.
 */
public interface ElementFeatureVisitor {
/**
 * Processes a single element.
 *
 * @param features the element being visited
 */
void visitItem(ElementFeatures features);
}

View File

@ -1,29 +0,0 @@
package com.iqser.red.pdftronlogic.commons.lookup;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Predicate;
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
/**
 * Visitor that collects every visited element accepted by the given predicate,
 * in visiting order.
 */
@RequiredArgsConstructor
public class PredicateItemVisitor implements ElementFeatureVisitor {

    private final Predicate<ElementFeatures> predicate;
    @Getter
    private final List<ElementFeatures> matchingFeatures = new ArrayList<>();


    @Override
    public void visitItem(ElementFeatures features) {

        boolean accepted = predicate.test(features);
        if (!accepted) {
            return;
        }
        matchingFeatures.add(features);
    }

}

View File

@ -1,91 +0,0 @@
package com.iqser.red.pdftronlogic.commons;
import static com.iqser.red.pdftronlogic.commons.VisualizationUtils.drawPathData;
import static com.iqser.red.pdftronlogic.commons.VisualizationUtils.drawRect;
import java.awt.Color;
import java.io.FileOutputStream;
import java.nio.file.Path;
import java.util.List;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import com.iqser.red.pdftronlogic.commons.features.GlyphInfo;
import com.iqser.red.pdftronlogic.commons.features.ImageFeatures;
import com.iqser.red.pdftronlogic.commons.features.TextFeatures;
import com.pdftron.pdf.ElementBuilder;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.PDFNet;
import com.pdftron.pdf.Page;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
/**
 * Manual visual check of the glyph extraction: extracts text (with glyphs) and images from a
 * sample PDF and writes a copy with the extracted shapes drawn on top of every page.
 * There are no assertions; the output has to be inspected by hand.
 */
@Disabled // makes no sense to run in pipeline
public class GlyphExtractionTest {
@BeforeAll
static void init() {
PDFNet.initialize(PDFTronConfig.license);
}
/**
 * Writes the annotated copy to /tmp/, named after the input file name with "_GLYPHS.pdf" appended.
 */
@Test
@SneakyThrows
public void testGlyphExtraction() {
String file = "files/everyCharIsImage.pdf";
List<List<TextFeatures>> textsPerPage;
List<List<ImageFeatures>> imagesPerPage;
// The resource is opened three times: once per extraction and once for drawing.
// NOTE(review): the 'true' argument presumably enables path data pre-computation, so that the
// glyph paths stay usable after the document is closed - confirm against PdfTextExtraction
try (var in = Thread.currentThread().getContextClassLoader().getResourceAsStream(file)) {
textsPerPage = PdfTextExtraction.extractAllGlyphsFromDocument(in, true);
}
try (var in = Thread.currentThread().getContextClassLoader().getResourceAsStream(file)) {
imagesPerPage = PdfImageExtraction.extractImages(in);
}
try (var in = Thread.currentThread().getContextClassLoader().getResourceAsStream(file);//
var out = new FileOutputStream(Path.of("/tmp/").resolve(Path.of(file).getFileName() + "_GLYPHS.pdf").toFile())) {
try (PDFDoc pdfDoc = new PDFDoc(in)) {
// list index i holds the features of page i + 1 (getPage is called with 1-based numbers)
for (int i = 0; i < pdfDoc.getPageCount(); i++) {
Page page = pdfDoc.getPage(i + 1);
List<TextFeatures> textFeaturesOnPage = textsPerPage.get(i);
List<ImageFeatures> imageFeaturesOnPage = imagesPerPage.get(i);
try (ElementWriter writer = new ElementWriter(); ElementBuilder builder = new ElementBuilder()) {
writer.begin(page, ElementWriter.e_overlay, false);
for (ImageFeatures imageFeatures : imageFeaturesOnPage) {
// skip images covering at least 80% of the page area, they would hide everything else
if (imageFeatures.getBoundingBox().getHeight() * imageFeatures.getBoundingBox().getWidth() >= page.getPageHeight() * page.getPageWidth() * 0.8) {
continue;
}
drawRect(imageFeatures.getBoundingBox(), builder, writer, Color.CYAN, true);
}
for (TextFeatures textFeatures : textFeaturesOnPage) {
// blue: bounding box of the whole text element
drawRect(textFeatures.getBoundingBox(), builder, writer, Color.BLUE);
for (GlyphInfo glyph : textFeatures.getGlyphs()) {
if (glyph.getPathData().isPresent() && glyph.getBoundingBox().isPresent()) {
// black: glyph outline, red: shrunken glyph box, magenta: full glyph box
drawPathData(glyph.getPathData().get(), builder, writer, Color.BLACK);
drawRect(ComparisonUtils.shrinkRectangle(glyph.getBoundingBox().get()), builder, writer, Color.RED);
drawRect(glyph.getBoundingBox().get(), builder, writer, Color.MAGENTA);
}
}
}
writer.end();
}
}
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
}
}
}
}

View File

@ -1,236 +0,0 @@
package com.iqser.red.pdftronlogic.commons;
import static com.iqser.red.pdftronlogic.commons.PdfTextExtraction.extractAllTextFromDocument;
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import lombok.extern.slf4j.Slf4j;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import com.pdftron.pdf.PDFNet;
import lombok.SneakyThrows;
/**
 * Tests for {@link InvisibleElementRemovalService}. Every test (except the crash test)
 * processes a sample PDF from the test resources twice - once with the delta flag off and once
 * with it on - writes both results to the temporary directory and then checks the text that
 * can be extracted from one or both result files.
 */
@SuppressWarnings("PMD")
@Slf4j
class InvisibleElementRemovalServiceTest {

    private static final String RESULT_SUFFIX = "INVISIBLE_REMOVAL";
    private static final String DELTA_RESULT_SUFFIX = "INVISIBLE_REMOVAL_DELTA";

    InvisibleElementRemovalService invisibleElementRemovalService;


    @BeforeAll
    static void init() {

        PDFNet.initialize(PDFTronConfig.license);
    }


    @BeforeEach
    void createServices() {

        invisibleElementRemovalService = new InvisibleElementRemovalService();
    }


    @Test
    void removeInvisibleText() {

        String fileName = "files/InvisibleText.pdf";
        String resultFileName = removeInvisibleElements(fileName, false);
        String deltaResultFileName = removeInvisibleElements(fileName, true);

        String[] text = extractText(resultFileName).split("\n");
        assertThat(text).contains("APPENDIX 16 Pathology Report", "Amendment 1", "Page 255 of 260");

        String[] deltaText = extractText(deltaResultFileName).split("\n");
        assertThat(deltaText).contains("Michela Gregori DVM PhD Pathologist", "AUTHOR(S):", "COMPLETION DATE:");
    }


    @Test
    @SneakyThrows
    void page32DoesNotCrash() {

        String fileName = "files/Page32.pdf";

        try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new ByteArrayOutputStream()) {
            assertThat(in).as("test resource %s", fileName).isNotNull();
            invisibleElementRemovalService.removeInvisibleElements(in, out, false);
        }
    }


    @Test
    void removeInvisibleTextClippedByFormObjects() {

        String fileName = "files/invisibleTextInNestedFormObjects.pdf";
        String resultFileName = removeInvisibleElements(fileName, false);
        String deltaResultFileName = removeInvisibleElements(fileName, true);

        assertThat(extractText(resultFileName)).isBlank();

        String[] deltaText = extractText(deltaResultFileName).split("\n");
        assertThat(deltaText).contains(":Bold S-enantiomer form if two codes are supplied",
                "Red : Only observed in laboratory soil studies",
                "Green : Observed in both laboratory soil studies and lysimeter leachate",
                "Blue : Only observed in lysimeter leachate");
    }


    @Test
    void removeInvisibleElementsWithColoredBackground() {

        String fileName = "files/textOnColoredBackground.pdf";
        removeInvisibleElements(fileName, false);
        String deltaResultFileName = removeInvisibleElements(fileName, true);

        String result = extractText(deltaResultFileName);
        assertThat(result).contains("#1 Dark",
                "#13 Yellow",
                "Ut wisi enim ad minim veniam, quis nostrud exerci tation ullamcorper suscipit lobortis nisl ut aliquip\n"
                        + "ex ea commodo consequat. Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie\n"
                        + "consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim\n"
                        + "qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi.");
        assertThat(result).doesNotContain("Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut\n"
                + "labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et\n"
                + "ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem\n"
                + "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et\n"
                + "dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea\n"
                + "rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum\n"
                + "dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore\n"
                + "magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet\n"
                + "clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. ");
    }


    @Test
    void removeInvisibleElementsThinFilledTable() {

        String fileName = "files/tableIsSingleLinePath.pdf";
        removeInvisibleElements(fileName, false);
        String deltaResultFileName = removeInvisibleElements(fileName, true);

        assertThat(extractText(deltaResultFileName)).isEqualTo("");
    }


    @Test
    void removeInvisibleElementsChineseOverlapped() {

        String fileName = "files/chineseInvisibleElements.pdf";
        removeInvisibleElements(fileName, false);
        String deltaResultFileName = removeInvisibleElements(fileName, true);

        String result = extractText(deltaResultFileName);
        // NOTE(review): the empty string below is always contained - check whether a character got lost here
        assertThat(result).contains("[Table_KeyInfo]", "[Table_StockInfo]", "[Table_BaseInfo]", "国内无线键鼠龙头企", "", "精研研发发制制造造商商先先的的本");
    }


    @Test
    void removeInvisibleElementsButKeepOCRText() {

        String fileName = "files/singlePageWithOcrText.pdf";
        String resultFileName = removeInvisibleElementsButKeepOcrText(fileName, false);
        removeInvisibleElementsButKeepOcrText(fileName, true);

        String result = extractText(resultFileName);
        assertThat(result).contains("TABLE 17:", "Intergroup comparison oftotal litter", "TABLE 20:");
    }


    @Test
    void removeInvisibleElementsWhereEachCharIsImage() {

        String fileName = "files/everyCharIsImage.pdf";
        String resultFileName = removeInvisibleElementsButKeepOcrText(fileName, false);
        removeInvisibleElementsButKeepOcrText(fileName, true);

        assertThat(extractText(resultFileName)).isBlank();
    }


    /** Runs the plain removal on a classpath resource and returns the path of the written result file. */
    private String removeInvisibleElements(String fileName, boolean delta) {

        return process(fileName, delta, false);
    }


    /** Runs the OCR-text-preserving removal on a classpath resource and returns the path of the written result file. */
    private String removeInvisibleElementsButKeepOcrText(String fileName, boolean delta) {

        return process(fileName, delta, true);
    }


    /** Processes the resource into a file in the temporary directory; fails with a clear message if the resource is missing. */
    @SneakyThrows
    private String process(String fileName, boolean delta, boolean keepOcrText) {

        String targetFileName = OsUtils.createTmpFileName(fileName, delta ? DELTA_RESULT_SUFFIX : RESULT_SUFFIX);

        try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(targetFileName)) {
            assertThat(in).as("test resource %s", fileName).isNotNull();
            if (keepOcrText) {
                invisibleElementRemovalService.removeInvisibleElementsButKeepOcrText(in, out, delta);
            } else {
                invisibleElementRemovalService.removeInvisibleElements(in, out, delta);
            }
        }
        return targetFileName;
    }


    /** Extracts all text from the PDF file at the given path. */
    @SneakyThrows
    private static String extractText(String pdfFileName) {

        try (var in = new FileInputStream(pdfFileName)) {
            return extractAllTextFromDocument(in);
        }
    }

}

View File

@ -1,31 +0,0 @@
package com.iqser.red.pdftronlogic.commons;
import java.nio.file.Path;
import java.util.Locale;
import org.junit.platform.commons.util.StringUtils;
/**
 * Helpers for building temporary file names in tests, independent of the operating system.
 */
public class OsUtils {

    private static final String PDF_EXTENSION = ".pdf";


    private static boolean isWindows() {

        return System.getProperty("os.name").toLowerCase(Locale.ROOT).contains("windows");
    }


    /**
     * @return the value of {@code java.io.tmpdir} on Windows (if it is set and not blank),
     * {@code /tmp} in every other case
     */
    public static String getTemporaryDirectory() {

        String tmpdir = System.getProperty("java.io.tmpdir");
        // plain JDK check instead of JUnit's internal StringUtils
        if (isWindows() && tmpdir != null && !tmpdir.isBlank()) {
            return tmpdir;
        }
        return "/tmp";
    }


    /**
     * Builds the path of a file in the temporary directory, named after the given file with
     * {@code _<suffix>} inserted before its {@code .pdf} extension. Only the file name is
     * changed: a ".pdf" inside the temporary directory path or in the middle of the name is
     * left alone. If the name does not end with ".pdf", {@code _<suffix>} is appended so that
     * different suffixes still yield different files.
     *
     * @param filename a file name or path; only its last path segment is used
     * @param suffix   the marker to add to the file name
     * @return the resulting path as a string
     */
    public static String createTmpFileName(String filename, String suffix) {

        String baseName = Path.of(filename).getFileName().toString();
        String suffixedName;
        if (baseName.endsWith(PDF_EXTENSION)) {
            String stem = baseName.substring(0, baseName.length() - PDF_EXTENSION.length());
            suffixedName = stem + "_" + suffix + PDF_EXTENSION;
        } else {
            suffixedName = baseName + "_" + suffix;
        }
        return Path.of(getTemporaryDirectory()).resolve(suffixedName).toString();
    }

}

View File

@ -1,6 +0,0 @@
package com.iqser.red.pdftronlogic.commons;
/**
 * Holds the PDFTron license key that the tests in this package pass to {@code PDFNet.initialize}.
 * NOTE(review): the key is hard-coded and committed to the repository. Its "demo:" prefix
 * suggests a demo key - confirm, and consider supplying it via configuration instead.
 */
public class PDFTronConfig {
// License key handed to PDFNet.initialize(...) by the test classes.
public static final String license = "demo:1650351709282:7bd235e003000000004ec28a6743e1163a085e2115de2536ab6e2cfe5a";
}

View File

@ -1,314 +0,0 @@
package com.iqser.red.pdftronlogic.commons;
import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.CompletableFuture;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import com.iqser.red.pdftronlogic.commons.rendering.GhostScriptService;
import com.iqser.red.pdftronlogic.commons.rendering.ImageFile;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.PDFNet;
import com.pdftron.sdf.SDFDoc;
import com.sun.jna.NativeLibrary;
import com.sun.jna.Pointer;
import lombok.SneakyThrows;
import net.sourceforge.lept4j.Box;
import net.sourceforge.lept4j.Boxa;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.util.LeptUtils;
@Disabled // requires leptonica and ghostscript to be installed locally
public class VisualEqualityTest {

    /*
    We render both the origin and the processed file and then compute a diff per page; we then threshold and invert the diff.
    This means, a visual difference of luminance greater than the threshold value shows up as a black pixel.
    We then use Heckbert's Seed Fill Algorithm to detect connected black regions by recursively flooding connected pixels.
    We then filter these error regions, ensuring their area is at least the threshold.
    We do this, since single pixel errors are frequent, but cannot be perceived by a human. Most likely some float inaccuracies.
    If there are any error regions left, we count the test as failed.
     */

    private static final int ERROR_REGION_AREA_THRESHOLD = 10;
    public static final int LUMINANCE_DIFFERENCE_THRESHOLD = 170;
    private static final Path TEST_OUTPUT_DIR = Path.of("/tmp/AAA_EQUALITY_TEST/");
    private static final String LEPTONICA_DIR = "/home/kschuettler/software/leptonica/vcpkg/installed/x64-linux-dynamic/lib/";

    GhostScriptService ghostScriptService = new GhostScriptService();
    InvisibleElementRemovalService invisibleElementRemovalService = new InvisibleElementRemovalService();


    /**
     * Initializes PDFTron and verifies that the native leptonica library can be loaded from {@link #LEPTONICA_DIR}.
     */
    @BeforeEach
    public void setup() {

        PDFNet.initialize(PDFTronConfig.license);
        System.setProperty("jna.library.path", LEPTONICA_DIR);

        try (NativeLibrary leptonicaLib = NativeLibrary.getInstance("leptonica")) {
            assert leptonicaLib != null;
        }
    }


    /**
     * Runs the visual equality check for a single, locally available file.
     */
    @Test
    @SneakyThrows
    public void assertVisualEqualityOfProcessedFile() {

        Path file = Path.of("/home/kschuettler/Dokumente/TestFiles/full_syn_dm_testfiles/3977411_Final_Thiamethoxam_SL_MNLY.pdf");
        Context context = new Context(TEST_OUTPUT_DIR, new HashMap<>());

        runForFile(file, context);

        System.out.println(context);
        assert context.failedFiles.isEmpty() : context.toString();
    }


    /**
     * Runs the visual equality check for every {@code .pdf} file below a locally available folder.
     */
    @Test
    @SneakyThrows
    public void assertVisualEqualityOfProcessedFolder() {

        Path folder = Path.of("/home/kschuettler/Dokumente/TestFiles/full_syn_dm_testfiles");
        Context context = new Context(TEST_OUTPUT_DIR, new HashMap<>());

        // Files.walk holds open directory handles, so the stream has to be closed explicitly.
        try (var paths = Files.walk(folder)) {
            paths.filter(Files::isRegularFile)
                    .filter(path -> path.toString().endsWith(".pdf"))
                    .forEach(path -> {
                        runForFile(path, context);
                        System.out.println(context);
                    });
        }

        assert context.failedFiles.isEmpty() : context.toString();
    }


    /**
     * Writes a linearized copy of the origin file, the processed file and the delta file into the file's
     * output folder and then compares the saved origin with the processed file page by page.
     *
     * @param originFile the PDF to process
     * @param context    collects failures and resolves output folders
     */
    @SneakyThrows
    private void runForFile(Path originFile, Context context) {

        System.out.println(originFile.toFile());
        Path fileFolder = context.getFileFolder(originFile);
        Files.createDirectories(fileFolder);

        Path processedFile = fileFolder.resolve("processed.pdf");
        Path deltaFile = fileFolder.resolve("delta.pdf");
        Path savedOriginFile = fileFolder.resolve("origin.pdf");

        try (var in = new FileInputStream(originFile.toFile()); PDFDoc pdfDoc = new PDFDoc(in); var out = new FileOutputStream(savedOriginFile.toFile())) {
            pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
        }

        try (var in = new FileInputStream(originFile.toFile()); var out = new FileOutputStream(processedFile.toFile())) {
            invisibleElementRemovalService.removeInvisibleElements(in, out, false);
        }

        try (var in = new FileInputStream(originFile.toFile()); var out = new FileOutputStream(deltaFile.toFile())) {
            invisibleElementRemovalService.removeInvisibleElements(in, out, true);
        }

        System.out.println("removed invisible elements");

        assertVisualEquality(savedOriginFile, processedFile, context);
        System.out.println("finished visual equality check");
    }


    /**
     * Renders both documents with ghostscript and records an error in {@code context} for every page whose
     * rendering differs in at least one sufficiently large region.
     *
     * @param originFile    the reference document
     * @param processedFile the document to compare against the reference
     * @param context       collects failures and resolves output folders
     */
    @SneakyThrows
    private void assertVisualEquality(Path originFile, Path processedFile, Context context) {

        // NOTE(review): runForFile passes the saved copy named "origin.pdf" here, so getFileFolder/getErrorFolder
        // resolve to <outFolder>/origin.pdf/... for every input file and the report prints "origin.pdf" as file
        // name. Confirm whether per-input folders were intended.
        Path imageDir = context.getFileFolder(originFile).resolve("images");

        Path originDir = imageDir.resolve("origin");
        Files.createDirectories(originDir);
        CompletableFuture<List<ImageFile>> originalPagesFuture = ghostScriptService.renderDocument(originFile, originDir);

        Path processedDir = imageDir.resolve("processed");
        Files.createDirectories(processedDir);
        CompletableFuture<List<ImageFile>> processedPagesFuture = ghostScriptService.renderDocument(processedFile, processedDir);

        // Remove error images of previous runs; the walk stream must be closed to release directory handles.
        try (var staleErrorFiles = Files.walk(context.getErrorFolder(originFile))) {
            staleErrorFiles.map(Path::toFile)
                    .filter(File::isFile)
                    .forEach(File::delete);
        }

        List<ImageFile> originalPages = originalPagesFuture.join();
        List<ImageFile> processedPages = processedPagesFuture.join();

        if (originalPages.size() != processedPages.size()) {
            context.getFailedFile(originFile).addErrorMessage("Differing page counts!");
            return;
        }

        for (ImageFile originalPage : originalPages) {

            Optional<ImageFile> samePage = processedPages.stream()
                    .filter(p -> p.pageNumber() == originalPage.pageNumber())
                    .findFirst();

            if (samePage.isEmpty()) {
                context.getFailedFile(originFile).addErrorMessage("Page " + originalPage.pageNumber() + " missing!");
                return;
            }
            ImageFile processedPage = samePage.get();

            Pix originalPagePix = null;
            Pix processedPagePix = null;
            try {
                synchronized (VisualEqualityTest.class) {
                    originalPagePix = originalPage.readPix();
                    processedPagePix = processedPage.readPix();
                }

                String errorFile = context.getErrorFolder(originFile).resolve(originalPage.pageNumber() + ".tiff").toFile().toString();

                List<Rectangle2D> errorRegions = detectErrors(originalPagePix, processedPagePix, errorFile);

                if (!errorRegions.isEmpty()) {
                    context.getFailedFile(originFile).addErrorMessage("Page " + originalPage.pageNumber() + " has " + errorRegions.size() + " errors!");
                }
            } finally {
                // Native memory is not garbage collected, so release it even if the comparison throws.
                synchronized (VisualEqualityTest.class) {
                    if (originalPagePix != null) {
                        LeptUtils.disposePix(originalPagePix);
                    }
                    if (processedPagePix != null) {
                        LeptUtils.disposePix(processedPagePix);
                    }
                }
            }
        }
    }


    /**
     * Computes the thresholded, inverted absolute difference of two page images and returns the bounding boxes
     * of all connected regions whose area is at least {@link #ERROR_REGION_AREA_THRESHOLD}. If any such region
     * exists, the thresholded difference image is written to {@code errorFile}.
     *
     * @param pix1      first page image
     * @param pix2      second page image
     * @param errorFile path the difference image is written to if errors are found
     * @return the bounding boxes of the detected error regions, empty if the pages are considered equal
     */
    public synchronized List<Rectangle2D> detectErrors(Pix pix1, Pix pix2, String errorFile) {

        Pix pixDiff = Leptonica1.pixAbsDifference(pix1, pix2);
        Pix pixThresh = Leptonica1.pixThresholdToBinary(pixDiff, LUMINANCE_DIFFERENCE_THRESHOLD);
        Leptonica1.pixInvert(pixThresh, pixThresh);

        // checks for connected black regions and outputs them as a list of boxes, a boxa
        Boxa boxa = Leptonica1.pixConnComp(pixThresh, null, 8);

        List<Rectangle2D> errorRegions = readRectsFromBoxa(boxa).stream()
                .filter(box -> box.getWidth() * box.getHeight() >= ERROR_REGION_AREA_THRESHOLD)
                .toList();

        if (!errorRegions.isEmpty()) {
            System.out.println("Found error(s) on page " + Path.of(errorFile).getFileName().toString().replace(".tiff", "") + ", writing error file.");
            // Boxa errorRegionsBoxa = pushRectsIntoBoxa(errorRegions); // this does not work
            // Pix errorPix = Leptonica1.pixDrawBoxa(pixThresh, errorRegionsBoxa, 2, -1); // somehow this runs forever
            Leptonica1.pixWrite(errorFile, pixThresh, 4);
            // LeptUtils.disposePix(errorPix);
            // LeptUtils.dispose(errorRegionsBoxa);
        }

        LeptUtils.dispose(boxa);
        LeptUtils.disposePix(pixDiff);
        LeptUtils.disposePix(pixThresh);

        return errorRegions;
    }


    /**
     * Copies the boxes of a native {@link Boxa} into Java rectangles, disposing each native box after reading it.
     */
    private static List<Rectangle2D> readRectsFromBoxa(Boxa boxa) {

        Pointer[] pointers = boxa.box.getPointer().getPointerArray(0, boxa.n);
        List<Rectangle2D> boxes = new ArrayList<>(boxa.n);
        for (int i = 0; i < boxa.n; i++) {
            Box box = new Box(pointers[i]);
            boxes.add(new Rectangle2D.Double(box.x, box.y, box.w, box.h));
            LeptUtils.dispose(box);
        }
        return boxes;
    }

    /*
    private static Boxa pushRectsIntoBoxa(List<Rectangle2D> rects) {
    Boxa boxa = new Boxa();
    boxa.n = rects.size();
    boxa.nalloc = rects.size();
    Memory boxMemory = new Memory((long) Native.POINTER_SIZE * rects.size());
    for (int i = 0; i < rects.size(); i++) {
    Rectangle2D rect = rects.get(i);
    Box box = new Box((int) rect.getX(), (int) rect.getY(), (int) rect.getWidth(), (int) rect.getHeight(), 0);
    boxMemory.setPointer((long) i * Native.POINTER_SIZE, box.getPointer());
    }
    boxa.box = new PointerByReference(boxMemory);
    return boxa;
    }
     */


    /**
     * Per-run state: the output root folder and the failures recorded so far, keyed by file.
     */
    private record Context(Path outFolder, Map<Path, FailedFile> failedFiles) {

        public FailedFile getFailedFile(Path path) {

            return failedFiles.computeIfAbsent(path, p -> FailedFile.init());
        }


        public Path getFileFolder(Path file) {

            return outFolder.resolve(file.getFileName());
        }


        @Override
        public String toString() {

            if (failedFiles.isEmpty()) {
                return "All files visually equal!";
            }
            StringBuilder sb = new StringBuilder();
            failedFiles.forEach((file, failedFile) -> sb.append(file.getFileName().toFile()).append(": ").append(failedFile.toString()).append("\n"));
            return sb.toString();
        }


        /**
         * Returns the folder error images of the given file are written to, creating it if necessary.
         */
        @SneakyThrows
        public Path getErrorFolder(Path originFile) {

            Path errorDir = getFileFolder(originFile).resolve("error");
            Files.createDirectories(errorDir);
            return errorDir;
        }

    }

    /**
     * Failures recorded for one file: free-text error messages and failed pages with their locations.
     */
    private record FailedFile(Map<ImageFile, FailedPage> failedPages, List<String> errors) {

        public static FailedFile init() {

            return new FailedFile(new HashMap<>(), new LinkedList<>());
        }


        public void addErrorMessage(String s) {

            errors.add(s);
        }


        public void addFailedPage(ImageFile imageFile, double location) {

            failedPages.computeIfAbsent(imageFile, file -> new FailedPage(new LinkedList<>())).locations().add(location);
        }


        @Override
        public String toString() {

            return String.join(", ", errors);
        }

    }

    private record FailedPage(List<Double> locations) {

    }

}

View File

@ -1,62 +0,0 @@
package com.iqser.red.pdftronlogic.commons;
import java.io.FileOutputStream;
import java.nio.file.Path;
import java.util.Locale;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.junit.platform.commons.util.StringUtils;
import com.pdftron.pdf.PDFNet;
import lombok.SneakyThrows;
@Disabled
@SuppressWarnings("PMD")
class WatermarkRemovalServiceTest {

    private static final String PDF_EXTENSION = ".pdf";


    /**
     * Removes the watermarks of a customer file from the test resources and writes the result to the
     * temporary directory for manual inspection.
     */
    @SneakyThrows
    @Test
    void removeWatermarks() {

        PDFNet.initialize(PDFTronConfig.license);

        WatermarkRemovalService watermarkRemovalService = new WatermarkRemovalService();

        String filename = "files/syngenta/CustomerFiles/1.A16148F - Toxicidade oral aguda (1).pdf";
        String tmpFilename = createTmpFileName(filename, "WATERMARK_REMOVAL");

        try (var in = this.getClass().getClassLoader().getResourceAsStream(filename)) {
            // getResourceAsStream returns null for a missing resource (e.g. submodule not checked out);
            // fail with a clear message before an empty output file is created.
            if (in == null) {
                throw new IllegalStateException("Test resource not found on classpath: " + filename);
            }
            try (var out = new FileOutputStream(tmpFilename)) {
                System.out.println(tmpFilename);
                watermarkRemovalService.removeWatermarks(in, out);
            }
        }
    }


    /**
     * Builds the path of a temporary file named after {@code filename}, with {@code "_" + suffix} inserted
     * in front of a trailing {@code .pdf} extension. Only the trailing extension is touched; names that do
     * not end in {@code .pdf} are returned without suffix.
     *
     * @param filename path or name of the source file; only its last path element is used
     * @param suffix   marker to insert before the extension
     * @return the temporary file path as a string
     */
    public static String createTmpFileName(String filename, String suffix) {

        String tmpPath = Path.of(getTemporaryDirectory()).resolve(Path.of(filename).getFileName()).toString();
        if (!tmpPath.endsWith(PDF_EXTENSION)) {
            return tmpPath;
        }
        return tmpPath.substring(0, tmpPath.length() - PDF_EXTENSION.length()) + "_" + suffix + PDF_EXTENSION;
    }


    /**
     * @return the value of {@code java.io.tmpdir} on Windows (if it is set and not blank), otherwise {@code "/tmp"}
     */
    public static String getTemporaryDirectory() {

        String tmpdir = System.getProperty("java.io.tmpdir");
        // Plain JDK check instead of JUnit's internal StringUtils, which is not meant for use outside of JUnit.
        if (isWindows() && tmpdir != null && !tmpdir.trim().isEmpty()) {
            return tmpdir;
        }
        return "/tmp";
    }


    private static boolean isWindows() {

        // Default to "" so a missing property cannot cause a NullPointerException.
        return System.getProperty("os.name", "").toLowerCase(Locale.ROOT).contains("windows");
    }

}

View File

@ -1,145 +0,0 @@
package com.iqser.red.pdftronlogic.commons.rendering;
import java.io.BufferedReader;
import java.io.File;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Map;
import java.util.function.Consumer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
/**
 * Thread that drains one output stream (stdout or stderr) of a ghostscript process.
 * <p>
 * If the stdError or stdOut buffer of a process is not being emptied it might lock the process in case of
 * errors, so both streams need to be emptied to prevent a deadlock. Since both need to be read simultaneously,
 * the readers are implemented as separate threads.
 * <p>
 * The stdout variant additionally tracks the "Page N" lines printed by ghostscript and hands every finished
 * page to the output handler.
 */
@Slf4j
@RequiredArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class GhostScriptOutputHandler extends Thread {

    static final Pattern pageFinishedPattern = Pattern.compile("Page (\\d+)");

    final InputStream is;
    final String processName;
    final Type type;

    final Map<Integer, ImageFile> pagesToProcess;
    final Consumer<ImageFile> outputHandler;
    final Consumer<String> errorHandler;

    // Page ghostscript is currently working on; 0 until the first "Page N" line has been seen.
    int currentPageNumber;


    /**
     * Creates a handler that logs every line of the given stderr stream as an error.
     */
    public static GhostScriptOutputHandler stdError(InputStream is, Consumer<String> errorHandler) {

        return new GhostScriptOutputHandler(is, "GS", Type.ERROR, null, null, errorHandler);
    }


    /**
     * Creates a handler that reads the given stdout stream and reports finished pages.
     *
     * @param is              stdout of the ghostscript process
     * @param pagesToProcess  pages the process is expected to render, keyed by page number; entries are removed as pages finish
     * @param imageFileOutput receives every finished page
     * @param errorHandler    receives a message for every detected problem
     */
    public static GhostScriptOutputHandler stdOut(InputStream is, Map<Integer, ImageFile> pagesToProcess, Consumer<ImageFile> imageFileOutput, Consumer<String> errorHandler) {

        return new GhostScriptOutputHandler(is, "GS", Type.STD_OUT, pagesToProcess, imageFileOutput, errorHandler);
    }


    /**
     * Reads the stream line by line until it ends. For stdout, the last page seen is queued afterwards and
     * any pages that were expected but never reported are passed to the error handler.
     */
    @SneakyThrows
    @Override
    public void run() {

        // Closing the reader also closes the wrapped input stream, so no separate is.close() is needed.
        // NOTE(review): the reader uses the platform default charset — confirm that matches ghostscript's output.
        try (InputStreamReader isr = new InputStreamReader(is); BufferedReader br = new BufferedReader(isr)) {

            String line;
            while ((line = br.readLine()) != null) {
                if (type == Type.ERROR) {
                    log.error("{}_{}>{}", processName, type.name(), line);
                } else {
                    log.debug("{}_{}>{}", processName, type.name(), line);
                    addProcessedImageToQueue(line);
                }
            }
        }

        if (type == Type.STD_OUT) {
            // 0 means ghostscript never announced a page; queueing it would only produce a misleading
            // "Page number 0 does not exist" error. Unprocessed pages are still reported below.
            if (currentPageNumber != 0) {
                queueFinishedPage(currentPageNumber);
            }

            if (!pagesToProcess.isEmpty()) {
                errorHandler.accept(String.format("Ghostscript finished for batch, but pages %s remain unprocessed.", formatPagesToProcess()));
            }
        }
    }


    /**
     * Formats the remaining pages as "-" (none), a single page number, or "min-max".
     */
    private String formatPagesToProcess() {

        if (pagesToProcess.isEmpty()) {
            return "-";
        }

        if (pagesToProcess.size() == 1) {
            return pagesToProcess.keySet()
                    .iterator().next().toString();
        }

        var pageRange = pagesToProcess.keySet()
                .stream()
                .mapToInt(Integer::intValue)
                .summaryStatistics();
        return pageRange.getMin() + "-" + pageRange.getMax();
    }


    private void addProcessedImageToQueue(String line) {

        /*
        Ghostscript prints the pageNumber it is currently working on, so we remember the current page and queue it as soon as the next comes in.
         */
        Matcher pageNumberMatcher = pageFinishedPattern.matcher(line);
        if (pageNumberMatcher.find()) {
            int pageNumber = Integer.parseInt(pageNumberMatcher.group(1));

            if (currentPageNumber == 0) {
                currentPageNumber = pageNumber;
                return;
            }

            queueFinishedPage(currentPageNumber);
            currentPageNumber = pageNumber;
        }
    }


    /**
     * Removes the page from the pending pages and hands it to the output handler. Unknown pages and pages
     * whose rendered file is missing are reported to the error handler.
     */
    private void queueFinishedPage(int pageNumber) {

        var imageFile = this.pagesToProcess.remove(pageNumber);
        if (imageFile == null) {
            errorHandler.accept(String.format("Page number %d does not exist in this thread. It only has pagenumbers %s", pageNumber, pagesToProcess.keySet()));
            // There is nothing to hand over; passing null on would fail in the output handler.
            return;
        }

        if (!new File(imageFile.absoluteFilePath()).exists()) {
            errorHandler.accept(String.format("Rendered page with number %d does not exist!", pageNumber));
        }
        outputHandler.accept(imageFile);
    }


    public enum Type {
        ERROR,
        STD_OUT
    }

}

View File

@ -1,192 +0,0 @@
package com.iqser.red.pdftronlogic.commons.rendering;
import java.io.InputStream;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CompletableFuture;
import java.util.function.Consumer;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.PDFDoc;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
/**
 * Renders the pages of a PDF document to image files by running ghostscript ({@code gs}) as external process.
 * Pages are split into batches, and every batch is rendered by {@code PROCESS_COUNT} processes.
 */
@Slf4j
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
@SuppressWarnings("PMD") // can't figure out how to safely close the stdOut and stdError streams in line 142/144
public class GhostScriptService {

    int BATCH_SIZE = 256;
    String FORMAT = ".tiff";
    String DEVICE = "tiffgray";
    int DPI = 100;
    int PROCESS_COUNT = 1;


    /**
     * Renders every page of the document into {@code imageDir}.
     *
     * @param documentFile the PDF to render
     * @param imageDir     directory the page images are written to
     * @return a future that completes with the rendered images once the supervisor has seen all pages
     */
    @SneakyThrows
    public CompletableFuture<List<ImageFile>> renderDocument(Path documentFile, Path imageDir) {

        List<Integer> pageNumbers = IntStream.rangeClosed(1, getPageCount(documentFile)).boxed()
                .toList();
        ImageSupervisorImpl supervisor = new ImageSupervisorImpl(pageNumbers);

        String documentPath = documentFile.toFile().toString();
        renderPagesBatched(pageNumbers, documentPath, imageDir, supervisor, supervisor.successHandler(), supervisor.errorHandler());

        return CompletableFuture.supplyAsync(() -> awaitImageFiles(supervisor));
    }


    @SneakyThrows
    private static List<ImageFile> awaitImageFiles(ImageSupervisorImpl supervisor) {

        supervisor.awaitAll();
        return supervisor.getRenderedImages();
    }


    private static int getPageCount(Path documentFile) throws PDFNetException {

        try (PDFDoc pdfDoc = new PDFDoc(documentFile.toFile().toString())) {
            return pdfDoc.getPageCount();
        }
    }


    /**
     * Renders the given pages batch by batch and waits for the ghostscript processes of each batch to exit
     * before starting the next one.
     *
     * @param pagesToProcess       page numbers to render
     * @param documentAbsolutePath path of the PDF handed to ghostscript
     * @param tmpImageDir          directory the page images are written to
     * @param supervisor           checked for errors before every batch
     * @param successHandler       receives every finished page
     * @param errorHandler         receives a message for every detected problem
     */
    @SneakyThrows
    public void renderPagesBatched(List<Integer> pagesToProcess,
                                   String documentAbsolutePath,
                                   Path tmpImageDir,
                                   ImageSupervisor supervisor,
                                   Consumer<ImageFile> successHandler,
                                   Consumer<String> errorHandler) {

        // GS has a limit on how many pageIndices per call are possible, so we limit it to 256 pages per process
        List<List<ProcessInfo>> batches = buildSubListForEachProcess(pagesToProcess, PROCESS_COUNT, BATCH_SIZE * PROCESS_COUNT);

        int batchIdx = 0;
        for (List<ProcessInfo> batch : batches) {
            supervisor.requireNoErrors();

            String pagesPerProcess = batch.stream()
                    .map(info -> info.pageNumbers().size())
                    .map(String::valueOf)
                    .collect(Collectors.joining(", "));
            log.info("Batch {}: Running {} gs processes with ({}) pages each", batchIdx, batch.size(), pagesPerProcess);

            final int currentBatchIdx = batchIdx;
            List<Process> runningProcesses = batch.stream()
                    .parallel()
                    .map(info -> buildCmdArgs(info.processIdx(), currentBatchIdx, info.pageNumbers(), tmpImageDir, documentAbsolutePath))
                    .peek(cmd -> log.debug(String.join(" ", cmd.cmdArgs())))
                    .map(cmd -> executeProcess(cmd, successHandler, errorHandler))
                    .toList();

            List<Integer> exitCodes = new LinkedList<>();
            for (Process runningProcess : runningProcesses) {
                exitCodes.add(runningProcess.waitFor());
            }
            log.info("Batch {}: Ghostscript processes finished with exit codes {}", batchIdx, exitCodes);

            batchIdx++;
        }
    }


    /**
     * Splits the sorted page numbers into batches and, within each batch, into one sub list per process.
     */
    private List<List<ProcessInfo>> buildSubListForEachProcess(List<Integer> stitchedPageNumbers, int processCount, int batchSize) {

        // GhostScript command line can only handle so many page numbers at once, so we split it into batches
        int batchCount = (int) Math.ceil((double) stitchedPageNumbers.size() / batchSize);

        log.info("Splitting {} page renderings across {} process(es) in {} batch(es) with size {}", stitchedPageNumbers.size(), processCount, batchCount, batchSize);

        List<Integer> sortedPageNumbers = stitchedPageNumbers.stream()
                .sorted()
                .toList();

        List<List<ProcessInfo>> batches = new ArrayList<>(batchCount);
        for (List<List<Integer>> pagesPerProcess : ListSplittingUtils.buildBatchedBalancedSublist(sortedPageNumbers, processCount, batchCount)) {
            List<ProcessInfo> processInfos = new ArrayList<>(processCount);
            int processIdx = 0;
            for (List<Integer> pageNumbers : pagesPerProcess) {
                processInfos.add(new ProcessInfo(processIdx, pageNumbers));
                processIdx++;
            }
            batches.add(processInfos);
        }
        return batches;
    }


    /**
     * Builds the ghostscript command line for one process together with the image files it is expected to
     * produce. The n-th requested page is mapped to the n-th numbered output file (1-based).
     */
    @SneakyThrows
    private ProcessCmdsAndRenderedImageFiles buildCmdArgs(Integer processIdx,
                                                          Integer batchIdx,
                                                          List<Integer> stitchedImagePageIndices,
                                                          Path outputDir,
                                                          String documentAbsolutePath) {

        String imagePathFormat = outputDir.resolve("output_" + processIdx + "_" + batchIdx + ".%04d" + FORMAT).toFile().toString();

        Map<Integer, ImageFile> expectedImages = new HashMap<>();
        int outputFileNumber = 1;
        for (Integer pageNumber : stitchedImagePageIndices) {
            expectedImages.put(pageNumber, new ImageFile(pageNumber, String.format(imagePathFormat, outputFileNumber)));
            outputFileNumber++;
        }

        return new ProcessCmdsAndRenderedImageFiles(buildCmdArgs(stitchedImagePageIndices, documentAbsolutePath, imagePathFormat), expectedImages);
    }


    /**
     * Assembles the ghostscript argument array; the page numbers are passed as comma-separated page list.
     */
    private String[] buildCmdArgs(List<Integer> pageNumbers, String documentAbsolutePath, String imagePathFormat) {

        String pageList = pageNumbers.stream()
                .map(String::valueOf)
                .collect(Collectors.joining(","));

        return new String[]{"gs", "-dNOPAUSE", "-sDEVICE=" + DEVICE, "-r" + DPI, "-sPageList=" + pageList, "-sOutputFile=" + imagePathFormat, documentAbsolutePath, "-c", "quit"};
    }


    /**
     * Starts the ghostscript process and one reader thread each for its stdout and stderr.
     */
    @SneakyThrows
    private Process executeProcess(ProcessCmdsAndRenderedImageFiles processInfo, Consumer<ImageFile> successHandler, Consumer<String> errorHandler) {

        Process gsProcess = Runtime.getRuntime().exec(processInfo.cmdArgs());

        InputStream processStdOut = gsProcess.getInputStream();
        GhostScriptOutputHandler stdOutReader = GhostScriptOutputHandler.stdOut(processStdOut, processInfo.renderedPageImageFiles(), successHandler, errorHandler);

        InputStream processStdError = gsProcess.getErrorStream();
        GhostScriptOutputHandler stdErrorReader = GhostScriptOutputHandler.stdError(processStdError, errorHandler);

        stdOutReader.start();
        stdErrorReader.start();
        return gsProcess;
    }


    private record ProcessCmdsAndRenderedImageFiles(String[] cmdArgs, Map<Integer, ImageFile> renderedPageImageFiles) {

    }

    private record ProcessInfo(Integer processIdx, List<Integer> pageNumbers) {

    }

}

View File

@ -1,13 +0,0 @@
package com.iqser.red.pdftronlogic.commons.rendering;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
/**
 * A rendered page image: the page number it belongs to and the absolute path of the image file.
 */
public record ImageFile(int pageNumber, String absoluteFilePath) {
/**
 * Reads the image file via {@code Leptonica1.pixRead}.
 * NOTE(review): the returned Pix appears to be native memory (callers release it with LeptUtils.disposePix) — confirm.
 */
public Pix readPix() {
return Leptonica1.pixRead(absoluteFilePath);
}
}

View File

@ -1,7 +0,0 @@
package com.iqser.red.pdftronlogic.commons.rendering;
/**
 * Tracks errors that occur while page images are being rendered.
 */
public interface ImageSupervisor {
/**
 * Verifies that no errors have been recorded so far.
 * The implementation in {@code ImageSupervisorImpl} throws an {@code IllegalStateException} otherwise.
 */
void requireNoErrors();
}

View File

@ -1,114 +0,0 @@
package com.iqser.red.pdftronlogic.commons.rendering;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CountDownLatch;
import java.util.function.Consumer;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@RequiredArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE)
public class ImageSupervisorImpl implements ImageSupervisor {
final Map<Integer, CountDownLatch> pageLatches;
final Map<Integer, ImageFile> images;
final List<String> errors;
final ImageFile[] finishedPages;
public ImageSupervisorImpl(List<Integer> pageNumbers) {
this.pageLatches = Collections.synchronizedMap(new HashMap<>());
this.images = Collections.synchronizedMap(new HashMap<>());
this.errors = Collections.synchronizedList(new ArrayList<>());
this.finishedPages = new ImageFile[pageNumbers.size()];
for (Integer pageNumber : pageNumbers) {
pageLatches.put(pageNumber, new CountDownLatch(1));
}
}
public List<ImageFile> getRenderedImages() {
return new ArrayList<>(images.values());
}
public void markPageFinished(ImageFile imageFile) {
log.debug("finished page: {}", imageFile.pageNumber());
getPageLatch(imageFile.pageNumber()).countDown();
images.put(imageFile.pageNumber(), imageFile);
finishedPages[imageFile.pageNumber() - 1] = imageFile;
}
public Consumer<ImageFile> successHandler() {
return this::markPageFinished;
}
public Consumer<String> errorHandler() {
return this::markError;
}
private CountDownLatch getPageLatch(Integer pageNumber) {
if (pageNumber == null || !pageLatches.containsKey(pageNumber)) {
throw new IllegalArgumentException("awaiting non-existent page " + pageNumber);
}
return pageLatches.get(pageNumber);
}
public ImageFile awaitProcessedPage(Integer pageNumber) throws InterruptedException {
if (hasErrors()) {
return null;
}
getPageLatch(pageNumber).await();
return images.get(pageNumber);
}
private boolean hasErrors() {
return errors.isEmpty();
}
public void markError(String errorMessage) {
this.errors.add(errorMessage);
}
public void awaitAll() throws InterruptedException {
for (CountDownLatch countDownLatch : pageLatches.values()) {
countDownLatch.await();
}
}
public void requireNoErrors() {
// GS will log
if (this.errors.isEmpty()) {
return;
}
throw new IllegalStateException("Error(s) occurred during image processing: " + String.join("\n", errors));
}
}

View File

@ -1,106 +0,0 @@
package com.iqser.red.pdftronlogic.commons.rendering;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.stream.IntStream;
import lombok.experimental.UtilityClass;
/**
 * Static helpers for splitting lists into evenly sized, contiguous sub lists and for formatting page ranges.
 * <p>
 * All members are declared {@code static} explicitly (instead of relying on Lombok's {@code @UtilityClass}),
 * so the methods can also be imported statically and the class compiles without annotation processing.
 */
public final class ListSplittingUtils {

    private ListSplittingUtils() {

        throw new UnsupportedOperationException("This is a utility class and cannot be instantiated");
    }


    /**
     * Splits the numbers {@code 1..totalNumberOfEntries} into {@code threadCount} contiguous, balanced sub lists.
     *
     * @param totalNumberOfEntries the highest number to include
     * @param threadCount          the number of sub lists to create
     * @return one sub list per thread; sizes differ by at most one
     */
    public static List<List<Integer>> buildBalancedContinuousSublist(Integer totalNumberOfEntries, int threadCount) {

        return buildBalancedSublist(IntStream.rangeClosed(1, totalNumberOfEntries).boxed()
                .toList(), threadCount);
    }


    /**
     * Splits {@code entries} into {@code threadCount} contiguous sub lists whose sizes differ by at most one;
     * the first sub lists receive the surplus entries.
     *
     * @param entries     the entries to distribute, order is preserved
     * @param threadCount the number of sub lists to create
     * @return views onto {@code entries} (see {@link List#subList(int, int)}), one per thread
     * @throws IllegalArgumentException if {@code threadCount} is negative, or zero while there are entries
     */
    public static <T> List<List<T>> buildBalancedSublist(List<T> entries, int threadCount) {

        List<Integer> balancedEntryCounts = buildBalancedEntryCounts(entries.size(), threadCount);
        List<List<T>> balancedSublist = new ArrayList<>(balancedEntryCounts.size());
        int startIdx = 0;
        for (int numberOfEntriesPerThread : balancedEntryCounts) {
            balancedSublist.add(entries.subList(startIdx, startIdx + numberOfEntriesPerThread));
            startIdx += numberOfEntriesPerThread;
        }
        return balancedSublist;
    }


    /**
     * Distributes {@code entries} first across threads and then splits every thread's share into batches.
     *
     * @param entries     the entries to distribute, order is preserved
     * @param threadCount the number of threads per batch
     * @param batchSize   despite its name, the NUMBER of batches to create (callers pass the batch count);
     *                    the name is kept for compatibility
     * @return the entries indexed as batches -> threads -> entries
     * @throws IllegalArgumentException if {@code threadCount} or {@code batchSize} is negative, or zero while
     *                                  there are entries to distribute
     */
    public static <T> List<List<List<T>>> buildBatchedBalancedSublist(List<T> entries, int threadCount, int batchSize) {

        // threads -> batches -> entries
        List<List<List<T>>> threadsWithBatches = buildBalancedSublist(entries, threadCount).stream()
                .map(list -> buildBalancedSublist(list, batchSize))
                .toList();

        // swap first two dimensions: batches -> threads -> entries
        List<List<List<T>>> batchedBalancedSubList = new ArrayList<>(Math.max(batchSize, 0));
        for (int batchIdx = 0; batchIdx < batchSize; batchIdx++) {
            List<List<T>> threadEntriesPerBatch = new ArrayList<>(threadsWithBatches.size());
            for (List<List<T>> batchesOfThread : threadsWithBatches) {
                threadEntriesPerBatch.add(batchesOfThread.get(batchIdx));
            }
            batchedBalancedSubList.add(threadEntriesPerBatch);
        }
        return batchedBalancedSubList;
    }


    /**
     * Computes how many entries each thread receives when {@code totalNumberOfEntries} entries are dealt out
     * round-robin: every thread gets {@code total / threadCount} entries and the first
     * {@code total % threadCount} threads get one more.
     *
     * @param totalNumberOfEntries the number of entries to distribute; negative values are treated as zero
     * @param threadCount          the number of threads
     * @return a mutable list with one count per thread
     * @throws IllegalArgumentException if {@code threadCount} is negative, or zero while there are entries
     */
    public static List<Integer> buildBalancedEntryCounts(int totalNumberOfEntries, int threadCount) {

        if (threadCount < 0) {
            throw new IllegalArgumentException("threadCount must not be negative: " + threadCount);
        }
        int total = Math.max(totalNumberOfEntries, 0);
        if (threadCount == 0) {
            if (total > 0) {
                throw new IllegalArgumentException("Cannot distribute " + total + " entries across 0 threads");
            }
            return new ArrayList<>();
        }

        int entriesPerThread = total / threadCount;
        int threadsWithExtraEntry = total % threadCount;

        List<Integer> numberOfPagesPerThread = new ArrayList<>(threadCount);
        for (int threadIdx = 0; threadIdx < threadCount; threadIdx++) {
            numberOfPagesPerThread.add(threadIdx < threadsWithExtraEntry ? entriesPerThread + 1 : entriesPerThread);
        }
        return numberOfPagesPerThread;
    }


    /**
     * Collapses a sorted list of numbers into interval strings, e.g. {@code [1, 2, 3, 5]} becomes
     * {@code ["1-3", "5"]}.
     *
     * @param sortedList numbers in ascending order
     * @return one string per run of consecutive numbers; empty if the input is empty
     */
    public static List<String> formatIntervals(List<Integer> sortedList) {

        List<String> intervals = new ArrayList<>();

        if (sortedList.isEmpty()) {
            return intervals;
        }

        int start = sortedList.get(0);
        int end = start;

        for (int i = 1; i < sortedList.size(); i++) {
            int current = sortedList.get(i);
            if (current == end + 1) {
                end = current;
            } else {
                intervals.add(formatInterval(start, end));
                start = current;
                end = start;
            }
        }
        intervals.add(formatInterval(start, end));

        return intervals;
    }


    private static String formatInterval(int start, int end) {

        if (start == end) {
            return String.valueOf(start);
        }
        return start + "-" + end;
    }

}

Binary file not shown.

@ -1 +0,0 @@
Subproject commit 9dc6c2337dea32e63aef53271dba0692537c6605

File diff suppressed because one or more lines are too long

@ -1 +0,0 @@
Subproject commit 21fefb64bf27ca2b3329a6c69d90a27450b17930