Compare commits
29 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c9424a5f4b | ||
|
|
e86e6fba2a | ||
|
|
ff9fd7bd44 | ||
|
|
e6a1656e18 | ||
|
|
b42bb29e5e | ||
|
|
de7e58d897 | ||
|
|
0b19f2d04c | ||
|
|
666c247f6a | ||
|
|
7b35b53a54 | ||
|
|
90470b41a0 | ||
|
|
bf7d374b0a | ||
|
|
16ef5afe90 | ||
|
|
1b4ab8dc88 | ||
|
|
e926083881 | ||
|
|
d9677f37f7 | ||
|
|
73d77624e9 | ||
|
|
4b0a06b8ba | ||
|
|
3bbd13daca | ||
|
|
09d8ac4e9c | ||
|
|
4888ac1608 | ||
|
|
72b4e98538 | ||
|
|
c29d39cc38 | ||
|
|
073312702c | ||
|
|
323c5a47b5 | ||
|
|
2a9583318b | ||
|
|
ab4a4619bc | ||
|
|
6b6417ed80 | ||
|
|
2caa3e92a4 | ||
|
|
b5f8c37bcd |
8
.gitignore
vendored
8
.gitignore
vendored
@ -27,3 +27,11 @@
|
||||
**/classpath-data.json
|
||||
**/dependencies-and-licenses-overview.txt
|
||||
git.tag
|
||||
|
||||
gradle.properties
|
||||
gradlew
|
||||
gradlew.bat
|
||||
gradle/
|
||||
|
||||
**/.gradle
|
||||
**/build
|
||||
|
||||
@ -1,4 +1,25 @@
|
||||
variables:
|
||||
# SONAR_PROJECT_KEY: 'com.iqser.red.commons:pdftron-logic-commons'
|
||||
GIT_SUBMODULE_STRATEGY: recursive
|
||||
GIT_SUBMODULE_FORCE_HTTPS: 'true'
|
||||
include:
|
||||
- project: 'gitlab/gitlab'
|
||||
ref: 'main'
|
||||
file: 'ci-templates/maven_deps.yml'
|
||||
file: 'ci-templates/gradle_java.yml'
|
||||
|
||||
|
||||
deploy:
|
||||
stage: deploy
|
||||
tags:
|
||||
- dind
|
||||
script:
|
||||
- echo "Building with gradle version ${BUILDVERSION}"
|
||||
- gradle -Pversion=${BUILDVERSION} publish
|
||||
- echo "BUILDVERSION=$BUILDVERSION" >> version.env
|
||||
artifacts:
|
||||
reports:
|
||||
dotenv: version.env
|
||||
rules:
|
||||
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
|
||||
- if: $CI_COMMIT_BRANCH =~ /^release/
|
||||
- if: $CI_COMMIT_TAG
|
||||
6
.gitmodules
vendored
Normal file
6
.gitmodules
vendored
Normal file
@ -0,0 +1,6 @@
|
||||
[submodule "src/test/resources/files/syngenta"]
|
||||
path = src/test/resources/files/syngenta
|
||||
url = https://gitlab.knecon.com/fforesight/documents/syngenta.git
|
||||
[submodule "src/test/resources/files/basf"]
|
||||
path = src/test/resources/files/basf
|
||||
url = https://gitlab.knecon.com/fforesight/documents/basf.git
|
||||
107
build.gradle.kts
Normal file
107
build.gradle.kts
Normal file
@ -0,0 +1,107 @@
|
||||
plugins {
|
||||
`java-library`
|
||||
`maven-publish`
|
||||
`kotlin-dsl`
|
||||
pmd
|
||||
checkstyle
|
||||
jacoco
|
||||
id("io.freefair.lombok") version "8.4"
|
||||
id("org.sonarqube") version "4.0.0.2929"
|
||||
}
|
||||
|
||||
repositories {
|
||||
mavenLocal()
|
||||
maven {
|
||||
url = uri("https://pdftron.com/maven/release")
|
||||
}
|
||||
maven {
|
||||
url = uri("https://nexus.knecon.com/repository/gindev/");
|
||||
credentials {
|
||||
username = providers.gradleProperty("mavenUser").getOrNull();
|
||||
password = providers.gradleProperty("mavenPassword").getOrNull();
|
||||
}
|
||||
}
|
||||
mavenCentral()
|
||||
}
|
||||
|
||||
dependencies {
|
||||
api("org.projectlombok:lombok:1.18.30")
|
||||
api("com.google.guava:guava:33.0.0-jre")
|
||||
api("com.pdftron:PDFNet:11.0.0")
|
||||
testImplementation("net.sourceforge.lept4j:lept4j:1.19.1")
|
||||
testImplementation("org.junit.jupiter:junit-jupiter:5.10.2")
|
||||
testImplementation("org.assertj:assertj-core:3.24.2")
|
||||
testImplementation("org.mockito:mockito-core:5.2.0")
|
||||
testImplementation("org.apache.logging.log4j:log4j-slf4j2-impl:2.22.1")
|
||||
compileOnly("org.slf4j:slf4j-api:2.0.11")
|
||||
}
|
||||
|
||||
group = "com.iqser.red.commons"
|
||||
description = "pdftron-logic-commons"
|
||||
java.sourceCompatibility = JavaVersion.VERSION_17
|
||||
java.targetCompatibility = JavaVersion.VERSION_17
|
||||
|
||||
publishing {
|
||||
publications {
|
||||
create<MavenPublication>("mavenJava") {
|
||||
from(components["java"])
|
||||
}
|
||||
}
|
||||
repositories {
|
||||
maven {
|
||||
url = uri("https://nexus.knecon.com/repository/red-platform-releases/")
|
||||
credentials {
|
||||
username = providers.gradleProperty("mavenUser").getOrNull();
|
||||
password = providers.gradleProperty("mavenPassword").getOrNull();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tasks.withType<PublishToMavenRepository> {
|
||||
onlyIf { publication.name == "mavenJava" }
|
||||
}
|
||||
|
||||
|
||||
pmd {
|
||||
isConsoleOutput = true
|
||||
}
|
||||
|
||||
// Rule set used when analysing the main source set.
// Set the property on the task itself: inside this block `pmd.ruleSetFiles` would resolve to the
// project-wide `pmd` extension, which every Pmd task shares, so the last configured task would win.
tasks.pmdMain {
    ruleSetFiles = files("${rootDir}/config/pmd/pmd.xml")
}
|
||||
|
||||
// Rule set used when analysing the test source set.
// Set the property on the task itself: inside this block `pmd.ruleSetFiles` would resolve to the
// project-wide `pmd` extension, which every Pmd task shares, so it would leak into other Pmd tasks.
tasks.pmdTest {
    ruleSetFiles = files("${rootDir}/config/pmd/test_pmd.xml")
}
|
||||
|
||||
tasks.named<Test>("test") {
|
||||
useJUnitPlatform()
|
||||
reports {
|
||||
junitXml.outputLocation.set(layout.buildDirectory.dir("reports/junit"))
|
||||
}
|
||||
}
|
||||
|
||||
sonarqube {
|
||||
properties {
|
||||
property("sonar.login", providers.gradleProperty("sonarToken").getOrNull())
|
||||
property("sonar.host.url", "https://sonarqube.knecon.com")
|
||||
}
|
||||
}
|
||||
|
||||
tasks.test {
|
||||
finalizedBy(tasks.jacocoTestReport)
|
||||
}
|
||||
|
||||
tasks.jacocoTestReport {
|
||||
dependsOn(tasks.test)
|
||||
reports {
|
||||
xml.required.set(true)
|
||||
csv.required.set(false)
|
||||
html.outputLocation.set(layout.buildDirectory.dir("jacocoHtml"))
|
||||
}
|
||||
}
|
||||
|
||||
java {
|
||||
withJavadocJar()
|
||||
}
|
||||
39
config/checkstyle/checkstyle.xml
Normal file
39
config/checkstyle/checkstyle.xml
Normal file
@ -0,0 +1,39 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE module PUBLIC "-//Puppy Crawl//DTD Check Configuration 1.3//EN"
|
||||
"http://www.puppycrawl.com/dtds/configuration_1_3.dtd">
|
||||
<module name="Checker">
|
||||
<property
|
||||
name="severity"
|
||||
value="error"/>
|
||||
<module name="TreeWalker">
|
||||
<module name="SuppressWarningsHolder"/>
|
||||
<module name="MissingDeprecated"/>
|
||||
<module name="MissingOverride"/>
|
||||
<module name="AnnotationLocation"/>
|
||||
<module name="JavadocStyle"/>
|
||||
<module name="NonEmptyAtclauseDescription"/>
|
||||
<module name="IllegalImport"/>
|
||||
<module name="RedundantImport"/>
|
||||
<module name="RedundantModifier"/>
|
||||
<module name="EmptyBlock"/>
|
||||
<module name="DefaultComesLast"/>
|
||||
<module name="EmptyStatement"/>
|
||||
<module name="EqualsHashCode"/>
|
||||
<module name="ExplicitInitialization"/>
|
||||
<module name="IllegalInstantiation"/>
|
||||
<module name="ModifiedControlVariable"/>
|
||||
<module name="MultipleVariableDeclarations"/>
|
||||
<module name="PackageDeclaration"/>
|
||||
<module name="ParameterAssignment"/>
|
||||
<module name="SimplifyBooleanExpression"/>
|
||||
<module name="SimplifyBooleanReturn"/>
|
||||
<module name="StringLiteralEquality"/>
|
||||
<module name="OneStatementPerLine"/>
|
||||
<module name="FinalClass"/>
|
||||
<module name="ArrayTypeStyle"/>
|
||||
<module name="UpperEll"/>
|
||||
<module name="OuterTypeFilename"/>
|
||||
</module>
|
||||
<module name="FileTabCharacter"/>
|
||||
<module name="SuppressWarningsFilter"/>
|
||||
</module>
|
||||
20
config/pmd/pmd.xml
Normal file
20
config/pmd/pmd.xml
Normal file
@ -0,0 +1,20 @@
|
||||
<?xml version="1.0"?>
|
||||
<ruleset name="Custom ruleset"
|
||||
xmlns="http://pmd.sourceforge.net/ruleset/2.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://pmd.sourceforge.net/ruleset/2.0.0 http://pmd.sourceforge.net/ruleset_2_0_0.xsd">
|
||||
|
||||
<description>
|
||||
Knecon ruleset checks the code for bad stuff
|
||||
</description>
|
||||
|
||||
<rule ref="category/java/errorprone.xml">
|
||||
<exclude name="MissingSerialVersionUID"/>
|
||||
<exclude name="AvoidLiteralsInIfCondition"/>
|
||||
<exclude name="AvoidDuplicateLiterals"/>
|
||||
<exclude name="NullAssignment"/>
|
||||
<exclude name="AssignmentInOperand"/>
|
||||
<exclude name="BeanMembersShouldSerialize"/>
|
||||
</rule>
|
||||
|
||||
</ruleset>
|
||||
22
config/pmd/test_pmd.xml
Normal file
22
config/pmd/test_pmd.xml
Normal file
@ -0,0 +1,22 @@
|
||||
<?xml version="1.0"?>
|
||||
<ruleset name="Custom ruleset"
|
||||
xmlns="http://pmd.sourceforge.net/ruleset/2.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://pmd.sourceforge.net/ruleset/2.0.0 http://pmd.sourceforge.net/ruleset_2_0_0.xsd">
|
||||
|
||||
<description>
|
||||
Knecon test ruleset checks the code for bad stuff
|
||||
</description>
|
||||
|
||||
|
||||
<rule ref="category/java/errorprone.xml">
|
||||
<exclude name="MissingSerialVersionUID"/>
|
||||
<exclude name="AvoidLiteralsInIfCondition"/>
|
||||
<exclude name="AvoidDuplicateLiterals"/>
|
||||
<exclude name="NullAssignment"/>
|
||||
<exclude name="AssignmentInOperand"/>
|
||||
<exclude name="TestClassWithoutTestCases"/>
|
||||
<exclude name="BeanMembersShouldSerialize"/>
|
||||
</rule>
|
||||
|
||||
</ruleset>
|
||||
1
gradle.properties.kts
Normal file
1
gradle.properties.kts
Normal file
@ -0,0 +1 @@
|
||||
version = 2.0-SNAPSHOT
|
||||
94
pom.xml
94
pom.xml
@ -1,94 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
|
||||
<parent>
|
||||
<artifactId>platform-dependency</artifactId>
|
||||
<groupId>com.iqser.red</groupId>
|
||||
<version>2.2.0</version>
|
||||
</parent>
|
||||
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<properties>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
</properties>
|
||||
|
||||
<artifactId>pdftron-logic-commons</artifactId>
|
||||
<groupId>com.iqser.red.commons</groupId>
|
||||
<version>2.0-SNAPSHOT</version>
|
||||
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>slf4j-api</artifactId>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.logging.log4j</groupId>
|
||||
<artifactId>log4j-slf4j2-impl</artifactId>
|
||||
<version>2.20.0</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.google.guava</groupId>
|
||||
<artifactId>guava</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.pdftron</groupId>
|
||||
<artifactId>PDFNet</artifactId>
|
||||
<version>10.3.0</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<!-- Test Dependencies -->
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<!-- create a test jar for the api classes to be used by other modules -->
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-jar-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<goals>
|
||||
<goal>test-jar</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.jacoco</groupId>
|
||||
<artifactId>jacoco-maven-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>prepare-agent</id>
|
||||
<goals>
|
||||
<goal>prepare-agent</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
<execution>
|
||||
<id>report</id>
|
||||
<goals>
|
||||
<goal>report</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
|
||||
|
||||
<repositories>
|
||||
<repository>
|
||||
<id>pdftron</id>
|
||||
<name>PDFNet Maven</name>
|
||||
<url>https://pdftron.com/maven/release</url>
|
||||
</repository>
|
||||
</repositories>
|
||||
|
||||
</project>
|
||||
1
settings.gradle.kts
Normal file
1
settings.gradle.kts
Normal file
@ -0,0 +1 @@
|
||||
rootProject.name = "pdftron-logic-commons"
|
||||
@ -1,7 +1,5 @@
|
||||
package com.iqser.red.pdftronlogic.commons;
|
||||
|
||||
|
||||
|
||||
import static com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService.TOLERANCE;
|
||||
|
||||
import java.awt.geom.Area;
|
||||
@ -35,6 +33,13 @@ public class ClippingPathStack {
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void intersectClippingPath(Rectangle2D path) {
|
||||
|
||||
getCurrentClippingPath().intersect(new Area(path));
|
||||
}
|
||||
|
||||
|
||||
public boolean almostIntersects(double x, double y, double width, double height) {
|
||||
// To address inconsistencies in the calculation of the bounding box we slightly increase the rectangle
|
||||
// Height or width are zero for straight lines, even though they are being rendered. Therefore, height or width must be at minimum >0.
|
||||
@ -56,15 +61,16 @@ public class ClippingPathStack {
|
||||
public void enterNewGState() {
|
||||
|
||||
Area current = stack.peek();
|
||||
Area cloned = new Area();
|
||||
cloned.add(current);
|
||||
Area cloned = (Area) current.clone();
|
||||
stack.push(cloned);
|
||||
}
|
||||
|
||||
|
||||
public void leaveGState() {
|
||||
|
||||
stack.pop();
|
||||
// somehow this greatly helps memory management
|
||||
var popped = stack.pop();
|
||||
popped.reset();
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -0,0 +1,76 @@
|
||||
package com.iqser.red.pdftronlogic.commons;
|
||||
|
||||
import static com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService.TOLERANCE;
|
||||
|
||||
import java.awt.Shape;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class ComparisonUtils {
|
||||
|
||||
public Rectangle2D shrinkRectangle(Rectangle2D inner) {
|
||||
|
||||
return shrinkRectangle(inner, TOLERANCE);
|
||||
}
|
||||
|
||||
|
||||
public Rectangle2D shrinkRectangle(Rectangle2D rect, double tolerance) {
|
||||
|
||||
double newX = rect.getX() + tolerance;
|
||||
double newY = rect.getY() + tolerance;
|
||||
double newWidth = rect.getWidth() - 2 * tolerance;
|
||||
double newHeight = rect.getHeight() - 2 * tolerance;
|
||||
|
||||
if (newWidth <= 1e-1) {
|
||||
newWidth = 1e-1;
|
||||
newX = rect.getX() + newWidth / 2;
|
||||
}
|
||||
if (newHeight <= 1e-1) {
|
||||
newHeight = 1e-1;
|
||||
newY = rect.getY() + newHeight / 2;
|
||||
}
|
||||
|
||||
return new Rectangle2D.Double(newX, newY, newWidth, newHeight);
|
||||
}
|
||||
|
||||
|
||||
public Rectangle2D padRectangle(Rectangle2D inner) {
|
||||
|
||||
return padRectangle(inner, TOLERANCE);
|
||||
}
|
||||
|
||||
|
||||
public Rectangle2D padRectangle(Rectangle2D rect, double tolerance) {
|
||||
|
||||
double newX = rect.getX() - tolerance;
|
||||
double newY = rect.getY() - tolerance;
|
||||
double newWidth = rect.getWidth() + 2 * tolerance;
|
||||
double newHeight = rect.getHeight() + 2 * tolerance;
|
||||
|
||||
if (newWidth <= 0) {
|
||||
newWidth = 1e-2;
|
||||
}
|
||||
if (newHeight <= 0) {
|
||||
newHeight = 1e-2;
|
||||
}
|
||||
|
||||
return new Rectangle2D.Double(newX, newY, newWidth, newHeight);
|
||||
}
|
||||
|
||||
|
||||
public boolean almostContains(Shape outer, Rectangle2D inner) {
|
||||
|
||||
Rectangle2D innerRect = ComparisonUtils.shrinkRectangle(inner);
|
||||
|
||||
return outer.contains(innerRect);
|
||||
}
|
||||
|
||||
|
||||
public static boolean almostEqual(double a, double b) {
|
||||
|
||||
return Math.abs(a - b) < TOLERANCE;
|
||||
}
|
||||
|
||||
}
|
||||
@ -3,8 +3,11 @@ package com.iqser.red.pdftronlogic.commons;
|
||||
import java.awt.Color;
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.GeneralPath;
|
||||
import java.awt.geom.PathIterator;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import com.google.common.primitives.Bytes;
|
||||
import com.google.common.primitives.Doubles;
|
||||
@ -21,16 +24,18 @@ import lombok.experimental.UtilityClass;
|
||||
@UtilityClass
|
||||
public class Converter {
|
||||
|
||||
public GeneralPath convertToGeneralPath(PathData pathData) throws PDFNetException {
|
||||
public GeneralPath convertToGeneralPath(PathData pathData) {
|
||||
|
||||
GeneralPath linePath = new GeneralPath();
|
||||
Iterator<Double> points = Doubles.asList(pathData.getPoints()).iterator();
|
||||
Iterator<Double> points = Doubles.asList(pathData.getPoints())
|
||||
.iterator();
|
||||
Iterable<Byte> operators = Bytes.asList(pathData.getOperators());
|
||||
for (var operator : operators) {
|
||||
switch (operator) {
|
||||
case PathData.e_moveto -> linePath.moveTo(points.next(), points.next());
|
||||
case PathData.e_lineto -> linePath.lineTo(points.next(), points.next());
|
||||
case PathData.e_cubicto -> linePath.curveTo(points.next(), points.next(), points.next(), points.next(), points.next(), points.next());
|
||||
case PathData.e_conicto -> linePath.quadTo(points.next(), points.next(), points.next(), points.next());
|
||||
case PathData.e_closepath -> linePath.closePath();
|
||||
case PathData.e_rect -> {
|
||||
double x = points.next();
|
||||
@ -43,14 +48,67 @@ public class Converter {
|
||||
linePath.lineTo(x, y + h);
|
||||
linePath.closePath();
|
||||
}
|
||||
default -> throw new PDFNetException("Invalid Element Type", 0, "", "", "");
|
||||
default -> throw new IllegalArgumentException("Invalid Operator Type " + operator);
|
||||
}
|
||||
}
|
||||
return linePath;
|
||||
}
|
||||
|
||||
|
||||
public GeneralPath convertToGeneralPathAndTransformToInitialUserSpace(PathData pathData, Matrix2D ctm) throws PDFNetException{
|
||||
public PathData convertToPathData(GeneralPath linePath) {
|
||||
|
||||
PathIterator pathIterator = linePath.getPathIterator(null);
|
||||
List<Byte> operators = new LinkedList<>();
|
||||
List<Double> points = new LinkedList<>();
|
||||
while (!pathIterator.isDone()) {
|
||||
double[] currentPoints = new double[6];
|
||||
int type = pathIterator.currentSegment(currentPoints);
|
||||
switch (type) {
|
||||
case PathIterator.SEG_MOVETO -> {
|
||||
operators.add((byte) PathData.e_moveto);
|
||||
points.add(currentPoints[0]);
|
||||
points.add(currentPoints[1]);
|
||||
}
|
||||
case PathIterator.SEG_LINETO -> {
|
||||
operators.add((byte) PathData.e_lineto);
|
||||
points.add(currentPoints[0]);
|
||||
points.add(currentPoints[1]);
|
||||
}
|
||||
case PathIterator.SEG_QUADTO -> {
|
||||
operators.add((byte) PathData.e_conicto);
|
||||
points.add(currentPoints[0]);
|
||||
points.add(currentPoints[1]);
|
||||
points.add(currentPoints[2]);
|
||||
points.add(currentPoints[3]);
|
||||
}
|
||||
case PathIterator.SEG_CUBICTO -> {
|
||||
operators.add((byte) PathData.e_cubicto);
|
||||
points.add(currentPoints[0]);
|
||||
points.add(currentPoints[1]);
|
||||
points.add(currentPoints[2]);
|
||||
points.add(currentPoints[3]);
|
||||
points.add(currentPoints[4]);
|
||||
points.add(currentPoints[5]);
|
||||
}
|
||||
case PathIterator.SEG_CLOSE -> {
|
||||
operators.add((byte) PathData.e_closepath);
|
||||
}
|
||||
}
|
||||
}
|
||||
byte[] operatorArr = new byte[operators.size()];
|
||||
for (int i = 0; i < operators.size(); i++) {
|
||||
operatorArr[i] = operators.get(i);
|
||||
}
|
||||
double[] pointArr = new double[points.size()];
|
||||
for (int i = 0; i < points.size(); i++) {
|
||||
pointArr[i] = points.get(i);
|
||||
}
|
||||
return new PathData(true, operatorArr, pointArr);
|
||||
}
|
||||
|
||||
|
||||
public GeneralPath convertToGeneralPathAndTransformToInitialUserSpace(PathData pathData, Matrix2D ctm) throws PDFNetException {
|
||||
|
||||
GeneralPath linePath = Converter.convertToGeneralPath(pathData);
|
||||
|
||||
//transform path to initial user space
|
||||
@ -59,13 +117,13 @@ public class Converter {
|
||||
return linePath;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static Color convertColor(ColorSpace colorSpace, ColorPt colorPt) {
|
||||
|
||||
ColorPt rgbColor = colorSpace.convert2RGB(colorPt);
|
||||
Color color = new Color((float) rgbColor.get(0), (float) rgbColor.get(1), (float) rgbColor.get(2));
|
||||
rgbColor.destroy();
|
||||
return color;
|
||||
try (ColorPt rgbColor = colorSpace.convert2RGB(colorPt)) {
|
||||
return new Color((float) rgbColor.get(0), (float) rgbColor.get(1), (float) rgbColor.get(2));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -75,4 +133,15 @@ public class Converter {
|
||||
return new Rectangle2D.Double(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static AffineTransform toAffineTransform(Matrix2D textMatrix) {
|
||||
|
||||
if (textMatrix == null) {
|
||||
return null;
|
||||
}
|
||||
return new AffineTransform(textMatrix.getA(), textMatrix.getB(), textMatrix.getC(), textMatrix.getD(), textMatrix.getV(), textMatrix.getH());
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,80 +0,0 @@
|
||||
package com.iqser.red.pdftronlogic.commons;
|
||||
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.Element;
|
||||
|
||||
public class ElementFeatureFactory {
|
||||
public static ElementFeatures extractFeatures(Element element) throws PDFNetException {
|
||||
|
||||
return switch (element.getType()) {
|
||||
case Element.e_path -> buildPath(element);
|
||||
case Element.e_text -> buildText(element);
|
||||
case Element.e_image, Element.e_inline_image -> buildImage(element).build();
|
||||
case Element.e_form -> buildForm(element);
|
||||
// This technically should never happen, it's a safetynet
|
||||
default -> throw new RuntimeException("Feature Extraction is not supported for PDFTron.Element with type: " + element.getType());
|
||||
};
|
||||
}
|
||||
|
||||
public static ElementFeatures extractFeaturesWithHash(Element element, String hashObject) throws PDFNetException {
|
||||
return buildImage(element)
|
||||
.hashOfImage(hashObject)
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
private static ElementFeatures.Form buildForm(Element element) throws PDFNetException {
|
||||
|
||||
return ElementFeatures.Form.builder()
|
||||
.elementType(element.getType())
|
||||
.boundingBox(Converter.toRectangle2D(element.getBBox()))
|
||||
.xObjectType(element.getXObject().getType())
|
||||
.dictOrArrayOrStreamLength(element.getXObject().getType() == 7 ? element.getXObject().getDecodedStream().size() : 0)
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
private static ElementFeatures.Image.ImageBuilder<?, ?> buildImage(Element element) throws PDFNetException {
|
||||
|
||||
return ElementFeatures.Image.builder()
|
||||
.elementType(element.getType())
|
||||
.boundingBox(Converter.toRectangle2D(element.getBBox()))
|
||||
.dataSize(element.getImageDataSize())
|
||||
.height(element.getImageHeight())
|
||||
.width(element.getImageWidth())
|
||||
.renderingIntent(element.getImageRenderingIntent())
|
||||
.componentNum(element.getComponentNum())
|
||||
.bitsPerComponent(element.getBitsPerComponent());
|
||||
}
|
||||
|
||||
|
||||
private static ElementFeatures.Text buildText(Element element) throws PDFNetException {
|
||||
|
||||
return ElementFeatures.Text.builder()
|
||||
.elementType(element.getType())
|
||||
.boundingBox(Converter.toRectangle2D(element.getBBox()))
|
||||
.text(element.getTextString())
|
||||
.font(element.getGState().getFont().getType())
|
||||
.fontsize(element.getGState().getFontSize())
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
private static ElementFeatures.Path buildPath(Element element) throws PDFNetException {
|
||||
|
||||
return ElementFeatures.Path.builder()
|
||||
.elementType(element.getType())
|
||||
.boundingBox(Converter.toRectangle2D(element.getBBox()))
|
||||
.isClippingPath(element.isClippingPath())
|
||||
.isClipWindingFill(element.isClipWindingFill())
|
||||
.isStroked(element.isStroked())
|
||||
.isFilled(element.isFilled())
|
||||
.isWindingFill(element.isWindingFill())
|
||||
.fillColor(Converter.convertColor(element.getGState().getFillColorSpace(), element.getGState().getFillColor()))
|
||||
.strokeColor(Converter.convertColor(element.getGState().getStrokeColorSpace(), element.getGState().getStrokeColor()))
|
||||
.linePath(Converter.convertToGeneralPathAndTransformToInitialUserSpace(element.getPathData(), element.getCTM()))
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
@ -1,270 +0,0 @@
|
||||
package com.iqser.red.pdftronlogic.commons;
|
||||
|
||||
import static com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService.TOLERANCE;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.geom.GeneralPath;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.Rect;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Getter;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
@Getter
|
||||
@SuperBuilder
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class ElementFeatures {
|
||||
|
||||
final private static double RECT_POSITION_SIMILARITY_THRESHOLD_FACTOR = 0.2; // specify how much the x and y value are allowed to differ
|
||||
final private static double RECT_SIZE_SIMILARITY_THRESHOLD_FACTOR = 0.1; // the scale the images are allowed to differ
|
||||
final private static double HAMMING_DISTANCE_THRESHOLD = 4; // defines the similarity of the hash of images
|
||||
int elementType;
|
||||
Rectangle2D boundingBox;
|
||||
|
||||
|
||||
public boolean almostMatches(Element element) throws PDFNetException {
|
||||
|
||||
return element.getType() == elementType && //
|
||||
element.getBBox() != null && //
|
||||
rectsAlmostMatch(element.getBBox());
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private boolean rectsAlmostMatch(Rect bBox) {
|
||||
// To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance
|
||||
|
||||
return almostEqual(bBox.getX1(), boundingBox.getX()) && //
|
||||
almostEqual(bBox.getY1(), boundingBox.getY()) && //
|
||||
almostEqual(bBox.getWidth(), boundingBox.getWidth()) && //
|
||||
almostEqual(bBox.getHeight(), boundingBox.getHeight());
|
||||
}
|
||||
|
||||
|
||||
protected boolean almostEqual(double a, double b) {
|
||||
|
||||
return Math.abs(a - b) < TOLERANCE;
|
||||
}
|
||||
|
||||
|
||||
public boolean almostMatches(ElementFeatures elementFeatures) {
|
||||
|
||||
return elementFeatures.getElementType() == elementType && elementFeatures.getBoundingBox() != null && rectsAlmostMatch(elementFeatures.getBoundingBox());
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private boolean rectsAlmostMatch(Rectangle2D bBox) {
|
||||
// To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance
|
||||
|
||||
return almostEqual(bBox.getX(), boundingBox.getX()) && //
|
||||
almostEqual(bBox.getY(), boundingBox.getY()) && //
|
||||
almostEqual(bBox.getWidth(), boundingBox.getWidth()) && //
|
||||
almostEqual(bBox.getHeight(), boundingBox.getHeight());
|
||||
}
|
||||
|
||||
|
||||
public boolean isSimilarTo(ElementFeatures elementFeatures) {
|
||||
|
||||
return elementFeatures.getElementType() == elementType && elementFeatures.getBoundingBox() != null && areRectsSimilar(elementFeatures.getBoundingBox());
|
||||
}
|
||||
|
||||
|
||||
private boolean areRectsSimilar(Rectangle2D rectangle2D) {
|
||||
// To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance
|
||||
|
||||
return isPositionSimilar(rectangle2D.getX(), boundingBox.getX(), rectangle2D.getWidth()) && //
|
||||
isPositionSimilar(rectangle2D.getY(), boundingBox.getY(), rectangle2D.getHeight()) && //
|
||||
isSizeSimilar(rectangle2D.getWidth(), boundingBox.getWidth()) && //
|
||||
isSizeSimilar(rectangle2D.getHeight(), boundingBox.getHeight());
|
||||
}
|
||||
|
||||
|
||||
protected boolean isPositionSimilar(double a, double b, double boxSize) {
|
||||
|
||||
return Math.abs(a - b) < boxSize * RECT_POSITION_SIMILARITY_THRESHOLD_FACTOR;
|
||||
}
|
||||
|
||||
|
||||
protected boolean isSizeSimilar(double a, double b) {
|
||||
|
||||
return Math.abs(a - b) < a * RECT_SIZE_SIMILARITY_THRESHOLD_FACTOR;
|
||||
}
|
||||
|
||||
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@Getter
|
||||
@SuperBuilder
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public static class Text extends ElementFeatures {
|
||||
|
||||
String text;
|
||||
int font;
|
||||
double fontsize;
|
||||
|
||||
|
||||
@Override
|
||||
public boolean almostMatches(Element element) throws PDFNetException {
|
||||
|
||||
return super.almostMatches(element) && //
|
||||
text.equals(element.getTextString()) && //
|
||||
font == element.getGState().getFont().getType() && //
|
||||
almostEqual(fontsize, element.getGState().getFontSize());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@Getter
|
||||
@SuperBuilder
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public static class Path extends ElementFeatures {
|
||||
|
||||
boolean isClippingPath;
|
||||
boolean isClipWindingFill;
|
||||
boolean isStroked;
|
||||
boolean isFilled;
|
||||
boolean isWindingFill;
|
||||
Color strokeColor;
|
||||
Color fillColor;
|
||||
GeneralPath linePath;
|
||||
|
||||
|
||||
@Override
|
||||
public boolean almostMatches(Element element) throws PDFNetException {
|
||||
|
||||
return super.almostMatches(element) && //
|
||||
isClippingPath == element.isClippingPath() && //
|
||||
isClipWindingFill == element.isClipWindingFill() && //
|
||||
isStroked == element.isStroked() && //
|
||||
isFilled == element.isFilled() && //
|
||||
isWindingFill == element.isWindingFill();
|
||||
|
||||
}
|
||||
|
||||
|
||||
public boolean matchesFillColor(Color color) {
|
||||
|
||||
return color.equals(fillColor);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public boolean isBackground(Rect area) {
|
||||
|
||||
return isFilled && //
|
||||
getBoundingBox().intersects(area.getX1(), area.getY1(), area.getWidth(), area.getHeight());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@Getter
|
||||
@SuperBuilder
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public static class Image extends ElementFeatures {
|
||||
|
||||
int dataSize;
|
||||
int height;
|
||||
int width;
|
||||
int renderingIntent;
|
||||
int componentNum;
|
||||
int bitsPerComponent;
|
||||
String hashOfImage;
|
||||
|
||||
|
||||
@Override
|
||||
public boolean almostMatches(Element element) throws PDFNetException {
|
||||
|
||||
return super.almostMatches(element) && //
|
||||
dataSize == element.getImageDataSize() && //
|
||||
height == element.getImageHeight() && //
|
||||
width == element.getImageWidth() && //
|
||||
renderingIntent == element.getImageRenderingIntent() && //
|
||||
componentNum == element.getComponentNum() && //
|
||||
bitsPerComponent == element.getBitsPerComponent();
|
||||
}
|
||||
|
||||
|
||||
public boolean almostMatches(ElementFeatures elementFeatures) {
|
||||
|
||||
if (elementFeatures.getClass() != this.getClass()) {
|
||||
return false;
|
||||
}
|
||||
return super.almostMatches(elementFeatures) && this.dataSize == ((Image) elementFeatures).getDataSize() && this.height == ((Image) elementFeatures).getHeight() && this.width == ((Image) elementFeatures).getWidth() && this.renderingIntent == ((Image) elementFeatures).getRenderingIntent() && this.componentNum == ((Image) elementFeatures).getComponentNum() && this.bitsPerComponent == ((Image) elementFeatures).getBitsPerComponent() && calculateHammingDistance(
|
||||
((Image) elementFeatures).getHashOfImage()) <= HAMMING_DISTANCE_THRESHOLD;
|
||||
}
|
||||
|
||||
|
||||
public boolean isSimilarTo(ElementFeatures elementFeatures) {
|
||||
|
||||
return super.isSimilarTo(elementFeatures) && //
|
||||
calculateHammingDistance(((Image) elementFeatures).getHashOfImage()) <= HAMMING_DISTANCE_THRESHOLD;
|
||||
}
|
||||
|
||||
|
||||
// Helper method to calculate the Hamming distance between two hexadecimal strings
|
||||
private int calculateHammingDistance(String hash2) {
|
||||
|
||||
int distance = 0;
|
||||
int maxLength = Math.max(this.hashOfImage.length(), hash2.length());
|
||||
for (int i = 0; i < maxLength; i++) {
|
||||
char char1 = i < this.hashOfImage.length() ? this.hashOfImage.charAt(i) : '0';
|
||||
char char2 = i < hash2.length() ? hash2.charAt(i) : '0';
|
||||
if (char1 != char2) {
|
||||
distance++;
|
||||
}
|
||||
}
|
||||
return distance;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@Getter
|
||||
@SuperBuilder
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public static class Form extends ElementFeatures {
|
||||
|
||||
int xObjectType;
|
||||
long dictOrArrayOrStreamLength;
|
||||
|
||||
|
||||
@Override
|
||||
public boolean almostMatches(Element element) throws PDFNetException {
|
||||
|
||||
return element.getType() == getElementType() && //
|
||||
element.getBBox() != null && //
|
||||
(super.rectsAlmostMatch(element.getBBox()) || almostRotateMatches(element.getBBox().getRectangle())) && xObjectType == element.getXObject()
|
||||
.getType() && dictOrArrayOrStreamLength == element.getXObject().getDecodedStream().size();
|
||||
}
|
||||
|
||||
|
||||
public boolean almostMatches(ElementFeatures elementFeatures) {
|
||||
|
||||
if (elementFeatures.getClass() != this.getClass()) {
|
||||
return false;
|
||||
}
|
||||
return elementFeatures.getElementType() == getElementType() && elementFeatures.getBoundingBox() != null && (super.rectsAlmostMatch(elementFeatures.getBoundingBox()) || almostRotateMatches(
|
||||
elementFeatures.getBoundingBox()
|
||||
.getBounds2D())) && xObjectType == ((Form) elementFeatures).getXObjectType() && dictOrArrayOrStreamLength == ((Form) elementFeatures).getDictOrArrayOrStreamLength();
|
||||
|
||||
}
|
||||
|
||||
|
||||
private boolean almostRotateMatches(Rectangle2D bBox) {
|
||||
|
||||
return almostEqual(bBox.getWidth(), getBoundingBox().getHeight()) && //
|
||||
almostEqual(bBox.getHeight(), getBoundingBox().getWidth());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -35,17 +35,16 @@ public class ImageHashFactory {
|
||||
@SneakyThrows
|
||||
private byte[] getBytesOfImage(com.pdftron.pdf.Image inputImage) {
|
||||
// 0 because the memory filter determines the size
|
||||
var memFilter = new MemoryFilter(0, false);
|
||||
var filterWriter = new FilterWriter(memFilter);
|
||||
try(var memFilter = new MemoryFilter(0, false);
|
||||
var filterWriter = new FilterWriter(memFilter)) {
|
||||
|
||||
inputImage.export(filterWriter);
|
||||
filterWriter.flushAll();
|
||||
byte[] res = memFilter.getBuffer();
|
||||
inputImage.export(filterWriter);
|
||||
filterWriter.flushAll();
|
||||
byte[] res = memFilter.getBuffer();
|
||||
|
||||
memFilter.flushAll();
|
||||
memFilter.destroy();
|
||||
filterWriter.destroy();
|
||||
return res;
|
||||
memFilter.flushAll();
|
||||
return res;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -1,25 +1,29 @@
|
||||
package com.iqser.red.pdftronlogic.commons;
|
||||
|
||||
import static com.iqser.red.pdftronlogic.commons.VisualizationUtils.drawFeature;
|
||||
import static com.iqser.red.pdftronlogic.commons.VisualizationUtils.drawRect;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.Shape;
|
||||
import java.awt.geom.Area;
|
||||
import java.awt.geom.GeneralPath;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
import java.util.concurrent.atomic.AtomicReference;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.features.ElementFeatureFactory;
|
||||
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
|
||||
import com.iqser.red.pdftronlogic.commons.features.ImageFeatures;
|
||||
import com.iqser.red.pdftronlogic.commons.features.PathFeatures;
|
||||
import com.iqser.red.pdftronlogic.commons.lookup.ElementFeatureLookup;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.ColorPt;
|
||||
import com.pdftron.pdf.ColorSpace;
|
||||
import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.ElementBuilder;
|
||||
import com.pdftron.pdf.ElementReader;
|
||||
import com.pdftron.pdf.ElementWriter;
|
||||
import com.pdftron.pdf.GState;
|
||||
@ -39,6 +43,7 @@ import lombok.extern.slf4j.Slf4j;
|
||||
public class InvisibleElementRemovalService {
|
||||
|
||||
static public final double TOLERANCE = 1;
|
||||
public static final String KNECON_OCR = "KNECON_OCR";
|
||||
|
||||
|
||||
/**
|
||||
@ -46,79 +51,140 @@ public class InvisibleElementRemovalService {
|
||||
* handled cases:
|
||||
* -Text which is transparent or is set to not render
|
||||
* -Elements outside of clipping path
|
||||
* -Elements outside of Form XObjects
|
||||
* -Elements that have been painted over by visible and filled Paths
|
||||
* -Elements with the same color as background
|
||||
* unhandled cases:
|
||||
* -Elements covered by widely stroked path
|
||||
* -Any Text set to clipping with its many interactions with other elements
|
||||
*
|
||||
* @param pdfFile The PDF file to process
|
||||
* @param delta If this flag is set only the removed Elements will be written to the output file.
|
||||
* The Elements are red if they are removed by clipping path, blue for transparency, and a green bounding box for overlap.
|
||||
* @param out OutputStream to write the resulting file to
|
||||
* @param pdfFile The PDF file to process
|
||||
* @param removePaths If this flag is set, invisible path elements will be removed
|
||||
* @param delta If this flag is set only the removed Elements will be written to the output file.
|
||||
* The Elements are red if they are removed by clipping path, blue for transparency, and a green bounding box for overlap.
|
||||
* @param out OutputStream to write the resulting file to
|
||||
**/
|
||||
@SneakyThrows
|
||||
public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta) {
|
||||
public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta, boolean removePaths, Set<String> markedContentToIgnore) {
|
||||
|
||||
PDFDoc pdfDoc = new PDFDoc(pdfFile);
|
||||
|
||||
execute(pdfDoc, delta);
|
||||
|
||||
try {
|
||||
try (pdfDoc) {
|
||||
execute(pdfDoc, delta, removePaths, markedContentToIgnore);
|
||||
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
|
||||
} catch (Exception e) {
|
||||
log.error("File could not be saved after invisible element removal");
|
||||
throw new RuntimeException(e);
|
||||
} finally {
|
||||
pdfDoc.close();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* This method is similar to {@link #removeInvisibleElements(InputStream, OutputStream, boolean)}, just with a PDFDoc.
|
||||
* This method is equal to {@link #removeInvisibleElements(InputStream, OutputStream, boolean, boolean, Set)}, with removePaths == true and markedContentsToIgnore == emptySet().
|
||||
*/
|
||||
@SneakyThrows
|
||||
public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta) {
|
||||
|
||||
removeInvisibleElements(pdfFile, out, delta, true, Collections.emptySet());
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* This method is equal to {@link #removeInvisibleElements(InputStream, OutputStream, boolean, boolean, Set)}, with removePaths == true and markedContentsToIgnore = Set.of("KNECON_OCR").
|
||||
*/
|
||||
public void removeInvisibleElementsButKeepOcrText(InputStream pdfFile, OutputStream out, boolean delta) {
|
||||
|
||||
removeInvisibleElements(pdfFile, out, delta, true, Set.of(KNECON_OCR));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* This method is equal to {@link #removeInvisibleElements(PDFDoc, boolean, boolean, Set)}, with removePaths == true and markedContentToIgnore = Set.of("KNECON_OCR").
|
||||
*/
|
||||
public void removeInvisibleElementsButKeepOcrText(PDFDoc pdfFile, boolean delta) {
|
||||
|
||||
removeInvisibleElements(pdfFile, delta, true, Set.of(KNECON_OCR));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* This method is equal to {@link #removeInvisibleElements(InputStream, OutputStream, boolean, boolean, Set)}, with markedContentsToIgnore == emptySet().
|
||||
*/
|
||||
@SneakyThrows
|
||||
public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta, boolean removePaths) {
|
||||
|
||||
removeInvisibleElements(pdfFile, out, delta, removePaths, Collections.emptySet());
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* This method is similar to {@link #removeInvisibleElements(InputStream, OutputStream, boolean, boolean, Set)}, just with a PDFDoc.
|
||||
*/
|
||||
@SneakyThrows
|
||||
public void removeInvisibleElements(PDFDoc pdfDoc, boolean delta, boolean removePaths, Set<String> markedContentToIgnore) {
|
||||
|
||||
execute(pdfDoc, delta, removePaths, markedContentToIgnore);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* This method is similar to {@link #removeInvisibleElements(InputStream, OutputStream, boolean, boolean)}, just with a PDFDoc.
|
||||
*/
|
||||
@SneakyThrows
|
||||
public void removeInvisibleElements(PDFDoc pdfDoc, boolean delta, boolean removePaths) {
|
||||
|
||||
execute(pdfDoc, delta, removePaths, Collections.emptySet());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* This method is equal to {@link #removeInvisibleElements(InputStream, OutputStream, boolean)}, just with a PDFDoc.
|
||||
*/
|
||||
@SneakyThrows
|
||||
public void removeInvisibleElements(PDFDoc pdfDoc, boolean delta) {
|
||||
|
||||
execute(pdfDoc, delta);
|
||||
execute(pdfDoc, delta, true, Collections.emptySet());
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void execute(PDFDoc pdfDoc, boolean delta) {
|
||||
private void execute(PDFDoc pdfDoc, boolean delta, boolean removePaths, Set<String> markedContentToIgnore) {
|
||||
|
||||
log.info("Start removing invisible Elements");
|
||||
ElementWriter writer = new ElementWriter();
|
||||
ElementReader reader = new ElementReader();
|
||||
Set<Long> visitedXObjIds = new TreeSet<>();
|
||||
try (PageIterator iterator = pdfDoc.getPageIterator(); ElementWriter writer = new ElementWriter(); ElementReader reader = new ElementReader()) {
|
||||
Set<Long> visitedXObjIds = new TreeSet<>();
|
||||
|
||||
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
|
||||
while (iterator.hasNext()) {
|
||||
|
||||
Page page = iterator.next();
|
||||
Page page = iterator.next();
|
||||
|
||||
visitedXObjIds.add(page.getSDFObj().getObjNum());
|
||||
visitedXObjIds.add(page.getSDFObj().getObjNum());
|
||||
|
||||
InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder()
|
||||
.reader(reader)
|
||||
.clippingPathStack(new ClippingPathStack(page.getMediaBox()))
|
||||
.delta(delta)
|
||||
.overlappedElements(new ArrayList<>())
|
||||
.visibleElements(new ArrayList<>())
|
||||
.visitedXObjIds(visitedXObjIds)
|
||||
.build();
|
||||
try (InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder()
|
||||
.reader(reader)
|
||||
.clippingPathStack(new ClippingPathStack(page.getMediaBox()))
|
||||
.markedContentStack(new MarkedContentStack(pdfDoc))
|
||||
.removePaths(removePaths)
|
||||
.delta(delta)
|
||||
.overlappedElements(new ElementFeatureLookup())
|
||||
.visibleElements(new ElementFeatureLookup())
|
||||
.visitedXObjIds(visitedXObjIds)
|
||||
.markedContentToIgnore(markedContentToIgnore)
|
||||
.build()) {
|
||||
|
||||
removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context);
|
||||
removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context);
|
||||
|
||||
context.visitedXObjIds().clear();
|
||||
context.visitedXObjIds().clear();
|
||||
context.markedContentStack().clear();
|
||||
|
||||
removeOverlappedElements(page, writer, context);
|
||||
removeOverlappedElements(page, writer, context);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
writer.destroy();
|
||||
reader.destroy();
|
||||
|
||||
log.info("Finished removing invisible Elements");
|
||||
}
|
||||
|
||||
@ -128,6 +194,7 @@ public class InvisibleElementRemovalService {
|
||||
InvisibleElementRemovalContext context) throws PDFNetException {
|
||||
|
||||
context.reader().begin(page);
|
||||
context.markedContentStack().clear();
|
||||
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
|
||||
processElements(writer, context);
|
||||
writer.end();
|
||||
@ -137,7 +204,13 @@ public class InvisibleElementRemovalService {
|
||||
|
||||
private void processElements(ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
||||
|
||||
for (Element element = context.reader().next(); element != null; element = context.reader().next())
|
||||
for (Element element = context.reader().next(); element != null; element = context.reader().next()) {
|
||||
|
||||
if (context.markedContentStack().currentMarkedContentContainsAny(context.markedContentToIgnore()) && element.getType() != Element.e_marked_content_end) {
|
||||
writer.writeElement(element);
|
||||
continue;
|
||||
}
|
||||
|
||||
switch (element.getType()) {
|
||||
case Element.e_image, Element.e_inline_image -> processImages(element, writer, context);
|
||||
case Element.e_text -> processText(element, writer, context);
|
||||
@ -151,75 +224,94 @@ public class InvisibleElementRemovalService {
|
||||
context.clippingPathStack().leaveGState();
|
||||
writer.writeElement(element);
|
||||
}
|
||||
case Element.e_marked_content_begin -> {
|
||||
context.markedContentStack().enterMarkedContent(element.getMCTag().getName());
|
||||
writer.writeElement(element);
|
||||
}
|
||||
case Element.e_marked_content_end -> {
|
||||
context.markedContentStack().leaveMarkedContent();
|
||||
writer.writeElement(element);
|
||||
}
|
||||
default -> writer.writeElement(element);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void processImages(Element imageElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
||||
|
||||
Rect rect = imageElement.getBBox();
|
||||
try (Rect rect = imageElement.getBBox()) {
|
||||
|
||||
if (rect == null) {
|
||||
return;
|
||||
}
|
||||
if (rect == null) {
|
||||
return;
|
||||
}
|
||||
|
||||
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
|
||||
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
|
||||
if (inClippingPath) {
|
||||
ImageFeatures imageFeatures = ElementFeatureFactory.buildImage(imageElement);
|
||||
if (!(context.markedContentStack.contextHasTransparency() || imageFeatures.isTransparent() || imageFeatures.isImageMask() || imageFeatures.isSoftMask())) {
|
||||
calculateOverlaps(context, imageFeatures, imageFeatures.isMasked());
|
||||
}
|
||||
context.visibleElements().add(imageFeatures);
|
||||
}
|
||||
|
||||
if (!context.delta() && inClippingPath) {
|
||||
context.visibleElements().add(ElementFeatureFactory.extractFeatures(imageElement));
|
||||
}
|
||||
|
||||
if (context.delta() ^ inClippingPath) {
|
||||
writer.writeElement(imageElement);
|
||||
if (context.delta() ^ inClippingPath) {
|
||||
writer.writeElement(imageElement);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void processText(Element textElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
||||
|
||||
Rect textBBox = textElement.getBBox();
|
||||
try (Rect textBBox = textElement.getBBox()) {
|
||||
|
||||
if (textBBox == null) {
|
||||
writer.writeElement(textElement);
|
||||
return;
|
||||
}
|
||||
|
||||
GState gState = textElement.getGState();
|
||||
|
||||
boolean inClippingPath = context.clippingPathStack().almostIntersects(textBBox.getX1(), textBBox.getY1(), textBBox.getWidth(), textBBox.getHeight());
|
||||
|
||||
boolean isTextVisible = isTextRenderedVisibly(gState, textBBox, context);
|
||||
|
||||
if (inClippingPath && isTextVisible) {
|
||||
context.visibleElements().add(ElementFeatureFactory.extractFeatures(textElement));
|
||||
}
|
||||
if (!context.delta()) {
|
||||
if (inClippingPath && isTextVisible) {
|
||||
if (textBBox == null) {
|
||||
writer.writeElement(textElement);
|
||||
} else if (textElement.hasTextMatrix()) {
|
||||
return;
|
||||
}
|
||||
|
||||
GState gState = textElement.getGState();
|
||||
|
||||
boolean inClippingPath = context.clippingPathStack().almostIntersects(textBBox.getX1(), textBBox.getY1(), textBBox.getWidth(), textBBox.getHeight());
|
||||
|
||||
boolean isTextVisible = isTextRenderedVisibly(gState, textBBox, context);
|
||||
|
||||
if (inClippingPath && isTextVisible) {
|
||||
context.visibleElements().add(ElementFeatureFactory.buildText(textElement, true, context.delta()));
|
||||
}
|
||||
if (!context.delta()) {
|
||||
if (inClippingPath && isTextVisible) {
|
||||
writer.writeElement(textElement);
|
||||
} else if (textElement.hasTextMatrix()) {
|
||||
/*
|
||||
PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element.
|
||||
hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands.
|
||||
Therefore, the position of a following Tj is affected by not writing the first Element.
|
||||
This is why, we write only the Tm command:
|
||||
*/
|
||||
writer.writeGStateChanges(textElement);
|
||||
}
|
||||
} else {
|
||||
if (!inClippingPath) {
|
||||
gState.setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||
// red for elements removed by clipping path
|
||||
gState.setFillColor(new ColorPt(1, 0, 0));
|
||||
writer.writeElement(textElement);
|
||||
}
|
||||
if (!isTextVisible) {
|
||||
gState.setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||
// blue for elements removed due to transparency or not rendered or same color as background
|
||||
gState.setFillColor(new ColorPt(0, 0, 1));
|
||||
gState.setTextRenderMode(GState.e_fill_text);
|
||||
gState.setFillOpacity(1);
|
||||
writer.writeElement(textElement);
|
||||
textElement.setTextData(new byte[]{});
|
||||
writer.writeElement(textElement);
|
||||
}
|
||||
} else {
|
||||
if (!inClippingPath) {
|
||||
gState.setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||
// red for elements removed by clipping path
|
||||
try (var color = new ColorPt(1, 0, 0)) {
|
||||
gState.setFillColor(color);
|
||||
}
|
||||
writer.writeElement(textElement);
|
||||
}
|
||||
if (!isTextVisible) {
|
||||
gState.setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||
// blue for elements removed due to transparency or not rendered or same color as background
|
||||
try (var color = new ColorPt(0, 0, 1)) {
|
||||
gState.setFillColor(color);
|
||||
}
|
||||
gState.setTextRenderMode(GState.e_fill_text);
|
||||
gState.setFillOpacity(1);
|
||||
writer.writeElement(textElement);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -232,19 +324,27 @@ public class InvisibleElementRemovalService {
|
||||
|
||||
if (!context.visitedXObjIds().contains(formObj.getObjNum())) {
|
||||
context.visitedXObjIds().add(formObj.getObjNum());
|
||||
|
||||
// writer needs to be newly initialized when entering a new content stream
|
||||
// see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
|
||||
ElementWriter formWriter = new ElementWriter();
|
||||
context.reader().formBegin();
|
||||
formWriter.begin(formObj);
|
||||
try (ElementWriter formWriter = new ElementWriter()) {
|
||||
context.markedContentStack.enterForm(formElement);
|
||||
context.clippingPathStack().enterNewGState();
|
||||
try (var formElementBBOX = formElement.getBBox()) {
|
||||
context.clippingPathStack().intersectClippingPath(Converter.toRectangle2D(formElementBBOX));
|
||||
context.reader().formBegin();
|
||||
formWriter.begin(formObj);
|
||||
|
||||
context.reader().clearChangeList();
|
||||
formWriter.setDefaultGState(context.reader());
|
||||
context.reader().clearChangeList();
|
||||
formWriter.setDefaultGState(context.reader());
|
||||
|
||||
processElements(formWriter, context);
|
||||
formWriter.end();
|
||||
formWriter.destroy();
|
||||
context.reader().end();
|
||||
processElements(formWriter, context);
|
||||
formWriter.end();
|
||||
context.reader().end();
|
||||
context.clippingPathStack().leaveGState();
|
||||
context.markedContentStack.leaveForm();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -253,67 +353,78 @@ public class InvisibleElementRemovalService {
|
||||
|
||||
PathData pathData = pathElement.getPathData();
|
||||
|
||||
if (pathData.getOperators().length == 0 && pathData.getPoints().length == 0 || pathElement.getBBox() == null) {
|
||||
writer.writeElement(pathElement);
|
||||
return;
|
||||
}
|
||||
|
||||
GeneralPath linePath = Converter.convertToGeneralPathAndTransformToInitialUserSpace(pathData, pathElement.getCTM());
|
||||
|
||||
var rect = linePath.getBounds2D();
|
||||
|
||||
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX(), rect.getY(), rect.getWidth(), rect.getHeight());
|
||||
|
||||
if (pathElement.isClippingPath()) {
|
||||
if (pathElement.isClipWindingFill()) {
|
||||
linePath.setWindingRule(GeneralPath.WIND_NON_ZERO);
|
||||
} else {
|
||||
linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD);
|
||||
try (var bbox = pathElement.getBBox()) {
|
||||
if (pathData.getOperators().length == 0 && pathData.getPoints().length == 0 || bbox == null) {
|
||||
writer.writeElement(pathElement);
|
||||
return;
|
||||
}
|
||||
|
||||
context.clippingPathStack().intersectClippingPath(linePath);
|
||||
pathElement.setPathClip(!context.delta());
|
||||
writer.writeElement(pathElement);
|
||||
PathFeatures pathFeatures = ElementFeatureFactory.buildPath(pathElement);
|
||||
GeneralPath linePath = pathFeatures.getLinePath();
|
||||
|
||||
} else {
|
||||
if (pathElement.isWindingFill()) {
|
||||
linePath.setWindingRule(GeneralPath.WIND_NON_ZERO);
|
||||
} else {
|
||||
linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD);
|
||||
}
|
||||
var rect = linePath.getBounds2D();
|
||||
|
||||
if (inClippingPath) {
|
||||
if (isFilledAndNonTransparent(pathElement)) {
|
||||
List<ElementFeatures> currentOverlappedElements = context.visibleElements()
|
||||
.stream()
|
||||
.filter(features -> almostContains(linePath, features.getBoundingBox()))
|
||||
.toList();
|
||||
context.overlappedElements().addAll(currentOverlappedElements);
|
||||
context.visibleElements().removeAll(currentOverlappedElements);
|
||||
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX(), rect.getY(), rect.getWidth(), rect.getHeight());
|
||||
|
||||
if (pathElement.isClippingPath()) {
|
||||
if (pathElement.isClipWindingFill()) {
|
||||
linePath.setWindingRule(GeneralPath.WIND_NON_ZERO);
|
||||
} else {
|
||||
linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD);
|
||||
}
|
||||
context.visibleElements().add(ElementFeatureFactory.extractFeatures(pathElement));
|
||||
if (!context.delta()) {
|
||||
|
||||
context.clippingPathStack().intersectClippingPath(linePath);
|
||||
pathElement.setPathClip(!context.delta());
|
||||
writer.writeElement(pathElement);
|
||||
|
||||
} else {
|
||||
if (pathElement.isWindingFill()) {
|
||||
linePath.setWindingRule(GeneralPath.WIND_NON_ZERO);
|
||||
} else {
|
||||
linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD);
|
||||
}
|
||||
|
||||
if (inClippingPath) {
|
||||
if (!context.markedContentStack.contextHasTransparency() && isFilledAndNonTransparent(pathElement)) {
|
||||
calculateOverlaps(context, pathFeatures, false);
|
||||
}
|
||||
context.visibleElements().add(ElementFeatureFactory.extractFeatures(pathElement));
|
||||
}
|
||||
|
||||
if (!context.delta() && (inClippingPath || !context.removePaths())) {
|
||||
writer.writeElement(pathElement);
|
||||
}
|
||||
}
|
||||
if (context.delta() && !inClippingPath) {
|
||||
pathElement.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||
pathElement.getGState().setFillColor(new ColorPt(1, 0, 0));
|
||||
pathElement.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
||||
pathElement.getGState().setStrokeColor(new ColorPt(1, 0, 0));
|
||||
writer.writeElement(pathElement);
|
||||
|
||||
if (context.delta() && !inClippingPath && context.removePaths()) {
|
||||
try (var color = new ColorPt(1, 0, 0)) {
|
||||
pathElement.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||
pathElement.getGState().setFillColor(color);
|
||||
pathElement.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
||||
pathElement.getGState().setStrokeColor(color);
|
||||
writer.writeElement(pathElement);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void calculateOverlaps(InvisibleElementRemovalContext context, ElementFeatures elementFeatures, boolean textOnly) {
|
||||
|
||||
List<ElementFeatures> currentOverlappedElements = context.visibleElements().findOverlapped(elementFeatures, textOnly);
|
||||
context.overlappedElements().addAll(currentOverlappedElements);
|
||||
context.visibleElements().removeAll(currentOverlappedElements);
|
||||
}
|
||||
|
||||
|
||||
private void removeOverlappedElements(Page page, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
||||
|
||||
context.reader().begin(page);
|
||||
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
|
||||
if (context.delta()) {
|
||||
// green for element removed due to overlapping
|
||||
context.overlappedElements().forEach(feature -> drawBBox(writer, feature.getBoundingBox(), "#00FF00"));
|
||||
context.overlappedElements()
|
||||
.forEach(feature -> drawFeature(writer, feature, Color.GREEN));
|
||||
context.overlappedElements().clear();
|
||||
}
|
||||
processOverlappedElements(writer, context);
|
||||
@ -321,7 +432,7 @@ public class InvisibleElementRemovalService {
|
||||
context.reader().end();
|
||||
|
||||
if (!context.overlappedElements().isEmpty()) {
|
||||
log.warn(context.overlappedElements().size() + " overlapped elements have not been found or removed");
|
||||
log.debug(context.overlappedElements().size() + " overlapped elements have not been found or removed");
|
||||
}
|
||||
}
|
||||
|
||||
@ -329,35 +440,66 @@ public class InvisibleElementRemovalService {
|
||||
private void processOverlappedElements(ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
||||
|
||||
for (Element element = context.reader().next(); element != null; element = context.reader().next()) {
|
||||
|
||||
if (context.markedContentStack().currentMarkedContentContainsAny(context.markedContentToIgnore()) && element.getType() != Element.e_marked_content_end) {
|
||||
writer.writeElement(element);
|
||||
continue;
|
||||
}
|
||||
|
||||
switch (element.getType()) {
|
||||
case Element.e_form -> processFormOverlappedElements(writer, element, context);
|
||||
case Element.e_path, Element.e_image, Element.e_inline_image, Element.e_text -> {
|
||||
boolean anyMatch = false;
|
||||
for (ElementFeatures elementToRemove : context.overlappedElements()) {
|
||||
if (elementToRemove.almostMatches(element)) {
|
||||
context.overlappedElements().remove(elementToRemove);
|
||||
anyMatch = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!anyMatch) {
|
||||
case Element.e_image, Element.e_inline_image, Element.e_text -> removeOverlappedElement(writer, context, element);
|
||||
case Element.e_path -> {
|
||||
if (context.removePaths()) {
|
||||
removeOverlappedElement(writer, context, element);
|
||||
} else {
|
||||
writer.writeElement(element);
|
||||
} else if (element.getType() == 3 && element.hasTextMatrix()) {
|
||||
/*
|
||||
PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element.
|
||||
hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands.
|
||||
Therefore, the position of a following Tj is affected by not writing the first Element.
|
||||
This is why, we write only the Tm command:
|
||||
*/
|
||||
writer.writeGStateChanges(element);
|
||||
}
|
||||
}
|
||||
case Element.e_marked_content_begin -> {
|
||||
context.markedContentStack().enterMarkedContent(element.getMCTag().getName());
|
||||
writer.writeElement(element);
|
||||
}
|
||||
case Element.e_marked_content_end -> {
|
||||
context.markedContentStack().leaveMarkedContent();
|
||||
writer.writeElement(element);
|
||||
}
|
||||
default -> writer.writeElement(element);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static void removeOverlappedElement(ElementWriter writer, InvisibleElementRemovalContext context, Element element) throws PDFNetException {
|
||||
|
||||
try (Rect bbox = element.getBBox()) {
|
||||
if (bbox == null) {
|
||||
writer.writeElement(element);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
Optional<ElementFeatures> optionalElementMatch = context.overlappedElements()
|
||||
.anyMatch(ElementFeatureFactory.extractFeatures(element));
|
||||
|
||||
if (optionalElementMatch.isPresent()) {
|
||||
context.overlappedElements().remove(optionalElementMatch.get());
|
||||
if (element.getType() == 3 && element.hasTextMatrix()) {
|
||||
/*
|
||||
PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element.
|
||||
hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands.
|
||||
Therefore, the position of a following Tj is affected by not writing the first Element.
|
||||
This is why, we write only the Tm command:
|
||||
*/
|
||||
element.setTextData(new byte[]{});
|
||||
writer.writeElement(element);
|
||||
}
|
||||
} else {
|
||||
writer.writeElement(element);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void processFormOverlappedElements(ElementWriter writer, Element formElement, InvisibleElementRemovalContext context) throws PDFNetException {
|
||||
|
||||
writer.writeElement(formElement);
|
||||
@ -367,17 +509,17 @@ public class InvisibleElementRemovalService {
|
||||
context.visitedXObjIds().add(formObj.getObjNum());
|
||||
// writer needs to be newly initialized when entering a new content stream
|
||||
// see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
|
||||
ElementWriter formWriter = new ElementWriter();
|
||||
context.reader().formBegin();
|
||||
formWriter.begin(formObj);
|
||||
try (ElementWriter formWriter = new ElementWriter()) {
|
||||
context.reader().formBegin();
|
||||
formWriter.begin(formObj);
|
||||
|
||||
context.reader().clearChangeList();
|
||||
formWriter.setDefaultGState(context.reader());
|
||||
context.reader().clearChangeList();
|
||||
formWriter.setDefaultGState(context.reader());
|
||||
|
||||
processOverlappedElements(formWriter, context);
|
||||
formWriter.end();
|
||||
formWriter.destroy();
|
||||
context.reader().end();
|
||||
processOverlappedElements(formWriter, context);
|
||||
formWriter.end();
|
||||
context.reader().end();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -396,102 +538,91 @@ public class InvisibleElementRemovalService {
|
||||
private boolean strokeIsVisible(GState gState, Rect textBBox, InvisibleElementRemovalContext context) throws PDFNetException {
|
||||
|
||||
return gState.getStrokeOpacity() != 0 && differentColorThanBackgroundColor(Converter.convertColor(gState.getStrokeColorSpace(), gState.getStrokeColor()),
|
||||
textBBox,
|
||||
context);
|
||||
textBBox,
|
||||
context);
|
||||
}
|
||||
|
||||
|
||||
private boolean fillIsVisible(GState gState, Rect textBBox, InvisibleElementRemovalContext context) throws PDFNetException {
|
||||
|
||||
return gState.getFillOpacity() != 0 && differentColorThanBackgroundColor(Converter.convertColor(gState.getFillColorSpace(), gState.getFillColor()), textBBox, context);
|
||||
try (var color = gState.getFillColor()) {
|
||||
return gState.getFillOpacity() != 0 && differentColorThanBackgroundColor(Converter.convertColor(gState.getFillColorSpace(), color), textBBox, context);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private boolean differentColorThanBackgroundColor(Color fillColor, Rect textBBox, InvisibleElementRemovalContext context) {
|
||||
|
||||
List<ElementFeatures.Path> backgroundElements = findVisiblePathElementsThatIntersect(textBBox, context);
|
||||
List<PathFeatures> backgroundElements = findVisiblePathElementsThatIntersect(textBBox, context);
|
||||
|
||||
if (backgroundElements.isEmpty()) {
|
||||
return !fillColor.equals(Color.WHITE);
|
||||
}
|
||||
|
||||
List<ElementFeatures.Path> pathElementsByColor = backgroundElements.stream().filter(path -> path.getFillColor().equals(fillColor)).toList();
|
||||
List<PathFeatures> pathElementsByColor = backgroundElements.stream()
|
||||
.filter(path -> path.getFillColor().equals(fillColor))
|
||||
.toList();
|
||||
if (pathElementsByColor.isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
Area backgroundArea = mergeLinePathsToArea(pathElementsByColor);
|
||||
return !almostContains(backgroundArea, Converter.toRectangle2D(textBBox));
|
||||
return !ComparisonUtils.almostContains(backgroundArea, Converter.toRectangle2D(textBBox));
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static List<ElementFeatures.Path> findVisiblePathElementsThatIntersect(Rect textBBox, InvisibleElementRemovalContext context) {
|
||||
private static List<PathFeatures> findVisiblePathElementsThatIntersect(Rect textBBox, InvisibleElementRemovalContext context) {
|
||||
|
||||
var result = new ArrayList<PathFeatures>();
|
||||
context.visibleElements().findIntersecting(textBBox)
|
||||
.forEach(element -> {
|
||||
if (element instanceof PathFeatures pathFeatures && !pathFeatures.getFillColor().equals(Color.WHITE) && pathFeatures.isFilled()) {
|
||||
result.add(pathFeatures);
|
||||
}
|
||||
});
|
||||
return result;
|
||||
|
||||
return context.visibleElements()
|
||||
.stream()
|
||||
.filter(elementFeatures -> elementFeatures.getElementType() == Element.e_path)
|
||||
.map(elementFeatures -> (ElementFeatures.Path) elementFeatures)
|
||||
.filter(elementFeatures -> !elementFeatures.getFillColor().equals(Color.WHITE))
|
||||
.filter(element -> element.isBackground(textBBox))
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
private static Area mergeLinePathsToArea(List<ElementFeatures.Path> pathElementsWithSameColor) {
|
||||
private static Area mergeLinePathsToArea(List<PathFeatures> pathElementsWithSameColor) {
|
||||
|
||||
Area backgroundArea = new Area();
|
||||
pathElementsWithSameColor.stream().map(ElementFeatures.Path::getLinePath).map(Area::new).forEach(backgroundArea::add);
|
||||
pathElementsWithSameColor.stream()
|
||||
.map(PathFeatures::getLinePath)
|
||||
.map(Area::new)
|
||||
.forEach(backgroundArea::add);
|
||||
return backgroundArea;
|
||||
}
|
||||
|
||||
|
||||
private boolean almostContains(Shape outer, Rectangle2D inner) {
|
||||
|
||||
//To address inconsistencies in the calculation of the bounding box we slightly shrink the inner rectangle
|
||||
double x_with_tolerance = inner.getX() >= 0 ? inner.getX() + TOLERANCE : inner.getX() - TOLERANCE;
|
||||
double y_with_tolerance = inner.getY() >= 0 ? inner.getY() + TOLERANCE : inner.getY() - TOLERANCE;
|
||||
double height_with_tolerance = inner.getHeight() - (2 * TOLERANCE);
|
||||
double width_with_tolerance = inner.getWidth() - (2 * TOLERANCE);
|
||||
Rectangle2D innerRect = new Rectangle2D.Double(x_with_tolerance, y_with_tolerance, width_with_tolerance, height_with_tolerance);
|
||||
|
||||
return outer.contains(innerRect);
|
||||
}
|
||||
|
||||
|
||||
private boolean isFilledAndNonTransparent(Element element) throws PDFNetException {
|
||||
|
||||
return element.isFilled() && element.getGState().getFillOpacity() == 1;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void drawBBox(ElementWriter writer, Rectangle2D r, String hexcolor) {
|
||||
|
||||
ColorPt colorPt = new ColorPt(Integer.valueOf(hexcolor.substring(1, 3), 16) / 255d,
|
||||
Integer.valueOf(hexcolor.substring(3, 5), 16) / 255d,
|
||||
Integer.valueOf(hexcolor.substring(5, 7), 16) / 255d);
|
||||
ElementBuilder eb = new ElementBuilder();
|
||||
Element rect = eb.createRect(r.getX(), r.getY(), r.getWidth(), r.getHeight());
|
||||
rect.setPathStroke(true);
|
||||
rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
||||
rect.getGState().setStrokeColor(colorPt);
|
||||
writer.writePlacedElement(rect);
|
||||
|
||||
colorPt.destroy();
|
||||
eb.destroy();
|
||||
}
|
||||
|
||||
|
||||
@Builder
|
||||
private record InvisibleElementRemovalContext(
|
||||
boolean removePaths,
|
||||
boolean delta,
|
||||
ElementReader reader,
|
||||
ClippingPathStack clippingPathStack,
|
||||
List<ElementFeatures> overlappedElements,
|
||||
List<ElementFeatures> visibleElements,
|
||||
Set<Long> visitedXObjIds) {
|
||||
MarkedContentStack markedContentStack,
|
||||
ElementFeatureLookup overlappedElements,
|
||||
ElementFeatureLookup visibleElements,
|
||||
Set<Long> visitedXObjIds,
|
||||
Set<String> markedContentToIgnore
|
||||
) implements AutoCloseable {
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
|
||||
overlappedElements.close();
|
||||
visibleElements.close();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -0,0 +1,122 @@
|
||||
package com.iqser.red.pdftronlogic.commons;
|
||||
|
||||
import java.util.Deque;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
|
||||
import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.ocg.Group;
|
||||
import com.pdftron.sdf.Obj;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@RequiredArgsConstructor
|
||||
public class MarkedContentStack {
|
||||
|
||||
private final PDFDoc pdfDoc;
|
||||
Deque<MarkedContent> stack = new LinkedList<>();
|
||||
Deque<Form> formStack = new LinkedList<>();
|
||||
|
||||
|
||||
public void enterMarkedContent(String name) {
|
||||
|
||||
stack.push(new MarkedContent(name, name.startsWith("OC")));
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void enterForm(Element formElement) {
|
||||
|
||||
Obj oc = formElement.getXObject().findObj("OC");
|
||||
Obj group = formElement.getXObject().findObj("Group");
|
||||
boolean transparency = false;
|
||||
if (group != null) {
|
||||
Obj groupSubType = group.findObj("S");
|
||||
if (groupSubType != null && groupSubType.isName() && groupSubType.getName().equals("Transparency")) {
|
||||
transparency = true;
|
||||
}
|
||||
}
|
||||
|
||||
formStack.push(new Form(formElement.getXObject().getObjNum(), oc != null, transparency));
|
||||
}
|
||||
|
||||
|
||||
public void leaveMarkedContent() {
|
||||
|
||||
stack.pop();
|
||||
}
|
||||
|
||||
|
||||
public String currentMarkedContent() {
|
||||
|
||||
if (stack.isEmpty()) {
|
||||
return "";
|
||||
}
|
||||
return stack.peek().name();
|
||||
}
|
||||
|
||||
|
||||
public boolean currentMarkedContentContains(String name) {
|
||||
|
||||
Iterator<MarkedContent> markedContentIterator = stack.descendingIterator();
|
||||
while (markedContentIterator.hasNext()) {
|
||||
var markedContent = markedContentIterator.next();
|
||||
if (markedContent.name().equals(name)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
public boolean currentMarkedContentContainsAny(Set<String> names) {
|
||||
|
||||
if (stack.isEmpty()) {
|
||||
return false;
|
||||
}
|
||||
Iterator<MarkedContent> markedContentIterator = stack.descendingIterator();
|
||||
while (markedContentIterator.hasNext()) {
|
||||
var markedContent = markedContentIterator.next();
|
||||
if (names.contains(markedContent.name())) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
public void clear() {
|
||||
|
||||
stack.clear();
|
||||
}
|
||||
|
||||
|
||||
public boolean contextHasTransparency() {
|
||||
|
||||
return formStack.stream()
|
||||
.anyMatch(form -> form.optionalContent || form.transparency) //
|
||||
|| stack.stream()
|
||||
.anyMatch(MarkedContent::optionalContent);
|
||||
}
|
||||
|
||||
|
||||
public void leaveForm() {
|
||||
|
||||
formStack.pop();
|
||||
|
||||
}
|
||||
|
||||
|
||||
private record MarkedContent(String name, boolean optionalContent) {
|
||||
|
||||
}
|
||||
|
||||
private record Form(long ref, boolean optionalContent, boolean transparency) {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -5,6 +5,7 @@ import com.pdftron.pdf.*;
|
||||
import com.pdftron.pdf.ocg.Group;
|
||||
import com.pdftron.pdf.ocg.OCMD;
|
||||
import com.pdftron.sdf.Obj;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.UtilityClass;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
@ -27,8 +28,9 @@ public class OCGWatermarkRemovalService {
|
||||
|
||||
@SneakyThrows
|
||||
private boolean hasOCGWatermarks(PDFDoc pdfDoc) {
|
||||
|
||||
Obj ocgs = pdfDoc.getOCGs();
|
||||
if(ocgs != null) {
|
||||
if (ocgs != null) {
|
||||
for (int i = 0; i < ocgs.size(); i++) {
|
||||
Group group = new Group(ocgs.getAt(i));
|
||||
if (group.isValid() && group.getName().equals("Watermark")) {
|
||||
@ -43,26 +45,21 @@ public class OCGWatermarkRemovalService {
|
||||
@SneakyThrows
|
||||
private void removeOCGWatermarks(PDFDoc pdfDoc) {
|
||||
|
||||
ElementReader reader = new ElementReader();
|
||||
ElementWriter writer = new ElementWriter();
|
||||
Set<Long> visitedXObjIds = new TreeSet<>();
|
||||
try (PageIterator iterator = pdfDoc.getPageIterator(); ElementReader reader = new ElementReader(); ElementWriter writer = new ElementWriter()) {
|
||||
Set<Long> visitedXObjIds = new TreeSet<>();
|
||||
|
||||
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
|
||||
while (iterator.hasNext()) {
|
||||
|
||||
Page page = iterator.next();
|
||||
writeAllElementsExceptWatermarks(page, reader, writer, visitedXObjIds);
|
||||
}
|
||||
|
||||
Page page = iterator.next();
|
||||
writeAllElementsExceptWatermarks(page, reader, writer, visitedXObjIds);
|
||||
}
|
||||
|
||||
reader.destroy();
|
||||
writer.destroy();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void writeAllElementsExceptWatermarks(Page page,
|
||||
ElementReader reader,
|
||||
ElementWriter writer,
|
||||
Set<Long> visitedXObjIds) {
|
||||
private void writeAllElementsExceptWatermarks(Page page, ElementReader reader, ElementWriter writer, Set<Long> visitedXObjIds) {
|
||||
|
||||
reader.begin(page);
|
||||
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
|
||||
@ -72,10 +69,7 @@ public class OCGWatermarkRemovalService {
|
||||
}
|
||||
|
||||
|
||||
private void processElements(Page page,
|
||||
ElementReader reader,
|
||||
ElementWriter writer,
|
||||
Set<Long> visitedXObjIds) throws PDFNetException {
|
||||
private void processElements(Page page, ElementReader reader, ElementWriter writer, Set<Long> visitedXObjIds) throws PDFNetException {
|
||||
|
||||
for (Element element = reader.next(); element != null; element = reader.next()) {
|
||||
|
||||
@ -93,6 +87,7 @@ public class OCGWatermarkRemovalService {
|
||||
|
||||
@SneakyThrows
|
||||
private boolean inOCGWatermark(Element element) {
|
||||
|
||||
var xObj = element.getXObject();
|
||||
if (xObj != null) {
|
||||
Obj oc = xObj.findObj("OC");
|
||||
@ -111,12 +106,7 @@ public class OCGWatermarkRemovalService {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void processForms(Page page,
|
||||
Element element,
|
||||
ElementReader reader,
|
||||
ElementWriter writer,
|
||||
Set<Long> visitedXObjIds) {
|
||||
|
||||
private void processForms(Page page, Element element, ElementReader reader, ElementWriter writer, Set<Long> visitedXObjIds) {
|
||||
|
||||
writer.writeElement(element);
|
||||
|
||||
@ -124,17 +114,17 @@ public class OCGWatermarkRemovalService {
|
||||
visitedXObjIds.add(element.getXObject().getObjNum());
|
||||
// writer needs to be newly initialized when entering a new content stream
|
||||
// see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
|
||||
ElementWriter formWriter = new ElementWriter();
|
||||
reader.formBegin();
|
||||
formWriter.begin(element.getXObject());
|
||||
try (ElementWriter formWriter = new ElementWriter()) {
|
||||
reader.formBegin();
|
||||
formWriter.begin(element.getXObject());
|
||||
|
||||
reader.clearChangeList();
|
||||
formWriter.setDefaultGState(reader);
|
||||
reader.clearChangeList();
|
||||
formWriter.setDefaultGState(reader);
|
||||
|
||||
processElements(page, reader, formWriter, visitedXObjIds);
|
||||
formWriter.end();
|
||||
formWriter.destroy();
|
||||
reader.end();
|
||||
processElements(page, reader, formWriter, visitedXObjIds);
|
||||
formWriter.end();
|
||||
reader.end();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,49 @@
|
||||
package com.iqser.red.pdftronlogic.commons;
|
||||
|
||||
import java.lang.reflect.Field;
|
||||
|
||||
import com.pdftron.pdf.Font;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class PDFNetUtils {
|
||||
|
||||
@SuppressWarnings("PMD")
|
||||
public void requireFontNotClosed(Font font) {
|
||||
|
||||
try {
|
||||
if (font.__GetHandle() == 0L) {
|
||||
throw new AssertionError("Font is already closed!");
|
||||
}
|
||||
Object refHandle = font.__GetRefHandle();
|
||||
|
||||
Class<?> clazz = refHandle.getClass();
|
||||
|
||||
Field implField = null;
|
||||
while (clazz != null) {
|
||||
try {
|
||||
|
||||
implField = clazz.getDeclaredField("impl");
|
||||
implField.setAccessible(true);
|
||||
break;
|
||||
} catch (NoSuchFieldException e) {
|
||||
clazz = clazz.getSuperclass();
|
||||
}
|
||||
}
|
||||
|
||||
if (implField != null) {
|
||||
long implValue = (Long) implField.get(refHandle);
|
||||
|
||||
if (implValue == 0L) {
|
||||
throw new AssertionError("Associated ElementReader of Font is already closed!");
|
||||
}
|
||||
}
|
||||
|
||||
} catch (IllegalAccessException e) {
|
||||
throw new AssertionError("Font Ref is missing the field impl, should never happen!");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
@ -0,0 +1,71 @@
|
||||
package com.iqser.red.pdftronlogic.commons;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.features.ElementFeatureFactory;
|
||||
import com.iqser.red.pdftronlogic.commons.features.ImageFeatures;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.ElementReader;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.sdf.Obj;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class PdfImageExtraction {
|
||||
|
||||
public List<List<ImageFeatures>> extractImages(InputStream fileStream) throws IOException, PDFNetException {
|
||||
|
||||
try (PDFDoc pdfDoc = new PDFDoc(fileStream); ElementReader reader = new ElementReader()) {
|
||||
List<List<ImageFeatures>> imagesPerPage = new ArrayList<>(pdfDoc.getPageCount());
|
||||
|
||||
var iter = pdfDoc.getPageIterator();
|
||||
while (iter.hasNext()) {
|
||||
Page page = iter.next();
|
||||
Set<Long> visitedXObjIds = new HashSet<>();
|
||||
visitedXObjIds.add(page.getSDFObj().getObjNum());
|
||||
List<ImageFeatures> imageFeatures = new LinkedList<>();
|
||||
|
||||
reader.begin(page);
|
||||
|
||||
processElements(reader, imageFeatures, visitedXObjIds);
|
||||
|
||||
reader.end();
|
||||
|
||||
imagesPerPage.add(imageFeatures);
|
||||
}
|
||||
|
||||
return imagesPerPage;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private void processElements(ElementReader reader, List<ImageFeatures> imageFeaturesOnPage, Set<Long> visitedXObjIds) throws PDFNetException {
|
||||
|
||||
for (Element element = reader.next(); element != null; element = reader.next()) {
|
||||
switch (element.getType()) {
|
||||
case Element.e_image, Element.e_inline_image -> imageFeaturesOnPage.add(ElementFeatureFactory.buildImage(element));
|
||||
case Element.e_form -> {
|
||||
Obj formObj = element.getXObject();
|
||||
if (!visitedXObjIds.contains(formObj.getObjNum())) {
|
||||
visitedXObjIds.add(formObj.getObjNum());
|
||||
reader.formBegin();
|
||||
processElements(reader, imageFeaturesOnPage, visitedXObjIds);
|
||||
reader.end();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -3,42 +3,102 @@ package com.iqser.red.pdftronlogic.commons;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.features.ElementFeatureFactory;
|
||||
import com.iqser.red.pdftronlogic.commons.features.TextFeatures;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.ElementReader;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.pdf.PageIterator;
|
||||
import com.pdftron.pdf.TextExtractor;
|
||||
import com.pdftron.sdf.Obj;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class PdfTextExtraction {
|
||||
|
||||
private static String execute(PDFDoc pdfDoc) throws IOException, PDFNetException{
|
||||
TextExtractor extractor = new TextExtractor();
|
||||
List<String> texts = new ArrayList<>();
|
||||
private static String execute(PDFDoc pdfDoc) throws PDFNetException {
|
||||
|
||||
PageIterator iterator = pdfDoc.getPageIterator();
|
||||
while (iterator.hasNext()) {
|
||||
Page page = iterator.next();
|
||||
extractor.begin(page);
|
||||
texts.add(extractor.getAsText());
|
||||
try (PageIterator iterator = pdfDoc.getPageIterator(); TextExtractor extractor = new TextExtractor()) {
|
||||
List<String> texts = new ArrayList<>();
|
||||
|
||||
while (iterator.hasNext()) {
|
||||
Page page = iterator.next();
|
||||
extractor.begin(page);
|
||||
texts.add(extractor.getAsText());
|
||||
}
|
||||
|
||||
pdfDoc.close();
|
||||
return String.join("\n", texts);
|
||||
}
|
||||
|
||||
extractor.destroy();
|
||||
pdfDoc.close();
|
||||
return String.join("\n", texts);
|
||||
}
|
||||
|
||||
|
||||
public static String extractAllTextFromDocument(InputStream fileStream) throws IOException, PDFNetException {
|
||||
|
||||
PDFDoc pdfDoc = new PDFDoc(fileStream);
|
||||
return execute(pdfDoc);
|
||||
}
|
||||
|
||||
|
||||
/**
 * Returns the text of all pages of an already opened document, joined by newlines.
 * <p>
 * NOTE: this delegates to {@code execute}, which calls {@code pdfDoc.close()} once extraction
 * has finished, so the passed-in document is closed when this method returns normally.
 *
 * @param pdfDoc the document to read
 * @return the text of all pages
 */
public static String extractAllTextFromDocument(PDFDoc pdfDoc) throws IOException, PDFNetException {
|
||||
|
||||
return execute(pdfDoc);
|
||||
}
|
||||
|
||||
|
||||
public static List<List<TextFeatures>> extractAllGlyphsFromDocument(InputStream fileStream, boolean includePathData) throws IOException, PDFNetException {
|
||||
|
||||
try (PDFDoc pdfDoc = new PDFDoc(fileStream); ElementReader reader = new ElementReader()) {
|
||||
List<List<TextFeatures>> glyphsPerPages = new ArrayList<>(pdfDoc.getPageCount());
|
||||
|
||||
var iter = pdfDoc.getPageIterator();
|
||||
while (iter.hasNext()) {
|
||||
Page page = iter.next();
|
||||
Set<Long> visitedXObjIds = new HashSet<>();
|
||||
visitedXObjIds.add(page.getSDFObj().getObjNum());
|
||||
List<TextFeatures> textFeatures = new LinkedList<>();
|
||||
|
||||
reader.begin(page);
|
||||
|
||||
processElements(reader, textFeatures, visitedXObjIds, includePathData);
|
||||
|
||||
reader.end();
|
||||
|
||||
glyphsPerPages.add(textFeatures);
|
||||
}
|
||||
|
||||
return glyphsPerPages;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static void processElements(ElementReader reader, List<TextFeatures> textFeaturesOnPage, Set<Long> visitedXObjIds, boolean includePathData) throws PDFNetException {
|
||||
|
||||
for (Element element = reader.next(); element != null; element = reader.next()) {
|
||||
switch (element.getType()) {
|
||||
case Element.e_text -> textFeaturesOnPage.add(ElementFeatureFactory.buildText(element, includePathData, includePathData));
|
||||
case Element.e_form -> {
|
||||
Obj formObj = element.getXObject();
|
||||
|
||||
if (!visitedXObjIds.contains(formObj.getObjNum())) {
|
||||
visitedXObjIds.add(formObj.getObjNum());
|
||||
reader.formBegin();
|
||||
processElements(reader, textFeaturesOnPage, visitedXObjIds, includePathData);
|
||||
reader.end();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,91 @@
|
||||
package com.iqser.red.pdftronlogic.commons;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
|
||||
import com.iqser.red.pdftronlogic.commons.features.GlyphInfo;
|
||||
import com.iqser.red.pdftronlogic.commons.features.TextFeatures;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.ColorPt;
|
||||
import com.pdftron.pdf.ColorSpace;
|
||||
import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.ElementBuilder;
|
||||
import com.pdftron.pdf.ElementWriter;
|
||||
import com.pdftron.pdf.PathData;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class VisualizationUtils {
|
||||
|
||||
@SneakyThrows
|
||||
public static void drawFeature(ElementWriter writer, ElementFeatures features, Color color) {
|
||||
|
||||
try (ElementBuilder builder = new ElementBuilder()) {
|
||||
|
||||
if (features instanceof TextFeatures textFeatures) {
|
||||
for (GlyphInfo glyph : textFeatures.getGlyphs()) {
|
||||
if (glyph.getPathData().isPresent()) {
|
||||
drawPathData(glyph.getPathData().get(), builder, writer, color);
|
||||
}
|
||||
}
|
||||
}
|
||||
drawRect(features.getBoundingBox(), builder, writer, color);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
public static void drawPathData(PathData pathData, ElementBuilder builder, ElementWriter writer, Color color) throws PDFNetException {
|
||||
|
||||
Element path = builder.createPath(pathData.getPoints(), pathData.getOperators());
|
||||
path.setPathFill(true);
|
||||
path.setPathStroke(false);
|
||||
path.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||
float[] comp = color.getColorComponents(null);
|
||||
try (ColorPt colorPt = new ColorPt(comp[0], comp[1], comp[2])) {
|
||||
path.getGState().setFillColor(colorPt);
|
||||
}
|
||||
path.setWindingFill(true);
|
||||
writer.writeElement(path);
|
||||
}
|
||||
|
||||
|
||||
public static void drawRect(Rectangle2D rectangle2D, ElementBuilder builder, ElementWriter writer, Color color) throws PDFNetException {
|
||||
|
||||
drawRect(rectangle2D, builder, writer, color, false);
|
||||
}
|
||||
|
||||
|
||||
public static void drawRect(Rectangle2D rectangle2D, ElementBuilder builder, ElementWriter writer, Color color, boolean fill) throws PDFNetException {
|
||||
|
||||
Element rect = builder.createRect(rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight());
|
||||
rect.setPathFill(false);
|
||||
rect.setPathStroke(true);
|
||||
rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
||||
float[] comp = fill ? Color.BLACK.getColorComponents(null) : color.getColorComponents(null);
|
||||
try (ColorPt colorPt = new ColorPt(comp[0], comp[1], comp[2])) {
|
||||
rect.getGState().setStrokeColor(colorPt);
|
||||
}
|
||||
double lineWidth = fill ? 0.1 : 0.5;
|
||||
rect.getGState().setLineWidth(lineWidth);
|
||||
writer.writeElement(rect);
|
||||
|
||||
if (fill) {
|
||||
Element filledRect = builder.createRect(rectangle2D.getX() + lineWidth,
|
||||
rectangle2D.getY() + lineWidth,
|
||||
rectangle2D.getWidth() - 2 * lineWidth,
|
||||
rectangle2D.getHeight() - 2 * lineWidth);
|
||||
filledRect.setPathFill(true);
|
||||
filledRect.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||
float[] comp2 = color.getColorComponents(null);
|
||||
try (ColorPt colorPt = new ColorPt(comp2[0], comp2[1], comp2[2])) {
|
||||
filledRect.getGState().setFillColor(colorPt);
|
||||
}
|
||||
writer.writeElement(filledRect);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -10,6 +10,8 @@ import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.features.ElementFeatureFactory;
|
||||
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.ElementReader;
|
||||
@ -93,27 +95,25 @@ public class WatermarkRemovalService {
|
||||
Map<Long, List<ElementFeatures>> formObjectsAndImagesForPages = new HashMap<>();
|
||||
Set<Long> visitedXObjIds = new TreeSet<>();
|
||||
|
||||
ElementReader reader = new ElementReader();
|
||||
try (ElementReader reader = new ElementReader(); PageIterator iterator = pdfDoc.getPageIterator()) {
|
||||
while (iterator.hasNext()) {
|
||||
|
||||
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
|
||||
Page page = iterator.next();
|
||||
|
||||
Page page = iterator.next();
|
||||
double minAreaCoveringFromPage = AREA_THRESHOLD * page.getPageHeight() * page.getPageWidth();
|
||||
|
||||
double minAreaCoveringFromPage = AREA_THRESHOLD * page.getPageHeight() * page.getPageWidth();
|
||||
LinkedList<ElementFeatures> elementFeaturesLinkedList = new LinkedList<>();
|
||||
|
||||
LinkedList<ElementFeatures> elementFeaturesLinkedList = new LinkedList<>();
|
||||
reader.begin(page);
|
||||
for (Element element = reader.next(); element != null; element = reader.next()) {
|
||||
processElement(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringFromPage, page);
|
||||
}
|
||||
|
||||
reader.begin(page);
|
||||
for (Element element = reader.next(); element != null; element = reader.next()) {
|
||||
processElement(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringFromPage, page);
|
||||
formObjectsAndImagesForPages.put(page.getSDFObj().getObjNum(), elementFeaturesLinkedList);
|
||||
}
|
||||
|
||||
formObjectsAndImagesForPages.put(page.getSDFObj().getObjNum(), elementFeaturesLinkedList);
|
||||
return formObjectsAndImagesForPages;
|
||||
}
|
||||
|
||||
reader.destroy();
|
||||
|
||||
return formObjectsAndImagesForPages;
|
||||
}
|
||||
|
||||
|
||||
@ -124,14 +124,16 @@ public class WatermarkRemovalService {
|
||||
double minAreaCoveringPage,
|
||||
Page page) throws PDFNetException {
|
||||
|
||||
if (element.getBBox() == null) {
|
||||
return;
|
||||
}
|
||||
try (var bbox = element.getBBox()) {
|
||||
if (bbox == null) {
|
||||
return;
|
||||
}
|
||||
|
||||
switch (element.getType()) {
|
||||
case Element.e_form -> processXObject(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage, page);
|
||||
case Element.e_image, Element.e_inline_image -> processImages(element, elementFeaturesLinkedList, page, minAreaCoveringPage);
|
||||
case Element.e_text -> processText(element, elementFeaturesLinkedList, page);
|
||||
switch (element.getType()) {
|
||||
case Element.e_form -> processXObject(element, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage, page);
|
||||
case Element.e_image, Element.e_inline_image -> processImages(element, elementFeaturesLinkedList, page, minAreaCoveringPage);
|
||||
case Element.e_text -> processText(element, elementFeaturesLinkedList, page);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -148,11 +150,13 @@ public class WatermarkRemovalService {
|
||||
return;
|
||||
}
|
||||
|
||||
boolean isBigEnough = Math.abs(element.getBBox().getY1() - element.getBBox().getY2()) > page.getPageHeight() * MIN_TEXTWATERMARK_HEIGHT_THRESHOLD;
|
||||
try (var bbox = element.getBBox()) {
|
||||
boolean isBigEnough = Math.abs(bbox.getY1() - bbox.getY2()) > page.getPageHeight() * MIN_TEXTWATERMARK_HEIGHT_THRESHOLD;
|
||||
|
||||
if (isBigEnough) {
|
||||
ElementFeatures elementFeatures = ElementFeatureFactory.extractFeatures(element);
|
||||
elementFeaturesLinkedList.add(elementFeatures);
|
||||
if (isBigEnough) {
|
||||
ElementFeatures elementFeatures = ElementFeatureFactory.extractFeatures(element);
|
||||
elementFeaturesLinkedList.add(elementFeatures);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -161,8 +165,9 @@ public class WatermarkRemovalService {
|
||||
@SneakyThrows
|
||||
private boolean isTextRotated(Element element) {
|
||||
|
||||
return Math.abs(element.getCTM().getB()) < Math.sin(Math.toRadians(ROTATED_TEXT_THRESHOLD)) || Math.abs(element.getCTM()
|
||||
.getB()) > Math.sin(Math.toRadians(70 - ROTATED_TEXT_THRESHOLD));
|
||||
try (var ctm = element.getCTM()) {
|
||||
return Math.abs(ctm.getB()) < Math.sin(Math.toRadians(ROTATED_TEXT_THRESHOLD)) || Math.abs(ctm.getB()) > Math.sin(Math.toRadians(70 - ROTATED_TEXT_THRESHOLD));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -186,13 +191,15 @@ public class WatermarkRemovalService {
|
||||
if (element.getXObject() == null) {
|
||||
return;
|
||||
}
|
||||
if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringPage && isLocatedNearBorder(element, page)) {
|
||||
return;
|
||||
}
|
||||
try (var bbox = element.getBBox()) {
|
||||
if (bbox.getHeight() * bbox.getWidth() < minAreaCoveringPage && isLocatedNearBorder(element, page)) {
|
||||
return;
|
||||
}
|
||||
|
||||
String hashOfImage = ImageHashFactory.calculate(element);
|
||||
ElementFeatures elementFeatures = ElementFeatureFactory.extractFeaturesWithHash(element, hashOfImage);
|
||||
elementFeaturesLinkedList.add(elementFeatures);
|
||||
String hashOfImage = ImageHashFactory.calculate(element);
|
||||
ElementFeatures elementFeatures = ElementFeatureFactory.buildImageWithHash(element, hashOfImage);
|
||||
elementFeaturesLinkedList.add(elementFeatures);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -200,10 +207,12 @@ public class WatermarkRemovalService {
|
||||
@SneakyThrows
|
||||
private boolean isLocatedNearBorder(Element element, Page page) {
|
||||
|
||||
return element.getBBox().getY1() < page.getVisibleContentBox().getY1() + page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD || element.getBBox()
|
||||
.getY2() > page.getVisibleContentBox().getY2() - page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD || element.getBBox().getX1() < page.getVisibleContentBox()
|
||||
.getX1() + page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD || element.getBBox().getX2() > page.getVisibleContentBox()
|
||||
.getX2() - page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD;
|
||||
try (var bbox = element.getBBox(); var contentBox = page.getVisibleContentBox();) {
|
||||
return bbox.getY1() < contentBox.getY1() + page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD
|
||||
|| bbox.getY2() > contentBox.getY2() - page.getPageHeight() * IMAGE_POSITION_HEIGHT_THRESHOLD
|
||||
|| bbox.getX1() < contentBox.getX1() + page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD
|
||||
|| bbox.getX2() > contentBox.getX2() - page.getPageWidth() * IMAGE_POSITION_WIDTH_THRESHOLD;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -215,18 +224,20 @@ public class WatermarkRemovalService {
|
||||
double minAreaCoveringPage,
|
||||
Page page) {
|
||||
|
||||
if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringPage) {
|
||||
return;
|
||||
try (var bbox = element.getBBox()) {
|
||||
if (bbox.getHeight() * bbox.getWidth() < minAreaCoveringPage) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (visitedXObjIds.add(element.getXObject().getObjNum())) {
|
||||
ElementReader xObjectReader = new ElementReader();
|
||||
xObjectReader.begin(element.getXObject());
|
||||
for (Element element1 = xObjectReader.next(); element1 != null; element1 = xObjectReader.next()) {
|
||||
processElement(element1, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage, page);
|
||||
try (ElementReader xObjectReader = new ElementReader()) {
|
||||
xObjectReader.begin(element.getXObject());
|
||||
for (Element element1 = xObjectReader.next(); element1 != null; element1 = xObjectReader.next()) {
|
||||
processElement(element1, visitedXObjIds, elementFeaturesLinkedList, formObjectsOccuringMoreThanOnceOnAPage, minAreaCoveringPage, page);
|
||||
}
|
||||
elementFeaturesLinkedList.add(ElementFeatureFactory.extractFeatures(element));
|
||||
}
|
||||
elementFeaturesLinkedList.add(ElementFeatureFactory.extractFeatures(element));
|
||||
xObjectReader.destroy();
|
||||
} else {
|
||||
elementFeaturesLinkedList.add(ElementFeatureFactory.extractFeatures(element));
|
||||
}
|
||||
@ -245,10 +256,12 @@ public class WatermarkRemovalService {
|
||||
.stream()
|
||||
.flatMap(Collection::stream)
|
||||
.filter(elementFeature -> formObjectsPerPage.values()
|
||||
.stream()
|
||||
.filter(elementFeaturesOnPage -> elementFeaturesOnPage.stream()
|
||||
.anyMatch(elementFeature.getElementType() == Element.e_image || elementFeature.getElementType() == Element.e_inline_image ? elementFeature::isSimilarTo : elementFeature::almostMatches))
|
||||
.count() >= minPagesFilter)
|
||||
.stream()
|
||||
.filter(elementFeaturesOnPage -> elementFeaturesOnPage.stream()
|
||||
.anyMatch(elementFeature.getElementType() == Element.e_image
|
||||
|| elementFeature.getElementType()
|
||||
== Element.e_inline_image ? elementFeature::similar : elementFeature::matches))
|
||||
.count() >= minPagesFilter)
|
||||
.toList();
|
||||
}
|
||||
|
||||
@ -256,21 +269,16 @@ public class WatermarkRemovalService {
|
||||
@SneakyThrows
|
||||
private void removeAllWatermarks(PDFDoc pdfDoc, List<ElementFeatures> watermarksElementFeaturesList) {
|
||||
|
||||
ElementReader reader = new ElementReader();
|
||||
ElementWriter writer = new ElementWriter();
|
||||
Set<Long> visitedXObjIds = new TreeSet<>();
|
||||
try (PageIterator iterator = pdfDoc.getPageIterator(); ElementReader reader = new ElementReader(); ElementWriter writer = new ElementWriter()) {
|
||||
Set<Long> visitedXObjIds = new TreeSet<>();
|
||||
|
||||
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
|
||||
while (iterator.hasNext()) {
|
||||
|
||||
Page page = iterator.next();
|
||||
|
||||
writeAllElementsExceptWatermarks(page, reader, writer, watermarksElementFeaturesList, visitedXObjIds);
|
||||
Page page = iterator.next();
|
||||
|
||||
writeAllElementsExceptWatermarks(page, reader, writer, watermarksElementFeaturesList, visitedXObjIds);
|
||||
}
|
||||
}
|
||||
|
||||
reader.destroy();
|
||||
writer.destroy();
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -300,16 +308,21 @@ public class WatermarkRemovalService {
|
||||
|
||||
switch (element.getType()) {
|
||||
case Element.e_image, Element.e_inline_image -> {
|
||||
if (element.getBBox() == null) {
|
||||
writer.writeElement(element);
|
||||
continue;
|
||||
try (var bbox = element.getBBox()) {
|
||||
if (bbox == null) {
|
||||
writer.writeElement(element);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (element.getBBox().getHeight() * element.getBBox().getWidth() < minAreaCoveringFromPage && isLocatedNearBorder(element, page) && element.getBBox()
|
||||
.getHeight() * element.getBBox().getWidth() < minAreaCoveringFromPage || element.getXObject() == null) {
|
||||
try (var bbox = element.getBBox()) {
|
||||
if (bbox.getHeight() * bbox.getWidth() < minAreaCoveringFromPage
|
||||
&& isLocatedNearBorder(element, page)
|
||||
&& bbox.getHeight() * bbox.getWidth() < minAreaCoveringFromPage || element.getXObject() == null) {
|
||||
|
||||
writer.writeElement(element);
|
||||
continue;
|
||||
writer.writeElement(element);
|
||||
continue;
|
||||
|
||||
}
|
||||
}
|
||||
removeImages(element, writer, watermarksElementFeaturesList);
|
||||
}
|
||||
@ -330,7 +343,7 @@ public class WatermarkRemovalService {
|
||||
}
|
||||
|
||||
for (ElementFeatures elementFeatures : watermarksElementFeaturesList) {
|
||||
if (elementFeatures.almostMatches(element)) {
|
||||
if (elementFeatures.matches(ElementFeatureFactory.extractFeatures(element))) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
@ -349,8 +362,10 @@ public class WatermarkRemovalService {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (Math.max(element.getBBox().getY1(), element.getBBox().getY2()) < page.getVisibleContentBox().getY1() + page.getPageHeight() * TEXT_POSITION_THRESHOLD) {
|
||||
return false;
|
||||
try (var bbox = element.getBBox(); var contents = page.getVisibleContentBox();) {
|
||||
if (Math.max(bbox.getY1(), bbox.getY2()) < contents.getY1() + page.getPageHeight() * TEXT_POSITION_THRESHOLD) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
@ -360,9 +375,9 @@ public class WatermarkRemovalService {
|
||||
private void removeImages(Element element, ElementWriter writer, List<ElementFeatures> watermarksElementFeaturesList) {
|
||||
|
||||
String hashValueOfImage = ImageHashFactory.calculate(element);
|
||||
ElementFeatures imageFeatures = ElementFeatureFactory.extractFeaturesWithHash(element, hashValueOfImage);
|
||||
ElementFeatures imageFeatures = ElementFeatureFactory.buildImageWithHash(element, hashValueOfImage);
|
||||
for (ElementFeatures elementFeatures : watermarksElementFeaturesList) {
|
||||
if (elementFeatures.isSimilarTo(imageFeatures)) {
|
||||
if (elementFeatures.similar(imageFeatures)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
@ -379,7 +394,7 @@ public class WatermarkRemovalService {
|
||||
Set<Long> visitedXObjIds) throws PDFNetException {
|
||||
|
||||
for (ElementFeatures elementFeatures : watermarksElementFeaturesList) {
|
||||
if (elementFeatures.almostMatches(element)) {
|
||||
if (elementFeatures.matches(ElementFeatureFactory.extractFeatures(element))) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
@ -390,17 +405,17 @@ public class WatermarkRemovalService {
|
||||
visitedXObjIds.add(element.getXObject().getObjNum());
|
||||
// writer needs to be newly initialized when entering a new content stream
|
||||
// see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
|
||||
ElementWriter formWriter = new ElementWriter();
|
||||
reader.formBegin();
|
||||
formWriter.begin(element.getXObject());
|
||||
try (ElementWriter formWriter = new ElementWriter()) {
|
||||
reader.formBegin();
|
||||
formWriter.begin(element.getXObject());
|
||||
|
||||
reader.clearChangeList();
|
||||
formWriter.setDefaultGState(reader);
|
||||
reader.clearChangeList();
|
||||
formWriter.setDefaultGState(reader);
|
||||
|
||||
processElements(page, reader, formWriter, watermarksElementFeaturesList, visitedXObjIds);
|
||||
formWriter.end();
|
||||
formWriter.destroy();
|
||||
reader.end();
|
||||
processElements(page, reader, formWriter, watermarksElementFeaturesList, visitedXObjIds);
|
||||
formWriter.end();
|
||||
reader.end();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,197 @@
|
||||
package com.iqser.red.pdftronlogic.commons.features;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.Converter;
|
||||
import com.pdftron.common.Matrix2D;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.CharData;
|
||||
import com.pdftron.pdf.CharIterator;
|
||||
import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.Font;
|
||||
import com.pdftron.pdf.GState;
|
||||
import com.pdftron.pdf.Image;
|
||||
import com.pdftron.sdf.Obj;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class ElementFeatureFactory {
|
||||
|
||||
public ElementFeatures extractFeatures(Element element) throws PDFNetException {
|
||||
|
||||
return switch (element.getType()) {
|
||||
case Element.e_path -> buildPath(element);
|
||||
case Element.e_text -> buildText(element, false, false);
|
||||
case Element.e_image, Element.e_inline_image -> buildImage(element);
|
||||
case Element.e_form -> buildForm(element);
|
||||
// This technically should never happen, it's a safetynet
|
||||
default -> throw new RuntimeException("Feature Extraction is not supported for PDFTron.Element with type: " + element.getType());
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
public ImageFeatures buildImageWithHash(Element element, String hashObject) throws PDFNetException {
|
||||
|
||||
return buildImageBase(element).hashOfImage(hashObject).build();
|
||||
}
|
||||
|
||||
|
||||
public ImageFeatures buildImage(Element element) throws PDFNetException {
|
||||
|
||||
return buildImageBase(element).build();
|
||||
}
|
||||
|
||||
|
||||
public FormFeatures buildForm(Element element) throws PDFNetException {
|
||||
|
||||
try (var bbox = element.getBBox();) {
|
||||
return FormFeatures.builder()
|
||||
.elementType(element.getType())
|
||||
.boundingBox(Converter.toRectangle2D(bbox))
|
||||
.xObjectType(element.getXObject().getType())
|
||||
.dictOrArrayOrStreamLength(element.getXObject().getType() == 7 ? element.getXObject().getDecodedStream().size() : 0)
|
||||
.build();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private ImageFeatures.ImageFeaturesBuilder<?, ?> buildImageBase(Element element) throws PDFNetException {
|
||||
|
||||
assert element.getType() == Element.e_image || element.getType() == Element.e_inline_image;
|
||||
try (var bbox = element.getBBox();) {
|
||||
boolean transparent = element.getGState().getBlendMode() != GState.e_bl_normal
|
||||
|| element.getGState().getFillOpacity() > 1
|
||||
|| element.getGState().getStrokeOpacity() > 1;
|
||||
|
||||
// see spec: 8.9.6.3 Explicit masking
|
||||
boolean masked = false;
|
||||
if (element.getType() == Element.e_image) {
|
||||
Image image = new Image(element.getXObject());
|
||||
if (image.getMask() != null && image.getMask().getType() == Obj.e_stream) {
|
||||
Image imageMask = new Image(image.getMask());
|
||||
masked = imageMask.isImageMask();
|
||||
}
|
||||
}
|
||||
return ImageFeatures.builder()
|
||||
.elementType(element.getType())
|
||||
.boundingBox(Converter.toRectangle2D(bbox))
|
||||
.dataSize(element.getImageDataSize())
|
||||
.height(element.getImageHeight())
|
||||
.width(element.getImageWidth())
|
||||
.renderingIntent(element.getImageRenderingIntent())
|
||||
.componentNum(element.getComponentNum())
|
||||
.bitsPerComponent(element.getBitsPerComponent())
|
||||
.imageMask(element.isImageMask())
|
||||
.softMask(element.getGState().getSoftMask() != null)
|
||||
.masked(masked)
|
||||
.transparent(transparent);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Use includeGlyphs = true and preComputePathData = true, when trying to draw the glyphs, see GlyphExtractionTest
|
||||
precomputePathData = true is needed, when trying to access the PathData after the PDFDoc/ElementReader has been closed
|
||||
*/
|
||||
public TextFeatures buildText(Element element, boolean includeGlyphs, boolean preComputePathData) throws PDFNetException {
|
||||
|
||||
try (var bbox = element.getBBox()) {
|
||||
|
||||
TextFeatures.TextFeaturesBuilder<?, ?> simpleTextFeatures = TextFeatures.builder()
|
||||
.elementType(element.getType())
|
||||
.boundingBox(Converter.toRectangle2D(bbox))
|
||||
.text(element.getTextString())
|
||||
.font(element.getGState().getFont().getType())
|
||||
.fontsize(element.getGState().getFontSize());
|
||||
|
||||
if (includeGlyphs) {
|
||||
simpleTextFeatures.glyphs(extractGlyphInfo(element, preComputePathData));
|
||||
}
|
||||
return simpleTextFeatures.build();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public PathFeatures buildPath(Element element) throws PDFNetException {
|
||||
|
||||
try (var bbox = element.getBBox(); var ctm = element.getCTM(); var fillColor = element.getGState().getFillColor(); var strokeColor = element.getGState().getStrokeColor()) {
|
||||
return PathFeatures.builder()
|
||||
.elementType(element.getType())
|
||||
.boundingBox(Converter.toRectangle2D(bbox))
|
||||
.clippingPath(element.isClippingPath())
|
||||
.clipWindingFill(element.isClipWindingFill())
|
||||
.stroked(element.isStroked())
|
||||
.filled(element.isFilled())
|
||||
.windingFill(element.isWindingFill())
|
||||
.fillColor(Converter.convertColor(element.getGState().getFillColorSpace(), fillColor))
|
||||
.strokeColor(Converter.convertColor(element.getGState().getStrokeColorSpace(), strokeColor))
|
||||
.linePath(Converter.convertToGeneralPathAndTransformToInitialUserSpace(element.getPathData(), ctm))
|
||||
.build();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private List<GlyphInfo> extractGlyphInfo(Element textElement, boolean precomputePathData) {
|
||||
|
||||
assert textElement != null && textElement.getType() == Element.e_text;
|
||||
|
||||
if (textElement.getBBox() == null) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
Font font = textElement.getGState().getFont();
|
||||
|
||||
if (font.getType() == Font.e_Type3) {
|
||||
// type 3 fonts seem to be much more difficult, one must use font.getType3GlyphStream and font.getType3FontMatrix instead
|
||||
// couldn't find much information except this post https://groups.google.com/g/pdfnet-sdk/c/SvhMflbtQho
|
||||
// will implement this when necessary
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
List<GlyphInfo> glyphs = new ArrayList<>();
|
||||
short unitsPerEm = font.getUnitsPerEm();
|
||||
|
||||
try (CharIterator charIterator = textElement.getCharIterator(); Matrix2D ctm = textElement.getCTM().multiply(textElement.getTextMatrix());) {
|
||||
|
||||
while (charIterator.hasNext()) {
|
||||
CharData charData = charIterator.next();
|
||||
long charCode = charData.getCharCode();
|
||||
|
||||
try (Matrix2D fontMatrix = computeFontMatrix(charData, textElement, unitsPerEm)) {
|
||||
|
||||
GlyphInfo glyph = GlyphInfo.builder() //
|
||||
.charCode(charCode) //
|
||||
.cachePathData(precomputePathData) //
|
||||
.glyphMatrix(ctm.multiply(fontMatrix)) //
|
||||
.font(font) //
|
||||
.build();
|
||||
|
||||
glyphs.add(glyph);
|
||||
|
||||
if (precomputePathData) {
|
||||
// call the functions once to cache all data
|
||||
glyph.getBoundingBox();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return glyphs;
|
||||
|
||||
}
|
||||
|
||||
|
||||
private Matrix2D computeFontMatrix(CharData charData, Element textElement, short unitsPerEm) throws PDFNetException {
|
||||
|
||||
double yScaleFactor = textElement.getGState().getFontSize() / unitsPerEm;
|
||||
double xScaleFactor = (textElement.getGState().getHorizontalScale() / 100) * yScaleFactor;
|
||||
|
||||
return new Matrix2D(xScaleFactor, 0, 0, -yScaleFactor, charData.getGlyphX(), charData.getGlyphY());
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,102 @@
|
||||
package com.iqser.red.pdftronlogic.commons.features;
|
||||
|
||||
import static com.iqser.red.pdftronlogic.commons.ComparisonUtils.almostEqual;
|
||||
|
||||
import java.awt.Shape;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.ComparisonUtils;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
@Getter
|
||||
@SuperBuilder
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class ElementFeatures {
|
||||
|
||||
final private static double RECT_POSITION_SIMILARITY_THRESHOLD_FACTOR = 0.2; // specify how much the x and y value are allowed to differ
|
||||
final private static double RECT_SIZE_SIMILARITY_THRESHOLD_FACTOR = 0.1; // the scale the images are allowed to differ
|
||||
|
||||
int elementType;
|
||||
Rectangle2D boundingBox;
|
||||
|
||||
|
||||
public boolean matches(ElementFeatures elementFeatures) {
|
||||
|
||||
return elementFeatures.getElementType() == elementType && elementFeatures.getBoundingBox() != null && bboxMatches(elementFeatures.getBoundingBox());
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
protected boolean bboxMatches(Rectangle2D bBox) {
|
||||
// To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance
|
||||
|
||||
return almostEqual(bBox.getX(), boundingBox.getX()) && //
|
||||
almostEqual(bBox.getY(), boundingBox.getY()) && //
|
||||
almostEqual(bBox.getWidth(), boundingBox.getWidth()) && //
|
||||
almostEqual(bBox.getHeight(), boundingBox.getHeight());
|
||||
}
|
||||
|
||||
|
||||
public Shape getOverlapShape() {
|
||||
|
||||
return boundingBox;
|
||||
}
|
||||
|
||||
|
||||
public boolean similar(ElementFeatures elementFeatures) {
|
||||
|
||||
return elementFeatures.getElementType() == elementType && elementFeatures.getBoundingBox() != null && areRectsSimilar(elementFeatures.getBoundingBox());
|
||||
}
|
||||
|
||||
|
||||
protected boolean areRectsSimilar(Rectangle2D rectangle2D) {
|
||||
// To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance
|
||||
|
||||
return isPositionSimilar(rectangle2D.getX(), boundingBox.getX(), rectangle2D.getWidth()) && //
|
||||
isPositionSimilar(rectangle2D.getY(), boundingBox.getY(), rectangle2D.getHeight()) && //
|
||||
isSizeSimilar(rectangle2D.getWidth(), boundingBox.getWidth()) && //
|
||||
isSizeSimilar(rectangle2D.getHeight(), boundingBox.getHeight());
|
||||
}
|
||||
|
||||
|
||||
protected boolean isPositionSimilar(double a, double b, double boxSize) {
|
||||
|
||||
return Math.abs(a - b) < boxSize * RECT_POSITION_SIMILARITY_THRESHOLD_FACTOR;
|
||||
}
|
||||
|
||||
|
||||
protected boolean isSizeSimilar(double a, double b) {
|
||||
|
||||
return Math.abs(a - b) < a * RECT_SIZE_SIMILARITY_THRESHOLD_FACTOR;
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(ElementFeatures features) {
|
||||
|
||||
return features.containedBy(this);
|
||||
}
|
||||
|
||||
|
||||
public boolean testOverlapped(ElementFeatures overlappingElement) {
|
||||
|
||||
return containedBy(overlappingElement);
|
||||
}
|
||||
|
||||
|
||||
private boolean containedBy(ElementFeatures features) {
|
||||
|
||||
Shape overlapShape = features.getOverlapShape();
|
||||
return overlapShape.contains(ComparisonUtils.shrinkRectangle(boundingBox));
|
||||
}
|
||||
|
||||
|
||||
public void destroy() {
|
||||
// do nothing, except for text
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,45 @@
|
||||
package com.iqser.red.pdftronlogic.commons.features;
|
||||
|
||||
import static com.iqser.red.pdftronlogic.commons.ComparisonUtils.almostEqual;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Getter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@Getter
|
||||
@SuperBuilder
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class FormFeatures extends ElementFeatures {
|
||||
|
||||
int xObjectType;
|
||||
long dictOrArrayOrStreamLength;
|
||||
|
||||
|
||||
public boolean matches(ElementFeatures elementFeatures) {
|
||||
|
||||
if (elementFeatures instanceof FormFeatures features) {
|
||||
return elementFeatures.getElementType() == getElementType()
|
||||
&& elementFeatures.getBoundingBox() != null
|
||||
&& (super.bboxMatches(elementFeatures.getBoundingBox())
|
||||
|| rotationMatches(elementFeatures.getBoundingBox()
|
||||
.getBounds2D()))
|
||||
&& xObjectType == features.getXObjectType()
|
||||
&& dictOrArrayOrStreamLength == features.getDictOrArrayOrStreamLength();
|
||||
}
|
||||
return false;
|
||||
|
||||
}
|
||||
|
||||
|
||||
private boolean rotationMatches(Rectangle2D bBox) {
|
||||
|
||||
return almostEqual(bBox.getWidth(), getBoundingBox().getHeight()) && //
|
||||
almostEqual(bBox.getHeight(), getBoundingBox().getWidth());
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,116 @@
|
||||
package com.iqser.red.pdftronlogic.commons.features;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Optional;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.ComparisonUtils;
|
||||
import com.iqser.red.pdftronlogic.commons.Converter;
|
||||
import com.iqser.red.pdftronlogic.commons.PDFNetUtils;
|
||||
import com.pdftron.common.Matrix2D;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.Font;
|
||||
import com.pdftron.pdf.PathData;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class GlyphInfo {
|
||||
|
||||
final Matrix2D glyphMatrix;
|
||||
final long charCode;
|
||||
final Font font;
|
||||
|
||||
// in order to speed up invisible element removal, we only calculate the pathdata where necessary, as it is the costliest operation.
|
||||
// It will only work as long as the associated ElementReader is still open, as the Font is bound to the ContentStream being read.
|
||||
Rectangle2D bbox;
|
||||
final boolean cachePathData;
|
||||
PathData pathData;
|
||||
|
||||
boolean overlapped;
|
||||
ElementFeatures overlappingElement;
|
||||
|
||||
|
||||
public boolean testOverlapped(ElementFeatures overlappingElement) {
|
||||
|
||||
if (overlapped) {
|
||||
return true;
|
||||
}
|
||||
Optional<Rectangle2D> bbox = getBoundingBox();
|
||||
if (bbox.isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (overlappingElement.getOverlapShape().contains(ComparisonUtils.shrinkRectangle(bbox.get()))) {
|
||||
overlapped = true;
|
||||
this.overlappingElement = overlappingElement;
|
||||
}
|
||||
|
||||
return overlapped;
|
||||
|
||||
}
|
||||
|
||||
|
||||
public String getUnicode() {
|
||||
|
||||
try {
|
||||
return new String(font.mapToUnicode(charCode));
|
||||
} catch (PDFNetException e) {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public Optional<PathData> getPathData() {
|
||||
|
||||
if (pathData == null) {
|
||||
|
||||
PDFNetUtils.requireFontNotClosed(font);
|
||||
|
||||
PathData computedPathData = font.getGlyphPath(charCode, true, glyphMatrix);
|
||||
if (computedPathData.getOperators().length == 1 && computedPathData.getOperators()[0] == 6) {
|
||||
// This happens for some chinese characters or whitespaces, don't know why...
|
||||
return Optional.empty();
|
||||
}
|
||||
if (cachePathData) {
|
||||
pathData = computedPathData;
|
||||
}
|
||||
return Optional.of(computedPathData);
|
||||
}
|
||||
return Optional.of(pathData);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public Optional<Rectangle2D> getBoundingBox() {
|
||||
|
||||
if (bbox == null) {
|
||||
Optional<PathData> pathData = getPathData();
|
||||
if (pathData.isEmpty()) {
|
||||
return Optional.empty();
|
||||
}
|
||||
bbox = Converter.convertToGeneralPath(pathData.get()).getBounds2D();
|
||||
}
|
||||
return Optional.of(bbox);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void destroy() {
|
||||
|
||||
if (glyphMatrix != null) {
|
||||
glyphMatrix.close();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,75 @@
|
||||
package com.iqser.red.pdftronlogic.commons.features;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Getter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@Getter
|
||||
@SuperBuilder
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class ImageFeatures extends ElementFeatures {
|
||||
|
||||
final private static double HAMMING_DISTANCE_THRESHOLD = 4; // defines the similarity of the hash of images
|
||||
int dataSize;
|
||||
int height;
|
||||
int width;
|
||||
int renderingIntent;
|
||||
int componentNum;
|
||||
int bitsPerComponent;
|
||||
boolean imageMask;
|
||||
boolean softMask;
|
||||
boolean masked;
|
||||
boolean transparent;
|
||||
String hashOfImage;
|
||||
|
||||
|
||||
@Override
|
||||
public boolean matches(ElementFeatures elementFeatures) {
|
||||
|
||||
if (elementFeatures instanceof ImageFeatures imageFeatures) {
|
||||
return super.matches(elementFeatures)
|
||||
&& this.dataSize == imageFeatures.getDataSize()
|
||||
&& this.height == imageFeatures.getHeight()
|
||||
&& this.width == imageFeatures.getWidth()
|
||||
&& this.renderingIntent == imageFeatures.getRenderingIntent()
|
||||
&& this.componentNum == imageFeatures.getComponentNum()
|
||||
&& this.bitsPerComponent == imageFeatures.getBitsPerComponent()
|
||||
&& this.imageMask == imageFeatures.isImageMask()
|
||||
&& this.softMask == imageFeatures.isSoftMask()
|
||||
&& this.transparent == imageFeatures.isTransparent()
|
||||
&& calculateHammingDistance(imageFeatures.getHashOfImage()) <= HAMMING_DISTANCE_THRESHOLD;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
public boolean similar(ElementFeatures elementFeatures) {
|
||||
|
||||
return super.similar(elementFeatures) && //
|
||||
calculateHammingDistance(((ImageFeatures) elementFeatures).getHashOfImage()) <= HAMMING_DISTANCE_THRESHOLD;
|
||||
}
|
||||
|
||||
|
||||
// Helper method to calculate the Hamming distance between two hexadecimal strings
|
||||
private int calculateHammingDistance(String hash2) {
|
||||
|
||||
if (hash2 == null) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int distance = 0;
|
||||
int maxLength = Math.max(this.hashOfImage.length(), hash2.length());
|
||||
for (int i = 0; i < maxLength; i++) {
|
||||
char char1 = i < this.hashOfImage.length() ? this.hashOfImage.charAt(i) : '0';
|
||||
char char2 = i < hash2.length() ? hash2.charAt(i) : '0';
|
||||
if (char1 != char2) {
|
||||
distance++;
|
||||
}
|
||||
}
|
||||
return distance;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,51 @@
|
||||
package com.iqser.red.pdftronlogic.commons.features;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.Shape;
|
||||
import java.awt.geom.GeneralPath;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Getter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
@Getter
|
||||
@SuperBuilder
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
public class PathFeatures extends ElementFeatures {
|
||||
|
||||
boolean clippingPath;
|
||||
boolean clipWindingFill;
|
||||
boolean stroked;
|
||||
boolean filled;
|
||||
boolean windingFill;
|
||||
Color strokeColor;
|
||||
Color fillColor;
|
||||
GeneralPath linePath;
|
||||
|
||||
|
||||
@Override
|
||||
public boolean matches(ElementFeatures element) {
|
||||
|
||||
if (element instanceof PathFeatures pathFeaturesElement) {
|
||||
return super.matches(element)
|
||||
&& clippingPath == pathFeaturesElement.isClippingPath()
|
||||
&& clipWindingFill == pathFeaturesElement.isClipWindingFill()
|
||||
&& stroked == pathFeaturesElement.isStroked()
|
||||
&& filled == pathFeaturesElement.isFilled()
|
||||
&& windingFill == pathFeaturesElement.isWindingFill();
|
||||
}
|
||||
return false;
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Shape getOverlapShape() {
|
||||
|
||||
return linePath;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,60 @@
|
||||
package com.iqser.red.pdftronlogic.commons.features;
|
||||
|
||||
import static com.iqser.red.pdftronlogic.commons.ComparisonUtils.almostEqual;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Builder;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Getter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@Getter
|
||||
@SuperBuilder
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
@SuppressWarnings("PMD")
|
||||
public class TextFeatures extends ElementFeatures {
|
||||
|
||||
String text;
|
||||
int font;
|
||||
double fontsize;
|
||||
|
||||
@Builder.Default
|
||||
List<GlyphInfo> glyphs = new ArrayList<>();
|
||||
|
||||
|
||||
@Override
|
||||
public boolean matches(ElementFeatures element) {
|
||||
|
||||
if (element instanceof TextFeatures textFeaturesElement) {
|
||||
return super.matches(textFeaturesElement)//
|
||||
&& text.equals(textFeaturesElement.getText()) //
|
||||
&& font == textFeaturesElement.getFont()//
|
||||
&& almostEqual(fontsize, textFeaturesElement.getFontsize());
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
public boolean testOverlapped(ElementFeatures overlappingElement) {
|
||||
|
||||
if (glyphs.isEmpty()) {
|
||||
return super.testOverlapped(overlappingElement);
|
||||
}
|
||||
|
||||
return super.testOverlapped(overlappingElement) || glyphs.stream()
|
||||
.allMatch(glyph -> glyph.testOverlapped(overlappingElement));
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void destroy() {
|
||||
|
||||
glyphs.forEach(GlyphInfo::destroy);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,43 @@
|
||||
package com.iqser.red.pdftronlogic.commons.lookup;
|
||||
|
||||
import java.util.Optional;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@RequiredArgsConstructor
|
||||
public class AnyMatchVisitor implements ElementFeatureVisitor {
|
||||
|
||||
private final ElementFeatures queryFeatures;
|
||||
@Getter
|
||||
private ElementFeatures match;
|
||||
|
||||
|
||||
public Optional<ElementFeatures> getAnyMatch() {
|
||||
|
||||
return Optional.ofNullable(match);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void visitItem(ElementFeatures features) {
|
||||
|
||||
if (hasAnyMatch()) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (queryFeatures.matches(features)) {
|
||||
match = features;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private boolean hasAnyMatch() {
|
||||
|
||||
return getAnyMatch().isPresent();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,135 @@
|
||||
package com.iqser.red.pdftronlogic.commons.lookup;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.function.Consumer;
|
||||
import java.util.function.Predicate;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.Converter;
|
||||
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
|
||||
import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.Rect;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Getter
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class ElementFeatureLookup implements AutoCloseable {
|
||||
/*
|
||||
This class looks a bit weird and uses visitors since I tried to use the quadtree implementation by locationtech, as it uses Rectangles by default to query its data structure.
|
||||
Unfortunately there were always edge cases where it lost a few elements making it completely unusable. Further, it didn't even speed up the algorithm all that much.
|
||||
*/
|
||||
|
||||
List<ElementFeatures> allElements = new ArrayList<>();
|
||||
|
||||
|
||||
public void add(ElementFeatures elementFeatures) {
|
||||
|
||||
allElements.add(elementFeatures);
|
||||
}
|
||||
|
||||
|
||||
public void remove(ElementFeatures elementFeatures) {
|
||||
|
||||
allElements.remove(elementFeatures);
|
||||
}
|
||||
|
||||
|
||||
public Optional<ElementFeatures> anyMatch(ElementFeatures elementFeatures) {
|
||||
|
||||
AnyMatchVisitor visitor = new AnyMatchVisitor(elementFeatures);
|
||||
forEach(visitor::visitItem);
|
||||
return visitor.getAnyMatch();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public List<ElementFeatures> query(Predicate<ElementFeatures> predicate) {
|
||||
|
||||
PredicateItemVisitor visitor = new PredicateItemVisitor(predicate);
|
||||
forEach(visitor::visitItem);
|
||||
return visitor.getMatchingFeatures();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public List<ElementFeatures> findIntersecting(Rect bbox) {
|
||||
|
||||
Rectangle2D r = Converter.toRectangle2D(bbox);
|
||||
return query(elementFeatures -> elementFeatures.getBoundingBox().intersects(r));
|
||||
}
|
||||
|
||||
|
||||
public List<ElementFeatures> findOverlapped(ElementFeatures overlappingElement, boolean textOnly) {
|
||||
|
||||
List<ElementFeatures> overlappedElementFeatures = new LinkedList<>();
|
||||
|
||||
for (int i = 0; i < allElements.size(); i++) {
|
||||
ElementFeatures features = allElements.get(i);
|
||||
|
||||
if (textOnly && features.getElementType() != Element.e_text) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (features.getBoundingBox().intersects(overlappingElement.getBoundingBox())) {
|
||||
if (features.testOverlapped(overlappingElement)) {
|
||||
overlappedElementFeatures.add(features);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return overlappedElementFeatures;
|
||||
}
|
||||
|
||||
|
||||
public void forEach(Consumer<ElementFeatures> consumer) {
|
||||
|
||||
allElements.forEach(consumer);
|
||||
}
|
||||
|
||||
|
||||
public void clear() {
|
||||
|
||||
allElements.clear();
|
||||
}
|
||||
|
||||
|
||||
public boolean isEmpty() {
|
||||
|
||||
return allElements.isEmpty();
|
||||
}
|
||||
|
||||
|
||||
public int size() {
|
||||
|
||||
return allElements.size();
|
||||
}
|
||||
|
||||
|
||||
public void addAll(List<ElementFeatures> currentOverlappedElements) {
|
||||
|
||||
allElements.addAll(currentOverlappedElements);
|
||||
}
|
||||
|
||||
|
||||
public void removeAll(List<ElementFeatures> currentOverlappedElements) {
|
||||
|
||||
allElements.removeAll(currentOverlappedElements);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
|
||||
allElements.forEach(ElementFeatures::destroy);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,9 @@
|
||||
package com.iqser.red.pdftronlogic.commons.lookup;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
|
||||
|
||||
public interface ElementFeatureVisitor {
|
||||
|
||||
void visitItem(ElementFeatures features);
|
||||
|
||||
}
|
||||
@ -0,0 +1,29 @@
|
||||
package com.iqser.red.pdftronlogic.commons.lookup;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.function.Predicate;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@RequiredArgsConstructor
|
||||
public class PredicateItemVisitor implements ElementFeatureVisitor {
|
||||
|
||||
private final Predicate<ElementFeatures> predicate;
|
||||
@Getter
|
||||
private final List<ElementFeatures> matchingFeatures = new ArrayList<>();
|
||||
|
||||
|
||||
@Override
|
||||
public void visitItem(ElementFeatures features) {
|
||||
|
||||
if (predicate.test(features)) {
|
||||
matchingFeatures.add(features);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,91 @@
|
||||
package com.iqser.red.pdftronlogic.commons;
|
||||
|
||||
import static com.iqser.red.pdftronlogic.commons.VisualizationUtils.drawPathData;
|
||||
import static com.iqser.red.pdftronlogic.commons.VisualizationUtils.drawRect;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.io.FileOutputStream;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.features.GlyphInfo;
|
||||
import com.iqser.red.pdftronlogic.commons.features.ImageFeatures;
|
||||
import com.iqser.red.pdftronlogic.commons.features.TextFeatures;
|
||||
import com.pdftron.pdf.ElementBuilder;
|
||||
import com.pdftron.pdf.ElementWriter;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.PDFNet;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.sdf.SDFDoc;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@Disabled // makes no sense to run in pipeline
|
||||
public class GlyphExtractionTest {
|
||||
|
||||
@BeforeAll
|
||||
static void init() {
|
||||
|
||||
PDFNet.initialize(PDFTronConfig.license);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testGlyphExtraction() {
|
||||
|
||||
String file = "files/everyCharIsImage.pdf";
|
||||
|
||||
List<List<TextFeatures>> textsPerPage;
|
||||
List<List<ImageFeatures>> imagesPerPage;
|
||||
try (var in = Thread.currentThread().getContextClassLoader().getResourceAsStream(file)) {
|
||||
textsPerPage = PdfTextExtraction.extractAllGlyphsFromDocument(in, true);
|
||||
}
|
||||
try (var in = Thread.currentThread().getContextClassLoader().getResourceAsStream(file)) {
|
||||
imagesPerPage = PdfImageExtraction.extractImages(in);
|
||||
}
|
||||
try (var in = Thread.currentThread().getContextClassLoader().getResourceAsStream(file);//
|
||||
var out = new FileOutputStream(Path.of("/tmp/").resolve(Path.of(file).getFileName() + "_GLYPHS.pdf").toFile())) {
|
||||
|
||||
try (PDFDoc pdfDoc = new PDFDoc(in)) {
|
||||
for (int i = 0; i < pdfDoc.getPageCount(); i++) {
|
||||
Page page = pdfDoc.getPage(i + 1);
|
||||
List<TextFeatures> textFeaturesOnPage = textsPerPage.get(i);
|
||||
List<ImageFeatures> imageFeaturesOnPage = imagesPerPage.get(i);
|
||||
try (ElementWriter writer = new ElementWriter(); ElementBuilder builder = new ElementBuilder()) {
|
||||
writer.begin(page, ElementWriter.e_overlay, false);
|
||||
|
||||
for (ImageFeatures imageFeatures : imageFeaturesOnPage) {
|
||||
if (imageFeatures.getBoundingBox().getHeight() * imageFeatures.getBoundingBox().getWidth() >= page.getPageHeight() * page.getPageWidth() * 0.8) {
|
||||
continue;
|
||||
}
|
||||
drawRect(imageFeatures.getBoundingBox(), builder, writer, Color.CYAN, true);
|
||||
}
|
||||
for (TextFeatures textFeatures : textFeaturesOnPage) {
|
||||
|
||||
drawRect(textFeatures.getBoundingBox(), builder, writer, Color.BLUE);
|
||||
|
||||
for (GlyphInfo glyph : textFeatures.getGlyphs()) {
|
||||
|
||||
if (glyph.getPathData().isPresent() && glyph.getBoundingBox().isPresent()) {
|
||||
drawPathData(glyph.getPathData().get(), builder, writer, Color.BLACK);
|
||||
drawRect(ComparisonUtils.shrinkRectangle(glyph.getBoundingBox().get()), builder, writer, Color.RED);
|
||||
drawRect(glyph.getBoundingBox().get(), builder, writer, Color.MAGENTA);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
writer.end();
|
||||
}
|
||||
}
|
||||
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -3,9 +3,13 @@ package com.iqser.red.pdftronlogic.commons;
|
||||
import static com.iqser.red.pdftronlogic.commons.PdfTextExtraction.extractAllTextFromDocument;
|
||||
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
@ -13,15 +17,23 @@ import com.pdftron.pdf.PDFNet;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@SuppressWarnings("PMD")
|
||||
@Slf4j
|
||||
class InvisibleElementRemovalServiceTest {
|
||||
|
||||
InvisibleElementRemovalService invisibleElementRemovalService;
|
||||
|
||||
|
||||
@BeforeEach
|
||||
void createService() {
|
||||
@BeforeAll
|
||||
static void init() {
|
||||
|
||||
PDFNet.initialize(PDFTronConfig.license);
|
||||
}
|
||||
|
||||
|
||||
@BeforeEach
|
||||
void createServices() {
|
||||
|
||||
invisibleElementRemovalService = new InvisibleElementRemovalService();
|
||||
}
|
||||
|
||||
@ -53,6 +65,47 @@ class InvisibleElementRemovalServiceTest {
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
void page32DoesNotCrash() {
|
||||
|
||||
String fileName = "files/Page32.pdf";
|
||||
|
||||
try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new ByteArrayOutputStream()) {
|
||||
invisibleElementRemovalService.removeInvisibleElements(in, out, false);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
void removeInvisibleTextClippedByFormObjects() {
|
||||
|
||||
String fileName = "files/invisibleTextInNestedFormObjects.pdf";
|
||||
String resultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL");
|
||||
String deltaResultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL_DELTA");
|
||||
try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(resultFileName)) {
|
||||
invisibleElementRemovalService.removeInvisibleElements(in, out, false);
|
||||
}
|
||||
|
||||
try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(deltaResultFileName)) {
|
||||
invisibleElementRemovalService.removeInvisibleElements(in, out, true);
|
||||
}
|
||||
try (var in = new FileInputStream(resultFileName)) {
|
||||
String text = extractAllTextFromDocument(in);
|
||||
assertThat(text).isBlank();
|
||||
}
|
||||
try (var in = new FileInputStream(deltaResultFileName)) {
|
||||
String[] text = extractAllTextFromDocument(in).split("\n");
|
||||
assertThat(text).contains(":Bold S-enantiomer form if two codes are supplied",
|
||||
"Red : Only observed in laboratory soil studies",
|
||||
"Green : Observed in both laboratory soil studies and lysimeter leachate",
|
||||
"Blue : Only observed in lysimeter leachate");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
void removeInvisibleElementsWithColoredBackground() {
|
||||
@ -71,9 +124,20 @@ class InvisibleElementRemovalServiceTest {
|
||||
try (var in = new FileInputStream(deltaResultFileName)) {
|
||||
String result = PdfTextExtraction.extractAllTextFromDocument(in);
|
||||
assertThat(result).contains("#1 Dark",
|
||||
"#13 Yellow",
|
||||
"Ut wisi enim ad minim veniam, quis nostrud exerci tation ullamcorper suscipit lobortis nisl ut aliquip\n" + "ex ea commodo consequat. Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie\n" + "consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim\n" + "qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi.");
|
||||
assertThat(result).doesNotContain("Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut\n" + "labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et\n" + "ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem\n" + "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et\n" + "dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea\n" + "rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum\n" + "dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore\n" + "magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet\n" + "clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. ");
|
||||
"#13 Yellow",
|
||||
"Ut wisi enim ad minim veniam, quis nostrud exerci tation ullamcorper suscipit lobortis nisl ut aliquip\n"
|
||||
+ "ex ea commodo consequat. Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie\n"
|
||||
+ "consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim\n"
|
||||
+ "qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi.");
|
||||
assertThat(result).doesNotContain("Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut\n"
|
||||
+ "labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et\n"
|
||||
+ "ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem\n"
|
||||
+ "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et\n"
|
||||
+ "dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea\n"
|
||||
+ "rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum\n"
|
||||
+ "dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore\n"
|
||||
+ "magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet\n"
|
||||
+ "clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. ");
|
||||
}
|
||||
|
||||
}
|
||||
@ -124,4 +188,49 @@ class InvisibleElementRemovalServiceTest {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
void removeInvisibleElementsButKeepOCRText() {
|
||||
|
||||
String fileName = "files/singlePageWithOcrText.pdf";
|
||||
String resultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL");
|
||||
String deltaResultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL_DELTA");
|
||||
|
||||
try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(resultFileName)) {
|
||||
invisibleElementRemovalService.removeInvisibleElementsButKeepOcrText(in, out, false);
|
||||
}
|
||||
|
||||
try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(deltaResultFileName)) {
|
||||
invisibleElementRemovalService.removeInvisibleElementsButKeepOcrText(in, out, true);
|
||||
}
|
||||
try (var in = new FileInputStream(resultFileName)) {
|
||||
String result = PdfTextExtraction.extractAllTextFromDocument(in);
|
||||
assertThat(result).contains("TABLE 17:", "Intergroup comparison oftotal litter", "TABLE 20:");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
void removeInvisibleElementsWhereEachCharIsImage() {
|
||||
|
||||
String fileName = "files/everyCharIsImage.pdf";
|
||||
String resultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL");
|
||||
String deltaResultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL_DELTA");
|
||||
|
||||
try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(resultFileName)) {
|
||||
invisibleElementRemovalService.removeInvisibleElementsButKeepOcrText(in, out, false);
|
||||
}
|
||||
|
||||
try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(deltaResultFileName)) {
|
||||
invisibleElementRemovalService.removeInvisibleElementsButKeepOcrText(in, out, true);
|
||||
}
|
||||
try (var in = new FileInputStream(resultFileName)) {
|
||||
String result = PdfTextExtraction.extractAllTextFromDocument(in);
|
||||
assertThat(result).isBlank();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,314 @@
|
||||
package com.iqser.red.pdftronlogic.commons;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.rendering.GhostScriptService;
|
||||
import com.iqser.red.pdftronlogic.commons.rendering.ImageFile;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.PDFNet;
|
||||
import com.pdftron.sdf.SDFDoc;
|
||||
import com.sun.jna.NativeLibrary;
|
||||
import com.sun.jna.Pointer;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import net.sourceforge.lept4j.Box;
|
||||
import net.sourceforge.lept4j.Boxa;
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.lept4j.util.LeptUtils;
|
||||
|
||||
@Disabled // requires leptonica and ghostscript to be installed locally
|
||||
public class VisualEqualityTest {
|
||||
|
||||
/*
|
||||
We render both the origin and the processed file and then computes a diff per page, we then threshold and invert the diff.
|
||||
This means, a visual difference of luminance greater than the threshold value shows up as a black pixel.
|
||||
We then use Heckbert's Seed Fill Algorithm to detect connected black regions by recursively flooding connected pixels.
|
||||
We then filter these error regions, ensuring their area is at least the threshold.
|
||||
We do this, since single pixel errors are frequent, but cannot be perceived by a human. Most likely some float inaccuracies.
|
||||
If there are any error regions left, we count the test as failed.
|
||||
*/
|
||||
private static final int ERROR_REGION_AREA_THRESHOLD = 10;
|
||||
public static final int LUMINANCE_DIFFERENCE_THRESHOLD = 170;
|
||||
|
||||
private static final Path TEST_OUTPUT_DIR = Path.of("/tmp/AAA_EQUALITY_TEST/");
|
||||
private static final String LEPTONICA_DIR = "/home/kschuettler/software/leptonica/vcpkg/installed/x64-linux-dynamic/lib/";
|
||||
|
||||
GhostScriptService ghostScriptService = new GhostScriptService();
|
||||
InvisibleElementRemovalService invisibleElementRemovalService = new InvisibleElementRemovalService();
|
||||
|
||||
|
||||
@BeforeEach
|
||||
public void setup() {
|
||||
|
||||
PDFNet.initialize(PDFTronConfig.license);
|
||||
System.setProperty("jna.library.path", LEPTONICA_DIR);
|
||||
|
||||
try (NativeLibrary leptonicaLib = NativeLibrary.getInstance("leptonica")) {
|
||||
assert leptonicaLib != null;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void assertVisualEqualityOfProcessedFile() {
|
||||
|
||||
Path file = Path.of("/home/kschuettler/Dokumente/TestFiles/full_syn_dm_testfiles/3977411_Final_Thiamethoxam_SL_MNLY.pdf");
|
||||
Context context = new Context(TEST_OUTPUT_DIR, new HashMap<>());
|
||||
|
||||
runForFile(file, context);
|
||||
|
||||
System.out.println(context);
|
||||
|
||||
assert context.failedFiles.isEmpty();
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void assertVisualEqualityOfProcessedFolder() {
|
||||
|
||||
Path folder = Path.of("/home/kschuettler/Dokumente/TestFiles/full_syn_dm_testfiles");
|
||||
Context context = new Context(TEST_OUTPUT_DIR, new HashMap<>());
|
||||
|
||||
Files.walk(folder)
|
||||
.filter(Files::isRegularFile)
|
||||
.map(Path::toFile)
|
||||
.filter(file -> file.toString().endsWith(".pdf"))
|
||||
.map(File::toPath)
|
||||
.peek(file -> runForFile(file, context))
|
||||
.forEach(f -> System.out.println(context));
|
||||
|
||||
assert context.failedFiles.isEmpty();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void runForFile(Path originFile, Context context) {
|
||||
|
||||
System.out.println(originFile.toFile());
|
||||
Path fileFolder = context.getFileFolder(originFile);
|
||||
Files.createDirectories(fileFolder);
|
||||
Path processedFile = fileFolder.resolve("processed.pdf");
|
||||
Path deltaFile = fileFolder.resolve("delta.pdf");
|
||||
Path savedOriginFile = fileFolder.resolve("origin.pdf");
|
||||
|
||||
try (var in = new FileInputStream(originFile.toFile()); PDFDoc pdfDoc = new PDFDoc(in); var out = new FileOutputStream(savedOriginFile.toFile())) {
|
||||
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
|
||||
}
|
||||
|
||||
try (var in = new FileInputStream(originFile.toFile()); var out = new FileOutputStream(processedFile.toFile())) {
|
||||
invisibleElementRemovalService.removeInvisibleElements(in, out, false);
|
||||
}
|
||||
try (var in = new FileInputStream(originFile.toFile()); var out = new FileOutputStream(deltaFile.toFile())) {
|
||||
invisibleElementRemovalService.removeInvisibleElements(in, out, true);
|
||||
}
|
||||
System.out.println("removed invisible elements");
|
||||
assertVisualEquality(savedOriginFile, processedFile, context);
|
||||
System.out.println("finished visual equality check");
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void assertVisualEquality(Path originFile, Path processedFile, Context context) {
|
||||
|
||||
Path imageDir = context.getFileFolder(originFile).resolve("images");
|
||||
Path originDir = imageDir.resolve("origin");
|
||||
Files.createDirectories(originDir);
|
||||
CompletableFuture<List<ImageFile>> originalPagesFuture = ghostScriptService.renderDocument(originFile, originDir);
|
||||
Path processedDir = imageDir.resolve("processed");
|
||||
Files.createDirectories(processedDir);
|
||||
CompletableFuture<List<ImageFile>> processedPagesFuture = ghostScriptService.renderDocument(processedFile, processedDir);
|
||||
Files.walk(context.getErrorFolder(originFile))
|
||||
.map(Path::toFile)
|
||||
.filter(File::isFile)
|
||||
.forEach(File::delete);
|
||||
|
||||
List<ImageFile> originalPages = originalPagesFuture.join();
|
||||
List<ImageFile> processedPages = processedPagesFuture.join();
|
||||
|
||||
if (originalPages.size() != processedPages.size()) {
|
||||
context.getFailedFile(originFile).addErrorMessage("Differing page counts!");
|
||||
return;
|
||||
}
|
||||
|
||||
for (ImageFile originalPage : originalPages) {
|
||||
Optional<ImageFile> samePage = processedPages.stream()
|
||||
.filter(p -> p.pageNumber() == originalPage.pageNumber())
|
||||
.findFirst();
|
||||
if (samePage.isEmpty()) {
|
||||
context.getFailedFile(originFile).addErrorMessage("Page " + originalPage.pageNumber() + " missing!");
|
||||
return;
|
||||
}
|
||||
ImageFile processedPage = samePage.get();
|
||||
Pix originalPagePix;
|
||||
Pix processedPagePix;
|
||||
|
||||
synchronized (VisualEqualityTest.class) {
|
||||
originalPagePix = originalPage.readPix();
|
||||
processedPagePix = processedPage.readPix();
|
||||
}
|
||||
|
||||
String errorFile = context.getErrorFolder(originFile).resolve(originalPage.pageNumber() + ".tiff").toFile().toString();
|
||||
List<Rectangle2D> errorRegions = detectErrors(originalPagePix, processedPagePix, errorFile);
|
||||
|
||||
if (!errorRegions.isEmpty()) {
|
||||
context.getFailedFile(originFile).addErrorMessage("Page " + originalPage.pageNumber() + " has " + errorRegions.size() + " errors!");
|
||||
}
|
||||
|
||||
synchronized (VisualEqualityTest.class) {
|
||||
LeptUtils.disposePix(originalPagePix);
|
||||
LeptUtils.disposePix(processedPagePix);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
synchronized public List<Rectangle2D> detectErrors(Pix pix1, Pix pix2, String errorFile) {
|
||||
|
||||
Pix pixDiff = Leptonica1.pixAbsDifference(pix1, pix2);
|
||||
|
||||
Pix pixThresh = Leptonica1.pixThresholdToBinary(pixDiff, LUMINANCE_DIFFERENCE_THRESHOLD);
|
||||
Leptonica1.pixInvert(pixThresh, pixThresh);
|
||||
// checks for connected black regions and outputs them as a list of boxes, a boxa
|
||||
Boxa boxa = Leptonica1.pixConnComp(pixThresh, null, 8);
|
||||
|
||||
List<Rectangle2D> errorRegions = readRectsFromBoxa(boxa).stream()
|
||||
.filter(box -> box.getWidth() * box.getHeight() >= ERROR_REGION_AREA_THRESHOLD)
|
||||
.toList();
|
||||
|
||||
if (!errorRegions.isEmpty()) {
|
||||
System.out.println("Found error(s) on page " + Path.of(errorFile).getFileName().toString().replace(".tiff", "") + ", writing error file.");
|
||||
// Boxa errorRegionsBoxa = pushRectsIntoBoxa(errorRegions); // this does not work
|
||||
// Pix errorPix = Leptonica1.pixDrawBoxa(pixThresh, errorRegionsBoxa, 2, -1); // somehow this runs forever
|
||||
Leptonica1.pixWrite(errorFile, pixThresh, 4);
|
||||
// LeptUtils.disposePix(errorPix);
|
||||
// LeptUtils.dispose(errorRegionsBoxa);
|
||||
}
|
||||
|
||||
LeptUtils.dispose(boxa);
|
||||
LeptUtils.disposePix(pixDiff);
|
||||
LeptUtils.disposePix(pixThresh);
|
||||
return errorRegions;
|
||||
}
|
||||
|
||||
|
||||
private static List<Rectangle2D> readRectsFromBoxa(Boxa boxa) {
|
||||
|
||||
Pointer[] pointers = boxa.box.getPointer().getPointerArray(0, boxa.n);
|
||||
List<Rectangle2D> boxes = new ArrayList<>(boxa.n);
|
||||
for (int i = 0; i < boxa.n; i++) {
|
||||
Box box = new Box(pointers[i]);
|
||||
boxes.add(new Rectangle2D.Double(box.x, box.y, box.w, box.h));
|
||||
LeptUtils.dispose(box);
|
||||
}
|
||||
return boxes;
|
||||
}
|
||||
|
||||
/*
|
||||
private static Boxa pushRectsIntoBoxa(List<Rectangle2D> rects) {
|
||||
|
||||
Boxa boxa = new Boxa();
|
||||
boxa.n = rects.size();
|
||||
boxa.nalloc = rects.size();
|
||||
|
||||
Memory boxMemory = new Memory((long) Native.POINTER_SIZE * rects.size());
|
||||
|
||||
for (int i = 0; i < rects.size(); i++) {
|
||||
Rectangle2D rect = rects.get(i);
|
||||
Box box = new Box((int) rect.getX(), (int) rect.getY(), (int) rect.getWidth(), (int) rect.getHeight(), 0);
|
||||
|
||||
boxMemory.setPointer((long) i * Native.POINTER_SIZE, box.getPointer());
|
||||
}
|
||||
|
||||
boxa.box = new PointerByReference(boxMemory);
|
||||
|
||||
return boxa;
|
||||
}
|
||||
|
||||
*/
|
||||
|
||||
private record Context(Path outFolder, Map<Path, FailedFile> failedFiles) {
|
||||
|
||||
public FailedFile getFailedFile(Path path) {
|
||||
|
||||
return failedFiles.computeIfAbsent(path, p -> FailedFile.init());
|
||||
}
|
||||
|
||||
|
||||
public Path getFileFolder(Path file) {
|
||||
|
||||
return outFolder.resolve(file.getFileName());
|
||||
}
|
||||
|
||||
|
||||
public String toString() {
|
||||
|
||||
if (failedFiles.isEmpty()) {
|
||||
return "All files visually equal!";
|
||||
}
|
||||
StringBuilder sb = new StringBuilder();
|
||||
failedFiles.forEach((file, failedFile) -> sb.append(file.getFileName().toFile()).append(": ").append(failedFile.toString()).append("\n"));
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public Path getErrorFolder(Path originFile) {
|
||||
|
||||
Path errorDir = getFileFolder(originFile).resolve("error");
|
||||
Files.createDirectories(errorDir);
|
||||
return errorDir;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private record FailedFile(Map<ImageFile, FailedPage> failedPages, List<String> errors) {
|
||||
|
||||
public static FailedFile init() {
|
||||
|
||||
return new FailedFile(new HashMap<>(), new LinkedList<>());
|
||||
}
|
||||
|
||||
|
||||
public void addErrorMessage(String s) {
|
||||
|
||||
errors.add(s);
|
||||
}
|
||||
|
||||
|
||||
public void addFailedPage(ImageFile imageFile, double location) {
|
||||
|
||||
failedPages.computeIfAbsent(imageFile, file -> new FailedPage(new LinkedList<>())).locations().add(location);
|
||||
}
|
||||
|
||||
|
||||
public String toString() {
|
||||
|
||||
return String.join(", ", errors);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private record FailedPage(List<Double> locations) {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -13,6 +13,7 @@ import com.pdftron.pdf.PDFNet;
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@Disabled
|
||||
@SuppressWarnings("PMD")
|
||||
class WatermarkRemovalServiceTest {
|
||||
|
||||
@SneakyThrows
|
||||
@ -23,7 +24,7 @@ class WatermarkRemovalServiceTest {
|
||||
|
||||
WatermarkRemovalService watermarkRemovalService = new WatermarkRemovalService();
|
||||
|
||||
String filename = "files/1.A16148F - Toxicidade oral aguda (1).pdf";
|
||||
String filename = "files/syngenta/CustomerFiles/1.A16148F - Toxicidade oral aguda (1).pdf";
|
||||
|
||||
String tmpFilename = createTmpFileName(filename, "WATERMARK_REMOVAL");
|
||||
try (var in = this.getClass().getClassLoader().getResourceAsStream(filename); var out = new FileOutputStream(tmpFilename)) {
|
||||
|
||||
@ -0,0 +1,145 @@
|
||||
package com.iqser.red.pdftronlogic.commons.rendering;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.Map;
|
||||
import java.util.function.Consumer;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class GhostScriptOutputHandler extends Thread {
|
||||
|
||||
static Pattern pageFinishedPattern = Pattern.compile("Page (\\d+)");
|
||||
|
||||
// If the stdError or stdOut buffer of a thread is not being emptied it might lock the process in case of errors, so we need to empty both streams to prevent a deadlock.
|
||||
// Since both need to read simultaneously we need to implement the readers as separate threads.
|
||||
|
||||
final InputStream is;
|
||||
final String processName;
|
||||
final Type type;
|
||||
|
||||
final Map<Integer, ImageFile> pagesToProcess;
|
||||
final Consumer<ImageFile> outputHandler;
|
||||
final Consumer<String> errorHandler;
|
||||
|
||||
int currentPageNumber;
|
||||
|
||||
|
||||
public static GhostScriptOutputHandler stdError(InputStream is, Consumer<String> errorHandler) {
|
||||
|
||||
return new GhostScriptOutputHandler(is, "GS", Type.ERROR, null, null, errorHandler);
|
||||
}
|
||||
|
||||
|
||||
public static GhostScriptOutputHandler stdOut(InputStream is, Map<Integer, ImageFile> pagesToProcess, Consumer<ImageFile> imageFileOutput, Consumer<String> errorHandler) {
|
||||
|
||||
return new GhostScriptOutputHandler(is, "GS", Type.STD_OUT, pagesToProcess, imageFileOutput, errorHandler);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void run() {
|
||||
|
||||
try (InputStreamReader isr = new InputStreamReader(is); BufferedReader br = new BufferedReader(isr)) {
|
||||
|
||||
String line;
|
||||
while (true) {
|
||||
line = br.readLine();
|
||||
|
||||
if (line == null) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (type.equals(Type.ERROR)) {
|
||||
log.error("{}_{}>{}", processName, type.name(), line);
|
||||
} else {
|
||||
log.debug("{}_{}>{}", processName, type.name(), line);
|
||||
addProcessedImageToQueue(line);
|
||||
}
|
||||
}
|
||||
}
|
||||
is.close();
|
||||
if (type.equals(Type.STD_OUT)) {
|
||||
queueFinishedPage(currentPageNumber);
|
||||
|
||||
if (!pagesToProcess.isEmpty()) {
|
||||
errorHandler.accept(String.format("Ghostscript finished for batch, but pages %s remain unprocessed.", formatPagesToProcess()));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private String formatPagesToProcess() {
|
||||
|
||||
if (pagesToProcess.isEmpty()) {
|
||||
return "-";
|
||||
}
|
||||
|
||||
if (pagesToProcess.size() == 1) {
|
||||
return pagesToProcess.keySet()
|
||||
.iterator().next().toString();
|
||||
}
|
||||
return pagesToProcess.keySet()
|
||||
.stream()
|
||||
.mapToInt(Integer::intValue)
|
||||
.min()
|
||||
.orElse(0) + "-" + pagesToProcess.keySet()
|
||||
.stream()
|
||||
.mapToInt(Integer::intValue)
|
||||
.max()
|
||||
.orElse(0);
|
||||
}
|
||||
|
||||
|
||||
private void addProcessedImageToQueue(String line) {
|
||||
|
||||
/*
|
||||
Ghostscript prints the pageNumber it is currently working on, so we remember the current page and queue it as soon as the next comes in.
|
||||
*/
|
||||
Matcher pageNumberMatcher = pageFinishedPattern.matcher(line);
|
||||
if (pageNumberMatcher.find()) {
|
||||
int pageNumber = Integer.parseInt(pageNumberMatcher.group(1));
|
||||
|
||||
if (currentPageNumber == 0) {
|
||||
currentPageNumber = pageNumber;
|
||||
return;
|
||||
}
|
||||
|
||||
queueFinishedPage(currentPageNumber);
|
||||
currentPageNumber = pageNumber;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void queueFinishedPage(int pageNumber) {
|
||||
|
||||
var imageFile = this.pagesToProcess.remove(pageNumber);
|
||||
if (imageFile == null) {
|
||||
errorHandler.accept(String.format("Page number %d does not exist in this thread. It only has pagenumbers %s", pageNumber, pagesToProcess.keySet()));
|
||||
} else {
|
||||
if (!new File(imageFile.absoluteFilePath()).exists()) {
|
||||
errorHandler.accept(String.format("Rendered page with number %d does not exist!", pageNumber));
|
||||
}
|
||||
}
|
||||
outputHandler.accept(imageFile);
|
||||
}
|
||||
|
||||
|
||||
public enum Type {
|
||||
ERROR,
|
||||
STD_OUT
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,192 @@
|
||||
package com.iqser.red.pdftronlogic.commons.rendering;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
import java.util.function.Consumer;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
@SuppressWarnings("PMD") // can't figure out how to safely close the stdOut and stdError streams in line 142/144
|
||||
public class GhostScriptService {
|
||||
|
||||
int BATCH_SIZE = 256;
|
||||
String FORMAT = ".tiff";
|
||||
String DEVICE = "tiffgray";
|
||||
int DPI = 100;
|
||||
int PROCESS_COUNT = 1;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public CompletableFuture<List<ImageFile>> renderDocument(Path documentFile, Path imageDir) {
|
||||
|
||||
int pageCount = getPageCount(documentFile);
|
||||
List<Integer> allPages = IntStream.range(1, pageCount + 1).boxed()
|
||||
.toList();
|
||||
ImageSupervisorImpl supervisor = new ImageSupervisorImpl(allPages);
|
||||
renderPagesBatched(allPages, documentFile.toFile().toString(), imageDir, supervisor, supervisor.successHandler(), supervisor.errorHandler());
|
||||
return CompletableFuture.supplyAsync(() -> awaitImageFiles(supervisor));
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private static List<ImageFile> awaitImageFiles(ImageSupervisorImpl supervisor) {
|
||||
|
||||
supervisor.awaitAll();
|
||||
return supervisor.getRenderedImages();
|
||||
}
|
||||
|
||||
|
||||
private static int getPageCount(Path documentFile) throws PDFNetException {
|
||||
|
||||
try (PDFDoc doc = new PDFDoc(documentFile.toFile().toString())) {
|
||||
return doc.getPageCount();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void renderPagesBatched(List<Integer> pagesToProcess,
|
||||
String documentAbsolutePath,
|
||||
Path tmpImageDir,
|
||||
ImageSupervisor supervisor,
|
||||
Consumer<ImageFile> successHandler,
|
||||
Consumer<String> errorHandler) {
|
||||
|
||||
List<List<ProcessInfo>> processInfoBatches = buildSubListForEachProcess(pagesToProcess,
|
||||
PROCESS_COUNT,
|
||||
BATCH_SIZE
|
||||
* PROCESS_COUNT); // GS has a limit on how many pageIndices per call are possible, so we limit it to 256 pages per process
|
||||
for (int batchIdx = 0; batchIdx < processInfoBatches.size(); batchIdx++) {
|
||||
|
||||
supervisor.requireNoErrors();
|
||||
|
||||
List<ProcessInfo> processInfos = processInfoBatches.get(batchIdx);
|
||||
|
||||
log.info("Batch {}: Running {} gs processes with ({}) pages each",
|
||||
batchIdx,
|
||||
processInfos.size(),
|
||||
processInfos.stream()
|
||||
.map(info -> info.pageNumbers().size())
|
||||
.map(String::valueOf)
|
||||
.collect(Collectors.joining(", ")));
|
||||
|
||||
int finalBatchIdx = batchIdx;
|
||||
List<Process> processes = processInfos.stream()
|
||||
.parallel()
|
||||
.map(info -> buildCmdArgs(info.processIdx(), finalBatchIdx, info.pageNumbers(), tmpImageDir, documentAbsolutePath))
|
||||
.peek(s -> log.debug(String.join(" ", s.cmdArgs())))
|
||||
.map(processInfo -> executeProcess(processInfo, successHandler, errorHandler))
|
||||
.toList();
|
||||
|
||||
List<Integer> processExitCodes = new LinkedList<>();
|
||||
for (Process process : processes) {
|
||||
processExitCodes.add(process.waitFor());
|
||||
}
|
||||
log.info("Batch {}: Ghostscript processes finished with exit codes {}", batchIdx, processExitCodes);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private List<List<ProcessInfo>> buildSubListForEachProcess(List<Integer> stitchedPageNumbers, int processCount, int batchSize) {
|
||||
|
||||
// GhostScript command line can only handle so many page numbers at once, so we split it into batches
|
||||
int batchCount = (int) Math.ceil((double) stitchedPageNumbers.size() / batchSize);
|
||||
|
||||
log.info("Splitting {} page renderings across {} process(es) in {} batch(es) with size {}", stitchedPageNumbers.size(), processCount, batchCount, batchSize);
|
||||
|
||||
List<List<ProcessInfo>> processInfoBatches = new ArrayList<>(batchCount);
|
||||
List<List<List<Integer>>> batchedBalancedSublist = ListSplittingUtils.buildBatchedBalancedSublist(stitchedPageNumbers.stream()
|
||||
.sorted()
|
||||
.toList(), processCount, batchCount);
|
||||
|
||||
for (var batch : batchedBalancedSublist) {
|
||||
List<ProcessInfo> processInfos = new ArrayList<>(processCount);
|
||||
for (int threadIdx = 0; threadIdx < batch.size(); threadIdx++) {
|
||||
List<Integer> balancedPageNumbersSubList = batch.get(threadIdx);
|
||||
processInfos.add(new ProcessInfo(threadIdx, balancedPageNumbersSubList));
|
||||
}
|
||||
processInfoBatches.add(processInfos);
|
||||
}
|
||||
return processInfoBatches;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private ProcessCmdsAndRenderedImageFiles buildCmdArgs(Integer processIdx,
|
||||
Integer batchIdx,
|
||||
List<Integer> stitchedImagePageIndices,
|
||||
Path outputDir,
|
||||
String documentAbsolutePath) {
|
||||
|
||||
String imagePathFormat = outputDir.resolve("output_" + processIdx + "_" + batchIdx + ".%04d" + FORMAT).toFile().toString();
|
||||
|
||||
Map<Integer, ImageFile> fullPageImages = new HashMap<>();
|
||||
for (int i = 0; i < stitchedImagePageIndices.size(); i++) {
|
||||
Integer pageNumber = stitchedImagePageIndices.get(i);
|
||||
fullPageImages.put(pageNumber, new ImageFile(pageNumber, String.format(imagePathFormat, i + 1)));
|
||||
}
|
||||
|
||||
String[] cmdArgs = buildCmdArgs(stitchedImagePageIndices, documentAbsolutePath, imagePathFormat);
|
||||
|
||||
return new ProcessCmdsAndRenderedImageFiles(cmdArgs, fullPageImages);
|
||||
}
|
||||
|
||||
|
||||
private String[] buildCmdArgs(List<Integer> pageNumbers, String documentAbsolutePath, String imagePathFormat) {
|
||||
|
||||
StringBuilder sPageList = new StringBuilder();
|
||||
int i = 1;
|
||||
for (Integer integer : pageNumbers) {
|
||||
sPageList.append(integer);
|
||||
if (i < pageNumbers.size()) {
|
||||
sPageList.append(",");
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
return new String[]{"gs", "-dNOPAUSE", "-sDEVICE=" + DEVICE, "-r" + DPI, "-sPageList=" + sPageList, "-sOutputFile=" + imagePathFormat, documentAbsolutePath, "-c", "quit"};
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private Process executeProcess(ProcessCmdsAndRenderedImageFiles processInfo, Consumer<ImageFile> successHandler, Consumer<String> errorHandler) {
|
||||
|
||||
Process p = Runtime.getRuntime().exec(processInfo.cmdArgs());
|
||||
InputStream stdOut = p.getInputStream();
|
||||
GhostScriptOutputHandler stdOutLogger = GhostScriptOutputHandler.stdOut(stdOut, processInfo.renderedPageImageFiles(), successHandler, errorHandler);
|
||||
InputStream stdError = p.getErrorStream();
|
||||
GhostScriptOutputHandler stdErrorLogger = GhostScriptOutputHandler.stdError(stdError, errorHandler);
|
||||
|
||||
stdOutLogger.start();
|
||||
stdErrorLogger.start();
|
||||
return p;
|
||||
}
|
||||
|
||||
|
||||
private record ProcessCmdsAndRenderedImageFiles(String[] cmdArgs, Map<Integer, ImageFile> renderedPageImageFiles) {
|
||||
|
||||
}
|
||||
|
||||
private record ProcessInfo(Integer processIdx, List<Integer> pageNumbers) {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,13 @@
|
||||
package com.iqser.red.pdftronlogic.commons.rendering;
|
||||
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
|
||||
public record ImageFile(int pageNumber, String absoluteFilePath) {
|
||||
|
||||
public Pix readPix() {
|
||||
|
||||
return Leptonica1.pixRead(absoluteFilePath);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,7 @@
|
||||
package com.iqser.red.pdftronlogic.commons.rendering;
|
||||
|
||||
/**
 * Gate that lets a rendering run check whether it may continue.
 */
public interface ImageSupervisor {

    /**
     * Verifies that no error has been recorded so far. Implementations are expected to signal
     * recorded errors by throwing; returning normally means processing may go on.
     */
    void requireNoErrors();

}
|
||||
@ -0,0 +1,114 @@
|
||||
package com.iqser.red.pdftronlogic.commons.rendering;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.CountDownLatch;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class ImageSupervisorImpl implements ImageSupervisor {
|
||||
|
||||
final Map<Integer, CountDownLatch> pageLatches;
|
||||
final Map<Integer, ImageFile> images;
|
||||
final List<String> errors;
|
||||
|
||||
final ImageFile[] finishedPages;
|
||||
|
||||
|
||||
public ImageSupervisorImpl(List<Integer> pageNumbers) {
|
||||
|
||||
this.pageLatches = Collections.synchronizedMap(new HashMap<>());
|
||||
this.images = Collections.synchronizedMap(new HashMap<>());
|
||||
this.errors = Collections.synchronizedList(new ArrayList<>());
|
||||
this.finishedPages = new ImageFile[pageNumbers.size()];
|
||||
for (Integer pageNumber : pageNumbers) {
|
||||
pageLatches.put(pageNumber, new CountDownLatch(1));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public List<ImageFile> getRenderedImages() {
|
||||
|
||||
return new ArrayList<>(images.values());
|
||||
}
|
||||
|
||||
|
||||
public void markPageFinished(ImageFile imageFile) {
|
||||
|
||||
log.debug("finished page: {}", imageFile.pageNumber());
|
||||
getPageLatch(imageFile.pageNumber()).countDown();
|
||||
images.put(imageFile.pageNumber(), imageFile);
|
||||
finishedPages[imageFile.pageNumber() - 1] = imageFile;
|
||||
}
|
||||
|
||||
|
||||
public Consumer<ImageFile> successHandler() {
|
||||
|
||||
return this::markPageFinished;
|
||||
}
|
||||
|
||||
|
||||
public Consumer<String> errorHandler() {
|
||||
|
||||
return this::markError;
|
||||
}
|
||||
|
||||
|
||||
private CountDownLatch getPageLatch(Integer pageNumber) {
|
||||
|
||||
if (pageNumber == null || !pageLatches.containsKey(pageNumber)) {
|
||||
throw new IllegalArgumentException("awaiting non-existent page " + pageNumber);
|
||||
}
|
||||
return pageLatches.get(pageNumber);
|
||||
}
|
||||
|
||||
|
||||
public ImageFile awaitProcessedPage(Integer pageNumber) throws InterruptedException {
|
||||
|
||||
if (hasErrors()) {
|
||||
return null;
|
||||
}
|
||||
getPageLatch(pageNumber).await();
|
||||
return images.get(pageNumber);
|
||||
}
|
||||
|
||||
|
||||
private boolean hasErrors() {
|
||||
|
||||
return errors.isEmpty();
|
||||
}
|
||||
|
||||
|
||||
public void markError(String errorMessage) {
|
||||
|
||||
this.errors.add(errorMessage);
|
||||
}
|
||||
|
||||
|
||||
public void awaitAll() throws InterruptedException {
|
||||
|
||||
for (CountDownLatch countDownLatch : pageLatches.values()) {
|
||||
countDownLatch.await();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void requireNoErrors() {
|
||||
// GS will log
|
||||
if (this.errors.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
throw new IllegalStateException("Error(s) occurred during image processing: " + String.join("\n", errors));
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,106 @@
|
||||
package com.iqser.red.pdftronlogic.commons.rendering;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class ListSplittingUtils {
|
||||
|
||||
public List<List<Integer>> buildBalancedContinuousSublist(Integer totalNumberOfEntries, int threadCount) {
|
||||
|
||||
return buildBalancedSublist(IntStream.range(0, totalNumberOfEntries)
|
||||
.map(i -> i + 1).boxed()
|
||||
.toList(), threadCount);
|
||||
}
|
||||
|
||||
|
||||
public <T> List<List<T>> buildBalancedSublist(List<T> entries, int threadCount) {
|
||||
|
||||
List<Integer> balancedEntryCounts = buildBalancedEntryCounts(entries.size(), threadCount);
|
||||
List<List<T>> balancedSublist = new ArrayList<>(threadCount);
|
||||
int startIdx = 0;
|
||||
for (Integer numberOfEntriesPerThread : balancedEntryCounts) {
|
||||
balancedSublist.add(entries.subList(startIdx, startIdx + numberOfEntriesPerThread));
|
||||
startIdx += numberOfEntriesPerThread;
|
||||
}
|
||||
return balancedSublist;
|
||||
}
|
||||
|
||||
|
||||
public <T> List<List<List<T>>> buildBatchedBalancedSublist(List<T> entries, int threadCount, int batchSize) {
|
||||
|
||||
// batches -> threads -> entries
|
||||
List<List<List<T>>> batchedBalancedSubList = new LinkedList<>();
|
||||
List<List<List<T>>> threadsWithBatches = buildBalancedSublist(entries, threadCount).stream()
|
||||
.map(list -> buildBalancedSublist(list, batchSize))
|
||||
.toList();
|
||||
// swap first two dimensions
|
||||
for (int batchIdx = 0; batchIdx < batchSize; batchIdx++) {
|
||||
List<List<T>> threadEntriesPerBatch = new ArrayList<>(threadCount);
|
||||
for (int threadIdx = 0; threadIdx < threadCount; threadIdx++) {
|
||||
threadEntriesPerBatch.add(threadsWithBatches.get(threadIdx).get(batchIdx));
|
||||
}
|
||||
batchedBalancedSubList.add(threadEntriesPerBatch);
|
||||
|
||||
}
|
||||
return batchedBalancedSubList;
|
||||
}
|
||||
|
||||
|
||||
public List<Integer> buildBalancedEntryCounts(int totalNumberOfEntries, int threadCount) {
|
||||
|
||||
List<Integer> numberOfPagesPerThread = new ArrayList<>(threadCount);
|
||||
for (int i = 0; i < threadCount; i++) {
|
||||
numberOfPagesPerThread.add(0);
|
||||
}
|
||||
int threadIdx;
|
||||
for (int i = 0; i < totalNumberOfEntries; i++) {
|
||||
threadIdx = i % threadCount;
|
||||
numberOfPagesPerThread.set(threadIdx, numberOfPagesPerThread.get(threadIdx) + 1);
|
||||
}
|
||||
return numberOfPagesPerThread;
|
||||
}
|
||||
|
||||
|
||||
public static List<String> formatIntervals(List<Integer> sortedList) {
|
||||
|
||||
List<String> intervals = new ArrayList<>();
|
||||
|
||||
if (sortedList.isEmpty()) {
|
||||
return intervals;
|
||||
}
|
||||
|
||||
int start = sortedList.get(0);
|
||||
int end = start;
|
||||
|
||||
for (int i = 1; i < sortedList.size(); i++) {
|
||||
int current = sortedList.get(i);
|
||||
|
||||
if (current == end + 1) {
|
||||
end = current;
|
||||
} else {
|
||||
intervals.add(formatInterval(start, end));
|
||||
start = current;
|
||||
end = start;
|
||||
}
|
||||
}
|
||||
|
||||
intervals.add(formatInterval(start, end));
|
||||
return intervals;
|
||||
}
|
||||
|
||||
|
||||
private static String formatInterval(int start, int end) {
|
||||
|
||||
if (start == end) {
|
||||
return String.valueOf(start);
|
||||
} else {
|
||||
return start + "-" + end;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
BIN
src/test/resources/files/Page32.pdf
Normal file
BIN
src/test/resources/files/Page32.pdf
Normal file
Binary file not shown.
1
src/test/resources/files/basf
Submodule
1
src/test/resources/files/basf
Submodule
@ -0,0 +1 @@
|
||||
Subproject commit 9dc6c2337dea32e63aef53271dba0692537c6605
|
||||
1243907
src/test/resources/files/everyCharIsImage.pdf
Normal file
1243907
src/test/resources/files/everyCharIsImage.pdf
Normal file
File diff suppressed because one or more lines are too long
BIN
src/test/resources/files/invisibleTextInNestedFormObjects.pdf
Normal file
BIN
src/test/resources/files/invisibleTextInNestedFormObjects.pdf
Normal file
Binary file not shown.
BIN
src/test/resources/files/singlePageWithOcrText.pdf
Normal file
BIN
src/test/resources/files/singlePageWithOcrText.pdf
Normal file
Binary file not shown.
1
src/test/resources/files/syngenta
Submodule
1
src/test/resources/files/syngenta
Submodule
@ -0,0 +1 @@
|
||||
Subproject commit 21fefb64bf27ca2b3329a6c69d90a27450b17930
|
||||
Loading…
x
Reference in New Issue
Block a user