RED-4875 - setup project and copy the 3 relevent classes from ocr-service

This commit is contained in:
Thomas Beyer 2023-03-16 15:35:41 +01:00
parent 65872f1282
commit 90ec4bd16d
10 changed files with 1092 additions and 0 deletions

36
bamboo-specs/pom.xml Normal file
View File

@ -0,0 +1,36 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.atlassian.bamboo</groupId>
<artifactId>bamboo-specs-parent</artifactId>
<version>8.1.3</version>
<relativePath/>
</parent>
<artifactId>bamboo-specs</artifactId>
<version>1.0.0-SNAPSHOT</version>
<packaging>jar</packaging>
<dependencies>
<dependency>
<groupId>com.atlassian.bamboo</groupId>
<artifactId>bamboo-specs-api</artifactId>
</dependency>
<dependency>
<groupId>com.atlassian.bamboo</groupId>
<artifactId>bamboo-specs</artifactId>
</dependency>
<!-- Test dependencies -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
<!-- run 'mvn test' to perform offline validation of the plan -->
<!-- run 'mvn -Ppublish-specs' to upload the plan to your Bamboo server -->
</project>

View File

@ -0,0 +1,127 @@
package buildjob;
import static com.atlassian.bamboo.specs.builders.task.TestParserTask.createJUnitParserTask;
import com.atlassian.bamboo.specs.api.BambooSpec;
import com.atlassian.bamboo.specs.api.builders.BambooKey;
import com.atlassian.bamboo.specs.api.builders.docker.DockerConfiguration;
import com.atlassian.bamboo.specs.api.builders.permission.PermissionType;
import com.atlassian.bamboo.specs.api.builders.permission.Permissions;
import com.atlassian.bamboo.specs.api.builders.permission.PlanPermissions;
import com.atlassian.bamboo.specs.api.builders.plan.Job;
import com.atlassian.bamboo.specs.api.builders.plan.Plan;
import com.atlassian.bamboo.specs.api.builders.plan.PlanIdentifier;
import com.atlassian.bamboo.specs.api.builders.plan.Stage;
import com.atlassian.bamboo.specs.api.builders.plan.branches.BranchCleanup;
import com.atlassian.bamboo.specs.api.builders.plan.branches.PlanBranchManagement;
import com.atlassian.bamboo.specs.api.builders.project.Project;
import com.atlassian.bamboo.specs.api.builders.Variable;
import com.atlassian.bamboo.specs.builders.task.CheckoutItem;
import com.atlassian.bamboo.specs.builders.task.InjectVariablesTask;
import com.atlassian.bamboo.specs.builders.task.ScriptTask;
import com.atlassian.bamboo.specs.builders.task.VcsCheckoutTask;
import com.atlassian.bamboo.specs.builders.task.VcsTagTask;
import com.atlassian.bamboo.specs.builders.trigger.BitbucketServerTrigger;
import com.atlassian.bamboo.specs.model.task.InjectVariablesScope;
import com.atlassian.bamboo.specs.util.BambooServer;
import com.atlassian.bamboo.specs.model.task.ScriptTaskProperties.Location;
/**
* Plan configuration for Bamboo.
* Learn more on: <a href="https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs">https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs</a>
*/
@BambooSpec
public class PlanSpec {
private static final String SERVICE_NAME = "pdftron-logic-commons";
private static final String SERVICE_KEY = SERVICE_NAME.toUpperCase().replaceAll("-", "");
/**
* Run main to publish plan on Bamboo
*/
public static void main(final String[] args) throws Exception {
//By default credentials are read from the '.credentials' file.
BambooServer bambooServer = new BambooServer("http://localhost:8085");
Plan plan = new PlanSpec().createPlan();
bambooServer.publish(plan);
PlanPermissions planPermission = new PlanSpec().createPlanPermission(plan.getIdentifier());
bambooServer.publish(planPermission);
}
private PlanPermissions createPlanPermission(PlanIdentifier planIdentifier) {
Permissions permission = new Permissions()
.userPermissions("atlbamboo", PermissionType.EDIT, PermissionType.VIEW, PermissionType.ADMIN, PermissionType.CLONE, PermissionType.BUILD)
.groupPermissions("development", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
.groupPermissions("devplant", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
.loggedInUserPermissions(PermissionType.VIEW)
.anonymousUserPermissionView();
return new PlanPermissions(planIdentifier.getProjectKey(), planIdentifier.getPlanKey()).permissions(permission);
}
private Project project() {
return new Project()
.name("RED")
.key(new BambooKey("RED"));
}
public Plan createPlan() {
return new Plan(
project(),
SERVICE_NAME, new BambooKey(SERVICE_KEY))
.description("Plan created from (enter repository url of your plan)")
.variables(new Variable("maven_add_param", ""))
.stages(new Stage("Default Stage")
.jobs(new Job("Default Job",
new BambooKey("JOB1"))
.tasks(
new ScriptTask()
.description("Clean")
.inlineBody("#!/bin/bash\n" +
"set -e\n" +
"rm -rf ./*"),
new VcsCheckoutTask()
.description("Checkout Default Repository")
.checkoutItems(new CheckoutItem().defaultRepository()),
new ScriptTask()
.description("Build")
.location(Location.FILE)
.fileFromPath("bamboo-specs/src/main/resources/scripts/build-java.sh")
.argument(SERVICE_NAME),
createJUnitParserTask()
.description("Resultparser")
.resultDirectories("**/test-reports/*.xml, **/target/surefire-reports/*.xml, **/target/failsafe-reports/*.xml")
.enabled(true),
new ScriptTask()
.description("Sonar")
.location(Location.FILE)
.fileFromPath("bamboo-specs/src/main/resources/scripts/sonar-java.sh")
.argument(SERVICE_NAME),
new InjectVariablesTask()
.description("Inject git Tag")
.path("git.tag")
.namespace("g")
.scope(InjectVariablesScope.LOCAL),
new VcsTagTask()
.description("${bamboo.g.gitTag}")
.tagName("${bamboo.g.gitTag}")
.defaultRepository())
.dockerConfiguration(
new DockerConfiguration()
.image("nexus.iqser.com:5001/infra/maven:3.8.4-openjdk-17-slim")
.volume("/etc/maven/settings.xml", "/usr/share/maven/conf/settings.xml")
.dockerRunArguments("--net=host")
.volume("/var/run/docker.sock", "/var/run/docker.sock")
)
)
)
.linkedRepositories("RED / " + SERVICE_NAME)
.triggers(new BitbucketServerTrigger())
.planBranchManagement(new PlanBranchManagement()
.createForVcsBranch()
.delete(new BranchCleanup()
.whenInactiveInRepositoryAfterDays(14))
.notificationForCommitters());
}
}

View File

@ -0,0 +1,52 @@
#!/bin/bash
set -e
SERVICE_NAME=$1
if [[ "$bamboo_planRepository_branchName" == "master" ]]
then
branchVersion=$(cat pom.xml | grep -Eo " <version>.*-SNAPSHOT</version>" | sed -s 's|<version>\(.*\)\..*\(-*.*\)</version>|\1|' | tr -d ' ')
latestVersion=$( semver $(git tag -l "${branchVersion}.*" ) | tail -n1 )
newVersion="$(semver $latestVersion -p -i minor)"
echo "new release on master with version $newVersion"
elif [[ "$bamboo_planRepository_branchName" == release* ]]
then
branchVersion=$(echo $bamboo_planRepository_branchName | sed -s 's|release\/\([0-9]\+\.[0-9]\+\)\.x|\1|')
latestVersion=$( semver $(git tag -l "${branchVersion}.*" ) | tail -n1 )
newVersion="$(semver $latestVersion -p -i patch)"
echo "new release on $bamboo_planRepository_branchName with version $newVersion"
elif [[ "${bamboo_version_tag}" != "dev" ]]
then
newVersion="${bamboo_version_tag}"
echo "new special version bild with $newVersion"
else
mvn -f ${bamboo_build_working_directory}/pom.xml \
--no-transfer-progress \
${bamboo_maven_add_param} \
clean install \
-Djava.security.egd=file:/dev/./urandomelse
echo "dev build with tag ${bamboo_planRepository_1_branch}_${bamboo_buildNumber}"
echo "gitTag=${bamboo_planRepository_1_branch}_${bamboo_buildNumber}" > git.tag
exit 0
fi
echo "gitTag=${newVersion}" > git.tag
mvn --no-transfer-progress \
-f ${bamboo_build_working_directory}/pom.xml \
${bamboo_maven_add_param} \
versions:set \
-DnewVersion=${newVersion}
mvn -f ${bamboo_build_working_directory}/pom.xml \
--no-transfer-progress \
clean deploy \
${bamboo_maven_add_param} \
-e \
-DdeployAtEnd=true \
-Dmaven.wagon.http.ssl.insecure=true \
-Dmaven.wagon.http.ssl.allowall=true \
-Dmaven.wagon.http.ssl.ignore.validity.dates=true \
-DaltDeploymentRepository=iqser_release::default::https://nexus.iqser.com/repository/red-platform-releases

View File

@ -0,0 +1,37 @@
#!/bin/bash
set -e
SERVICE_NAME=$1
echo "dependency-check:aggregate"
mvn --no-transfer-progress \
org.owasp:dependency-check-maven:aggregate \
-DknownExploitedEnabled=false
if [[ -z "${bamboo_repository_pr_key}" ]]
then
echo "Sonar Scan for branch: ${bamboo_planRepository_1_branch}"
mvn --no-transfer-progress \
sonar:sonar \
-Dsonar.projectKey=RED_$SERVICE_NAME \
-Dsonar.host.url=https://sonarqube.iqser.com \
-Dsonar.login=${bamboo_sonarqube_api_token_secret} \
-Dsonar.branch.name=${bamboo_planRepository_1_branch} \
-Dsonar.dependencyCheck.jsonReportPath=target/dependency-check-report.json \
-Dsonar.dependencyCheck.xmlReportPath=target/dependency-check-report.xml \
-Dsonar.dependencyCheck.htmlReportPath=target/dependency-check-report.html
else
echo "Sonar Scan for PR with key1: ${bamboo_repository_pr_key}"
mvn --no-transfer-progress \
sonar:sonar \
-Dsonar.projectKey=RED_$SERVICE_NAME \
-Dsonar.host.url=https://sonarqube.iqser.com \
-Dsonar.login=${bamboo_sonarqube_api_token_secret} \
-Dsonar.pullrequest.key=${bamboo_repository_pr_key} \
-Dsonar.pullrequest.branch=${bamboo_repository_pr_sourceBranch} \
-Dsonar.pullrequest.base=${bamboo_repository_pr_targetBranch} \
-Dsonar.dependencyCheck.jsonReportPath=target/dependency-check-report.json \
-Dsonar.dependencyCheck.xmlReportPath=target/dependency-check-report.xml \
-Dsonar.dependencyCheck.htmlReportPath=target/dependency-check-report.html
fi

View File

@ -0,0 +1,16 @@
package buildjob;
import com.atlassian.bamboo.specs.api.builders.plan.Plan;
import com.atlassian.bamboo.specs.api.exceptions.PropertiesValidationException;
import com.atlassian.bamboo.specs.api.util.EntityPropertiesBuilders;
import org.junit.Test;
public class PlanSpecTest {
@Test
public void checkYourPlanOffline() throws PropertiesValidationException {
Plan plan = new PlanSpec().createPlan();
EntityPropertiesBuilders.build(plan);
}
}

105
pom.xml Normal file
View File

@ -0,0 +1,105 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>platform-dependency</artifactId>
<groupId>com.iqser.red</groupId>
<version>1.17.0</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<artifactId>pdftron-logic-commons</artifactId>
<groupId>com.iqser.red.commons</groupId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-configuration-processor</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.pdftron</groupId>
<artifactId>PDFNet</artifactId>
<version>9.4.0</version>
<scope>provided</scope>
</dependency>
<!-- Test Dependencies -->
</dependencies>
<build>
<plugins>
<plugin>
<!-- create a test jar for the api classes to be used by other modules -->
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<executions>
<execution>
<goals>
<goal>test-jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.jacoco</groupId>
<artifactId>jacoco-maven-plugin</artifactId>
<executions>
<execution>
<id>prepare-agent</id>
<goals>
<goal>prepare-agent</goal>
</goals>
</execution>
<execution>
<id>report</id>
<goals>
<goal>report</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
<repositories>
<repository>
<id>pdftron</id>
<name>PDFNet Maven</name>
<url>https://pdftron.com/maven/release</url>
</repository>
</repositories>
</project>

View File

@ -0,0 +1,70 @@
package com.iqser.red.pdftronlogic.commons;
import static com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService.TOLERANCE;
import java.awt.geom.Area;
import java.awt.geom.GeneralPath;
import java.awt.geom.Rectangle2D;
import java.util.Deque;
import java.util.LinkedList;
import com.pdftron.pdf.Rect;
import lombok.Data;
import lombok.SneakyThrows;
@Data
public class ClippingPathStack {
private Deque<Area> stack = new LinkedList<>();
@SneakyThrows
public ClippingPathStack(Rect rectangle) {
stack.push(new Area(new Rectangle2D.Double(rectangle.getX1(), rectangle.getY1(), rectangle.getWidth(), rectangle.getHeight()).getBounds2D()));
}
@SneakyThrows
public void intersectClippingPath(GeneralPath path) {
getCurrentClippingPath().intersect(new Area(path));
}
public boolean almostIntersects(double x, double y, double width, double height) {
// To address inconsistencies in the calculation of the bounding box we slightly increase the rectangle
// Height or width are zero for straight lines, even though they are being rendered. Therefore, height or width must be at minimum >0.
double x_with_tolerance = x > 0 ? x - TOLERANCE : x + TOLERANCE;
double y_with_tolerance = y > 0 ? y - TOLERANCE : y + TOLERANCE;
double width_with_tolerance = width + (2 * TOLERANCE);
double height_with_tolerance = height + (2 * TOLERANCE);
return getCurrentClippingPath().intersects(x_with_tolerance, y_with_tolerance, width_with_tolerance, height_with_tolerance);
}
public Area getCurrentClippingPath() {
return stack.peek();
}
public void enterNewGState() {
Area current = stack.peek();
Area cloned = new Area();
cloned.add(current);
stack.push(cloned);
}
public void leaveGState() {
stack.pop();
}
}

View File

@ -0,0 +1,170 @@
package com.iqser.red.pdftronlogic.commons;
import static com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService.TOLERANCE;
import java.awt.geom.Rectangle2D;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.Rect;
import lombok.AccessLevel;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
@Getter
@SuperBuilder
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ElementFeatures {
int elementType;
Rectangle2D boundingBox;
public boolean almostMatches(Element element) throws PDFNetException {
return element.getType() == elementType && //
element.getBBox() != null && //
rectsAlmostMatch(element.getBBox());
}
protected boolean almostEqual(double a, double b) {
return Math.abs(a - b) < TOLERANCE;
}
@SneakyThrows
private boolean rectsAlmostMatch(Rect bBox) {
// To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance
return almostEqual(bBox.getX1(), boundingBox.getX()) && //
almostEqual(bBox.getY1(), boundingBox.getY()) && //
almostEqual(bBox.getWidth(), boundingBox.getWidth()) && //
almostEqual(bBox.getHeight(), boundingBox.getHeight());
}
@EqualsAndHashCode(callSuper = true)
@Getter
@SuperBuilder
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
private static class Text extends ElementFeatures {
String text;
int font;
double fontsize;
@Override
public boolean almostMatches(Element element) throws PDFNetException {
return super.almostMatches(element) && //
text.equals(element.getTextString()) && //
font == element.getGState().getFont().getType() && //
almostEqual(fontsize, element.getGState().getFontSize());
}
}
@EqualsAndHashCode(callSuper = true)
@Getter
@SuperBuilder
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
private static class Path extends ElementFeatures {
boolean isClippingPath;
boolean isClipWindingFill;
boolean isStroked;
boolean isFilled;
boolean isWindingFill;
@Override
public boolean almostMatches(Element element) throws PDFNetException {
return super.almostMatches(element) && //
isClippingPath == element.isClippingPath() && //
isClipWindingFill == element.isClipWindingFill() && //
isStroked == element.isStroked() && //
isFilled == element.isFilled() && //
isWindingFill == element.isWindingFill();
}
}
@EqualsAndHashCode(callSuper = true)
@Getter
@SuperBuilder
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
private static class Image extends ElementFeatures {
int dataSize;
int height;
int width;
int renderingIntent;
int componentNum;
int bitsPerComponent;
@Override
public boolean almostMatches(Element element) throws PDFNetException {
return super.almostMatches(element) && //
dataSize == element.getImageDataSize() && //
height == element.getImageHeight() && //
width == element.getImageWidth() && //
renderingIntent == element.getImageRenderingIntent() && //
componentNum == element.getComponentNum() && //
bitsPerComponent == element.getBitsPerComponent();
}
}
public static ElementFeatures extractFeatures(Element element) throws PDFNetException {
return switch (element.getType()) {
case Element.e_path -> Path.builder()
.elementType(element.getType())
.boundingBox(toRectangle2D(element.getBBox()))
.isClippingPath(element.isClippingPath())
.isClipWindingFill(element.isClipWindingFill())
.isStroked(element.isStroked())
.isFilled(element.isFilled())
.isWindingFill(element.isWindingFill())
.build();
case Element.e_text -> Text.builder()
.elementType(element.getType())
.boundingBox(toRectangle2D(element.getBBox()))
.text(element.getTextString())
.font(element.getGState().getFont().getType())
.fontsize(element.getGState().getFontSize())
.build();
case Element.e_image, Element.e_inline_image -> Image.builder()
.elementType(element.getType())
.boundingBox(toRectangle2D(element.getBBox()))
.dataSize(element.getImageDataSize())
.height(element.getImageHeight())
.width(element.getImageWidth())
.renderingIntent(element.getImageRenderingIntent())
.componentNum(element.getComponentNum())
.bitsPerComponent(element.getBitsPerComponent())
.build();
// This technically should never happen, it's a safetynet
default -> throw new RuntimeException("Feature Extraction is not supported for PDFTron.Element with type: " + element.getType());
};
}
private static Rectangle2D toRectangle2D(Rect rect) throws PDFNetException {
return new Rectangle2D.Double(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
}
}

View File

@ -0,0 +1,464 @@
package com.iqser.red.pdftronlogic.commons;
import java.awt.Shape;
import java.awt.geom.AffineTransform;
import java.awt.geom.GeneralPath;
import java.awt.geom.Rectangle2D;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import org.springframework.stereotype.Service;
import com.google.common.primitives.Bytes;
import com.google.common.primitives.Doubles;
import com.pdftron.common.Matrix2D;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.ColorPt;
import com.pdftron.pdf.ColorSpace;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementBuilder;
import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.GState;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
import com.pdftron.pdf.PathData;
import com.pdftron.pdf.Rect;
import com.pdftron.sdf.Obj;
import com.pdftron.sdf.SDFDoc;
import lombok.Builder;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
public class InvisibleElementRemovalService {
static public final double TOLERANCE = 1e-3;
/**
* Removes all hidden Text, Path and Image Elements from a PDF Document.
* handled cases:
* -Text which is transparent or is set to not render
* -Elements outside of clipping path
* -Elements that have been painted over by visible and filled Paths
* unhandled cases:
* -Elements covered by widely stroked path
* -Elements with the same color as background
* -Any Text set to clipping with its many interactions with other elements
*
* @param pdfFile The PDF file to process
* @param delta If this flag is set only the removed Elements will be written to the output file.
* The Elements are red if they are removed by clipping path, blue for transparency, and a green bounding box for overlap.
* @param out OutputStream to write the resulting file to
**/
@SneakyThrows
public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta) {
PDFDoc pdfDoc = new PDFDoc(pdfFile);
ElementWriter writer = new ElementWriter();
ElementReader reader = new ElementReader();
Set<Long> visitedXObjIds = new TreeSet<>();
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
Page page = iterator.next();
visitedXObjIds.add(page.getSDFObj().getObjNum());
InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder()
.reader(reader)
.clippingPathStack(new ClippingPathStack(page.getMediaBox()))
.delta(delta)
.overlappedElements(new ArrayList<>())
.visibleElements(new ArrayList<>())
.visitedXObjIds(visitedXObjIds)
.build();
removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context);
context.visitedXObjIds().clear();
removeOverlappedElements(page, writer, context);
}
try {
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
} catch (Exception e) {
log.error("File could not be saved after invisible element removal");
throw new RuntimeException(e);
}
writer.destroy();
reader.destroy();
pdfDoc.close();
}
private void removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(Page page,
ElementWriter writer,
InvisibleElementRemovalContext context) throws PDFNetException {
context.reader().begin(page);
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
processElements(writer, context);
writer.end();
context.reader().end();
}
private void processElements(ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
for (Element element = context.reader().next(); element != null; element = context.reader().next())
switch (element.getType()) {
case Element.e_image, Element.e_inline_image -> processImages(element, writer, context);
case Element.e_text -> processText(element, writer, context);
case Element.e_path -> processPath(element, writer, context);
case Element.e_form -> processForm(element, writer, context);
case Element.e_group_begin -> {
context.clippingPathStack().enterNewGState();
writer.writeElement(element);
}
case Element.e_group_end -> {
context.clippingPathStack().leaveGState();
writer.writeElement(element);
}
default -> writer.writeElement(element);
}
}
private void processImages(Element imageElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
Rect rect = imageElement.getBBox();
if (rect == null) {
return;
}
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
if (!context.delta() && inClippingPath) {
context.visibleElements().add(ElementFeatures.extractFeatures(imageElement));
}
if (context.delta() ^ inClippingPath) {
writer.writeElement(imageElement);
}
}
private void processText(Element textElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
Rect rect = textElement.getBBox();
if (rect == null) {
writer.writeElement(textElement);
return;
}
GState gState = textElement.getGState();
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
boolean isTextVisible = isTextRenderedVisibly(gState);
if (inClippingPath && isTextVisible) {
context.visibleElements().add(ElementFeatures.extractFeatures(textElement));
}
if (!context.delta()) {
if (inClippingPath && isTextVisible) {
writer.writeElement(textElement);
} else if (textElement.hasTextMatrix()) {
/*
PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element.
hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands.
Therefore, the position of a following Tj is affected by not writing the first Element.
This is why, we write only the Tm command:
*/
writer.writeGStateChanges(textElement);
}
} else {
if (!inClippingPath) {
gState.setFillColorSpace(ColorSpace.createDeviceRGB());
// red for elements removed by clipping path
gState.setFillColor(new ColorPt(1, 0, 0));
writer.writeElement(textElement);
}
if (!isTextVisible) {
gState.setFillColorSpace(ColorSpace.createDeviceRGB());
// blue for elements removed due to transparency or not rendered
gState.setFillColor(new ColorPt(0, 0, 1));
gState.setTextRenderMode(GState.e_fill_text);
gState.setFillOpacity(1);
writer.writeElement(textElement);
}
}
}
private void processForm(Element formElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
writer.writeElement(formElement);
Obj formObj = formElement.getXObject();
if (!context.visitedXObjIds().contains(formObj.getObjNum())) {
context.visitedXObjIds().add(formObj.getObjNum());
// writer needs to be newly initialized when entering a new content stream
// see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
ElementWriter formWriter = new ElementWriter();
context.reader().formBegin();
formWriter.begin(formObj);
context.reader().clearChangeList();
formWriter.setDefaultGState(context.reader());
processElements(formWriter, context);
formWriter.end();
formWriter.destroy();
context.reader().end();
}
}
private void processPath(Element pathElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
PathData pathData = pathElement.getPathData();
if (pathData.getOperators().length == 0 && pathData.getPoints().length == 0) {
writer.writeGStateChanges(pathElement);
return;
}
GeneralPath linePath = convertToGeneralPath(pathData);
//transform path to initial user space
var ctm = pathElement.getCTM();
var affineTransform = toAffineTransform(ctm);
linePath.transform(affineTransform);
var rect = linePath.getBounds2D();
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX(), rect.getY(), rect.getWidth(), rect.getHeight());
if (pathElement.isClippingPath()) {
if (pathElement.isClipWindingFill()) {
linePath.setWindingRule(GeneralPath.WIND_NON_ZERO);
} else {
linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD);
}
context.clippingPathStack().intersectClippingPath(linePath);
pathElement.setPathClip(!context.delta());
writer.writeElement(pathElement);
} else {
if (pathElement.isWindingFill()) {
linePath.setWindingRule(GeneralPath.WIND_NON_ZERO);
} else {
linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD);
}
if (inClippingPath) {
if (isFilledAndNonTransparent(pathElement)) {
List<ElementFeatures> currentOverlappedElements = context.visibleElements()
.stream()
.filter(features -> almostContains(linePath, features.getBoundingBox()))
.toList();
context.overlappedElements().addAll(currentOverlappedElements);
context.visibleElements().removeAll(currentOverlappedElements);
}
context.visibleElements().add(ElementFeatures.extractFeatures(pathElement));
if (!context.delta()) {
writer.writeElement(pathElement);
}
}
if (context.delta() && !inClippingPath) {
pathElement.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
pathElement.getGState().setFillColor(new ColorPt(1, 0, 0));
pathElement.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
pathElement.getGState().setStrokeColor(new ColorPt(1, 0, 0));
writer.writeElement(pathElement);
}
}
}
private void removeOverlappedElements(Page page, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
context.reader().begin(page);
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
if (context.delta()) {
// green for element removed due to overlapping
context.overlappedElements().forEach(feature -> drawBBox(writer, feature.getBoundingBox(), "#00FF00"));
context.overlappedElements().clear();
}
processOverlappedElements(writer, context);
writer.end();
context.reader().end();
if (context.overlappedElements().size() > 0) {
log.warn(context.overlappedElements().size() + " overlapped elements have not been found or removed");
}
}
private void processOverlappedElements(ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
for (Element element = context.reader().next(); element != null; element = context.reader().next()) {
switch (element.getType()) {
case Element.e_form -> processFormOverlappedElements(writer, element, context);
case Element.e_path, Element.e_image, Element.e_inline_image, Element.e_text -> {
boolean anyMatch = false;
for (ElementFeatures elementToRemove : context.overlappedElements()) {
if (elementToRemove.almostMatches(element)) {
context.overlappedElements().remove(elementToRemove);
anyMatch = true;
break;
}
}
if (!anyMatch) {
writer.writeElement(element);
} else if (element.getType() == 3 && element.hasTextMatrix()) {
/*
PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element.
hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands.
Therefore, the position of a following Tj is affected by not writing the first Element.
This is why, we write only the Tm command:
*/
writer.writeGStateChanges(element);
}
}
default -> writer.writeElement(element);
}
}
}
private void processFormOverlappedElements(ElementWriter writer, Element formElement, InvisibleElementRemovalContext context) throws PDFNetException {
writer.writeElement(formElement);
Obj formObj = formElement.getXObject();
if (!context.visitedXObjIds().contains(formObj.getObjNum())) {
context.visitedXObjIds().add(formObj.getObjNum());
// writer needs to be newly initialized when entering a new content stream
// see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
ElementWriter formWriter = new ElementWriter();
context.reader().formBegin();
formWriter.begin(formObj);
context.reader().clearChangeList();
formWriter.setDefaultGState(context.reader());
processOverlappedElements(formWriter, context);
formWriter.end();
formWriter.destroy();
context.reader().end();
}
}
private boolean isTextRenderedVisibly(GState gState) throws PDFNetException {
return gState.getTextRenderMode() != GState.e_invisible_text && //
!(gState.getTextRenderMode() == GState.e_fill_text && gState.getFillOpacity() == 0) && //
!(gState.getTextRenderMode() == GState.e_stroke_text && gState.getStrokeOpacity() == 0) && //
!(gState.getTextRenderMode() == GState.e_fill_stroke_text && gState.getFillOpacity() == 0 && gState.getStrokeOpacity() == 0);
}
private GeneralPath convertToGeneralPath(PathData pathData) throws PDFNetException {
GeneralPath linePath = new GeneralPath();
Iterator<Double> points = Doubles.asList(pathData.getPoints()).iterator();
Iterable<Byte> operators = Bytes.asList(pathData.getOperators());
for (var operator : operators) {
switch (operator) {
case PathData.e_moveto -> linePath.moveTo(points.next(), points.next());
case PathData.e_lineto -> linePath.lineTo(points.next(), points.next());
case PathData.e_cubicto -> linePath.curveTo(points.next(), points.next(), points.next(), points.next(), points.next(), points.next());
case PathData.e_closepath -> linePath.closePath();
case PathData.e_rect -> {
double x = points.next();
double y = points.next();
double w = points.next();
double h = points.next();
linePath.moveTo(x, y);
linePath.lineTo(x + w, y);
linePath.lineTo(x + w, y + h);
linePath.lineTo(x, y + h);
linePath.closePath();
}
default -> throw new PDFNetException("Invalid Element Type", 0, "", "", "");
}
}
return linePath;
}
private boolean almostContains(Shape outer, Rectangle2D inner) {
//To address inconsistencies in the calculation of the bounding box we slightly shrink the inner rectangle
double x_with_tolerance = inner.getX() >= 0 ? inner.getX() + TOLERANCE : inner.getX() - TOLERANCE;
double y_with_tolerance = inner.getY() >= 0 ? inner.getY() + TOLERANCE : inner.getY() - TOLERANCE;
double height_with_tolerance = inner.getHeight() - (2 * TOLERANCE);
double width_with_tolerance = inner.getWidth() - (2 * TOLERANCE);
Rectangle2D innerRect = new Rectangle2D.Double(x_with_tolerance, y_with_tolerance, width_with_tolerance, height_with_tolerance);
return outer.contains(innerRect);
}
private boolean isFilledAndNonTransparent(Element element) throws PDFNetException {
return element.isFilled() && element.getGState().getFillOpacity() == 1;
}
@SneakyThrows
private void drawBBox(ElementWriter writer, Rectangle2D r, String hexcolor) {
ColorPt colorPt = new ColorPt(Integer.valueOf(hexcolor.substring(1, 3), 16) / 255d,
Integer.valueOf(hexcolor.substring(3, 5), 16) / 255d,
Integer.valueOf(hexcolor.substring(5, 7), 16) / 255d);
ElementBuilder eb = new ElementBuilder();
Element rect = eb.createRect(r.getX(), r.getY(), r.getWidth(), r.getHeight());
rect.setPathStroke(true);
rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
rect.getGState().setStrokeColor(colorPt);
writer.writePlacedElement(rect);
colorPt.destroy();
eb.destroy();
}
private static AffineTransform toAffineTransform(Matrix2D ctm) throws PDFNetException {
return new AffineTransform(ctm.getA(), ctm.getB(), ctm.getC(), ctm.getD(), ctm.getH(), ctm.getV());
}
@Builder
private record InvisibleElementRemovalContext(
boolean delta,
ElementReader reader,
ClippingPathStack clippingPathStack,
List<ElementFeatures> overlappedElements,
List<ElementFeatures> visibleElements,
Set<Long> visitedXObjIds) {
}
}

View File

@ -0,0 +1,15 @@
<Configuration>
<Appenders>
<Console name="CONSOLE" target="SYSTEM_OUT">
<PatternLayout pattern="%d{HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
</Console>
</Appenders>
<Loggers>
<Root level="INFO">
<AppenderRef ref="CONSOLE"/>
</Root>
</Loggers>
</Configuration>