RED-8212: fix tables for ocred documents

This commit is contained in:
Kilian Schüttler 2024-01-23 10:50:09 +01:00
parent 2caa3e92a4
commit 6b6417ed80

View File

@ -8,13 +8,9 @@ import java.awt.geom.Rectangle2D;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.atomic.AtomicReference;
import java.util.stream.Collectors;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.ColorPt;
@ -55,16 +51,17 @@ public class InvisibleElementRemovalService {
* -Any Text set to clipping with its many interactions with other elements
*
* @param pdfFile The PDF file to process
* @param removePaths If this flag is set, invisible path elements will be removed
* @param delta If this flag is set only the removed Elements will be written to the output file.
* The Elements are red if they are removed by clipping path, blue for transparency, and a green bounding box for overlap.
* @param out OutputStream to write the resulting file to
**/
@SneakyThrows
public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta) {
public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta, boolean removePaths) {
PDFDoc pdfDoc = new PDFDoc(pdfFile);
execute(pdfDoc, delta);
execute(pdfDoc, delta, removePaths);
try {
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
@ -79,17 +76,36 @@ public class InvisibleElementRemovalService {
/**
* This method is similar to {@link #removeInvisibleElements(InputStream, OutputStream, boolean)}, just with a PDFDoc.
* This method is equal to {@link #removeInvisibleElements(InputStream, OutputStream, boolean, boolean)}, with removePaths == true.
*/
@SneakyThrows
public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta) {
// Convenience overload: delegates to the four-argument variant with removePaths fixed to true.
removeInvisibleElements(pdfFile, out, delta, true);
}
/**
* This method is similar to {@link #removeInvisibleElements(InputStream, OutputStream, boolean, boolean)}, just with a PDFDoc.
* This overload only delegates to {@code execute}; it performs no save call itself.
*
* NOTE: the flag order of this overload is (removePaths, delta), i.e. reversed compared to the stream-based
* overload, which takes (delta, removePaths). Both flags are booleans, so a swapped call compiles without warning.
*
* @param pdfDoc The PDF document to process
* @param removePaths If this flag is set, invisible path elements will be removed
* @param delta Delta mode flag, see the stream-based overload for its meaning
*/
@SneakyThrows
public void removeInvisibleElements(PDFDoc pdfDoc, boolean removePaths, boolean delta) {
// Arguments are re-ordered here: execute expects (pdfDoc, delta, removePaths).
execute(pdfDoc, delta, removePaths);
}
/**
* This method is equal to {@link #removeInvisibleElements(PDFDoc, boolean, boolean)}, with removePaths == true.
*/
@SneakyThrows
public void removeInvisibleElements(PDFDoc pdfDoc, boolean delta) {
execute(pdfDoc, delta);
execute(pdfDoc, delta, true);
}
@SneakyThrows
private void execute(PDFDoc pdfDoc, boolean delta) {
private void execute(PDFDoc pdfDoc, boolean delta, boolean removePaths) {
log.info("Start removing invisible Elements");
ElementWriter writer = new ElementWriter();
@ -105,6 +121,7 @@ public class InvisibleElementRemovalService {
InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder()
.reader(reader)
.clippingPathStack(new ClippingPathStack(page.getMediaBox()))
.removePaths(removePaths)
.delta(delta)
.overlappedElements(new ArrayList<>())
.visibleElements(new ArrayList<>())
@ -297,11 +314,11 @@ public class InvisibleElementRemovalService {
context.visibleElements().removeAll(currentOverlappedElements);
}
context.visibleElements().add(ElementFeatureFactory.extractFeatures(pathElement));
if (!context.delta()) {
if (!context.delta() || !context.removePaths()) {
writer.writeElement(pathElement);
}
}
if (context.delta() && !inClippingPath) {
if (context.delta() && !inClippingPath && context.removePaths()) {
pathElement.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
pathElement.getGState().setFillColor(new ColorPt(1, 0, 0));
pathElement.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
@ -336,25 +353,12 @@ public class InvisibleElementRemovalService {
for (Element element = context.reader().next(); element != null; element = context.reader().next()) {
switch (element.getType()) {
case Element.e_form -> processFormOverlappedElements(writer, element, context);
case Element.e_path, Element.e_image, Element.e_inline_image, Element.e_text -> {
boolean anyMatch = false;
for (ElementFeatures elementToRemove : context.overlappedElements()) {
if (elementToRemove.almostMatches(element)) {
context.overlappedElements().remove(elementToRemove);
anyMatch = true;
break;
}
}
if (!anyMatch) {
case Element.e_image, Element.e_inline_image, Element.e_text -> removeOverlappedElement(writer, context, element);
case Element.e_path -> {
if (context.removePaths()) {
removeOverlappedElement(writer, context, element);
} else {
writer.writeElement(element);
} else if (element.getType() == 3 && element.hasTextMatrix()) {
/*
PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element.
hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands.
Therefore, the position of a following Tj is affected by not writing the first Element.
This is why, we write only the Tm command:
*/
writer.writeGStateChanges(element);
}
}
default -> writer.writeElement(element);
@ -363,6 +367,30 @@ public class InvisibleElementRemovalService {
}
/**
 * Writes the given element to the writer unless it matches an entry of {@code context.overlappedElements()}.
 * <p>
 * The first entry that {@code almostMatches} the element is removed from the list, so every recorded entry
 * suppresses at most one element. A suppressed text element that carries a text matrix still gets its
 * graphics state changes written (see the comment in the body for the reason).
 *
 * @param writer  the writer receiving the elements that are kept
 * @param context the processing context providing the list of overlapped elements
 * @param element the element read from the page that is checked against the overlapped elements
 * @throws PDFNetException declared because the called PDFTron methods may throw it
 */
private static void removeOverlappedElement(ElementWriter writer, InvisibleElementRemovalContext context, Element element) throws PDFNetException {

    boolean anyMatch = false;
    for (ElementFeatures elementToRemove : context.overlappedElements()) {
        if (elementToRemove.almostMatches(element)) {
            // Removing while iterating is only acceptable because the loop is left right away:
            // the iterator is never used again after this structural modification.
            context.overlappedElements().remove(elementToRemove);
            anyMatch = true;
            break;
        }
    }

    if (!anyMatch) {
        writer.writeElement(element);
    } else if (element.getType() == Element.e_text && element.hasTextMatrix()) {
        /*
        A PDFTron Element of type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element.
        hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands.
        Therefore, the position of a following Tj would be affected by not writing the first Element.
        This is why we write only the Tm command:
        */
        writer.writeGStateChanges(element);
    }
}
private void processFormOverlappedElements(ElementWriter writer, Element formElement, InvisibleElementRemovalContext context) throws PDFNetException {
writer.writeElement(formElement);
@ -490,6 +518,7 @@ public class InvisibleElementRemovalService {
@Builder
private record InvisibleElementRemovalContext(
boolean removePaths,
boolean delta,
ElementReader reader,
ClippingPathStack clippingPathStack,