RED-8385: add functionality to ignore specific marked contents

This commit is contained in:
Kilian Schuettler 2024-02-01 12:07:40 +01:00
parent 073312702c
commit c29d39cc38
4 changed files with 172 additions and 14 deletions

View File

@ -8,6 +8,7 @@ import java.awt.geom.Rectangle2D;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
@ -57,11 +58,11 @@ public class InvisibleElementRemovalService {
* @param out OutputStream to write the resulting file to
**/
@SneakyThrows
public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta, boolean removePaths) {
public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta, boolean removePaths, Set<String> markedContentToIgnore) {
PDFDoc pdfDoc = new PDFDoc(pdfFile);
execute(pdfDoc, delta, removePaths);
execute(pdfDoc, delta, removePaths, markedContentToIgnore);
try {
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
@ -76,15 +77,44 @@ public class InvisibleElementRemovalService {
/**
* This method is equal to {@link #removeInvisibleElements(InputStream, OutputStream, boolean, boolean)}, with removePaths == true.
* This method is equal to {@link #removeInvisibleElements(InputStream, OutputStream, boolean, boolean, Set)}, with removePaths == true and markedContentToIgnore == emptySet().
*/
@SneakyThrows
public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta) {
removeInvisibleElements(pdfFile, out, delta, true);
removeInvisibleElements(pdfFile, out, delta, true, Collections.emptySet());
}
/**
* Removes invisible elements from the given PDF, but passes every element inside marked content tagged "KNECON_OCR" through unchanged.
* This method is equal to {@link #removeInvisibleElements(InputStream, OutputStream, boolean, boolean, Set)}, with removePaths == true and markedContentToIgnore == Set.of("KNECON_OCR").
*
* @param pdfFile InputStream of the PDF to process
* @param out     OutputStream to write the resulting file to
* @param delta   forwarded unchanged to the five-argument overload
*/
public void removeInvisibleElementsButKeepOcrText(InputStream pdfFile, OutputStream out, boolean delta) {
removeInvisibleElements(pdfFile, out, delta, true, Set.of("KNECON_OCR"));
}
/**
* This method is equal to {@link #removeInvisibleElements(InputStream, OutputStream, boolean, boolean, Set)}, with markedContentToIgnore == emptySet(),
* i.e. no marked content is exempted from the removal.
*
* @param pdfFile     InputStream of the PDF to process
* @param out         OutputStream to write the resulting file to
* @param delta       forwarded unchanged to the five-argument overload
* @param removePaths forwarded unchanged to the five-argument overload
*/
@SneakyThrows
public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta, boolean removePaths) {
removeInvisibleElements(pdfFile, out, delta, removePaths, Collections.emptySet());
}
/**
* This method is similar to {@link #removeInvisibleElements(InputStream, OutputStream, boolean, boolean, Set)}, just with a PDFDoc.
* It only runs the removal on the given document; unlike the stream-based variant, no save call is made here.
*
* @param pdfDoc                the already opened document to process
* @param delta                 forwarded unchanged to the removal
* @param removePaths           forwarded unchanged to the removal
* @param markedContentToIgnore names of marked content whose enclosed elements are written through unchanged
*/
@SneakyThrows
public void removeInvisibleElements(PDFDoc pdfDoc, boolean delta, boolean removePaths, Set<String> markedContentToIgnore) {
execute(pdfDoc, delta, removePaths, markedContentToIgnore);
}
/**
* This method is similar to {@link #removeInvisibleElements(InputStream, OutputStream, boolean, boolean)}, just with a PDFDoc.
@ -92,22 +122,22 @@ public class InvisibleElementRemovalService {
@SneakyThrows
public void removeInvisibleElements(PDFDoc pdfDoc, boolean delta, boolean removePaths) {
execute(pdfDoc, delta, removePaths);
execute(pdfDoc, delta, removePaths, Collections.emptySet());
}
/**
* This method is equal to {@link #removeInvisibleElements(PDFDoc, boolean, boolean)}, with removePaths == true.
* This method is equal to {@link #removeInvisibleElements(PDFDoc, boolean, boolean, Set)}, with removePaths == true and markedContentToIgnore == emptySet().
*/
@SneakyThrows
public void removeInvisibleElements(PDFDoc pdfDoc, boolean delta) {
execute(pdfDoc, delta, true);
execute(pdfDoc, delta, true, Collections.emptySet());
}
@SneakyThrows
private void execute(PDFDoc pdfDoc, boolean delta, boolean removePaths) {
private void execute(PDFDoc pdfDoc, boolean delta, boolean removePaths, Set<String> markedContentToIgnore) {
log.info("Start removing invisible Elements");
ElementWriter writer = new ElementWriter();
@ -123,16 +153,19 @@ public class InvisibleElementRemovalService {
InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder()
.reader(reader)
.clippingPathStack(new ClippingPathStack(page.getMediaBox()))
.markedContentStack(new MarkedContentStack())
.removePaths(removePaths)
.delta(delta)
.overlappedElements(new ArrayList<>())
.visibleElements(new ArrayList<>())
.visitedXObjIds(visitedXObjIds)
.markedContentToIgnore(markedContentToIgnore)
.build();
removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context);
context.visitedXObjIds().clear();
context.markedContentStack().clear();
removeOverlappedElements(page, writer, context);
@ -149,6 +182,7 @@ public class InvisibleElementRemovalService {
InvisibleElementRemovalContext context) throws PDFNetException {
context.reader().begin(page);
context.markedContentStack().clear();
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
processElements(writer, context);
writer.end();
@ -158,7 +192,13 @@ public class InvisibleElementRemovalService {
private void processElements(ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
for (Element element = context.reader().next(); element != null; element = context.reader().next())
for (Element element = context.reader().next(); element != null; element = context.reader().next()) {
if (context.markedContentStack().currentMarkedContentContainsAny(context.markedContentToIgnore()) && element.getType() != Element.e_marked_content_end) {
writer.writeElement(element);
continue;
}
switch (element.getType()) {
case Element.e_image, Element.e_inline_image -> processImages(element, writer, context);
case Element.e_text -> processText(element, writer, context);
@ -172,8 +212,17 @@ public class InvisibleElementRemovalService {
context.clippingPathStack().leaveGState();
writer.writeElement(element);
}
case Element.e_marked_content_begin -> {
context.markedContentStack().enterMarkedContent(element.getMCTag().getName());
writer.writeElement(element);
}
case Element.e_marked_content_end -> {
context.markedContentStack().leaveMarkedContent();
writer.writeElement(element);
}
default -> writer.writeElement(element);
}
}
}
@ -330,10 +379,7 @@ public class InvisibleElementRemovalService {
private void calculateOverlapsForLinePath(InvisibleElementRemovalContext context, GeneralPath linePath) {
List<ElementFeatures> currentOverlappedElements = context.visibleElements()
.stream()
.filter(features -> almostContains(linePath, features.getBoundingBox()))
.toList();
List<ElementFeatures> currentOverlappedElements = context.visibleElements().stream().filter(features -> almostContains(linePath, features.getBoundingBox())).toList();
context.overlappedElements().addAll(currentOverlappedElements);
context.visibleElements().removeAll(currentOverlappedElements);
}
@ -361,6 +407,12 @@ public class InvisibleElementRemovalService {
private void processOverlappedElements(ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
for (Element element = context.reader().next(); element != null; element = context.reader().next()) {
if (context.markedContentStack().currentMarkedContentContainsAny(context.markedContentToIgnore()) && element.getType() != Element.e_marked_content_end) {
writer.writeElement(element);
continue;
}
switch (element.getType()) {
case Element.e_form -> processFormOverlappedElements(writer, element, context);
case Element.e_image, Element.e_inline_image, Element.e_text -> removeOverlappedElement(writer, context, element);
@ -371,6 +423,14 @@ public class InvisibleElementRemovalService {
writer.writeElement(element);
}
}
case Element.e_marked_content_begin -> {
context.markedContentStack().enterMarkedContent(element.getMCTag().getName());
writer.writeElement(element);
}
case Element.e_marked_content_end -> {
context.markedContentStack().leaveMarkedContent();
writer.writeElement(element);
}
default -> writer.writeElement(element);
}
}
@ -532,9 +592,11 @@ public class InvisibleElementRemovalService {
boolean delta,
ElementReader reader,
ClippingPathStack clippingPathStack,
MarkedContentStack markedContentStack,
List<ElementFeatures> overlappedElements,
List<ElementFeatures> visibleElements,
Set<Long> visitedXObjIds) {
Set<Long> visitedXObjIds,
Set<String> markedContentToIgnore) {
}

View File

@ -0,0 +1,73 @@
package com.iqser.red.pdftronlogic.commons;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Set;
/**
 * Keeps track of the marked-content sections that are currently open while the elements of a page are traversed.
 * <p>
 * Sections are pushed when a marked-content begin element is seen and popped on the matching end element, so the
 * innermost open section is always on top. The class is deliberately tolerant of malformed input: an end element
 * without a matching begin and tags without a name never cause an exception.
 */
public class MarkedContentStack {

    /** Open marked-content sections; the innermost section sits at the head of the deque. */
    private final Deque<MarkedContent> markedContentStack = new ArrayDeque<>();


    /**
     * Registers a newly opened marked-content section.
     *
     * @param name tag name of the section; may be {@code null}, in which case the section never matches any lookup
     */
    public void enterMarkedContent(String name) {

        markedContentStack.push(new MarkedContent(name));
    }


    /**
     * Closes the innermost open section. Does nothing when no section is open, so an unbalanced end element in a
     * damaged content stream does not abort processing.
     */
    public void leaveMarkedContent() {

        // pollFirst() instead of pop(): pop() throws NoSuchElementException on an empty deque.
        markedContentStack.pollFirst();
    }


    /**
     * @return the name of the innermost open section, or the empty string when no section is open
     */
    public String currentMarkedContent() {

        MarkedContent innermost = markedContentStack.peek();
        return innermost == null ? "" : innermost.name();
    }


    /**
     * Checks whether any open section (innermost or enclosing) carries the given name.
     *
     * @param name tag name to look for; {@code null} never matches
     * @return {@code true} if at least one open section has exactly this name
     */
    public boolean currentMarkedContentContains(String name) {

        if (name == null) {
            return false;
        }
        for (MarkedContent open : markedContentStack) {
            // name.equals(...) rather than open.name().equals(...): a stored name may be null.
            if (name.equals(open.name())) {
                return true;
            }
        }
        return false;
    }


    /**
     * Checks whether any open section (innermost or enclosing) carries one of the given names.
     *
     * @param names tag names to look for; {@code null} or an empty set never matches
     * @return {@code true} if at least one open section has a name contained in {@code names}
     */
    public boolean currentMarkedContentContainsAny(Set<String> names) {

        // This is evaluated once per page element, so bail out early in the common "nothing to ignore" case.
        if (names == null || names.isEmpty() || markedContentStack.isEmpty()) {
            return false;
        }
        for (MarkedContent open : markedContentStack) {
            String tag = open.name();
            // Immutable sets such as Set.of(...) throw a NullPointerException on contains(null).
            if (tag != null && names.contains(tag)) {
                return true;
            }
        }
        return false;
    }


    /**
     * Forgets all open sections, e.g. before a page is traversed again.
     */
    public void clear() {

        markedContentStack.clear();
    }


    /** A single open marked-content section, identified by its tag name. */
    private record MarkedContent(String name) {

    }

}

View File

@ -152,4 +152,27 @@ class InvisibleElementRemovalServiceTest {
}
@Test
@SneakyThrows
void removeInvisibleElementsButKeepOCRText() {

    // Input fixture and the two temporary output locations (regular run and delta run).
    String sourcePdf = "files/singlePageWithOcrText.pdf";
    String cleanedPdf = OsUtils.createTmpFileName(sourcePdf, "INVISIBLE_REMOVAL");
    String deltaPdf = OsUtils.createTmpFileName(sourcePdf, "INVISIBLE_REMOVAL_DELTA");
    var resourceLoader = getClass().getClassLoader();

    // Regular run: delta == false.
    try (var source = resourceLoader.getResourceAsStream(sourcePdf); var target = new FileOutputStream(cleanedPdf)) {
        invisibleElementRemovalService.removeInvisibleElementsButKeepOcrText(source, target, false);
    }

    // Delta run: delta == true. Its output is only written, not inspected below.
    try (var source = resourceLoader.getResourceAsStream(sourcePdf); var target = new FileOutputStream(deltaPdf)) {
        invisibleElementRemovalService.removeInvisibleElementsButKeepOcrText(source, target, true);
    }

    // The text extracted from the regular result must still contain these snippets.
    try (var cleaned = new FileInputStream(cleanedPdf)) {
        String extractedText = PdfTextExtraction.extractAllTextFromDocument(cleaned);
        assertThat(extractedText).contains("TABLE 17:", "Intergroup comparison oftotal litter", "TABLE 20:");
    }
}
}

Binary file not shown.