Merge branch 'RED-8385' into 'master'
RED-8385: add functionality to ignore specific marked contents Closes RED-8385 See merge request redactmanager/commons/pdftron-logic-commons!21
This commit is contained in:
commit
72b4e98538
@ -8,6 +8,7 @@ import java.awt.geom.Rectangle2D;
|
|||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.OutputStream;
|
import java.io.OutputStream;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.TreeSet;
|
import java.util.TreeSet;
|
||||||
@ -57,11 +58,11 @@ public class InvisibleElementRemovalService {
|
|||||||
* @param out OutputStream to write the resulting file to
|
* @param out OutputStream to write the resulting file to
|
||||||
**/
|
**/
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta, boolean removePaths) {
|
public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta, boolean removePaths, Set<String> markedContentToIgnore) {
|
||||||
|
|
||||||
PDFDoc pdfDoc = new PDFDoc(pdfFile);
|
PDFDoc pdfDoc = new PDFDoc(pdfFile);
|
||||||
|
|
||||||
execute(pdfDoc, delta, removePaths);
|
execute(pdfDoc, delta, removePaths, markedContentToIgnore);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
|
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
|
||||||
@ -76,15 +77,44 @@ public class InvisibleElementRemovalService {
|
|||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This method is equal to {@link #removeInvisibleElements(InputStream, OutputStream, boolean, boolean)}, with removePaths == true.
|
* This method is equal to {@link #removeInvisibleElements(InputStream, OutputStream, boolean, boolean, Set)}, with removePaths == true and markedContentToIgnore == emptySet().
|
||||||
*/
|
*/
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta) {
|
public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta) {
|
||||||
|
|
||||||
removeInvisibleElements(pdfFile, out, delta, true);
|
removeInvisibleElements(pdfFile, out, delta, true, Collections.emptySet());
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This method is equal to {@link #removeInvisibleElements(InputStream, OutputStream, boolean, boolean, Set)}, with removePaths == true and markedContentToIgnore == Set.of("KNECON_OCR").
|
||||||
|
*/
|
||||||
|
public void removeInvisibleElementsButKeepOcrText(InputStream pdfFile, OutputStream out, boolean delta) {
|
||||||
|
|
||||||
|
removeInvisibleElements(pdfFile, out, delta, true, Set.of("KNECON_OCR"));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This method is equal to {@link #removeInvisibleElements(InputStream, OutputStream, boolean, boolean, Set)}, with markedContentToIgnore == emptySet().
|
||||||
|
*/
|
||||||
|
@SneakyThrows
|
||||||
|
public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta, boolean removePaths) {
|
||||||
|
|
||||||
|
removeInvisibleElements(pdfFile, out, delta, removePaths, Collections.emptySet());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This method is similar to {@link #removeInvisibleElements(InputStream, OutputStream, boolean, boolean, Set)}, just with a PDFDoc.
|
||||||
|
*/
|
||||||
|
@SneakyThrows
|
||||||
|
public void removeInvisibleElements(PDFDoc pdfDoc, boolean delta, boolean removePaths, Set<String> markedContentToIgnore) {
|
||||||
|
|
||||||
|
execute(pdfDoc, delta, removePaths, markedContentToIgnore);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This method is similar to {@link #removeInvisibleElements(InputStream, OutputStream, boolean, boolean)}, just with a PDFDoc.
|
* This method is similar to {@link #removeInvisibleElements(InputStream, OutputStream, boolean, boolean)}, just with a PDFDoc.
|
||||||
@ -92,22 +122,22 @@ public class InvisibleElementRemovalService {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void removeInvisibleElements(PDFDoc pdfDoc, boolean delta, boolean removePaths) {
|
public void removeInvisibleElements(PDFDoc pdfDoc, boolean delta, boolean removePaths) {
|
||||||
|
|
||||||
execute(pdfDoc, delta, removePaths);
|
execute(pdfDoc, delta, removePaths, Collections.emptySet());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This method is equal to {@link #removeInvisibleElements(PDFDoc, boolean, boolean)}, with removePaths == true.
|
* This method is equal to {@link #removeInvisibleElements(PDFDoc, boolean, boolean, Set)}, with removePaths == true and markedContentToIgnore == emptySet().
|
||||||
*/
|
*/
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void removeInvisibleElements(PDFDoc pdfDoc, boolean delta) {
|
public void removeInvisibleElements(PDFDoc pdfDoc, boolean delta) {
|
||||||
|
|
||||||
execute(pdfDoc, delta, true);
|
execute(pdfDoc, delta, true, Collections.emptySet());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private void execute(PDFDoc pdfDoc, boolean delta, boolean removePaths) {
|
private void execute(PDFDoc pdfDoc, boolean delta, boolean removePaths, Set<String> markedContentToIgnore) {
|
||||||
|
|
||||||
log.info("Start removing invisible Elements");
|
log.info("Start removing invisible Elements");
|
||||||
ElementWriter writer = new ElementWriter();
|
ElementWriter writer = new ElementWriter();
|
||||||
@ -123,16 +153,19 @@ public class InvisibleElementRemovalService {
|
|||||||
InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder()
|
InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder()
|
||||||
.reader(reader)
|
.reader(reader)
|
||||||
.clippingPathStack(new ClippingPathStack(page.getMediaBox()))
|
.clippingPathStack(new ClippingPathStack(page.getMediaBox()))
|
||||||
|
.markedContentStack(new MarkedContentStack())
|
||||||
.removePaths(removePaths)
|
.removePaths(removePaths)
|
||||||
.delta(delta)
|
.delta(delta)
|
||||||
.overlappedElements(new ArrayList<>())
|
.overlappedElements(new ArrayList<>())
|
||||||
.visibleElements(new ArrayList<>())
|
.visibleElements(new ArrayList<>())
|
||||||
.visitedXObjIds(visitedXObjIds)
|
.visitedXObjIds(visitedXObjIds)
|
||||||
|
.markedContentToIgnore(markedContentToIgnore)
|
||||||
.build();
|
.build();
|
||||||
|
|
||||||
removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context);
|
removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context);
|
||||||
|
|
||||||
context.visitedXObjIds().clear();
|
context.visitedXObjIds().clear();
|
||||||
|
context.markedContentStack().clear();
|
||||||
|
|
||||||
removeOverlappedElements(page, writer, context);
|
removeOverlappedElements(page, writer, context);
|
||||||
|
|
||||||
@ -149,6 +182,7 @@ public class InvisibleElementRemovalService {
|
|||||||
InvisibleElementRemovalContext context) throws PDFNetException {
|
InvisibleElementRemovalContext context) throws PDFNetException {
|
||||||
|
|
||||||
context.reader().begin(page);
|
context.reader().begin(page);
|
||||||
|
context.markedContentStack().clear();
|
||||||
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
|
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
|
||||||
processElements(writer, context);
|
processElements(writer, context);
|
||||||
writer.end();
|
writer.end();
|
||||||
@ -158,7 +192,13 @@ public class InvisibleElementRemovalService {
|
|||||||
|
|
||||||
private void processElements(ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
private void processElements(ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
||||||
|
|
||||||
for (Element element = context.reader().next(); element != null; element = context.reader().next())
|
for (Element element = context.reader().next(); element != null; element = context.reader().next()) {
|
||||||
|
|
||||||
|
if (context.markedContentStack().currentMarkedContentContainsAny(context.markedContentToIgnore()) && element.getType() != Element.e_marked_content_end) {
|
||||||
|
writer.writeElement(element);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
switch (element.getType()) {
|
switch (element.getType()) {
|
||||||
case Element.e_image, Element.e_inline_image -> processImages(element, writer, context);
|
case Element.e_image, Element.e_inline_image -> processImages(element, writer, context);
|
||||||
case Element.e_text -> processText(element, writer, context);
|
case Element.e_text -> processText(element, writer, context);
|
||||||
@ -172,8 +212,17 @@ public class InvisibleElementRemovalService {
|
|||||||
context.clippingPathStack().leaveGState();
|
context.clippingPathStack().leaveGState();
|
||||||
writer.writeElement(element);
|
writer.writeElement(element);
|
||||||
}
|
}
|
||||||
|
case Element.e_marked_content_begin -> {
|
||||||
|
context.markedContentStack().enterMarkedContent(element.getMCTag().getName());
|
||||||
|
writer.writeElement(element);
|
||||||
|
}
|
||||||
|
case Element.e_marked_content_end -> {
|
||||||
|
context.markedContentStack().leaveMarkedContent();
|
||||||
|
writer.writeElement(element);
|
||||||
|
}
|
||||||
default -> writer.writeElement(element);
|
default -> writer.writeElement(element);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -330,10 +379,7 @@ public class InvisibleElementRemovalService {
|
|||||||
|
|
||||||
private void calculateOverlapsForLinePath(InvisibleElementRemovalContext context, GeneralPath linePath) {
|
private void calculateOverlapsForLinePath(InvisibleElementRemovalContext context, GeneralPath linePath) {
|
||||||
|
|
||||||
List<ElementFeatures> currentOverlappedElements = context.visibleElements()
|
List<ElementFeatures> currentOverlappedElements = context.visibleElements().stream().filter(features -> almostContains(linePath, features.getBoundingBox())).toList();
|
||||||
.stream()
|
|
||||||
.filter(features -> almostContains(linePath, features.getBoundingBox()))
|
|
||||||
.toList();
|
|
||||||
context.overlappedElements().addAll(currentOverlappedElements);
|
context.overlappedElements().addAll(currentOverlappedElements);
|
||||||
context.visibleElements().removeAll(currentOverlappedElements);
|
context.visibleElements().removeAll(currentOverlappedElements);
|
||||||
}
|
}
|
||||||
@ -361,6 +407,12 @@ public class InvisibleElementRemovalService {
|
|||||||
private void processOverlappedElements(ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
private void processOverlappedElements(ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
||||||
|
|
||||||
for (Element element = context.reader().next(); element != null; element = context.reader().next()) {
|
for (Element element = context.reader().next(); element != null; element = context.reader().next()) {
|
||||||
|
|
||||||
|
if (context.markedContentStack().currentMarkedContentContainsAny(context.markedContentToIgnore()) && element.getType() != Element.e_marked_content_end) {
|
||||||
|
writer.writeElement(element);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
switch (element.getType()) {
|
switch (element.getType()) {
|
||||||
case Element.e_form -> processFormOverlappedElements(writer, element, context);
|
case Element.e_form -> processFormOverlappedElements(writer, element, context);
|
||||||
case Element.e_image, Element.e_inline_image, Element.e_text -> removeOverlappedElement(writer, context, element);
|
case Element.e_image, Element.e_inline_image, Element.e_text -> removeOverlappedElement(writer, context, element);
|
||||||
@ -371,6 +423,14 @@ public class InvisibleElementRemovalService {
|
|||||||
writer.writeElement(element);
|
writer.writeElement(element);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
case Element.e_marked_content_begin -> {
|
||||||
|
context.markedContentStack().enterMarkedContent(element.getMCTag().getName());
|
||||||
|
writer.writeElement(element);
|
||||||
|
}
|
||||||
|
case Element.e_marked_content_end -> {
|
||||||
|
context.markedContentStack().leaveMarkedContent();
|
||||||
|
writer.writeElement(element);
|
||||||
|
}
|
||||||
default -> writer.writeElement(element);
|
default -> writer.writeElement(element);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -532,9 +592,11 @@ public class InvisibleElementRemovalService {
|
|||||||
boolean delta,
|
boolean delta,
|
||||||
ElementReader reader,
|
ElementReader reader,
|
||||||
ClippingPathStack clippingPathStack,
|
ClippingPathStack clippingPathStack,
|
||||||
|
MarkedContentStack markedContentStack,
|
||||||
List<ElementFeatures> overlappedElements,
|
List<ElementFeatures> overlappedElements,
|
||||||
List<ElementFeatures> visibleElements,
|
List<ElementFeatures> visibleElements,
|
||||||
Set<Long> visitedXObjIds) {
|
Set<Long> visitedXObjIds,
|
||||||
|
Set<String> markedContentToIgnore) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -0,0 +1,73 @@
|
|||||||
|
package com.iqser.red.pdftronlogic.commons;
|
||||||
|
|
||||||
|
import java.util.Deque;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
/**
 * Keeps track of the marked-content sections that are currently open while the elements of a
 * content stream are visited one after another.
 *
 * <p>Every marked-content begin element is expected to be reported via
 * {@link #enterMarkedContent(String)} and every marked-content end element via
 * {@link #leaveMarkedContent()}. The queries then answer whether the element currently being
 * visited lies inside a section with a given tag name, at any nesting depth.
 *
 * <p>The class holds plain mutable state and performs no synchronization.
 */
public class MarkedContentStack {

    /** Open sections, innermost first (the head of the deque is the top of the stack). */
    private final Deque<MarkedContent> markedContentStack = new LinkedList<>();


    /**
     * Registers a newly opened marked-content section as the innermost one.
     *
     * @param name tag name of the section; stored as given (a {@code null} name never matches any query)
     */
    public void enterMarkedContent(String name) {

        markedContentStack.push(new MarkedContent(name));
    }


    /**
     * Closes the innermost open section.
     *
     * <p>Calling this with no open section is a no-op instead of an error: a content stream may
     * contain more end markers than begin markers were registered here, and such a stream must not
     * abort the whole traversal with a {@link java.util.NoSuchElementException}.
     */
    public void leaveMarkedContent() {

        // pollFirst() removes the same element pop() would, but returns null on an empty deque instead of throwing.
        markedContentStack.pollFirst();
    }


    /**
     * Returns the tag name of the innermost open section.
     *
     * @return the innermost tag name, or the empty string if no section is open
     */
    public String currentMarkedContent() {

        MarkedContent innermost = markedContentStack.peek();
        if (innermost == null) {
            return "";
        }
        return innermost.name();
    }


    /**
     * Checks whether any currently open section, at any nesting depth, carries the given tag name.
     *
     * @param name tag name to look for; {@code null} never matches
     * @return {@code true} if at least one open section has exactly this name
     */
    public boolean currentMarkedContentContains(String name) {

        if (name == null) {
            return false;
        }
        for (MarkedContent markedContent : markedContentStack) {
            if (name.equals(markedContent.name())) {
                return true;
            }
        }
        return false;
    }


    /**
     * Checks whether any currently open section, at any nesting depth, carries one of the given tag names.
     *
     * @param names tag names to look for; {@code null} is treated like an empty set
     * @return {@code true} if at least one open section has a name contained in {@code names}
     */
    public boolean currentMarkedContentContainsAny(Set<String> names) {

        if (names == null || names.isEmpty() || markedContentStack.isEmpty()) {
            return false;
        }
        for (MarkedContent markedContent : markedContentStack) {
            String openName = markedContent.name();
            // Null-hostile sets such as Set.of(...) throw on contains(null), so null names are skipped up front.
            if (openName != null && names.contains(openName)) {
                return true;
            }
        }
        return false;
    }


    /** Forgets all open sections, e.g. before a new content stream is traversed. */
    public void clear() {

        markedContentStack.clear();
    }


    /** A single open marked-content section, identified by its tag name. */
    private record MarkedContent(String name) {

    }

}
|
||||||
@ -152,4 +152,27 @@ class InvisibleElementRemovalServiceTest {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@SneakyThrows
|
||||||
|
void removeInvisibleElementsButKeepOCRText() {
|
||||||
|
|
||||||
|
String fileName = "files/singlePageWithOcrText.pdf";
|
||||||
|
String resultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL");
|
||||||
|
String deltaResultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL_DELTA");
|
||||||
|
|
||||||
|
try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(resultFileName)) {
|
||||||
|
invisibleElementRemovalService.removeInvisibleElementsButKeepOcrText(in, out, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(deltaResultFileName)) {
|
||||||
|
invisibleElementRemovalService.removeInvisibleElementsButKeepOcrText(in, out, true);
|
||||||
|
}
|
||||||
|
try (var in = new FileInputStream(resultFileName)) {
|
||||||
|
String result = PdfTextExtraction.extractAllTextFromDocument(in);
|
||||||
|
assertThat(result).contains("TABLE 17:", "Intergroup comparison oftotal litter", "TABLE 20:");
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
BIN
src/test/resources/files/singlePageWithOcrText.pdf
Normal file
BIN
src/test/resources/files/singlePageWithOcrText.pdf
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user