RED-8385: add functionality to ignore specific marked contents
This commit is contained in:
parent
073312702c
commit
c29d39cc38
@ -8,6 +8,7 @@ import java.awt.geom.Rectangle2D;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
@ -57,11 +58,11 @@ public class InvisibleElementRemovalService {
|
||||
* @param out OutputStream to write the resulting file to
|
||||
**/
|
||||
@SneakyThrows
|
||||
public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta, boolean removePaths) {
|
||||
public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta, boolean removePaths, Set<String> markedContentToIgnore) {
|
||||
|
||||
PDFDoc pdfDoc = new PDFDoc(pdfFile);
|
||||
|
||||
execute(pdfDoc, delta, removePaths);
|
||||
execute(pdfDoc, delta, removePaths, markedContentToIgnore);
|
||||
|
||||
try {
|
||||
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
|
||||
@ -76,15 +77,44 @@ public class InvisibleElementRemovalService {
|
||||
|
||||
|
||||
/**
|
||||
* This method is equal to {@link #removeInvisibleElements(InputStream, OutputStream, boolean, boolean)}, with removePaths == true.
|
||||
* This method is equal to {@link #removeInvisibleElements(InputStream, OutputStream, boolean, boolean, Set)}, with removePaths == true and markedContentToIgnore == emptySet().
|
||||
*/
|
||||
@SneakyThrows
|
||||
public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta) {
|
||||
|
||||
removeInvisibleElements(pdfFile, out, delta, true);
|
||||
removeInvisibleElements(pdfFile, out, delta, true, Collections.emptySet());
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* This method is equal to {@link #removeInvisibleElements(InputStream, OutputStream, boolean, boolean, Set)}, with removePaths == true and markedContentsToIgnore = Set.of("KNECON_OCR").
|
||||
*/
|
||||
public void removeInvisibleElementsButKeepOcrText(InputStream pdfFile, OutputStream out, boolean delta) {
|
||||
|
||||
removeInvisibleElements(pdfFile, out, delta, true, Set.of("KNECON_OCR"));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* This method is equal to {@link #removeInvisibleElements(InputStream, OutputStream, boolean, boolean, Set)}, with markedContentsToIgnore == emptySet().
|
||||
*/
|
||||
@SneakyThrows
|
||||
public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta, boolean removePaths) {
|
||||
|
||||
removeInvisibleElements(pdfFile, out, delta, removePaths, Collections.emptySet());
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* This method is similar to {@link #removeInvisibleElements(InputStream, OutputStream, boolean, boolean, Set)}, just with a PDFDoc.
|
||||
*/
|
||||
@SneakyThrows
|
||||
public void removeInvisibleElements(PDFDoc pdfDoc, boolean delta, boolean removePaths, Set<String> markedContentToIgnore) {
|
||||
|
||||
execute(pdfDoc, delta, removePaths, markedContentToIgnore);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* This method is similar to {@link #removeInvisibleElements(InputStream, OutputStream, boolean, boolean)}, just with a PDFDoc.
|
||||
@ -92,22 +122,22 @@ public class InvisibleElementRemovalService {
|
||||
@SneakyThrows
|
||||
public void removeInvisibleElements(PDFDoc pdfDoc, boolean delta, boolean removePaths) {
|
||||
|
||||
execute(pdfDoc, delta, removePaths);
|
||||
execute(pdfDoc, delta, removePaths, Collections.emptySet());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* This method is equal to {@link #removeInvisibleElements(PDFDoc, boolean, boolean)}, with removePaths == true.
|
||||
* This method is equal to {@link #removeInvisibleElements(PDFDoc, boolean, boolean, Set)}, with removePaths == true and markedContentToIgnore == emptySet().
|
||||
*/
|
||||
@SneakyThrows
|
||||
public void removeInvisibleElements(PDFDoc pdfDoc, boolean delta) {
|
||||
|
||||
execute(pdfDoc, delta, true);
|
||||
execute(pdfDoc, delta, true, Collections.emptySet());
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void execute(PDFDoc pdfDoc, boolean delta, boolean removePaths) {
|
||||
private void execute(PDFDoc pdfDoc, boolean delta, boolean removePaths, Set<String> markedContentToIgnore) {
|
||||
|
||||
log.info("Start removing invisible Elements");
|
||||
ElementWriter writer = new ElementWriter();
|
||||
@ -123,16 +153,19 @@ public class InvisibleElementRemovalService {
|
||||
InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder()
|
||||
.reader(reader)
|
||||
.clippingPathStack(new ClippingPathStack(page.getMediaBox()))
|
||||
.markedContentStack(new MarkedContentStack())
|
||||
.removePaths(removePaths)
|
||||
.delta(delta)
|
||||
.overlappedElements(new ArrayList<>())
|
||||
.visibleElements(new ArrayList<>())
|
||||
.visitedXObjIds(visitedXObjIds)
|
||||
.markedContentToIgnore(markedContentToIgnore)
|
||||
.build();
|
||||
|
||||
removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context);
|
||||
|
||||
context.visitedXObjIds().clear();
|
||||
context.markedContentStack().clear();
|
||||
|
||||
removeOverlappedElements(page, writer, context);
|
||||
|
||||
@ -149,6 +182,7 @@ public class InvisibleElementRemovalService {
|
||||
InvisibleElementRemovalContext context) throws PDFNetException {
|
||||
|
||||
context.reader().begin(page);
|
||||
context.markedContentStack().clear();
|
||||
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
|
||||
processElements(writer, context);
|
||||
writer.end();
|
||||
@ -158,7 +192,13 @@ public class InvisibleElementRemovalService {
|
||||
|
||||
private void processElements(ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
||||
|
||||
for (Element element = context.reader().next(); element != null; element = context.reader().next())
|
||||
for (Element element = context.reader().next(); element != null; element = context.reader().next()) {
|
||||
|
||||
if (context.markedContentStack().currentMarkedContentContainsAny(context.markedContentToIgnore()) && element.getType() != Element.e_marked_content_end) {
|
||||
writer.writeElement(element);
|
||||
continue;
|
||||
}
|
||||
|
||||
switch (element.getType()) {
|
||||
case Element.e_image, Element.e_inline_image -> processImages(element, writer, context);
|
||||
case Element.e_text -> processText(element, writer, context);
|
||||
@ -172,8 +212,17 @@ public class InvisibleElementRemovalService {
|
||||
context.clippingPathStack().leaveGState();
|
||||
writer.writeElement(element);
|
||||
}
|
||||
case Element.e_marked_content_begin -> {
|
||||
context.markedContentStack().enterMarkedContent(element.getMCTag().getName());
|
||||
writer.writeElement(element);
|
||||
}
|
||||
case Element.e_marked_content_end -> {
|
||||
context.markedContentStack().leaveMarkedContent();
|
||||
writer.writeElement(element);
|
||||
}
|
||||
default -> writer.writeElement(element);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -330,10 +379,7 @@ public class InvisibleElementRemovalService {
|
||||
|
||||
private void calculateOverlapsForLinePath(InvisibleElementRemovalContext context, GeneralPath linePath) {
|
||||
|
||||
List<ElementFeatures> currentOverlappedElements = context.visibleElements()
|
||||
.stream()
|
||||
.filter(features -> almostContains(linePath, features.getBoundingBox()))
|
||||
.toList();
|
||||
List<ElementFeatures> currentOverlappedElements = context.visibleElements().stream().filter(features -> almostContains(linePath, features.getBoundingBox())).toList();
|
||||
context.overlappedElements().addAll(currentOverlappedElements);
|
||||
context.visibleElements().removeAll(currentOverlappedElements);
|
||||
}
|
||||
@ -361,6 +407,12 @@ public class InvisibleElementRemovalService {
|
||||
private void processOverlappedElements(ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
||||
|
||||
for (Element element = context.reader().next(); element != null; element = context.reader().next()) {
|
||||
|
||||
if (context.markedContentStack().currentMarkedContentContainsAny(context.markedContentToIgnore()) && element.getType() != Element.e_marked_content_end) {
|
||||
writer.writeElement(element);
|
||||
continue;
|
||||
}
|
||||
|
||||
switch (element.getType()) {
|
||||
case Element.e_form -> processFormOverlappedElements(writer, element, context);
|
||||
case Element.e_image, Element.e_inline_image, Element.e_text -> removeOverlappedElement(writer, context, element);
|
||||
@ -371,6 +423,14 @@ public class InvisibleElementRemovalService {
|
||||
writer.writeElement(element);
|
||||
}
|
||||
}
|
||||
case Element.e_marked_content_begin -> {
|
||||
context.markedContentStack().enterMarkedContent(element.getMCTag().getName());
|
||||
writer.writeElement(element);
|
||||
}
|
||||
case Element.e_marked_content_end -> {
|
||||
context.markedContentStack().leaveMarkedContent();
|
||||
writer.writeElement(element);
|
||||
}
|
||||
default -> writer.writeElement(element);
|
||||
}
|
||||
}
|
||||
@ -532,9 +592,11 @@ public class InvisibleElementRemovalService {
|
||||
boolean delta,
|
||||
ElementReader reader,
|
||||
ClippingPathStack clippingPathStack,
|
||||
MarkedContentStack markedContentStack,
|
||||
List<ElementFeatures> overlappedElements,
|
||||
List<ElementFeatures> visibleElements,
|
||||
Set<Long> visitedXObjIds) {
|
||||
Set<Long> visitedXObjIds,
|
||||
Set<String> markedContentToIgnore) {
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -0,0 +1,73 @@
|
||||
package com.iqser.red.pdftronlogic.commons;
|
||||
|
||||
import java.util.Deque;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.Set;
|
||||
|
||||
/**
 * Tracks the nesting of marked-content sections while a content stream is walked.
 *
 * <p>Each {@link #enterMarkedContent(String)} pushes a tag name, each
 * {@link #leaveMarkedContent()} removes the innermost one. The query methods report
 * the innermost tag or whether any enclosing level carries a given tag.
 *
 * <p>Instances are mutable and hold no synchronization.
 */
public class MarkedContentStack {

    // Head of the deque is the innermost (most recently entered) section.
    // Visibility is intentionally left package-private, as before.
    final Deque<MarkedContent> markedContentStack = new LinkedList<>();


    /**
     * Records that a marked-content section with the given tag name was entered.
     *
     * @param name tag name of the section; stored as given
     */
    public void enterMarkedContent(String name) {

        markedContentStack.push(new MarkedContent(name));
    }


    /**
     * Records that the innermost marked-content section was left.
     *
     * <p>Calling this on an empty stack is a no-op instead of throwing
     * {@link java.util.NoSuchElementException}: a content stream (or a caller that
     * passes nested begin operators through without registering them) may produce
     * more "end" than "begin" events, and that must not abort processing.
     */
    public void leaveMarkedContent() {

        // pollFirst() removes the same element as pop() but tolerates an empty deque.
        markedContentStack.pollFirst();
    }


    /**
     * @return tag name of the innermost section, or the empty string when no section is open
     */
    public String currentMarkedContent() {

        MarkedContent innermost = markedContentStack.peek();
        return innermost == null ? "" : innermost.name();
    }


    /**
     * Checks whether any currently open section (at any nesting level) has the given tag name.
     *
     * @param name tag name to look for; {@code null} never matches
     * @return {@code true} if at least one open section carries that name
     */
    public boolean currentMarkedContentContains(String name) {

        if (name == null) {
            return false;
        }
        for (MarkedContent markedContent : markedContentStack) {
            // name is non-null here, so a null stored tag cannot cause an NPE.
            if (name.equals(markedContent.name())) {
                return true;
            }
        }
        return false;
    }


    /**
     * Checks whether any currently open section (at any nesting level) has one of the given tag names.
     *
     * @param names tag names to look for; {@code null} or empty yields {@code false}
     * @return {@code true} if at least one open section carries one of the names
     */
    public boolean currentMarkedContentContainsAny(Set<String> names) {

        // Cheap exits first: this is queried once per element, mostly with an empty set.
        if (names == null || names.isEmpty() || markedContentStack.isEmpty()) {
            return false;
        }
        for (MarkedContent markedContent : markedContentStack) {
            String tag = markedContent.name();
            // Immutable sets such as Set.of(...) throw on contains(null), so guard the lookup.
            if (tag != null && names.contains(tag)) {
                return true;
            }
        }
        return false;
    }


    /**
     * Forgets all open sections.
     */
    public void clear() {

        markedContentStack.clear();
    }


    /** One open marked-content section, identified by its tag name. */
    private record MarkedContent(String name) {

    }

}
|
||||
@ -152,4 +152,27 @@ class InvisibleElementRemovalServiceTest {
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
void removeInvisibleElementsButKeepOCRText() {
|
||||
|
||||
String fileName = "files/singlePageWithOcrText.pdf";
|
||||
String resultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL");
|
||||
String deltaResultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL_DELTA");
|
||||
|
||||
try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(resultFileName)) {
|
||||
invisibleElementRemovalService.removeInvisibleElementsButKeepOcrText(in, out, false);
|
||||
}
|
||||
|
||||
try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(deltaResultFileName)) {
|
||||
invisibleElementRemovalService.removeInvisibleElementsButKeepOcrText(in, out, true);
|
||||
}
|
||||
try (var in = new FileInputStream(resultFileName)) {
|
||||
String result = PdfTextExtraction.extractAllTextFromDocument(in);
|
||||
assertThat(result).contains("TABLE 17:", "Intergroup comparison oftotal litter", "TABLE 20:");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
BIN
src/test/resources/files/singlePageWithOcrText.pdf
Normal file
BIN
src/test/resources/files/singlePageWithOcrText.pdf
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user