diff --git a/src/main/java/com/iqser/red/pdftronlogic/commons/InvisibleElementRemovalService.java b/src/main/java/com/iqser/red/pdftronlogic/commons/InvisibleElementRemovalService.java index e1f64ec..b9c3abb 100644 --- a/src/main/java/com/iqser/red/pdftronlogic/commons/InvisibleElementRemovalService.java +++ b/src/main/java/com/iqser/red/pdftronlogic/commons/InvisibleElementRemovalService.java @@ -8,6 +8,7 @@ import java.awt.geom.Rectangle2D; import java.io.InputStream; import java.io.OutputStream; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import java.util.Set; import java.util.TreeSet; @@ -57,11 +58,11 @@ public class InvisibleElementRemovalService { * @param out OutputStream to write the resulting file to **/ @SneakyThrows - public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta, boolean removePaths) { + public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta, boolean removePaths, Set<String> markedContentToIgnore) { PDFDoc pdfDoc = new PDFDoc(pdfFile); - execute(pdfDoc, delta, removePaths); + execute(pdfDoc, delta, removePaths, markedContentToIgnore); try { pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null); @@ -76,15 +77,44 @@ public class InvisibleElementRemovalService { /** - * This method is equal to {@link #removeInvisibleElements(InputStream, OutputStream, boolean, boolean)}, with removePaths == true. + * This method is equal to {@link #removeInvisibleElements(InputStream, OutputStream, boolean, boolean, Set)}, with removePaths == true and markedContentToIgnore == emptySet(). 
*/ @SneakyThrows public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta) { - removeInvisibleElements(pdfFile, out, delta, true); + removeInvisibleElements(pdfFile, out, delta, true, Collections.emptySet()); } + /** + * This method is equal to {@link #removeInvisibleElements(InputStream, OutputStream, boolean, boolean, Set)}, with removePaths == true and markedContentToIgnore == Set.of("KNECON_OCR"). + */ + public void removeInvisibleElementsButKeepOcrText(InputStream pdfFile, OutputStream out, boolean delta) { + + removeInvisibleElements(pdfFile, out, delta, true, Set.of("KNECON_OCR")); + } + + + /** + * This method is equal to {@link #removeInvisibleElements(InputStream, OutputStream, boolean, boolean, Set)}, with markedContentToIgnore == emptySet(). + */ + @SneakyThrows + public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta, boolean removePaths) { + + removeInvisibleElements(pdfFile, out, delta, removePaths, Collections.emptySet()); + + } + + + /** + * This method is similar to {@link #removeInvisibleElements(InputStream, OutputStream, boolean, boolean, Set)}, just with a PDFDoc. + */ + @SneakyThrows + public void removeInvisibleElements(PDFDoc pdfDoc, boolean delta, boolean removePaths, Set<String> markedContentToIgnore) { + + execute(pdfDoc, delta, removePaths, markedContentToIgnore); + } + /** * This method is similar to {@link #removeInvisibleElements(InputStream, OutputStream, boolean, boolean)}, just with a PDFDoc. @@ -92,22 +122,22 @@ public class InvisibleElementRemovalService { @SneakyThrows public void removeInvisibleElements(PDFDoc pdfDoc, boolean delta, boolean removePaths) { - execute(pdfDoc, delta, removePaths); + execute(pdfDoc, delta, removePaths, Collections.emptySet()); } /** - * This method is equal to {@link #removeInvisibleElements(PDFDoc, boolean, boolean)}, with removePaths == true. 
+ * This method is equal to {@link #removeInvisibleElements(PDFDoc, boolean, boolean, Set)}, with removePaths == true and markedContentToIgnore == emptySet(). */ @SneakyThrows public void removeInvisibleElements(PDFDoc pdfDoc, boolean delta) { - execute(pdfDoc, delta, true); + execute(pdfDoc, delta, true, Collections.emptySet()); } @SneakyThrows - private void execute(PDFDoc pdfDoc, boolean delta, boolean removePaths) { + private void execute(PDFDoc pdfDoc, boolean delta, boolean removePaths, Set<String> markedContentToIgnore) { log.info("Start removing invisible Elements"); ElementWriter writer = new ElementWriter(); @@ -123,16 +153,19 @@ public class InvisibleElementRemovalService { InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder() .reader(reader) .clippingPathStack(new ClippingPathStack(page.getMediaBox())) + .markedContentStack(new MarkedContentStack()) .removePaths(removePaths) .delta(delta) .overlappedElements(new ArrayList<>()) .visibleElements(new ArrayList<>()) .visitedXObjIds(visitedXObjIds) + .markedContentToIgnore(markedContentToIgnore) .build(); removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context); context.visitedXObjIds().clear(); + context.markedContentStack().clear(); removeOverlappedElements(page, writer, context); @@ -149,6 +182,7 @@ public class InvisibleElementRemovalService { InvisibleElementRemovalContext context) throws PDFNetException { context.reader().begin(page); + context.markedContentStack().clear(); writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict()); processElements(writer, context); writer.end(); @@ -158,7 +192,13 @@ public class InvisibleElementRemovalService { private void processElements(ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { - for (Element element = context.reader().next(); element != null; element = context.reader().next()) + for (Element element = context.reader().next(); element != null; element = context.reader().next()) { + /* Inside ignored marked content everything is copied untouched; begin/end markers still reach the switch so nested sections keep the stack balanced. */ + if 
(context.markedContentStack().currentMarkedContentContainsAny(context.markedContentToIgnore()) && element.getType() != Element.e_marked_content_begin && element.getType() != Element.e_marked_content_end) { + writer.writeElement(element); + continue; + } + switch (element.getType()) { case Element.e_image, Element.e_inline_image -> processImages(element, writer, context); case Element.e_text -> processText(element, writer, context); @@ -172,8 +212,17 @@ public class InvisibleElementRemovalService { context.clippingPathStack().leaveGState(); writer.writeElement(element); } + case Element.e_marked_content_begin -> { + context.markedContentStack().enterMarkedContent(element.getMCTag().getName()); + writer.writeElement(element); + } + case Element.e_marked_content_end -> { + context.markedContentStack().leaveMarkedContent(); + writer.writeElement(element); + } default -> writer.writeElement(element); } + } } @@ -330,10 +379,7 @@ public class InvisibleElementRemovalService { private void calculateOverlapsForLinePath(InvisibleElementRemovalContext context, GeneralPath linePath) { - List currentOverlappedElements = context.visibleElements() - .stream() - .filter(features -> almostContains(linePath, features.getBoundingBox())) - .toList(); + List currentOverlappedElements = context.visibleElements().stream().filter(features -> almostContains(linePath, features.getBoundingBox())).toList(); context.overlappedElements().addAll(currentOverlappedElements); context.visibleElements().removeAll(currentOverlappedElements); } @@ -361,6 +407,12 @@ public class InvisibleElementRemovalService { private void processOverlappedElements(ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { for (Element element = context.reader().next(); element != null; element = context.reader().next()) { + /* Inside ignored marked content everything is copied untouched; begin/end markers still reach the switch so nested sections keep the stack balanced. */ + if (context.markedContentStack().currentMarkedContentContainsAny(context.markedContentToIgnore()) && element.getType() != Element.e_marked_content_begin && element.getType() != Element.e_marked_content_end) { + writer.writeElement(element); + continue; + } + switch 
(element.getType()) { case Element.e_form -> processFormOverlappedElements(writer, element, context); case Element.e_image, Element.e_inline_image, Element.e_text -> removeOverlappedElement(writer, context, element); @@ -371,6 +423,14 @@ public class InvisibleElementRemovalService { writer.writeElement(element); } } + case Element.e_marked_content_begin -> { + context.markedContentStack().enterMarkedContent(element.getMCTag().getName()); + writer.writeElement(element); + } + case Element.e_marked_content_end -> { + context.markedContentStack().leaveMarkedContent(); + writer.writeElement(element); + } default -> writer.writeElement(element); } } @@ -532,9 +592,11 @@ public class InvisibleElementRemovalService { boolean delta, ElementReader reader, ClippingPathStack clippingPathStack, + MarkedContentStack markedContentStack, List overlappedElements, List visibleElements, - Set visitedXObjIds) { + Set visitedXObjIds, + Set<String> markedContentToIgnore) { } diff --git a/src/main/java/com/iqser/red/pdftronlogic/commons/MarkedContentStack.java b/src/main/java/com/iqser/red/pdftronlogic/commons/MarkedContentStack.java new file mode 100644 index 0000000..20747fb --- /dev/null +++ b/src/main/java/com/iqser/red/pdftronlogic/commons/MarkedContentStack.java @@ -0,0 +1,73 @@ +package com.iqser.red.pdftronlogic.commons; + +import java.util.ArrayDeque; +import java.util.Deque; +import java.util.Iterator; +import java.util.Set; + +public class MarkedContentStack { + /** Names of the currently open marked-content sections; the innermost section is at the head. */ + private final Deque<MarkedContent> markedContentStack = new ArrayDeque<>(); + + /** Records that a marked-content section with the given tag name was opened. */ + public void enterMarkedContent(String name) { + + markedContentStack.push(new MarkedContent(name)); + } + + /** Closes the innermost open section; a stray end marker on an empty stack is ignored instead of throwing. */ + public void leaveMarkedContent() { + + markedContentStack.poll(); + } + + /** Returns the name of the innermost open section, or an empty string if none is open. */ + public String currentMarkedContent() { + + if (markedContentStack.isEmpty()) { + return ""; + } + return markedContentStack.peek().name(); + } + + /** Returns whether any currently open section, at any nesting depth, has exactly the given name. */ + public boolean currentMarkedContentContains(String name) { + + Iterator<MarkedContent> markedContentIterator = markedContentStack.descendingIterator(); + 
while (markedContentIterator.hasNext()) { + var markedContent = markedContentIterator.next(); + if (markedContent.name().equals(name)) { + return true; + } + } + return false; + } + + /** Returns whether any currently open section, at any nesting depth, has a name contained in the given set. */ + public boolean currentMarkedContentContainsAny(Set<String> names) { + + if (markedContentStack.isEmpty() || names.isEmpty()) { + return false; + } + Iterator<MarkedContent> markedContentIterator = markedContentStack.descendingIterator(); + while (markedContentIterator.hasNext()) { + var markedContent = markedContentIterator.next(); + if (names.contains(markedContent.name())) { + return true; + } + } + return false; + } + + /** Forgets all open sections, e.g. before a content stream is read again from the start. */ + public void clear() { + + markedContentStack.clear(); + } + + /** Stack entry holding the tag name of one open section. */ + private record MarkedContent(String name) { + + } + +} diff --git a/src/test/java/com/iqser/red/pdftronlogic/commons/InvisibleElementRemovalServiceTest.java b/src/test/java/com/iqser/red/pdftronlogic/commons/InvisibleElementRemovalServiceTest.java index d817678..135385b 100644 --- a/src/test/java/com/iqser/red/pdftronlogic/commons/InvisibleElementRemovalServiceTest.java +++ b/src/test/java/com/iqser/red/pdftronlogic/commons/InvisibleElementRemovalServiceTest.java @@ -152,4 +152,27 @@ class InvisibleElementRemovalServiceTest { } + + @Test + @SneakyThrows + void removeInvisibleElementsButKeepOCRText() { + + String fileName = "files/singlePageWithOcrText.pdf"; + String resultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL"); + String deltaResultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL_DELTA"); + + try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(resultFileName)) { + invisibleElementRemovalService.removeInvisibleElementsButKeepOcrText(in, out, false); + } + + try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(deltaResultFileName)) { + invisibleElementRemovalService.removeInvisibleElementsButKeepOcrText(in, out, true); + } + try (var in = new FileInputStream(resultFileName)) { + String 
result = PdfTextExtraction.extractAllTextFromDocument(in); + assertThat(result).contains("TABLE 17:", "Intergroup comparison oftotal litter", "TABLE 20:"); + } + + } + } \ No newline at end of file diff --git a/src/test/resources/files/singlePageWithOcrText.pdf b/src/test/resources/files/singlePageWithOcrText.pdf new file mode 100644 index 0000000..b795566 Binary files /dev/null and b/src/test/resources/files/singlePageWithOcrText.pdf differ