diff --git a/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java b/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java index f031b7d..51be5ab 100644 --- a/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java +++ b/src/main/java/com/iqser/red/pdftronlogic/commons/ElementFeatures.java @@ -3,6 +3,7 @@ package com.iqser.red.pdftronlogic.commons; import static com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService.TOLERANCE; import java.awt.Color; +import java.awt.geom.Area; import java.awt.geom.GeneralPath; import java.awt.geom.Rectangle2D; @@ -77,12 +78,6 @@ public class ElementFeatures { } - public boolean isBackground(Rect area) { - - return false; - } - - @SneakyThrows private boolean rectsAlmostMatch(Rect bBox) { // To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance @@ -94,12 +89,6 @@ public class ElementFeatures { } - public boolean matchesFillColor(Color color) { - - return false; - } - - @EqualsAndHashCode(callSuper = true) @Getter @SuperBuilder @@ -126,7 +115,7 @@ public class ElementFeatures { @Getter @SuperBuilder @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) - private static class Path extends ElementFeatures { + public static class Path extends ElementFeatures { boolean isClippingPath; boolean isClipWindingFill; @@ -151,7 +140,6 @@ public class ElementFeatures { } - @Override public boolean matchesFillColor(Color color) { return color.equals(fillColor); @@ -162,8 +150,7 @@ public class ElementFeatures { public boolean isBackground(Rect area) { return isFilled && // - getBoundingBox().intersects(area.getX1(), area.getY1(), area.getWidth(), area.getHeight()) && // - linePath.contains(area.getX1(), area.getY1(), area.getWidth(), area.getHeight()); + getBoundingBox().intersects(area.getX1(), area.getY1(), area.getWidth(), area.getHeight()); } } diff --git a/src/main/java/com/iqser/red/pdftronlogic/commons/InvisibleElementRemovalService.java b/src/main/java/com/iqser/red/pdftronlogic/commons/InvisibleElementRemovalService.java index 231be21..d9b7057 100644 --- a/src/main/java/com/iqser/red/pdftronlogic/commons/InvisibleElementRemovalService.java +++ b/src/main/java/com/iqser/red/pdftronlogic/commons/InvisibleElementRemovalService.java @@ -2,14 +2,18 @@ package com.iqser.red.pdftronlogic.commons; import java.awt.Color; import java.awt.Shape; +import java.awt.geom.Area; import java.awt.geom.GeneralPath; import java.awt.geom.Rectangle2D; import java.io.InputStream; import java.io.OutputStream; import java.util.ArrayList; import java.util.List; +import java.util.Map; import java.util.Set; import java.util.TreeSet; +import java.util.concurrent.atomic.AtomicReference; +import java.util.stream.Collectors; import com.pdftron.common.PDFNetException; import com.pdftron.pdf.ColorPt; @@ -34,7 +38,7 @@ import lombok.extern.slf4j.Slf4j; @Slf4j public class InvisibleElementRemovalService { - static public final double TOLERANCE = 1e-3; + static public final double TOLERANCE = 1; /** @@ -403,19 +407,47 @@ public class InvisibleElementRemovalService { } + @SneakyThrows private boolean differentColorThanBackgroundColor(Color fillColor, Rect textBBox, InvisibleElementRemovalContext context) { - List backgroundElements = context.visibleElements().stream().filter(element -> element.isBackground(textBBox)).toList(); + List backgroundElements = findVisiblePathElementsThatIntersect(textBBox, context); + if (backgroundElements.isEmpty()) { return !fillColor.equals(Color.WHITE); } - return backgroundElements.stream().anyMatch(element -> !element.matchesFillColor(fillColor)); + + List pathElementsByColor = backgroundElements.stream().filter(path -> path.getFillColor().equals(fillColor)).toList(); + if (pathElementsByColor.isEmpty()) { + return true; + } + Area backgroundArea = mergeLinePathsToArea(pathElementsByColor); + return !almostContains(backgroundArea, Converter.toRectangle2D(textBBox)); + + } + + + private static List findVisiblePathElementsThatIntersect(Rect textBBox, InvisibleElementRemovalContext context) { + + return context.visibleElements() + .stream() + .filter(elementFeatures -> elementFeatures.getElementType() == Element.e_path) + .map(elementFeatures -> (ElementFeatures.Path) elementFeatures) + .filter(element -> element.isBackground(textBBox)) + .toList(); + } + + + private static Area mergeLinePathsToArea(List pathElementsWithSameColor) { + + Area backgroundArea = new Area(); + pathElementsWithSameColor.stream().map(ElementFeatures.Path::getLinePath).map(Area::new).forEach(backgroundArea::add); + return backgroundArea; } private boolean almostContains(Shape outer, Rectangle2D inner) { - //To address inconsistencies in the calculation of the bounding box we slightly shrink the inner rectangle + //To address inconsistencies in the calculation of the bounding box we slightly shrink the inner rectangle double x_with_tolerance = inner.getX() >= 0 ? inner.getX() + TOLERANCE : inner.getX() - TOLERANCE; double y_with_tolerance = inner.getY() >= 0 ? inner.getY() + TOLERANCE : inner.getY() - TOLERANCE; double height_with_tolerance = inner.getHeight() - (2 * TOLERANCE); @@ -450,9 +482,6 @@ public class InvisibleElementRemovalService { } - - - @Builder private record InvisibleElementRemovalContext( boolean delta, diff --git a/src/test/java/com/iqser/red/pdftronlogic/commons/InvisibleElementRemovalServiceTest.java b/src/test/java/com/iqser/red/pdftronlogic/commons/InvisibleElementRemovalServiceTest.java new file mode 100644 index 0000000..2fce87c --- /dev/null +++ b/src/test/java/com/iqser/red/pdftronlogic/commons/InvisibleElementRemovalServiceTest.java @@ -0,0 +1,125 @@ +package com.iqser.red.pdftronlogic.commons; + +import static com.iqser.red.pdftronlogic.commons.PdfTextExtraction.extractAllTextFromDocument; +import static org.assertj.core.api.AssertionsForClassTypes.assertThat; + +import java.io.FileInputStream; +import java.io.FileOutputStream; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import com.pdftron.pdf.PDFNet; + +import lombok.SneakyThrows; + +class InvisibleElementRemovalServiceTest { + + InvisibleElementRemovalService invisibleElementRemovalService; + private static final String pdftronLicense = "demo:1650351709282:7bd235e003000000004ec28a6743e1163a085e2115de2536ab6e2cfe5a"; + + + @BeforeEach + void createService() { + + PDFNet.initialize(pdftronLicense); + invisibleElementRemovalService = new InvisibleElementRemovalService(); + } + + @Test + @SneakyThrows + void removeInvisibleText() { + + String fileName = "files/InvisibleText.pdf"; + String resultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL"); + String deltaResultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL_DELTA"); + + try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(resultFileName)) { + invisibleElementRemovalService.removeInvisibleElements(in, out, false); + } + + try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(deltaResultFileName)) { + invisibleElementRemovalService.removeInvisibleElements(in, out, true); + } + try (var in = new FileInputStream(resultFileName)) { + String[] text = extractAllTextFromDocument(in).split("\n"); + assertThat(text).contains("APPENDIX 16 Pathology Report", "Amendment 1", "Page 255 of 260"); + } + try (var in = new FileInputStream(deltaResultFileName)) { + String[] text = extractAllTextFromDocument(in).split("\n"); + assertThat(text).contains("Michela Gregori DVM PhD Pathologist", "AUTHOR(S):", "COMPLETION DATE:"); + } + + } + + + @Test + @SneakyThrows + void removeInvisibleElementsWithColoredBackground() { + + String fileName = "files/textOnColoredBackground.pdf"; + String resultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL"); + String deltaResultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL_DELTA"); + + try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(resultFileName)) { + invisibleElementRemovalService.removeInvisibleElements(in, out, false); + } + + try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(deltaResultFileName)) { + invisibleElementRemovalService.removeInvisibleElements(in, out, true); + } + try (var in = new FileInputStream(deltaResultFileName)) { + String result = PdfTextExtraction.extractAllTextFromDocument(in); + assertThat(result).contains("#1 Dark", + "#13 Yellow", + "Ut wisi enim ad minim veniam, quis nostrud exerci tation ullamcorper suscipit lobortis nisl ut aliquip\n" + "ex ea commodo consequat. Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie\n" + "consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim\n" + "qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi."); + assertThat(result).doesNotContain("Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut\n" + "labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et\n" + "ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem\n" + "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et\n" + "dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea\n" + "rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum\n" + "dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore\n" + "magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet\n" + "clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. "); + } + + } + + @Test + @SneakyThrows + void removeInvisibleElementsThinFilledTable() { + + String fileName = "files/tableIsSingleLinePath.pdf"; + String resultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL"); + String deltaResultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL_DELTA"); + + try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(resultFileName)) { + invisibleElementRemovalService.removeInvisibleElements(in, out, false); + } + + try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(deltaResultFileName)) { + invisibleElementRemovalService.removeInvisibleElements(in, out, true); + } + try (var in = new FileInputStream(deltaResultFileName)) { + String result = PdfTextExtraction.extractAllTextFromDocument(in); + assertThat(result).isEqualTo(""); + } + + } + + @Test + @SneakyThrows + void removeInvisibleElementsChineseOverlapped() { + + String fileName = "files/chineseInvisibleElements.pdf"; + String resultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL"); + String deltaResultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL_DELTA"); + + try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(resultFileName)) { + invisibleElementRemovalService.removeInvisibleElements(in, out, false); + } + + try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(deltaResultFileName)) { + invisibleElementRemovalService.removeInvisibleElements(in, out, true); + } + try (var in = new FileInputStream(deltaResultFileName)) { + String result = PdfTextExtraction.extractAllTextFromDocument(in); + assertThat(result).contains("[Table_KeyInfo]", "[Table_StockInfo]", "[Table_BaseInfo]", "国内无线键鼠龙头企", "业", "精研研发发制制造造商商先先的的本"); + } + + } + +} \ No newline at end of file diff --git a/src/test/java/com/iqser/red/pdftronlogic/commons/OsUtils.java b/src/test/java/com/iqser/red/pdftronlogic/commons/OsUtils.java new file mode 100644 index 0000000..0349392 --- /dev/null +++ b/src/test/java/com/iqser/red/pdftronlogic/commons/OsUtils.java @@ -0,0 +1,31 @@ +package com.iqser.red.pdftronlogic.commons; + +import java.nio.file.Path; +import java.util.Locale; + +import org.junit.platform.commons.util.StringUtils; + +public class OsUtils { + + private static boolean isWindows() { + + return System.getProperty("os.name").toLowerCase(Locale.ROOT).contains("windows"); + } + + + public static String getTemporaryDirectory() { + + String tmpdir = System.getProperty("java.io.tmpdir"); + if (isWindows() && StringUtils.isNotBlank(tmpdir)) { + return tmpdir; + } + return "/tmp"; + } + + + public static String createTmpFileName(String filename, String suffix) { + + return Path.of(OsUtils.getTemporaryDirectory()).resolve(Path.of(filename).getFileName()).toString().replace(".pdf", "_" + suffix + ".pdf"); + } + +} diff --git a/src/test/resources/files/InvisibleText.pdf b/src/test/resources/files/InvisibleText.pdf new file mode 100644 index 0000000..0eb7128 Binary files /dev/null and b/src/test/resources/files/InvisibleText.pdf differ diff --git a/src/test/resources/files/chineseInvisibleElements.pdf b/src/test/resources/files/chineseInvisibleElements.pdf new file mode 100644 index 0000000..682b650 Binary files /dev/null and b/src/test/resources/files/chineseInvisibleElements.pdf differ diff --git a/src/test/resources/files/tableIsSingleLinePath.pdf b/src/test/resources/files/tableIsSingleLinePath.pdf new file mode 100644 index 0000000..e6d9a07 Binary files /dev/null and b/src/test/resources/files/tableIsSingleLinePath.pdf differ diff --git a/src/test/resources/files/textOnColoredBackground.pdf b/src/test/resources/files/textOnColoredBackground.pdf new file mode 100644 index 0000000..ec065a8 Binary files /dev/null and b/src/test/resources/files/textOnColoredBackground.pdf differ