Merge branch 'DM-326' into 'master'

DM-326: extend removeInvisibleElements

Closes DM-326

See merge request redactmanager/commons/pdftron-logic-commons!5
This commit is contained in:
Kilian Schüttler 2023-07-14 12:33:51 +02:00
commit 5811cf76d4
8 changed files with 195 additions and 23 deletions

View File

@ -3,6 +3,7 @@ package com.iqser.red.pdftronlogic.commons;
import static com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService.TOLERANCE;
import java.awt.Color;
import java.awt.geom.Area;
import java.awt.geom.GeneralPath;
import java.awt.geom.Rectangle2D;
@ -77,12 +78,6 @@ public class ElementFeatures {
}
public boolean isBackground(Rect area) {
return false;
}
@SneakyThrows
private boolean rectsAlmostMatch(Rect bBox) {
// To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance
@ -94,12 +89,6 @@ public class ElementFeatures {
}
public boolean matchesFillColor(Color color) {
return false;
}
@EqualsAndHashCode(callSuper = true)
@Getter
@SuperBuilder
@ -126,7 +115,7 @@ public class ElementFeatures {
@Getter
@SuperBuilder
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
private static class Path extends ElementFeatures {
public static class Path extends ElementFeatures {
boolean isClippingPath;
boolean isClipWindingFill;
@ -151,7 +140,6 @@ public class ElementFeatures {
}
@Override
public boolean matchesFillColor(Color color) {
return color.equals(fillColor);
@ -162,8 +150,7 @@ public class ElementFeatures {
public boolean isBackground(Rect area) {
return isFilled && //
getBoundingBox().intersects(area.getX1(), area.getY1(), area.getWidth(), area.getHeight()) && //
linePath.contains(area.getX1(), area.getY1(), area.getWidth(), area.getHeight());
getBoundingBox().intersects(area.getX1(), area.getY1(), area.getWidth(), area.getHeight());
}
}

View File

@ -2,14 +2,18 @@ package com.iqser.red.pdftronlogic.commons;
import java.awt.Color;
import java.awt.Shape;
import java.awt.geom.Area;
import java.awt.geom.GeneralPath;
import java.awt.geom.Rectangle2D;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.atomic.AtomicReference;
import java.util.stream.Collectors;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.ColorPt;
@ -34,7 +38,7 @@ import lombok.extern.slf4j.Slf4j;
@Slf4j
public class InvisibleElementRemovalService {
static public final double TOLERANCE = 1e-3;
static public final double TOLERANCE = 1;
/**
@ -403,19 +407,47 @@ public class InvisibleElementRemovalService {
}
@SneakyThrows
private boolean differentColorThanBackgroundColor(Color fillColor, Rect textBBox, InvisibleElementRemovalContext context) {
List<ElementFeatures> backgroundElements = context.visibleElements().stream().filter(element -> element.isBackground(textBBox)).toList();
List<ElementFeatures.Path> backgroundElements = findVisiblePathElementsThatIntersect(textBBox, context);
if (backgroundElements.isEmpty()) {
return !fillColor.equals(Color.WHITE);
}
return backgroundElements.stream().anyMatch(element -> !element.matchesFillColor(fillColor));
List<ElementFeatures.Path> pathElementsByColor = backgroundElements.stream().filter(path -> path.getFillColor().equals(fillColor)).toList();
if (pathElementsByColor.isEmpty()) {
return true;
}
Area backgroundArea = mergeLinePathsToArea(pathElementsByColor);
return !almostContains(backgroundArea, Converter.toRectangle2D(textBBox));
}
/**
 * Selects, from the visible elements of the given context, those whose element type is
 * {@code Element.e_path} and for which {@code isBackground(textBBox)} returns {@code true}.
 *
 * @param textBBox the bounding box that candidate paths are tested against
 * @param context  the removal context supplying the visible elements
 * @return the matching elements, cast to {@link ElementFeatures.Path}, in encounter order
 */
private static List<ElementFeatures.Path> findVisiblePathElementsThatIntersect(Rect textBBox, InvisibleElementRemovalContext context) {

    return context.visibleElements()
            .stream()
            // only path elements may be cast to ElementFeatures.Path below
            .filter(candidate -> candidate.getElementType() == Element.e_path)
            .map(ElementFeatures.Path.class::cast)
            .filter(path -> path.isBackground(textBBox))
            .toList();
}
/**
 * Builds a single {@link Area} that is the union of the line paths of all given path elements.
 *
 * @param pathElementsWithSameColor the path elements whose line paths are merged
 * @return the union of all line paths; an empty area if the list is empty
 */
private static Area mergeLinePathsToArea(List<ElementFeatures.Path> pathElementsWithSameColor) {

    Area mergedArea = new Area();
    for (ElementFeatures.Path pathElement : pathElementsWithSameColor) {
        // Area.add performs a union with the area accumulated so far
        mergedArea.add(new Area(pathElement.getLinePath()));
    }
    return mergedArea;
}
private boolean almostContains(Shape outer, Rectangle2D inner) {
//To address inconsistencies in the calculation of the bounding box we slightly shrink the inner rectangle
//To address inconsistencies in the calculation of the bounding box we slightly shrink the inner rectangle
double x_with_tolerance = inner.getX() >= 0 ? inner.getX() + TOLERANCE : inner.getX() - TOLERANCE;
double y_with_tolerance = inner.getY() >= 0 ? inner.getY() + TOLERANCE : inner.getY() - TOLERANCE;
double height_with_tolerance = inner.getHeight() - (2 * TOLERANCE);
@ -450,9 +482,6 @@ public class InvisibleElementRemovalService {
}
@Builder
private record InvisibleElementRemovalContext(
boolean delta,

View File

@ -0,0 +1,125 @@
package com.iqser.red.pdftronlogic.commons;
import static com.iqser.red.pdftronlogic.commons.PdfTextExtraction.extractAllTextFromDocument;
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import com.pdftron.pdf.PDFNet;
import lombok.SneakyThrows;
/**
 * Tests for {@link InvisibleElementRemovalService#removeInvisibleElements}: every test processes a
 * PDF from the test resources once with the delta flag set to {@code false} and once with it set to
 * {@code true}, writes both results to temporary files and inspects the text extracted from them.
 */
class InvisibleElementRemovalServiceTest {

    InvisibleElementRemovalService invisibleElementRemovalService;
    private static final String PDFTRON_LICENSE = "demo:1650351709282:7bd235e003000000004ec28a6743e1163a085e2115de2536ab6e2cfe5a";


    @BeforeEach
    void createService() {

        PDFNet.initialize(PDFTRON_LICENSE);
        invisibleElementRemovalService = new InvisibleElementRemovalService();
    }


    @Test
    @SneakyThrows
    void removeInvisibleText() {

        String fileName = "files/InvisibleText.pdf";
        String resultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL");
        String deltaResultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL_DELTA");

        runRemovalInBothModes(fileName, resultFileName, deltaResultFileName);

        String[] resultLines = extractText(resultFileName).split("\n");
        assertThat(resultLines).contains("APPENDIX 16 Pathology Report", "Amendment 1", "Page 255 of 260");

        String[] deltaLines = extractText(deltaResultFileName).split("\n");
        assertThat(deltaLines).contains("Michela Gregori DVM PhD Pathologist", "AUTHOR(S):", "COMPLETION DATE:");
    }


    @Test
    @SneakyThrows
    void removeInvisibleElementsWithColoredBackground() {

        String fileName = "files/textOnColoredBackground.pdf";
        String resultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL");
        String deltaResultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL_DELTA");

        runRemovalInBothModes(fileName, resultFileName, deltaResultFileName);

        String expectedParagraph = "Ut wisi enim ad minim veniam, quis nostrud exerci tation ullamcorper suscipit lobortis nisl ut aliquip\n"
                + "ex ea commodo consequat. Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie\n"
                + "consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim\n"
                + "qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi.";
        String unexpectedParagraph = "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut\n"
                + "labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et\n"
                + "ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem\n"
                + "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et\n"
                + "dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea\n"
                + "rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum\n"
                + "dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore\n"
                + "magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet\n"
                + "clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. ";

        String result = extractText(deltaResultFileName);
        assertThat(result).contains("#1 Dark", "#13 Yellow", expectedParagraph);
        assertThat(result).doesNotContain(unexpectedParagraph);
    }


    @Test
    @SneakyThrows
    void removeInvisibleElementsThinFilledTable() {

        String fileName = "files/tableIsSingleLinePath.pdf";
        String resultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL");
        String deltaResultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL_DELTA");

        runRemovalInBothModes(fileName, resultFileName, deltaResultFileName);

        String result = extractText(deltaResultFileName);
        assertThat(result).isEqualTo("");
    }


    @Test
    @SneakyThrows
    void removeInvisibleElementsChineseOverlapped() {

        String fileName = "files/chineseInvisibleElements.pdf";
        String resultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL");
        String deltaResultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL_DELTA");

        runRemovalInBothModes(fileName, resultFileName, deltaResultFileName);

        String result = extractText(deltaResultFileName);
        assertThat(result).contains("[Table_KeyInfo]", "[Table_StockInfo]", "[Table_BaseInfo]", "国内无线键鼠龙头企", "", "精研研发发制制造造商商先先的的本");
    }


    /**
     * Processes the classpath resource first with delta {@code false}, then with delta {@code true},
     * writing each result to its own target file.
     */
    private void runRemovalInBothModes(String resourceName, String resultFileName, String deltaResultFileName) {

        runRemoval(resourceName, resultFileName, false);
        runRemoval(resourceName, deltaResultFileName, true);
    }


    /**
     * Streams the classpath resource through {@code removeInvisibleElements} into the target file.
     */
    @SneakyThrows
    private void runRemoval(String resourceName, String targetFileName, boolean delta) {

        try (var in = this.getClass().getClassLoader().getResourceAsStream(resourceName); var out = new FileOutputStream(targetFileName)) {
            invisibleElementRemovalService.removeInvisibleElements(in, out, delta);
        }
    }


    /**
     * Extracts the complete text of the PDF stored at the given file name.
     */
    @SneakyThrows
    private String extractText(String pdfFileName) {

        try (var in = new FileInputStream(pdfFileName)) {
            return extractAllTextFromDocument(in);
        }
    }

}

View File

@ -0,0 +1,31 @@
package com.iqser.red.pdftronlogic.commons;
import java.nio.file.Path;
import java.util.Locale;
import org.junit.platform.commons.util.StringUtils;
/**
 * Helpers for choosing temporary file locations.
 */
public class OsUtils {

    private static final String PDF_EXTENSION = ".pdf";


    /**
     * Reports whether the {@code os.name} system property names a Windows system.
     * A missing property is treated as "not Windows" instead of causing a NullPointerException.
     */
    private static boolean isWindows() {

        return System.getProperty("os.name", "").toLowerCase(Locale.ROOT).contains("windows");
    }


    /**
     * Returns the directory used for temporary files.
     *
     * @return the value of {@code java.io.tmpdir} when running on Windows and the property is not blank,
     *         otherwise {@code "/tmp"}
     */
    public static String getTemporaryDirectory() {

        String tmpdir = System.getProperty("java.io.tmpdir");
        // Plain JDK blank check; avoids depending on JUnit's internal StringUtils.
        boolean tmpdirIsSet = tmpdir != null && !tmpdir.trim().isEmpty();
        if (isWindows() && tmpdirIsSet) {
            return tmpdir;
        }
        return "/tmp";
    }


    /**
     * Builds a file name inside the temporary directory from the last path segment of {@code filename},
     * inserting {@code "_" + suffix} in front of a trailing {@code .pdf} extension.
     * Only the trailing extension of the file name is rewritten; other occurrences of {@code ".pdf"}
     * in the file name or in the temporary directory path are left untouched.
     * A file name that does not end in {@code .pdf} is used unchanged.
     *
     * @param filename path whose last segment supplies the file name
     * @param suffix   text inserted (after an underscore) before the {@code .pdf} extension
     * @return the resulting path as a string
     */
    public static String createTmpFileName(String filename, String suffix) {

        String baseName = Path.of(filename).getFileName().toString();
        String tmpName = baseName;
        if (baseName.endsWith(PDF_EXTENSION)) {
            String stem = baseName.substring(0, baseName.length() - PDF_EXTENSION.length());
            tmpName = stem + "_" + suffix + PDF_EXTENSION;
        }
        return Path.of(getTemporaryDirectory()).resolve(tmpName).toString();
    }

}

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.