Merge branch 'DM-326' into 'master'
DM-326: extend removeInvisibleElements. Closes DM-326. See merge request redactmanager/commons/pdftron-logic-commons!5
This commit is contained in:
commit
5811cf76d4
@ -3,6 +3,7 @@ package com.iqser.red.pdftronlogic.commons;
|
||||
import static com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService.TOLERANCE;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.geom.Area;
|
||||
import java.awt.geom.GeneralPath;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
@ -77,12 +78,6 @@ public class ElementFeatures {
|
||||
}
|
||||
|
||||
|
||||
public boolean isBackground(Rect area) {
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private boolean rectsAlmostMatch(Rect bBox) {
|
||||
// To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance
|
||||
@ -94,12 +89,6 @@ public class ElementFeatures {
|
||||
}
|
||||
|
||||
|
||||
public boolean matchesFillColor(Color color) {
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@Getter
|
||||
@SuperBuilder
|
||||
@ -126,7 +115,7 @@ public class ElementFeatures {
|
||||
@Getter
|
||||
@SuperBuilder
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
private static class Path extends ElementFeatures {
|
||||
public static class Path extends ElementFeatures {
|
||||
|
||||
boolean isClippingPath;
|
||||
boolean isClipWindingFill;
|
||||
@ -151,7 +140,6 @@ public class ElementFeatures {
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean matchesFillColor(Color color) {
|
||||
|
||||
return color.equals(fillColor);
|
||||
@ -162,8 +150,7 @@ public class ElementFeatures {
|
||||
public boolean isBackground(Rect area) {
|
||||
|
||||
return isFilled && //
|
||||
getBoundingBox().intersects(area.getX1(), area.getY1(), area.getWidth(), area.getHeight()) && //
|
||||
linePath.contains(area.getX1(), area.getY1(), area.getWidth(), area.getHeight());
|
||||
getBoundingBox().intersects(area.getX1(), area.getY1(), area.getWidth(), area.getHeight());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -2,14 +2,18 @@ package com.iqser.red.pdftronlogic.commons;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.Shape;
|
||||
import java.awt.geom.Area;
|
||||
import java.awt.geom.GeneralPath;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
import java.util.concurrent.atomic.AtomicReference;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.ColorPt;
|
||||
@ -34,7 +38,7 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@Slf4j
|
||||
public class InvisibleElementRemovalService {
|
||||
|
||||
static public final double TOLERANCE = 1e-3;
|
||||
static public final double TOLERANCE = 1;
|
||||
|
||||
|
||||
/**
|
||||
@ -403,19 +407,47 @@ public class InvisibleElementRemovalService {
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private boolean differentColorThanBackgroundColor(Color fillColor, Rect textBBox, InvisibleElementRemovalContext context) {
|
||||
|
||||
List<ElementFeatures> backgroundElements = context.visibleElements().stream().filter(element -> element.isBackground(textBBox)).toList();
|
||||
List<ElementFeatures.Path> backgroundElements = findVisiblePathElementsThatIntersect(textBBox, context);
|
||||
|
||||
if (backgroundElements.isEmpty()) {
|
||||
return !fillColor.equals(Color.WHITE);
|
||||
}
|
||||
return backgroundElements.stream().anyMatch(element -> !element.matchesFillColor(fillColor));
|
||||
|
||||
List<ElementFeatures.Path> pathElementsByColor = backgroundElements.stream().filter(path -> path.getFillColor().equals(fillColor)).toList();
|
||||
if (pathElementsByColor.isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
Area backgroundArea = mergeLinePathsToArea(pathElementsByColor);
|
||||
return !almostContains(backgroundArea, Converter.toRectangle2D(textBBox));
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static List<ElementFeatures.Path> findVisiblePathElementsThatIntersect(Rect textBBox, InvisibleElementRemovalContext context) {
|
||||
|
||||
return context.visibleElements()
|
||||
.stream()
|
||||
.filter(elementFeatures -> elementFeatures.getElementType() == Element.e_path)
|
||||
.map(elementFeatures -> (ElementFeatures.Path) elementFeatures)
|
||||
.filter(element -> element.isBackground(textBBox))
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
private static Area mergeLinePathsToArea(List<ElementFeatures.Path> pathElementsWithSameColor) {
|
||||
|
||||
Area backgroundArea = new Area();
|
||||
pathElementsWithSameColor.stream().map(ElementFeatures.Path::getLinePath).map(Area::new).forEach(backgroundArea::add);
|
||||
return backgroundArea;
|
||||
}
|
||||
|
||||
|
||||
private boolean almostContains(Shape outer, Rectangle2D inner) {
|
||||
//To address inconsistencies in the calculation of the bounding box we slightly shrink the inner rectangle
|
||||
|
||||
//To address inconsistencies in the calculation of the bounding box we slightly shrink the inner rectangle
|
||||
double x_with_tolerance = inner.getX() >= 0 ? inner.getX() + TOLERANCE : inner.getX() - TOLERANCE;
|
||||
double y_with_tolerance = inner.getY() >= 0 ? inner.getY() + TOLERANCE : inner.getY() - TOLERANCE;
|
||||
double height_with_tolerance = inner.getHeight() - (2 * TOLERANCE);
|
||||
@ -450,9 +482,6 @@ public class InvisibleElementRemovalService {
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@Builder
|
||||
private record InvisibleElementRemovalContext(
|
||||
boolean delta,
|
||||
|
||||
@ -0,0 +1,125 @@
|
||||
package com.iqser.red.pdftronlogic.commons;
|
||||
|
||||
import static com.iqser.red.pdftronlogic.commons.PdfTextExtraction.extractAllTextFromDocument;
|
||||
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
|
||||
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.pdftron.pdf.PDFNet;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
class InvisibleElementRemovalServiceTest {
|
||||
|
||||
InvisibleElementRemovalService invisibleElementRemovalService;
|
||||
private static final String pdftronLicense = "demo:1650351709282:7bd235e003000000004ec28a6743e1163a085e2115de2536ab6e2cfe5a";
|
||||
|
||||
|
||||
@BeforeEach
|
||||
void createService() {
|
||||
|
||||
PDFNet.initialize(pdftronLicense);
|
||||
invisibleElementRemovalService = new InvisibleElementRemovalService();
|
||||
}
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
void removeInvisibleText() {
|
||||
|
||||
String fileName = "files/InvisibleText.pdf";
|
||||
String resultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL");
|
||||
String deltaResultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL_DELTA");
|
||||
|
||||
try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(resultFileName)) {
|
||||
invisibleElementRemovalService.removeInvisibleElements(in, out, false);
|
||||
}
|
||||
|
||||
try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(deltaResultFileName)) {
|
||||
invisibleElementRemovalService.removeInvisibleElements(in, out, true);
|
||||
}
|
||||
try (var in = new FileInputStream(resultFileName)) {
|
||||
String[] text = extractAllTextFromDocument(in).split("\n");
|
||||
assertThat(text).contains("APPENDIX 16 Pathology Report", "Amendment 1", "Page 255 of 260");
|
||||
}
|
||||
try (var in = new FileInputStream(deltaResultFileName)) {
|
||||
String[] text = extractAllTextFromDocument(in).split("\n");
|
||||
assertThat(text).contains("Michela Gregori DVM PhD Pathologist", "AUTHOR(S):", "COMPLETION DATE:");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
void removeInvisibleElementsWithColoredBackground() {
|
||||
|
||||
String fileName = "files/textOnColoredBackground.pdf";
|
||||
String resultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL");
|
||||
String deltaResultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL_DELTA");
|
||||
|
||||
try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(resultFileName)) {
|
||||
invisibleElementRemovalService.removeInvisibleElements(in, out, false);
|
||||
}
|
||||
|
||||
try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(deltaResultFileName)) {
|
||||
invisibleElementRemovalService.removeInvisibleElements(in, out, true);
|
||||
}
|
||||
try (var in = new FileInputStream(deltaResultFileName)) {
|
||||
String result = PdfTextExtraction.extractAllTextFromDocument(in);
|
||||
assertThat(result).contains("#1 Dark",
|
||||
"#13 Yellow",
|
||||
"Ut wisi enim ad minim veniam, quis nostrud exerci tation ullamcorper suscipit lobortis nisl ut aliquip\n" + "ex ea commodo consequat. Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie\n" + "consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim\n" + "qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi.");
|
||||
assertThat(result).doesNotContain("Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut\n" + "labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et\n" + "ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem\n" + "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et\n" + "dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea\n" + "rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum\n" + "dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore\n" + "magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet\n" + "clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. ");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
void removeInvisibleElementsThinFilledTable() {
|
||||
|
||||
String fileName = "files/tableIsSingleLinePath.pdf";
|
||||
String resultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL");
|
||||
String deltaResultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL_DELTA");
|
||||
|
||||
try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(resultFileName)) {
|
||||
invisibleElementRemovalService.removeInvisibleElements(in, out, false);
|
||||
}
|
||||
|
||||
try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(deltaResultFileName)) {
|
||||
invisibleElementRemovalService.removeInvisibleElements(in, out, true);
|
||||
}
|
||||
try (var in = new FileInputStream(deltaResultFileName)) {
|
||||
String result = PdfTextExtraction.extractAllTextFromDocument(in);
|
||||
assertThat(result).isEqualTo("");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
void removeInvisibleElementsChineseOverlapped() {
|
||||
|
||||
String fileName = "files/chineseInvisibleElements.pdf";
|
||||
String resultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL");
|
||||
String deltaResultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL_DELTA");
|
||||
|
||||
try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(resultFileName)) {
|
||||
invisibleElementRemovalService.removeInvisibleElements(in, out, false);
|
||||
}
|
||||
|
||||
try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(deltaResultFileName)) {
|
||||
invisibleElementRemovalService.removeInvisibleElements(in, out, true);
|
||||
}
|
||||
try (var in = new FileInputStream(deltaResultFileName)) {
|
||||
String result = PdfTextExtraction.extractAllTextFromDocument(in);
|
||||
assertThat(result).contains("[Table_KeyInfo]", "[Table_StockInfo]", "[Table_BaseInfo]", "国内无线键鼠龙头企", "业", "精研研发发制制造造商商先先的的本");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,31 @@
|
||||
package com.iqser.red.pdftronlogic.commons;
|
||||
|
||||
import java.nio.file.Path;
|
||||
import java.util.Locale;
|
||||
|
||||
import org.junit.platform.commons.util.StringUtils;
|
||||
|
||||
public class OsUtils {
|
||||
|
||||
private static boolean isWindows() {
|
||||
|
||||
return System.getProperty("os.name").toLowerCase(Locale.ROOT).contains("windows");
|
||||
}
|
||||
|
||||
|
||||
public static String getTemporaryDirectory() {
|
||||
|
||||
String tmpdir = System.getProperty("java.io.tmpdir");
|
||||
if (isWindows() && StringUtils.isNotBlank(tmpdir)) {
|
||||
return tmpdir;
|
||||
}
|
||||
return "/tmp";
|
||||
}
|
||||
|
||||
|
||||
public static String createTmpFileName(String filename, String suffix) {
|
||||
|
||||
return Path.of(OsUtils.getTemporaryDirectory()).resolve(Path.of(filename).getFileName()).toString().replace(".pdf", "_" + suffix + ".pdf");
|
||||
}
|
||||
|
||||
}
|
||||
BIN
src/test/resources/files/InvisibleText.pdf
Normal file
BIN
src/test/resources/files/InvisibleText.pdf
Normal file
Binary file not shown.
BIN
src/test/resources/files/chineseInvisibleElements.pdf
Normal file
BIN
src/test/resources/files/chineseInvisibleElements.pdf
Normal file
Binary file not shown.
BIN
src/test/resources/files/tableIsSingleLinePath.pdf
Normal file
BIN
src/test/resources/files/tableIsSingleLinePath.pdf
Normal file
Binary file not shown.
BIN
src/test/resources/files/textOnColoredBackground.pdf
Normal file
BIN
src/test/resources/files/textOnColoredBackground.pdf
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user