Merge branch 'DM-326' into 'master'
DM-326: Extend removeInvisibleElements. Closes DM-326. See merge request redactmanager/commons/pdftron-logic-commons!5.
This commit is contained in:
commit
5811cf76d4
@ -3,6 +3,7 @@ package com.iqser.red.pdftronlogic.commons;
|
|||||||
import static com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService.TOLERANCE;
|
import static com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService.TOLERANCE;
|
||||||
|
|
||||||
import java.awt.Color;
|
import java.awt.Color;
|
||||||
|
import java.awt.geom.Area;
|
||||||
import java.awt.geom.GeneralPath;
|
import java.awt.geom.GeneralPath;
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
|
|
||||||
@ -77,12 +78,6 @@ public class ElementFeatures {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public boolean isBackground(Rect area) {
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private boolean rectsAlmostMatch(Rect bBox) {
|
private boolean rectsAlmostMatch(Rect bBox) {
|
||||||
// To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance
|
// To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance
|
||||||
@ -94,12 +89,6 @@ public class ElementFeatures {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public boolean matchesFillColor(Color color) {
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@EqualsAndHashCode(callSuper = true)
|
@EqualsAndHashCode(callSuper = true)
|
||||||
@Getter
|
@Getter
|
||||||
@SuperBuilder
|
@SuperBuilder
|
||||||
@ -126,7 +115,7 @@ public class ElementFeatures {
|
|||||||
@Getter
|
@Getter
|
||||||
@SuperBuilder
|
@SuperBuilder
|
||||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
private static class Path extends ElementFeatures {
|
public static class Path extends ElementFeatures {
|
||||||
|
|
||||||
boolean isClippingPath;
|
boolean isClippingPath;
|
||||||
boolean isClipWindingFill;
|
boolean isClipWindingFill;
|
||||||
@ -151,7 +140,6 @@ public class ElementFeatures {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean matchesFillColor(Color color) {
|
public boolean matchesFillColor(Color color) {
|
||||||
|
|
||||||
return color.equals(fillColor);
|
return color.equals(fillColor);
|
||||||
@ -162,8 +150,7 @@ public class ElementFeatures {
|
|||||||
public boolean isBackground(Rect area) {
|
public boolean isBackground(Rect area) {
|
||||||
|
|
||||||
return isFilled && //
|
return isFilled && //
|
||||||
getBoundingBox().intersects(area.getX1(), area.getY1(), area.getWidth(), area.getHeight()) && //
|
getBoundingBox().intersects(area.getX1(), area.getY1(), area.getWidth(), area.getHeight());
|
||||||
linePath.contains(area.getX1(), area.getY1(), area.getWidth(), area.getHeight());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -2,14 +2,18 @@ package com.iqser.red.pdftronlogic.commons;
|
|||||||
|
|
||||||
import java.awt.Color;
|
import java.awt.Color;
|
||||||
import java.awt.Shape;
|
import java.awt.Shape;
|
||||||
|
import java.awt.geom.Area;
|
||||||
import java.awt.geom.GeneralPath;
|
import java.awt.geom.GeneralPath;
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.OutputStream;
|
import java.io.OutputStream;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.TreeSet;
|
import java.util.TreeSet;
|
||||||
|
import java.util.concurrent.atomic.AtomicReference;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import com.pdftron.common.PDFNetException;
|
import com.pdftron.common.PDFNetException;
|
||||||
import com.pdftron.pdf.ColorPt;
|
import com.pdftron.pdf.ColorPt;
|
||||||
@ -34,7 +38,7 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
@Slf4j
|
@Slf4j
|
||||||
public class InvisibleElementRemovalService {
|
public class InvisibleElementRemovalService {
|
||||||
|
|
||||||
static public final double TOLERANCE = 1e-3;
|
static public final double TOLERANCE = 1;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -403,19 +407,47 @@ public class InvisibleElementRemovalService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
private boolean differentColorThanBackgroundColor(Color fillColor, Rect textBBox, InvisibleElementRemovalContext context) {
|
private boolean differentColorThanBackgroundColor(Color fillColor, Rect textBBox, InvisibleElementRemovalContext context) {
|
||||||
|
|
||||||
List<ElementFeatures> backgroundElements = context.visibleElements().stream().filter(element -> element.isBackground(textBBox)).toList();
|
List<ElementFeatures.Path> backgroundElements = findVisiblePathElementsThatIntersect(textBBox, context);
|
||||||
|
|
||||||
if (backgroundElements.isEmpty()) {
|
if (backgroundElements.isEmpty()) {
|
||||||
return !fillColor.equals(Color.WHITE);
|
return !fillColor.equals(Color.WHITE);
|
||||||
}
|
}
|
||||||
return backgroundElements.stream().anyMatch(element -> !element.matchesFillColor(fillColor));
|
|
||||||
|
List<ElementFeatures.Path> pathElementsByColor = backgroundElements.stream().filter(path -> path.getFillColor().equals(fillColor)).toList();
|
||||||
|
if (pathElementsByColor.isEmpty()) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
Area backgroundArea = mergeLinePathsToArea(pathElementsByColor);
|
||||||
|
return !almostContains(backgroundArea, Converter.toRectangle2D(textBBox));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static List<ElementFeatures.Path> findVisiblePathElementsThatIntersect(Rect textBBox, InvisibleElementRemovalContext context) {
|
||||||
|
|
||||||
|
return context.visibleElements()
|
||||||
|
.stream()
|
||||||
|
.filter(elementFeatures -> elementFeatures.getElementType() == Element.e_path)
|
||||||
|
.map(elementFeatures -> (ElementFeatures.Path) elementFeatures)
|
||||||
|
.filter(element -> element.isBackground(textBBox))
|
||||||
|
.toList();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static Area mergeLinePathsToArea(List<ElementFeatures.Path> pathElementsWithSameColor) {
|
||||||
|
|
||||||
|
Area backgroundArea = new Area();
|
||||||
|
pathElementsWithSameColor.stream().map(ElementFeatures.Path::getLinePath).map(Area::new).forEach(backgroundArea::add);
|
||||||
|
return backgroundArea;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private boolean almostContains(Shape outer, Rectangle2D inner) {
|
private boolean almostContains(Shape outer, Rectangle2D inner) {
|
||||||
//To address inconsistencies in the calculation of the bounding box we slightly shrink the inner rectangle
|
|
||||||
|
|
||||||
|
//To address inconsistencies in the calculation of the bounding box we slightly shrink the inner rectangle
|
||||||
double x_with_tolerance = inner.getX() >= 0 ? inner.getX() + TOLERANCE : inner.getX() - TOLERANCE;
|
double x_with_tolerance = inner.getX() >= 0 ? inner.getX() + TOLERANCE : inner.getX() - TOLERANCE;
|
||||||
double y_with_tolerance = inner.getY() >= 0 ? inner.getY() + TOLERANCE : inner.getY() - TOLERANCE;
|
double y_with_tolerance = inner.getY() >= 0 ? inner.getY() + TOLERANCE : inner.getY() - TOLERANCE;
|
||||||
double height_with_tolerance = inner.getHeight() - (2 * TOLERANCE);
|
double height_with_tolerance = inner.getHeight() - (2 * TOLERANCE);
|
||||||
@ -450,9 +482,6 @@ public class InvisibleElementRemovalService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@Builder
|
@Builder
|
||||||
private record InvisibleElementRemovalContext(
|
private record InvisibleElementRemovalContext(
|
||||||
boolean delta,
|
boolean delta,
|
||||||
|
|||||||
@ -0,0 +1,125 @@
|
|||||||
|
package com.iqser.red.pdftronlogic.commons;
|
||||||
|
|
||||||
|
import static com.iqser.red.pdftronlogic.commons.PdfTextExtraction.extractAllTextFromDocument;
|
||||||
|
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
|
||||||
|
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.io.FileOutputStream;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import com.pdftron.pdf.PDFNet;
|
||||||
|
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
|
||||||
|
class InvisibleElementRemovalServiceTest {
|
||||||
|
|
||||||
|
InvisibleElementRemovalService invisibleElementRemovalService;
|
||||||
|
private static final String pdftronLicense = "demo:1650351709282:7bd235e003000000004ec28a6743e1163a085e2115de2536ab6e2cfe5a";
|
||||||
|
|
||||||
|
|
||||||
|
@BeforeEach
|
||||||
|
void createService() {
|
||||||
|
|
||||||
|
PDFNet.initialize(pdftronLicense);
|
||||||
|
invisibleElementRemovalService = new InvisibleElementRemovalService();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@SneakyThrows
|
||||||
|
void removeInvisibleText() {
|
||||||
|
|
||||||
|
String fileName = "files/InvisibleText.pdf";
|
||||||
|
String resultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL");
|
||||||
|
String deltaResultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL_DELTA");
|
||||||
|
|
||||||
|
try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(resultFileName)) {
|
||||||
|
invisibleElementRemovalService.removeInvisibleElements(in, out, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(deltaResultFileName)) {
|
||||||
|
invisibleElementRemovalService.removeInvisibleElements(in, out, true);
|
||||||
|
}
|
||||||
|
try (var in = new FileInputStream(resultFileName)) {
|
||||||
|
String[] text = extractAllTextFromDocument(in).split("\n");
|
||||||
|
assertThat(text).contains("APPENDIX 16 Pathology Report", "Amendment 1", "Page 255 of 260");
|
||||||
|
}
|
||||||
|
try (var in = new FileInputStream(deltaResultFileName)) {
|
||||||
|
String[] text = extractAllTextFromDocument(in).split("\n");
|
||||||
|
assertThat(text).contains("Michela Gregori DVM PhD Pathologist", "AUTHOR(S):", "COMPLETION DATE:");
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@SneakyThrows
|
||||||
|
void removeInvisibleElementsWithColoredBackground() {
|
||||||
|
|
||||||
|
String fileName = "files/textOnColoredBackground.pdf";
|
||||||
|
String resultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL");
|
||||||
|
String deltaResultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL_DELTA");
|
||||||
|
|
||||||
|
try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(resultFileName)) {
|
||||||
|
invisibleElementRemovalService.removeInvisibleElements(in, out, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(deltaResultFileName)) {
|
||||||
|
invisibleElementRemovalService.removeInvisibleElements(in, out, true);
|
||||||
|
}
|
||||||
|
try (var in = new FileInputStream(deltaResultFileName)) {
|
||||||
|
String result = PdfTextExtraction.extractAllTextFromDocument(in);
|
||||||
|
assertThat(result).contains("#1 Dark",
|
||||||
|
"#13 Yellow",
|
||||||
|
"Ut wisi enim ad minim veniam, quis nostrud exerci tation ullamcorper suscipit lobortis nisl ut aliquip\n" + "ex ea commodo consequat. Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie\n" + "consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim\n" + "qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi.");
|
||||||
|
assertThat(result).doesNotContain("Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut\n" + "labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et\n" + "ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem\n" + "ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et\n" + "dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea\n" + "rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum\n" + "dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore\n" + "magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet\n" + "clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. ");
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@SneakyThrows
|
||||||
|
void removeInvisibleElementsThinFilledTable() {
|
||||||
|
|
||||||
|
String fileName = "files/tableIsSingleLinePath.pdf";
|
||||||
|
String resultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL");
|
||||||
|
String deltaResultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL_DELTA");
|
||||||
|
|
||||||
|
try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(resultFileName)) {
|
||||||
|
invisibleElementRemovalService.removeInvisibleElements(in, out, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(deltaResultFileName)) {
|
||||||
|
invisibleElementRemovalService.removeInvisibleElements(in, out, true);
|
||||||
|
}
|
||||||
|
try (var in = new FileInputStream(deltaResultFileName)) {
|
||||||
|
String result = PdfTextExtraction.extractAllTextFromDocument(in);
|
||||||
|
assertThat(result).isEqualTo("");
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@SneakyThrows
|
||||||
|
void removeInvisibleElementsChineseOverlapped() {
|
||||||
|
|
||||||
|
String fileName = "files/chineseInvisibleElements.pdf";
|
||||||
|
String resultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL");
|
||||||
|
String deltaResultFileName = OsUtils.createTmpFileName(fileName, "INVISIBLE_REMOVAL_DELTA");
|
||||||
|
|
||||||
|
try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(resultFileName)) {
|
||||||
|
invisibleElementRemovalService.removeInvisibleElements(in, out, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new FileOutputStream(deltaResultFileName)) {
|
||||||
|
invisibleElementRemovalService.removeInvisibleElements(in, out, true);
|
||||||
|
}
|
||||||
|
try (var in = new FileInputStream(deltaResultFileName)) {
|
||||||
|
String result = PdfTextExtraction.extractAllTextFromDocument(in);
|
||||||
|
assertThat(result).contains("[Table_KeyInfo]", "[Table_StockInfo]", "[Table_BaseInfo]", "国内无线键鼠龙头企", "业", "精研研发发制制造造商商先先的的本");
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,31 @@
|
|||||||
|
package com.iqser.red.pdftronlogic.commons;
|
||||||
|
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.util.Locale;
|
||||||
|
|
||||||
|
import org.junit.platform.commons.util.StringUtils;
|
||||||
|
|
||||||
|
public class OsUtils {
|
||||||
|
|
||||||
|
private static boolean isWindows() {
|
||||||
|
|
||||||
|
return System.getProperty("os.name").toLowerCase(Locale.ROOT).contains("windows");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static String getTemporaryDirectory() {
|
||||||
|
|
||||||
|
String tmpdir = System.getProperty("java.io.tmpdir");
|
||||||
|
if (isWindows() && StringUtils.isNotBlank(tmpdir)) {
|
||||||
|
return tmpdir;
|
||||||
|
}
|
||||||
|
return "/tmp";
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static String createTmpFileName(String filename, String suffix) {
|
||||||
|
|
||||||
|
return Path.of(OsUtils.getTemporaryDirectory()).resolve(Path.of(filename).getFileName()).toString().replace(".pdf", "_" + suffix + ".pdf");
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
BIN
src/test/resources/files/InvisibleText.pdf
Normal file
BIN
src/test/resources/files/InvisibleText.pdf
Normal file
Binary file not shown.
BIN
src/test/resources/files/chineseInvisibleElements.pdf
Normal file
BIN
src/test/resources/files/chineseInvisibleElements.pdf
Normal file
Binary file not shown.
BIN
src/test/resources/files/tableIsSingleLinePath.pdf
Normal file
BIN
src/test/resources/files/tableIsSingleLinePath.pdf
Normal file
Binary file not shown.
BIN
src/test/resources/files/textOnColoredBackground.pdf
Normal file
BIN
src/test/resources/files/textOnColoredBackground.pdf
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user