Compare commits
4 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c9424a5f4b | ||
|
|
e86e6fba2a | ||
|
|
ff9fd7bd44 | ||
|
|
e6a1656e18 |
@ -27,7 +27,7 @@ repositories {
|
||||
dependencies {
|
||||
api("org.projectlombok:lombok:1.18.30")
|
||||
api("com.google.guava:guava:33.0.0-jre")
|
||||
api("com.pdftron:PDFNet:10.11.0")
|
||||
api("com.pdftron:PDFNet:11.0.0")
|
||||
testImplementation("net.sourceforge.lept4j:lept4j:1.19.1")
|
||||
testImplementation("org.junit.jupiter:junit-jupiter:5.10.2")
|
||||
testImplementation("org.assertj:assertj-core:3.24.2")
|
||||
|
||||
@ -163,7 +163,7 @@ public class InvisibleElementRemovalService {
|
||||
|
||||
visitedXObjIds.add(page.getSDFObj().getObjNum());
|
||||
|
||||
InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder()
|
||||
try (InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder()
|
||||
.reader(reader)
|
||||
.clippingPathStack(new ClippingPathStack(page.getMediaBox()))
|
||||
.markedContentStack(new MarkedContentStack(pdfDoc))
|
||||
@ -173,14 +173,15 @@ public class InvisibleElementRemovalService {
|
||||
.visibleElements(new ElementFeatureLookup())
|
||||
.visitedXObjIds(visitedXObjIds)
|
||||
.markedContentToIgnore(markedContentToIgnore)
|
||||
.build();
|
||||
.build()) {
|
||||
|
||||
removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context);
|
||||
removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context);
|
||||
|
||||
context.visitedXObjIds().clear();
|
||||
context.markedContentStack().clear();
|
||||
context.visitedXObjIds().clear();
|
||||
context.markedContentStack().clear();
|
||||
|
||||
removeOverlappedElements(page, writer, context);
|
||||
removeOverlappedElements(page, writer, context);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -248,10 +249,7 @@ public class InvisibleElementRemovalService {
|
||||
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
|
||||
if (inClippingPath) {
|
||||
ImageFeatures imageFeatures = ElementFeatureFactory.buildImage(imageElement);
|
||||
if (!(context.markedContentStack.contextHasTransparency()
|
||||
|| imageFeatures.isTransparent()
|
||||
|| imageFeatures.isImageMask()
|
||||
|| imageFeatures.isSoftMask())) {
|
||||
if (!(context.markedContentStack.contextHasTransparency() || imageFeatures.isTransparent() || imageFeatures.isImageMask() || imageFeatures.isSoftMask())) {
|
||||
calculateOverlaps(context, imageFeatures, imageFeatures.isMasked());
|
||||
}
|
||||
context.visibleElements().add(imageFeatures);
|
||||
@ -280,7 +278,7 @@ public class InvisibleElementRemovalService {
|
||||
boolean isTextVisible = isTextRenderedVisibly(gState, textBBox, context);
|
||||
|
||||
if (inClippingPath && isTextVisible) {
|
||||
context.visibleElements().add(ElementFeatureFactory.buildText(textElement, context.delta()));
|
||||
context.visibleElements().add(ElementFeatureFactory.buildText(textElement, true, context.delta()));
|
||||
}
|
||||
if (!context.delta()) {
|
||||
if (inClippingPath && isTextVisible) {
|
||||
@ -292,7 +290,8 @@ public class InvisibleElementRemovalService {
|
||||
Therefore, the position of a following Tj is affected by not writing the first Element.
|
||||
This is why we write only the Tm command:
|
||||
*/
|
||||
writer.writeGStateChanges(textElement);
|
||||
textElement.setTextData(new byte[]{});
|
||||
writer.writeElement(textElement);
|
||||
}
|
||||
} else {
|
||||
if (!inClippingPath) {
|
||||
@ -433,7 +432,7 @@ public class InvisibleElementRemovalService {
|
||||
context.reader().end();
|
||||
|
||||
if (!context.overlappedElements().isEmpty()) {
|
||||
log.warn(context.overlappedElements().size() + " overlapped elements have not been found or removed");
|
||||
log.debug(context.overlappedElements().size() + " overlapped elements have not been found or removed");
|
||||
}
|
||||
}
|
||||
|
||||
@ -492,7 +491,8 @@ public class InvisibleElementRemovalService {
|
||||
Therefore, the position of a following Tj is affected by not writing the first Element.
|
||||
This is why we write only the Tm command:
|
||||
*/
|
||||
writer.writeGStateChanges(element);
|
||||
element.setTextData(new byte[]{});
|
||||
writer.writeElement(element);
|
||||
}
|
||||
} else {
|
||||
writer.writeElement(element);
|
||||
@ -614,7 +614,14 @@ public class InvisibleElementRemovalService {
|
||||
ElementFeatureLookup visibleElements,
|
||||
Set<Long> visitedXObjIds,
|
||||
Set<String> markedContentToIgnore
|
||||
) {
|
||||
) implements AutoCloseable {
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
|
||||
overlappedElements.close();
|
||||
visibleElements.close();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -0,0 +1,49 @@
|
||||
package com.iqser.red.pdftronlogic.commons;
|
||||
|
||||
import java.lang.reflect.Field;
|
||||
|
||||
import com.pdftron.pdf.Font;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class PDFNetUtils {
|
||||
|
||||
@SuppressWarnings("PMD")
|
||||
public void requireFontNotClosed(Font font) {
|
||||
|
||||
try {
|
||||
if (font.__GetHandle() == 0L) {
|
||||
throw new AssertionError("Font is already closed!");
|
||||
}
|
||||
Object refHandle = font.__GetRefHandle();
|
||||
|
||||
Class<?> clazz = refHandle.getClass();
|
||||
|
||||
Field implField = null;
|
||||
while (clazz != null) {
|
||||
try {
|
||||
|
||||
implField = clazz.getDeclaredField("impl");
|
||||
implField.setAccessible(true);
|
||||
break;
|
||||
} catch (NoSuchFieldException e) {
|
||||
clazz = clazz.getSuperclass();
|
||||
}
|
||||
}
|
||||
|
||||
if (implField != null) {
|
||||
long implValue = (Long) implField.get(refHandle);
|
||||
|
||||
if (implValue == 0L) {
|
||||
throw new AssertionError("Associated ElementReader of Font is already closed!");
|
||||
}
|
||||
}
|
||||
|
||||
} catch (IllegalAccessException e) {
|
||||
throw new AssertionError("Font Ref is missing the field impl, should never happen!");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
@ -85,7 +85,7 @@ public class PdfTextExtraction {
|
||||
|
||||
for (Element element = reader.next(); element != null; element = reader.next()) {
|
||||
switch (element.getType()) {
|
||||
case Element.e_text -> textFeaturesOnPage.add(ElementFeatureFactory.buildText(element, includePathData));
|
||||
case Element.e_text -> textFeaturesOnPage.add(ElementFeatureFactory.buildText(element, includePathData, includePathData));
|
||||
case Element.e_form -> {
|
||||
Obj formObj = element.getXObject();
|
||||
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
package com.iqser.red.pdftronlogic.commons.features;
|
||||
|
||||
import java.awt.geom.GeneralPath;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
@ -14,7 +13,7 @@ import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.Font;
|
||||
import com.pdftron.pdf.GState;
|
||||
import com.pdftron.pdf.Image;
|
||||
import com.pdftron.pdf.PathData;
|
||||
import com.pdftron.sdf.Obj;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.UtilityClass;
|
||||
@ -26,7 +25,7 @@ public class ElementFeatureFactory {
|
||||
|
||||
return switch (element.getType()) {
|
||||
case Element.e_path -> buildPath(element);
|
||||
case Element.e_text -> buildText(element);
|
||||
case Element.e_text -> buildText(element, false, false);
|
||||
case Element.e_image, Element.e_inline_image -> buildImage(element);
|
||||
case Element.e_form -> buildForm(element);
|
||||
// This technically should never happen; it's a safety net
|
||||
@ -72,7 +71,7 @@ public class ElementFeatureFactory {
|
||||
boolean masked = false;
|
||||
if (element.getType() == Element.e_image) {
|
||||
Image image = new Image(element.getXObject());
|
||||
if (image.getMask() != null) {
|
||||
if (image.getMask() != null && image.getMask().getType() == Obj.e_stream) {
|
||||
Image imageMask = new Image(image.getMask());
|
||||
masked = imageMask.isImageMask();
|
||||
}
|
||||
@ -94,27 +93,25 @@ public class ElementFeatureFactory {
|
||||
}
|
||||
|
||||
|
||||
public TextFeatures buildText(Element element) throws PDFNetException {
|
||||
|
||||
return buildText(element, false);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Use includePathData = true, when trying to draw the glyphs, see GlyphExtractionTest
|
||||
Use includeGlyphs = true and preComputePathData = true, when trying to draw the glyphs, see GlyphExtractionTest
|
||||
preComputePathData = true is needed when the PathData is accessed after the PDFDoc/ElementReader has been closed
|
||||
*/
|
||||
public TextFeatures buildText(Element element, boolean includePathData) throws PDFNetException {
|
||||
public TextFeatures buildText(Element element, boolean includeGlyphs, boolean preComputePathData) throws PDFNetException {
|
||||
|
||||
try (var bbox = element.getBBox()) {
|
||||
|
||||
return TextFeatures.builder()
|
||||
TextFeatures.TextFeaturesBuilder<?, ?> simpleTextFeatures = TextFeatures.builder()
|
||||
.elementType(element.getType())
|
||||
.boundingBox(Converter.toRectangle2D(bbox))
|
||||
.text(element.getTextString())
|
||||
.font(element.getGState().getFont().getType())
|
||||
.fontsize(element.getGState().getFontSize())
|
||||
.glyphs(extractGlyphInfo(element, includePathData))
|
||||
.build();
|
||||
.fontsize(element.getGState().getFontSize());
|
||||
|
||||
if (includeGlyphs) {
|
||||
simpleTextFeatures.glyphs(extractGlyphInfo(element, preComputePathData));
|
||||
}
|
||||
return simpleTextFeatures.build();
|
||||
}
|
||||
}
|
||||
|
||||
@ -139,7 +136,7 @@ public class ElementFeatureFactory {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private List<GlyphInfo> extractGlyphInfo(Element textElement, boolean includePathData) {
|
||||
private List<GlyphInfo> extractGlyphInfo(Element textElement, boolean precomputePathData) {
|
||||
|
||||
assert textElement != null && textElement.getType() == Element.e_text;
|
||||
|
||||
@ -157,34 +154,29 @@ public class ElementFeatureFactory {
|
||||
}
|
||||
|
||||
List<GlyphInfo> glyphs = new ArrayList<>();
|
||||
short unitsPerEm = font.getUnitsPerEm();
|
||||
|
||||
try (CharIterator charIterator = textElement.getCharIterator(); Matrix2D ctm = textElement.getCTM().multiply(textElement.getTextMatrix());) {
|
||||
|
||||
try (CharIterator charIterator = textElement.getCharIterator()) {
|
||||
while (charIterator.hasNext()) {
|
||||
CharData charData = charIterator.next();
|
||||
long charCode = charData.getCharCode();
|
||||
String glyphText = new String(font.mapToUnicode(charCode));
|
||||
|
||||
if (Character.isWhitespace(glyphText.charAt(0))) {
|
||||
continue;
|
||||
}
|
||||
try (Matrix2D fontMatrix = computeFontMatrix(charData, textElement, unitsPerEm)) {
|
||||
|
||||
try (Matrix2D fontMatrix = computeFontMatrix(charData, textElement, font); //
|
||||
Matrix2D glyphMatrix = textElement.getCTM()//
|
||||
.multiply(textElement.getTextMatrix())//
|
||||
.multiply(fontMatrix)) {
|
||||
PathData pathData = font.getGlyphPath(charCode, true, glyphMatrix);
|
||||
if (pathData.getOperators().length == 1 && pathData.getOperators()[0] == 6) {
|
||||
// This happens for some chinese characters or whitespaces, don't know why...
|
||||
continue;
|
||||
GlyphInfo glyph = GlyphInfo.builder() //
|
||||
.charCode(charCode) //
|
||||
.cachePathData(precomputePathData) //
|
||||
.glyphMatrix(ctm.multiply(fontMatrix)) //
|
||||
.font(font) //
|
||||
.build();
|
||||
|
||||
glyphs.add(glyph);
|
||||
|
||||
if (precomputePathData) {
|
||||
// call the functions once to cache all data
|
||||
glyph.getBoundingBox();
|
||||
}
|
||||
GeneralPath glyphPath = Converter.convertToGeneralPath(pathData);
|
||||
GlyphInfo.GlyphInfoBuilder glyphInfo = GlyphInfo.builder().unicode(glyphText).bbox(glyphPath.getBounds2D());
|
||||
|
||||
if (includePathData) {
|
||||
glyphInfo.pathData(pathData);
|
||||
}
|
||||
|
||||
glyphs.add(glyphInfo.build());
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -194,9 +186,9 @@ public class ElementFeatureFactory {
|
||||
}
|
||||
|
||||
|
||||
private Matrix2D computeFontMatrix(CharData charData, Element textElement, Font font) throws PDFNetException {
|
||||
private Matrix2D computeFontMatrix(CharData charData, Element textElement, short unitsPerEm) throws PDFNetException {
|
||||
|
||||
double yScaleFactor = textElement.getGState().getFontSize() / font.getUnitsPerEm();
|
||||
double yScaleFactor = textElement.getGState().getFontSize() / unitsPerEm;
|
||||
double xScaleFactor = (textElement.getGState().getHorizontalScale() / 100) * yScaleFactor;
|
||||
|
||||
return new Matrix2D(xScaleFactor, 0, 0, -yScaleFactor, charData.getGlyphX(), charData.getGlyphY());
|
||||
|
||||
@ -94,4 +94,9 @@ public class ElementFeatures {
|
||||
return overlapShape.contains(ComparisonUtils.shrinkRectangle(boundingBox));
|
||||
}
|
||||
|
||||
|
||||
public void destroy() {
|
||||
// do nothing, except for text
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,30 +1,40 @@
|
||||
package com.iqser.red.pdftronlogic.commons.features;
|
||||
|
||||
import java.awt.Shape;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Optional;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.ComparisonUtils;
|
||||
import com.iqser.red.pdftronlogic.commons.Converter;
|
||||
import com.iqser.red.pdftronlogic.commons.PDFNetUtils;
|
||||
import com.pdftron.common.Matrix2D;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.Font;
|
||||
import com.pdftron.pdf.PathData;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class GlyphInfo {
|
||||
|
||||
@Getter
|
||||
final String unicode;
|
||||
@Getter
|
||||
final Rectangle2D bbox;
|
||||
final PathData pathData;
|
||||
final Matrix2D glyphMatrix;
|
||||
final long charCode;
|
||||
final Font font;
|
||||
|
||||
// in order to speed up invisible element removal, we only calculate the pathdata where necessary, as it is the costliest operation.
|
||||
// It will only work as long as the associated ElementReader is still open, as the Font is bound to the ContentStream being read.
|
||||
Rectangle2D bbox;
|
||||
final boolean cachePathData;
|
||||
PathData pathData;
|
||||
|
||||
boolean overlapped;
|
||||
ElementFeatures overlappingElement;
|
||||
@ -35,8 +45,12 @@ public class GlyphInfo {
|
||||
if (overlapped) {
|
||||
return true;
|
||||
}
|
||||
Optional<Rectangle2D> bbox = getBoundingBox();
|
||||
if (bbox.isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (overlappingElement.getOverlapShape().contains(ComparisonUtils.shrinkRectangle(bbox))) {
|
||||
if (overlappingElement.getOverlapShape().contains(ComparisonUtils.shrinkRectangle(bbox.get()))) {
|
||||
overlapped = true;
|
||||
this.overlappingElement = overlappingElement;
|
||||
}
|
||||
@ -46,25 +60,57 @@ public class GlyphInfo {
|
||||
}
|
||||
|
||||
|
||||
public boolean matches(GlyphInfo glyph2) {
|
||||
public String getUnicode() {
|
||||
|
||||
return unicode.equals(glyph2.unicode)//
|
||||
&& calculateIntersectedArea(glyph2.bbox, bbox) > 0.9 * Math.min(bbox.getWidth() * bbox.getHeight(), glyph2.bbox.getHeight() * glyph2.bbox.getWidth());
|
||||
try {
|
||||
return new String(font.mapToUnicode(charCode));
|
||||
} catch (PDFNetException e) {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public Optional<PathData> getPathData() {
|
||||
|
||||
return Optional.ofNullable(pathData);
|
||||
if (pathData == null) {
|
||||
|
||||
PDFNetUtils.requireFontNotClosed(font);
|
||||
|
||||
PathData computedPathData = font.getGlyphPath(charCode, true, glyphMatrix);
|
||||
if (computedPathData.getOperators().length == 1 && computedPathData.getOperators()[0] == 6) {
|
||||
// This happens for some Chinese characters or whitespace; the cause is unknown.
|
||||
return Optional.empty();
|
||||
}
|
||||
if (cachePathData) {
|
||||
pathData = computedPathData;
|
||||
}
|
||||
return Optional.of(computedPathData);
|
||||
}
|
||||
return Optional.of(pathData);
|
||||
}
|
||||
|
||||
|
||||
private static double calculateIntersectedArea(Rectangle2D r1, Rectangle2D r2) {
|
||||
@SneakyThrows
|
||||
public Optional<Rectangle2D> getBoundingBox() {
|
||||
|
||||
double xOverlap = Math.max(0, Math.min(r1.getMaxX(), r2.getMaxX()) - Math.max(r1.getMinX(), r2.getMinX()));
|
||||
double yOverlap = Math.max(0, Math.min(r1.getMaxY(), r2.getMaxY()) - Math.max(r1.getY(), r2.getY()));
|
||||
if (bbox == null) {
|
||||
Optional<PathData> pathData = getPathData();
|
||||
if (pathData.isEmpty()) {
|
||||
return Optional.empty();
|
||||
}
|
||||
bbox = Converter.convertToGeneralPath(pathData.get()).getBounds2D();
|
||||
}
|
||||
return Optional.of(bbox);
|
||||
}
|
||||
|
||||
return xOverlap * yOverlap;
|
||||
|
||||
@SneakyThrows
|
||||
public void destroy() {
|
||||
|
||||
if (glyphMatrix != null) {
|
||||
glyphMatrix.close();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -5,8 +5,6 @@ import static com.iqser.red.pdftronlogic.commons.ComparisonUtils.almostEqual;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.ComparisonUtils;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Builder;
|
||||
import lombok.EqualsAndHashCode;
|
||||
@ -24,6 +22,7 @@ public class TextFeatures extends ElementFeatures {
|
||||
String text;
|
||||
int font;
|
||||
double fontsize;
|
||||
|
||||
@Builder.Default
|
||||
List<GlyphInfo> glyphs = new ArrayList<>();
|
||||
|
||||
@ -41,20 +40,6 @@ public class TextFeatures extends ElementFeatures {
|
||||
}
|
||||
|
||||
|
||||
private boolean glyphsMatch(TextFeatures textFeaturesElement) {
|
||||
|
||||
if (glyphs.size() != textFeaturesElement.getGlyphs().size()) {
|
||||
return false;
|
||||
}
|
||||
for (int i = 0; i < glyphs.size(); i++) {
|
||||
if (!glyphs.get(i).matches(textFeaturesElement.getGlyphs().get(i))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
public boolean testOverlapped(ElementFeatures overlappingElement) {
|
||||
|
||||
if (glyphs.isEmpty()) {
|
||||
@ -65,4 +50,11 @@ public class TextFeatures extends ElementFeatures {
|
||||
.allMatch(glyph -> glyph.testOverlapped(overlappingElement));
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void destroy() {
|
||||
|
||||
glyphs.forEach(GlyphInfo::destroy);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,15 +1,16 @@
|
||||
package com.iqser.red.pdftronlogic.commons.lookup;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.HashSet;
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.function.Consumer;
|
||||
import java.util.function.Predicate;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.Converter;
|
||||
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
|
||||
import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.Rect;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
@ -21,13 +22,13 @@ import lombok.experimental.FieldDefaults;
|
||||
@Getter
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class ElementFeatureLookup {
|
||||
public class ElementFeatureLookup implements AutoCloseable {
|
||||
/*
|
||||
This class looks a bit weird and uses visitors since I tried to use the quadtree implementation by locationtech, since it uses Rectangles by default to query its data structure.
|
||||
Unfortunately there were always edge cases, where it lost a few elements making it completely unusable. Further, it didn't even speed up the algorithm all that much.
|
||||
This class looks a bit weird and uses visitors since I tried to use the quadtree implementation by locationtech, as it uses Rectangles by default to query its data structure.
|
||||
Unfortunately there were always edge cases where it lost a few elements making it completely unusable. Further, it didn't even speed up the algorithm all that much.
|
||||
*/
|
||||
|
||||
Set<ElementFeatures> allElements = new HashSet<>();
|
||||
List<ElementFeatures> allElements = new ArrayList<>();
|
||||
|
||||
|
||||
public void add(ElementFeatures elementFeatures) {
|
||||
@ -69,9 +70,23 @@ public class ElementFeatureLookup {
|
||||
|
||||
public List<ElementFeatures> findOverlapped(ElementFeatures overlappingElement, boolean textOnly) {
|
||||
|
||||
OverlapVisitor overlapVisitor = new OverlapVisitor(overlappingElement, textOnly);
|
||||
forEach(overlapVisitor::visitItem);
|
||||
return overlapVisitor.getOverlappedElementFeatures();
|
||||
List<ElementFeatures> overlappedElementFeatures = new LinkedList<>();
|
||||
|
||||
for (int i = 0; i < allElements.size(); i++) {
|
||||
ElementFeatures features = allElements.get(i);
|
||||
|
||||
if (textOnly && features.getElementType() != Element.e_text) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (features.getBoundingBox().intersects(overlappingElement.getBoundingBox())) {
|
||||
if (features.testOverlapped(overlappingElement)) {
|
||||
overlappedElementFeatures.add(features);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return overlappedElementFeatures;
|
||||
}
|
||||
|
||||
|
||||
@ -101,13 +116,20 @@ public class ElementFeatureLookup {
|
||||
|
||||
public void addAll(List<ElementFeatures> currentOverlappedElements) {
|
||||
|
||||
currentOverlappedElements.forEach(this::add);
|
||||
allElements.addAll(currentOverlappedElements);
|
||||
}
|
||||
|
||||
|
||||
public void removeAll(List<ElementFeatures> currentOverlappedElements) {
|
||||
|
||||
currentOverlappedElements.forEach(this::remove);
|
||||
allElements.removeAll(currentOverlappedElements);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
|
||||
allElements.forEach(ElementFeatures::destroy);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,42 +0,0 @@
|
||||
package com.iqser.red.pdftronlogic.commons.lookup;
|
||||
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.ComparisonUtils;
|
||||
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
|
||||
import com.pdftron.pdf.Element;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class OverlapVisitor implements ElementFeatureVisitor {
|
||||
|
||||
ElementFeatures overlappingElement;
|
||||
|
||||
boolean textOnly;
|
||||
|
||||
@Getter
|
||||
List<ElementFeatures> overlappedElementFeatures = new LinkedList<>();
|
||||
|
||||
|
||||
@Override
|
||||
public void visitItem(ElementFeatures features) {
|
||||
|
||||
if (textOnly && features.getElementType() != Element.e_text) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (ComparisonUtils.padRectangle(features.getBoundingBox()).intersects(ComparisonUtils.padRectangle(overlappingElement.getBoundingBox()))) {
|
||||
if (features.testOverlapped(overlappingElement)) {
|
||||
overlappedElementFeatures.add(features);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -8,7 +8,9 @@ import java.io.FileOutputStream;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.features.GlyphInfo;
|
||||
@ -23,10 +25,11 @@ import com.pdftron.sdf.SDFDoc;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@Disabled // makes no sense to run in pipeline
|
||||
public class GlyphExtractionTest {
|
||||
|
||||
@BeforeEach
|
||||
void createService() {
|
||||
@BeforeAll
|
||||
static void init() {
|
||||
|
||||
PDFNet.initialize(PDFTronConfig.license);
|
||||
}
|
||||
@ -69,11 +72,11 @@ public class GlyphExtractionTest {
|
||||
|
||||
for (GlyphInfo glyph : textFeatures.getGlyphs()) {
|
||||
|
||||
if (glyph.getPathData().isPresent()) {
|
||||
if (glyph.getPathData().isPresent() && glyph.getBoundingBox().isPresent()) {
|
||||
drawPathData(glyph.getPathData().get(), builder, writer, Color.BLACK);
|
||||
drawRect(ComparisonUtils.shrinkRectangle(glyph.getBoundingBox().get()), builder, writer, Color.RED);
|
||||
drawRect(glyph.getBoundingBox().get(), builder, writer, Color.MAGENTA);
|
||||
}
|
||||
drawRect(ComparisonUtils.shrinkRectangle(glyph.getBbox()), builder, writer, Color.RED);
|
||||
drawRect(glyph.getBbox(), builder, writer, Color.MAGENTA);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -3,11 +3,13 @@ package com.iqser.red.pdftronlogic.commons;
|
||||
import static com.iqser.red.pdftronlogic.commons.PdfTextExtraction.extractAllTextFromDocument;
|
||||
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
@ -22,10 +24,16 @@ class InvisibleElementRemovalServiceTest {
|
||||
InvisibleElementRemovalService invisibleElementRemovalService;
|
||||
|
||||
|
||||
@BeforeEach
|
||||
void createService() {
|
||||
@BeforeAll
|
||||
static void init() {
|
||||
|
||||
PDFNet.initialize(PDFTronConfig.license);
|
||||
}
|
||||
|
||||
|
||||
@BeforeEach
|
||||
void createServices() {
|
||||
|
||||
invisibleElementRemovalService = new InvisibleElementRemovalService();
|
||||
}
|
||||
|
||||
@ -57,6 +65,19 @@ class InvisibleElementRemovalServiceTest {
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
void page32DoesNotCrash() {
|
||||
|
||||
String fileName = "files/Page32.pdf";
|
||||
|
||||
try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new ByteArrayOutputStream()) {
|
||||
invisibleElementRemovalService.removeInvisibleElements(in, out, false);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
void removeInvisibleTextClippedByFormObjects() {
|
||||
|
||||
BIN
src/test/resources/files/Page32.pdf
Normal file
BIN
src/test/resources/files/Page32.pdf
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user