Compare commits
4 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c9424a5f4b | ||
|
|
e86e6fba2a | ||
|
|
ff9fd7bd44 | ||
|
|
e6a1656e18 |
@ -27,7 +27,7 @@ repositories {
|
|||||||
dependencies {
|
dependencies {
|
||||||
api("org.projectlombok:lombok:1.18.30")
|
api("org.projectlombok:lombok:1.18.30")
|
||||||
api("com.google.guava:guava:33.0.0-jre")
|
api("com.google.guava:guava:33.0.0-jre")
|
||||||
api("com.pdftron:PDFNet:10.11.0")
|
api("com.pdftron:PDFNet:11.0.0")
|
||||||
testImplementation("net.sourceforge.lept4j:lept4j:1.19.1")
|
testImplementation("net.sourceforge.lept4j:lept4j:1.19.1")
|
||||||
testImplementation("org.junit.jupiter:junit-jupiter:5.10.2")
|
testImplementation("org.junit.jupiter:junit-jupiter:5.10.2")
|
||||||
testImplementation("org.assertj:assertj-core:3.24.2")
|
testImplementation("org.assertj:assertj-core:3.24.2")
|
||||||
|
|||||||
@ -163,7 +163,7 @@ public class InvisibleElementRemovalService {
|
|||||||
|
|
||||||
visitedXObjIds.add(page.getSDFObj().getObjNum());
|
visitedXObjIds.add(page.getSDFObj().getObjNum());
|
||||||
|
|
||||||
InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder()
|
try (InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder()
|
||||||
.reader(reader)
|
.reader(reader)
|
||||||
.clippingPathStack(new ClippingPathStack(page.getMediaBox()))
|
.clippingPathStack(new ClippingPathStack(page.getMediaBox()))
|
||||||
.markedContentStack(new MarkedContentStack(pdfDoc))
|
.markedContentStack(new MarkedContentStack(pdfDoc))
|
||||||
@ -173,14 +173,15 @@ public class InvisibleElementRemovalService {
|
|||||||
.visibleElements(new ElementFeatureLookup())
|
.visibleElements(new ElementFeatureLookup())
|
||||||
.visitedXObjIds(visitedXObjIds)
|
.visitedXObjIds(visitedXObjIds)
|
||||||
.markedContentToIgnore(markedContentToIgnore)
|
.markedContentToIgnore(markedContentToIgnore)
|
||||||
.build();
|
.build()) {
|
||||||
|
|
||||||
removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context);
|
removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context);
|
||||||
|
|
||||||
context.visitedXObjIds().clear();
|
context.visitedXObjIds().clear();
|
||||||
context.markedContentStack().clear();
|
context.markedContentStack().clear();
|
||||||
|
|
||||||
removeOverlappedElements(page, writer, context);
|
removeOverlappedElements(page, writer, context);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -248,10 +249,7 @@ public class InvisibleElementRemovalService {
|
|||||||
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
|
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
|
||||||
if (inClippingPath) {
|
if (inClippingPath) {
|
||||||
ImageFeatures imageFeatures = ElementFeatureFactory.buildImage(imageElement);
|
ImageFeatures imageFeatures = ElementFeatureFactory.buildImage(imageElement);
|
||||||
if (!(context.markedContentStack.contextHasTransparency()
|
if (!(context.markedContentStack.contextHasTransparency() || imageFeatures.isTransparent() || imageFeatures.isImageMask() || imageFeatures.isSoftMask())) {
|
||||||
|| imageFeatures.isTransparent()
|
|
||||||
|| imageFeatures.isImageMask()
|
|
||||||
|| imageFeatures.isSoftMask())) {
|
|
||||||
calculateOverlaps(context, imageFeatures, imageFeatures.isMasked());
|
calculateOverlaps(context, imageFeatures, imageFeatures.isMasked());
|
||||||
}
|
}
|
||||||
context.visibleElements().add(imageFeatures);
|
context.visibleElements().add(imageFeatures);
|
||||||
@ -280,7 +278,7 @@ public class InvisibleElementRemovalService {
|
|||||||
boolean isTextVisible = isTextRenderedVisibly(gState, textBBox, context);
|
boolean isTextVisible = isTextRenderedVisibly(gState, textBBox, context);
|
||||||
|
|
||||||
if (inClippingPath && isTextVisible) {
|
if (inClippingPath && isTextVisible) {
|
||||||
context.visibleElements().add(ElementFeatureFactory.buildText(textElement, context.delta()));
|
context.visibleElements().add(ElementFeatureFactory.buildText(textElement, true, context.delta()));
|
||||||
}
|
}
|
||||||
if (!context.delta()) {
|
if (!context.delta()) {
|
||||||
if (inClippingPath && isTextVisible) {
|
if (inClippingPath && isTextVisible) {
|
||||||
@ -292,7 +290,8 @@ public class InvisibleElementRemovalService {
|
|||||||
Therefore, the position of a following Tj is affected by not writing the first Element.
|
Therefore, the position of a following Tj is affected by not writing the first Element.
|
||||||
This is why, we write only the Tm command:
|
This is why, we write only the Tm command:
|
||||||
*/
|
*/
|
||||||
writer.writeGStateChanges(textElement);
|
textElement.setTextData(new byte[]{});
|
||||||
|
writer.writeElement(textElement);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (!inClippingPath) {
|
if (!inClippingPath) {
|
||||||
@ -433,7 +432,7 @@ public class InvisibleElementRemovalService {
|
|||||||
context.reader().end();
|
context.reader().end();
|
||||||
|
|
||||||
if (!context.overlappedElements().isEmpty()) {
|
if (!context.overlappedElements().isEmpty()) {
|
||||||
log.warn(context.overlappedElements().size() + " overlapped elements have not been found or removed");
|
log.debug(context.overlappedElements().size() + " overlapped elements have not been found or removed");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -492,7 +491,8 @@ public class InvisibleElementRemovalService {
|
|||||||
Therefore, the position of a following Tj is affected by not writing the first Element.
|
Therefore, the position of a following Tj is affected by not writing the first Element.
|
||||||
This is why, we write only the Tm command:
|
This is why, we write only the Tm command:
|
||||||
*/
|
*/
|
||||||
writer.writeGStateChanges(element);
|
element.setTextData(new byte[]{});
|
||||||
|
writer.writeElement(element);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
writer.writeElement(element);
|
writer.writeElement(element);
|
||||||
@ -614,7 +614,14 @@ public class InvisibleElementRemovalService {
|
|||||||
ElementFeatureLookup visibleElements,
|
ElementFeatureLookup visibleElements,
|
||||||
Set<Long> visitedXObjIds,
|
Set<Long> visitedXObjIds,
|
||||||
Set<String> markedContentToIgnore
|
Set<String> markedContentToIgnore
|
||||||
) {
|
) implements AutoCloseable {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void close() {
|
||||||
|
|
||||||
|
overlappedElements.close();
|
||||||
|
visibleElements.close();
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -0,0 +1,49 @@
|
|||||||
|
package com.iqser.red.pdftronlogic.commons;
|
||||||
|
|
||||||
|
import java.lang.reflect.Field;
|
||||||
|
|
||||||
|
import com.pdftron.pdf.Font;
|
||||||
|
|
||||||
|
import lombok.experimental.UtilityClass;
|
||||||
|
|
||||||
|
@UtilityClass
|
||||||
|
public class PDFNetUtils {
|
||||||
|
|
||||||
|
@SuppressWarnings("PMD")
|
||||||
|
public void requireFontNotClosed(Font font) {
|
||||||
|
|
||||||
|
try {
|
||||||
|
if (font.__GetHandle() == 0L) {
|
||||||
|
throw new AssertionError("Font is already closed!");
|
||||||
|
}
|
||||||
|
Object refHandle = font.__GetRefHandle();
|
||||||
|
|
||||||
|
Class<?> clazz = refHandle.getClass();
|
||||||
|
|
||||||
|
Field implField = null;
|
||||||
|
while (clazz != null) {
|
||||||
|
try {
|
||||||
|
|
||||||
|
implField = clazz.getDeclaredField("impl");
|
||||||
|
implField.setAccessible(true);
|
||||||
|
break;
|
||||||
|
} catch (NoSuchFieldException e) {
|
||||||
|
clazz = clazz.getSuperclass();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (implField != null) {
|
||||||
|
long implValue = (Long) implField.get(refHandle);
|
||||||
|
|
||||||
|
if (implValue == 0L) {
|
||||||
|
throw new AssertionError("Associated ElementReader of Font is already closed!");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} catch (IllegalAccessException e) {
|
||||||
|
throw new AssertionError("Font Ref is missing the field impl, should never happen!");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
@ -85,7 +85,7 @@ public class PdfTextExtraction {
|
|||||||
|
|
||||||
for (Element element = reader.next(); element != null; element = reader.next()) {
|
for (Element element = reader.next(); element != null; element = reader.next()) {
|
||||||
switch (element.getType()) {
|
switch (element.getType()) {
|
||||||
case Element.e_text -> textFeaturesOnPage.add(ElementFeatureFactory.buildText(element, includePathData));
|
case Element.e_text -> textFeaturesOnPage.add(ElementFeatureFactory.buildText(element, includePathData, includePathData));
|
||||||
case Element.e_form -> {
|
case Element.e_form -> {
|
||||||
Obj formObj = element.getXObject();
|
Obj formObj = element.getXObject();
|
||||||
|
|
||||||
|
|||||||
@ -1,6 +1,5 @@
|
|||||||
package com.iqser.red.pdftronlogic.commons.features;
|
package com.iqser.red.pdftronlogic.commons.features;
|
||||||
|
|
||||||
import java.awt.geom.GeneralPath;
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@ -14,7 +13,7 @@ import com.pdftron.pdf.Element;
|
|||||||
import com.pdftron.pdf.Font;
|
import com.pdftron.pdf.Font;
|
||||||
import com.pdftron.pdf.GState;
|
import com.pdftron.pdf.GState;
|
||||||
import com.pdftron.pdf.Image;
|
import com.pdftron.pdf.Image;
|
||||||
import com.pdftron.pdf.PathData;
|
import com.pdftron.sdf.Obj;
|
||||||
|
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import lombok.experimental.UtilityClass;
|
import lombok.experimental.UtilityClass;
|
||||||
@ -26,7 +25,7 @@ public class ElementFeatureFactory {
|
|||||||
|
|
||||||
return switch (element.getType()) {
|
return switch (element.getType()) {
|
||||||
case Element.e_path -> buildPath(element);
|
case Element.e_path -> buildPath(element);
|
||||||
case Element.e_text -> buildText(element);
|
case Element.e_text -> buildText(element, false, false);
|
||||||
case Element.e_image, Element.e_inline_image -> buildImage(element);
|
case Element.e_image, Element.e_inline_image -> buildImage(element);
|
||||||
case Element.e_form -> buildForm(element);
|
case Element.e_form -> buildForm(element);
|
||||||
// This technically should never happen, it's a safetynet
|
// This technically should never happen, it's a safetynet
|
||||||
@ -72,7 +71,7 @@ public class ElementFeatureFactory {
|
|||||||
boolean masked = false;
|
boolean masked = false;
|
||||||
if (element.getType() == Element.e_image) {
|
if (element.getType() == Element.e_image) {
|
||||||
Image image = new Image(element.getXObject());
|
Image image = new Image(element.getXObject());
|
||||||
if (image.getMask() != null) {
|
if (image.getMask() != null && image.getMask().getType() == Obj.e_stream) {
|
||||||
Image imageMask = new Image(image.getMask());
|
Image imageMask = new Image(image.getMask());
|
||||||
masked = imageMask.isImageMask();
|
masked = imageMask.isImageMask();
|
||||||
}
|
}
|
||||||
@ -94,27 +93,25 @@ public class ElementFeatureFactory {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public TextFeatures buildText(Element element) throws PDFNetException {
|
|
||||||
|
|
||||||
return buildText(element, false);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Use includePathData = true, when trying to draw the glyphs, see GlyphExtractionTest
|
Use includeGlyphs = true and preComputePathData = true, when trying to draw the glyphs, see GlyphExtractionTest
|
||||||
|
precomputePathData = true is needed, when trying to access the PathData after the PDFDoc/ElementReader has been closed
|
||||||
*/
|
*/
|
||||||
public TextFeatures buildText(Element element, boolean includePathData) throws PDFNetException {
|
public TextFeatures buildText(Element element, boolean includeGlyphs, boolean preComputePathData) throws PDFNetException {
|
||||||
|
|
||||||
try (var bbox = element.getBBox()) {
|
try (var bbox = element.getBBox()) {
|
||||||
|
|
||||||
return TextFeatures.builder()
|
TextFeatures.TextFeaturesBuilder<?, ?> simpleTextFeatures = TextFeatures.builder()
|
||||||
.elementType(element.getType())
|
.elementType(element.getType())
|
||||||
.boundingBox(Converter.toRectangle2D(bbox))
|
.boundingBox(Converter.toRectangle2D(bbox))
|
||||||
.text(element.getTextString())
|
.text(element.getTextString())
|
||||||
.font(element.getGState().getFont().getType())
|
.font(element.getGState().getFont().getType())
|
||||||
.fontsize(element.getGState().getFontSize())
|
.fontsize(element.getGState().getFontSize());
|
||||||
.glyphs(extractGlyphInfo(element, includePathData))
|
|
||||||
.build();
|
if (includeGlyphs) {
|
||||||
|
simpleTextFeatures.glyphs(extractGlyphInfo(element, preComputePathData));
|
||||||
|
}
|
||||||
|
return simpleTextFeatures.build();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -139,7 +136,7 @@ public class ElementFeatureFactory {
|
|||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private List<GlyphInfo> extractGlyphInfo(Element textElement, boolean includePathData) {
|
private List<GlyphInfo> extractGlyphInfo(Element textElement, boolean precomputePathData) {
|
||||||
|
|
||||||
assert textElement != null && textElement.getType() == Element.e_text;
|
assert textElement != null && textElement.getType() == Element.e_text;
|
||||||
|
|
||||||
@ -157,34 +154,29 @@ public class ElementFeatureFactory {
|
|||||||
}
|
}
|
||||||
|
|
||||||
List<GlyphInfo> glyphs = new ArrayList<>();
|
List<GlyphInfo> glyphs = new ArrayList<>();
|
||||||
|
short unitsPerEm = font.getUnitsPerEm();
|
||||||
|
|
||||||
|
try (CharIterator charIterator = textElement.getCharIterator(); Matrix2D ctm = textElement.getCTM().multiply(textElement.getTextMatrix());) {
|
||||||
|
|
||||||
try (CharIterator charIterator = textElement.getCharIterator()) {
|
|
||||||
while (charIterator.hasNext()) {
|
while (charIterator.hasNext()) {
|
||||||
CharData charData = charIterator.next();
|
CharData charData = charIterator.next();
|
||||||
long charCode = charData.getCharCode();
|
long charCode = charData.getCharCode();
|
||||||
String glyphText = new String(font.mapToUnicode(charCode));
|
|
||||||
|
|
||||||
if (Character.isWhitespace(glyphText.charAt(0))) {
|
try (Matrix2D fontMatrix = computeFontMatrix(charData, textElement, unitsPerEm)) {
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
try (Matrix2D fontMatrix = computeFontMatrix(charData, textElement, font); //
|
GlyphInfo glyph = GlyphInfo.builder() //
|
||||||
Matrix2D glyphMatrix = textElement.getCTM()//
|
.charCode(charCode) //
|
||||||
.multiply(textElement.getTextMatrix())//
|
.cachePathData(precomputePathData) //
|
||||||
.multiply(fontMatrix)) {
|
.glyphMatrix(ctm.multiply(fontMatrix)) //
|
||||||
PathData pathData = font.getGlyphPath(charCode, true, glyphMatrix);
|
.font(font) //
|
||||||
if (pathData.getOperators().length == 1 && pathData.getOperators()[0] == 6) {
|
.build();
|
||||||
// This happens for some chinese characters or whitespaces, don't know why...
|
|
||||||
continue;
|
glyphs.add(glyph);
|
||||||
|
|
||||||
|
if (precomputePathData) {
|
||||||
|
// call the functions once to cache all data
|
||||||
|
glyph.getBoundingBox();
|
||||||
}
|
}
|
||||||
GeneralPath glyphPath = Converter.convertToGeneralPath(pathData);
|
|
||||||
GlyphInfo.GlyphInfoBuilder glyphInfo = GlyphInfo.builder().unicode(glyphText).bbox(glyphPath.getBounds2D());
|
|
||||||
|
|
||||||
if (includePathData) {
|
|
||||||
glyphInfo.pathData(pathData);
|
|
||||||
}
|
|
||||||
|
|
||||||
glyphs.add(glyphInfo.build());
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -194,9 +186,9 @@ public class ElementFeatureFactory {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private Matrix2D computeFontMatrix(CharData charData, Element textElement, Font font) throws PDFNetException {
|
private Matrix2D computeFontMatrix(CharData charData, Element textElement, short unitsPerEm) throws PDFNetException {
|
||||||
|
|
||||||
double yScaleFactor = textElement.getGState().getFontSize() / font.getUnitsPerEm();
|
double yScaleFactor = textElement.getGState().getFontSize() / unitsPerEm;
|
||||||
double xScaleFactor = (textElement.getGState().getHorizontalScale() / 100) * yScaleFactor;
|
double xScaleFactor = (textElement.getGState().getHorizontalScale() / 100) * yScaleFactor;
|
||||||
|
|
||||||
return new Matrix2D(xScaleFactor, 0, 0, -yScaleFactor, charData.getGlyphX(), charData.getGlyphY());
|
return new Matrix2D(xScaleFactor, 0, 0, -yScaleFactor, charData.getGlyphX(), charData.getGlyphY());
|
||||||
|
|||||||
@ -94,4 +94,9 @@ public class ElementFeatures {
|
|||||||
return overlapShape.contains(ComparisonUtils.shrinkRectangle(boundingBox));
|
return overlapShape.contains(ComparisonUtils.shrinkRectangle(boundingBox));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void destroy() {
|
||||||
|
// do nothing, except for text
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,30 +1,40 @@
|
|||||||
package com.iqser.red.pdftronlogic.commons.features;
|
package com.iqser.red.pdftronlogic.commons.features;
|
||||||
|
|
||||||
import java.awt.Shape;
|
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
|
||||||
import com.iqser.red.pdftronlogic.commons.ComparisonUtils;
|
import com.iqser.red.pdftronlogic.commons.ComparisonUtils;
|
||||||
|
import com.iqser.red.pdftronlogic.commons.Converter;
|
||||||
|
import com.iqser.red.pdftronlogic.commons.PDFNetUtils;
|
||||||
|
import com.pdftron.common.Matrix2D;
|
||||||
|
import com.pdftron.common.PDFNetException;
|
||||||
|
import com.pdftron.pdf.Font;
|
||||||
import com.pdftron.pdf.PathData;
|
import com.pdftron.pdf.PathData;
|
||||||
|
|
||||||
import lombok.AccessLevel;
|
import lombok.AccessLevel;
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
import lombok.Builder;
|
import lombok.Builder;
|
||||||
import lombok.Getter;
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.SneakyThrows;
|
||||||
import lombok.experimental.FieldDefaults;
|
import lombok.experimental.FieldDefaults;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
|
@Slf4j
|
||||||
@Builder
|
@Builder
|
||||||
@AllArgsConstructor
|
@AllArgsConstructor
|
||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||||
public class GlyphInfo {
|
public class GlyphInfo {
|
||||||
|
|
||||||
@Getter
|
final Matrix2D glyphMatrix;
|
||||||
final String unicode;
|
final long charCode;
|
||||||
@Getter
|
final Font font;
|
||||||
final Rectangle2D bbox;
|
|
||||||
final PathData pathData;
|
// in order to speed up invisible element removal, we only calculate the pathdata where necessary, as it is the costliest operation.
|
||||||
|
// It will only work as long as the associated ElementReader is still open, as the Font is bound to the ContentStream being read.
|
||||||
|
Rectangle2D bbox;
|
||||||
|
final boolean cachePathData;
|
||||||
|
PathData pathData;
|
||||||
|
|
||||||
boolean overlapped;
|
boolean overlapped;
|
||||||
ElementFeatures overlappingElement;
|
ElementFeatures overlappingElement;
|
||||||
@ -35,8 +45,12 @@ public class GlyphInfo {
|
|||||||
if (overlapped) {
|
if (overlapped) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
Optional<Rectangle2D> bbox = getBoundingBox();
|
||||||
|
if (bbox.isEmpty()) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
if (overlappingElement.getOverlapShape().contains(ComparisonUtils.shrinkRectangle(bbox))) {
|
if (overlappingElement.getOverlapShape().contains(ComparisonUtils.shrinkRectangle(bbox.get()))) {
|
||||||
overlapped = true;
|
overlapped = true;
|
||||||
this.overlappingElement = overlappingElement;
|
this.overlappingElement = overlappingElement;
|
||||||
}
|
}
|
||||||
@ -46,25 +60,57 @@ public class GlyphInfo {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public boolean matches(GlyphInfo glyph2) {
|
public String getUnicode() {
|
||||||
|
|
||||||
return unicode.equals(glyph2.unicode)//
|
try {
|
||||||
&& calculateIntersectedArea(glyph2.bbox, bbox) > 0.9 * Math.min(bbox.getWidth() * bbox.getHeight(), glyph2.bbox.getHeight() * glyph2.bbox.getWidth());
|
return new String(font.mapToUnicode(charCode));
|
||||||
|
} catch (PDFNetException e) {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
public Optional<PathData> getPathData() {
|
public Optional<PathData> getPathData() {
|
||||||
|
|
||||||
return Optional.ofNullable(pathData);
|
if (pathData == null) {
|
||||||
|
|
||||||
|
PDFNetUtils.requireFontNotClosed(font);
|
||||||
|
|
||||||
|
PathData computedPathData = font.getGlyphPath(charCode, true, glyphMatrix);
|
||||||
|
if (computedPathData.getOperators().length == 1 && computedPathData.getOperators()[0] == 6) {
|
||||||
|
// This happens for some chinese characters or whitespaces, don't know why...
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
if (cachePathData) {
|
||||||
|
pathData = computedPathData;
|
||||||
|
}
|
||||||
|
return Optional.of(computedPathData);
|
||||||
|
}
|
||||||
|
return Optional.of(pathData);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static double calculateIntersectedArea(Rectangle2D r1, Rectangle2D r2) {
|
@SneakyThrows
|
||||||
|
public Optional<Rectangle2D> getBoundingBox() {
|
||||||
|
|
||||||
double xOverlap = Math.max(0, Math.min(r1.getMaxX(), r2.getMaxX()) - Math.max(r1.getMinX(), r2.getMinX()));
|
if (bbox == null) {
|
||||||
double yOverlap = Math.max(0, Math.min(r1.getMaxY(), r2.getMaxY()) - Math.max(r1.getY(), r2.getY()));
|
Optional<PathData> pathData = getPathData();
|
||||||
|
if (pathData.isEmpty()) {
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
bbox = Converter.convertToGeneralPath(pathData.get()).getBounds2D();
|
||||||
|
}
|
||||||
|
return Optional.of(bbox);
|
||||||
|
}
|
||||||
|
|
||||||
return xOverlap * yOverlap;
|
|
||||||
|
@SneakyThrows
|
||||||
|
public void destroy() {
|
||||||
|
|
||||||
|
if (glyphMatrix != null) {
|
||||||
|
glyphMatrix.close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -5,8 +5,6 @@ import static com.iqser.red.pdftronlogic.commons.ComparisonUtils.almostEqual;
|
|||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import com.iqser.red.pdftronlogic.commons.ComparisonUtils;
|
|
||||||
|
|
||||||
import lombok.AccessLevel;
|
import lombok.AccessLevel;
|
||||||
import lombok.Builder;
|
import lombok.Builder;
|
||||||
import lombok.EqualsAndHashCode;
|
import lombok.EqualsAndHashCode;
|
||||||
@ -24,6 +22,7 @@ public class TextFeatures extends ElementFeatures {
|
|||||||
String text;
|
String text;
|
||||||
int font;
|
int font;
|
||||||
double fontsize;
|
double fontsize;
|
||||||
|
|
||||||
@Builder.Default
|
@Builder.Default
|
||||||
List<GlyphInfo> glyphs = new ArrayList<>();
|
List<GlyphInfo> glyphs = new ArrayList<>();
|
||||||
|
|
||||||
@ -41,20 +40,6 @@ public class TextFeatures extends ElementFeatures {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private boolean glyphsMatch(TextFeatures textFeaturesElement) {
|
|
||||||
|
|
||||||
if (glyphs.size() != textFeaturesElement.getGlyphs().size()) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
for (int i = 0; i < glyphs.size(); i++) {
|
|
||||||
if (!glyphs.get(i).matches(textFeaturesElement.getGlyphs().get(i))) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public boolean testOverlapped(ElementFeatures overlappingElement) {
|
public boolean testOverlapped(ElementFeatures overlappingElement) {
|
||||||
|
|
||||||
if (glyphs.isEmpty()) {
|
if (glyphs.isEmpty()) {
|
||||||
@ -65,4 +50,11 @@ public class TextFeatures extends ElementFeatures {
|
|||||||
.allMatch(glyph -> glyph.testOverlapped(overlappingElement));
|
.allMatch(glyph -> glyph.testOverlapped(overlappingElement));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void destroy() {
|
||||||
|
|
||||||
|
glyphs.forEach(GlyphInfo::destroy);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,15 +1,16 @@
|
|||||||
package com.iqser.red.pdftronlogic.commons.lookup;
|
package com.iqser.red.pdftronlogic.commons.lookup;
|
||||||
|
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.util.HashSet;
|
import java.util.ArrayList;
|
||||||
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.Set;
|
|
||||||
import java.util.function.Consumer;
|
import java.util.function.Consumer;
|
||||||
import java.util.function.Predicate;
|
import java.util.function.Predicate;
|
||||||
|
|
||||||
import com.iqser.red.pdftronlogic.commons.Converter;
|
import com.iqser.red.pdftronlogic.commons.Converter;
|
||||||
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
|
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
|
||||||
|
import com.pdftron.pdf.Element;
|
||||||
import com.pdftron.pdf.Rect;
|
import com.pdftron.pdf.Rect;
|
||||||
|
|
||||||
import lombok.AccessLevel;
|
import lombok.AccessLevel;
|
||||||
@ -21,13 +22,13 @@ import lombok.experimental.FieldDefaults;
|
|||||||
@Getter
|
@Getter
|
||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
public class ElementFeatureLookup {
|
public class ElementFeatureLookup implements AutoCloseable {
|
||||||
/*
|
/*
|
||||||
This class looks a bit weird and uses visitors since I tried to use the quadtree implementation by locationtech, since it uses Rectangles by default to query its data structure.
|
This class looks a bit weird and uses visitors since I tried to use the quadtree implementation by locationtech, as it uses Rectangles by default to query its data structure.
|
||||||
Unfortunately there were always edge cases, where it lost a few elements making it completely unusable. Further, it didn't even speed up the algorithm all that much.
|
Unfortunately there were always edge cases where it lost a few elements making it completely unusable. Further, it didn't even speed up the algorithm all that much.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
Set<ElementFeatures> allElements = new HashSet<>();
|
List<ElementFeatures> allElements = new ArrayList<>();
|
||||||
|
|
||||||
|
|
||||||
public void add(ElementFeatures elementFeatures) {
|
public void add(ElementFeatures elementFeatures) {
|
||||||
@ -69,9 +70,23 @@ public class ElementFeatureLookup {
|
|||||||
|
|
||||||
public List<ElementFeatures> findOverlapped(ElementFeatures overlappingElement, boolean textOnly) {
|
public List<ElementFeatures> findOverlapped(ElementFeatures overlappingElement, boolean textOnly) {
|
||||||
|
|
||||||
OverlapVisitor overlapVisitor = new OverlapVisitor(overlappingElement, textOnly);
|
List<ElementFeatures> overlappedElementFeatures = new LinkedList<>();
|
||||||
forEach(overlapVisitor::visitItem);
|
|
||||||
return overlapVisitor.getOverlappedElementFeatures();
|
for (int i = 0; i < allElements.size(); i++) {
|
||||||
|
ElementFeatures features = allElements.get(i);
|
||||||
|
|
||||||
|
if (textOnly && features.getElementType() != Element.e_text) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (features.getBoundingBox().intersects(overlappingElement.getBoundingBox())) {
|
||||||
|
if (features.testOverlapped(overlappingElement)) {
|
||||||
|
overlappedElementFeatures.add(features);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return overlappedElementFeatures;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -101,13 +116,20 @@ public class ElementFeatureLookup {
|
|||||||
|
|
||||||
public void addAll(List<ElementFeatures> currentOverlappedElements) {
|
public void addAll(List<ElementFeatures> currentOverlappedElements) {
|
||||||
|
|
||||||
currentOverlappedElements.forEach(this::add);
|
allElements.addAll(currentOverlappedElements);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void removeAll(List<ElementFeatures> currentOverlappedElements) {
|
public void removeAll(List<ElementFeatures> currentOverlappedElements) {
|
||||||
|
|
||||||
currentOverlappedElements.forEach(this::remove);
|
allElements.removeAll(currentOverlappedElements);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void close() {
|
||||||
|
|
||||||
|
allElements.forEach(ElementFeatures::destroy);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,42 +0,0 @@
|
|||||||
package com.iqser.red.pdftronlogic.commons.lookup;
|
|
||||||
|
|
||||||
import java.util.LinkedList;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import com.iqser.red.pdftronlogic.commons.ComparisonUtils;
|
|
||||||
import com.iqser.red.pdftronlogic.commons.features.ElementFeatures;
|
|
||||||
import com.pdftron.pdf.Element;
|
|
||||||
|
|
||||||
import lombok.AccessLevel;
|
|
||||||
import lombok.Getter;
|
|
||||||
import lombok.RequiredArgsConstructor;
|
|
||||||
import lombok.experimental.FieldDefaults;
|
|
||||||
|
|
||||||
@RequiredArgsConstructor
|
|
||||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
|
||||||
public class OverlapVisitor implements ElementFeatureVisitor {
|
|
||||||
|
|
||||||
ElementFeatures overlappingElement;
|
|
||||||
|
|
||||||
boolean textOnly;
|
|
||||||
|
|
||||||
@Getter
|
|
||||||
List<ElementFeatures> overlappedElementFeatures = new LinkedList<>();
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void visitItem(ElementFeatures features) {
|
|
||||||
|
|
||||||
if (textOnly && features.getElementType() != Element.e_text) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ComparisonUtils.padRectangle(features.getBoundingBox()).intersects(ComparisonUtils.padRectangle(overlappingElement.getBoundingBox()))) {
|
|
||||||
if (features.testOverlapped(overlappingElement)) {
|
|
||||||
overlappedElementFeatures.add(features);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -8,7 +8,9 @@ import java.io.FileOutputStream;
|
|||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
import org.junit.jupiter.api.BeforeEach;
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Disabled;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
import com.iqser.red.pdftronlogic.commons.features.GlyphInfo;
|
import com.iqser.red.pdftronlogic.commons.features.GlyphInfo;
|
||||||
@ -23,10 +25,11 @@ import com.pdftron.sdf.SDFDoc;
|
|||||||
|
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
|
|
||||||
|
@Disabled // makes no sense to run in pipeline
|
||||||
public class GlyphExtractionTest {
|
public class GlyphExtractionTest {
|
||||||
|
|
||||||
@BeforeEach
|
@BeforeAll
|
||||||
void createService() {
|
static void init() {
|
||||||
|
|
||||||
PDFNet.initialize(PDFTronConfig.license);
|
PDFNet.initialize(PDFTronConfig.license);
|
||||||
}
|
}
|
||||||
@ -69,11 +72,11 @@ public class GlyphExtractionTest {
|
|||||||
|
|
||||||
for (GlyphInfo glyph : textFeatures.getGlyphs()) {
|
for (GlyphInfo glyph : textFeatures.getGlyphs()) {
|
||||||
|
|
||||||
if (glyph.getPathData().isPresent()) {
|
if (glyph.getPathData().isPresent() && glyph.getBoundingBox().isPresent()) {
|
||||||
drawPathData(glyph.getPathData().get(), builder, writer, Color.BLACK);
|
drawPathData(glyph.getPathData().get(), builder, writer, Color.BLACK);
|
||||||
|
drawRect(ComparisonUtils.shrinkRectangle(glyph.getBoundingBox().get()), builder, writer, Color.RED);
|
||||||
|
drawRect(glyph.getBoundingBox().get(), builder, writer, Color.MAGENTA);
|
||||||
}
|
}
|
||||||
drawRect(ComparisonUtils.shrinkRectangle(glyph.getBbox()), builder, writer, Color.RED);
|
|
||||||
drawRect(glyph.getBbox(), builder, writer, Color.MAGENTA);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -3,11 +3,13 @@ package com.iqser.red.pdftronlogic.commons;
|
|||||||
import static com.iqser.red.pdftronlogic.commons.PdfTextExtraction.extractAllTextFromDocument;
|
import static com.iqser.red.pdftronlogic.commons.PdfTextExtraction.extractAllTextFromDocument;
|
||||||
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
|
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
|
||||||
|
|
||||||
|
import java.io.ByteArrayOutputStream;
|
||||||
import java.io.FileInputStream;
|
import java.io.FileInputStream;
|
||||||
import java.io.FileOutputStream;
|
import java.io.FileOutputStream;
|
||||||
|
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
import org.junit.jupiter.api.BeforeEach;
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
@ -22,10 +24,16 @@ class InvisibleElementRemovalServiceTest {
|
|||||||
InvisibleElementRemovalService invisibleElementRemovalService;
|
InvisibleElementRemovalService invisibleElementRemovalService;
|
||||||
|
|
||||||
|
|
||||||
@BeforeEach
|
@BeforeAll
|
||||||
void createService() {
|
static void init() {
|
||||||
|
|
||||||
PDFNet.initialize(PDFTronConfig.license);
|
PDFNet.initialize(PDFTronConfig.license);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@BeforeEach
|
||||||
|
void createServices() {
|
||||||
|
|
||||||
invisibleElementRemovalService = new InvisibleElementRemovalService();
|
invisibleElementRemovalService = new InvisibleElementRemovalService();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -57,6 +65,19 @@ class InvisibleElementRemovalServiceTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@SneakyThrows
|
||||||
|
void page32DoesNotCrash() {
|
||||||
|
|
||||||
|
String fileName = "files/Page32.pdf";
|
||||||
|
|
||||||
|
try (var in = this.getClass().getClassLoader().getResourceAsStream(fileName); var out = new ByteArrayOutputStream()) {
|
||||||
|
invisibleElementRemovalService.removeInvisibleElements(in, out, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
void removeInvisibleTextClippedByFormObjects() {
|
void removeInvisibleTextClippedByFormObjects() {
|
||||||
|
|||||||
BIN
src/test/resources/files/Page32.pdf
Normal file
BIN
src/test/resources/files/Page32.pdf
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user