RED-4875 - call logic of new repo pdftron-logic-commons instead of local one
This commit is contained in:
parent
74a094b42d
commit
143538fa40
@ -23,6 +23,12 @@
|
|||||||
<groupId>com.iqser.red.commons</groupId>
|
<groupId>com.iqser.red.commons</groupId>
|
||||||
<artifactId>storage-commons</artifactId>
|
<artifactId>storage-commons</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.iqser.red.commons</groupId>
|
||||||
|
<artifactId>pdftron-logic-commons</artifactId>
|
||||||
|
<version>dev_red4875_2_4dc4d</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.iqser.red.commons</groupId>
|
<groupId>com.iqser.red.commons</groupId>
|
||||||
<artifactId>spring-commons</artifactId>
|
<artifactId>spring-commons</artifactId>
|
||||||
|
|||||||
@ -10,6 +10,7 @@ import org.springframework.context.annotation.Bean;
|
|||||||
import org.springframework.context.annotation.Import;
|
import org.springframework.context.annotation.Import;
|
||||||
import org.springframework.scheduling.annotation.EnableAsync;
|
import org.springframework.scheduling.annotation.EnableAsync;
|
||||||
|
|
||||||
|
import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService;
|
||||||
import com.iqser.red.service.ocr.v1.server.client.FileStatusProcessingUpdateClient;
|
import com.iqser.red.service.ocr.v1.server.client.FileStatusProcessingUpdateClient;
|
||||||
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
|
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
|
||||||
import com.iqser.red.service.ocr.v1.server.multitenancy.AsyncConfig;
|
import com.iqser.red.service.ocr.v1.server.multitenancy.AsyncConfig;
|
||||||
@ -44,4 +45,11 @@ public class Application {
|
|||||||
return new TimedAspect(registry);
|
return new TimedAspect(registry);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Bean
|
||||||
|
public InvisibleElementRemovalService invisibleElementRemovalService() {
|
||||||
|
|
||||||
|
return new InvisibleElementRemovalService();
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,68 +1,68 @@
|
|||||||
package com.iqser.red.service.ocr.v1.server.model;
|
//package com.iqser.red.service.ocr.v1.server.model;
|
||||||
|
//
|
||||||
import static com.iqser.red.service.ocr.v1.server.service.InvisibleElementRemovalService.TOLERANCE;
|
//import static com.iqser.red.service.ocr.v1.server.service.InvisibleElementRemovalService.TOLERANCE;
|
||||||
|
//
|
||||||
import java.awt.geom.Area;
|
//import java.awt.geom.Area;
|
||||||
import java.awt.geom.GeneralPath;
|
//import java.awt.geom.GeneralPath;
|
||||||
import java.awt.geom.Rectangle2D;
|
//import java.awt.geom.Rectangle2D;
|
||||||
import java.util.Deque;
|
//import java.util.Deque;
|
||||||
import java.util.LinkedList;
|
//import java.util.LinkedList;
|
||||||
|
//
|
||||||
import com.pdftron.pdf.Rect;
|
//import com.pdftron.pdf.Rect;
|
||||||
|
//
|
||||||
import lombok.Data;
|
//import lombok.Data;
|
||||||
import lombok.SneakyThrows;
|
//import lombok.SneakyThrows;
|
||||||
|
//
|
||||||
@Data
|
//@Data
|
||||||
public class ClippingPathStack {
|
//public class ClippingPathStack {
|
||||||
|
//
|
||||||
private Deque<Area> stack = new LinkedList<>();
|
// private Deque<Area> stack = new LinkedList<>();
|
||||||
|
//
|
||||||
|
//
|
||||||
@SneakyThrows
|
// @SneakyThrows
|
||||||
public ClippingPathStack(Rect rectangle) {
|
// public ClippingPathStack(Rect rectangle) {
|
||||||
|
//
|
||||||
stack.push(new Area(new Rectangle2D.Double(rectangle.getX1(), rectangle.getY1(), rectangle.getWidth(), rectangle.getHeight()).getBounds2D()));
|
// stack.push(new Area(new Rectangle2D.Double(rectangle.getX1(), rectangle.getY1(), rectangle.getWidth(), rectangle.getHeight()).getBounds2D()));
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
|
//
|
||||||
@SneakyThrows
|
// @SneakyThrows
|
||||||
public void intersectClippingPath(GeneralPath path) {
|
// public void intersectClippingPath(GeneralPath path) {
|
||||||
|
//
|
||||||
getCurrentClippingPath().intersect(new Area(path));
|
// getCurrentClippingPath().intersect(new Area(path));
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
|
//
|
||||||
public boolean almostIntersects(double x, double y, double width, double height) {
|
// public boolean almostIntersects(double x, double y, double width, double height) {
|
||||||
// To address inconsistencies in the calculation of the bounding box we slightly increase the rectangle
|
// // To address inconsistencies in the calculation of the bounding box we slightly increase the rectangle
|
||||||
// Height or width are zero for straight lines, even though they are being rendered. Therefore, height or width must be at minimum >0.
|
// // Height or width are zero for straight lines, even though they are being rendered. Therefore, height or width must be at minimum >0.
|
||||||
|
//
|
||||||
double x_with_tolerance = x > 0 ? x - TOLERANCE : x + TOLERANCE;
|
// double x_with_tolerance = x > 0 ? x - TOLERANCE : x + TOLERANCE;
|
||||||
double y_with_tolerance = y > 0 ? y - TOLERANCE : y + TOLERANCE;
|
// double y_with_tolerance = y > 0 ? y - TOLERANCE : y + TOLERANCE;
|
||||||
double width_with_tolerance = width + (2 * TOLERANCE);
|
// double width_with_tolerance = width + (2 * TOLERANCE);
|
||||||
double height_with_tolerance = height + (2 * TOLERANCE);
|
// double height_with_tolerance = height + (2 * TOLERANCE);
|
||||||
return getCurrentClippingPath().intersects(x_with_tolerance, y_with_tolerance, width_with_tolerance, height_with_tolerance);
|
// return getCurrentClippingPath().intersects(x_with_tolerance, y_with_tolerance, width_with_tolerance, height_with_tolerance);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
|
//
|
||||||
public Area getCurrentClippingPath() {
|
// public Area getCurrentClippingPath() {
|
||||||
|
//
|
||||||
return stack.peek();
|
// return stack.peek();
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
|
//
|
||||||
public void enterNewGState() {
|
// public void enterNewGState() {
|
||||||
|
//
|
||||||
Area current = stack.peek();
|
// Area current = stack.peek();
|
||||||
Area cloned = new Area();
|
// Area cloned = new Area();
|
||||||
cloned.add(current);
|
// cloned.add(current);
|
||||||
stack.push(cloned);
|
// stack.push(cloned);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
|
//
|
||||||
public void leaveGState() {
|
// public void leaveGState() {
|
||||||
|
//
|
||||||
stack.pop();
|
// stack.pop();
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
}
|
//}
|
||||||
@ -1,170 +1,170 @@
|
|||||||
package com.iqser.red.service.ocr.v1.server.model;
|
//package com.iqser.red.service.ocr.v1.server.model;
|
||||||
|
//
|
||||||
import static com.iqser.red.service.ocr.v1.server.service.InvisibleElementRemovalService.TOLERANCE;
|
//import static com.iqser.red.service.ocr.v1.server.service.InvisibleElementRemovalService.TOLERANCE;
|
||||||
|
//
|
||||||
import java.awt.geom.Rectangle2D;
|
//import java.awt.geom.Rectangle2D;
|
||||||
|
//
|
||||||
import com.pdftron.common.PDFNetException;
|
//import com.pdftron.common.PDFNetException;
|
||||||
import com.pdftron.pdf.Element;
|
//import com.pdftron.pdf.Element;
|
||||||
import com.pdftron.pdf.Rect;
|
//import com.pdftron.pdf.Rect;
|
||||||
|
//
|
||||||
import lombok.AccessLevel;
|
//import lombok.AccessLevel;
|
||||||
import lombok.EqualsAndHashCode;
|
//import lombok.EqualsAndHashCode;
|
||||||
import lombok.Getter;
|
//import lombok.Getter;
|
||||||
import lombok.SneakyThrows;
|
//import lombok.SneakyThrows;
|
||||||
import lombok.experimental.FieldDefaults;
|
//import lombok.experimental.FieldDefaults;
|
||||||
import lombok.experimental.SuperBuilder;
|
//import lombok.experimental.SuperBuilder;
|
||||||
|
//
|
||||||
@Getter
|
//@Getter
|
||||||
@SuperBuilder
|
//@SuperBuilder
|
||||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
//@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
public class ElementFeatures {
|
//public class ElementFeatures {
|
||||||
|
//
|
||||||
int elementType;
|
// int elementType;
|
||||||
Rectangle2D boundingBox;
|
// Rectangle2D boundingBox;
|
||||||
|
//
|
||||||
|
//
|
||||||
public boolean almostMatches(Element element) throws PDFNetException {
|
// public boolean almostMatches(Element element) throws PDFNetException {
|
||||||
|
//
|
||||||
return element.getType() == elementType && //
|
// return element.getType() == elementType && //
|
||||||
element.getBBox() != null && //
|
// element.getBBox() != null && //
|
||||||
rectsAlmostMatch(element.getBBox());
|
// rectsAlmostMatch(element.getBBox());
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
|
//
|
||||||
protected boolean almostEqual(double a, double b) {
|
// protected boolean almostEqual(double a, double b) {
|
||||||
|
//
|
||||||
return Math.abs(a - b) < TOLERANCE;
|
// return Math.abs(a - b) < TOLERANCE;
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
|
//
|
||||||
@SneakyThrows
|
// @SneakyThrows
|
||||||
private boolean rectsAlmostMatch(Rect bBox) {
|
// private boolean rectsAlmostMatch(Rect bBox) {
|
||||||
// To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance
|
// // To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance
|
||||||
|
//
|
||||||
return almostEqual(bBox.getX1(), boundingBox.getX()) && //
|
// return almostEqual(bBox.getX1(), boundingBox.getX()) && //
|
||||||
almostEqual(bBox.getY1(), boundingBox.getY()) && //
|
// almostEqual(bBox.getY1(), boundingBox.getY()) && //
|
||||||
almostEqual(bBox.getWidth(), boundingBox.getWidth()) && //
|
// almostEqual(bBox.getWidth(), boundingBox.getWidth()) && //
|
||||||
almostEqual(bBox.getHeight(), boundingBox.getHeight());
|
// almostEqual(bBox.getHeight(), boundingBox.getHeight());
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
|
//
|
||||||
@EqualsAndHashCode(callSuper = true)
|
// @EqualsAndHashCode(callSuper = true)
|
||||||
@Getter
|
// @Getter
|
||||||
@SuperBuilder
|
// @SuperBuilder
|
||||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
// @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
private static class Text extends ElementFeatures {
|
// private static class Text extends ElementFeatures {
|
||||||
|
//
|
||||||
String text;
|
// String text;
|
||||||
int font;
|
// int font;
|
||||||
double fontsize;
|
// double fontsize;
|
||||||
|
//
|
||||||
|
//
|
||||||
@Override
|
// @Override
|
||||||
public boolean almostMatches(Element element) throws PDFNetException {
|
// public boolean almostMatches(Element element) throws PDFNetException {
|
||||||
|
//
|
||||||
return super.almostMatches(element) && //
|
// return super.almostMatches(element) && //
|
||||||
text.equals(element.getTextString()) && //
|
// text.equals(element.getTextString()) && //
|
||||||
font == element.getGState().getFont().getType() && //
|
// font == element.getGState().getFont().getType() && //
|
||||||
almostEqual(fontsize, element.getGState().getFontSize());
|
// almostEqual(fontsize, element.getGState().getFontSize());
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
@EqualsAndHashCode(callSuper = true)
|
// @EqualsAndHashCode(callSuper = true)
|
||||||
@Getter
|
// @Getter
|
||||||
@SuperBuilder
|
// @SuperBuilder
|
||||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
// @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
private static class Path extends ElementFeatures {
|
// private static class Path extends ElementFeatures {
|
||||||
|
//
|
||||||
boolean isClippingPath;
|
// boolean isClippingPath;
|
||||||
boolean isClipWindingFill;
|
// boolean isClipWindingFill;
|
||||||
boolean isStroked;
|
// boolean isStroked;
|
||||||
boolean isFilled;
|
// boolean isFilled;
|
||||||
boolean isWindingFill;
|
// boolean isWindingFill;
|
||||||
|
//
|
||||||
|
//
|
||||||
@Override
|
// @Override
|
||||||
public boolean almostMatches(Element element) throws PDFNetException {
|
// public boolean almostMatches(Element element) throws PDFNetException {
|
||||||
|
//
|
||||||
return super.almostMatches(element) && //
|
// return super.almostMatches(element) && //
|
||||||
isClippingPath == element.isClippingPath() && //
|
// isClippingPath == element.isClippingPath() && //
|
||||||
isClipWindingFill == element.isClipWindingFill() && //
|
// isClipWindingFill == element.isClipWindingFill() && //
|
||||||
isStroked == element.isStroked() && //
|
// isStroked == element.isStroked() && //
|
||||||
isFilled == element.isFilled() && //
|
// isFilled == element.isFilled() && //
|
||||||
isWindingFill == element.isWindingFill();
|
// isWindingFill == element.isWindingFill();
|
||||||
|
//
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
@EqualsAndHashCode(callSuper = true)
|
// @EqualsAndHashCode(callSuper = true)
|
||||||
@Getter
|
// @Getter
|
||||||
@SuperBuilder
|
// @SuperBuilder
|
||||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
// @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||||
private static class Image extends ElementFeatures {
|
// private static class Image extends ElementFeatures {
|
||||||
|
//
|
||||||
int dataSize;
|
// int dataSize;
|
||||||
int height;
|
// int height;
|
||||||
int width;
|
// int width;
|
||||||
int renderingIntent;
|
// int renderingIntent;
|
||||||
int componentNum;
|
// int componentNum;
|
||||||
int bitsPerComponent;
|
// int bitsPerComponent;
|
||||||
|
//
|
||||||
|
//
|
||||||
@Override
|
// @Override
|
||||||
public boolean almostMatches(Element element) throws PDFNetException {
|
// public boolean almostMatches(Element element) throws PDFNetException {
|
||||||
|
//
|
||||||
return super.almostMatches(element) && //
|
// return super.almostMatches(element) && //
|
||||||
dataSize == element.getImageDataSize() && //
|
// dataSize == element.getImageDataSize() && //
|
||||||
height == element.getImageHeight() && //
|
// height == element.getImageHeight() && //
|
||||||
width == element.getImageWidth() && //
|
// width == element.getImageWidth() && //
|
||||||
renderingIntent == element.getImageRenderingIntent() && //
|
// renderingIntent == element.getImageRenderingIntent() && //
|
||||||
componentNum == element.getComponentNum() && //
|
// componentNum == element.getComponentNum() && //
|
||||||
bitsPerComponent == element.getBitsPerComponent();
|
// bitsPerComponent == element.getBitsPerComponent();
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
|
//
|
||||||
public static ElementFeatures extractFeatures(Element element) throws PDFNetException {
|
// public static ElementFeatures extractFeatures(Element element) throws PDFNetException {
|
||||||
|
//
|
||||||
return switch (element.getType()) {
|
// return switch (element.getType()) {
|
||||||
case Element.e_path -> Path.builder()
|
// case Element.e_path -> Path.builder()
|
||||||
.elementType(element.getType())
|
// .elementType(element.getType())
|
||||||
.boundingBox(toRectangle2D(element.getBBox()))
|
// .boundingBox(toRectangle2D(element.getBBox()))
|
||||||
.isClippingPath(element.isClippingPath())
|
// .isClippingPath(element.isClippingPath())
|
||||||
.isClipWindingFill(element.isClipWindingFill())
|
// .isClipWindingFill(element.isClipWindingFill())
|
||||||
.isStroked(element.isStroked())
|
// .isStroked(element.isStroked())
|
||||||
.isFilled(element.isFilled())
|
// .isFilled(element.isFilled())
|
||||||
.isWindingFill(element.isWindingFill())
|
// .isWindingFill(element.isWindingFill())
|
||||||
.build();
|
// .build();
|
||||||
case Element.e_text -> Text.builder()
|
// case Element.e_text -> Text.builder()
|
||||||
.elementType(element.getType())
|
// .elementType(element.getType())
|
||||||
.boundingBox(toRectangle2D(element.getBBox()))
|
// .boundingBox(toRectangle2D(element.getBBox()))
|
||||||
.text(element.getTextString())
|
// .text(element.getTextString())
|
||||||
.font(element.getGState().getFont().getType())
|
// .font(element.getGState().getFont().getType())
|
||||||
.fontsize(element.getGState().getFontSize())
|
// .fontsize(element.getGState().getFontSize())
|
||||||
.build();
|
// .build();
|
||||||
case Element.e_image, Element.e_inline_image -> Image.builder()
|
// case Element.e_image, Element.e_inline_image -> Image.builder()
|
||||||
.elementType(element.getType())
|
// .elementType(element.getType())
|
||||||
.boundingBox(toRectangle2D(element.getBBox()))
|
// .boundingBox(toRectangle2D(element.getBBox()))
|
||||||
.dataSize(element.getImageDataSize())
|
// .dataSize(element.getImageDataSize())
|
||||||
.height(element.getImageHeight())
|
// .height(element.getImageHeight())
|
||||||
.width(element.getImageWidth())
|
// .width(element.getImageWidth())
|
||||||
.renderingIntent(element.getImageRenderingIntent())
|
// .renderingIntent(element.getImageRenderingIntent())
|
||||||
.componentNum(element.getComponentNum())
|
// .componentNum(element.getComponentNum())
|
||||||
.bitsPerComponent(element.getBitsPerComponent())
|
// .bitsPerComponent(element.getBitsPerComponent())
|
||||||
.build();
|
// .build();
|
||||||
// This technically should never happen, it's a safety net
|
// // This technically should never happen, it's a safety net
|
||||||
default -> throw new RuntimeException("Feature Extraction is not supported for PDFTron.Element with type: " + element.getType());
|
// default -> throw new RuntimeException("Feature Extraction is not supported for PDFTron.Element with type: " + element.getType());
|
||||||
};
|
// };
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
|
//
|
||||||
private static Rectangle2D toRectangle2D(Rect rect) throws PDFNetException {
|
// private static Rectangle2D toRectangle2D(Rect rect) throws PDFNetException {
|
||||||
|
//
|
||||||
return new Rectangle2D.Double(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
|
// return new Rectangle2D.Double(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
}
|
//}
|
||||||
|
|||||||
@ -1,466 +1,466 @@
|
|||||||
package com.iqser.red.service.ocr.v1.server.service;
|
//package com.iqser.red.service.ocr.v1.server.service;
|
||||||
|
//
|
||||||
import java.awt.Shape;
|
//import java.awt.Shape;
|
||||||
import java.awt.geom.AffineTransform;
|
//import java.awt.geom.AffineTransform;
|
||||||
import java.awt.geom.GeneralPath;
|
//import java.awt.geom.GeneralPath;
|
||||||
import java.awt.geom.Rectangle2D;
|
//import java.awt.geom.Rectangle2D;
|
||||||
import java.io.InputStream;
|
//import java.io.InputStream;
|
||||||
import java.io.OutputStream;
|
//import java.io.OutputStream;
|
||||||
import java.util.ArrayList;
|
//import java.util.ArrayList;
|
||||||
import java.util.Iterator;
|
//import java.util.Iterator;
|
||||||
import java.util.List;
|
//import java.util.List;
|
||||||
import java.util.Set;
|
//import java.util.Set;
|
||||||
import java.util.TreeSet;
|
//import java.util.TreeSet;
|
||||||
|
//
|
||||||
import org.springframework.stereotype.Service;
|
//import org.springframework.stereotype.Service;
|
||||||
|
//
|
||||||
import com.google.common.primitives.Bytes;
|
//import com.google.common.primitives.Bytes;
|
||||||
import com.google.common.primitives.Doubles;
|
//import com.google.common.primitives.Doubles;
|
||||||
import com.iqser.red.service.ocr.v1.server.model.ClippingPathStack;
|
//import com.iqser.red.service.ocr.v1.server.model.ClippingPathStack;
|
||||||
import com.iqser.red.service.ocr.v1.server.model.ElementFeatures;
|
//import com.iqser.red.service.ocr.v1.server.model.ElementFeatures;
|
||||||
import com.pdftron.common.Matrix2D;
|
//import com.pdftron.common.Matrix2D;
|
||||||
import com.pdftron.common.PDFNetException;
|
//import com.pdftron.common.PDFNetException;
|
||||||
import com.pdftron.pdf.ColorPt;
|
//import com.pdftron.pdf.ColorPt;
|
||||||
import com.pdftron.pdf.ColorSpace;
|
//import com.pdftron.pdf.ColorSpace;
|
||||||
import com.pdftron.pdf.Element;
|
//import com.pdftron.pdf.Element;
|
||||||
import com.pdftron.pdf.ElementBuilder;
|
//import com.pdftron.pdf.ElementBuilder;
|
||||||
import com.pdftron.pdf.ElementReader;
|
//import com.pdftron.pdf.ElementReader;
|
||||||
import com.pdftron.pdf.ElementWriter;
|
//import com.pdftron.pdf.ElementWriter;
|
||||||
import com.pdftron.pdf.GState;
|
//import com.pdftron.pdf.GState;
|
||||||
import com.pdftron.pdf.PDFDoc;
|
//import com.pdftron.pdf.PDFDoc;
|
||||||
import com.pdftron.pdf.Page;
|
//import com.pdftron.pdf.Page;
|
||||||
import com.pdftron.pdf.PageIterator;
|
//import com.pdftron.pdf.PageIterator;
|
||||||
import com.pdftron.pdf.PathData;
|
//import com.pdftron.pdf.PathData;
|
||||||
import com.pdftron.pdf.Rect;
|
//import com.pdftron.pdf.Rect;
|
||||||
import com.pdftron.sdf.Obj;
|
//import com.pdftron.sdf.Obj;
|
||||||
import com.pdftron.sdf.SDFDoc;
|
//import com.pdftron.sdf.SDFDoc;
|
||||||
|
//
|
||||||
import lombok.Builder;
|
//import lombok.Builder;
|
||||||
import lombok.SneakyThrows;
|
//import lombok.SneakyThrows;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
//import lombok.extern.slf4j.Slf4j;
|
||||||
|
//
|
||||||
@Slf4j
|
//@Slf4j
|
||||||
@Service
|
//@Service
|
||||||
public class InvisibleElementRemovalService {
|
//public class InvisibleElementRemovalService {
|
||||||
|
//
|
||||||
static public final double TOLERANCE = 1e-3;
|
// static public final double TOLERANCE = 1e-3;
|
||||||
|
//
|
||||||
|
//
|
||||||
/**
|
// /**
|
||||||
* Removes all hidden Text, Path and Image Elements from a PDF Document.
|
// * Removes all hidden Text, Path and Image Elements from a PDF Document.
|
||||||
* handled cases:
|
// * handled cases:
|
||||||
* -Text which is transparent or is set to not render
|
// * -Text which is transparent or is set to not render
|
||||||
* -Elements outside of clipping path
|
// * -Elements outside of clipping path
|
||||||
* -Elements that have been painted over by visible and filled Paths
|
// * -Elements that have been painted over by visible and filled Paths
|
||||||
* unhandled cases:
|
// * unhandled cases:
|
||||||
* -Elements covered by widely stroked path
|
// * -Elements covered by widely stroked path
|
||||||
* -Elements with the same color as background
|
// * -Elements with the same color as background
|
||||||
* -Any Text set to clipping with its many interactions with other elements
|
// * -Any Text set to clipping with its many interactions with other elements
|
||||||
*
|
// *
|
||||||
* @param pdfFile The PDF file to process
|
// * @param pdfFile The PDF file to process
|
||||||
* @param delta If this flag is set only the removed Elements will be written to the output file.
|
// * @param delta If this flag is set only the removed Elements will be written to the output file.
|
||||||
* The Elements are red if they are removed by clipping path, blue for transparency, and a green bounding box for overlap.
|
// * The Elements are red if they are removed by clipping path, blue for transparency, and a green bounding box for overlap.
|
||||||
* @param out OutputStream to write the resulting file to
|
// * @param out OutputStream to write the resulting file to
|
||||||
**/
|
// **/
|
||||||
@SneakyThrows
|
// @SneakyThrows
|
||||||
public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta) {
|
// public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta) {
|
||||||
|
//
|
||||||
PDFDoc pdfDoc = new PDFDoc(pdfFile);
|
// PDFDoc pdfDoc = new PDFDoc(pdfFile);
|
||||||
|
//
|
||||||
ElementWriter writer = new ElementWriter();
|
// ElementWriter writer = new ElementWriter();
|
||||||
ElementReader reader = new ElementReader();
|
// ElementReader reader = new ElementReader();
|
||||||
Set<Long> visitedXObjIds = new TreeSet<>();
|
// Set<Long> visitedXObjIds = new TreeSet<>();
|
||||||
|
//
|
||||||
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
|
// for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
|
||||||
|
//
|
||||||
Page page = iterator.next();
|
// Page page = iterator.next();
|
||||||
|
//
|
||||||
visitedXObjIds.add(page.getSDFObj().getObjNum());
|
// visitedXObjIds.add(page.getSDFObj().getObjNum());
|
||||||
|
//
|
||||||
|
//
|
||||||
InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder()
|
// InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder()
|
||||||
.reader(reader)
|
// .reader(reader)
|
||||||
.clippingPathStack(new ClippingPathStack(page.getMediaBox()))
|
// .clippingPathStack(new ClippingPathStack(page.getMediaBox()))
|
||||||
.delta(delta)
|
// .delta(delta)
|
||||||
.overlappedElements(new ArrayList<>())
|
// .overlappedElements(new ArrayList<>())
|
||||||
.visibleElements(new ArrayList<>())
|
// .visibleElements(new ArrayList<>())
|
||||||
.visitedXObjIds(visitedXObjIds)
|
// .visitedXObjIds(visitedXObjIds)
|
||||||
.build();
|
// .build();
|
||||||
|
//
|
||||||
removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context);
|
// removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context);
|
||||||
|
//
|
||||||
context.visitedXObjIds().clear();
|
// context.visitedXObjIds().clear();
|
||||||
|
//
|
||||||
removeOverlappedElements(page, writer, context);
|
// removeOverlappedElements(page, writer, context);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
try {
|
// try {
|
||||||
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
|
// pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
|
||||||
} catch (Exception e) {
|
// } catch (Exception e) {
|
||||||
log.error("File could not be saved after invisible element removal");
|
// log.error("File could not be saved after invisible element removal");
|
||||||
throw new RuntimeException(e);
|
// throw new RuntimeException(e);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
writer.destroy();
|
// writer.destroy();
|
||||||
reader.destroy();
|
// reader.destroy();
|
||||||
pdfDoc.close();
|
// pdfDoc.close();
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
|
//
|
||||||
private void removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(Page page,
|
// private void removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(Page page,
|
||||||
ElementWriter writer,
|
// ElementWriter writer,
|
||||||
InvisibleElementRemovalContext context) throws PDFNetException {
|
// InvisibleElementRemovalContext context) throws PDFNetException {
|
||||||
|
//
|
||||||
context.reader().begin(page);
|
// context.reader().begin(page);
|
||||||
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
|
// writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
|
||||||
processElements(writer, context);
|
// processElements(writer, context);
|
||||||
writer.end();
|
// writer.end();
|
||||||
context.reader().end();
|
// context.reader().end();
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
|
//
|
||||||
private void processElements(ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
// private void processElements(ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
||||||
|
//
|
||||||
for (Element element = context.reader().next(); element != null; element = context.reader().next())
|
// for (Element element = context.reader().next(); element != null; element = context.reader().next())
|
||||||
switch (element.getType()) {
|
// switch (element.getType()) {
|
||||||
case Element.e_image, Element.e_inline_image -> processImages(element, writer, context);
|
// case Element.e_image, Element.e_inline_image -> processImages(element, writer, context);
|
||||||
case Element.e_text -> processText(element, writer, context);
|
// case Element.e_text -> processText(element, writer, context);
|
||||||
case Element.e_path -> processPath(element, writer, context);
|
// case Element.e_path -> processPath(element, writer, context);
|
||||||
case Element.e_form -> processForm(element, writer, context);
|
// case Element.e_form -> processForm(element, writer, context);
|
||||||
case Element.e_group_begin -> {
|
// case Element.e_group_begin -> {
|
||||||
context.clippingPathStack().enterNewGState();
|
// context.clippingPathStack().enterNewGState();
|
||||||
writer.writeElement(element);
|
// writer.writeElement(element);
|
||||||
}
|
// }
|
||||||
case Element.e_group_end -> {
|
// case Element.e_group_end -> {
|
||||||
context.clippingPathStack().leaveGState();
|
// context.clippingPathStack().leaveGState();
|
||||||
writer.writeElement(element);
|
// writer.writeElement(element);
|
||||||
}
|
// }
|
||||||
default -> writer.writeElement(element);
|
// default -> writer.writeElement(element);
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
|
//
|
||||||
private void processImages(Element imageElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
// private void processImages(Element imageElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
||||||
|
//
|
||||||
Rect rect = imageElement.getBBox();
|
// Rect rect = imageElement.getBBox();
|
||||||
|
//
|
||||||
if (rect == null) {
|
// if (rect == null) {
|
||||||
return;
|
// return;
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
|
// boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
|
||||||
|
//
|
||||||
if (!context.delta() && inClippingPath) {
|
// if (!context.delta() && inClippingPath) {
|
||||||
context.visibleElements().add(ElementFeatures.extractFeatures(imageElement));
|
// context.visibleElements().add(ElementFeatures.extractFeatures(imageElement));
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
if (context.delta() ^ inClippingPath) {
|
// if (context.delta() ^ inClippingPath) {
|
||||||
writer.writeElement(imageElement);
|
// writer.writeElement(imageElement);
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
|
//
|
||||||
private void processText(Element textElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
// private void processText(Element textElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
||||||
|
//
|
||||||
Rect rect = textElement.getBBox();
|
// Rect rect = textElement.getBBox();
|
||||||
|
//
|
||||||
if (rect == null) {
|
// if (rect == null) {
|
||||||
writer.writeElement(textElement);
|
// writer.writeElement(textElement);
|
||||||
return;
|
// return;
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
GState gState = textElement.getGState();
|
// GState gState = textElement.getGState();
|
||||||
|
//
|
||||||
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
|
// boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
|
||||||
|
//
|
||||||
boolean isTextVisible = isTextRenderedVisibly(gState);
|
// boolean isTextVisible = isTextRenderedVisibly(gState);
|
||||||
|
//
|
||||||
if (inClippingPath && isTextVisible) {
|
// if (inClippingPath && isTextVisible) {
|
||||||
context.visibleElements().add(ElementFeatures.extractFeatures(textElement));
|
// context.visibleElements().add(ElementFeatures.extractFeatures(textElement));
|
||||||
}
|
// }
|
||||||
if (!context.delta()) {
|
// if (!context.delta()) {
|
||||||
if (inClippingPath && isTextVisible) {
|
// if (inClippingPath && isTextVisible) {
|
||||||
writer.writeElement(textElement);
|
// writer.writeElement(textElement);
|
||||||
} else if (textElement.hasTextMatrix()) {
|
// } else if (textElement.hasTextMatrix()) {
|
||||||
/*
|
// /*
|
||||||
PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element.
|
// PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element.
|
||||||
hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands.
|
// hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands.
|
||||||
Therefore, the position of a following Tj is affected by not writing the first Element.
|
// Therefore, the position of a following Tj is affected by not writing the first Element.
|
||||||
This is why, we write only the Tm command:
|
// This is why, we write only the Tm command:
|
||||||
*/
|
// */
|
||||||
writer.writeGStateChanges(textElement);
|
// writer.writeGStateChanges(textElement);
|
||||||
}
|
// }
|
||||||
} else {
|
// } else {
|
||||||
if (!inClippingPath) {
|
// if (!inClippingPath) {
|
||||||
gState.setFillColorSpace(ColorSpace.createDeviceRGB());
|
// gState.setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||||
// red for elements removed by clipping path
|
// // red for elements removed by clipping path
|
||||||
gState.setFillColor(new ColorPt(1, 0, 0));
|
// gState.setFillColor(new ColorPt(1, 0, 0));
|
||||||
writer.writeElement(textElement);
|
// writer.writeElement(textElement);
|
||||||
}
|
// }
|
||||||
if (!isTextVisible) {
|
// if (!isTextVisible) {
|
||||||
gState.setFillColorSpace(ColorSpace.createDeviceRGB());
|
// gState.setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||||
// blue for elements removed due to transparency or not rendered
|
// // blue for elements removed due to transparency or not rendered
|
||||||
gState.setFillColor(new ColorPt(0, 0, 1));
|
// gState.setFillColor(new ColorPt(0, 0, 1));
|
||||||
gState.setTextRenderMode(GState.e_fill_text);
|
// gState.setTextRenderMode(GState.e_fill_text);
|
||||||
gState.setFillOpacity(1);
|
// gState.setFillOpacity(1);
|
||||||
writer.writeElement(textElement);
|
// writer.writeElement(textElement);
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
|
//
|
||||||
private void processForm(Element formElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
// private void processForm(Element formElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
||||||
|
//
|
||||||
writer.writeElement(formElement);
|
// writer.writeElement(formElement);
|
||||||
Obj formObj = formElement.getXObject();
|
// Obj formObj = formElement.getXObject();
|
||||||
|
//
|
||||||
if (!context.visitedXObjIds().contains(formObj.getObjNum())) {
|
// if (!context.visitedXObjIds().contains(formObj.getObjNum())) {
|
||||||
context.visitedXObjIds().add(formObj.getObjNum());
|
// context.visitedXObjIds().add(formObj.getObjNum());
|
||||||
// writer needs to be newly initialized when entering a new content stream
|
// // writer needs to be newly initialized when entering a new content stream
|
||||||
// see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
|
// // see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
|
||||||
ElementWriter formWriter = new ElementWriter();
|
// ElementWriter formWriter = new ElementWriter();
|
||||||
context.reader().formBegin();
|
// context.reader().formBegin();
|
||||||
formWriter.begin(formObj);
|
// formWriter.begin(formObj);
|
||||||
|
//
|
||||||
context.reader().clearChangeList();
|
// context.reader().clearChangeList();
|
||||||
formWriter.setDefaultGState(context.reader());
|
// formWriter.setDefaultGState(context.reader());
|
||||||
|
//
|
||||||
processElements(formWriter, context);
|
// processElements(formWriter, context);
|
||||||
formWriter.end();
|
// formWriter.end();
|
||||||
formWriter.destroy();
|
// formWriter.destroy();
|
||||||
context.reader().end();
|
// context.reader().end();
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
|
//
|
||||||
private void processPath(Element pathElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
// private void processPath(Element pathElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
||||||
|
//
|
||||||
PathData pathData = pathElement.getPathData();
|
// PathData pathData = pathElement.getPathData();
|
||||||
|
//
|
||||||
if (pathData.getOperators().length == 0 && pathData.getPoints().length == 0) {
|
// if (pathData.getOperators().length == 0 && pathData.getPoints().length == 0) {
|
||||||
writer.writeGStateChanges(pathElement);
|
// writer.writeGStateChanges(pathElement);
|
||||||
return;
|
// return;
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
GeneralPath linePath = convertToGeneralPath(pathData);
|
// GeneralPath linePath = convertToGeneralPath(pathData);
|
||||||
|
//
|
||||||
//transform path to initial user space
|
// //transform path to initial user space
|
||||||
var ctm = pathElement.getCTM();
|
// var ctm = pathElement.getCTM();
|
||||||
var affineTransform = toAffineTransform(ctm);
|
// var affineTransform = toAffineTransform(ctm);
|
||||||
linePath.transform(affineTransform);
|
// linePath.transform(affineTransform);
|
||||||
|
//
|
||||||
var rect = linePath.getBounds2D();
|
// var rect = linePath.getBounds2D();
|
||||||
|
//
|
||||||
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX(), rect.getY(), rect.getWidth(), rect.getHeight());
|
// boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX(), rect.getY(), rect.getWidth(), rect.getHeight());
|
||||||
|
//
|
||||||
if (pathElement.isClippingPath()) {
|
// if (pathElement.isClippingPath()) {
|
||||||
if (pathElement.isClipWindingFill()) {
|
// if (pathElement.isClipWindingFill()) {
|
||||||
linePath.setWindingRule(GeneralPath.WIND_NON_ZERO);
|
// linePath.setWindingRule(GeneralPath.WIND_NON_ZERO);
|
||||||
} else {
|
// } else {
|
||||||
linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD);
|
// linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
context.clippingPathStack().intersectClippingPath(linePath);
|
// context.clippingPathStack().intersectClippingPath(linePath);
|
||||||
pathElement.setPathClip(!context.delta());
|
// pathElement.setPathClip(!context.delta());
|
||||||
writer.writeElement(pathElement);
|
// writer.writeElement(pathElement);
|
||||||
|
//
|
||||||
} else {
|
// } else {
|
||||||
if (pathElement.isWindingFill()) {
|
// if (pathElement.isWindingFill()) {
|
||||||
linePath.setWindingRule(GeneralPath.WIND_NON_ZERO);
|
// linePath.setWindingRule(GeneralPath.WIND_NON_ZERO);
|
||||||
} else {
|
// } else {
|
||||||
linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD);
|
// linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
if (inClippingPath) {
|
// if (inClippingPath) {
|
||||||
if (isFilledAndNonTransparent(pathElement)) {
|
// if (isFilledAndNonTransparent(pathElement)) {
|
||||||
List<ElementFeatures> currentOverlappedElements = context.visibleElements()
|
// List<ElementFeatures> currentOverlappedElements = context.visibleElements()
|
||||||
.stream()
|
// .stream()
|
||||||
.filter(features -> almostContains(linePath, features.getBoundingBox()))
|
// .filter(features -> almostContains(linePath, features.getBoundingBox()))
|
||||||
.toList();
|
// .toList();
|
||||||
context.overlappedElements().addAll(currentOverlappedElements);
|
// context.overlappedElements().addAll(currentOverlappedElements);
|
||||||
context.visibleElements().removeAll(currentOverlappedElements);
|
// context.visibleElements().removeAll(currentOverlappedElements);
|
||||||
}
|
// }
|
||||||
context.visibleElements().add(ElementFeatures.extractFeatures(pathElement));
|
// context.visibleElements().add(ElementFeatures.extractFeatures(pathElement));
|
||||||
if (!context.delta()) {
|
// if (!context.delta()) {
|
||||||
writer.writeElement(pathElement);
|
// writer.writeElement(pathElement);
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
if (context.delta() && !inClippingPath) {
|
// if (context.delta() && !inClippingPath) {
|
||||||
pathElement.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
|
// pathElement.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||||
pathElement.getGState().setFillColor(new ColorPt(1, 0, 0));
|
// pathElement.getGState().setFillColor(new ColorPt(1, 0, 0));
|
||||||
pathElement.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
// pathElement.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
||||||
pathElement.getGState().setStrokeColor(new ColorPt(1, 0, 0));
|
// pathElement.getGState().setStrokeColor(new ColorPt(1, 0, 0));
|
||||||
writer.writeElement(pathElement);
|
// writer.writeElement(pathElement);
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
|
//
|
||||||
private void removeOverlappedElements(Page page, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
// private void removeOverlappedElements(Page page, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
||||||
|
//
|
||||||
context.reader().begin(page);
|
// context.reader().begin(page);
|
||||||
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
|
// writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
|
||||||
if (context.delta()) {
|
// if (context.delta()) {
|
||||||
// green for element removed due to overlapping
|
// // green for element removed due to overlapping
|
||||||
context.overlappedElements().forEach(feature -> drawBBox(writer, feature.getBoundingBox(), "#00FF00"));
|
// context.overlappedElements().forEach(feature -> drawBBox(writer, feature.getBoundingBox(), "#00FF00"));
|
||||||
context.overlappedElements().clear();
|
// context.overlappedElements().clear();
|
||||||
}
|
// }
|
||||||
processOverlappedElements(writer, context);
|
// processOverlappedElements(writer, context);
|
||||||
writer.end();
|
// writer.end();
|
||||||
context.reader().end();
|
// context.reader().end();
|
||||||
|
//
|
||||||
if (context.overlappedElements().size() > 0) {
|
// if (context.overlappedElements().size() > 0) {
|
||||||
log.warn(context.overlappedElements().size() + " overlapped elements have not been found or removed");
|
// log.warn(context.overlappedElements().size() + " overlapped elements have not been found or removed");
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
|
//
|
||||||
private void processOverlappedElements(ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
// private void processOverlappedElements(ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
||||||
|
//
|
||||||
for (Element element = context.reader().next(); element != null; element = context.reader().next()) {
|
// for (Element element = context.reader().next(); element != null; element = context.reader().next()) {
|
||||||
switch (element.getType()) {
|
// switch (element.getType()) {
|
||||||
case Element.e_form -> processFormOverlappedElements(writer, element, context);
|
// case Element.e_form -> processFormOverlappedElements(writer, element, context);
|
||||||
case Element.e_path, Element.e_image, Element.e_inline_image, Element.e_text -> {
|
// case Element.e_path, Element.e_image, Element.e_inline_image, Element.e_text -> {
|
||||||
boolean anyMatch = false;
|
// boolean anyMatch = false;
|
||||||
for (ElementFeatures elementToRemove : context.overlappedElements()) {
|
// for (ElementFeatures elementToRemove : context.overlappedElements()) {
|
||||||
if (elementToRemove.almostMatches(element)) {
|
// if (elementToRemove.almostMatches(element)) {
|
||||||
context.overlappedElements().remove(elementToRemove);
|
// context.overlappedElements().remove(elementToRemove);
|
||||||
anyMatch = true;
|
// anyMatch = true;
|
||||||
break;
|
// break;
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
if (!anyMatch) {
|
// if (!anyMatch) {
|
||||||
writer.writeElement(element);
|
// writer.writeElement(element);
|
||||||
} else if (element.getType() == 3 && element.hasTextMatrix()) {
|
// } else if (element.getType() == 3 && element.hasTextMatrix()) {
|
||||||
/*
|
// /*
|
||||||
PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element.
|
// PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element.
|
||||||
hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands.
|
// hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands.
|
||||||
Therefore, the position of a following Tj is affected by not writing the first Element.
|
// Therefore, the position of a following Tj is affected by not writing the first Element.
|
||||||
This is why, we write only the Tm command:
|
// This is why, we write only the Tm command:
|
||||||
*/
|
// */
|
||||||
writer.writeGStateChanges(element);
|
// writer.writeGStateChanges(element);
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
default -> writer.writeElement(element);
|
// default -> writer.writeElement(element);
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
|
//
|
||||||
private void processFormOverlappedElements(ElementWriter writer, Element formElement, InvisibleElementRemovalContext context) throws PDFNetException {
|
// private void processFormOverlappedElements(ElementWriter writer, Element formElement, InvisibleElementRemovalContext context) throws PDFNetException {
|
||||||
|
//
|
||||||
writer.writeElement(formElement);
|
// writer.writeElement(formElement);
|
||||||
Obj formObj = formElement.getXObject();
|
// Obj formObj = formElement.getXObject();
|
||||||
|
//
|
||||||
if (!context.visitedXObjIds().contains(formObj.getObjNum())) {
|
// if (!context.visitedXObjIds().contains(formObj.getObjNum())) {
|
||||||
context.visitedXObjIds().add(formObj.getObjNum());
|
// context.visitedXObjIds().add(formObj.getObjNum());
|
||||||
// writer needs to be newly initialized when entering a new content stream
|
// // writer needs to be newly initialized when entering a new content stream
|
||||||
// see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
|
// // see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
|
||||||
ElementWriter formWriter = new ElementWriter();
|
// ElementWriter formWriter = new ElementWriter();
|
||||||
context.reader().formBegin();
|
// context.reader().formBegin();
|
||||||
formWriter.begin(formObj);
|
// formWriter.begin(formObj);
|
||||||
|
//
|
||||||
context.reader().clearChangeList();
|
// context.reader().clearChangeList();
|
||||||
formWriter.setDefaultGState(context.reader());
|
// formWriter.setDefaultGState(context.reader());
|
||||||
|
//
|
||||||
processOverlappedElements(formWriter, context);
|
// processOverlappedElements(formWriter, context);
|
||||||
formWriter.end();
|
// formWriter.end();
|
||||||
formWriter.destroy();
|
// formWriter.destroy();
|
||||||
context.reader().end();
|
// context.reader().end();
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
|
//
|
||||||
private boolean isTextRenderedVisibly(GState gState) throws PDFNetException {
|
// private boolean isTextRenderedVisibly(GState gState) throws PDFNetException {
|
||||||
|
//
|
||||||
return gState.getTextRenderMode() != GState.e_invisible_text && //
|
// return gState.getTextRenderMode() != GState.e_invisible_text && //
|
||||||
!(gState.getTextRenderMode() == GState.e_fill_text && gState.getFillOpacity() == 0) && //
|
// !(gState.getTextRenderMode() == GState.e_fill_text && gState.getFillOpacity() == 0) && //
|
||||||
!(gState.getTextRenderMode() == GState.e_stroke_text && gState.getStrokeOpacity() == 0) && //
|
// !(gState.getTextRenderMode() == GState.e_stroke_text && gState.getStrokeOpacity() == 0) && //
|
||||||
!(gState.getTextRenderMode() == GState.e_fill_stroke_text && gState.getFillOpacity() == 0 && gState.getStrokeOpacity() == 0);
|
// !(gState.getTextRenderMode() == GState.e_fill_stroke_text && gState.getFillOpacity() == 0 && gState.getStrokeOpacity() == 0);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
|
//
|
||||||
private GeneralPath convertToGeneralPath(PathData pathData) throws PDFNetException {
|
// private GeneralPath convertToGeneralPath(PathData pathData) throws PDFNetException {
|
||||||
|
//
|
||||||
GeneralPath linePath = new GeneralPath();
|
// GeneralPath linePath = new GeneralPath();
|
||||||
Iterator<Double> points = Doubles.asList(pathData.getPoints()).iterator();
|
// Iterator<Double> points = Doubles.asList(pathData.getPoints()).iterator();
|
||||||
Iterable<Byte> operators = Bytes.asList(pathData.getOperators());
|
// Iterable<Byte> operators = Bytes.asList(pathData.getOperators());
|
||||||
for (var operator : operators) {
|
// for (var operator : operators) {
|
||||||
switch (operator) {
|
// switch (operator) {
|
||||||
case PathData.e_moveto -> linePath.moveTo(points.next(), points.next());
|
// case PathData.e_moveto -> linePath.moveTo(points.next(), points.next());
|
||||||
case PathData.e_lineto -> linePath.lineTo(points.next(), points.next());
|
// case PathData.e_lineto -> linePath.lineTo(points.next(), points.next());
|
||||||
case PathData.e_cubicto -> linePath.curveTo(points.next(), points.next(), points.next(), points.next(), points.next(), points.next());
|
// case PathData.e_cubicto -> linePath.curveTo(points.next(), points.next(), points.next(), points.next(), points.next(), points.next());
|
||||||
case PathData.e_closepath -> linePath.closePath();
|
// case PathData.e_closepath -> linePath.closePath();
|
||||||
case PathData.e_rect -> {
|
// case PathData.e_rect -> {
|
||||||
double x = points.next();
|
// double x = points.next();
|
||||||
double y = points.next();
|
// double y = points.next();
|
||||||
double w = points.next();
|
// double w = points.next();
|
||||||
double h = points.next();
|
// double h = points.next();
|
||||||
linePath.moveTo(x, y);
|
// linePath.moveTo(x, y);
|
||||||
linePath.lineTo(x + w, y);
|
// linePath.lineTo(x + w, y);
|
||||||
linePath.lineTo(x + w, y + h);
|
// linePath.lineTo(x + w, y + h);
|
||||||
linePath.lineTo(x, y + h);
|
// linePath.lineTo(x, y + h);
|
||||||
linePath.closePath();
|
// linePath.closePath();
|
||||||
}
|
// }
|
||||||
default -> throw new PDFNetException("Invalid Element Type", 0, "", "", "");
|
// default -> throw new PDFNetException("Invalid Element Type", 0, "", "", "");
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
return linePath;
|
// return linePath;
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
|
//
|
||||||
private boolean almostContains(Shape outer, Rectangle2D inner) {
|
// private boolean almostContains(Shape outer, Rectangle2D inner) {
|
||||||
//To address inconsistencies in the calculation of the bounding box we slightly shrink the inner rectangle
|
// //To address inconsistencies in the calculation of the bounding box we slightly shrink the inner rectangle
|
||||||
|
//
|
||||||
double x_with_tolerance = inner.getX() >= 0 ? inner.getX() + TOLERANCE : inner.getX() - TOLERANCE;
|
// double x_with_tolerance = inner.getX() >= 0 ? inner.getX() + TOLERANCE : inner.getX() - TOLERANCE;
|
||||||
double y_with_tolerance = inner.getY() >= 0 ? inner.getY() + TOLERANCE : inner.getY() - TOLERANCE;
|
// double y_with_tolerance = inner.getY() >= 0 ? inner.getY() + TOLERANCE : inner.getY() - TOLERANCE;
|
||||||
double height_with_tolerance = inner.getHeight() - (2 * TOLERANCE);
|
// double height_with_tolerance = inner.getHeight() - (2 * TOLERANCE);
|
||||||
double width_with_tolerance = inner.getWidth() - (2 * TOLERANCE);
|
// double width_with_tolerance = inner.getWidth() - (2 * TOLERANCE);
|
||||||
Rectangle2D innerRect = new Rectangle2D.Double(x_with_tolerance, y_with_tolerance, width_with_tolerance, height_with_tolerance);
|
// Rectangle2D innerRect = new Rectangle2D.Double(x_with_tolerance, y_with_tolerance, width_with_tolerance, height_with_tolerance);
|
||||||
|
//
|
||||||
return outer.contains(innerRect);
|
// return outer.contains(innerRect);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
|
//
|
||||||
private boolean isFilledAndNonTransparent(Element element) throws PDFNetException {
|
// private boolean isFilledAndNonTransparent(Element element) throws PDFNetException {
|
||||||
|
//
|
||||||
return element.isFilled() && element.getGState().getFillOpacity() == 1;
|
// return element.isFilled() && element.getGState().getFillOpacity() == 1;
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
|
//
|
||||||
@SneakyThrows
|
// @SneakyThrows
|
||||||
private void drawBBox(ElementWriter writer, Rectangle2D r, String hexcolor) {
|
// private void drawBBox(ElementWriter writer, Rectangle2D r, String hexcolor) {
|
||||||
|
//
|
||||||
ColorPt colorPt = new ColorPt(Integer.valueOf(hexcolor.substring(1, 3), 16) / 255d,
|
// ColorPt colorPt = new ColorPt(Integer.valueOf(hexcolor.substring(1, 3), 16) / 255d,
|
||||||
Integer.valueOf(hexcolor.substring(3, 5), 16) / 255d,
|
// Integer.valueOf(hexcolor.substring(3, 5), 16) / 255d,
|
||||||
Integer.valueOf(hexcolor.substring(5, 7), 16) / 255d);
|
// Integer.valueOf(hexcolor.substring(5, 7), 16) / 255d);
|
||||||
ElementBuilder eb = new ElementBuilder();
|
// ElementBuilder eb = new ElementBuilder();
|
||||||
Element rect = eb.createRect(r.getX(), r.getY(), r.getWidth(), r.getHeight());
|
// Element rect = eb.createRect(r.getX(), r.getY(), r.getWidth(), r.getHeight());
|
||||||
rect.setPathStroke(true);
|
// rect.setPathStroke(true);
|
||||||
rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
// rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
||||||
rect.getGState().setStrokeColor(colorPt);
|
// rect.getGState().setStrokeColor(colorPt);
|
||||||
writer.writePlacedElement(rect);
|
// writer.writePlacedElement(rect);
|
||||||
|
//
|
||||||
colorPt.destroy();
|
// colorPt.destroy();
|
||||||
eb.destroy();
|
// eb.destroy();
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
|
//
|
||||||
private static AffineTransform toAffineTransform(Matrix2D ctm) throws PDFNetException {
|
// private static AffineTransform toAffineTransform(Matrix2D ctm) throws PDFNetException {
|
||||||
|
//
|
||||||
return new AffineTransform(ctm.getA(), ctm.getB(), ctm.getC(), ctm.getD(), ctm.getH(), ctm.getV());
|
// return new AffineTransform(ctm.getA(), ctm.getB(), ctm.getC(), ctm.getD(), ctm.getH(), ctm.getV());
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
|
//
|
||||||
@Builder
|
// @Builder
|
||||||
private record InvisibleElementRemovalContext(
|
// private record InvisibleElementRemovalContext(
|
||||||
boolean delta,
|
// boolean delta,
|
||||||
ElementReader reader,
|
// ElementReader reader,
|
||||||
ClippingPathStack clippingPathStack,
|
// ClippingPathStack clippingPathStack,
|
||||||
List<ElementFeatures> overlappedElements,
|
// List<ElementFeatures> overlappedElements,
|
||||||
List<ElementFeatures> visibleElements,
|
// List<ElementFeatures> visibleElements,
|
||||||
Set<Long> visitedXObjIds) {
|
// Set<Long> visitedXObjIds) {
|
||||||
|
//
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
}
|
//}
|
||||||
@ -14,6 +14,7 @@ import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
|||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService;
|
||||||
import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse;
|
import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse;
|
||||||
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
|
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
|
||||||
import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings;
|
import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings;
|
||||||
@ -69,10 +70,10 @@ public class OCRService {
|
|||||||
try (ByteArrayOutputStream transferOutputStream = new ByteArrayOutputStream()) {
|
try (ByteArrayOutputStream transferOutputStream = new ByteArrayOutputStream()) {
|
||||||
try (InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId)) {
|
try (InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId)) {
|
||||||
long removalStart = System.currentTimeMillis();
|
long removalStart = System.currentTimeMillis();
|
||||||
log.debug("Start invisible element removal for file with dossierId {} and fileId {}", dossierId, fileId);
|
log.info("Start invisible element removal for file with dossierId {} and fileId {}", dossierId, fileId);
|
||||||
invisibleElementRemovalService.removeInvisibleElements(fileStream, transferOutputStream, false);
|
invisibleElementRemovalService.removeInvisibleElements(fileStream, transferOutputStream, false);
|
||||||
long removalEnd = System.currentTimeMillis();
|
long removalEnd = System.currentTimeMillis();
|
||||||
log.debug("Invisible element removal successful for file with dossierId {} and fileId {}, took {}s",
|
log.info("Invisible element removal successful for file with dossierId {} and fileId {}, took {}s",
|
||||||
dossierId,
|
dossierId,
|
||||||
fileId,
|
fileId,
|
||||||
format("%.1f", (removalEnd - removalStart) / 1000.0));
|
format("%.1f", (removalEnd - removalStart) / 1000.0));
|
||||||
|
|||||||
@ -9,16 +9,18 @@ import java.io.FileOutputStream;
|
|||||||
|
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
import org.springframework.beans.factory.annotation.Autowired;
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
|
import org.springframework.context.annotation.Bean;
|
||||||
import org.springframework.core.io.ClassPathResource;
|
import org.springframework.core.io.ClassPathResource;
|
||||||
|
|
||||||
|
import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService;
|
||||||
import com.iqser.red.service.ocr.v1.server.AbstractTest;
|
import com.iqser.red.service.ocr.v1.server.AbstractTest;
|
||||||
|
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
|
|
||||||
public class InvisibleElementRemovalServiceTest extends AbstractTest {
|
public class InvisibleElementRemovalServiceTest extends AbstractTest {
|
||||||
|
|
||||||
@Autowired
|
@Autowired
|
||||||
private InvisibleElementRemovalService invisibleElementRemovalService;
|
private InvisibleElementRemovalService invisibleElementRemovalService;
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user