RED-6019: Remove hidden text when processing OCR
*code refactor *upgrade to java 17
This commit is contained in:
parent
fd7ec6e7aa
commit
99a0cb51d0
@ -86,16 +86,6 @@
|
||||
</dependencies>
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-compiler-plugin</artifactId>
|
||||
<configuration>
|
||||
<annotationProcessors>
|
||||
<annotationProcessor>lombok.launch.AnnotationProcessorHider$AnnotationProcessor</annotationProcessor>
|
||||
<annotationProcessor>com.dslplatform.json.processor.CompiledJsonAnnotationProcessor</annotationProcessor>
|
||||
</annotationProcessors>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<!-- generate git.properties for exposure in /info -->
|
||||
<groupId>pl.project13.maven</groupId>
|
||||
|
||||
@ -1,53 +1,67 @@
|
||||
package com.iqser.red.service.ocr.v1.server.model;
|
||||
|
||||
import com.pdftron.pdf.Rect;
|
||||
import lombok.Data;
|
||||
import lombok.SneakyThrows;
|
||||
import static com.iqser.red.service.ocr.v1.server.service.InvisibleElementRemovalService.TOLERANCE;
|
||||
|
||||
import java.awt.geom.Area;
|
||||
import java.awt.geom.GeneralPath;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayDeque;
|
||||
import java.util.Deque;
|
||||
import java.util.LinkedList;
|
||||
|
||||
import com.pdftron.pdf.Rect;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@Data
|
||||
public class ClippingPathStack {
|
||||
|
||||
private Deque<Area> stack = new ArrayDeque<>();
|
||||
private Deque<Area> stack = new LinkedList<>();
|
||||
|
||||
|
||||
/**
 * Creates a stack whose first (outermost) clipping area is the given rectangle.
 *
 * @param rectangle rectangle that becomes the initial clipping area
 */
@SneakyThrows
public ClippingPathStack(Rect rectangle) {

    Rectangle2D initialBounds = new Rectangle2D.Double(rectangle.getX1(), rectangle.getY1(), rectangle.getWidth(), rectangle.getHeight());
    stack.push(new Area(initialBounds.getBounds2D()));
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void intersectClippingPath(GeneralPath path) {
|
||||
|
||||
getCurrentClippingPath().intersect(new Area(path));
|
||||
}
|
||||
|
||||
|
||||
public boolean almostIntersects(double x, double y, double width, double height) {
|
||||
// To address inconsistencies in the calculation of the bounding box we slightly increase the rectangle
|
||||
// Height or width are zero for straight lines, even though they are being rendered. Therefore, height or width must be at minimum >0.
|
||||
double tolerance = 1e-3;
|
||||
double x_with_tolerance = x > 0 ? x - tolerance : x + tolerance;
|
||||
double y_with_tolerance = y > 0 ? y - tolerance : y + tolerance;
|
||||
double width_with_tolerance = width + 2 * tolerance;
|
||||
double height_with_tolerance = height + 2 * tolerance;
|
||||
|
||||
double x_with_tolerance = x > 0 ? x - TOLERANCE : x + TOLERANCE;
|
||||
double y_with_tolerance = y > 0 ? y - TOLERANCE : y + TOLERANCE;
|
||||
double width_with_tolerance = width + (2 * TOLERANCE);
|
||||
double height_with_tolerance = height + (2 * TOLERANCE);
|
||||
return getCurrentClippingPath().intersects(x_with_tolerance, y_with_tolerance, width_with_tolerance, height_with_tolerance);
|
||||
}
|
||||
|
||||
|
||||
public Area getCurrentClippingPath() {
|
||||
|
||||
return stack.peek();
|
||||
}
|
||||
|
||||
|
||||
public void enterNewGState() {
|
||||
|
||||
Area current = stack.peek();
|
||||
Area cloned = new Area();
|
||||
cloned.add(current);
|
||||
stack.push(cloned);
|
||||
}
|
||||
|
||||
|
||||
public void leaveGState() {
|
||||
|
||||
stack.pop();
|
||||
}
|
||||
|
||||
|
||||
@ -1,148 +1,170 @@
|
||||
package com.iqser.red.service.ocr.v1.server.model;
|
||||
|
||||
import static com.iqser.red.service.ocr.v1.server.service.InvisibleElementRemovalService.TOLERANCE;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.Rect;
|
||||
import lombok.*;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Getter;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.experimental.SuperBuilder;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
@Data
|
||||
@Getter
|
||||
@SuperBuilder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public abstract class ElementFeatures {
|
||||
private int elementType;
|
||||
private Rectangle2D boundingBox;
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class ElementFeatures {
|
||||
|
||||
int elementType;
|
||||
Rectangle2D boundingBox;
|
||||
|
||||
|
||||
public boolean almostMatches(Element element) throws PDFNetException {
|
||||
if (element.getType() != elementType) return false;
|
||||
if (element.getBBox() == null) return false;
|
||||
return rectsAlmostMatch(element.getBBox());
|
||||
|
||||
return element.getType() == elementType && //
|
||||
element.getBBox() != null && //
|
||||
rectsAlmostMatch(element.getBBox());
|
||||
}
|
||||
|
||||
|
||||
protected boolean almostEqual(double a, double b) {
|
||||
double tolerance = 1e-3;
|
||||
return Math.abs(a - b) < tolerance;
|
||||
|
||||
return Math.abs(a - b) < TOLERANCE;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private boolean rectsAlmostMatch(Rect bBox) {
|
||||
if (!almostEqual(bBox.getX1(), boundingBox.getX())) return false;
|
||||
if (!almostEqual(bBox.getY1(), boundingBox.getY())) return false;
|
||||
if (!almostEqual(bBox.getWidth(), boundingBox.getWidth())) return false;
|
||||
return almostEqual(bBox.getHeight(), boundingBox.getHeight());
|
||||
// To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance
|
||||
|
||||
return almostEqual(bBox.getX1(), boundingBox.getX()) && //
|
||||
almostEqual(bBox.getY1(), boundingBox.getY()) && //
|
||||
almostEqual(bBox.getWidth(), boundingBox.getWidth()) && //
|
||||
almostEqual(bBox.getHeight(), boundingBox.getHeight());
|
||||
}
|
||||
|
||||
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@Data
|
||||
@Getter
|
||||
@SuperBuilder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public static class Text extends ElementFeatures {
|
||||
private String text;
|
||||
private int font;
|
||||
private double fontsize;
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
private static class Text extends ElementFeatures {
|
||||
|
||||
String text;
|
||||
int font;
|
||||
double fontsize;
|
||||
|
||||
|
||||
@Override
|
||||
public boolean almostMatches(Element element) throws PDFNetException {
|
||||
if (!super.almostMatches(element)) return false;
|
||||
if (!text.equals(element.getTextString())) return false;
|
||||
if (font != element.getGState().getFont().getType()) return false;
|
||||
return almostEqual(fontsize, element.getGState().getFontSize());
|
||||
|
||||
return super.almostMatches(element) && //
|
||||
text.equals(element.getTextString()) && //
|
||||
font == element.getGState().getFont().getType() && //
|
||||
almostEqual(fontsize, element.getGState().getFontSize());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@Data
|
||||
@Getter
|
||||
@SuperBuilder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public static class Path extends ElementFeatures {
|
||||
private boolean isClippingPath;
|
||||
private boolean isClipWindingFill;
|
||||
private boolean isStroked;
|
||||
private boolean isFilled;
|
||||
private boolean isWindingFill;
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
private static class Path extends ElementFeatures {
|
||||
|
||||
boolean isClippingPath;
|
||||
boolean isClipWindingFill;
|
||||
boolean isStroked;
|
||||
boolean isFilled;
|
||||
boolean isWindingFill;
|
||||
|
||||
|
||||
@Override
|
||||
public boolean almostMatches(Element element) throws PDFNetException {
|
||||
if (!super.almostMatches(element)) return false;
|
||||
if (isClippingPath != element.isClippingPath()) return false;
|
||||
if (isClipWindingFill != element.isClipWindingFill()) return false;
|
||||
if (isStroked != element.isStroked()) return false;
|
||||
if (isFilled != element.isFilled()) return false;
|
||||
if (isWindingFill != element.isWindingFill()) return false;
|
||||
|
||||
return true;
|
||||
return super.almostMatches(element) && //
|
||||
isClippingPath == element.isClippingPath() && //
|
||||
isClipWindingFill == element.isClipWindingFill() && //
|
||||
isStroked == element.isStroked() && //
|
||||
isFilled == element.isFilled() && //
|
||||
isWindingFill == element.isWindingFill();
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@Data
|
||||
@Getter
|
||||
@SuperBuilder
|
||||
@NoArgsConstructor
|
||||
public static class Image extends ElementFeatures {
|
||||
private int dataSize;
|
||||
private int height;
|
||||
private int width;
|
||||
private int renderingIntent;
|
||||
private int componentNum;
|
||||
private int bitsPerComponent;
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
private static class Image extends ElementFeatures {
|
||||
|
||||
int dataSize;
|
||||
int height;
|
||||
int width;
|
||||
int renderingIntent;
|
||||
int componentNum;
|
||||
int bitsPerComponent;
|
||||
|
||||
|
||||
@Override
|
||||
public boolean almostMatches(Element element) throws PDFNetException {
|
||||
if (!super.almostMatches(element)) return false;
|
||||
if (dataSize != element.getImageDataSize()) return false;
|
||||
if (height != element.getImageHeight()) return false;
|
||||
if (width != element.getImageWidth()) return false;
|
||||
if (renderingIntent != element.getImageRenderingIntent()) return false;
|
||||
if (componentNum != element.getComponentNum()) return false;
|
||||
if (bitsPerComponent != element.getBitsPerComponent()) return false;
|
||||
return true;
|
||||
|
||||
return super.almostMatches(element) && //
|
||||
dataSize == element.getImageDataSize() && //
|
||||
height == element.getImageHeight() && //
|
||||
width == element.getImageWidth() && //
|
||||
renderingIntent == element.getImageRenderingIntent() && //
|
||||
componentNum == element.getComponentNum() && //
|
||||
bitsPerComponent == element.getBitsPerComponent();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
public static ElementFeatures extractFeatures(Element element) throws PDFNetException {
|
||||
switch (element.getType()) {
|
||||
case Element.e_path:
|
||||
return ElementFeatures.Path.builder()
|
||||
.elementType(element.getType())
|
||||
.boundingBox(toRectangle2D(element.getBBox()))
|
||||
.isClippingPath(element.isClippingPath())
|
||||
.isClipWindingFill(element.isClipWindingFill())
|
||||
.isStroked(element.isStroked())
|
||||
.isFilled(element.isFilled())
|
||||
.isWindingFill(element.isWindingFill())
|
||||
.build();
|
||||
case Element.e_text:
|
||||
return ElementFeatures.Text.builder()
|
||||
.elementType(element.getType())
|
||||
.boundingBox(toRectangle2D(element.getBBox()))
|
||||
.text(element.getTextString())
|
||||
.font(element.getGState().getFont().getType())
|
||||
.fontsize(element.getGState().getFontSize())
|
||||
.build();
|
||||
case Element.e_image:
|
||||
case Element.e_inline_image:
|
||||
return Image.builder()
|
||||
.elementType(element.getType())
|
||||
.boundingBox(toRectangle2D(element.getBBox()))
|
||||
.dataSize(element.getImageDataSize())
|
||||
.height(element.getImageHeight())
|
||||
.width(element.getImageWidth())
|
||||
.renderingIntent(element.getImageRenderingIntent())
|
||||
.componentNum(element.getComponentNum())
|
||||
.bitsPerComponent(element.getBitsPerComponent())
|
||||
.build();
|
||||
default:
|
||||
throw new UnsupportedOperationException("Feature Extraction is not supported for PDFTron.Element with type: " + element.getType());
|
||||
}
|
||||
|
||||
return switch (element.getType()) {
|
||||
case Element.e_path -> Path.builder()
|
||||
.elementType(element.getType())
|
||||
.boundingBox(toRectangle2D(element.getBBox()))
|
||||
.isClippingPath(element.isClippingPath())
|
||||
.isClipWindingFill(element.isClipWindingFill())
|
||||
.isStroked(element.isStroked())
|
||||
.isFilled(element.isFilled())
|
||||
.isWindingFill(element.isWindingFill())
|
||||
.build();
|
||||
case Element.e_text -> Text.builder()
|
||||
.elementType(element.getType())
|
||||
.boundingBox(toRectangle2D(element.getBBox()))
|
||||
.text(element.getTextString())
|
||||
.font(element.getGState().getFont().getType())
|
||||
.fontsize(element.getGState().getFontSize())
|
||||
.build();
|
||||
case Element.e_image, Element.e_inline_image -> Image.builder()
|
||||
.elementType(element.getType())
|
||||
.boundingBox(toRectangle2D(element.getBBox()))
|
||||
.dataSize(element.getImageDataSize())
|
||||
.height(element.getImageHeight())
|
||||
.width(element.getImageWidth())
|
||||
.renderingIntent(element.getImageRenderingIntent())
|
||||
.componentNum(element.getComponentNum())
|
||||
.bitsPerComponent(element.getBitsPerComponent())
|
||||
.build();
|
||||
// This technically should never happen, it's a safetynet
|
||||
default -> throw new RuntimeException("Feature Extraction is not supported for PDFTron.Element with type: " + element.getType());
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
private static Rectangle2D toRectangle2D(Rect rect) throws PDFNetException {
|
||||
|
||||
return new Rectangle2D.Double(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,25 @@
|
||||
package com.iqser.red.service.ocr.v1.server.model;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.pdftron.pdf.ElementReader;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
|
||||
public class InvisibleElementRemovalDto {
|
||||
|
||||
boolean delta;
|
||||
ElementReader reader;
|
||||
ClippingPathStack clippingPathStack;
|
||||
List<ElementFeatures> overlappedElements;
|
||||
List<ElementFeatures> visibleElements;
|
||||
Set<Long> visitedXObjIds;
|
||||
|
||||
}
|
||||
@ -0,0 +1,419 @@
|
||||
package com.iqser.red.service.ocr.v1.server.service;
|
||||
|
||||
import java.awt.Shape;
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.GeneralPath;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.google.common.primitives.Bytes;
|
||||
import com.google.common.primitives.Doubles;
|
||||
import com.iqser.red.service.ocr.v1.server.model.ClippingPathStack;
|
||||
import com.iqser.red.service.ocr.v1.server.model.ElementFeatures;
|
||||
import com.iqser.red.service.ocr.v1.server.model.InvisibleElementRemovalDto;
|
||||
import com.pdftron.common.Matrix2D;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.ColorPt;
|
||||
import com.pdftron.pdf.ColorSpace;
|
||||
import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.ElementBuilder;
|
||||
import com.pdftron.pdf.ElementReader;
|
||||
import com.pdftron.pdf.ElementWriter;
|
||||
import com.pdftron.pdf.GState;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.pdf.PageIterator;
|
||||
import com.pdftron.pdf.PathData;
|
||||
import com.pdftron.pdf.Rect;
|
||||
import com.pdftron.sdf.Obj;
|
||||
import com.pdftron.sdf.SDFDoc;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
public class InvisibleElementRemovalService {
|
||||
|
||||
/** Absolute tolerance applied when comparing or resizing bounding boxes. */
public static final double TOLERANCE = 1e-3;
|
||||
|
||||
|
||||
/**
|
||||
* Removes all hidden Text, Path and Image Elements from a PDF Document.
|
||||
* handled cases:
|
||||
* -Text which is transparent or is set to not render
|
||||
* -Elements outside of clipping path
|
||||
* -Elements that have been painted over by visible and filled Paths
|
||||
* unhandled cases:
|
||||
* -Elements covered by widely stroked path
|
||||
* -Elements with the same color as background
|
||||
* -Any Text set to clipping with its many interactions with other elements
|
||||
*
|
||||
* @param pdfFile The PDF file to process
|
||||
* @param delta If this flag is set only the removed Elements will be written to the output file.
|
||||
* The Elements are red if they are removed by clipping path, blue for transparency, and a green bounding box for overlap.
|
||||
* @return The resulting PDF File as bytes.
|
||||
**/
|
||||
@SneakyThrows
|
||||
public byte[] removeInvisibleElements(InputStream pdfFile, boolean delta) {
|
||||
|
||||
PDFDoc pdfDoc = new PDFDoc(pdfFile);
|
||||
|
||||
ElementWriter writer = new ElementWriter();
|
||||
ElementReader reader = new ElementReader();
|
||||
Set<Long> visitedXObjIds = new TreeSet<>();
|
||||
|
||||
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
|
||||
|
||||
Page page = iterator.next();
|
||||
|
||||
visitedXObjIds.add(page.getSDFObj().getObjNum());
|
||||
InvisibleElementRemovalDto dto = InvisibleElementRemovalDto.builder()
|
||||
.reader(reader)
|
||||
.clippingPathStack(new ClippingPathStack(page.getMediaBox()))
|
||||
.delta(delta)
|
||||
.overlappedElements(new ArrayList<>())
|
||||
.visibleElements(new ArrayList<>())
|
||||
.visitedXObjIds(visitedXObjIds)
|
||||
.build();
|
||||
|
||||
removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, dto);
|
||||
|
||||
dto.getVisitedXObjIds().clear();
|
||||
|
||||
removeOverlappedElements(page, writer, dto);
|
||||
}
|
||||
return pdfDoc.save(SDFDoc.SaveMode.REMOVE_UNUSED, null);
|
||||
}
|
||||
|
||||
|
||||
private void removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(Page page, ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException {
|
||||
|
||||
dto.getReader().begin(page);
|
||||
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
|
||||
processElements(writer, dto);
|
||||
writer.end();
|
||||
dto.getReader().end();
|
||||
}
|
||||
|
||||
|
||||
private void processElements(ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException {
|
||||
|
||||
for (Element element = dto.getReader().next(); element != null; element = dto.getReader().next())
|
||||
switch (element.getType()) {
|
||||
case Element.e_image, Element.e_inline_image -> processImages(element, writer, dto);
|
||||
case Element.e_text -> processText(element, writer, dto);
|
||||
case Element.e_path -> processPath(element, writer, dto);
|
||||
case Element.e_form -> processForm(element, writer, dto);
|
||||
case Element.e_group_begin -> {
|
||||
dto.getClippingPathStack().enterNewGState();
|
||||
writer.writeElement(element);
|
||||
}
|
||||
case Element.e_group_end -> {
|
||||
dto.getClippingPathStack().leaveGState();
|
||||
writer.writeElement(element);
|
||||
}
|
||||
default -> writer.writeElement(element);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void processImages(Element imageElement, ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException {
|
||||
|
||||
Rect rect = imageElement.getBBox();
|
||||
|
||||
if (rect == null) {
|
||||
return;
|
||||
}
|
||||
|
||||
boolean inClippingPath = dto.getClippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
|
||||
|
||||
if (!dto.isDelta() && inClippingPath) {
|
||||
dto.getVisibleElements().add(ElementFeatures.extractFeatures(imageElement));
|
||||
}
|
||||
|
||||
if (dto.isDelta() ^ inClippingPath) {
|
||||
writer.writeElement(imageElement);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void processText(Element textElement, ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException {
|
||||
|
||||
Rect rect = textElement.getBBox();
|
||||
|
||||
if (rect == null) {
|
||||
writer.writeElement(textElement);
|
||||
return;
|
||||
}
|
||||
|
||||
GState gState = textElement.getGState();
|
||||
|
||||
boolean inClippingPath = dto.getClippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
|
||||
|
||||
boolean isTextVisible = isTextRenderedVisibly(gState);
|
||||
|
||||
if (inClippingPath && isTextVisible) {
|
||||
dto.getVisibleElements().add(ElementFeatures.extractFeatures(textElement));
|
||||
}
|
||||
if (!dto.isDelta()) {
|
||||
if (inClippingPath && isTextVisible) {
|
||||
writer.writeElement(textElement);
|
||||
} else if (textElement.hasTextMatrix()) {
|
||||
/*
|
||||
PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element.
|
||||
hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands.
|
||||
Therefore, the position of a following Tj is affected by not writing the first Element.
|
||||
This is why, we write only the Tm command:
|
||||
*/
|
||||
writer.writeGStateChanges(textElement);
|
||||
}
|
||||
} else {
|
||||
if (!inClippingPath) {
|
||||
gState.setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||
gState.setFillColor(new ColorPt(1, 0, 0));
|
||||
writer.writeElement(textElement);
|
||||
}
|
||||
if (!isTextVisible) {
|
||||
gState.setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||
gState.setFillColor(new ColorPt(0, 0, 1));
|
||||
gState.setTextRenderMode(GState.e_fill_text);
|
||||
gState.setFillOpacity(1);
|
||||
writer.writeElement(textElement);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void processForm(Element formElement, ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException {
|
||||
|
||||
writer.writeElement(formElement);
|
||||
Obj formObj = formElement.getXObject();
|
||||
|
||||
if (!dto.getVisitedXObjIds().contains(formObj.getObjNum())) {
|
||||
dto.getVisitedXObjIds().add(formObj.getObjNum());
|
||||
// writer needs to be newly initialized when entering a new content stream
|
||||
// see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
|
||||
ElementWriter formWriter = new ElementWriter();
|
||||
dto.getReader().formBegin();
|
||||
formWriter.begin(formObj);
|
||||
|
||||
dto.getReader().clearChangeList();
|
||||
formWriter.setDefaultGState(dto.getReader());
|
||||
|
||||
processElements(formWriter, dto);
|
||||
formWriter.end();
|
||||
dto.getReader().end();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void processPath(Element pathElement, ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException {
|
||||
|
||||
GeneralPath linePath = convertToGeneralPath(pathElement.getPathData());
|
||||
|
||||
//transform path to initial user space
|
||||
var ctm = pathElement.getCTM();
|
||||
var affineTransform = getAffineTransform(ctm);
|
||||
linePath.transform(affineTransform);
|
||||
|
||||
var rect = linePath.getBounds2D();
|
||||
|
||||
boolean inClippingPath = dto.getClippingPathStack().almostIntersects(rect.getX(), rect.getY(), rect.getWidth(), rect.getHeight());
|
||||
|
||||
if (pathElement.isClippingPath()) {
|
||||
if (pathElement.isClipWindingFill()) {
|
||||
linePath.setWindingRule(GeneralPath.WIND_NON_ZERO);
|
||||
} else {
|
||||
linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD);
|
||||
}
|
||||
|
||||
dto.getClippingPathStack().intersectClippingPath(linePath);
|
||||
pathElement.setPathClip(!dto.isDelta());
|
||||
writer.writeElement(pathElement);
|
||||
|
||||
} else {
|
||||
if (inClippingPath) {
|
||||
// TODO: WINDING RULE
|
||||
if (isFilledAndNonTransparent(pathElement)) {
|
||||
List<ElementFeatures> currentOverlappedElements = dto.getVisibleElements()
|
||||
.stream()
|
||||
.filter(features -> almostContains(linePath, features.getBoundingBox()))
|
||||
.toList();
|
||||
dto.getOverlappedElements().addAll(currentOverlappedElements);
|
||||
dto.getVisibleElements().removeAll(currentOverlappedElements);
|
||||
}
|
||||
dto.getVisibleElements().add(ElementFeatures.extractFeatures(pathElement));
|
||||
if (!dto.isDelta()) {
|
||||
writer.writeElement(pathElement);
|
||||
}
|
||||
}
|
||||
if (dto.isDelta() && !inClippingPath) {
|
||||
pathElement.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||
pathElement.getGState().setFillColor(new ColorPt(1, 0, 0));
|
||||
pathElement.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
||||
pathElement.getGState().setStrokeColor(new ColorPt(1, 0, 0));
|
||||
writer.writeElement(pathElement);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static AffineTransform getAffineTransform(Matrix2D ctm) throws PDFNetException {
|
||||
|
||||
return new AffineTransform(ctm.getA(), ctm.getB(), ctm.getC(), ctm.getD(), ctm.getH(), ctm.getV());
|
||||
}
|
||||
|
||||
|
||||
private void removeOverlappedElements(Page page, ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException {
|
||||
|
||||
dto.getReader().begin(page);
|
||||
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
|
||||
if (dto.isDelta()) {
|
||||
dto.getOverlappedElements().forEach(feature -> drawBBox(writer, feature.getBoundingBox(), "#00FF00"));
|
||||
dto.getOverlappedElements().clear();
|
||||
}
|
||||
processOverlappedElements(writer, dto);
|
||||
writer.end();
|
||||
dto.getReader().end();
|
||||
|
||||
if (dto.getOverlappedElements().size() > 0) {
|
||||
log.warn(dto.getOverlappedElements().size() + " overlapped elements have not been found or removed");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void processOverlappedElements(ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException {
|
||||
|
||||
for (Element element = dto.getReader().next(); element != null; element = dto.getReader().next()) {
|
||||
switch (element.getType()) {
|
||||
case Element.e_form -> processFormOverlappedElements(writer, element, dto);
|
||||
case Element.e_path, Element.e_image, Element.e_inline_image, Element.e_text -> {
|
||||
boolean anyMatch = false;
|
||||
for (ElementFeatures elementToRemove : dto.getOverlappedElements()) {
|
||||
if (elementToRemove.almostMatches(element)) {
|
||||
dto.getOverlappedElements().remove(elementToRemove);
|
||||
anyMatch = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!anyMatch) {
|
||||
writer.writeElement(element);
|
||||
} else if (element.getType() == 3 && element.hasTextMatrix()) {
|
||||
/*
|
||||
PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element.
|
||||
hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands.
|
||||
Therefore, the position of a following Tj is affected by not writing the first Element.
|
||||
This is why, we write only the Tm command:
|
||||
*/
|
||||
writer.writeGStateChanges(element);
|
||||
}
|
||||
}
|
||||
default -> writer.writeElement(element);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void processFormOverlappedElements(ElementWriter writer, Element formElement, InvisibleElementRemovalDto dto) throws PDFNetException {
|
||||
|
||||
writer.writeElement(formElement);
|
||||
Obj formObj = formElement.getXObject();
|
||||
|
||||
if (!dto.getVisitedXObjIds().contains(formObj.getObjNum())) {
|
||||
dto.getVisitedXObjIds().add(formObj.getObjNum());
|
||||
// writer needs to be newly initialized when entering a new content stream
|
||||
// see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
|
||||
ElementWriter formWriter = new ElementWriter();
|
||||
dto.getReader().formBegin();
|
||||
formWriter.begin(formObj);
|
||||
|
||||
dto.getReader().clearChangeList();
|
||||
formWriter.setDefaultGState(dto.getReader());
|
||||
|
||||
processOverlappedElements(formWriter, dto);
|
||||
formWriter.end();
|
||||
dto.getReader().end();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private boolean isTextRenderedVisibly(GState gState) throws PDFNetException {
|
||||
|
||||
return gState.getTextRenderMode() != GState.e_invisible_text && //
|
||||
!(gState.getTextRenderMode() == GState.e_fill_text && gState.getFillOpacity() == 0) && //
|
||||
!(gState.getTextRenderMode() == GState.e_stroke_text && gState.getStrokeOpacity() == 0) && //
|
||||
!(gState.getTextRenderMode() == GState.e_fill_stroke_text && gState.getFillOpacity() == 0 && gState.getStrokeOpacity() == 0);
|
||||
}
|
||||
|
||||
|
||||
private GeneralPath convertToGeneralPath(PathData pathData) throws PDFNetException {
|
||||
|
||||
GeneralPath linePath = new GeneralPath();
|
||||
Iterator<Double> points = Doubles.asList(pathData.getPoints()).iterator();
|
||||
Iterable<Byte> operators = Bytes.asList(pathData.getOperators());
|
||||
for (var operator : operators) {
|
||||
switch (operator) {
|
||||
case PathData.e_moveto -> linePath.moveTo(points.next(), points.next());
|
||||
case PathData.e_lineto -> linePath.lineTo(points.next(), points.next());
|
||||
case PathData.e_cubicto -> linePath.curveTo(points.next(), points.next(), points.next(), points.next(), points.next(), points.next());
|
||||
case PathData.e_closepath -> linePath.closePath();
|
||||
case PathData.e_rect -> {
|
||||
double x = points.next();
|
||||
double y = points.next();
|
||||
double w = points.next();
|
||||
double h = points.next();
|
||||
linePath.moveTo(x, y);
|
||||
linePath.lineTo(x + w, y);
|
||||
linePath.lineTo(x + w, y + h);
|
||||
linePath.lineTo(x, y + h);
|
||||
linePath.closePath();
|
||||
}
|
||||
default -> throw new PDFNetException("Invalid Element Type", 0, "", "", "");
|
||||
}
|
||||
}
|
||||
return linePath;
|
||||
}
|
||||
|
||||
|
||||
private boolean almostContains(Shape outer, Rectangle2D inner) {
|
||||
//To address inconsistencies in the calculation of the bounding box we slightly shrink the inner rectangle
|
||||
|
||||
double x_with_tolerance = inner.getX() >= 0 ? inner.getX() + TOLERANCE : inner.getX() - TOLERANCE;
|
||||
double y_with_tolerance = inner.getY() >= 0 ? inner.getY() + TOLERANCE : inner.getY() - TOLERANCE;
|
||||
double height_with_tolerance = inner.getHeight() - (2 * TOLERANCE);
|
||||
double width_with_tolerance = inner.getWidth() - (2 * TOLERANCE);
|
||||
Rectangle2D innerRect = new Rectangle2D.Double(x_with_tolerance, y_with_tolerance, width_with_tolerance, height_with_tolerance);
|
||||
|
||||
return outer.contains(innerRect);
|
||||
}
|
||||
|
||||
|
||||
private boolean isFilledAndNonTransparent(Element element) throws PDFNetException {
|
||||
|
||||
return element.isFilled() && element.getGState().getFillOpacity() == 1;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void drawBBox(ElementWriter writer, Rectangle2D r, String hexcolor) {
|
||||
|
||||
ColorPt colorPt = new ColorPt(Integer.valueOf(hexcolor.substring(1, 3), 16) / 255d,
|
||||
Integer.valueOf(hexcolor.substring(3, 5), 16) / 255d,
|
||||
Integer.valueOf(hexcolor.substring(5, 7), 16) / 255d);
|
||||
ElementBuilder eb = new ElementBuilder();
|
||||
Element rect = eb.createRect(r.getX(), r.getY(), r.getWidth(), r.getHeight());
|
||||
rect.setPathStroke(true);
|
||||
rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
||||
rect.getGState().setStrokeColor(colorPt);
|
||||
writer.writePlacedElement(rect);
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,448 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.service;
|
||||
|
||||
import com.iqser.red.service.ocr.v1.server.model.ClippingPathStack;
|
||||
import com.iqser.red.service.ocr.v1.server.model.ElementFeatures;
|
||||
import com.pdftron.common.Matrix2D;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.*;
|
||||
import com.pdftron.sdf.Obj;
|
||||
import com.pdftron.sdf.SDFDoc;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.awt.*;
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.GeneralPath;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
public class InvisibleElementService {
|
||||
|
||||
/*
|
||||
handled cases:
|
||||
Text which is transparent or is set to not render
|
||||
Text or Path or Images outside of clipping path
|
||||
Text or Path or Images that have been painted over by visible and filled Paths
|
||||
unhandled cases:
|
||||
Text covered by widely stroked path
|
||||
Text same color as background
|
||||
Any Text set to clipping with its many interactions with other elements
|
||||
*/
|
||||
@SneakyThrows
|
||||
public byte[] removeInvisibleElements(byte[] pdfFile, boolean delta) {
|
||||
PDFDoc pdfDoc = new PDFDoc(pdfFile);
|
||||
|
||||
ElementWriter writer = new ElementWriter();
|
||||
ElementReader reader = new ElementReader();
|
||||
Set<Integer> visited = new TreeSet<>();
|
||||
|
||||
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
|
||||
Page page = iterator.next();
|
||||
List<ElementFeatures> overlappedElements = removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, reader, writer, visited, delta);
|
||||
visited.clear();
|
||||
removeOverlappedElements(page, reader, writer, visited, overlappedElements, delta);
|
||||
}
|
||||
|
||||
return pdfDoc.save(SDFDoc.SaveMode.REMOVE_UNUSED, null);
|
||||
}
|
||||
|
||||
|
||||
private List<ElementFeatures> removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(Page page, ElementReader reader, ElementWriter writer, Set<Integer> visited, boolean delta)
|
||||
throws PDFNetException {
|
||||
var overlappedElements = new ArrayList<ElementFeatures>();
|
||||
var visibleElements = new ArrayList<ElementFeatures>();
|
||||
ClippingPathStack clippingPathStack = new ClippingPathStack(page.getMediaBox());
|
||||
visited.add((int) page.getSDFObj().getObjNum());
|
||||
reader.begin(page);
|
||||
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
|
||||
processElements(reader, writer, visited, clippingPathStack, delta, overlappedElements, visibleElements);
|
||||
writer.end();
|
||||
reader.end();
|
||||
return overlappedElements;
|
||||
}
|
||||
|
||||
private void processElements(ElementReader reader, ElementWriter writer, Set<Integer> visited, ClippingPathStack clippingPathStack, Boolean delta,
|
||||
List<ElementFeatures> coveredElements, List<ElementFeatures> visibleElements)
|
||||
throws PDFNetException {
|
||||
|
||||
for (Element element = reader.next(); element != null; element = reader.next())
|
||||
switch (element.getType()) {
|
||||
|
||||
case Element.e_image:
|
||||
case Element.e_inline_image:
|
||||
processImages(element, writer, clippingPathStack, delta, visibleElements);
|
||||
break;
|
||||
|
||||
case Element.e_text:
|
||||
processText(element, writer, clippingPathStack, delta, visibleElements);
|
||||
break;
|
||||
|
||||
case Element.e_path:
|
||||
processPath(element, writer, clippingPathStack, delta, coveredElements, visibleElements);
|
||||
break;
|
||||
|
||||
case Element.e_form:
|
||||
processForm(reader, writer, element, visited, clippingPathStack, delta, coveredElements, visibleElements);
|
||||
break;
|
||||
|
||||
case Element.e_group_begin:
|
||||
clippingPathStack.enterNewGState();
|
||||
writer.writeElement(element);
|
||||
break;
|
||||
|
||||
case Element.e_group_end:
|
||||
clippingPathStack.leaveGState();
|
||||
writer.writeElement(element);
|
||||
break;
|
||||
|
||||
default:
|
||||
writer.writeElement(element);
|
||||
}
|
||||
}
|
||||
|
||||
private void processImages(Element imageElement, ElementWriter writer, ClippingPathStack clippingPathStack, Boolean delta, List<ElementFeatures> visibleElements)
|
||||
throws PDFNetException {
|
||||
|
||||
Rect rect = imageElement.getBBox();
|
||||
|
||||
if (rect == null) {
|
||||
return;
|
||||
}
|
||||
|
||||
boolean inClippingPath = clippingPathStack.almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
|
||||
|
||||
if (!delta && inClippingPath) {
|
||||
visibleElements.add(ElementFeatures.extractFeatures(imageElement));
|
||||
writer.writeElement(imageElement);
|
||||
}
|
||||
|
||||
if (delta && !inClippingPath) {
|
||||
writer.writeElement(imageElement);
|
||||
}
|
||||
}
|
||||
|
||||
private void processText(Element textElement, ElementWriter writer, ClippingPathStack clippingPathStack,
|
||||
Boolean delta, List<ElementFeatures> visibleElements)
|
||||
throws PDFNetException {
|
||||
|
||||
Rect rect = textElement.getBBox();
|
||||
|
||||
if (rect == null) {
|
||||
writer.writeElement(textElement);
|
||||
return;
|
||||
}
|
||||
|
||||
GState gState = textElement.getGState();
|
||||
|
||||
boolean inClippingPath = clippingPathStack.almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
|
||||
|
||||
boolean isTextVisible = isTextRenderedVisibly(gState);
|
||||
|
||||
if (inClippingPath && isTextVisible) {
|
||||
visibleElements.add(ElementFeatures.extractFeatures(textElement));
|
||||
}
|
||||
if (!delta) {
|
||||
if (inClippingPath && isTextVisible) {
|
||||
writer.writeElement(textElement);
|
||||
} else if (textElement.hasTextMatrix()) {
|
||||
/*
|
||||
PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element.
|
||||
hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands.
|
||||
Therefore, the position of a following Tj is affected by not writing the first Element.
|
||||
This is why, we write only the Tm command:
|
||||
*/
|
||||
writer.writeGStateChanges(textElement);
|
||||
}
|
||||
} else {
|
||||
if (!inClippingPath) {
|
||||
gState.setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||
gState.setFillColor(new ColorPt(1, 0, 0));
|
||||
writer.writeElement(textElement);
|
||||
}
|
||||
if (!isTextVisible) {
|
||||
gState.setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||
gState.setFillColor(new ColorPt(0, 0, 1));
|
||||
gState.setTextRenderMode(GState.e_fill_text);
|
||||
gState.setFillOpacity(1);
|
||||
writer.writeElement(textElement);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void processForm(ElementReader reader, ElementWriter writer, Element element, Set<Integer> visited, ClippingPathStack clippingPathStack, Boolean delta,
|
||||
List<ElementFeatures> coveredElements, List<ElementFeatures> allElements)
|
||||
throws PDFNetException {
|
||||
|
||||
writer.writeElement(element);
|
||||
Obj formObj = element.getXObject();
|
||||
|
||||
if (!visited.contains((int) formObj.getObjNum())) {
|
||||
visited.add((int) formObj.getObjNum());
|
||||
ElementWriter new_writer = new ElementWriter();
|
||||
reader.formBegin();
|
||||
new_writer.begin(formObj);
|
||||
|
||||
reader.clearChangeList();
|
||||
new_writer.setDefaultGState(reader);
|
||||
|
||||
processElements(reader, new_writer, visited, clippingPathStack, delta, coveredElements, allElements);
|
||||
new_writer.end();
|
||||
reader.end();
|
||||
}
|
||||
}
|
||||
|
||||
private void processPath(Element pathElement, ElementWriter writer, ClippingPathStack clippingPathStack, Boolean delta,
|
||||
List<ElementFeatures> overlappedElements, List<ElementFeatures> visibleElements)
|
||||
throws PDFNetException {
|
||||
|
||||
GeneralPath linePath = convertToGeneralPath(pathElement.getPathData());
|
||||
|
||||
//transform path to initial user space
|
||||
var ctm = pathElement.getCTM();
|
||||
var affineTransform = getAffineTransform(ctm);
|
||||
linePath.transform(affineTransform);
|
||||
|
||||
var rect = linePath.getBounds2D();
|
||||
|
||||
boolean inClippingPath = clippingPathStack.almostIntersects(rect.getX(), rect.getY(), rect.getWidth(), rect.getHeight());
|
||||
|
||||
if (pathElement.isClippingPath()) {
|
||||
if (pathElement.isClipWindingFill()) {
|
||||
linePath.setWindingRule(GeneralPath.WIND_NON_ZERO);
|
||||
} else {
|
||||
linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD);
|
||||
}
|
||||
|
||||
clippingPathStack.intersectClippingPath(linePath);
|
||||
pathElement.setPathClip(!delta);
|
||||
writer.writeElement(pathElement);
|
||||
|
||||
} else {
|
||||
if (inClippingPath) {
|
||||
if (isFilledAndNonTransparent(pathElement)) {
|
||||
List<ElementFeatures> currentOverlappedElements = visibleElements.stream()
|
||||
.filter(features -> almostContains(linePath, features.getBoundingBox()))
|
||||
.collect(Collectors.toList());
|
||||
overlappedElements.addAll(currentOverlappedElements);
|
||||
visibleElements.removeAll(currentOverlappedElements);
|
||||
}
|
||||
visibleElements.add(ElementFeatures.extractFeatures(pathElement));
|
||||
if (!delta) {
|
||||
writer.writeElement(pathElement);
|
||||
}
|
||||
}
|
||||
if (delta && !inClippingPath) {
|
||||
pathElement.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||
pathElement.getGState().setFillColor(new ColorPt(1, 0, 0));
|
||||
pathElement.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
||||
pathElement.getGState().setStrokeColor(new ColorPt(1, 0, 0));
|
||||
writer.writeElement(pathElement);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static AffineTransform getAffineTransform(Matrix2D ctm) throws PDFNetException {
|
||||
return new AffineTransform(ctm.getA(), ctm.getB(), ctm.getC(), ctm.getD(), ctm.getH(), ctm.getV());
|
||||
}
|
||||
|
||||
private void removeOverlappedElements(Page page, ElementReader reader, ElementWriter writer, Set<Integer> visited, List<ElementFeatures> overlappedElements, boolean delta)
|
||||
throws PDFNetException {
|
||||
reader.begin(page);
|
||||
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
|
||||
if (delta) {
|
||||
overlappedElements.forEach(feature -> drawBBox(writer, feature.getBoundingBox(), "#00FF00"));
|
||||
overlappedElements.clear();
|
||||
}
|
||||
processOverlappedElements(reader, writer, visited, overlappedElements, delta);
|
||||
writer.end();
|
||||
reader.end();
|
||||
|
||||
if (overlappedElements.size() > 0) {
|
||||
log.warn(overlappedElements.size() + " overlapped elements have not been found and removed");
|
||||
}
|
||||
}
|
||||
|
||||
private void processOverlappedElements(ElementReader reader, ElementWriter writer, Set<Integer> visited, List<ElementFeatures> coveredElements, boolean delta)
|
||||
throws PDFNetException {
|
||||
for (Element element = reader.next(); element != null; element = reader.next()) {
|
||||
switch (element.getType()) {
|
||||
case Element.e_form:
|
||||
processFormOverlappedElements(reader, writer, element, visited, coveredElements, delta);
|
||||
break;
|
||||
case Element.e_path:
|
||||
case Element.e_image:
|
||||
case Element.e_inline_image:
|
||||
case Element.e_text:
|
||||
boolean anyMatch = false;
|
||||
for (ElementFeatures elementToRemove : coveredElements) {
|
||||
if (elementToRemove.almostMatches(element)) {
|
||||
coveredElements.remove(elementToRemove);
|
||||
anyMatch = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!anyMatch) {
|
||||
writer.writeElement(element);
|
||||
} else if (element.getType() == 3 && element.hasTextMatrix()) {
|
||||
/*
|
||||
PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element.
|
||||
hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands.
|
||||
Therefore, the position of a following Tj is affected by not writing the first Element.
|
||||
This is why, we write only the Tm command:
|
||||
*/
|
||||
writer.writeGStateChanges(element);
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
writer.writeElement(element);
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void processFormOverlappedElements(ElementReader reader, ElementWriter writer, Element element, Set<Integer> visited, List<ElementFeatures> elementsToRemove, boolean delta)
|
||||
throws PDFNetException {
|
||||
|
||||
writer.writeElement(element);
|
||||
Obj formObj = element.getXObject();
|
||||
|
||||
if (!visited.contains((int) formObj.getObjNum())) {
|
||||
visited.add((int) formObj.getObjNum());
|
||||
ElementWriter new_writer = new ElementWriter();
|
||||
reader.formBegin();
|
||||
new_writer.begin(formObj);
|
||||
|
||||
reader.clearChangeList();
|
||||
new_writer.setDefaultGState(reader);
|
||||
|
||||
processOverlappedElements(reader, new_writer, visited, elementsToRemove, delta);
|
||||
new_writer.end();
|
||||
reader.end();
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isTextRenderedVisibly(GState gState) throws PDFNetException {
|
||||
if (gState.getTextRenderMode() == GState.e_invisible_text) return false;
|
||||
if (gState.getTextRenderMode() == GState.e_fill_text && gState.getFillOpacity() == 0) return false;
|
||||
if (gState.getTextRenderMode() == GState.e_stroke_text && gState.getStrokeOpacity() == 0) return false;
|
||||
if (gState.getTextRenderMode() == GState.e_fill_stroke_text && gState.getFillOpacity() == 0 && gState.getStrokeOpacity() == 0) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
private GeneralPath convertToGeneralPath(PathData pathData) throws PDFNetException {
|
||||
GeneralPath linePath = new GeneralPath();
|
||||
|
||||
double[] dataPoints = pathData.getPoints();
|
||||
byte[] opr = pathData.getOperators();
|
||||
|
||||
double x1;
|
||||
double y1;
|
||||
double x2;
|
||||
double y2;
|
||||
double x3;
|
||||
double y3;
|
||||
|
||||
int data_index = 0;
|
||||
for (int opr_index = 0; opr_index < opr.length; ++opr_index) {
|
||||
switch (opr[opr_index]) {
|
||||
case PathData.e_moveto:
|
||||
x1 = dataPoints[data_index];
|
||||
++data_index;
|
||||
y1 = dataPoints[data_index];
|
||||
++data_index;
|
||||
|
||||
linePath.moveTo(x1, y1);
|
||||
break;
|
||||
case PathData.e_lineto:
|
||||
x1 = dataPoints[data_index];
|
||||
++data_index;
|
||||
y1 = dataPoints[data_index];
|
||||
++data_index;
|
||||
|
||||
linePath.lineTo(x1, y1);
|
||||
break;
|
||||
case PathData.e_cubicto:
|
||||
x1 = dataPoints[data_index];
|
||||
++data_index;
|
||||
y1 = dataPoints[data_index];
|
||||
++data_index;
|
||||
x2 = dataPoints[data_index];
|
||||
++data_index;
|
||||
y2 = dataPoints[data_index];
|
||||
++data_index;
|
||||
x3 = dataPoints[data_index];
|
||||
++data_index;
|
||||
y3 = dataPoints[data_index];
|
||||
++data_index;
|
||||
|
||||
linePath.curveTo(x1, y1, x2, y2, x3, y3);
|
||||
break;
|
||||
case PathData.e_rect:
|
||||
x1 = dataPoints[data_index];
|
||||
++data_index;
|
||||
y1 = dataPoints[data_index];
|
||||
++data_index;
|
||||
double w = dataPoints[data_index];
|
||||
++data_index;
|
||||
double h = dataPoints[data_index];
|
||||
++data_index;
|
||||
x2 = x1 + w;
|
||||
y2 = y1;
|
||||
x3 = x2;
|
||||
y3 = y1 + h;
|
||||
double x4 = x1;
|
||||
double y4 = y3;
|
||||
|
||||
linePath.moveTo(x1, y1);
|
||||
linePath.lineTo(x2, y2);
|
||||
linePath.lineTo(x3, y3);
|
||||
linePath.lineTo(x4, y4);
|
||||
break;
|
||||
case PathData.e_closepath:
|
||||
linePath.closePath();
|
||||
break;
|
||||
default:
|
||||
throw new PDFNetException("Invalid Element Type", 0, "", "", "");
|
||||
}
|
||||
}
|
||||
return linePath;
|
||||
}
|
||||
|
||||
private boolean almostContains(Shape outer, Rectangle2D inner) {
|
||||
double tolerance = 1e-3;
|
||||
|
||||
double x_with_tolerance = inner.getX() >= 0 ? inner.getX() + tolerance : inner.getX() - tolerance;
|
||||
double y_with_tolerance = inner.getY() >= 0 ? inner.getY() + tolerance : inner.getY() - tolerance;
|
||||
double height_with_tolerance = inner.getHeight() - (2 * tolerance);
|
||||
double width_with_tolerance = inner.getWidth() - (2 * tolerance);
|
||||
Rectangle2D innerRect = new Rectangle2D.Double(x_with_tolerance, y_with_tolerance, width_with_tolerance, height_with_tolerance);
|
||||
|
||||
return outer.contains(innerRect);
|
||||
}
|
||||
|
||||
private boolean isFilledAndNonTransparent(Element element) throws PDFNetException {
|
||||
return element.isFilled() && element.getGState().getFillOpacity() == 1;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void drawBBox(ElementWriter writer, Rectangle2D r, String hexcolor) {
|
||||
ColorPt colorPt = new ColorPt(Integer.valueOf(hexcolor.substring(1, 3), 16) / 255d,
|
||||
Integer.valueOf(hexcolor.substring(3, 5), 16) / 255d,
|
||||
Integer.valueOf(hexcolor.substring(5, 7), 16) / 255d);
|
||||
ElementBuilder eb = new ElementBuilder();
|
||||
Element rect = eb.createRect(r.getX(), r.getY(), r.getWidth(), r.getHeight());
|
||||
rect.setPathStroke(true);
|
||||
rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
||||
rect.getGState().setStrokeColor(colorPt);
|
||||
writer.writePlacedElement(rect);
|
||||
}
|
||||
}
|
||||
@ -1,5 +1,17 @@
|
||||
package com.iqser.red.service.ocr.v1.server.service;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse;
|
||||
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
|
||||
@ -8,19 +20,17 @@ import com.iqser.red.service.ocr.v1.server.model.image.ImageServiceResponse;
|
||||
import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings;
|
||||
import com.iqser.red.service.redaction.v1.model.Point;
|
||||
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||
import com.pdftron.pdf.*;
|
||||
import com.pdftron.pdf.OCRModule;
|
||||
import com.pdftron.pdf.OCROptions;
|
||||
import com.pdftron.pdf.Optimizer;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.pdf.RectCollection;
|
||||
import com.pdftron.sdf.SDFDoc;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.InputStream;
|
||||
import java.util.*;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@ -36,20 +46,18 @@ public class OCRService {
|
||||
|
||||
private final ObjectMapper objectMapper;
|
||||
|
||||
private final InvisibleElementService invisibleElementService;
|
||||
private final InvisibleElementRemovalService invisibleElementRemovalService;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public InputStream ocrDocument(String dossierId, String fileId) {
|
||||
|
||||
var fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId);
|
||||
var imageServiceResponse = fileStorageService.getImageServiceResponse(dossierId, fileId);
|
||||
InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId);
|
||||
ImageServiceResponse imageServiceResponse = fileStorageService.getImageServiceResponse(dossierId, fileId);
|
||||
|
||||
var fileBytes = IOUtils.toByteArray(fileStream);
|
||||
byte[] fileWithoutInvisibleTextStream = invisibleElementRemovalService.removeInvisibleElements(fileStream, false);
|
||||
|
||||
byte[] fileWithoutInvisibleTextBytes = invisibleElementService.removeInvisibleElements(fileBytes, false);
|
||||
|
||||
var ocrBytes = ocr(fileWithoutInvisibleTextBytes, fileId, imageServiceResponse);
|
||||
byte[] ocrBytes = ocr(fileWithoutInvisibleTextStream, fileId, imageServiceResponse);
|
||||
|
||||
return new ByteArrayInputStream(ocrBytes);
|
||||
|
||||
|
||||
@ -1,12 +1,12 @@
|
||||
package com.iqser.red.service.ocr.v1.server;
|
||||
|
||||
import com.iqser.red.service.ocr.v1.server.service.InvisibleElementService;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.pdf.PageIterator;
|
||||
import com.pdftron.pdf.TextExtractor;
|
||||
import lombok.SneakyThrows;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
|
||||
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
|
||||
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.extension.ExtendWith;
|
||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||
@ -17,62 +17,69 @@ import org.springframework.context.annotation.Import;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
import org.springframework.test.context.junit.jupiter.SpringExtension;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import com.iqser.red.service.ocr.v1.server.service.InvisibleElementRemovalService;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.pdf.PageIterator;
|
||||
import com.pdftron.pdf.TextExtractor;
|
||||
|
||||
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
|
||||
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@ExtendWith(SpringExtension.class)
|
||||
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT //
|
||||
, properties = {"pdftron.ocrmodule.path=/YourOCRModulePath"})
|
||||
@Import(OcrServiceIntegrationTest.TestConfiguration.class)
|
||||
public class InvisibleElementServiceTest {
|
||||
public class InvisibleElementRemovalServiceTest {
|
||||
|
||||
@Autowired
|
||||
private InvisibleElementService invisibleElementService;
|
||||
private InvisibleElementRemovalService invisibleElementRemovalService;
|
||||
|
||||
@MockBean
|
||||
protected RabbitTemplate rabbitTemplate;
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testRemoveInvisibleText() {
|
||||
|
||||
String fileName = "InvisibleText";
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf");
|
||||
|
||||
var initialFileBytes = Files.readAllBytes(pdfFileResource.getFile().toPath());
|
||||
var initialFileStream = Files.newInputStream(pdfFileResource.getFile().toPath());
|
||||
var fileWithoutInvisibleElements = invisibleElementRemovalService.removeInvisibleElements(initialFileStream, false);
|
||||
|
||||
var fileWithoutInvisibleText = invisibleElementService.removeInvisibleElements(initialFileBytes, false);
|
||||
var deltaFile = invisibleElementService.removeInvisibleElements(initialFileBytes, true);
|
||||
initialFileStream = Files.newInputStream(pdfFileResource.getFile().toPath());
|
||||
var deltaFile = invisibleElementRemovalService.removeInvisibleElements(initialFileStream, true);
|
||||
|
||||
String fileWithoutInvisibleTextLocation = getTemporaryDirectory() + "/" + fileName + ".pdf";
|
||||
String deltaFileLocation = getTemporaryDirectory() + "/" + fileName + "_delta.pdf";
|
||||
|
||||
saveToFile(fileWithoutInvisibleTextLocation, fileWithoutInvisibleText);
|
||||
saveToFile(fileWithoutInvisibleTextLocation, fileWithoutInvisibleElements);
|
||||
saveToFile(deltaFileLocation, deltaFile);
|
||||
|
||||
System.out.println("Output File without invisible elements: " + fileWithoutInvisibleTextLocation);
|
||||
System.out.println("Output Delta File: " + deltaFileLocation);
|
||||
TextExtractor extractor = new TextExtractor();
|
||||
PDFDoc pdfDoc = new PDFDoc(fileWithoutInvisibleText);
|
||||
PDFDoc pdfDoc = new PDFDoc(fileWithoutInvisibleElements);
|
||||
PageIterator iterator = pdfDoc.getPageIterator();
|
||||
while (iterator.hasNext()) {
|
||||
Page page = iterator.next();
|
||||
extractor.begin(page);
|
||||
String[] text = extractor.getAsText().split("\n");
|
||||
assertThat(text).containsAnyOf("APPENDIX 16 Pathology Report", "Amendment 1", "Page 255 of 260");
|
||||
assertThat(text).contains("APPENDIX 16 Pathology Report", "Amendment 1", "Page 255 of 260");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void saveToFile(String location, byte[] fileBytes) {
|
||||
try (var f_out = FileUtils.openOutputStream(new File(location))) {
|
||||
|
||||
try (var f_out = new FileOutputStream(location)) {
|
||||
f_out.write(fileBytes);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("File location: " + location + "could not be openend, no file will be saved");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -7,7 +7,7 @@
|
||||
<parent>
|
||||
<groupId>com.iqser.red</groupId>
|
||||
<artifactId>platform-dependency</artifactId>
|
||||
<version>1.14.0</version>
|
||||
<version>RED-6114-1</version>
|
||||
<relativePath/>
|
||||
</parent>
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user