Compare commits

...

3 Commits

Author SHA1 Message Date
Kilian Schuettler
63a06625f6 RED-6019 InvisibleTex wip 2023-01-30 15:25:49 +01:00
deiflaender
f69681133c RED-6019 InvisibleText 1 2023-01-23 09:20:35 +01:00
deiflaender
579e6a5c67 RED-6019 InvisibleText 2023-01-19 12:04:12 +01:00
26 changed files with 1023 additions and 176 deletions

View File

@ -0,0 +1,54 @@
package com.iqser.red.service.ocr.v1.server.model;
import com.pdftron.pdf.Rect;
import lombok.Data;
import lombok.SneakyThrows;
import java.awt.geom.Area;
import java.awt.geom.GeneralPath;
import java.awt.geom.Rectangle2D;
import java.util.ArrayDeque;
import java.util.Deque;
/**
 * Stack of clipping areas that mirrors graphics-state nesting while a content stream is walked.
 * The top of the stack is the clipping area currently in effect; it starts out as the rectangle
 * handed to the constructor and can only shrink through {@link #intersectClippingPath(GeneralPath)}.
 */
@Data
public class ClippingPathStack {

    private Deque<Area> stack = new ArrayDeque<>();


    /**
     * Creates a stack whose initial clipping area is the given rectangle.
     *
     * @param rectangle rectangle used as the outermost clipping area
     */
    @SneakyThrows
    public ClippingPathStack(Rect rectangle) {

        stack.push(new Area(new Rectangle2D.Double(rectangle.getX1(), rectangle.getY1(), rectangle.getWidth(), rectangle.getHeight())));
    }


    /**
     * Narrows the current clipping area to its intersection with the given path.
     *
     * @param path clipping path; its winding rule is honoured by {@link Area}
     */
    @SneakyThrows
    public void intersectClippingPath(GeneralPath path) {

        getCurrentClippingPath().intersect(new Area(path));
    }


    /**
     * Tests whether a rectangle, padded by a small tolerance on every side, intersects the
     * current clipping area.
     *
     * @param x      left edge of the rectangle
     * @param y      lower edge of the rectangle
     * @param width  width of the rectangle, may be zero
     * @param height height of the rectangle, may be zero
     * @return {@code true} if the padded rectangle intersects the current clipping area
     */
    public boolean almostIntersects(double x, double y, double width, double height) {

        // Height or width are zero for straight lines, even though they are being rendered.
        // Shape#intersects never matches an empty rectangle, so the rectangle is padded.
        double tolerance = 1e-3;
        // Padding always moves the origin towards smaller coordinates, regardless of its sign;
        // moving it the other way would shift the box off the element instead of surrounding it.
        double xWithTolerance = x - tolerance;
        double yWithTolerance = y - tolerance;
        double widthWithTolerance = width + 2 * tolerance;
        double heightWithTolerance = height + 2 * tolerance;
        return getCurrentClippingPath().intersects(xWithTolerance, yWithTolerance, widthWithTolerance, heightWithTolerance);
    }


    /**
     * @return the clipping area on top of the stack, or {@code null} if the stack is empty
     */
    public Area getCurrentClippingPath() {

        return stack.peek();
    }


    /**
     * Pushes a copy of the current clipping area, so that later intersections do not affect the
     * area that becomes current again after {@link #leaveGState()}.
     */
    public void enterNewGState() {

        Area current = stack.peek();
        Area cloned = new Area();
        cloned.add(current);
        stack.push(cloned);
    }


    /**
     * Discards the clipping area on top of the stack.
     */
    public void leaveGState() {

        stack.pop();
    }

}

View File

@ -0,0 +1,149 @@
package com.iqser.red.service.ocr.v1.server.model;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.Rect;
import lombok.*;
import lombok.experimental.SuperBuilder;
import java.awt.geom.Rectangle2D;
/**
 * Snapshot of the properties of a PDFTron {@link Element} that are used to recognise the same
 * element again when the content stream is read a second time. Every subclass adds the
 * properties that are specific to one element type.
 */
@Data
@SuperBuilder
@NoArgsConstructor
@AllArgsConstructor
public abstract class ElementFeatures {

    private int elementType;
    private Rectangle2D boundingBox;


    /**
     * Builds the feature snapshot matching the type of the given element.
     *
     * @param element path, text, image or inline image element
     * @return the extracted features
     * @throws UnsupportedOperationException for every other element type
     * @throws PDFNetException               if PDFTron fails to read a property
     */
    public static ElementFeatures extractFeatures(Element element) throws PDFNetException {

        switch (element.getType()) {
            case Element.e_path:
                return pathFeatures(element);
            case Element.e_text:
                return textFeatures(element);
            case Element.e_image:
            case Element.e_inline_image:
                return imageFeatures(element);
            default:
                throw new UnsupportedOperationException("Feature Extraction is not supported for PDFTron.Element with type: " + element.getType());
        }
    }


    // Snapshot of a path element: type, bounding box and its paint/clip flags.
    private static ElementFeatures pathFeatures(Element element) throws PDFNetException {

        return ElementFeatures.Path.builder()
                .elementType(element.getType())
                .boundingBox(toRectangle2D(element.getBBox()))
                .isClippingPath(element.isClippingPath())
                .isClipWindingFill(element.isClipWindingFill())
                .isStroked(element.isStroked())
                .isFilled(element.isFilled())
                .isWindingFill(element.isWindingFill())
                .build();
    }


    // Snapshot of a text element: type, bounding box, text string, font type and font size.
    private static ElementFeatures textFeatures(Element element) throws PDFNetException {

        return ElementFeatures.Text.builder()
                .elementType(element.getType())
                .boundingBox(toRectangle2D(element.getBBox()))
                .text(element.getTextString())
                .font(element.getGState().getFont().getType())
                .fontsize(element.getGState().getFontSize())
                .build();
    }


    // Snapshot of an image or inline image element: type, bounding box and image metadata.
    private static ElementFeatures imageFeatures(Element element) throws PDFNetException {

        return Image.builder()
                .elementType(element.getType())
                .boundingBox(toRectangle2D(element.getBBox()))
                .dataSize(element.getImageDataSize())
                .height(element.getImageHeight())
                .width(element.getImageWidth())
                .renderingIntent(element.getImageRenderingIntent())
                .componentNum(element.getComponentNum())
                .bitsPerComponent(element.getBitsPerComponent())
                .build();
    }


    // Converts a PDFTron rectangle into an AWT rectangle with the same origin and size.
    private static Rectangle2D toRectangle2D(Rect rect) throws PDFNetException {

        double x = rect.getX1();
        double y = rect.getY1();
        return new Rectangle2D.Double(x, y, rect.getWidth(), rect.getHeight());
    }


    /**
     * Checks whether the element has the stored type and, within a small tolerance, the stored
     * bounding box.
     *
     * @param element element to compare against this snapshot
     * @return {@code false} if the type differs, the element has no bounding box, or the boxes differ
     * @throws PDFNetException if PDFTron fails to read a property
     */
    public boolean almostMatches(Element element) throws PDFNetException {

        return element.getType() == elementType
                && element.getBBox() != null
                && rectsAlmostMatch(element.getBBox());
    }


    /**
     * @return {@code true} if both values differ by less than 1e-3
     */
    protected boolean almostEqual(double a, double b) {

        final double tolerance = 1e-3;
        double distance = Math.abs(a - b);
        return distance < tolerance;
    }


    // Compares origin and size of the given box with the stored bounding box, each within tolerance.
    @SneakyThrows
    private boolean rectsAlmostMatch(Rect bBox) {

        return almostEqual(bBox.getX1(), boundingBox.getX())
                && almostEqual(bBox.getY1(), boundingBox.getY())
                && almostEqual(bBox.getWidth(), boundingBox.getWidth())
                && almostEqual(bBox.getHeight(), boundingBox.getHeight());
    }


    /**
     * Features of a text element: text string, font type and font size.
     */
    @EqualsAndHashCode(callSuper = true)
    @Data
    @SuperBuilder
    @NoArgsConstructor
    @AllArgsConstructor
    public static class Text extends ElementFeatures {

        private String text;
        private int font;
        private double fontsize;


        /**
         * Matches on the base features plus identical text, identical font type and an almost
         * equal font size.
         */
        @Override
        public boolean almostMatches(Element element) throws PDFNetException {

            return super.almostMatches(element)
                    && text.equals(element.getTextString())
                    && font == element.getGState().getFont().getType()
                    && almostEqual(fontsize, element.getGState().getFontSize());
        }

    }

    /**
     * Features of a path element: its clipping, stroke and fill flags.
     */
    @EqualsAndHashCode(callSuper = true)
    @Data
    @SuperBuilder
    @NoArgsConstructor
    @AllArgsConstructor
    public static class Path extends ElementFeatures {

        private boolean isClippingPath;
        private boolean isClipWindingFill;
        private boolean isStroked;
        private boolean isFilled;
        private boolean isWindingFill;


        /**
         * Matches on the base features plus identical clipping, stroke and fill flags.
         */
        @Override
        public boolean almostMatches(Element element) throws PDFNetException {

            return super.almostMatches(element)
                    && isClippingPath == element.isClippingPath()
                    && isClipWindingFill == element.isClipWindingFill()
                    && isStroked == element.isStroked()
                    && isFilled == element.isFilled()
                    && isWindingFill == element.isWindingFill();
        }

    }

    /**
     * Features of an image or inline image element: data size, pixel dimensions, rendering
     * intent, number of components and bits per component.
     */
    @EqualsAndHashCode(callSuper = true)
    @Data
    @SuperBuilder
    @NoArgsConstructor
    public static class Image extends ElementFeatures {

        private int dataSize;
        private int height;
        private int width;
        private int renderingIntent;
        private int componentNum;
        private int bitsPerComponent;


        /**
         * Matches on the base features plus identical image metadata.
         */
        @Override
        public boolean almostMatches(Element element) throws PDFNetException {

            return super.almostMatches(element)
                    && dataSize == element.getImageDataSize()
                    && height == element.getImageHeight()
                    && width == element.getImageWidth()
                    && renderingIntent == element.getImageRenderingIntent()
                    && componentNum == element.getComponentNum()
                    && bitsPerComponent == element.getBitsPerComponent();
        }

    }

}

View File

@ -0,0 +1,463 @@
package com.iqser.red.service.ocr.v1.server.service;
import com.iqser.red.service.ocr.v1.server.model.ClippingPathStack;
import com.iqser.red.service.ocr.v1.server.model.ElementFeatures;
import com.pdftron.common.Matrix2D;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.*;
import com.pdftron.sdf.Obj;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.FileUtils;
import org.springframework.stereotype.Service;
import java.awt.*;
import java.awt.geom.AffineTransform;
import java.awt.geom.GeneralPath;
import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.stream.Collectors;
@Slf4j
@Service
public class InvisibleElementService {
/*
handled cases:
Text or Path outside of clipping path
Text which is transparent or is set to not render
Text or Path that have been painted over by visible and filled Paths
unhandled cases:
Text covered by widely stroked path
Text same color as background
Any Text set to clipping with its many interactions with other elements
*/
/**
 * Rewrites every page of the given PDF in two passes: the first handles clipped elements and
 * invisible text and collects elements painted over by filled paths, the second handles those
 * overlapped elements.
 *
 * @param pdfFile the PDF document as bytes
 * @param delta   {@code false} to obtain the cleaned document; {@code true} to obtain a debug
 *                document that highlights the affected elements instead, which is additionally
 *                written to disk by {@code debugSave}
 * @return the rewritten document, saved linearized
 */
@SneakyThrows
public byte[] removeInvisibleTextOrPathElements(byte[] pdfFile, boolean delta) {

    PDFDoc pdfDoc = new PDFDoc(pdfFile);
    try {
        ElementWriter writer = new ElementWriter();
        ElementReader reader = new ElementReader();
        Set<Integer> visited = new TreeSet<>();

        for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
            Page page = iterator.next();
            List<ElementFeatures> overlappedElements = removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, reader, writer, visited, delta);
            // The second pass has to descend into the same form XObjects again.
            visited.clear();
            removeOverlappedElements(page, reader, writer, visited, overlappedElements, delta);
        }

        // Serialise once and reuse the bytes for both the debug copy and the return value.
        byte[] result = pdfDoc.save(SDFDoc.SaveMode.LINEARIZED, null);
        if (delta) {
            debugSave(result);
        }
        return result;
    } finally {
        // Release the document even when processing or saving fails.
        pdfDoc.close();
    }
}
/**
 * First pass over a page: replaces the page content with the result of
 * {@code processElements} and returns the elements that were found to be painted over.
 *
 * @param page    page to rewrite
 * @param reader  shared element reader
 * @param writer  shared element writer
 * @param visited object numbers already processed; the page's own number is added
 * @param delta   whether the debug (delta) output is being produced
 * @return features of the elements that were overlapped by a later path
 * @throws PDFNetException if PDFTron fails while reading or writing
 */
private List<ElementFeatures> removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(Page page, ElementReader reader, ElementWriter writer, Set<Integer> visited, boolean delta)
        throws PDFNetException {

    List<ElementFeatures> overlapped = new ArrayList<>();
    List<ElementFeatures> currentlyVisible = new ArrayList<>();
    // The media box is the outermost clipping area of the page.
    ClippingPathStack clipStack = new ClippingPathStack(page.getMediaBox());

    int pageObjNum = (int) page.getSDFObj().getObjNum();
    visited.add(pageObjNum);

    reader.begin(page);
    writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
    processElements(reader, writer, visited, clipStack, delta, overlapped, currentlyVisible);
    writer.end();
    reader.end();

    return overlapped;
}
/**
 * Reads all remaining elements from the reader and dispatches each one by its type. Group
 * begin/end elements push and pop the clipping stack; element types without a dedicated
 * handler are written through unchanged.
 *
 * @param reader            source of elements
 * @param writer            target for the elements that are kept
 * @param visited           object numbers of forms/pages already processed
 * @param clippingPathStack clipping state shared across the traversal
 * @param delta             whether the debug (delta) output is being produced
 * @param coveredElements   receives elements painted over by filled paths
 * @param visibleElements   elements seen so far that are considered visible
 * @throws PDFNetException if PDFTron fails while reading or writing
 */
private void processElements(ElementReader reader, ElementWriter writer, Set<Integer> visited, ClippingPathStack clippingPathStack, Boolean delta,
        List<ElementFeatures> coveredElements, List<ElementFeatures> visibleElements)
        throws PDFNetException {

    Element current;
    while ((current = reader.next()) != null) {
        switch (current.getType()) {
            case Element.e_image:
            case Element.e_inline_image:
                processImages(current, writer, clippingPathStack, delta, visibleElements);
                break;
            case Element.e_text:
                processText(current, writer, clippingPathStack, delta, visibleElements);
                break;
            case Element.e_path:
                processPath(current, writer, clippingPathStack, delta, coveredElements, visibleElements);
                break;
            case Element.e_form:
                processForm(reader, writer, current, visited, clippingPathStack, delta, coveredElements, visibleElements);
                break;
            case Element.e_group_begin:
                clippingPathStack.enterNewGState();
                writer.writeElement(current);
                break;
            case Element.e_group_end:
                clippingPathStack.leaveGState();
                writer.writeElement(current);
                break;
            default:
                writer.writeElement(current);
                break;
        }
    }
}
/**
 * Handles one image or inline image element. In normal mode the image is kept (and remembered
 * as visible) only if its bounding box intersects the current clipping area; in delta mode
 * only images outside the clipping area are written.
 *
 * @param imageElement      image or inline image element
 * @param writer            target for kept elements
 * @param clippingPathStack current clipping state
 * @param delta             whether the debug (delta) output is being produced
 * @param visibleElements   receives the features of images that are kept in normal mode
 * @throws PDFNetException if PDFTron fails while reading or writing
 */
private void processImages(Element imageElement, ElementWriter writer, ClippingPathStack clippingPathStack, Boolean delta, List<ElementFeatures> visibleElements)
        throws PDFNetException {

    Rect rect = imageElement.getBBox();
    if (rect == null) {
        // Visibility cannot be decided without a bounding box. Keep the image in the cleaned
        // document, as processText does for text, instead of silently dropping it. Nothing was
        // removed, so the delta document gets nothing.
        if (!delta) {
            writer.writeElement(imageElement);
        }
        return;
    }

    boolean inClippingPath = clippingPathStack.almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());

    if (!delta && inClippingPath) {
        visibleElements.add(ElementFeatures.extractFeatures(imageElement));
        writer.writeElement(imageElement);
    }
    if (delta && !inClippingPath) {
        writer.writeElement(imageElement);
    }
}
/**
 * Handles one text element. Text counts as shown when its bounding box intersects the current
 * clipping area and its graphics state renders it visibly. Shown text is remembered in
 * {@code visibleElements}. In normal mode only shown text is written; in delta mode text that
 * is clipped away is written in red and text that is not rendered visibly is written in blue.
 *
 * @param textElement       text element
 * @param writer            target for kept elements
 * @param clippingPathStack current clipping state
 * @param delta             whether the debug (delta) output is being produced
 * @param visibleElements   receives the features of shown text
 * @throws PDFNetException if PDFTron fails while reading or writing
 */
private void processText(Element textElement, ElementWriter writer, ClippingPathStack clippingPathStack,
        Boolean delta, List<ElementFeatures> visibleElements)
        throws PDFNetException {

    Rect bbox = textElement.getBBox();
    if (bbox == null) {
        writer.writeElement(textElement);
        return;
    }

    GState gState = textElement.getGState();
    boolean insideClip = clippingPathStack.almostIntersects(bbox.getX1(), bbox.getY1(), bbox.getWidth(), bbox.getHeight());
    boolean renderedVisibly = isTextRenderedVisibly(gState);
    boolean shown = insideClip && renderedVisibly;

    if (shown) {
        visibleElements.add(ElementFeatures.extractFeatures(textElement));
    }

    if (delta) {
        if (!insideClip) {
            gState.setFillColorSpace(ColorSpace.createDeviceRGB());
            gState.setFillColor(new ColorPt(1, 0, 0));
            writer.writeElement(textElement);
        }
        if (!renderedVisibly) {
            gState.setFillColorSpace(ColorSpace.createDeviceRGB());
            gState.setFillColor(new ColorPt(0, 0, 1));
            gState.setTextRenderMode(GState.e_fill_text);
            gState.setFillOpacity(1);
            writer.writeElement(textElement);
        }
        return;
    }

    if (shown) {
        writer.writeElement(textElement);
    } else if (textElement.hasTextMatrix()) {
        /*
        A PDFTron Element of type "text" corresponds to a Tj command. When a Tm command directly
        precedes it in the PDF, PDFTron merges both into a single Element; hasTextMatrix() detects
        exactly that. Tm positions the whole BT/ET segment, which may contain several Tj commands,
        so dropping the Element entirely would move the following Tj commands.
        Therefore only the Tm command is written:
         */
        writer.writeGStateChanges(textElement);
    }
}
/**
 * Writes the form element itself and, unless the form XObject was already processed, rewrites
 * the form's content stream using a dedicated writer.
 *
 * @param reader            reader positioned on the form element
 * @param writer            writer of the enclosing content stream
 * @param element           the form element
 * @param visited           object numbers already processed; the form's number is added
 * @param clippingPathStack clipping state shared across the traversal
 * @param delta             whether the debug (delta) output is being produced
 * @param coveredElements   receives elements painted over by filled paths
 * @param allElements       elements seen so far that are considered visible
 * @throws PDFNetException if PDFTron fails while reading or writing
 */
private void processForm(ElementReader reader, ElementWriter writer, Element element, Set<Integer> visited, ClippingPathStack clippingPathStack, Boolean delta,
        List<ElementFeatures> coveredElements, List<ElementFeatures> allElements)
        throws PDFNetException {

    writer.writeElement(element);

    Obj formObj = element.getXObject();
    int formObjNum = (int) formObj.getObjNum();
    // Set#add reports whether the number was new, which replaces the contains/add pair.
    if (!visited.add(formObjNum)) {
        return;
    }

    ElementWriter formWriter = new ElementWriter();
    reader.formBegin();
    formWriter.begin(formObj);

    reader.clearChangeList();
    formWriter.setDefaultGState(reader);

    processElements(reader, formWriter, visited, clippingPathStack, delta, coveredElements, allElements);
    formWriter.end();
    reader.end();
}
/**
 * Handles one path element. A clipping path narrows the current clipping area and is always
 * written. Any other path is treated as visible if its transformed bounds intersect the
 * clipping area; a filled, fully opaque visible path additionally moves every visible element
 * it contains from {@code visibleElements} to {@code overlappedElements}. In delta mode paths
 * outside the clipping area are written in red instead.
 *
 * @param pathElement        path element
 * @param writer             target for kept elements
 * @param clippingPathStack  current clipping state
 * @param delta              whether the debug (delta) output is being produced
 * @param overlappedElements receives elements covered by this path
 * @param visibleElements    elements seen so far that are considered visible
 * @throws PDFNetException if PDFTron fails while reading or writing
 */
private void processPath(Element pathElement, ElementWriter writer, ClippingPathStack clippingPathStack, Boolean delta,
        List<ElementFeatures> overlappedElements, List<ElementFeatures> visibleElements)
        throws PDFNetException {

    GeneralPath shape = convertToGeneralPath(pathElement.getPathData());

    // transform path to initial user space
    shape.transform(getAffineTransform(pathElement.getCTM()));

    var bounds = shape.getBounds2D();
    boolean insideClip = clippingPathStack.almostIntersects(bounds.getX(), bounds.getY(), bounds.getWidth(), bounds.getHeight());

    if (pathElement.isClippingPath()) {
        int windingRule = pathElement.isClipWindingFill() ? GeneralPath.WIND_NON_ZERO : GeneralPath.WIND_EVEN_ODD;
        shape.setWindingRule(windingRule);

        clippingPathStack.intersectClippingPath(shape);
        pathElement.setPathClip(!delta);
        writer.writeElement(pathElement);
        return;
    }

    if (insideClip) {
        if (isFilledAndNonTransparent(pathElement)) {
            List<ElementFeatures> nowCovered = visibleElements.stream()
                    .filter(candidate -> almostContains(shape, candidate.getBoundingBox()))
                    .collect(Collectors.toList());
            overlappedElements.addAll(nowCovered);
            visibleElements.removeAll(nowCovered);
        }
        visibleElements.add(ElementFeatures.extractFeatures(pathElement));
        if (!delta) {
            writer.writeElement(pathElement);
        }
    } else if (delta) {
        GState gState = pathElement.getGState();
        gState.setFillColorSpace(ColorSpace.createDeviceRGB());
        gState.setFillColor(new ColorPt(1, 0, 0));
        gState.setStrokeColorSpace(ColorSpace.createDeviceRGB());
        gState.setStrokeColor(new ColorPt(1, 0, 0));
        writer.writeElement(pathElement);
    }
}
/**
 * Converts a PDFTron matrix into an AWT {@link AffineTransform}, passing the A, B, C, D, H and V
 * entries in that order to the six-argument constructor.
 *
 * @param ctm matrix to convert
 * @return the corresponding affine transform
 * @throws PDFNetException if a matrix entry cannot be read
 */
private static AffineTransform getAffineTransform(Matrix2D ctm) throws PDFNetException {

    double m00 = ctm.getA();
    double m10 = ctm.getB();
    double m01 = ctm.getC();
    double m11 = ctm.getD();
    double m02 = ctm.getH();
    double m12 = ctm.getV();
    return new AffineTransform(m00, m10, m01, m11, m02, m12);
}
/**
 * Second pass over a page: rewrites the page content without the elements listed in
 * {@code overlappedElements}. In delta mode nothing is removed; instead a green box is drawn
 * around every overlapped element and the list is emptied before the content is copied.
 *
 * @param page               page to rewrite
 * @param reader             shared element reader
 * @param writer             shared element writer
 * @param visited            object numbers of forms already processed in this pass
 * @param overlappedElements elements to remove; matched entries are removed from this list
 * @param delta              whether the debug (delta) output is being produced
 * @throws PDFNetException if PDFTron fails while reading or writing
 */
private void removeOverlappedElements(Page page, ElementReader reader, ElementWriter writer, Set<Integer> visited, List<ElementFeatures> overlappedElements, boolean delta)
        throws PDFNetException {

    reader.begin(page);
    writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
    if (delta) {
        overlappedElements.forEach(feature -> drawBBox(writer, feature.getBoundingBox(), "#00FF00"));
        overlappedElements.clear();
    }
    processOverlappedElements(reader, writer, visited, overlappedElements, delta);
    writer.end();
    reader.end();

    // Entries still in the list were never matched against an element in the second pass.
    if (!overlappedElements.isEmpty()) {
        log.warn("{} overlapped elements have not been found and removed", overlappedElements.size());
    }
}
/**
 * Copies all remaining elements from the reader to the writer, skipping every path, image and
 * text element that matches an entry of {@code coveredElements}. Each entry removes at most
 * one element: it is taken out of the list as soon as it has matched.
 *
 * @param reader          source of elements
 * @param writer          target for the elements that are kept
 * @param visited         object numbers of forms already processed in this pass
 * @param coveredElements features of the elements to remove; matched entries are removed
 * @param delta           passed on unchanged to nested form processing
 * @throws PDFNetException if PDFTron fails while reading or writing
 */
private void processOverlappedElements(ElementReader reader, ElementWriter writer, Set<Integer> visited, List<ElementFeatures> coveredElements, boolean delta)
        throws PDFNetException {

    for (Element element = reader.next(); element != null; element = reader.next()) {
        switch (element.getType()) {
            case Element.e_form:
                processFormOverlappedElements(reader, writer, element, visited, coveredElements, delta);
                break;
            case Element.e_path:
            case Element.e_image:
            case Element.e_inline_image:
            case Element.e_text:
                boolean anyMatch = false;
                for (ElementFeatures elementToRemove : coveredElements) {
                    if (elementToRemove.almostMatches(element)) {
                        // Removing while iterating is safe only because the loop is left immediately.
                        coveredElements.remove(elementToRemove);
                        anyMatch = true;
                        break;
                    }
                }
                if (!anyMatch) {
                    writer.writeElement(element);
                } else if (element.getType() == Element.e_text && element.hasTextMatrix()) {
                    /*
                    A PDFTron Element of type "text" corresponds to a Tj command. When a Tm command directly
                    precedes it in the PDF, PDFTron merges both into a single Element; hasTextMatrix() detects
                    exactly that. Tm positions the whole BT/ET segment, which may contain several Tj commands,
                    so dropping the Element entirely would move the following Tj commands.
                    Therefore only the Tm command is written:
                     */
                    writer.writeGStateChanges(element);
                }
                break;
            default:
                writer.writeElement(element);
        }
    }
}
/**
 * Writes the form element itself and, unless the form XObject was already processed in this
 * pass, rewrites the form's content stream without the overlapped elements.
 *
 * @param reader           reader positioned on the form element
 * @param writer           writer of the enclosing content stream
 * @param element          the form element
 * @param visited          object numbers already processed; the form's number is added
 * @param elementsToRemove features of the elements to remove
 * @param delta            passed on unchanged to the nested traversal
 * @throws PDFNetException if PDFTron fails while reading or writing
 */
private void processFormOverlappedElements(ElementReader reader, ElementWriter writer, Element element, Set<Integer> visited, List<ElementFeatures> elementsToRemove, boolean delta)
        throws PDFNetException {

    writer.writeElement(element);

    Obj formObj = element.getXObject();
    int formObjNum = (int) formObj.getObjNum();
    // Set#add reports whether the number was new, which replaces the contains/add pair.
    if (!visited.add(formObjNum)) {
        return;
    }

    ElementWriter formWriter = new ElementWriter();
    reader.formBegin();
    formWriter.begin(formObj);

    reader.clearChangeList();
    formWriter.setDefaultGState(reader);

    processOverlappedElements(reader, formWriter, visited, elementsToRemove, delta);
    formWriter.end();
    reader.end();
}
/**
 * Decides from the graphics state whether text is painted at all. Text is considered hidden
 * when the render mode is invisible, or when the opacity belonging to a fill, stroke or
 * fill-and-stroke render mode is zero. All other render modes count as visible.
 *
 * @param gState graphics state of the text element
 * @return {@code false} if the text is hidden by its render mode or opacity
 * @throws PDFNetException if the graphics state cannot be read
 */
private boolean isTextRenderedVisibly(GState gState) throws PDFNetException {

    final int mode = gState.getTextRenderMode();

    boolean hidden = mode == GState.e_invisible_text
            || (mode == GState.e_fill_text && gState.getFillOpacity() == 0)
            || (mode == GState.e_stroke_text && gState.getStrokeOpacity() == 0)
            || (mode == GState.e_fill_stroke_text && gState.getFillOpacity() == 0 && gState.getStrokeOpacity() == 0);
    return !hidden;
}
/**
 * Converts PDFTron path data into an AWT {@link GeneralPath} by replaying its operators
 * against the flat coordinate array.
 *
 * @param pathData operators and points of a path element
 * @return the equivalent path, in the same coordinate space as the input points
 * @throws PDFNetException if the path data cannot be read or contains an operator other than
 *                         move-to, line-to, cubic-to, rect or close-path
 */
private GeneralPath convertToGeneralPath(PathData pathData) throws PDFNetException {

    GeneralPath linePath = new GeneralPath();
    double[] dataPoints = pathData.getPoints();
    byte[] operators = pathData.getOperators();

    // Index of the next unread coordinate; every operator consumes a fixed number of values.
    int dataIndex = 0;
    for (byte operator : operators) {
        switch (operator) {
            case PathData.e_moveto: {
                double x = dataPoints[dataIndex++];
                double y = dataPoints[dataIndex++];
                linePath.moveTo(x, y);
                break;
            }
            case PathData.e_lineto: {
                double x = dataPoints[dataIndex++];
                double y = dataPoints[dataIndex++];
                linePath.lineTo(x, y);
                break;
            }
            case PathData.e_cubicto: {
                double x1 = dataPoints[dataIndex++];
                double y1 = dataPoints[dataIndex++];
                double x2 = dataPoints[dataIndex++];
                double y2 = dataPoints[dataIndex++];
                double x3 = dataPoints[dataIndex++];
                double y3 = dataPoints[dataIndex++];
                linePath.curveTo(x1, y1, x2, y2, x3, y3);
                break;
            }
            case PathData.e_rect: {
                double x = dataPoints[dataIndex++];
                double y = dataPoints[dataIndex++];
                double w = dataPoints[dataIndex++];
                double h = dataPoints[dataIndex++];
                linePath.moveTo(x, y);
                linePath.lineTo(x + w, y);
                linePath.lineTo(x + w, y + h);
                linePath.lineTo(x, y + h);
                // The PDF "re" operator is defined as a complete, closed subpath (m l l l h).
                // Closing it also resets the current point to the rectangle's origin.
                linePath.closePath();
                break;
            }
            case PathData.e_closepath:
                linePath.closePath();
                break;
            default:
                throw new PDFNetException("Invalid Element Type", 0, "", "", "");
        }
    }

    return linePath;
}
/**
 * Tests whether {@code outer} contains {@code inner} after {@code inner} has been shrunk by a
 * small tolerance on every side, so that an element exactly as large as the covering shape
 * still counts as contained.
 *
 * @param outer the potentially covering shape
 * @param inner bounding box of the potentially covered element
 * @return {@code true} if the shrunken {@code inner} lies entirely inside {@code outer}
 */
private boolean almostContains(Shape outer, Rectangle2D inner) {

    double tolerance = 1e-3;
    // Shrinking always moves the origin towards larger coordinates, regardless of its sign;
    // moving it the other way would push the rectangle out of the original bounds.
    double xWithTolerance = inner.getX() + tolerance;
    double yWithTolerance = inner.getY() + tolerance;
    double heightWithTolerance = inner.getHeight() - (2 * tolerance);
    double widthWithTolerance = inner.getWidth() - (2 * tolerance);
    Rectangle2D innerRect = new Rectangle2D.Double(xWithTolerance, yWithTolerance, widthWithTolerance, heightWithTolerance);
    return outer.contains(innerRect);
}
/**
 * @param element path element to inspect
 * @return {@code true} if the element is filled and its fill opacity is exactly 1
 * @throws PDFNetException if the element or its graphics state cannot be read
 */
private boolean isFilledAndNonTransparent(Element element) throws PDFNetException {

    if (!element.isFilled()) {
        return false;
    }
    double fillOpacity = element.getGState().getFillOpacity();
    return fillOpacity == 1;
}
/**
 * Writes the given bytes to the fixed debug location {@code /tmp/delta.pdf}.
 *
 * @param pdfFile document bytes to write
 * @throws RuntimeException wrapping the {@link IOException} if the file cannot be written
 */
private void debugSave(byte[] pdfFile) {

    String fileLocation = "/tmp/delta.pdf";
    try (var out = FileUtils.openOutputStream(new File(fileLocation))) {
        out.write(pdfFile);
    } catch (IOException e) {
        // Keep the original exception as cause so the actual I/O problem stays visible.
        throw new RuntimeException("File location: " + fileLocation + " could not be opened, no file will be saved", e);
    }
}
/**
 * Writes a stroked rectangle with the given geometry and colour through the writer.
 *
 * @param writer   writer that receives the rectangle
 * @param r        geometry of the rectangle
 * @param hexcolor colour in the form {@code #RRGGBB}
 */
@SneakyThrows
private void drawBBox(ElementWriter writer, Rectangle2D r, String hexcolor) {

    // Each colour channel is a two-digit hex value, scaled from 0..255 to 0..1.
    double red = Integer.parseInt(hexcolor.substring(1, 3), 16) / 255d;
    double green = Integer.parseInt(hexcolor.substring(3, 5), 16) / 255d;
    double blue = Integer.parseInt(hexcolor.substring(5, 7), 16) / 255d;
    ColorPt strokeColor = new ColorPt(red, green, blue);

    ElementBuilder builder = new ElementBuilder();
    Element box = builder.createRect(r.getX(), r.getY(), r.getWidth(), r.getHeight());
    box.setPathStroke(true);
    box.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
    box.getGState().setStrokeColor(strokeColor);
    writer.writePlacedElement(box);
}
}

View File

@ -1,21 +1,5 @@
package com.iqser.red.service.ocr.v1.server.service; package com.iqser.red.service.ocr.v1.server.service;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import org.apache.commons.io.IOUtils;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.stereotype.Service;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse; import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse;
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration; import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
@ -25,25 +9,21 @@ import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings;
import com.iqser.red.service.persistence.service.v1.api.utils.SuppressFBWarnings; import com.iqser.red.service.persistence.service.v1.api.utils.SuppressFBWarnings;
import com.iqser.red.service.redaction.v1.model.Point; import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.Rectangle; import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.pdftron.common.PDFNetException; import com.pdftron.pdf.*;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.OCRModule;
import com.pdftron.pdf.OCROptions;
import com.pdftron.pdf.Optimizer;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
import com.pdftron.pdf.Rect;
import com.pdftron.pdf.RectCollection;
import com.pdftron.sdf.Obj;
import com.pdftron.sdf.SDFDoc; import com.pdftron.sdf.SDFDoc;
import io.micrometer.core.annotation.Timed; import io.micrometer.core.annotation.Timed;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.IOUtils;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.stereotype.Service;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.util.*;
@Slf4j @Slf4j
@Service @Service
@ -59,6 +39,7 @@ public class OCRService {
private final ObjectMapper objectMapper; private final ObjectMapper objectMapper;
private final InvisibleElementService invisibleElementService;
@Timed("redactmanager_PDFTron-ocrDocument") @Timed("redactmanager_PDFTron-ocrDocument")
@SneakyThrows @SneakyThrows
@ -81,9 +62,11 @@ public class OCRService {
private byte[] ocr(byte[] file, String fileId, ImageServiceResponse imageServiceResponse) { private byte[] ocr(byte[] file, String fileId, ImageServiceResponse imageServiceResponse) {
PDFDoc pdfDoc = null; PDFDoc pdfDoc = null;
var fileWithoutInvisibleText = invisibleElementService.removeInvisibleTextOrPathElements(file, false);
try (ByteArrayOutputStream out = new ByteArrayOutputStream()) { try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
pdfDoc = new PDFDoc(file); pdfDoc = new PDFDoc(fileWithoutInvisibleText);
removeInvisibleText(pdfDoc);
Map<Integer, List<ImagePosition>> pages = new HashMap<>(); Map<Integer, List<ImagePosition>> pages = new HashMap<>();
@ -114,7 +97,6 @@ public class OCRService {
ocrDoc.close(); ocrDoc.close();
} }
Optimizer.optimize(pdfDoc); Optimizer.optimize(pdfDoc);
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null); pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
pdfDoc.close(); pdfDoc.close();
@ -192,134 +174,4 @@ public class OCRService {
} }
} }
/**
 * Rewrites every page of the document through {@code removeOverlapText}.
 * <p>
 * There are two ways for a PDF to contain invisible text:
 * 1. the gState is set to invisible, which is OCR text;
 * 2. filled path elements lie in front of the text.
 *
 * @param pdfDoc document whose pages are rewritten in place
 */
@SneakyThrows
private void removeInvisibleText(PDFDoc pdfDoc) {

    ElementWriter writer = new ElementWriter();
    ElementReader reader = new ElementReader();
    Set<Integer> visited = new TreeSet<>();

    PageIterator pages = pdfDoc.getPageIterator();
    while (pages.hasNext()) {
        removeOverlapText(pages.next(), reader, writer, visited);
    }
}
/**
 * Replaces the content of one page with the result of {@code processElements}.
 *
 * @param page    page to rewrite
 * @param reader  shared element reader
 * @param writer  shared element writer
 * @param visited object numbers already processed; the page's own number is added
 */
@SneakyThrows
private void removeOverlapText(Page page, ElementReader reader, ElementWriter writer, Set<Integer> visited) {

    int pageObjNum = (int) page.getSDFObj().getObjNum();
    visited.add(pageObjNum);

    reader.begin(page);
    writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
    // Top-level page content is not inside a form.
    processElements(reader, writer, visited, false);
    writer.end();
    reader.end();
}
/**
 * Reads all remaining elements from the reader and dispatches each one by its type; element
 * types without a dedicated handler are written through unchanged. The set of rectangles
 * collected from paths is local to one invocation.
 *
 * @param reader   source of elements
 * @param writer   target for the elements that are kept
 * @param visited  object numbers of forms/pages already processed
 * @param isInForm whether the elements belong to a form XObject
 */
@SneakyThrows
private void processElements(ElementReader reader, ElementWriter writer, Set<Integer> visited, boolean isInForm) {

    Set<Rect> filledRectangles = new HashSet<>();

    Element current;
    while ((current = reader.next()) != null) {
        switch (current.getType()) {
            case Element.e_image:
            case Element.e_inline_image:
                processImage(current, writer, isInForm);
                break;
            case Element.e_text:
                processText(current, writer, filledRectangles);
                break;
            case Element.e_path:
                processPath(current, writer, filledRectangles);
                break;
            case Element.e_form:
                processForm(reader, writer, current, visited);
                break;
            default:
                writer.writeElement(current);
                break;
        }
    }
}
/**
 * Writes the image unless it is inside a form and {@code settings.isRemoveWatermark()} is set.
 *
 * @param element  image or inline image element
 * @param writer   target for kept elements
 * @param isInForm whether the image belongs to a form XObject
 */
@SneakyThrows
private void processImage(Element element, ElementWriter writer, boolean isInForm) {

    boolean dropAsWatermark = isInForm && settings.isRemoveWatermark();
    if (dropAsWatermark) {
        return;
    }
    writer.writeElement(element);
}
/**
 * Writes a text element depending on its render mode and on whether the lower-left corner of
 * its bounding box lies inside one of the collected rectangles. Text without a bounding box is
 * always written. Covered text is kept only with render mode 0; uncovered text is kept unless
 * its render mode is 3.
 *
 * @param element          text element
 * @param writer           target for kept elements
 * @param filledRectangles bounding boxes collected from preceding paths
 */
@SneakyThrows
private void processText(Element element, ElementWriter writer, Set<Rect> filledRectangles) {

    if (element.getBBox() == null) {
        writer.writeElement(element);
        return;
    }

    double x = element.getBBox().getX1();
    double y = element.getBBox().getY1();

    boolean covered = false;
    for (Rect candidate : filledRectangles) {
        try {
            if (candidate.contains(x, y)) {
                covered = true;
                break;
            }
        } catch (PDFNetException e) {
            throw new RuntimeException("Internal pdftron error during removal of overlap text", e);
        }
    }

    var gState = element.getGState();

    //See PDF Reference 5.3 Text rendering modes, 3 = Invisible, however this ocr does not use it.
    boolean keep = covered ? gState.getTextRenderMode() == 0 : gState.getTextRenderMode() != 3;
    if (keep) {
        writer.writeElement(element);
    }
}
/**
 * Always writes the path and records its bounding box when the path data holds more than four
 * coordinate values.
 *
 * @param element          path element
 * @param writer           target for the element
 * @param filledRectangles receives the bounding box of qualifying paths
 */
@SneakyThrows
private void processPath(Element element, ElementWriter writer, Set<Rect> filledRectangles) {

    writer.writeElement(element);

    if (element.getPathData() == null) {
        return;
    }
    boolean hasMoreThanFourValues = element.getPathData().getPoints().length > 4;
    if (hasMoreThanFourValues) {
        filledRectangles.add(element.getBBox());
    }
}
/**
 * Writes the form element itself and, unless the form XObject was already processed, rewrites
 * the form's content stream using a dedicated writer.
 *
 * @param reader  reader positioned on the form element
 * @param writer  writer of the enclosing content stream
 * @param element the form element
 * @param visited object numbers already processed; the form's number is added
 */
@SneakyThrows
private void processForm(ElementReader reader, ElementWriter writer, Element element, Set<Integer> visited) {

    writer.writeElement(element);

    Obj formObj = element.getXObject();
    int formObjNum = (int) formObj.getObjNum();
    // Set#add reports whether the number was new, which replaces the contains/add pair.
    if (!visited.add(formObjNum)) {
        return;
    }

    ElementWriter formWriter = new ElementWriter();
    reader.formBegin();
    formWriter.begin(formObj);

    reader.clearChangeList();
    formWriter.setDefaultGState(reader);

    processElements(reader, formWriter, visited, true);
    formWriter.end();
    reader.end();
}
} }

View File

@ -0,0 +1,64 @@
package com.iqser.red.service.ocr.v1.server;
import com.iqser.red.service.ocr.v1.server.service.InvisibleElementService;
import lombok.SneakyThrows;
import org.apache.commons.io.FileUtils;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.context.annotation.Import;
import org.springframework.core.io.ClassPathResource;
import org.springframework.test.context.junit.jupiter.SpringExtension;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
/**
 * Runs {@link InvisibleElementService} on a sample PDF in both normal and delta mode and stores
 * the two results in the temporary directory for manual inspection. The test makes no
 * assertions about the produced documents.
 */
@ExtendWith(SpringExtension.class)
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT //
        , properties = {"pdftron.ocrmodule.path=/YourOCRModulePath"})
@Import(OcrServiceIntegrationTest.TestConfiguration.class)
public class InvisibleElementServiceTest {

    @Autowired
    private InvisibleElementService invisibleElementService;

    @MockBean
    protected RabbitTemplate rabbitTemplate;


    @Test
    @SneakyThrows
    public void testRemoveInvisibleText() {

        String fileName = "InvisibleText";

        ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf");

        var initialFileBytes = Files.readAllBytes(pdfFileResource.getFile().toPath());

        var fileWithoutInvisibleText = invisibleElementService.removeInvisibleTextOrPathElements(initialFileBytes, false);
        var deltaFile = invisibleElementService.removeInvisibleTextOrPathElements(initialFileBytes, true);

        String fileWithoutInvisibleTextLocation = getTemporaryDirectory() + "/" + fileName + ".pdf";
        String deltaFileLocation = getTemporaryDirectory() + "/" + fileName + "_delta.pdf";

        saveToFile(fileWithoutInvisibleTextLocation, fileWithoutInvisibleText);
        saveToFile(deltaFileLocation, deltaFile);

        System.out.println("File:" + fileWithoutInvisibleTextLocation);
        System.out.println("File:" + deltaFileLocation);
    }


    /**
     * Writes the bytes to the given location.
     *
     * @throws RuntimeException wrapping the {@link IOException} if the file cannot be written
     */
    private void saveToFile(String location, byte[] fileBytes) {

        try (var out = FileUtils.openOutputStream(new File(location))) {
            out.write(fileBytes);
        } catch (IOException e) {
            // Keep the original exception as cause so the actual I/O problem stays visible.
            throw new RuntimeException("File location: " + location + " could not be opened, no file will be saved", e);
        }
    }

}

View File

@ -1,10 +1,13 @@
package com.iqser.red.service.ocr.v1.server; package com.iqser.red.service.ocr.v1.server;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory; import com.fasterxml.jackson.databind.ObjectMapper;
import static org.assertj.core.api.Assertions.assertThat; import com.iqser.red.service.ocr.v1.server.service.FileStorageService;
import com.iqser.red.service.ocr.v1.server.service.OCRService;
import java.io.File; import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService;
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType;
import com.iqser.red.storage.commons.StorageAutoConfiguration;
import com.iqser.red.storage.commons.service.StorageService;
import lombok.SneakyThrows;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.AfterEach;
@ -24,15 +27,10 @@ import org.springframework.context.annotation.Primary;
import org.springframework.core.io.ClassPathResource; import org.springframework.core.io.ClassPathResource;
import org.springframework.test.context.junit.jupiter.SpringExtension; import org.springframework.test.context.junit.jupiter.SpringExtension;
import com.fasterxml.jackson.databind.ObjectMapper; import java.io.File;
import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService;
import com.iqser.red.service.ocr.v1.server.service.OCRService;
import com.iqser.red.service.ocr.v1.server.service.FileStorageService;
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType;
import com.iqser.red.storage.commons.StorageAutoConfiguration;
import com.iqser.red.storage.commons.service.StorageService;
import lombok.SneakyThrows; import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import static org.assertj.core.api.Assertions.assertThat;
@ExtendWith(SpringExtension.class) @ExtendWith(SpringExtension.class)
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT // @SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT //
@ -81,6 +79,32 @@ public class OcrServiceIntegrationTest {
} }
/**
 * Stores a sample PDF together with an image-info JSON in the storage service, runs
 * OCR on it and copies the response to a file in the temporary directory for manual
 * inspection. The test contains no assertions.
 */
@Test
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
@SneakyThrows
public void testRemoveInvisibleText() {

    String fileName = "ocr/OCR Docs/MK244 - Fitness of Analytical Method - Physical-Chemical Pro";
    // String fileName = "InvisiblePathElements";
    ClassPathResource imageInfoResource = new ClassPathResource("files/InvisibleText.IMAGE_INFO.json");
    ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf");

    var originId = FileStorageService.getStorageId("dossier", "file", FileType.ORIGIN);
    storageService.storeObject(originId, pdfFileResource.getInputStream());

    var imageId = FileStorageService.getStorageId("dossier", "file", FileType.IMAGE_INFO);
    storageService.storeObject(imageId, imageInfoResource.getInputStream());

    var response = ocrService.ocrDocument("dossier", "file");

    String outputLocation = getTemporaryDirectory() + "/" + fileName + ".pdf";
    // Close the output stream deterministically so the file handle is released and all bytes are flushed.
    try (var out = FileUtils.openOutputStream(new File(outputLocation))) {
        IOUtils.copy(response, out);
    }

    System.out.println("File:" + outputLocation);
}
@SneakyThrows @SneakyThrows
public void dummyTest() { public void dummyTest() {

View File

@ -0,0 +1 @@
{"dossierId": "55547c91-6b0e-4aa6-9009-2e7c4cd90f13", "fileId": "917b9d9c9f548f85fef3679db45ff46c", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": []}

View File

@ -0,0 +1 @@
{"dossierId": "55547c91-6b0e-4aa6-9009-2e7c4cd90f13", "fileId": "917b9d9c9f548f85fef3679db45ff46c", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": []}

View File

@ -0,0 +1 @@
{"dossierId": "55547c91-6b0e-4aa6-9009-2e7c4cd90f13", "fileId": "917b9d9c9f548f85fef3679db45ff46c", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": []}

View File

@ -0,0 +1 @@
{"dossierId": "55547c91-6b0e-4aa6-9009-2e7c4cd90f13", "fileId": "917b9d9c9f548f85fef3679db45ff46c", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": []}

View File

@ -0,0 +1 @@
{"dossierId": "55547c91-6b0e-4aa6-9009-2e7c4cd90f13", "fileId": "917b9d9c9f548f85fef3679db45ff46c", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": []}

View File

@ -0,0 +1 @@
{"dossierId": "55547c91-6b0e-4aa6-9009-2e7c4cd90f13", "fileId": "917b9d9c9f548f85fef3679db45ff46c", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": []}

View File

@ -0,0 +1,235 @@
0 TL
q
q
150 0 m
400 0 l
400 300 l
150 300 l
h
58 50 m
58 200 l
570 200 l
570 50 l
F
W
n
BT
-0.011 Tc
0 Tw
100 Tz
/C2_0 12 Tf
0 Tr
0 Ts
485.52 66.48 Td
(\0003\000D\000J\000H\000\003) Tj
0 Tc
26.4 0 Td
(\000\024) Tj
0.24 Tc
8.88 0 Td
(\000R\000I) Tj
0 Tc
9.84 0 Td
[(\000\003)-10(\000\030)] TJ
0.078 Tc
-440.88 -27.6 Td
(\0006\000W\000X\000G\000\\) Tj
-0.007 Tc
30.96 0 Td
(\0001\000X\000P\000E\000H\000U\000\035\000\003\000\003) Tj
-0.014 Tc
48.72 0 Td
(\0007\0000\0005\000\023\000\024\000\027\000\025) Tj
ET
q
154.32 0 0 76.32 385.68 643.68 cm
/Im0 Do
Q
BT
0.013 Tc
/C2_1 12 Tf
288.96 601.44 Td
(\0006\000X\000E\000V\000W\000D\000Q\000F\000H) Tj
0 Tc
-168.24 -27.6 Td
[(\0006\000<\0001\000\030\000\027\000\031\000\026\000\026\000\023\000\003)-10(\000\261)] TJ
ET
q
192.96 584.88 317.52 -13.92 re
W
n
BT
-0.006 Tc
192.96 573.84 Td
(\0007\000R\000[\000L\000F\000L\000W\000\\\000\003\0006\000W\000X\000G\000\\\000\003\000E\000\\\000\003\000'\000H\000U\000P\000D\000O\000\003\000$\000G\000P\000L\000Q\000L\000V\000W\000U\000D\000W\000L\000R\000Q\000\003\000W\000R\000\003\000+\000D\000Q\000\003\000:\000L\000V\000W\000D\000U\000\003\0005\000D\000W\000V\000\003) Tj
ET
Q
BT
-0.024 Tc
284.4 560.16 Td
(\000I\000R\000U\000\003\000\027\000\003\000:\000H\000H\000N\000V) Tj
-0.011 Tc
-14.4 -27.6 Td
(\0003\000D\000W\000K\000R\000O\000R\000J\000\\\000\003\0005\000H\000S\000R\000U\000W) Tj
ET
117.12 630 0.72 -3.6 re
f*
117.12 630 2.16 -0.72 re
f*
118.56 628.56 0.72 -2.16 re
f*
118.56 628.56 0.72 -0.72 re
f*
119.28 630 390.96 -0.72 re
f*
119.28 628.56 390.96 -0.72 re
f*
511.68 630 0.72 -3.6 re
f*
510.24 630 2.16 -0.72 re
f*
510.24 628.56 0.72 -2.16 re
f*
510.24 628.56 0.72 -0.72 re
f*
118.56 626.4 0.72 -111.84 re
f*
117.12 626.4 0.72 -111.84 re
f*
117.12 514.56 0.72 -2.16 re
f*
117.12 513.12 2.16 -0.72 re
f*
118.56 514.56 0.72 -0.72 re
f*
118.56 514.56 0.72 -0.72 re
f*
119.28 513.12 390.96 -0.72 re
f*
119.28 514.56 390.96 -0.72 re
f*
511.68 626.4 0.72 -111.84 re
f*
510.24 626.4 0.72 -111.84 re
f*
511.68 514.56 0.72 -2.16 re
f*
510.24 513.12 2.16 -0.72 re
f*
510.24 514.56 0.72 -0.72 re
f*
510.24 514.56 0.72 -0.72 re
f*
BT
0.011 Tc
89.76 266.64 Td
(\000$\0008\0007\000+\0002\0005\000\013\0006\000\f\000\035) Tj
-0.012 Tc
/C2_0 12 Tf
184.8 0.24 Td
(\0000\000L\000F\000K\000H\000O\000D\000\003\000*\000U\000H\000J\000R\000U\000L\000\003) Tj
-0.037 Tc
82.56 0 Td
(\000'\0009\0000\000\003\0003\000K\000'\000\003) Tj
0.101 Tc
55.2 0 Td
(\0003\000D\000W\000K\000R\000O) Tj
-0.02 Tc
30.72 0 Td
(\000R\000J\000L\000V\000W) Tj
-0.01 Tc
/C2_1 12 Tf
-353.28 -27.84 Td
(\000&\0002\0000\0003\000/\000\(\0007\000,\0002\0001\000\003\000'\000$\0007\000\(\000\035) Tj
-0.009 Tc
/C2_0 12 Tf
184.8 0.24 Td
(\000\023\000\030\000\003\0000\000D\000U\000F\000K\000\003\000\025\000\023\000\024\000\033) Tj
/C2_1 12 Tf
-184.8 -27.84 Td
(\000/\000$\000%\0002\0005\000$\0007\0002\0005\000<\000\003\0003\0005\0002\000-\000\(\000&\0007\000\003\000,) Tj
-0.026 Tc
152.4 0 Td
(\000'\000\035) Tj
0.009 Tc
/C2_0 12 Tf
32.4 0.24 Td
(\0005\000H\000S\000R\000U\000W\000\003\0001\000X\000P\000E) Tj
0.021 Tc
65.52 0 Td
(\000H\000U\000\035\000\003) Tj
-0.014 Tc
15.84 0 Td
(\0007\0000\0005\000\023\000\024\000\027\000\025) Tj
0.078 Tc
-81.36 -13.68 Td
(\0006\000W\000X\000G\000\\) Tj
-0.011 Tc
27.84 0 Td
(\000\003\0001\000X\000P\000E\000H\000U\000\035) Tj
-0.014 Tc
48.72 0 Td
(\0007\0000\0005\000\023\000\024\000\027\000\025) Tj
-0.008 Tc
-76.56 -13.92 Td
(\0007\000D\000V\000N\000\003\0001\000X\000P\000E\000H\000U\000\035\000\003) Tj
-0.019 Tc
72 0 Td
(\0007\000.\000\023\000\025\000\024\000\023\000\024\000\031\000\033) Tj
0 Tc
/C2_1 12 Tf
-256.8 -27.84 Td
[(\0009\0002\000/\0008\0000\000\(\000\003)-10(\000\024)] TJ
-0.052 Tc
66.24 0 Td
(\0002\000\)\000\003) Tj
0 Tc
19.68 0 Td
(\000\024) Tj
-0.01 Tc
8.88 0 Td
(\0002\000\)\000\003\0006\0007\0008\000'\000<) Tj
ET
146.88 155.04 6 -1.2 re
f*
175.68 155.04 5.76 -1.2 re
f*
BT
-0.053 Tc
89.76 142.56 Td
(\0003\000$\000*\000\(\000\003) Tj
0 Tc
36.24 0 Td
(\000\024) Tj
-0.052 Tc
9.12 0 Td
(\0002\000\)\000\003) Tj
0 Tc
19.68 0 Td
(\000\030) Tj
ET
126 141.36 6 -1.2 re
f*
154.8 141.36 6 -1.2 re
f*
/Artifact <</Subtype/Watermark/Type/Pagination>> BDC
q
/G0 gs
0.940613 0 0 0.940613 26.0628 0 cm
0 0 0 RG
0 w
/Fm0 Do
Q
EMC
/Artifact <</Contents( Page 250 of 256)/Subtype/Header/Type/Pagination>> BDC
q
/G0 gs
1 0 0 1 458.67 48.412 cm
0 0 0 RG
0 w
/Fm1 Do
Q
EMC
Q
Q