RED-6019 InvisibleTex wip

This commit is contained in:
Kilian Schuettler 2023-01-25 20:32:52 +01:00
parent f69681133c
commit 63a06625f6
10 changed files with 724 additions and 393 deletions

View File

@ -1,58 +1,53 @@
package com.iqser.red.service.ocr.v1.server.model;
import java.awt.geom.AffineTransform;
import java.awt.geom.Area;
import java.awt.geom.GeneralPath;
import java.util.ArrayDeque;
import java.util.Deque;
import com.pdftron.common.Matrix2D;
import com.pdftron.pdf.Rect;
import lombok.Data;
import lombok.SneakyThrows;
import java.awt.geom.Area;
import java.awt.geom.GeneralPath;
import java.awt.geom.Rectangle2D;
import java.util.ArrayDeque;
import java.util.Deque;
@Data
public class ClippingPathStack {
private Deque<Area> stack = new ArrayDeque<>();
// Creates the stack and pushes one initial clipping area built from the given rectangle.
// NOTE(review): this span carries two headers for the same constructor (a GeneralPath variant whose body is
// never closed, followed by a Rectangle2D variant) - it looks like both sides of a diff rendered together;
// confirm which variant is current before editing.
@SneakyThrows
public ClippingPathStack(Rect rectangle)
{
// Trace the four corners of the rectangle and close the outline.
GeneralPath path = new GeneralPath();
path.moveTo(rectangle.getX1(), rectangle.getY1());
path.lineTo(rectangle.getX2(), rectangle.getY1());
path.lineTo(rectangle.getX2(), rectangle.getY2());
path.lineTo(rectangle.getX1(), rectangle.getY2());
path.closePath();
stack.push(new Area(path));
public ClippingPathStack(Rect rectangle) {
// Initial area: origin at (x1, y1), sized by the Rect's width and height.
stack.push(new Area(new Rectangle2D.Double(rectangle.getX1(), rectangle.getY1(), rectangle.getWidth(), rectangle.getHeight()).getBounds2D()));
}
// Narrows the current clipping area to its intersection with the given path.
// NOTE(review): two headers are present in this span (one taking a Matrix2D ctm and transforming the path
// first, one taking only the path) - presumably the removed and added sides of a diff; confirm which applies.
@SneakyThrows
public void intersectClippingPath(GeneralPath path, Matrix2D ctm){
// Matrix components are passed in the order a, b, c, d, h, v.
var affineTransform = new AffineTransform(ctm.getA(), ctm.getB(), ctm.getC(), ctm.getD(), ctm.getH(), ctm.getV());
path.transform(affineTransform);
// var area = getCurrentClippingPath();
// area.transform(affineTransform);
public void intersectClippingPath(GeneralPath path) {
// Area.intersect mutates the area on top of the stack in place.
getCurrentClippingPath().intersect(new Area(path));
}
/**
 * Tests whether the given rectangle, grown by a small tolerance on every side, intersects the current
 * clipping area.
 *
 * @param x      x coordinate of the rectangle's origin
 * @param y      y coordinate of the rectangle's origin
 * @param width  width of the rectangle; may be zero
 * @param height height of the rectangle; may be zero
 * @return {@code true} if the enlarged rectangle intersects the area on top of the stack
 */
public boolean almostIntersects(double x, double y, double width, double height) {
    // Height or width are zero for straight lines, even though they are being rendered. Therefore, height or width must be at minimum >0.
    double tolerance = 1e-3;
    // Growing a rectangle always moves its origin towards smaller coordinates, whatever the sign of the
    // coordinate; shifting by sign would shrink the left/bottom edge for origins at or below zero.
    double xWithTolerance = x - tolerance;
    double yWithTolerance = y - tolerance;
    double widthWithTolerance = width + 2 * tolerance;
    double heightWithTolerance = height + 2 * tolerance;
    return getCurrentClippingPath().intersects(xWithTolerance, yWithTolerance, widthWithTolerance, heightWithTolerance);
}
/**
 * Returns the clipping area on top of the stack without removing it.
 *
 * @return the top area, or {@code null} when the stack is empty (the behaviour of {@code Deque.peek()})
 */
public Area getCurrentClippingPath() {
return stack.peek();
}
// Pushes a copy of the current clipping area, so later intersections do not modify the area below it.
// NOTE(review): the method header appears twice in this span (formatting differs only) - presumably the
// removed and added sides of a diff; confirm before editing.
public void enterNewGState(){
public void enterNewGState() {
Area current = stack.peek();
// Copy by adding the current area to a fresh, empty one.
Area cloned = new Area();
cloned.add(current);
stack.push(cloned);
}
// Removes the clipping area on top of the stack; Deque.pop() throws NoSuchElementException on an empty stack.
// NOTE(review): the method header appears twice in this span (formatting differs only) - presumably the
// removed and added sides of a diff; confirm before editing.
public void leaveGState(){
public void leaveGState() {
stack.pop();
}

View File

@ -0,0 +1,149 @@
package com.iqser.red.service.ocr.v1.server.model;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.Rect;
import lombok.*;
import lombok.experimental.SuperBuilder;
import java.awt.geom.Rectangle2D;
@Data
@SuperBuilder
@NoArgsConstructor
@AllArgsConstructor
public abstract class ElementFeatures {
private int elementType;
private Rectangle2D boundingBox;
/**
 * Checks whether the given element has the same type as these features and a bounding box that agrees
 * with the stored one within the tolerance used by {@code almostEqual}.
 *
 * @param element the element to compare against
 * @return {@code false} if the type differs, the element has no bounding box, or the boxes differ
 * @throws PDFNetException propagated from the element accessors
 */
public boolean almostMatches(Element element) throws PDFNetException {
    boolean sameType = element.getType() == elementType;
    boolean hasBoundingBox = sameType && element.getBBox() != null;
    return hasBoundingBox && rectsAlmostMatch(element.getBBox());
}
/**
 * Compares two doubles with an absolute tolerance of 1e-3.
 *
 * @param a first value
 * @param b second value
 * @return {@code true} if the absolute difference is strictly below the tolerance
 */
protected boolean almostEqual(double a, double b) {
    final double tolerance = 1e-3;
    double distance = Math.abs(a - b);
    return distance < tolerance;
}
/**
 * Compares origin, width and height of the given rectangle with the stored bounding box, each within
 * the tolerance of {@code almostEqual}. Evaluation stops at the first component that differs.
 */
@SneakyThrows
private boolean rectsAlmostMatch(Rect bBox) {
    return almostEqual(bBox.getX1(), boundingBox.getX())
            && almostEqual(bBox.getY1(), boundingBox.getY())
            && almostEqual(bBox.getWidth(), boundingBox.getWidth())
            && almostEqual(bBox.getHeight(), boundingBox.getHeight());
}
/**
 * Features recorded for a text element: its string, the type of its font and its font size, in addition
 * to the element type and bounding box held by the base class.
 */
@EqualsAndHashCode(callSuper = true)
@Data
@SuperBuilder
@NoArgsConstructor
@AllArgsConstructor
public static class Text extends ElementFeatures {

    // Value of Element.getTextString() at extraction time.
    private String text;
    // Value of GState.getFont().getType() at extraction time.
    private int font;
    // Value of GState.getFontSize() at extraction time.
    private double fontsize;

    /**
     * Matches when the base comparison succeeds, the text string is equal, the font type is equal and
     * the font size agrees within tolerance. Checks run in that order and stop at the first mismatch.
     */
    @Override
    public boolean almostMatches(Element element) throws PDFNetException {
        return super.almostMatches(element)
                && text.equals(element.getTextString())
                && font == element.getGState().getFont().getType()
                && almostEqual(fontsize, element.getGState().getFontSize());
    }
}
/**
 * Features recorded for a path element: the five boolean path flags read from the element, in addition
 * to the element type and bounding box held by the base class.
 */
@EqualsAndHashCode(callSuper = true)
@Data
@SuperBuilder
@NoArgsConstructor
@AllArgsConstructor
public static class Path extends ElementFeatures {

    private boolean isClippingPath;
    private boolean isClipWindingFill;
    private boolean isStroked;
    private boolean isFilled;
    private boolean isWindingFill;

    /**
     * Matches when the base comparison succeeds and all five flags equal those of the element. Checks
     * run in declaration order and stop at the first mismatch.
     */
    @Override
    public boolean almostMatches(Element element) throws PDFNetException {
        return super.almostMatches(element)
                && isClippingPath == element.isClippingPath()
                && isClipWindingFill == element.isClipWindingFill()
                && isStroked == element.isStroked()
                && isFilled == element.isFilled()
                && isWindingFill == element.isWindingFill();
    }
}
/**
 * Features recorded for an image element: data size, pixel dimensions, rendering intent, number of
 * components and bits per component, in addition to the element type and bounding box of the base class.
 */
@EqualsAndHashCode(callSuper = true)
@Data
@SuperBuilder
@NoArgsConstructor
public static class Image extends ElementFeatures {

    private int dataSize;
    private int height;
    private int width;
    private int renderingIntent;
    private int componentNum;
    private int bitsPerComponent;

    /**
     * Matches when the base comparison succeeds and all six image properties equal those of the
     * element. Checks run in declaration order and stop at the first mismatch.
     */
    @Override
    public boolean almostMatches(Element element) throws PDFNetException {
        return super.almostMatches(element)
                && dataSize == element.getImageDataSize()
                && height == element.getImageHeight()
                && width == element.getImageWidth()
                && renderingIntent == element.getImageRenderingIntent()
                && componentNum == element.getComponentNum()
                && bitsPerComponent == element.getBitsPerComponent();
    }
}
/**
 * Captures the features of a path, text, image or inline-image element so that it can be recognised
 * again later via {@code almostMatches}.
 *
 * @param element the element to describe
 * @return a {@code Path}, {@code Text} or {@code Image} feature object, depending on the element type
 * @throws PDFNetException               propagated from the element accessors
 * @throws UnsupportedOperationException for any other element type
 */
public static ElementFeatures extractFeatures(Element element) throws PDFNetException {
    switch (element.getType()) {
        case Element.e_path:
            return pathFeatures(element);
        case Element.e_text:
            return textFeatures(element);
        case Element.e_image:
        case Element.e_inline_image:
            return imageFeatures(element);
        default:
            throw new UnsupportedOperationException("Feature Extraction is not supported for PDFTron.Element with type: " + element.getType());
    }
}

// Records type, bounding box and the five path flags of a path element.
private static ElementFeatures pathFeatures(Element element) throws PDFNetException {
    return ElementFeatures.Path.builder()
            .elementType(element.getType())
            .boundingBox(toRectangle2D(element.getBBox()))
            .isClippingPath(element.isClippingPath())
            .isClipWindingFill(element.isClipWindingFill())
            .isStroked(element.isStroked())
            .isFilled(element.isFilled())
            .isWindingFill(element.isWindingFill())
            .build();
}

// Records type, bounding box, text string, font type and font size of a text element.
private static ElementFeatures textFeatures(Element element) throws PDFNetException {
    return ElementFeatures.Text.builder()
            .elementType(element.getType())
            .boundingBox(toRectangle2D(element.getBBox()))
            .text(element.getTextString())
            .font(element.getGState().getFont().getType())
            .fontsize(element.getGState().getFontSize())
            .build();
}

// Records type, bounding box and the six image properties of an image or inline-image element.
private static ElementFeatures imageFeatures(Element element) throws PDFNetException {
    return Image.builder()
            .elementType(element.getType())
            .boundingBox(toRectangle2D(element.getBBox()))
            .dataSize(element.getImageDataSize())
            .height(element.getImageHeight())
            .width(element.getImageWidth())
            .renderingIntent(element.getImageRenderingIntent())
            .componentNum(element.getComponentNum())
            .bitsPerComponent(element.getBitsPerComponent())
            .build();
}
/**
 * Converts a PDFTron {@code Rect} into a {@code Rectangle2D} with origin (x1, y1) and the same width
 * and height.
 */
private static Rectangle2D toRectangle2D(Rect rect) throws PDFNetException {
    double originX = rect.getX1();
    double originY = rect.getY1();
    double rectWidth = rect.getWidth();
    double rectHeight = rect.getHeight();
    return new Rectangle2D.Double(originX, originY, rectWidth, rectHeight);
}
}

View File

@ -0,0 +1,463 @@
package com.iqser.red.service.ocr.v1.server.service;
import com.iqser.red.service.ocr.v1.server.model.ClippingPathStack;
import com.iqser.red.service.ocr.v1.server.model.ElementFeatures;
import com.pdftron.common.Matrix2D;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.*;
import com.pdftron.sdf.Obj;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.FileUtils;
import org.springframework.stereotype.Service;
import java.awt.*;
import java.awt.geom.AffineTransform;
import java.awt.geom.GeneralPath;
import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.stream.Collectors;
@Slf4j
@Service
public class InvisibleElementService {
/*
handled cases:
Text or Path outside of clipping path
Text which is transparent or is set to not render
Text or Path that have been painted over by visible and filled Paths
unhandled cases:
Text covered by widely stroked path
Text same color as background
Any Text set to clipping with its many interactions with other elements
*/
/**
 * Rewrites every page of the given PDF in two passes: the first drops clipped elements and invisible
 * text and collects elements covered by later paths, the second drops those covered elements.
 *
 * @param pdfFile the PDF document as bytes
 * @param delta   when {@code true} the passes run in their debug variant and the result is additionally
 *                written to disk via {@code debugSave}
 * @return the rewritten document, saved linearized
 */
@SneakyThrows
public byte[] removeInvisibleTextOrPathElements(byte[] pdfFile, boolean delta) {
    PDFDoc pdfDoc = new PDFDoc(pdfFile);
    try {
        ElementWriter writer = new ElementWriter();
        ElementReader reader = new ElementReader();
        Set<Integer> visited = new TreeSet<>();

        for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
            Page page = iterator.next();
            List<ElementFeatures> overlappedElements = removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, reader, writer, visited, delta);
            // The second pass walks the same form objects again, so the visited set starts empty.
            visited.clear();
            removeOverlappedElements(page, reader, writer, visited, overlappedElements, delta);
        }

        // Serialise once and reuse the bytes for the debug copy and the return value.
        byte[] result = pdfDoc.save(SDFDoc.SaveMode.LINEARIZED, null);
        if (delta) {
            debugSave(result);
        }
        return result;
    } finally {
        // Release the document even when processing fails.
        pdfDoc.close();
    }
}
/**
 * First pass over a page: rewrites the page content via {@code processElements}, starting from a
 * clipping stack initialised with the page's media box.
 *
 * @return the elements that {@code processElements} recorded as overlapped
 */
private List<ElementFeatures> removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(Page page, ElementReader reader, ElementWriter writer, Set<Integer> visited, boolean delta)
        throws PDFNetException {
    List<ElementFeatures> overlapped = new ArrayList<>();
    List<ElementFeatures> visible = new ArrayList<>();
    ClippingPathStack clipStack = new ClippingPathStack(page.getMediaBox());

    // Mark the page object itself as visited before descending into its content.
    int pageObjNum = (int) page.getSDFObj().getObjNum();
    visited.add(pageObjNum);

    reader.begin(page);
    writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
    processElements(reader, writer, visited, clipStack, delta, overlapped, visible);
    writer.end();
    reader.end();

    return overlapped;
}
/**
 * Reads elements from the reader until it is exhausted and dispatches each one by type: images, text,
 * paths and forms go to their dedicated handlers, group begin/end push and pop the clipping stack and
 * are written through, and every other element is written unchanged.
 */
private void processElements(ElementReader reader, ElementWriter writer, Set<Integer> visited, ClippingPathStack clippingPathStack, Boolean delta,
                             List<ElementFeatures> coveredElements, List<ElementFeatures> visibleElements)
        throws PDFNetException {
    Element element = reader.next();
    while (element != null) {
        switch (element.getType()) {
            case Element.e_image:
            case Element.e_inline_image:
                processImages(element, writer, clippingPathStack, delta, visibleElements);
                break;
            case Element.e_text:
                processText(element, writer, clippingPathStack, delta, visibleElements);
                break;
            case Element.e_path:
                processPath(element, writer, clippingPathStack, delta, coveredElements, visibleElements);
                break;
            case Element.e_form:
                processForm(reader, writer, element, visited, clippingPathStack, delta, coveredElements, visibleElements);
                break;
            case Element.e_group_begin:
                clippingPathStack.enterNewGState();
                writer.writeElement(element);
                break;
            case Element.e_group_end:
                clippingPathStack.leaveGState();
                writer.writeElement(element);
                break;
            default:
                writer.writeElement(element);
                break;
        }
        element = reader.next();
    }
}
/**
 * Writes an image depending on whether its bounding box intersects the current clipping area.
 * In normal mode only images inside the clipping area are written (and recorded as visible); in delta
 * mode only images outside of it are written.
 *
 * @param imageElement    the image or inline-image element
 * @param writer          the writer receiving kept elements
 * @param clippingPathStack the current clipping state
 * @param delta           {@code true} for the debug variant that writes what would be removed
 * @param visibleElements collects the features of images kept in normal mode
 * @throws PDFNetException propagated from the PDFTron calls
 */
private void processImages(Element imageElement, ElementWriter writer, ClippingPathStack clippingPathStack, Boolean delta, List<ElementFeatures> visibleElements)
        throws PDFNetException {
    Rect rect = imageElement.getBBox();
    if (rect == null) {
        // Without a bounding box the image cannot be tested against the clipping area. Keep it
        // unchanged, as processText does for text without a bounding box, instead of silently dropping it.
        writer.writeElement(imageElement);
        return;
    }

    boolean inClippingPath = clippingPathStack.almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());

    if (!delta && inClippingPath) {
        visibleElements.add(ElementFeatures.extractFeatures(imageElement));
        writer.writeElement(imageElement);
    }
    if (delta && !inClippingPath) {
        writer.writeElement(imageElement);
    }
}
/**
 * Writes, drops or highlights a text element depending on clipping and visibility.
 * <ul>
 * <li>Text without a bounding box is written unchanged.</li>
 * <li>Text that is inside the clipping area and rendered visibly is recorded in {@code visibleElements}.</li>
 * <li>Normal mode: only such visible text is written.</li>
 * <li>Delta mode: text that is not rendered visibly is written in blue with fill rendering and full
 * opacity; otherwise text outside the clipping area is written in red; visible text is not written.</li>
 * <li>In both modes, an element that is not written still has its graphics state changes written when it
 * carries a text matrix (see the comment below).</li>
 * </ul>
 *
 * @throws PDFNetException propagated from the PDFTron calls
 */
private void processText(Element textElement, ElementWriter writer, ClippingPathStack clippingPathStack,
                         Boolean delta, List<ElementFeatures> visibleElements)
        throws PDFNetException {
    Rect rect = textElement.getBBox();
    if (rect == null) {
        writer.writeElement(textElement);
        return;
    }

    GState gState = textElement.getGState();
    boolean inClippingPath = clippingPathStack.almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
    boolean isTextVisible = isTextRenderedVisibly(gState);
    boolean visibleInClip = inClippingPath && isTextVisible;

    if (visibleInClip) {
        visibleElements.add(ElementFeatures.extractFeatures(textElement));
    }

    if (!delta && visibleInClip) {
        writer.writeElement(textElement);
        return;
    }
    if (delta && !isTextVisible) {
        // Checked before the clipping case so an element that is both invisible and clipped is written
        // exactly once; writing it twice would also advance the text position twice.
        gState.setFillColorSpace(ColorSpace.createDeviceRGB());
        gState.setFillColor(new ColorPt(0, 0, 1));
        gState.setTextRenderMode(GState.e_fill_text);
        gState.setFillOpacity(1);
        writer.writeElement(textElement);
        return;
    }
    if (delta && !inClippingPath) {
        gState.setFillColorSpace(ColorSpace.createDeviceRGB());
        gState.setFillColor(new ColorPt(1, 0, 0));
        writer.writeElement(textElement);
        return;
    }

    if (textElement.hasTextMatrix()) {
        /*
        PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element.
        hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands.
        Therefore, the position of a following Tj is affected by not writing the first Element.
        This is why, we write only the Tm command. The same holds for elements skipped in delta mode.
        */
        writer.writeGStateChanges(textElement);
    }
}
/**
 * Writes the form element itself and, the first time its XObject is seen, rewrites the form's content
 * through a dedicated writer using {@code processElements}.
 *
 * @throws PDFNetException propagated from the PDFTron calls
 */
private void processForm(ElementReader reader, ElementWriter writer, Element element, Set<Integer> visited, ClippingPathStack clippingPathStack, Boolean delta,
                         List<ElementFeatures> coveredElements, List<ElementFeatures> allElements)
        throws PDFNetException {
    writer.writeElement(element);

    Obj formObj = element.getXObject();
    int formObjNum = (int) formObj.getObjNum();
    // Set.add returns false when the number is already present, i.e. the form was handled before.
    if (!visited.add(formObjNum)) {
        return;
    }

    ElementWriter formWriter = new ElementWriter();
    reader.formBegin();
    formWriter.begin(formObj);

    reader.clearChangeList();
    formWriter.setDefaultGState(reader);

    processElements(reader, formWriter, visited, clippingPathStack, delta, coveredElements, allElements);

    formWriter.end();
    reader.end();
}
/**
 * Handles a path element.
 * <ul>
 * <li>Clipping paths are intersected with the current clipping area (using the element's clip winding
 * rule) and written; in delta mode their clip flag is cleared first.</li>
 * <li>Other paths inside the clipping area are recorded as visible and written in normal mode. If such a
 * path is filled with opacity 1, previously visible elements whose bounding box it contains are moved
 * to {@code overlappedElements}.</li>
 * <li>Other paths outside the clipping area are written only in delta mode, coloured red.</li>
 * </ul>
 *
 * @throws PDFNetException propagated from the PDFTron calls
 */
private void processPath(Element pathElement, ElementWriter writer, ClippingPathStack clippingPathStack, Boolean delta,
                         List<ElementFeatures> overlappedElements, List<ElementFeatures> visibleElements)
        throws PDFNetException {
    GeneralPath linePath = convertToGeneralPath(pathElement.getPathData());

    //transform path to initial user space
    linePath.transform(getAffineTransform(pathElement.getCTM()));

    Rectangle2D bounds = linePath.getBounds2D();
    boolean inClippingPath = clippingPathStack.almostIntersects(bounds.getX(), bounds.getY(), bounds.getWidth(), bounds.getHeight());

    if (pathElement.isClippingPath()) {
        int windingRule = pathElement.isClipWindingFill() ? GeneralPath.WIND_NON_ZERO : GeneralPath.WIND_EVEN_ODD;
        linePath.setWindingRule(windingRule);
        clippingPathStack.intersectClippingPath(linePath);
        pathElement.setPathClip(!delta);
        writer.writeElement(pathElement);
        return;
    }

    if (!inClippingPath) {
        if (delta) {
            GState gState = pathElement.getGState();
            gState.setFillColorSpace(ColorSpace.createDeviceRGB());
            gState.setFillColor(new ColorPt(1, 0, 0));
            gState.setStrokeColorSpace(ColorSpace.createDeviceRGB());
            gState.setStrokeColor(new ColorPt(1, 0, 0));
            writer.writeElement(pathElement);
        }
        return;
    }

    if (isFilledAndNonTransparent(pathElement)) {
        // Everything drawn earlier whose bounding box lies within this path is now painted over.
        List<ElementFeatures> nowCovered = new ArrayList<>();
        for (ElementFeatures candidate : visibleElements) {
            if (almostContains(linePath, candidate.getBoundingBox())) {
                nowCovered.add(candidate);
            }
        }
        overlappedElements.addAll(nowCovered);
        visibleElements.removeAll(nowCovered);
    }
    visibleElements.add(ElementFeatures.extractFeatures(pathElement));
    if (!delta) {
        writer.writeElement(pathElement);
    }
}
/**
 * Builds an {@code AffineTransform} from the six components of a PDFTron matrix, passed in the order
 * a, b, c, d, h, v.
 */
private static AffineTransform getAffineTransform(Matrix2D ctm) throws PDFNetException {
    double a = ctm.getA();
    double b = ctm.getB();
    double c = ctm.getC();
    double d = ctm.getD();
    double h = ctm.getH();
    double v = ctm.getV();
    return new AffineTransform(a, b, c, d, h, v);
}
/**
 * Second pass over a page: rewrites the page content and leaves out the elements recorded as overlapped.
 * In delta mode the bounding boxes of those elements are drawn in green instead and nothing is removed.
 * Logs a warning when recorded elements remain unmatched after the pass.
 *
 * @throws PDFNetException propagated from the PDFTron calls
 */
private void removeOverlappedElements(Page page, ElementReader reader, ElementWriter writer, Set<Integer> visited, List<ElementFeatures> overlappedElements, boolean delta)
        throws PDFNetException {
    reader.begin(page);
    writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());

    if (delta) {
        for (ElementFeatures feature : overlappedElements) {
            drawBBox(writer, feature.getBoundingBox(), "#00FF00");
        }
        // With an empty list the pass below writes every element through unchanged.
        overlappedElements.clear();
    }

    processOverlappedElements(reader, writer, visited, overlappedElements, delta);
    writer.end();
    reader.end();

    if (!overlappedElements.isEmpty()) {
        log.warn(overlappedElements.size() + " overlapped elements have not been found and removed");
    }
}
/**
 * Reads elements until the reader is exhausted and writes every one that does not match a recorded
 * covered element. A matching element is left out and its entry is removed from {@code coveredElements};
 * forms are descended into via {@code processFormOverlappedElements}.
 *
 * @param coveredElements features of the elements to leave out; matched entries are removed
 * @throws PDFNetException propagated from the PDFTron calls
 */
private void processOverlappedElements(ElementReader reader, ElementWriter writer, Set<Integer> visited, List<ElementFeatures> coveredElements, boolean delta)
        throws PDFNetException {
    for (Element element = reader.next(); element != null; element = reader.next()) {
        switch (element.getType()) {
            case Element.e_form:
                processFormOverlappedElements(reader, writer, element, visited, coveredElements, delta);
                break;
            case Element.e_path:
            case Element.e_image:
            case Element.e_inline_image:
            case Element.e_text:
                if (!removeFirstMatch(coveredElements, element)) {
                    writer.writeElement(element);
                } else if (element.getType() == Element.e_text && element.hasTextMatrix()) {
                    /*
                    PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element.
                    hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands.
                    Therefore, the position of a following Tj is affected by not writing the first Element.
                    This is why, we write only the Tm command:
                    */
                    writer.writeGStateChanges(element);
                }
                break;
            default:
                writer.writeElement(element);
        }
    }
}

// Removes the first entry that almost matches the element; index-based so the list is never modified
// while an iterator over it is open. Returns whether an entry was removed.
private boolean removeFirstMatch(List<ElementFeatures> candidates, Element element) throws PDFNetException {
    for (int index = 0; index < candidates.size(); index++) {
        if (candidates.get(index).almostMatches(element)) {
            candidates.remove(index);
            return true;
        }
    }
    return false;
}
/**
 * Writes the form element itself and, the first time its XObject is seen, rewrites the form's content
 * through a dedicated writer using {@code processOverlappedElements}.
 *
 * @throws PDFNetException propagated from the PDFTron calls
 */
private void processFormOverlappedElements(ElementReader reader, ElementWriter writer, Element element, Set<Integer> visited, List<ElementFeatures> elementsToRemove, boolean delta)
        throws PDFNetException {
    writer.writeElement(element);

    Obj formObj = element.getXObject();
    int formObjNum = (int) formObj.getObjNum();
    // Set.add returns false when the number is already present, i.e. the form was handled before.
    if (!visited.add(formObjNum)) {
        return;
    }

    ElementWriter formWriter = new ElementWriter();
    reader.formBegin();
    formWriter.begin(formObj);

    reader.clearChangeList();
    formWriter.setDefaultGState(reader);

    processOverlappedElements(reader, formWriter, visited, elementsToRemove, delta);

    formWriter.end();
    reader.end();
}
/**
 * Decides from the graphics state whether text is rendered visibly: invisible render mode is never
 * visible; fill mode needs a non-zero fill opacity; stroke mode needs a non-zero stroke opacity;
 * fill-and-stroke mode needs at least one of the two to be non-zero. Every other render mode counts
 * as visible.
 */
private boolean isTextRenderedVisibly(GState gState) throws PDFNetException {
    int renderMode = gState.getTextRenderMode();
    if (renderMode == GState.e_invisible_text) {
        return false;
    }
    if (renderMode == GState.e_fill_text) {
        return gState.getFillOpacity() != 0;
    }
    if (renderMode == GState.e_stroke_text) {
        return gState.getStrokeOpacity() != 0;
    }
    if (renderMode == GState.e_fill_stroke_text) {
        return gState.getFillOpacity() != 0 || gState.getStrokeOpacity() != 0;
    }
    return true;
}
/**
 * Converts PDFTron path data into a {@code GeneralPath} by replaying its operators against the flat
 * array of coordinates.
 *
 * @param pathData operators and points of a path element
 * @return the equivalent path, in the same coordinate space as the input points
 * @throws PDFNetException if an operator other than moveto, lineto, cubicto, rect or closepath occurs
 */
private GeneralPath convertToGeneralPath(PathData pathData) throws PDFNetException {
    GeneralPath linePath = new GeneralPath();

    double[] dataPoints = pathData.getPoints();
    byte[] operators = pathData.getOperators();

    // Index of the next unread coordinate; each operator consumes a fixed number of values.
    int next = 0;
    for (byte operator : operators) {
        switch (operator) {
            case PathData.e_moveto:
                linePath.moveTo(dataPoints[next++], dataPoints[next++]);
                break;
            case PathData.e_lineto:
                linePath.lineTo(dataPoints[next++], dataPoints[next++]);
                break;
            case PathData.e_cubicto:
                linePath.curveTo(dataPoints[next++], dataPoints[next++],
                        dataPoints[next++], dataPoints[next++],
                        dataPoints[next++], dataPoints[next++]);
                break;
            case PathData.e_rect: {
                double x = dataPoints[next++];
                double y = dataPoints[next++];
                double w = dataPoints[next++];
                double h = dataPoints[next++];
                linePath.moveTo(x, y);
                linePath.lineTo(x + w, y);
                linePath.lineTo(x + w, y + h);
                linePath.lineTo(x, y + h);
                // The PDF "re" operator is a complete, closed subpath. Closing it returns the current
                // point to (x, y) so that following segments do not continue from the fourth corner.
                // Path2D ignores a closePath that directly follows another one.
                linePath.closePath();
                break;
            }
            case PathData.e_closepath:
                linePath.closePath();
                break;
            default:
                throw new PDFNetException("Invalid Element Type", 0, "", "", "");
        }
    }

    return linePath;
}
/**
 * Tests whether the shape contains the given rectangle after shrinking the rectangle by a small
 * tolerance on every side.
 *
 * @param outer the shape that may cover the rectangle
 * @param inner the rectangle to test
 * @return {@code true} if the shrunken rectangle lies within the shape
 */
private boolean almostContains(Shape outer, Rectangle2D inner) {
    double tolerance = 1e-3;
    // Shrinking always moves the origin towards larger coordinates, whatever the sign of the
    // coordinate; shifting by sign would push the left/bottom edge outward for negative origins.
    double xWithTolerance = inner.getX() + tolerance;
    double yWithTolerance = inner.getY() + tolerance;
    double widthWithTolerance = inner.getWidth() - (2 * tolerance);
    double heightWithTolerance = inner.getHeight() - (2 * tolerance);
    Rectangle2D innerRect = new Rectangle2D.Double(xWithTolerance, yWithTolerance, widthWithTolerance, heightWithTolerance);
    return outer.contains(innerRect);
}
/**
 * Returns whether the element is filled and its fill opacity is exactly 1.
 */
private boolean isFilledAndNonTransparent(Element element) throws PDFNetException {
    if (!element.isFilled()) {
        return false;
    }
    double fillOpacity = element.getGState().getFillOpacity();
    return fillOpacity == 1;
}
/**
 * Writes the given bytes to {@code /tmp/delta.pdf} for debugging.
 *
 * @param pdfFile the document bytes to write
 * @throws RuntimeException if the file cannot be opened or written; the original IOException is the cause
 */
private void debugSave(byte[] pdfFile) {
    String fileLocation = "/tmp/delta.pdf";
    try (var f_out = FileUtils.openOutputStream(new File(fileLocation))) {
        f_out.write(pdfFile);
    } catch (IOException e) {
        // Keep the cause so the underlying I/O failure stays visible in the stack trace.
        throw new RuntimeException("File location: " + fileLocation + " could not be opened, no file will be saved", e);
    }
}
/**
 * Draws the outline of a rectangle with the given stroke colour.
 *
 * @param writer   the writer receiving the rectangle
 * @param r        the rectangle to outline
 * @param hexcolor colour in the form {@code #RRGGBB}; each pair is parsed as a hexadecimal byte
 */
@SneakyThrows
private void drawBBox(ElementWriter writer, Rectangle2D r, String hexcolor) {
    // Scale each hexadecimal channel from 0..255 to 0..1.
    double red = Integer.parseInt(hexcolor.substring(1, 3), 16) / 255d;
    double green = Integer.parseInt(hexcolor.substring(3, 5), 16) / 255d;
    double blue = Integer.parseInt(hexcolor.substring(5, 7), 16) / 255d;
    ColorPt strokeColor = new ColorPt(red, green, blue);

    ElementBuilder builder = new ElementBuilder();
    Element outline = builder.createRect(r.getX(), r.getY(), r.getWidth(), r.getHeight());
    outline.setPathStroke(true);

    GState outlineState = outline.getGState();
    outlineState.setStrokeColorSpace(ColorSpace.createDeviceRGB());
    outlineState.setStrokeColor(strokeColor);

    writer.writePlacedElement(outline);
}
}

View File

@ -1,63 +1,29 @@
package com.iqser.red.service.ocr.v1.server.service;
import static com.pdftron.pdf.TextExtractor.e_no_invisible_text;
import static com.pdftron.pdf.TextExtractor.e_remove_hidden_text;
import java.awt.geom.AffineTransform;
import java.awt.geom.GeneralPath;
import java.awt.geom.Point2D;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import org.apache.commons.io.IOUtils;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.stereotype.Service;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse;
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
import com.iqser.red.service.ocr.v1.server.model.ClippingPathStack;
import com.iqser.red.service.ocr.v1.server.model.ImagePosition;
import com.iqser.red.service.ocr.v1.server.model.image.ImageServiceResponse;
import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings;
import com.iqser.red.service.persistence.service.v1.api.utils.SuppressFBWarnings;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.pdftron.common.Matrix2D;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.CharData;
import com.pdftron.pdf.CharIterator;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.Font;
import com.pdftron.pdf.GSChangesIterator;
import com.pdftron.pdf.OCRModule;
import com.pdftron.pdf.OCROptions;
import com.pdftron.pdf.Optimizer;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
import com.pdftron.pdf.PathData;
import com.pdftron.pdf.Rect;
import com.pdftron.pdf.RectCollection;
import com.pdftron.pdf.TextExtractor;
import com.pdftron.sdf.Obj;
import com.pdftron.pdf.*;
import com.pdftron.sdf.SDFDoc;
import io.micrometer.core.annotation.Timed;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.IOUtils;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.stereotype.Service;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.util.*;
@Slf4j
@Service
@ -73,6 +39,7 @@ public class OCRService {
private final ObjectMapper objectMapper;
private final InvisibleElementService invisibleElementService;
@Timed("redactmanager_PDFTron-ocrDocument")
@SneakyThrows
@ -95,9 +62,11 @@ public class OCRService {
private byte[] ocr(byte[] file, String fileId, ImageServiceResponse imageServiceResponse) {
PDFDoc pdfDoc = null;
var fileWithoutInvisibleText = invisibleElementService.removeInvisibleTextOrPathElements(file, false);
try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
pdfDoc = new PDFDoc(file);
removeInvisibleText(pdfDoc);
pdfDoc = new PDFDoc(fileWithoutInvisibleText);
Map<Integer, List<ImagePosition>> pages = new HashMap<>();
@ -128,7 +97,6 @@ public class OCRService {
ocrDoc.close();
}
Optimizer.optimize(pdfDoc);
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
pdfDoc.close();
@ -206,308 +174,4 @@ public class OCRService {
}
}
/**
 * Runs {@code removeOverlapText} on every page of the document, sharing one reader, one writer and one
 * set of visited object numbers across all pages.
 * <p>
 * Invisible text in PDFs comes in two forms: text whose gState is set to invisible (this is ocr text),
 * and text with filled Path elements in front of it.
 */
@SneakyThrows
private void removeInvisibleText(PDFDoc pdfDoc) {
    ElementWriter writer = new ElementWriter();
    ElementReader reader = new ElementReader();
    Set<Integer> visited = new TreeSet<>();

    PageIterator pages = pdfDoc.getPageIterator();
    while (pages.hasNext()) {
        Page page = pages.next();
        removeOverlapText(page, reader, writer, visited);
    }
}
/**
 * Rewrites the content of one page via {@code processElements}, starting from a clipping stack
 * initialised with the page's media box.
 */
@SneakyThrows
private void removeOverlapText(Page page, ElementReader reader, ElementWriter writer, Set<Integer> visited) {
    ClippingPathStack clipStack = new ClippingPathStack(page.getMediaBox());

    // Mark the page object itself as visited before descending into its content.
    int pageObjNum = (int) page.getSDFObj().getObjNum();
    visited.add(pageObjNum);

    reader.begin(page);
    writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
    processElements(reader, writer, visited, clipStack);
    writer.end();
    reader.end();
}
/**
 * Reads elements from the reader until it is exhausted and dispatches each one by type: text, paths and
 * forms go to their dedicated handlers, group begin/end push and pop the clipping stack and are written
 * through, and every other element is written unchanged.
 */
@SneakyThrows
private void processElements(ElementReader reader, ElementWriter writer, Set<Integer> visited, ClippingPathStack clippingPathStack) {
    // Debug output of pending graphics state changes. The iterator must be advanced with next();
    // testing hasNext() alone never terminates once a change is pending.
    GSChangesIterator gs_itr = reader.getChangesIterator();
    while (gs_itr.hasNext()) {
        log.debug("---->{}", gs_itr.next());
    }

    for (Element element = reader.next(); element != null; element = reader.next()) {
        switch (element.getType()) {
            case Element.e_text:
                processText(element, writer, clippingPathStack);
                break;
            case Element.e_path:
                processPath(element, reader, writer, clippingPathStack);
                break;
            case Element.e_form:
                processForm(reader, writer, element, visited, clippingPathStack);
                break;
            case Element.e_group_begin:
                clippingPathStack.enterNewGState();
                writer.writeElement(element);
                break;
            case Element.e_group_end:
                clippingPathStack.leaveGState();
                writer.writeElement(element);
                break;
            default:
                writer.writeElement(element);
        }
    }
}
/**
 * Writes a text element only if its bounding box lies completely inside the current clipping area.
 * Text without a bounding box cannot be tested and is written unchanged.
 *
 * @param textElement       the text element to examine
 * @param writer            the writer receiving kept elements
 * @param clippingPathStack the current clipping state
 */
@SneakyThrows
private void processText(Element textElement, ElementWriter writer, ClippingPathStack clippingPathStack) {
    Rect rect = textElement.getBBox();
    if (rect == null) {
        // Keep the element rather than fail with a NullPointerException on the containment test.
        writer.writeElement(textElement);
        return;
    }

    log.debug("{} -> {}", clippingPathStack.getStack().size(), textElement.getTextString());

    boolean insideClip = clippingPathStack.getCurrentClippingPath()
            .contains(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
    if (insideClip) {
        writer.writeElement(textElement);
    }
}
/**
 * Rebuilds the element's path data as a {@code GeneralPath}; if the element is a clipping path, the
 * path is handed to the clipping stack together with the element's CTM. The element is always written.
 *
 * @param pathElement       the path element to examine
 * @param reader            not used by this method
 * @param writer            the writer receiving the element
 * @param clippingPathStack the current clipping state
 */
@SneakyThrows
private void processPath(Element pathElement,ElementReader reader, ElementWriter writer, ClippingPathStack clippingPathStack) {
    GeneralPath linePath = new GeneralPath();

    PathData pathData = pathElement.getPathData();
    double[] dataPoints = pathData.getPoints();
    byte[] operators = pathData.getOperators();

    // Index of the next unread coordinate; each operator consumes a fixed number of values.
    int next = 0;
    for (byte operator : operators) {
        switch (operator) {
            case PathData.e_moveto:
                linePath.moveTo(dataPoints[next++], dataPoints[next++]);
                break;
            case PathData.e_lineto:
                linePath.lineTo(dataPoints[next++], dataPoints[next++]);
                break;
            case PathData.e_cubicto:
                linePath.curveTo(dataPoints[next++], dataPoints[next++],
                        dataPoints[next++], dataPoints[next++],
                        dataPoints[next++], dataPoints[next++]);
                break;
            case PathData.e_rect: {
                double x = dataPoints[next++];
                double y = dataPoints[next++];
                double w = dataPoints[next++];
                double h = dataPoints[next++];
                linePath.moveTo(x, y);
                linePath.lineTo(x + w, y);
                linePath.lineTo(x + w, y + h);
                linePath.lineTo(x, y + h);
                // The PDF "re" operator is a complete, closed subpath. Closing it returns the current
                // point to (x, y) so that following segments do not continue from the fourth corner.
                linePath.closePath();
                break;
            }
            case PathData.e_closepath:
                linePath.closePath();
                break;
            default:
                throw new PDFNetException("Invalid Element Type", 0, "", "", "");
        }
    }

    // ClipWindingFill = true   W  = non-zero
    // ClipWindingFill = false  W* = even-odd
    if (pathElement.isClippingPath()) {
        int windingRule = pathElement.isClipWindingFill() ? GeneralPath.WIND_NON_ZERO : GeneralPath.WIND_EVEN_ODD;
        linePath.setWindingRule(windingRule);
        clippingPathStack.intersectClippingPath(linePath, pathElement.getCTM());
    }

    writer.writeElement(pathElement);
}
/**
 * Writes a form XObject element to the given writer and, the first time its object number is
 * encountered, processes the form's own content through a dedicated writer.
 *
 * @param reader            the reader positioned on the form element
 * @param writer            the writer the form element itself is copied to
 * @param element           the form element
 * @param visited           object numbers of the forms that have already been processed
 * @param clippingPathStack the clipping path stack handed on to the nested elements
 */
@SneakyThrows
private void processForm(ElementReader reader, ElementWriter writer, Element element, Set<Integer> visited, ClippingPathStack clippingPathStack) {
    writer.writeElement(element);

    Obj formObj = element.getXObject();
    int formObjNum = (int) formObj.getObjNum();

    // Set.add returns false when the number is already present, i.e. the form was handled before.
    if (!visited.add(formObjNum)) {
        return;
    }
    System.out.println("Form num:" + formObjNum);

    ElementWriter formWriter = new ElementWriter();
    reader.formBegin();
    formWriter.begin(formObj);

    reader.clearChangeList();
    formWriter.setDefaultGState(reader);

    processElements(reader, formWriter, visited, clippingPathStack);

    formWriter.end();
    reader.end();
}
}

View File

@ -0,0 +1,64 @@
package com.iqser.red.service.ocr.v1.server;
import com.iqser.red.service.ocr.v1.server.service.InvisibleElementService;
import lombok.SneakyThrows;
import org.apache.commons.io.FileUtils;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.context.annotation.Import;
import org.springframework.core.io.ClassPathResource;
import org.springframework.test.context.junit.jupiter.SpringExtension;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
/**
 * Spring Boot test that runs {@link InvisibleElementService} on a sample PDF and stores the
 * produced files in the temporary directory so they can be inspected manually.
 */
@ExtendWith(SpringExtension.class)
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT //
        , properties = {"pdftron.ocrmodule.path=/YourOCRModulePath"})
@Import(OcrServiceIntegrationTest.TestConfiguration.class)
public class InvisibleElementServiceTest {

    @Autowired
    private InvisibleElementService invisibleElementService;

    @MockBean
    protected RabbitTemplate rabbitTemplate;


    /**
     * Processes {@code files/InvisibleText.pdf} twice, once with the second argument set to
     * {@code false} and once with {@code true}, and writes both results to the temporary directory.
     * NOTE(review): the {@code true} variant is presumably a "delta" document containing only the
     * removed elements; confirm against InvisibleElementService.
     */
    @Test
    @SneakyThrows
    public void testRemoveInvisibleText() {

        String fileName = "InvisibleText";

        ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf");

        var initialFileBytes = Files.readAllBytes(pdfFileResource.getFile().toPath());
        var fileWithoutInvisibleText = invisibleElementService.removeInvisibleTextOrPathElements(initialFileBytes, false);
        var deltaFile = invisibleElementService.removeInvisibleTextOrPathElements(initialFileBytes, true);

        String fileWithoutInvisibleTextLocation = getTemporaryDirectory() + "/" + fileName + ".pdf";
        String deltaFileLocation = getTemporaryDirectory() + "/" + fileName + "_delta.pdf";

        saveToFile(fileWithoutInvisibleTextLocation, fileWithoutInvisibleText);
        saveToFile(deltaFileLocation, deltaFile);

        System.out.println("File:" + fileWithoutInvisibleTextLocation);
        System.out.println("File:" + deltaFileLocation);
    }


    /**
     * Writes the given bytes to the given location.
     *
     * @param location  path of the file to write
     * @param fileBytes content to write
     * @throws RuntimeException wrapping the underlying {@link IOException} if the file cannot be
     *                          opened or written
     */
    private void saveToFile(String location, byte[] fileBytes) {

        try (var f_out = FileUtils.openOutputStream(new File(location))) {
            f_out.write(fileBytes);
        } catch (IOException e) {
            // Keep the original exception as cause so the actual I/O failure stays visible.
            throw new RuntimeException("File location: " + location + " could not be opened, no file will be saved", e);
        }
    }

}

View File

@ -1,10 +1,13 @@
package com.iqser.red.service.ocr.v1.server;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import static org.assertj.core.api.Assertions.assertThat;
import java.io.File;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.ocr.v1.server.service.FileStorageService;
import com.iqser.red.service.ocr.v1.server.service.OCRService;
import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService;
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType;
import com.iqser.red.storage.commons.StorageAutoConfiguration;
import com.iqser.red.storage.commons.service.StorageService;
import lombok.SneakyThrows;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.AfterEach;
@ -24,15 +27,10 @@ import org.springframework.context.annotation.Primary;
import org.springframework.core.io.ClassPathResource;
import org.springframework.test.context.junit.jupiter.SpringExtension;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService;
import com.iqser.red.service.ocr.v1.server.service.OCRService;
import com.iqser.red.service.ocr.v1.server.service.FileStorageService;
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType;
import com.iqser.red.storage.commons.StorageAutoConfiguration;
import com.iqser.red.storage.commons.service.StorageService;
import java.io.File;
import lombok.SneakyThrows;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import static org.assertj.core.api.Assertions.assertThat;
@ExtendWith(SpringExtension.class)
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT //
@ -86,10 +84,10 @@ public class OcrServiceIntegrationTest {
@SneakyThrows
public void testRemoveInvisibleText() {
String fileName = "InvisiblePathElements";
String fileName = "ocr/OCR Docs/MK244 - Fitness of Analytical Method - Physical-Chemical Pro";
// String fileName = "InvisiblePathElements";
ClassPathResource imageInfoResource = new ClassPathResource("files/" + fileName + ".IMAGE_INFO.json");
ClassPathResource imageInfoResource = new ClassPathResource("files/InvisibleText.IMAGE_INFO.json");
ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf");
var originId = FileStorageService.getStorageId("dossier", "file", FileType.ORIGIN);
@ -107,8 +105,6 @@ public class OcrServiceIntegrationTest {
}
@SneakyThrows
public void dummyTest() {