Compare commits
3 Commits
master
...
invisibleT
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
63a06625f6 | ||
|
|
f69681133c | ||
|
|
579e6a5c67 |
@ -0,0 +1,54 @@
|
|||||||
|
package com.iqser.red.service.ocr.v1.server.model;
|
||||||
|
|
||||||
|
import com.pdftron.pdf.Rect;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
|
||||||
|
import java.awt.geom.Area;
|
||||||
|
import java.awt.geom.GeneralPath;
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.util.ArrayDeque;
|
||||||
|
import java.util.Deque;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
public class ClippingPathStack {
|
||||||
|
|
||||||
|
private Deque<Area> stack = new ArrayDeque<>();
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
public ClippingPathStack(Rect rectangle) {
|
||||||
|
stack.push(new Area(new Rectangle2D.Double(rectangle.getX1(), rectangle.getY1(), rectangle.getWidth(), rectangle.getHeight()).getBounds2D()));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
public void intersectClippingPath(GeneralPath path) {
|
||||||
|
getCurrentClippingPath().intersect(new Area(path));
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean almostIntersects(double x, double y, double width, double height) {
|
||||||
|
// Height or width are zero for straight lines, even though they are being rendered. Therefore, height or width must be at minimum >0.
|
||||||
|
double tolerance = 1e-3;
|
||||||
|
double x_with_tolerance = x > 0 ? x - tolerance : x + tolerance;
|
||||||
|
double y_with_tolerance = y > 0 ? y - tolerance : y + tolerance;
|
||||||
|
double width_with_tolerance = width + 2 * tolerance;
|
||||||
|
double height_with_tolerance = height + 2 * tolerance;
|
||||||
|
return getCurrentClippingPath().intersects(x_with_tolerance, y_with_tolerance, width_with_tolerance, height_with_tolerance);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Area getCurrentClippingPath() {
|
||||||
|
return stack.peek();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void enterNewGState() {
|
||||||
|
Area current = stack.peek();
|
||||||
|
Area cloned = new Area();
|
||||||
|
cloned.add(current);
|
||||||
|
stack.push(cloned);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void leaveGState() {
|
||||||
|
stack.pop();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,149 @@
|
|||||||
|
package com.iqser.red.service.ocr.v1.server.model;
|
||||||
|
|
||||||
|
import com.pdftron.common.PDFNetException;
|
||||||
|
import com.pdftron.pdf.Element;
|
||||||
|
import com.pdftron.pdf.Rect;
|
||||||
|
import lombok.*;
|
||||||
|
import lombok.experimental.SuperBuilder;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@SuperBuilder
|
||||||
|
@NoArgsConstructor
|
||||||
|
@AllArgsConstructor
|
||||||
|
public abstract class ElementFeatures {
|
||||||
|
private int elementType;
|
||||||
|
private Rectangle2D boundingBox;
|
||||||
|
|
||||||
|
public boolean almostMatches(Element element) throws PDFNetException {
|
||||||
|
if (element.getType() != elementType) return false;
|
||||||
|
if (element.getBBox() == null) return false;
|
||||||
|
return rectsAlmostMatch(element.getBBox());
|
||||||
|
}
|
||||||
|
|
||||||
|
protected boolean almostEqual(double a, double b) {
|
||||||
|
double tolerance = 1e-3;
|
||||||
|
return Math.abs(a - b) < tolerance;
|
||||||
|
}
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
private boolean rectsAlmostMatch(Rect bBox) {
|
||||||
|
if (!almostEqual(bBox.getX1(), boundingBox.getX())) return false;
|
||||||
|
if (!almostEqual(bBox.getY1(), boundingBox.getY())) return false;
|
||||||
|
if (!almostEqual(bBox.getWidth(), boundingBox.getWidth())) return false;
|
||||||
|
return almostEqual(bBox.getHeight(), boundingBox.getHeight());
|
||||||
|
}
|
||||||
|
|
||||||
|
@EqualsAndHashCode(callSuper = true)
|
||||||
|
@Data
|
||||||
|
@SuperBuilder
|
||||||
|
@NoArgsConstructor
|
||||||
|
@AllArgsConstructor
|
||||||
|
public static class Text extends ElementFeatures {
|
||||||
|
private String text;
|
||||||
|
private int font;
|
||||||
|
private double fontsize;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean almostMatches(Element element) throws PDFNetException {
|
||||||
|
if (!super.almostMatches(element)) return false;
|
||||||
|
if (!text.equals(element.getTextString())) return false;
|
||||||
|
if (font != element.getGState().getFont().getType()) return false;
|
||||||
|
return almostEqual(fontsize, element.getGState().getFontSize());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@EqualsAndHashCode(callSuper = true)
|
||||||
|
@Data
|
||||||
|
@SuperBuilder
|
||||||
|
@NoArgsConstructor
|
||||||
|
@AllArgsConstructor
|
||||||
|
public static class Path extends ElementFeatures {
|
||||||
|
private boolean isClippingPath;
|
||||||
|
private boolean isClipWindingFill;
|
||||||
|
private boolean isStroked;
|
||||||
|
private boolean isFilled;
|
||||||
|
private boolean isWindingFill;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean almostMatches(Element element) throws PDFNetException {
|
||||||
|
if (!super.almostMatches(element)) return false;
|
||||||
|
if (isClippingPath != element.isClippingPath()) return false;
|
||||||
|
if (isClipWindingFill != element.isClipWindingFill()) return false;
|
||||||
|
if (isStroked != element.isStroked()) return false;
|
||||||
|
if (isFilled != element.isFilled()) return false;
|
||||||
|
if (isWindingFill != element.isWindingFill()) return false;
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@EqualsAndHashCode(callSuper = true)
|
||||||
|
@Data
|
||||||
|
@SuperBuilder
|
||||||
|
@NoArgsConstructor
|
||||||
|
public static class Image extends ElementFeatures {
|
||||||
|
private int dataSize;
|
||||||
|
private int height;
|
||||||
|
private int width;
|
||||||
|
private int renderingIntent;
|
||||||
|
private int componentNum;
|
||||||
|
private int bitsPerComponent;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean almostMatches(Element element) throws PDFNetException {
|
||||||
|
if (!super.almostMatches(element)) return false;
|
||||||
|
if (dataSize != element.getImageDataSize()) return false;
|
||||||
|
if (height != element.getImageHeight()) return false;
|
||||||
|
if (width != element.getImageWidth()) return false;
|
||||||
|
if (renderingIntent != element.getImageRenderingIntent()) return false;
|
||||||
|
if (componentNum != element.getComponentNum()) return false;
|
||||||
|
if (bitsPerComponent != element.getBitsPerComponent()) return false;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static ElementFeatures extractFeatures(Element element) throws PDFNetException {
|
||||||
|
switch (element.getType()) {
|
||||||
|
case Element.e_path:
|
||||||
|
return ElementFeatures.Path.builder()
|
||||||
|
.elementType(element.getType())
|
||||||
|
.boundingBox(toRectangle2D(element.getBBox()))
|
||||||
|
.isClippingPath(element.isClippingPath())
|
||||||
|
.isClipWindingFill(element.isClipWindingFill())
|
||||||
|
.isStroked(element.isStroked())
|
||||||
|
.isFilled(element.isFilled())
|
||||||
|
.isWindingFill(element.isWindingFill())
|
||||||
|
.build();
|
||||||
|
case Element.e_text:
|
||||||
|
return ElementFeatures.Text.builder()
|
||||||
|
.elementType(element.getType())
|
||||||
|
.boundingBox(toRectangle2D(element.getBBox()))
|
||||||
|
.text(element.getTextString())
|
||||||
|
.font(element.getGState().getFont().getType())
|
||||||
|
.fontsize(element.getGState().getFontSize())
|
||||||
|
.build();
|
||||||
|
case Element.e_image:
|
||||||
|
case Element.e_inline_image:
|
||||||
|
return Image.builder()
|
||||||
|
.elementType(element.getType())
|
||||||
|
.boundingBox(toRectangle2D(element.getBBox()))
|
||||||
|
.dataSize(element.getImageDataSize())
|
||||||
|
.height(element.getImageHeight())
|
||||||
|
.width(element.getImageWidth())
|
||||||
|
.renderingIntent(element.getImageRenderingIntent())
|
||||||
|
.componentNum(element.getComponentNum())
|
||||||
|
.bitsPerComponent(element.getBitsPerComponent())
|
||||||
|
.build();
|
||||||
|
default:
|
||||||
|
throw new UnsupportedOperationException("Feature Extraction is not supported for PDFTron.Element with type: " + element.getType());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Rectangle2D toRectangle2D(Rect rect) throws PDFNetException {
|
||||||
|
return new Rectangle2D.Double(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@ -0,0 +1,463 @@
|
|||||||
|
package com.iqser.red.service.ocr.v1.server.service;
|
||||||
|
|
||||||
|
import com.iqser.red.service.ocr.v1.server.model.ClippingPathStack;
|
||||||
|
import com.iqser.red.service.ocr.v1.server.model.ElementFeatures;
|
||||||
|
import com.pdftron.common.Matrix2D;
|
||||||
|
import com.pdftron.common.PDFNetException;
|
||||||
|
import com.pdftron.pdf.*;
|
||||||
|
import com.pdftron.sdf.Obj;
|
||||||
|
import com.pdftron.sdf.SDFDoc;
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import org.apache.commons.io.FileUtils;
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import java.awt.*;
|
||||||
|
import java.awt.geom.AffineTransform;
|
||||||
|
import java.awt.geom.GeneralPath;
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.TreeSet;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
@Slf4j
|
||||||
|
@Service
|
||||||
|
public class InvisibleElementService {
|
||||||
|
|
||||||
|
/*
|
||||||
|
handled cases:
|
||||||
|
Text or Path outside of clipping path
|
||||||
|
Text which is transparent or is set to not render
|
||||||
|
Text or Path that have been painted over by visible and filled Paths
|
||||||
|
unhandled cases:
|
||||||
|
Text covered by widely stroked path
|
||||||
|
Text same color as background
|
||||||
|
Any Text set to clipping with its many interactions with other elements
|
||||||
|
*/
|
||||||
|
@SneakyThrows
|
||||||
|
public byte[] removeInvisibleTextOrPathElements(byte[] pdfFile, boolean delta) {
|
||||||
|
PDFDoc pdfDoc = new PDFDoc(pdfFile);
|
||||||
|
|
||||||
|
ElementWriter writer = new ElementWriter();
|
||||||
|
ElementReader reader = new ElementReader();
|
||||||
|
Set<Integer> visited = new TreeSet<>();
|
||||||
|
|
||||||
|
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
|
||||||
|
Page page = iterator.next();
|
||||||
|
List<ElementFeatures> overlappedElements = removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, reader, writer, visited, delta);
|
||||||
|
visited.clear();
|
||||||
|
removeOverlappedElements(page, reader, writer, visited, overlappedElements, delta);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (delta) {
|
||||||
|
debugSave(pdfDoc.save(SDFDoc.SaveMode.LINEARIZED, null));
|
||||||
|
}
|
||||||
|
|
||||||
|
return pdfDoc.save(SDFDoc.SaveMode.LINEARIZED, null);
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<ElementFeatures> removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(Page page, ElementReader reader, ElementWriter writer, Set<Integer> visited, boolean delta)
|
||||||
|
throws PDFNetException {
|
||||||
|
var overlappedElements = new ArrayList<ElementFeatures>();
|
||||||
|
var visibleElements = new ArrayList<ElementFeatures>();
|
||||||
|
ClippingPathStack clippingPathStack = new ClippingPathStack(page.getMediaBox());
|
||||||
|
visited.add((int) page.getSDFObj().getObjNum());
|
||||||
|
reader.begin(page);
|
||||||
|
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
|
||||||
|
processElements(reader, writer, visited, clippingPathStack, delta, overlappedElements, visibleElements);
|
||||||
|
writer.end();
|
||||||
|
reader.end();
|
||||||
|
return overlappedElements;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void processElements(ElementReader reader, ElementWriter writer, Set<Integer> visited, ClippingPathStack clippingPathStack, Boolean delta,
|
||||||
|
List<ElementFeatures> coveredElements, List<ElementFeatures> visibleElements)
|
||||||
|
throws PDFNetException {
|
||||||
|
|
||||||
|
for (Element element = reader.next(); element != null; element = reader.next())
|
||||||
|
switch (element.getType()) {
|
||||||
|
|
||||||
|
case Element.e_image:
|
||||||
|
case Element.e_inline_image:
|
||||||
|
processImages(element, writer, clippingPathStack, delta, visibleElements);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case Element.e_text:
|
||||||
|
processText(element, writer, clippingPathStack, delta, visibleElements);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case Element.e_path:
|
||||||
|
processPath(element, writer, clippingPathStack, delta, coveredElements, visibleElements);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case Element.e_form:
|
||||||
|
processForm(reader, writer, element, visited, clippingPathStack, delta, coveredElements, visibleElements);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case Element.e_group_begin:
|
||||||
|
clippingPathStack.enterNewGState();
|
||||||
|
writer.writeElement(element);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case Element.e_group_end:
|
||||||
|
clippingPathStack.leaveGState();
|
||||||
|
writer.writeElement(element);
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
writer.writeElement(element);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void processImages(Element imageElement, ElementWriter writer, ClippingPathStack clippingPathStack, Boolean delta, List<ElementFeatures> visibleElements)
|
||||||
|
throws PDFNetException {
|
||||||
|
|
||||||
|
Rect rect = imageElement.getBBox();
|
||||||
|
|
||||||
|
if (rect == null) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean inClippingPath = clippingPathStack.almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
|
||||||
|
|
||||||
|
if (!delta && inClippingPath) {
|
||||||
|
visibleElements.add(ElementFeatures.extractFeatures(imageElement));
|
||||||
|
writer.writeElement(imageElement);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (delta && !inClippingPath) {
|
||||||
|
writer.writeElement(imageElement);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void processText(Element textElement, ElementWriter writer, ClippingPathStack clippingPathStack,
|
||||||
|
Boolean delta, List<ElementFeatures> visibleElements)
|
||||||
|
throws PDFNetException {
|
||||||
|
|
||||||
|
Rect rect = textElement.getBBox();
|
||||||
|
|
||||||
|
if (rect == null) {
|
||||||
|
writer.writeElement(textElement);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
GState gState = textElement.getGState();
|
||||||
|
|
||||||
|
boolean inClippingPath = clippingPathStack.almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
|
||||||
|
|
||||||
|
boolean isTextVisible = isTextRenderedVisibly(gState);
|
||||||
|
|
||||||
|
if (inClippingPath && isTextVisible) {
|
||||||
|
visibleElements.add(ElementFeatures.extractFeatures(textElement));
|
||||||
|
}
|
||||||
|
if (!delta) {
|
||||||
|
if (inClippingPath && isTextVisible) {
|
||||||
|
writer.writeElement(textElement);
|
||||||
|
} else if (textElement.hasTextMatrix()) {
|
||||||
|
/*
|
||||||
|
PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element.
|
||||||
|
hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands.
|
||||||
|
Therefore, the position of a following Tj is affected by not writing the first Element.
|
||||||
|
This is why, we write only the Tm command:
|
||||||
|
*/
|
||||||
|
writer.writeGStateChanges(textElement);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (!inClippingPath) {
|
||||||
|
gState.setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||||
|
gState.setFillColor(new ColorPt(1, 0, 0));
|
||||||
|
writer.writeElement(textElement);
|
||||||
|
}
|
||||||
|
if (!isTextVisible) {
|
||||||
|
gState.setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||||
|
gState.setFillColor(new ColorPt(0, 0, 1));
|
||||||
|
gState.setTextRenderMode(GState.e_fill_text);
|
||||||
|
gState.setFillOpacity(1);
|
||||||
|
writer.writeElement(textElement);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void processForm(ElementReader reader, ElementWriter writer, Element element, Set<Integer> visited, ClippingPathStack clippingPathStack, Boolean delta,
|
||||||
|
List<ElementFeatures> coveredElements, List<ElementFeatures> allElements)
|
||||||
|
throws PDFNetException {
|
||||||
|
|
||||||
|
writer.writeElement(element);
|
||||||
|
Obj formObj = element.getXObject();
|
||||||
|
|
||||||
|
if (!visited.contains((int) formObj.getObjNum())) {
|
||||||
|
visited.add((int) formObj.getObjNum());
|
||||||
|
ElementWriter new_writer = new ElementWriter();
|
||||||
|
reader.formBegin();
|
||||||
|
new_writer.begin(formObj);
|
||||||
|
|
||||||
|
reader.clearChangeList();
|
||||||
|
new_writer.setDefaultGState(reader);
|
||||||
|
|
||||||
|
processElements(reader, new_writer, visited, clippingPathStack, delta, coveredElements, allElements);
|
||||||
|
new_writer.end();
|
||||||
|
reader.end();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void processPath(Element pathElement, ElementWriter writer, ClippingPathStack clippingPathStack, Boolean delta,
|
||||||
|
List<ElementFeatures> overlappedElements, List<ElementFeatures> visibleElements)
|
||||||
|
throws PDFNetException {
|
||||||
|
|
||||||
|
GeneralPath linePath = convertToGeneralPath(pathElement.getPathData());
|
||||||
|
|
||||||
|
//transform path to initial user space
|
||||||
|
var ctm = pathElement.getCTM();
|
||||||
|
var affineTransform = getAffineTransform(ctm);
|
||||||
|
linePath.transform(affineTransform);
|
||||||
|
|
||||||
|
var rect = linePath.getBounds2D();
|
||||||
|
|
||||||
|
boolean inClippingPath = clippingPathStack.almostIntersects(rect.getX(), rect.getY(), rect.getWidth(), rect.getHeight());
|
||||||
|
|
||||||
|
if (pathElement.isClippingPath()) {
|
||||||
|
if (pathElement.isClipWindingFill()) {
|
||||||
|
linePath.setWindingRule(GeneralPath.WIND_NON_ZERO);
|
||||||
|
} else {
|
||||||
|
linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD);
|
||||||
|
}
|
||||||
|
|
||||||
|
clippingPathStack.intersectClippingPath(linePath);
|
||||||
|
pathElement.setPathClip(!delta);
|
||||||
|
writer.writeElement(pathElement);
|
||||||
|
|
||||||
|
} else {
|
||||||
|
if (inClippingPath) {
|
||||||
|
if (isFilledAndNonTransparent(pathElement)) {
|
||||||
|
List<ElementFeatures> currentOverlappedElements = visibleElements.stream()
|
||||||
|
.filter(features -> almostContains(linePath, features.getBoundingBox()))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
overlappedElements.addAll(currentOverlappedElements);
|
||||||
|
visibleElements.removeAll(currentOverlappedElements);
|
||||||
|
}
|
||||||
|
visibleElements.add(ElementFeatures.extractFeatures(pathElement));
|
||||||
|
if (!delta) {
|
||||||
|
writer.writeElement(pathElement);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (delta && !inClippingPath) {
|
||||||
|
pathElement.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||||
|
pathElement.getGState().setFillColor(new ColorPt(1, 0, 0));
|
||||||
|
pathElement.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
||||||
|
pathElement.getGState().setStrokeColor(new ColorPt(1, 0, 0));
|
||||||
|
writer.writeElement(pathElement);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static AffineTransform getAffineTransform(Matrix2D ctm) throws PDFNetException {
|
||||||
|
return new AffineTransform(ctm.getA(), ctm.getB(), ctm.getC(), ctm.getD(), ctm.getH(), ctm.getV());
|
||||||
|
}
|
||||||
|
|
||||||
|
private void removeOverlappedElements(Page page, ElementReader reader, ElementWriter writer, Set<Integer> visited, List<ElementFeatures> overlappedElements, boolean delta)
|
||||||
|
throws PDFNetException {
|
||||||
|
reader.begin(page);
|
||||||
|
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
|
||||||
|
if (delta) {
|
||||||
|
overlappedElements.forEach(feature -> drawBBox(writer, feature.getBoundingBox(), "#00FF00"));
|
||||||
|
overlappedElements.clear();
|
||||||
|
}
|
||||||
|
processOverlappedElements(reader, writer, visited, overlappedElements, delta);
|
||||||
|
writer.end();
|
||||||
|
reader.end();
|
||||||
|
|
||||||
|
if (overlappedElements.size() > 0) {
|
||||||
|
log.warn(overlappedElements.size() + " overlapped elements have not been found and removed");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void processOverlappedElements(ElementReader reader, ElementWriter writer, Set<Integer> visited, List<ElementFeatures> coveredElements, boolean delta)
|
||||||
|
throws PDFNetException {
|
||||||
|
for (Element element = reader.next(); element != null; element = reader.next()) {
|
||||||
|
switch (element.getType()) {
|
||||||
|
case Element.e_form:
|
||||||
|
processFormOverlappedElements(reader, writer, element, visited, coveredElements, delta);
|
||||||
|
break;
|
||||||
|
case Element.e_path:
|
||||||
|
case Element.e_image:
|
||||||
|
case Element.e_inline_image:
|
||||||
|
case Element.e_text:
|
||||||
|
boolean anyMatch = false;
|
||||||
|
for (ElementFeatures elementToRemove : coveredElements) {
|
||||||
|
if (elementToRemove.almostMatches(element)) {
|
||||||
|
coveredElements.remove(elementToRemove);
|
||||||
|
anyMatch = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!anyMatch) {
|
||||||
|
writer.writeElement(element);
|
||||||
|
} else if (element.getType() == 3 && element.hasTextMatrix()) {
|
||||||
|
/*
|
||||||
|
PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element.
|
||||||
|
hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands.
|
||||||
|
Therefore, the position of a following Tj is affected by not writing the first Element.
|
||||||
|
This is why, we write only the Tm command:
|
||||||
|
*/
|
||||||
|
writer.writeGStateChanges(element);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
writer.writeElement(element);
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void processFormOverlappedElements(ElementReader reader, ElementWriter writer, Element element, Set<Integer> visited, List<ElementFeatures> elementsToRemove, boolean delta)
|
||||||
|
throws PDFNetException {
|
||||||
|
|
||||||
|
writer.writeElement(element);
|
||||||
|
Obj formObj = element.getXObject();
|
||||||
|
|
||||||
|
if (!visited.contains((int) formObj.getObjNum())) {
|
||||||
|
visited.add((int) formObj.getObjNum());
|
||||||
|
ElementWriter new_writer = new ElementWriter();
|
||||||
|
reader.formBegin();
|
||||||
|
new_writer.begin(formObj);
|
||||||
|
|
||||||
|
reader.clearChangeList();
|
||||||
|
new_writer.setDefaultGState(reader);
|
||||||
|
|
||||||
|
processOverlappedElements(reader, new_writer, visited, elementsToRemove, delta);
|
||||||
|
new_writer.end();
|
||||||
|
reader.end();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean isTextRenderedVisibly(GState gState) throws PDFNetException {
|
||||||
|
if (gState.getTextRenderMode() == GState.e_invisible_text) return false;
|
||||||
|
if (gState.getTextRenderMode() == GState.e_fill_text && gState.getFillOpacity() == 0) return false;
|
||||||
|
if (gState.getTextRenderMode() == GState.e_stroke_text && gState.getStrokeOpacity() == 0) return false;
|
||||||
|
if (gState.getTextRenderMode() == GState.e_fill_stroke_text && gState.getFillOpacity() == 0 && gState.getStrokeOpacity() == 0) return false;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
private GeneralPath convertToGeneralPath(PathData pathData) throws PDFNetException {
|
||||||
|
GeneralPath linePath = new GeneralPath();
|
||||||
|
|
||||||
|
double[] dataPoints = pathData.getPoints();
|
||||||
|
byte[] opr = pathData.getOperators();
|
||||||
|
|
||||||
|
double x1;
|
||||||
|
double y1;
|
||||||
|
double x2;
|
||||||
|
double y2;
|
||||||
|
double x3;
|
||||||
|
double y3;
|
||||||
|
|
||||||
|
int data_index = 0;
|
||||||
|
for (int opr_index = 0; opr_index < opr.length; ++opr_index) {
|
||||||
|
switch (opr[opr_index]) {
|
||||||
|
case PathData.e_moveto:
|
||||||
|
x1 = dataPoints[data_index];
|
||||||
|
++data_index;
|
||||||
|
y1 = dataPoints[data_index];
|
||||||
|
++data_index;
|
||||||
|
|
||||||
|
linePath.moveTo(x1, y1);
|
||||||
|
break;
|
||||||
|
case PathData.e_lineto:
|
||||||
|
x1 = dataPoints[data_index];
|
||||||
|
++data_index;
|
||||||
|
y1 = dataPoints[data_index];
|
||||||
|
++data_index;
|
||||||
|
|
||||||
|
linePath.lineTo(x1, y1);
|
||||||
|
break;
|
||||||
|
case PathData.e_cubicto:
|
||||||
|
x1 = dataPoints[data_index];
|
||||||
|
++data_index;
|
||||||
|
y1 = dataPoints[data_index];
|
||||||
|
++data_index;
|
||||||
|
x2 = dataPoints[data_index];
|
||||||
|
++data_index;
|
||||||
|
y2 = dataPoints[data_index];
|
||||||
|
++data_index;
|
||||||
|
x3 = dataPoints[data_index];
|
||||||
|
++data_index;
|
||||||
|
y3 = dataPoints[data_index];
|
||||||
|
++data_index;
|
||||||
|
|
||||||
|
linePath.curveTo(x1, y1, x2, y2, x3, y3);
|
||||||
|
break;
|
||||||
|
case PathData.e_rect:
|
||||||
|
x1 = dataPoints[data_index];
|
||||||
|
++data_index;
|
||||||
|
y1 = dataPoints[data_index];
|
||||||
|
++data_index;
|
||||||
|
double w = dataPoints[data_index];
|
||||||
|
++data_index;
|
||||||
|
double h = dataPoints[data_index];
|
||||||
|
++data_index;
|
||||||
|
x2 = x1 + w;
|
||||||
|
y2 = y1;
|
||||||
|
x3 = x2;
|
||||||
|
y3 = y1 + h;
|
||||||
|
double x4 = x1;
|
||||||
|
double y4 = y3;
|
||||||
|
|
||||||
|
linePath.moveTo(x1, y1);
|
||||||
|
linePath.lineTo(x2, y2);
|
||||||
|
linePath.lineTo(x3, y3);
|
||||||
|
linePath.lineTo(x4, y4);
|
||||||
|
break;
|
||||||
|
case PathData.e_closepath:
|
||||||
|
linePath.closePath();
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
throw new PDFNetException("Invalid Element Type", 0, "", "", "");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return linePath;
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean almostContains(Shape outer, Rectangle2D inner) {
|
||||||
|
double tolerance = 1e-3;
|
||||||
|
|
||||||
|
double x_with_tolerance = inner.getX() >= 0 ? inner.getX() + tolerance : inner.getX() - tolerance;
|
||||||
|
double y_with_tolerance = inner.getY() >= 0 ? inner.getY() + tolerance : inner.getY() - tolerance;
|
||||||
|
double height_with_tolerance = inner.getHeight() - (2 * tolerance);
|
||||||
|
double width_with_tolerance = inner.getWidth() - (2 * tolerance);
|
||||||
|
Rectangle2D innerRect = new Rectangle2D.Double(x_with_tolerance, y_with_tolerance, width_with_tolerance, height_with_tolerance);
|
||||||
|
|
||||||
|
return outer.contains(innerRect);
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean isFilledAndNonTransparent(Element element) throws PDFNetException {
|
||||||
|
return element.isFilled() && element.getGState().getFillOpacity() == 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void debugSave(byte[] pdfFile) {
|
||||||
|
String fileLocation = "/tmp/delta.pdf";
|
||||||
|
try (var f_out = FileUtils.openOutputStream(new File(fileLocation))) {
|
||||||
|
f_out.write(pdfFile);
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException("File location: " + fileLocation + "could not be openend, no file will be saved");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
private void drawBBox(ElementWriter writer, Rectangle2D r, String hexcolor) {
|
||||||
|
ColorPt colorPt = new ColorPt(Integer.valueOf(hexcolor.substring(1, 3), 16) / 255d,
|
||||||
|
Integer.valueOf(hexcolor.substring(3, 5), 16) / 255d,
|
||||||
|
Integer.valueOf(hexcolor.substring(5, 7), 16) / 255d);
|
||||||
|
ElementBuilder eb = new ElementBuilder();
|
||||||
|
Element rect = eb.createRect(r.getX(), r.getY(), r.getWidth(), r.getHeight());
|
||||||
|
rect.setPathStroke(true);
|
||||||
|
rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
||||||
|
rect.getGState().setStrokeColor(colorPt);
|
||||||
|
writer.writePlacedElement(rect);
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -1,21 +1,5 @@
|
|||||||
package com.iqser.red.service.ocr.v1.server.service;
|
package com.iqser.red.service.ocr.v1.server.service;
|
||||||
|
|
||||||
import java.io.ByteArrayInputStream;
|
|
||||||
import java.io.ByteArrayOutputStream;
|
|
||||||
import java.io.InputStream;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Set;
|
|
||||||
import java.util.TreeSet;
|
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
|
||||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
|
||||||
import org.springframework.stereotype.Service;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse;
|
import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse;
|
||||||
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
|
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
|
||||||
@ -25,25 +9,21 @@ import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings;
|
|||||||
import com.iqser.red.service.persistence.service.v1.api.utils.SuppressFBWarnings;
|
import com.iqser.red.service.persistence.service.v1.api.utils.SuppressFBWarnings;
|
||||||
import com.iqser.red.service.redaction.v1.model.Point;
|
import com.iqser.red.service.redaction.v1.model.Point;
|
||||||
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||||
import com.pdftron.common.PDFNetException;
|
import com.pdftron.pdf.*;
|
||||||
import com.pdftron.pdf.Element;
|
|
||||||
import com.pdftron.pdf.ElementReader;
|
|
||||||
import com.pdftron.pdf.ElementWriter;
|
|
||||||
import com.pdftron.pdf.OCRModule;
|
|
||||||
import com.pdftron.pdf.OCROptions;
|
|
||||||
import com.pdftron.pdf.Optimizer;
|
|
||||||
import com.pdftron.pdf.PDFDoc;
|
|
||||||
import com.pdftron.pdf.Page;
|
|
||||||
import com.pdftron.pdf.PageIterator;
|
|
||||||
import com.pdftron.pdf.Rect;
|
|
||||||
import com.pdftron.pdf.RectCollection;
|
|
||||||
import com.pdftron.sdf.Obj;
|
|
||||||
import com.pdftron.sdf.SDFDoc;
|
import com.pdftron.sdf.SDFDoc;
|
||||||
|
|
||||||
import io.micrometer.core.annotation.Timed;
|
import io.micrometer.core.annotation.Timed;
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import java.io.ByteArrayInputStream;
|
||||||
|
import java.io.ByteArrayOutputStream;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
|
||||||
@Slf4j
|
@Slf4j
|
||||||
@Service
|
@Service
|
||||||
@ -59,6 +39,7 @@ public class OCRService {
|
|||||||
|
|
||||||
private final ObjectMapper objectMapper;
|
private final ObjectMapper objectMapper;
|
||||||
|
|
||||||
|
private final InvisibleElementService invisibleElementService;
|
||||||
|
|
||||||
@Timed("redactmanager_PDFTron-ocrDocument")
|
@Timed("redactmanager_PDFTron-ocrDocument")
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
@ -81,9 +62,11 @@ public class OCRService {
|
|||||||
private byte[] ocr(byte[] file, String fileId, ImageServiceResponse imageServiceResponse) {
|
private byte[] ocr(byte[] file, String fileId, ImageServiceResponse imageServiceResponse) {
|
||||||
|
|
||||||
PDFDoc pdfDoc = null;
|
PDFDoc pdfDoc = null;
|
||||||
|
|
||||||
|
var fileWithoutInvisibleText = invisibleElementService.removeInvisibleTextOrPathElements(file, false);
|
||||||
|
|
||||||
try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
|
try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
|
||||||
pdfDoc = new PDFDoc(file);
|
pdfDoc = new PDFDoc(fileWithoutInvisibleText);
|
||||||
removeInvisibleText(pdfDoc);
|
|
||||||
|
|
||||||
Map<Integer, List<ImagePosition>> pages = new HashMap<>();
|
Map<Integer, List<ImagePosition>> pages = new HashMap<>();
|
||||||
|
|
||||||
@ -114,7 +97,6 @@ public class OCRService {
|
|||||||
|
|
||||||
ocrDoc.close();
|
ocrDoc.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
Optimizer.optimize(pdfDoc);
|
Optimizer.optimize(pdfDoc);
|
||||||
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
|
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
|
||||||
pdfDoc.close();
|
pdfDoc.close();
|
||||||
@ -192,134 +174,4 @@ public class OCRService {
|
|||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* There are 2 possibilities to have invisible Text in pdfs.
|
|
||||||
* 1. gState is set to invisible, this is ocr text.
|
|
||||||
* 2. Filled Path elements in front of the text.
|
|
||||||
*/
|
|
||||||
@SneakyThrows
|
|
||||||
private void removeInvisibleText(PDFDoc pdfDoc) {
|
|
||||||
|
|
||||||
ElementWriter writer = new ElementWriter();
|
|
||||||
ElementReader reader = new ElementReader();
|
|
||||||
Set<Integer> visited = new TreeSet<>();
|
|
||||||
|
|
||||||
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
|
|
||||||
Page page = iterator.next();
|
|
||||||
removeOverlapText(page, reader, writer, visited);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
private void removeOverlapText(Page page, ElementReader reader, ElementWriter writer, Set<Integer> visited) {
|
|
||||||
|
|
||||||
visited.add((int) page.getSDFObj().getObjNum());
|
|
||||||
reader.begin(page);
|
|
||||||
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
|
|
||||||
processElements(reader, writer, visited, false);
|
|
||||||
writer.end();
|
|
||||||
reader.end();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
private void processElements(ElementReader reader, ElementWriter writer, Set<Integer> visited, boolean isInForm) {
|
|
||||||
|
|
||||||
Set<Rect> filledRectangles = new HashSet<>();
|
|
||||||
for (Element element = reader.next(); element != null; element = reader.next())
|
|
||||||
|
|
||||||
switch (element.getType()) {
|
|
||||||
case Element.e_image:
|
|
||||||
case Element.e_inline_image:
|
|
||||||
processImage(element, writer, isInForm);
|
|
||||||
break;
|
|
||||||
|
|
||||||
case Element.e_text:
|
|
||||||
processText(element, writer, filledRectangles);
|
|
||||||
break;
|
|
||||||
|
|
||||||
case Element.e_path:
|
|
||||||
processPath(element, writer, filledRectangles);
|
|
||||||
break;
|
|
||||||
|
|
||||||
case Element.e_form:
|
|
||||||
processForm(reader, writer, element, visited);
|
|
||||||
break;
|
|
||||||
|
|
||||||
default:
|
|
||||||
writer.writeElement(element);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
private void processImage(Element element, ElementWriter writer, boolean isInForm) {
|
|
||||||
|
|
||||||
if (!isInForm || !settings.isRemoveWatermark()) {
|
|
||||||
writer.writeElement(element);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
private void processText(Element element, ElementWriter writer, Set<Rect> filledRectangles) {
|
|
||||||
|
|
||||||
if (element.getBBox() == null) {
|
|
||||||
writer.writeElement(element);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
double x = element.getBBox().getX1();
|
|
||||||
double y = element.getBBox().getY1();
|
|
||||||
boolean filledRectangleIntersection = filledRectangles.stream().anyMatch(r -> {
|
|
||||||
try {
|
|
||||||
return r.contains(x, y);
|
|
||||||
} catch (PDFNetException e) {
|
|
||||||
throw new RuntimeException("Internal pdftron error during removal of overlap text", e);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
var gState = element.getGState();
|
|
||||||
|
|
||||||
//See PDF Reference 5.3 Text rendering modes, 3 = Invisible, however this ocr does not use it.
|
|
||||||
if (!filledRectangleIntersection && gState.getTextRenderMode() != 3 || filledRectangleIntersection && gState.getTextRenderMode() == 0) {
|
|
||||||
writer.writeElement(element);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
private void processPath(Element element, ElementWriter writer, Set<Rect> filledRectangles) {
|
|
||||||
|
|
||||||
writer.writeElement(element);
|
|
||||||
if (element.getPathData() != null && element.getPathData().getPoints().length > 4) {
|
|
||||||
filledRectangles.add(element.getBBox());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
private void processForm(ElementReader reader, ElementWriter writer, Element element, Set<Integer> visited) {
|
|
||||||
|
|
||||||
writer.writeElement(element);
|
|
||||||
Obj formObj = element.getXObject();
|
|
||||||
|
|
||||||
if (!visited.contains((int) formObj.getObjNum())) {
|
|
||||||
visited.add((int) formObj.getObjNum());
|
|
||||||
ElementWriter new_writer = new ElementWriter();
|
|
||||||
reader.formBegin();
|
|
||||||
new_writer.begin(formObj);
|
|
||||||
|
|
||||||
reader.clearChangeList();
|
|
||||||
new_writer.setDefaultGState(reader);
|
|
||||||
|
|
||||||
processElements(reader, new_writer, visited, true);
|
|
||||||
new_writer.end();
|
|
||||||
reader.end();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -0,0 +1,64 @@
|
|||||||
|
package com.iqser.red.service.ocr.v1.server;
|
||||||
|
|
||||||
|
import com.iqser.red.service.ocr.v1.server.service.InvisibleElementService;
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
import org.apache.commons.io.FileUtils;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.junit.jupiter.api.extension.ExtendWith;
|
||||||
|
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||||
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
|
import org.springframework.boot.test.context.SpringBootTest;
|
||||||
|
import org.springframework.boot.test.mock.mockito.MockBean;
|
||||||
|
import org.springframework.context.annotation.Import;
|
||||||
|
import org.springframework.core.io.ClassPathResource;
|
||||||
|
import org.springframework.test.context.junit.jupiter.SpringExtension;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
|
||||||
|
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
|
||||||
|
|
||||||
|
@ExtendWith(SpringExtension.class)
|
||||||
|
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT //
|
||||||
|
, properties = {"pdftron.ocrmodule.path=/YourOCRModulePath"})
|
||||||
|
@Import(OcrServiceIntegrationTest.TestConfiguration.class)
|
||||||
|
public class InvisibleElementServiceTest {
|
||||||
|
|
||||||
|
@Autowired
|
||||||
|
private InvisibleElementService invisibleElementService;
|
||||||
|
|
||||||
|
@MockBean
|
||||||
|
protected RabbitTemplate rabbitTemplate;
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@SneakyThrows
|
||||||
|
public void testRemoveInvisibleText() {
|
||||||
|
String fileName = "InvisibleText";
|
||||||
|
|
||||||
|
ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf");
|
||||||
|
|
||||||
|
var initialFileBytes = Files.readAllBytes(pdfFileResource.getFile().toPath());
|
||||||
|
|
||||||
|
var fileWithoutInvisibleText = invisibleElementService.removeInvisibleTextOrPathElements(initialFileBytes, false);
|
||||||
|
var deltaFile = invisibleElementService.removeInvisibleTextOrPathElements(initialFileBytes, true);
|
||||||
|
|
||||||
|
String fileWithoutInvisibleTextLocation = getTemporaryDirectory() + "/" + fileName + ".pdf";
|
||||||
|
String deltaFileLocation = getTemporaryDirectory() + "/" + fileName + "_delta.pdf";
|
||||||
|
|
||||||
|
saveToFile(fileWithoutInvisibleTextLocation, fileWithoutInvisibleText);
|
||||||
|
saveToFile(deltaFileLocation, deltaFile);
|
||||||
|
|
||||||
|
System.out.println("File:" + fileWithoutInvisibleTextLocation);
|
||||||
|
System.out.println("File:" + deltaFileLocation);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void saveToFile(String location, byte[] fileBytes) {
|
||||||
|
try (var f_out = FileUtils.openOutputStream(new File(location))) {
|
||||||
|
f_out.write(fileBytes);
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException("File location: " + location + "could not be openend, no file will be saved");
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -1,10 +1,13 @@
|
|||||||
package com.iqser.red.service.ocr.v1.server;
|
package com.iqser.red.service.ocr.v1.server;
|
||||||
|
|
||||||
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import static org.assertj.core.api.Assertions.assertThat;
|
import com.iqser.red.service.ocr.v1.server.service.FileStorageService;
|
||||||
|
import com.iqser.red.service.ocr.v1.server.service.OCRService;
|
||||||
import java.io.File;
|
import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService;
|
||||||
|
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType;
|
||||||
|
import com.iqser.red.storage.commons.StorageAutoConfiguration;
|
||||||
|
import com.iqser.red.storage.commons.service.StorageService;
|
||||||
|
import lombok.SneakyThrows;
|
||||||
import org.apache.commons.io.FileUtils;
|
import org.apache.commons.io.FileUtils;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.junit.jupiter.api.AfterEach;
|
import org.junit.jupiter.api.AfterEach;
|
||||||
@ -24,15 +27,10 @@ import org.springframework.context.annotation.Primary;
|
|||||||
import org.springframework.core.io.ClassPathResource;
|
import org.springframework.core.io.ClassPathResource;
|
||||||
import org.springframework.test.context.junit.jupiter.SpringExtension;
|
import org.springframework.test.context.junit.jupiter.SpringExtension;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import java.io.File;
|
||||||
import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService;
|
|
||||||
import com.iqser.red.service.ocr.v1.server.service.OCRService;
|
|
||||||
import com.iqser.red.service.ocr.v1.server.service.FileStorageService;
|
|
||||||
import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType;
|
|
||||||
import com.iqser.red.storage.commons.StorageAutoConfiguration;
|
|
||||||
import com.iqser.red.storage.commons.service.StorageService;
|
|
||||||
|
|
||||||
import lombok.SneakyThrows;
|
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
|
||||||
|
import static org.assertj.core.api.Assertions.assertThat;
|
||||||
|
|
||||||
@ExtendWith(SpringExtension.class)
|
@ExtendWith(SpringExtension.class)
|
||||||
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT //
|
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT //
|
||||||
@ -81,6 +79,32 @@ public class OcrServiceIntegrationTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@Disabled // OCRModule is not available on build server. If you want to run the test set the property at the top.
|
||||||
|
@SneakyThrows
|
||||||
|
public void testRemoveInvisibleText() {
|
||||||
|
|
||||||
|
String fileName = "ocr/OCR Docs/MK244 - Fitness of Analytical Method - Physical-Chemical Pro";
|
||||||
|
// String fileName = "InvisiblePathElements";
|
||||||
|
|
||||||
|
ClassPathResource imageInfoResource = new ClassPathResource("files/InvisibleText.IMAGE_INFO.json");
|
||||||
|
ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf");
|
||||||
|
|
||||||
|
var originId = FileStorageService.getStorageId("dossier", "file", FileType.ORIGIN);
|
||||||
|
storageService.storeObject(originId, pdfFileResource.getInputStream());
|
||||||
|
|
||||||
|
var imageId = FileStorageService.getStorageId("dossier", "file", FileType.IMAGE_INFO);
|
||||||
|
storageService.storeObject(imageId, imageInfoResource.getInputStream());
|
||||||
|
|
||||||
|
var response = ocrService.ocrDocument("dossier", "file");
|
||||||
|
|
||||||
|
var out = FileUtils.openOutputStream(new File(getTemporaryDirectory() + "/" + fileName + ".pdf"));
|
||||||
|
IOUtils.copy(response, out);
|
||||||
|
|
||||||
|
System.out.println("File:" + getTemporaryDirectory() + "/" + fileName + ".pdf");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void dummyTest() {
|
public void dummyTest() {
|
||||||
|
|
||||||
|
|||||||
@ -0,0 +1 @@
|
|||||||
|
{"dossierId": "55547c91-6b0e-4aa6-9009-2e7c4cd90f13", "fileId": "917b9d9c9f548f85fef3679db45ff46c", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": []}
|
||||||
Binary file not shown.
Binary file not shown.
@ -0,0 +1 @@
|
|||||||
|
{"dossierId": "55547c91-6b0e-4aa6-9009-2e7c4cd90f13", "fileId": "917b9d9c9f548f85fef3679db45ff46c", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": []}
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1 @@
|
|||||||
|
{"dossierId": "55547c91-6b0e-4aa6-9009-2e7c4cd90f13", "fileId": "917b9d9c9f548f85fef3679db45ff46c", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": []}
|
||||||
Binary file not shown.
@ -0,0 +1 @@
|
|||||||
|
{"dossierId": "55547c91-6b0e-4aa6-9009-2e7c4cd90f13", "fileId": "917b9d9c9f548f85fef3679db45ff46c", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": []}
|
||||||
Binary file not shown.
Binary file not shown.
@ -0,0 +1 @@
|
|||||||
|
{"dossierId": "55547c91-6b0e-4aa6-9009-2e7c4cd90f13", "fileId": "917b9d9c9f548f85fef3679db45ff46c", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": []}
|
||||||
Binary file not shown.
@ -0,0 +1 @@
|
|||||||
|
{"dossierId": "55547c91-6b0e-4aa6-9009-2e7c4cd90f13", "fileId": "917b9d9c9f548f85fef3679db45ff46c", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": []}
|
||||||
Binary file not shown.
Binary file not shown.
@ -0,0 +1,235 @@
|
|||||||
|
|
||||||
|
0 TL
|
||||||
|
q
|
||||||
|
q
|
||||||
|
150 0 m
|
||||||
|
400 0 l
|
||||||
|
400 300 l
|
||||||
|
150 300 l
|
||||||
|
h
|
||||||
|
58 50 m
|
||||||
|
58 200 l
|
||||||
|
570 200 l
|
||||||
|
570 50 l
|
||||||
|
F
|
||||||
|
W
|
||||||
|
n
|
||||||
|
BT
|
||||||
|
-0.011 Tc
|
||||||
|
0 Tw
|
||||||
|
100 Tz
|
||||||
|
/C2_0 12 Tf
|
||||||
|
0 Tr
|
||||||
|
0 Ts
|
||||||
|
485.52 66.48 Td
|
||||||
|
(\0003\000D\000J\000H\000\003) Tj
|
||||||
|
0 Tc
|
||||||
|
26.4 0 Td
|
||||||
|
(\000\024) Tj
|
||||||
|
0.24 Tc
|
||||||
|
8.88 0 Td
|
||||||
|
(\000R\000I) Tj
|
||||||
|
0 Tc
|
||||||
|
9.84 0 Td
|
||||||
|
[(\000\003)-10(\000\030)] TJ
|
||||||
|
0.078 Tc
|
||||||
|
-440.88 -27.6 Td
|
||||||
|
(\0006\000W\000X\000G\000\\) Tj
|
||||||
|
-0.007 Tc
|
||||||
|
30.96 0 Td
|
||||||
|
(\0001\000X\000P\000E\000H\000U\000\035\000\003\000\003) Tj
|
||||||
|
-0.014 Tc
|
||||||
|
48.72 0 Td
|
||||||
|
(\0007\0000\0005\000\023\000\024\000\027\000\025) Tj
|
||||||
|
ET
|
||||||
|
q
|
||||||
|
154.32 0 0 76.32 385.68 643.68 cm
|
||||||
|
/Im0 Do
|
||||||
|
Q
|
||||||
|
BT
|
||||||
|
0.013 Tc
|
||||||
|
/C2_1 12 Tf
|
||||||
|
288.96 601.44 Td
|
||||||
|
(\0006\000X\000E\000V\000W\000D\000Q\000F\000H) Tj
|
||||||
|
0 Tc
|
||||||
|
-168.24 -27.6 Td
|
||||||
|
[(\0006\000<\0001\000\030\000\027\000\031\000\026\000\026\000\023\000\003)-10(\000\261)] TJ
|
||||||
|
ET
|
||||||
|
q
|
||||||
|
192.96 584.88 317.52 -13.92 re
|
||||||
|
W
|
||||||
|
n
|
||||||
|
BT
|
||||||
|
-0.006 Tc
|
||||||
|
192.96 573.84 Td
|
||||||
|
(\0007\000R\000[\000L\000F\000L\000W\000\\\000\003\0006\000W\000X\000G\000\\\000\003\000E\000\\\000\003\000'\000H\000U\000P\000D\000O\000\003\000$\000G\000P\000L\000Q\000L\000V\000W\000U\000D\000W\000L\000R\000Q\000\003\000W\000R\000\003\000+\000D\000Q\000\003\000:\000L\000V\000W\000D\000U\000\003\0005\000D\000W\000V\000\003) Tj
|
||||||
|
ET
|
||||||
|
Q
|
||||||
|
BT
|
||||||
|
-0.024 Tc
|
||||||
|
284.4 560.16 Td
|
||||||
|
(\000I\000R\000U\000\003\000\027\000\003\000:\000H\000H\000N\000V) Tj
|
||||||
|
-0.011 Tc
|
||||||
|
-14.4 -27.6 Td
|
||||||
|
(\0003\000D\000W\000K\000R\000O\000R\000J\000\\\000\003\0005\000H\000S\000R\000U\000W) Tj
|
||||||
|
ET
|
||||||
|
117.12 630 0.72 -3.6 re
|
||||||
|
f*
|
||||||
|
117.12 630 2.16 -0.72 re
|
||||||
|
f*
|
||||||
|
118.56 628.56 0.72 -2.16 re
|
||||||
|
f*
|
||||||
|
118.56 628.56 0.72 -0.72 re
|
||||||
|
f*
|
||||||
|
119.28 630 390.96 -0.72 re
|
||||||
|
f*
|
||||||
|
119.28 628.56 390.96 -0.72 re
|
||||||
|
f*
|
||||||
|
511.68 630 0.72 -3.6 re
|
||||||
|
f*
|
||||||
|
510.24 630 2.16 -0.72 re
|
||||||
|
f*
|
||||||
|
510.24 628.56 0.72 -2.16 re
|
||||||
|
f*
|
||||||
|
510.24 628.56 0.72 -0.72 re
|
||||||
|
f*
|
||||||
|
118.56 626.4 0.72 -111.84 re
|
||||||
|
f*
|
||||||
|
117.12 626.4 0.72 -111.84 re
|
||||||
|
f*
|
||||||
|
117.12 514.56 0.72 -2.16 re
|
||||||
|
f*
|
||||||
|
117.12 513.12 2.16 -0.72 re
|
||||||
|
f*
|
||||||
|
118.56 514.56 0.72 -0.72 re
|
||||||
|
f*
|
||||||
|
118.56 514.56 0.72 -0.72 re
|
||||||
|
f*
|
||||||
|
119.28 513.12 390.96 -0.72 re
|
||||||
|
f*
|
||||||
|
119.28 514.56 390.96 -0.72 re
|
||||||
|
f*
|
||||||
|
511.68 626.4 0.72 -111.84 re
|
||||||
|
f*
|
||||||
|
510.24 626.4 0.72 -111.84 re
|
||||||
|
f*
|
||||||
|
511.68 514.56 0.72 -2.16 re
|
||||||
|
f*
|
||||||
|
510.24 513.12 2.16 -0.72 re
|
||||||
|
f*
|
||||||
|
510.24 514.56 0.72 -0.72 re
|
||||||
|
f*
|
||||||
|
510.24 514.56 0.72 -0.72 re
|
||||||
|
f*
|
||||||
|
BT
|
||||||
|
0.011 Tc
|
||||||
|
89.76 266.64 Td
|
||||||
|
(\000$\0008\0007\000+\0002\0005\000\013\0006\000\f\000\035) Tj
|
||||||
|
-0.012 Tc
|
||||||
|
/C2_0 12 Tf
|
||||||
|
184.8 0.24 Td
|
||||||
|
(\0000\000L\000F\000K\000H\000O\000D\000\003\000*\000U\000H\000J\000R\000U\000L\000\003) Tj
|
||||||
|
-0.037 Tc
|
||||||
|
82.56 0 Td
|
||||||
|
(\000'\0009\0000\000\003\0003\000K\000'\000\003) Tj
|
||||||
|
0.101 Tc
|
||||||
|
55.2 0 Td
|
||||||
|
(\0003\000D\000W\000K\000R\000O) Tj
|
||||||
|
-0.02 Tc
|
||||||
|
30.72 0 Td
|
||||||
|
(\000R\000J\000L\000V\000W) Tj
|
||||||
|
-0.01 Tc
|
||||||
|
/C2_1 12 Tf
|
||||||
|
-353.28 -27.84 Td
|
||||||
|
(\000&\0002\0000\0003\000/\000\(\0007\000,\0002\0001\000\003\000'\000$\0007\000\(\000\035) Tj
|
||||||
|
-0.009 Tc
|
||||||
|
/C2_0 12 Tf
|
||||||
|
184.8 0.24 Td
|
||||||
|
(\000\023\000\030\000\003\0000\000D\000U\000F\000K\000\003\000\025\000\023\000\024\000\033) Tj
|
||||||
|
/C2_1 12 Tf
|
||||||
|
-184.8 -27.84 Td
|
||||||
|
(\000/\000$\000%\0002\0005\000$\0007\0002\0005\000<\000\003\0003\0005\0002\000-\000\(\000&\0007\000\003\000,) Tj
|
||||||
|
-0.026 Tc
|
||||||
|
152.4 0 Td
|
||||||
|
(\000'\000\035) Tj
|
||||||
|
0.009 Tc
|
||||||
|
/C2_0 12 Tf
|
||||||
|
32.4 0.24 Td
|
||||||
|
(\0005\000H\000S\000R\000U\000W\000\003\0001\000X\000P\000E) Tj
|
||||||
|
0.021 Tc
|
||||||
|
65.52 0 Td
|
||||||
|
(\000H\000U\000\035\000\003) Tj
|
||||||
|
-0.014 Tc
|
||||||
|
15.84 0 Td
|
||||||
|
(\0007\0000\0005\000\023\000\024\000\027\000\025) Tj
|
||||||
|
0.078 Tc
|
||||||
|
-81.36 -13.68 Td
|
||||||
|
(\0006\000W\000X\000G\000\\) Tj
|
||||||
|
-0.011 Tc
|
||||||
|
27.84 0 Td
|
||||||
|
(\000\003\0001\000X\000P\000E\000H\000U\000\035) Tj
|
||||||
|
-0.014 Tc
|
||||||
|
48.72 0 Td
|
||||||
|
(\0007\0000\0005\000\023\000\024\000\027\000\025) Tj
|
||||||
|
-0.008 Tc
|
||||||
|
-76.56 -13.92 Td
|
||||||
|
(\0007\000D\000V\000N\000\003\0001\000X\000P\000E\000H\000U\000\035\000\003) Tj
|
||||||
|
-0.019 Tc
|
||||||
|
72 0 Td
|
||||||
|
(\0007\000.\000\023\000\025\000\024\000\023\000\024\000\031\000\033) Tj
|
||||||
|
0 Tc
|
||||||
|
/C2_1 12 Tf
|
||||||
|
-256.8 -27.84 Td
|
||||||
|
[(\0009\0002\000/\0008\0000\000\(\000\003)-10(\000\024)] TJ
|
||||||
|
-0.052 Tc
|
||||||
|
66.24 0 Td
|
||||||
|
(\0002\000\)\000\003) Tj
|
||||||
|
0 Tc
|
||||||
|
19.68 0 Td
|
||||||
|
(\000\024) Tj
|
||||||
|
-0.01 Tc
|
||||||
|
8.88 0 Td
|
||||||
|
(\0002\000\)\000\003\0006\0007\0008\000'\000<) Tj
|
||||||
|
ET
|
||||||
|
146.88 155.04 6 -1.2 re
|
||||||
|
f*
|
||||||
|
175.68 155.04 5.76 -1.2 re
|
||||||
|
f*
|
||||||
|
BT
|
||||||
|
-0.053 Tc
|
||||||
|
89.76 142.56 Td
|
||||||
|
(\0003\000$\000*\000\(\000\003) Tj
|
||||||
|
0 Tc
|
||||||
|
36.24 0 Td
|
||||||
|
(\000\024) Tj
|
||||||
|
-0.052 Tc
|
||||||
|
9.12 0 Td
|
||||||
|
(\0002\000\)\000\003) Tj
|
||||||
|
0 Tc
|
||||||
|
19.68 0 Td
|
||||||
|
(\000\030) Tj
|
||||||
|
ET
|
||||||
|
126 141.36 6 -1.2 re
|
||||||
|
f*
|
||||||
|
154.8 141.36 6 -1.2 re
|
||||||
|
f*
|
||||||
|
/Artifact <</Subtype/Watermark/Type/Pagination>> BDC
|
||||||
|
q
|
||||||
|
/G0 gs
|
||||||
|
0.940613 0 0 0.940613 26.0628 0 cm
|
||||||
|
0 0 0 RG
|
||||||
|
0 w
|
||||||
|
/Fm0 Do
|
||||||
|
Q
|
||||||
|
EMC
|
||||||
|
/Artifact <</Contents( Page 250 of 256)/Subtype/Header/Type/Pagination>> BDC
|
||||||
|
q
|
||||||
|
/G0 gs
|
||||||
|
1 0 0 1 458.67 48.412 cm
|
||||||
|
0 0 0 RG
|
||||||
|
0 w
|
||||||
|
/Fm1 Do
|
||||||
|
Q
|
||||||
|
EMC
|
||||||
|
Q
|
||||||
|
Q
|
||||||
Binary file not shown.
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user