diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ClippingPathStack.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ClippingPathStack.java
index 1c3b1cd..04a30dc 100644
--- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ClippingPathStack.java
+++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ClippingPathStack.java
@@ -1,58 +1,53 @@
package com.iqser.red.service.ocr.v1.server.model;
-import java.awt.geom.AffineTransform;
-import java.awt.geom.Area;
-import java.awt.geom.GeneralPath;
-import java.util.ArrayDeque;
-import java.util.Deque;
-
-import com.pdftron.common.Matrix2D;
import com.pdftron.pdf.Rect;
-
import lombok.Data;
import lombok.SneakyThrows;
+import java.awt.geom.Area;
+import java.awt.geom.GeneralPath;
+import java.awt.geom.Rectangle2D;
+import java.util.ArrayDeque;
+import java.util.Deque;
+
@Data
public class ClippingPathStack {
private Deque stack = new ArrayDeque<>();
+ /**
+  * Creates a stack whose bottom entry is the area of the given rectangle.
+  * The callers visible in this change pass the page's media box.
+  *
+  * @param rectangle initial clipping rectangle; its two corners may be given in any order
+  */
@SneakyThrows
- public ClippingPathStack(Rect rectangle)
- {
- GeneralPath path = new GeneralPath();
- path.moveTo(rectangle.getX1(), rectangle.getY1());
- path.lineTo(rectangle.getX2(), rectangle.getY1());
- path.lineTo(rectangle.getX2(), rectangle.getY2());
- path.lineTo(rectangle.getX1(), rectangle.getY2());
- path.closePath();
- stack.push(new Area(path));
+ public ClippingPathStack(Rect rectangle) {
+     // Build the rectangle from its two corners: setFrameFromDiagonal orders them itself, so the
+     // initial clip does not depend on getWidth()/getHeight() being non-negative. A rectangle with a
+     // negative extent would produce an empty Area and every element would be treated as clipped.
+     Rectangle2D initialClip = new Rectangle2D.Double();
+     initialClip.setFrameFromDiagonal(rectangle.getX1(), rectangle.getY1(), rectangle.getX2(), rectangle.getY2());
+     stack.push(new Area(initialClip));
}
+ /**
+  * Narrows the current clipping area to its intersection with the given path.
+  * The area on top of the stack is modified in place; entries below it are not touched.
+  *
+  * @param path clipping path to intersect with; it is used as given, without any transformation
+  */
@SneakyThrows
- public void intersectClippingPath(GeneralPath path, Matrix2D ctm){
- var affineTransform = new AffineTransform(ctm.getA(), ctm.getB(), ctm.getC(), ctm.getD(), ctm.getH(), ctm.getV());
- path.transform(affineTransform);
-// var area = getCurrentClippingPath();
-// area.transform(affineTransform);
+ public void intersectClippingPath(GeneralPath path) {
getCurrentClippingPath().intersect(new Area(path));
}
+ /**
+  * Checks whether the given rectangle, grown by a small tolerance on every side, intersects the
+  * current clipping area.
+  *
+  * @param x      x coordinate of the rectangle's origin
+  * @param y      y coordinate of the rectangle's origin
+  * @param width  width of the rectangle; may be zero
+  * @param height height of the rectangle; may be zero
+  * @return true if the enlarged rectangle intersects the current clipping area
+  */
+ public boolean almostIntersects(double x, double y, double width, double height) {
+     // Height or width are zero for straight lines, even though they are being rendered. Therefore, height or width must be at minimum >0.
+     double tolerance = 1e-3;
+     // Growing a rectangle always moves its origin towards smaller values, whatever the sign of the
+     // coordinate; adding the tolerance for coordinates <= 0 would shrink it on that side instead.
+     double xWithTolerance = x - tolerance;
+     double yWithTolerance = y - tolerance;
+     double widthWithTolerance = width + 2 * tolerance;
+     double heightWithTolerance = height + 2 * tolerance;
+     return getCurrentClippingPath().intersects(xWithTolerance, yWithTolerance, widthWithTolerance, heightWithTolerance);
+ }
+ /**
+  * Returns the clipping area on top of the stack without removing it.
+  * NOTE(review): Deque#peek returns null for an empty stack - confirm that the initial area is never popped.
+  *
+  * @return the area of the innermost graphics state that is currently open
+  */
public Area getCurrentClippingPath() {
-
return stack.peek();
}
+ /**
+  * Pushes a copy of the current clipping area onto the stack, so that later intersections only
+  * affect the new entry and the previous area is available again after {@code leaveGState()}.
+  */
- public void enterNewGState(){
+ public void enterNewGState() {
Area current = stack.peek();
Area cloned = new Area();
cloned.add(current);
stack.push(cloned);
}
+ /**
+  * Discards the clipping area on top of the stack, making the previously pushed area current again.
+  * NOTE(review): Deque#pop throws NoSuchElementException on an empty stack - confirm calls are balanced.
+  */
- public void leaveGState(){
+ public void leaveGState() {
stack.pop();
}
diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ElementFeatures.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ElementFeatures.java
new file mode 100644
index 0000000..f150973
--- /dev/null
+++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ElementFeatures.java
@@ -0,0 +1,149 @@
+package com.iqser.red.service.ocr.v1.server.model;
+
+import com.pdftron.common.PDFNetException;
+import com.pdftron.pdf.Element;
+import com.pdftron.pdf.Rect;
+import lombok.*;
+import lombok.experimental.SuperBuilder;
+
+import java.awt.geom.Rectangle2D;
+
+/**
+ * Snapshot of the properties of a PDFTron {@link Element} that are used to recognise the same
+ * element again when the page content is read a second time. Numeric values are compared with a
+ * small tolerance, see {@link #almostEqual(double, double)}.
+ */
+@Data
+@SuperBuilder
+@NoArgsConstructor
+@AllArgsConstructor
+public abstract class ElementFeatures {
+
+    /** Largest absolute difference at which two coordinates or sizes still count as equal. */
+    private static final double TOLERANCE = 1e-3;
+
+    private int elementType;
+    private Rectangle2D boundingBox;
+
+    /**
+     * Checks whether the given element has the same type and, within the tolerance, the same
+     * bounding box as the element these features were extracted from.
+     *
+     * @param element element to compare with
+     * @return true if type and bounding box match; false if they differ or a bounding box is missing
+     * @throws PDFNetException if PDFTron fails to read the element
+     */
+    public boolean almostMatches(Element element) throws PDFNetException {
+        if (element.getType() != elementType) {
+            return false;
+        }
+        Rect bBox = element.getBBox();
+        // An instance created through the no-args constructor has no bounding box to compare against.
+        if (bBox == null || boundingBox == null) {
+            return false;
+        }
+        return rectsAlmostMatch(bBox);
+    }
+
+    /**
+     * @return true if the two values differ by less than the tolerance
+     */
+    protected boolean almostEqual(double a, double b) {
+        return Math.abs(a - b) < TOLERANCE;
+    }
+
+    /** Compares origin, width and height of the given rectangle with the stored bounding box. */
+    private boolean rectsAlmostMatch(Rect bBox) throws PDFNetException {
+        return almostEqual(bBox.getX1(), boundingBox.getX())
+                && almostEqual(bBox.getY1(), boundingBox.getY())
+                && almostEqual(bBox.getWidth(), boundingBox.getWidth())
+                && almostEqual(bBox.getHeight(), boundingBox.getHeight());
+    }
+
+    /** Features of a text element: its string, the type of its font and its font size. */
+    @EqualsAndHashCode(callSuper = true)
+    @Data
+    @SuperBuilder
+    @NoArgsConstructor
+    @AllArgsConstructor
+    public static class Text extends ElementFeatures {
+        private String text;
+        private int font;
+        private double fontsize;
+
+        @Override
+        public boolean almostMatches(Element element) throws PDFNetException {
+            if (!super.almostMatches(element)) {
+                return false;
+            }
+            // Null-safe comparison: text is null for an instance built without a text value.
+            if (!java.util.Objects.equals(text, element.getTextString())) {
+                return false;
+            }
+            if (font != element.getGState().getFont().getType()) {
+                return false;
+            }
+            return almostEqual(fontsize, element.getGState().getFontSize());
+        }
+
+    }
+
+    /** Features of a path element: its clipping, stroke and fill flags. */
+    @EqualsAndHashCode(callSuper = true)
+    @Data
+    @SuperBuilder
+    @NoArgsConstructor
+    @AllArgsConstructor
+    public static class Path extends ElementFeatures {
+        private boolean isClippingPath;
+        private boolean isClipWindingFill;
+        private boolean isStroked;
+        private boolean isFilled;
+        private boolean isWindingFill;
+
+        @Override
+        public boolean almostMatches(Element element) throws PDFNetException {
+            return super.almostMatches(element)
+                    && isClippingPath == element.isClippingPath()
+                    && isClipWindingFill == element.isClipWindingFill()
+                    && isStroked == element.isStroked()
+                    && isFilled == element.isFilled()
+                    && isWindingFill == element.isWindingFill();
+        }
+    }
+
+    /** Features of an image or inline image element: data size, dimensions and colour layout. */
+    @EqualsAndHashCode(callSuper = true)
+    @Data
+    @SuperBuilder
+    @NoArgsConstructor
+    public static class Image extends ElementFeatures {
+        private int dataSize;
+        private int height;
+        private int width;
+        private int renderingIntent;
+        private int componentNum;
+        private int bitsPerComponent;
+
+        @Override
+        public boolean almostMatches(Element element) throws PDFNetException {
+            return super.almostMatches(element)
+                    && dataSize == element.getImageDataSize()
+                    && height == element.getImageHeight()
+                    && width == element.getImageWidth()
+                    && renderingIntent == element.getImageRenderingIntent()
+                    && componentNum == element.getComponentNum()
+                    && bitsPerComponent == element.getBitsPerComponent();
+        }
+    }
+
+    /**
+     * Extracts the features of a path, text, image or inline image element.
+     *
+     * @param element element to describe; its bounding box must not be null
+     * @return the features matching the element's type
+     * @throws PDFNetException               if PDFTron fails to read the element
+     * @throws UnsupportedOperationException for any other element type
+     */
+    public static ElementFeatures extractFeatures(Element element) throws PDFNetException {
+        int type = element.getType();
+        switch (type) {
+            case Element.e_path:
+                return ElementFeatures.Path.builder()
+                        .elementType(type)
+                        .boundingBox(toRectangle2D(element.getBBox()))
+                        .isClippingPath(element.isClippingPath())
+                        .isClipWindingFill(element.isClipWindingFill())
+                        .isStroked(element.isStroked())
+                        .isFilled(element.isFilled())
+                        .isWindingFill(element.isWindingFill())
+                        .build();
+            case Element.e_text:
+                return ElementFeatures.Text.builder()
+                        .elementType(type)
+                        .boundingBox(toRectangle2D(element.getBBox()))
+                        .text(element.getTextString())
+                        .font(element.getGState().getFont().getType())
+                        .fontsize(element.getGState().getFontSize())
+                        .build();
+            case Element.e_image:
+            case Element.e_inline_image:
+                return Image.builder()
+                        .elementType(type)
+                        .boundingBox(toRectangle2D(element.getBBox()))
+                        .dataSize(element.getImageDataSize())
+                        .height(element.getImageHeight())
+                        .width(element.getImageWidth())
+                        .renderingIntent(element.getImageRenderingIntent())
+                        .componentNum(element.getComponentNum())
+                        .bitsPerComponent(element.getBitsPerComponent())
+                        .build();
+            default:
+                throw new UnsupportedOperationException("Feature Extraction is not supported for PDFTron.Element with type: " + type);
+        }
+    }
+
+    /** Converts a PDFTron rectangle into an AWT rectangle with the same origin, width and height. */
+    private static Rectangle2D toRectangle2D(Rect rect) throws PDFNetException {
+        return new Rectangle2D.Double(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
+    }
+}
+
diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementService.java
new file mode 100644
index 0000000..7b9ff3d
--- /dev/null
+++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementService.java
@@ -0,0 +1,463 @@
+package com.iqser.red.service.ocr.v1.server.service;
+
+import com.iqser.red.service.ocr.v1.server.model.ClippingPathStack;
+import com.iqser.red.service.ocr.v1.server.model.ElementFeatures;
+import com.pdftron.common.Matrix2D;
+import com.pdftron.common.PDFNetException;
+import com.pdftron.pdf.*;
+import com.pdftron.sdf.Obj;
+import com.pdftron.sdf.SDFDoc;
+import lombok.SneakyThrows;
+import lombok.extern.slf4j.Slf4j;
+import org.apache.commons.io.FileUtils;
+import org.springframework.stereotype.Service;
+
+import java.awt.*;
+import java.awt.geom.AffineTransform;
+import java.awt.geom.GeneralPath;
+import java.awt.geom.Rectangle2D;
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.stream.Collectors;
+
+@Slf4j
+@Service
+public class InvisibleElementService {
+
+    /**
+     * Removes elements that cannot be seen on the rendered pages and returns the resulting document.
+     * Every page is read twice: the first pass drops clipped elements and invisible text and
+     * remembers elements that are painted over, the second pass removes those remembered elements.
+     *
+     * <p>Handled cases:
+     * <ul>
+     * <li>Text or Path outside of clipping path (images are checked against the clipping path as well)</li>
+     * <li>Text which is transparent or is set to not render</li>
+     * <li>Text or Path that have been painted over by visible and filled Paths</li>
+     * </ul>
+     * Unhandled cases:
+     * <ul>
+     * <li>Text covered by widely stroked path</li>
+     * <li>Text same color as background</li>
+     * <li>Any Text set to clipping with its many interactions with other elements</li>
+     * </ul>
+     *
+     * @param pdfFile the PDF document to clean
+     * @param delta   debug switch: if true, the elements that would be removed are written (recoloured)
+     *                instead of the visible ones, and a copy of the result is stored via {@code debugSave}
+     * @return the processed document, saved linearized
+     */
+    @SneakyThrows
+    public byte[] removeInvisibleTextOrPathElements(byte[] pdfFile, boolean delta) {
+        PDFDoc pdfDoc = new PDFDoc(pdfFile);
+        try {
+            ElementWriter writer = new ElementWriter();
+            ElementReader reader = new ElementReader();
+            Set<Integer> visited = new TreeSet<>();
+
+            for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
+                Page page = iterator.next();
+                List<ElementFeatures> overlappedElements = removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, reader, writer, visited, delta);
+                visited.clear();
+                removeOverlappedElements(page, reader, writer, visited, overlappedElements, delta);
+            }
+
+            // Serialize once and reuse the bytes for the debug copy.
+            byte[] result = pdfDoc.save(SDFDoc.SaveMode.LINEARIZED, null);
+            if (delta) {
+                debugSave(result);
+            }
+            return result;
+        } finally {
+            // Release the document even if processing a page fails.
+            pdfDoc.close();
+        }
+    }
+
+    /**
+     * First pass over a page: rewrites the page content without clipped elements and invisible text
+     * and collects the features of elements that are completely covered by a later filled path.
+     *
+     * @param page    page to rewrite in place
+     * @param reader  reader used to walk the page content
+     * @param writer  writer that replaces the page content
+     * @param visited object numbers that have already been processed; the page's number is added
+     * @param delta   debug switch, see {@code removeInvisibleTextOrPathElements}
+     * @return features of the overlapped elements found on this page
+     */
+    private List<ElementFeatures> removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(Page page, ElementReader reader, ElementWriter writer, Set<Integer> visited, boolean delta)
+            throws PDFNetException {
+        List<ElementFeatures> overlappedElements = new ArrayList<>();
+        List<ElementFeatures> visibleElements = new ArrayList<>();
+        // The media box is the outermost clip; everything outside of it is treated as invisible.
+        ClippingPathStack clippingPathStack = new ClippingPathStack(page.getMediaBox());
+        visited.add((int) page.getSDFObj().getObjNum());
+        reader.begin(page);
+        writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
+        processElements(reader, writer, visited, clippingPathStack, delta, overlappedElements, visibleElements);
+        writer.end();
+        reader.end();
+        return overlappedElements;
+    }
+
+    /**
+     * Reads all remaining elements from the reader and dispatches them by type. Images, text, paths
+     * and forms are handled by their own methods, group begin/end elements save and restore the
+     * clipping state, and every other element is copied unchanged.
+     *
+     * @param coveredElements receives the features of elements that are painted over
+     * @param visibleElements features of the elements that are visible so far
+     */
+    private void processElements(ElementReader reader, ElementWriter writer, Set<Integer> visited, ClippingPathStack clippingPathStack, Boolean delta,
+                                 List<ElementFeatures> coveredElements, List<ElementFeatures> visibleElements)
+            throws PDFNetException {
+
+        for (Element element = reader.next(); element != null; element = reader.next()) {
+            switch (element.getType()) {
+
+                case Element.e_image:
+                case Element.e_inline_image:
+                    processImages(element, writer, clippingPathStack, delta, visibleElements);
+                    break;
+
+                case Element.e_text:
+                    processText(element, writer, clippingPathStack, delta, visibleElements);
+                    break;
+
+                case Element.e_path:
+                    processPath(element, writer, clippingPathStack, delta, coveredElements, visibleElements);
+                    break;
+
+                case Element.e_form:
+                    processForm(reader, writer, element, visited, clippingPathStack, delta, coveredElements, visibleElements);
+                    break;
+
+                case Element.e_group_begin:
+                    clippingPathStack.enterNewGState();
+                    writer.writeElement(element);
+                    break;
+
+                case Element.e_group_end:
+                    clippingPathStack.leaveGState();
+                    writer.writeElement(element);
+                    break;
+
+                default:
+                    writer.writeElement(element);
+            }
+        }
+    }
+
+    /**
+     * Keeps an image only if its bounding box (almost) intersects the current clipping path.
+     * In delta mode the decision is inverted and only images outside of the clipping path are written.
+     *
+     * @param visibleElements receives the features of images that are kept in normal mode
+     */
+    private void processImages(Element imageElement, ElementWriter writer, ClippingPathStack clippingPathStack, Boolean delta, List<ElementFeatures> visibleElements)
+            throws PDFNetException {
+
+        Rect rect = imageElement.getBBox();
+
+        if (rect == null) {
+            // Without a bounding box the visibility cannot be decided. Keep the image instead of
+            // silently dropping it; in delta mode nothing is removed, so nothing is reported.
+            if (!delta) {
+                writer.writeElement(imageElement);
+            }
+            return;
+        }
+
+        boolean inClippingPath = clippingPathStack.almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
+
+        if (!delta && inClippingPath) {
+            visibleElements.add(ElementFeatures.extractFeatures(imageElement));
+            writer.writeElement(imageElement);
+        }
+
+        if (delta && !inClippingPath) {
+            writer.writeElement(imageElement);
+        }
+    }
+
+    /**
+     * Keeps a text element only if it lies (almost) inside the current clipping path and is rendered
+     * visibly. In delta mode only the text that would be removed is written: red if it is clipped,
+     * otherwise blue and forced to be visible if it is rendered invisibly.
+     *
+     * @param visibleElements receives the features of text that is visible
+     */
+    private void processText(Element textElement, ElementWriter writer, ClippingPathStack clippingPathStack,
+                             Boolean delta, List<ElementFeatures> visibleElements)
+            throws PDFNetException {
+
+        Rect rect = textElement.getBBox();
+
+        if (rect == null) {
+            writer.writeElement(textElement);
+            return;
+        }
+
+        GState gState = textElement.getGState();
+
+        boolean inClippingPath = clippingPathStack.almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
+
+        boolean isTextVisible = isTextRenderedVisibly(gState);
+
+        if (inClippingPath && isTextVisible) {
+            visibleElements.add(ElementFeatures.extractFeatures(textElement));
+        }
+        if (!delta) {
+            if (inClippingPath && isTextVisible) {
+                writer.writeElement(textElement);
+            } else if (textElement.hasTextMatrix()) {
+                /*
+                PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element.
+                hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands.
+                Therefore, the position of a following Tj is affected by not writing the first Element.
+                This is why we write only the Tm command:
+                 */
+                writer.writeGStateChanges(textElement);
+            }
+        } else if (!inClippingPath) {
+            // Clipped text is marked red. The branches are exclusive so that text which is both
+            // clipped and invisible is written once, not twice.
+            gState.setFillColorSpace(ColorSpace.createDeviceRGB());
+            gState.setFillColor(new ColorPt(1, 0, 0));
+            writer.writeElement(textElement);
+        } else if (!isTextVisible) {
+            // Invisible text is marked blue and made visible so that it shows up in the debug output.
+            gState.setFillColorSpace(ColorSpace.createDeviceRGB());
+            gState.setFillColor(new ColorPt(0, 0, 1));
+            gState.setTextRenderMode(GState.e_fill_text);
+            gState.setFillOpacity(1);
+            writer.writeElement(textElement);
+        }
+    }
+
+    /**
+     * Writes the form element and, if the form XObject has not been processed yet, rewrites the
+     * content of the form with the same rules as the page content.
+     *
+     * @param visited object numbers of the forms that have already been rewritten; each form is processed once
+     */
+    private void processForm(ElementReader reader, ElementWriter writer, Element element, Set<Integer> visited, ClippingPathStack clippingPathStack, Boolean delta,
+                             List<ElementFeatures> coveredElements, List<ElementFeatures> allElements)
+            throws PDFNetException {
+
+        writer.writeElement(element);
+        Obj formObj = element.getXObject();
+
+        int formObjNum = (int) formObj.getObjNum();
+        if (!visited.contains(formObjNum)) {
+            visited.add(formObjNum);
+            ElementWriter formWriter = new ElementWriter();
+            reader.formBegin();
+            formWriter.begin(formObj);
+
+            reader.clearChangeList();
+            formWriter.setDefaultGState(reader);
+
+            // Painting a form XObject saves the graphics state before and restores it after the form
+            // content, so clipping paths set inside the form must not affect elements that follow it.
+            clippingPathStack.enterNewGState();
+            processElements(reader, formWriter, visited, clippingPathStack, delta, coveredElements, allElements);
+            clippingPathStack.leaveGState();
+
+            formWriter.end();
+            reader.end();
+        }
+    }
+
+    /**
+     * Handles a path element. A clipping path narrows the current clipping area and is always
+     * written. Any other path is kept only if it (almost) intersects the clipping area; if it is
+     * filled and opaque, every visible element whose bounding box it (almost) contains is moved from
+     * the visible to the overlapped elements. In delta mode only paths outside of the clipping area
+     * are written, coloured red.
+     *
+     * @param overlappedElements receives the features of elements covered by this path
+     * @param visibleElements    features of the elements that are visible so far; updated in place
+     */
+    private void processPath(Element pathElement, ElementWriter writer, ClippingPathStack clippingPathStack, Boolean delta,
+                             List<ElementFeatures> overlappedElements, List<ElementFeatures> visibleElements)
+            throws PDFNetException {
+
+        GeneralPath linePath = convertToGeneralPath(pathElement.getPathData());
+
+        //transform path to initial user space
+        linePath.transform(getAffineTransform(pathElement.getCTM()));
+
+        Rectangle2D bounds = linePath.getBounds2D();
+        boolean inClippingPath = clippingPathStack.almostIntersects(bounds.getX(), bounds.getY(), bounds.getWidth(), bounds.getHeight());
+
+        if (pathElement.isClippingPath()) {
+            linePath.setWindingRule(pathElement.isClipWindingFill() ? GeneralPath.WIND_NON_ZERO : GeneralPath.WIND_EVEN_ODD);
+            clippingPathStack.intersectClippingPath(linePath);
+            // In delta mode the path no longer clips, so the removed elements stay visible in the debug output.
+            pathElement.setPathClip(!delta);
+            writer.writeElement(pathElement);
+            return;
+        }
+
+        if (inClippingPath) {
+            if (isFilledAndNonTransparent(pathElement)) {
+                List<ElementFeatures> currentOverlappedElements = visibleElements.stream()
+                        .filter(features -> almostContains(linePath, features.getBoundingBox()))
+                        .collect(Collectors.toList());
+                overlappedElements.addAll(currentOverlappedElements);
+                visibleElements.removeAll(currentOverlappedElements);
+            }
+            visibleElements.add(ElementFeatures.extractFeatures(pathElement));
+            if (!delta) {
+                writer.writeElement(pathElement);
+            }
+        } else if (delta) {
+            GState gState = pathElement.getGState();
+            gState.setFillColorSpace(ColorSpace.createDeviceRGB());
+            gState.setFillColor(new ColorPt(1, 0, 0));
+            gState.setStrokeColorSpace(ColorSpace.createDeviceRGB());
+            gState.setStrokeColor(new ColorPt(1, 0, 0));
+            writer.writeElement(pathElement);
+        }
+    }
+
+    /**
+     * Builds the AWT transform with the same six coefficients as the given PDFTron matrix.
+     *
+     * @param ctm matrix whose A, B, C, D, H and V values are copied
+     * @return the corresponding affine transform
+     */
+    private static AffineTransform getAffineTransform(Matrix2D ctm) throws PDFNetException {
+        double m00 = ctm.getA();
+        double m10 = ctm.getB();
+        double m01 = ctm.getC();
+        double m11 = ctm.getD();
+        double m02 = ctm.getH();
+        double m12 = ctm.getV();
+        return new AffineTransform(m00, m10, m01, m11, m02, m12);
+    }
+
+    /**
+     * Second pass over a page: rewrites the page content without the elements that were found to be
+     * overlapped. In delta mode nothing is removed; the bounding boxes of the overlapped elements are
+     * drawn in green instead.
+     *
+     * @param overlappedElements features of the elements to remove; matched entries are removed from the list
+     */
+    private void removeOverlappedElements(Page page, ElementReader reader, ElementWriter writer, Set<Integer> visited, List<ElementFeatures> overlappedElements, boolean delta)
+            throws PDFNetException {
+        reader.begin(page);
+        writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
+        if (delta) {
+            overlappedElements.forEach(feature -> drawBBox(writer, feature.getBoundingBox(), "#00FF00"));
+            overlappedElements.clear();
+        }
+        processOverlappedElements(reader, writer, visited, overlappedElements, delta);
+        writer.end();
+        reader.end();
+
+        // Entries that are still present could not be matched against any element of the page.
+        if (!overlappedElements.isEmpty()) {
+            log.warn("{} overlapped elements have not been found and removed", overlappedElements.size());
+        }
+    }
+
+    /**
+     * Copies all remaining elements from the reader to the writer, except for paths, images and text
+     * that match one of the given features. Every feature removes at most one element.
+     *
+     * @param coveredElements features of the elements to drop; a matched entry is removed from the list
+     */
+    private void processOverlappedElements(ElementReader reader, ElementWriter writer, Set<Integer> visited, List<ElementFeatures> coveredElements, boolean delta)
+            throws PDFNetException {
+        for (Element element = reader.next(); element != null; element = reader.next()) {
+            switch (element.getType()) {
+                case Element.e_form:
+                    processFormOverlappedElements(reader, writer, element, visited, coveredElements, delta);
+                    break;
+                case Element.e_path:
+                case Element.e_image:
+                case Element.e_inline_image:
+                case Element.e_text:
+                    boolean anyMatch = false;
+                    for (int i = 0; i < coveredElements.size(); i++) {
+                        if (coveredElements.get(i).almostMatches(element)) {
+                            coveredElements.remove(i);
+                            anyMatch = true;
+                            break;
+                        }
+                    }
+                    if (!anyMatch) {
+                        writer.writeElement(element);
+                    } else if (element.getType() == Element.e_text && element.hasTextMatrix()) {
+                        /*
+                        PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element.
+                        hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands.
+                        Therefore, the position of a following Tj is affected by not writing the first Element.
+                        This is why we write only the Tm command:
+                         */
+                        writer.writeGStateChanges(element);
+                    }
+                    break;
+
+                default:
+                    writer.writeElement(element);
+
+            }
+        }
+    }
+
+    /**
+     * Writes the form element and, unless the form XObject has been handled before, rewrites the
+     * content of the form without the elements that match one of the given features.
+     */
+    private void processFormOverlappedElements(ElementReader reader, ElementWriter writer, Element element, Set visited, List elementsToRemove, boolean delta)
+            throws PDFNetException {
+
+        writer.writeElement(element);
+        Obj formObj = element.getXObject();
+
+        // add() reports whether the object number was new, which replaces the contains/add pair.
+        int formObjNum = (int) formObj.getObjNum();
+        if (!visited.add(formObjNum)) {
+            return;
+        }
+
+        ElementWriter formWriter = new ElementWriter();
+        reader.formBegin();
+        formWriter.begin(formObj);
+
+        reader.clearChangeList();
+        formWriter.setDefaultGState(reader);
+
+        processOverlappedElements(reader, formWriter, visited, elementsToRemove, delta);
+        formWriter.end();
+        reader.end();
+    }
+
+    /**
+     * Decides from the text render mode and the opacities whether text drawn with the given graphics
+     * state leaves any visible marks. Render modes other than invisible, fill, stroke and
+     * fill-and-stroke are always reported as visible.
+     *
+     * @param gState graphics state of the text element
+     * @return false if the text is set to invisible or every part that is painted has opacity 0
+     */
+    private boolean isTextRenderedVisibly(GState gState) throws PDFNetException {
+        int renderMode = gState.getTextRenderMode();
+        if (renderMode == GState.e_invisible_text) {
+            return false;
+        }
+        if (renderMode == GState.e_fill_text) {
+            return !(gState.getFillOpacity() == 0);
+        }
+        if (renderMode == GState.e_stroke_text) {
+            return !(gState.getStrokeOpacity() == 0);
+        }
+        if (renderMode == GState.e_fill_stroke_text) {
+            return !(gState.getFillOpacity() == 0 && gState.getStrokeOpacity() == 0);
+        }
+        return true;
+    }
+
+    /**
+     * Converts PDFTron path data into an AWT path. The operators are replayed in order, each one
+     * consuming its coordinates from the point array: two values for moveto and lineto, six for
+     * cubicto, four (x, y, width, height) for rect and none for closepath.
+     *
+     * @param pathData operators and points of a path element
+     * @return the equivalent path, in the coordinate system of the path data
+     * @throws PDFNetException if the data contains an operator other than the five listed above
+     */
+    private GeneralPath convertToGeneralPath(PathData pathData) throws PDFNetException {
+        GeneralPath linePath = new GeneralPath();
+
+        double[] dataPoints = pathData.getPoints();
+        byte[] opr = pathData.getOperators();
+
+        int dataIndex = 0;
+        for (byte operator : opr) {
+            switch (operator) {
+                case PathData.e_moveto: {
+                    double x = dataPoints[dataIndex++];
+                    double y = dataPoints[dataIndex++];
+                    linePath.moveTo(x, y);
+                    break;
+                }
+                case PathData.e_lineto: {
+                    double x = dataPoints[dataIndex++];
+                    double y = dataPoints[dataIndex++];
+                    linePath.lineTo(x, y);
+                    break;
+                }
+                case PathData.e_cubicto: {
+                    double x1 = dataPoints[dataIndex++];
+                    double y1 = dataPoints[dataIndex++];
+                    double x2 = dataPoints[dataIndex++];
+                    double y2 = dataPoints[dataIndex++];
+                    double x3 = dataPoints[dataIndex++];
+                    double y3 = dataPoints[dataIndex++];
+                    linePath.curveTo(x1, y1, x2, y2, x3, y3);
+                    break;
+                }
+                case PathData.e_rect: {
+                    double x = dataPoints[dataIndex++];
+                    double y = dataPoints[dataIndex++];
+                    double w = dataPoints[dataIndex++];
+                    double h = dataPoints[dataIndex++];
+                    linePath.moveTo(x, y);
+                    linePath.lineTo(x + w, y);
+                    linePath.lineTo(x + w, y + h);
+                    linePath.lineTo(x, y + h);
+                    // A rectangle is a complete, closed subpath: closing it returns the current
+                    // point to (x, y), so a following lineto does not continue from the last corner.
+                    linePath.closePath();
+                    break;
+                }
+                case PathData.e_closepath:
+                    linePath.closePath();
+                    break;
+                default:
+                    throw new PDFNetException("Invalid Element Type", 0, "", "", "");
+            }
+        }
+        return linePath;
+    }
+
+    /**
+     * Checks whether the given rectangle, shrunk by a small tolerance on every side, lies completely
+     * inside the shape.
+     *
+     * @param outer shape that may cover the rectangle
+     * @param inner rectangle to test
+     * @return true if the shrunk rectangle is contained in the shape; rectangles that are not larger
+     *         than twice the tolerance in both directions shrink to nothing and are never contained
+     */
+    private boolean almostContains(Shape outer, Rectangle2D inner) {
+        double tolerance = 1e-3;
+
+        // Shrinking a rectangle always moves its origin towards larger values, whatever the sign of
+        // the coordinate; subtracting the tolerance for negative coordinates would push the origin
+        // outwards and make the test stricter on that side.
+        double xWithTolerance = inner.getX() + tolerance;
+        double yWithTolerance = inner.getY() + tolerance;
+        double widthWithTolerance = inner.getWidth() - (2 * tolerance);
+        double heightWithTolerance = inner.getHeight() - (2 * tolerance);
+        Rectangle2D innerRect = new Rectangle2D.Double(xWithTolerance, yWithTolerance, widthWithTolerance, heightWithTolerance);
+
+        return outer.contains(innerRect);
+    }
+
+    /**
+     * @return true if the element is filled and its fill opacity is exactly 1
+     */
+    private boolean isFilledAndNonTransparent(Element element) throws PDFNetException {
+        if (!element.isFilled()) {
+            return false;
+        }
+        double fillOpacity = element.getGState().getFillOpacity();
+        return fillOpacity == 1;
+    }
+
+
+    /**
+     * Writes the given document to a fixed location in the temp directory so that the delta output
+     * can be inspected.
+     *
+     * @param pdfFile bytes of the document to store
+     * @throws RuntimeException if the file cannot be opened or written; the I/O error is kept as cause
+     */
+    private void debugSave(byte[] pdfFile) {
+        String fileLocation = "/tmp/delta.pdf";
+        try (var f_out = FileUtils.openOutputStream(new File(fileLocation))) {
+            f_out.write(pdfFile);
+        } catch (IOException e) {
+            // Keep the original exception as cause, otherwise the reason for the failure is lost.
+            throw new RuntimeException("Debug file " + fileLocation + " could not be written", e);
+        }
+    }
+
+    /**
+     * Draws the outline of the given rectangle with the given stroke colour.
+     *
+     * @param writer   writer the rectangle is written to
+     * @param r        rectangle to draw
+     * @param hexcolor colour in the form {@code #RRGGBB}
+     */
+    @SneakyThrows
+    private void drawBBox(ElementWriter writer, Rectangle2D r, String hexcolor) {
+        // Each colour channel is a two digit hex value that is scaled to the range 0..1.
+        double red = Integer.parseInt(hexcolor.substring(1, 3), 16) / 255d;
+        double green = Integer.parseInt(hexcolor.substring(3, 5), 16) / 255d;
+        double blue = Integer.parseInt(hexcolor.substring(5, 7), 16) / 255d;
+        ColorPt strokeColor = new ColorPt(red, green, blue);
+
+        ElementBuilder builder = new ElementBuilder();
+        Element outline = builder.createRect(r.getX(), r.getY(), r.getWidth(), r.getHeight());
+        outline.setPathStroke(true);
+        outline.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
+        outline.getGState().setStrokeColor(strokeColor);
+        writer.writePlacedElement(outline);
+    }
+}
diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java
index 4a3f15f..49d1b42 100644
--- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java
+++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java
@@ -1,63 +1,29 @@
package com.iqser.red.service.ocr.v1.server.service;
-import static com.pdftron.pdf.TextExtractor.e_no_invisible_text;
-import static com.pdftron.pdf.TextExtractor.e_remove_hidden_text;
-
-import java.awt.geom.AffineTransform;
-import java.awt.geom.GeneralPath;
-import java.awt.geom.Point2D;
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.TreeSet;
-
-import org.apache.commons.io.IOUtils;
-import org.springframework.amqp.rabbit.core.RabbitTemplate;
-import org.springframework.stereotype.Service;
-
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse;
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
-import com.iqser.red.service.ocr.v1.server.model.ClippingPathStack;
import com.iqser.red.service.ocr.v1.server.model.ImagePosition;
import com.iqser.red.service.ocr.v1.server.model.image.ImageServiceResponse;
import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings;
import com.iqser.red.service.persistence.service.v1.api.utils.SuppressFBWarnings;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.Rectangle;
-import com.pdftron.common.Matrix2D;
-import com.pdftron.common.PDFNetException;
-import com.pdftron.pdf.CharData;
-import com.pdftron.pdf.CharIterator;
-import com.pdftron.pdf.Element;
-import com.pdftron.pdf.ElementReader;
-import com.pdftron.pdf.ElementWriter;
-import com.pdftron.pdf.Font;
-import com.pdftron.pdf.GSChangesIterator;
-import com.pdftron.pdf.OCRModule;
-import com.pdftron.pdf.OCROptions;
-import com.pdftron.pdf.Optimizer;
-import com.pdftron.pdf.PDFDoc;
-import com.pdftron.pdf.Page;
-import com.pdftron.pdf.PageIterator;
-import com.pdftron.pdf.PathData;
-import com.pdftron.pdf.Rect;
-import com.pdftron.pdf.RectCollection;
-import com.pdftron.pdf.TextExtractor;
-import com.pdftron.sdf.Obj;
+import com.pdftron.pdf.*;
import com.pdftron.sdf.SDFDoc;
-
import io.micrometer.core.annotation.Timed;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
+import org.apache.commons.io.IOUtils;
+import org.springframework.amqp.rabbit.core.RabbitTemplate;
+import org.springframework.stereotype.Service;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.InputStream;
+import java.util.*;
+
@Slf4j
@Service
@@ -73,6 +39,7 @@ public class OCRService {
private final ObjectMapper objectMapper;
+ private final InvisibleElementService invisibleElementService;
@Timed("redactmanager_PDFTron-ocrDocument")
@SneakyThrows
@@ -95,9 +62,11 @@ public class OCRService {
private byte[] ocr(byte[] file, String fileId, ImageServiceResponse imageServiceResponse) {
PDFDoc pdfDoc = null;
+
+ var fileWithoutInvisibleText = invisibleElementService.removeInvisibleTextOrPathElements(file, false);
+
try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
- pdfDoc = new PDFDoc(file);
- removeInvisibleText(pdfDoc);
+ pdfDoc = new PDFDoc(fileWithoutInvisibleText);
Map> pages = new HashMap<>();
@@ -128,7 +97,6 @@ public class OCRService {
ocrDoc.close();
}
-
Optimizer.optimize(pdfDoc);
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
pdfDoc.close();
@@ -206,308 +174,4 @@ public class OCRService {
}
}
-
-
- /**
- * There are 2 possibilities to have invisible Text in pdfs.
- * 1. gState is set to invisible, this is ocr text.
- * 2. Filled Path elements in front of the text.
- */
- @SneakyThrows
- private void removeInvisibleText(PDFDoc pdfDoc) {
-
- ElementWriter writer = new ElementWriter();
- ElementReader reader = new ElementReader();
- Set visited = new TreeSet<>();
-
- for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
- Page page = iterator.next();
- removeOverlapText(page, reader, writer, visited);
- }
- }
-
-
- @SneakyThrows
- private void removeOverlapText(Page page, ElementReader reader, ElementWriter writer, Set visited) {
-
- ClippingPathStack clippingPathStack = new ClippingPathStack(page.getMediaBox());
- visited.add((int) page.getSDFObj().getObjNum());
- reader.begin(page);
- writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
- processElements(reader, writer, visited, clippingPathStack);
- writer.end();
- reader.end();
- }
-
-
- @SneakyThrows
- private void processElements(ElementReader reader, ElementWriter writer, Set visited, ClippingPathStack clippingPathStack) {
-
-
- GSChangesIterator gs_itr = reader.getChangesIterator();
- while (gs_itr.hasNext()) {
- System.out.println("---->" + gs_itr);
- }
-
-
- for (Element element = reader.next(); element != null; element = reader.next())
-
-
- switch (element.getType()) {
-
- case Element.e_text:
- processText(element, writer, clippingPathStack);
- break;
-
- case Element.e_path:
- processPath(element,reader, writer, clippingPathStack);
- break;
-
- case Element.e_form:
- processForm(reader, writer, element, visited, clippingPathStack);
- break;
- case Element.e_group_begin:
- clippingPathStack.enterNewGState();
- writer.writeElement(element);
- break;
- case Element.e_group_end:
- clippingPathStack.leaveGState();
- writer.writeElement(element);
- break;
- default:
- writer.writeElement(element);
- }
- }
-
-
-
- @SneakyThrows
- private void processText(Element textElement, ElementWriter writer, ClippingPathStack clippingPathStack) {
-
-
-// var gState = element.getGState();
-//
-//
-// //See PDF Reference 5.3 Text rendering modes, 3 = Invisible, however this ocr does not use it.
-// if (!filledRectangleIntersection && gState.getTextRenderMode() != 3) {
- Rect rect = textElement.getBBox();
-
- System.out.println(clippingPathStack.getStack().size() + " -> " +textElement.getTextString());
-
-
-// Matrix textMatrix = text.getTextMatrix();
-//// Vector start = textMatrix.transform(new Vector(0, 0));
-//// Vector end = new Vector(start.getX() + text.getWidth(), start.getY());
-// var textMatrix = textElement.getTextMatrix();
-//
-// var start = textMatrix.multPoint(0,0);
-//// Vector start = textMatrix.transform(new Vector(0, 0));
-// var end = new Point2D.Double(start.getX() + textElement.getBBox().getWidth(), start.getY());
-
-// Matrix2D ctm = textElement.getCTM();
-//
-// // To get the exact character positioning information you need to
-// // concatenate current text matrix with CTM and then multiply
-// // relative positioning coordinates with the resulting matrix.
-// //
-// Matrix2D mtx = ctm.multiply(textMatrix);
-// java.awt.geom.Point2D.Double p1 = mtx.multPoint(rect.getX1(), rect.getY1());
-// java.awt.geom.Point2D.Double p2 = mtx.multPoint(rect.getX1(), rect.getY2());
-// java.awt.geom.Point2D.Double p3 = mtx.multPoint(rect.getX2(), rect.getY2());
-// java.awt.geom.Point2D.Double p4 = mtx.multPoint(rect.getX2(), rect.getY1());
-
-
-//
-// double x, y;
-// long char_code;
-//
-// for (CharIterator itr = textElement.getCharIterator(); itr.hasNext(); ) {
-// CharData data = itr.next();
-// char_code = data.getCharCode();
-// //System.out.print("Character code: ");
-//
-// System.out.print(String.valueOf(char_code));
-//
-// x = data.getGlyphX(); // character positioning information
-// y = data.getGlyphY();
-//
-// // Use element.getCTM() if you are interested in the CTM
-// // (current transformation matrix).
-// Matrix2D ctm = textElement.getCTM();
-//
-// var inverse = ctm.inverse();
-//
-//
-//
-// // To get the exact character positioning information you need to
-// // concatenate current text matrix with CTM and then multiply
-// // relative positioning coordinates with the resulting matrix.
-// //
-// Matrix2D mtx = ctm.multiply(textElement.getTextMatrix());
-// java.awt.geom.Point2D.Double t = mtx.multPoint(x, y);
-// x = t.x;
-// y = t.y;
-// System.out.println(" Position: x=" + x + " y=" + y );
-// }
-
-// var in = textElement.getCTM().inverse();
-//
-// var p1 = in.multPoint(rect.getX1(), rect.getY1());
-// var p4 = in.multPoint(rect.getX2(), rect.getY2());
-
-
-
-
-
-// Vector start = textMatrix.transform(new Vector(0, 0));
-// Vector end = new Vector(start.getX() + text.getWidth(), start.getY());
-
- if(clippingPathStack.getCurrentClippingPath().contains(rect.getX1(), rect.getY1(),rect.getWidth(), rect.getHeight())) {
- writer.writeElement(textElement);
- }
-
-
-// }
- }
-
-
- @SneakyThrows
- private void processPath(Element pathElement,ElementReader reader, ElementWriter writer, ClippingPathStack clippingPathStack) {
-
-// System.out.println("New Path");
-
-// System.out.println("ClippingPath: " + pathElement.isClippingPath());
-// System.out.println("ClipWindingFill: " + pathElement.isClipWindingFill());
-// System.out.println("WindingFill: " + pathElement.isWindingFill());
-// System.out.println("Stroke: " + pathElement.isStroked());
-// System.out.println("Filled: " + pathElement.isFilled());
-
- GeneralPath linePath = new GeneralPath();
-
- PathData pathData = pathElement.getPathData();
- double[] dataPoints = pathData.getPoints();
- byte[] opr = pathData.getOperators();
-
- double x1, y1, x2, y2, x3, y3;
-
- int data_index = 0;
- for (int opr_index = 0; opr_index < opr.length; ++opr_index) {
- switch (opr[opr_index]) {
- case PathData.e_moveto:
- x1 = dataPoints[data_index];
- ++data_index;
- y1 = dataPoints[data_index];
- ++data_index;
-// System.out.println(" M" + x1 + " " + y1);
-
- linePath.moveTo(x1, y1);
-
-
- break;
- case PathData.e_lineto:
- x1 = dataPoints[data_index];
- ++data_index;
- y1 = dataPoints[data_index];
- ++data_index;
-// System.out.println(" L" + x1 + " " + y1);
-
- linePath.lineTo(x1, y1);
- break;
- case PathData.e_cubicto:
- x1 = dataPoints[data_index];
- ++data_index;
- y1 = dataPoints[data_index];
- ++data_index;
- x2 = dataPoints[data_index];
- ++data_index;
- y2 = dataPoints[data_index];
- ++data_index;
- x3 = dataPoints[data_index];
- ++data_index;
- y3 = dataPoints[data_index];
- ++data_index;
-// System.out.println(" CU P1 " + x1 + " " + y1 + " P2 " + x2 + " " + y2 + " P3 " + x3 + " " + y3);
-
- linePath.curveTo(x1, y1, x2, y2, x3, y3);
-
- break;
- case PathData.e_rect: {
- x1 = dataPoints[data_index];
- ++data_index;
- y1 = dataPoints[data_index];
- ++data_index;
- double w = dataPoints[data_index];
- ++data_index;
- double h = dataPoints[data_index];
- ++data_index;
- x2 = x1 + w;
- y2 = y1;
- x3 = x2;
- y3 = y1 + h;
- double x4 = x1;
- double y4 = y3;
-// System.out.println(" RE P1 " + x1 + " " + y1 + " P2 " + x2 + " " + y2 + " P3 " + x3 + " " + y3 + " P4 " + x4 + " " + y4);
-
-
- linePath.moveTo(x1, y1);
- linePath.lineTo(x2, y2);
- linePath.lineTo(x3, y3);
- linePath.lineTo(x4, y4);
-
- }
-
- break;
- case PathData.e_closepath:
- linePath.closePath();
- break;
- default:
- throw new PDFNetException("Invalid Element Type", 0, "", "", "");
- }
-
-
- }
-
-
-
- // ClipWindingFill = true W = non-zero
- // ClipWindingFill = false W* = even-odd
-
-
- if(pathElement.isClippingPath()){
- if(pathElement.isClipWindingFill()){
- linePath.setWindingRule(GeneralPath.WIND_NON_ZERO);
- } else {
- linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD);
- }
- clippingPathStack.intersectClippingPath(linePath, pathElement.getCTM());
- }
-
-
-
- writer.writeElement(pathElement);
- }
-
-
- @SneakyThrows
- private void processForm(ElementReader reader, ElementWriter writer, Element element, Set visited, ClippingPathStack clippingPathStack) {
-
- writer.writeElement(element);
- Obj formObj = element.getXObject();
-
- if (!visited.contains((int) formObj.getObjNum())) {
- visited.add((int) formObj.getObjNum());
- System.out.println("Form num:" +(int) formObj.getObjNum());
- ElementWriter new_writer = new ElementWriter();
- reader.formBegin();
- new_writer.begin(formObj);
-
- reader.clearChangeList();
- new_writer.setDefaultGState(reader);
-
- processElements(reader, new_writer, visited, clippingPathStack);
- new_writer.end();
- reader.end();
- }
- }
-
}
diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/InvisibleElementServiceTest.java b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/InvisibleElementServiceTest.java
new file mode 100644
index 0000000..709ea4c
--- /dev/null
+++ b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/InvisibleElementServiceTest.java
@@ -0,0 +1,64 @@
+package com.iqser.red.service.ocr.v1.server;
+
+import com.iqser.red.service.ocr.v1.server.service.InvisibleElementService;
+import lombok.SneakyThrows;
+import org.apache.commons.io.FileUtils;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.extension.ExtendWith;
+import org.springframework.amqp.rabbit.core.RabbitTemplate;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.test.context.SpringBootTest;
+import org.springframework.boot.test.mock.mockito.MockBean;
+import org.springframework.context.annotation.Import;
+import org.springframework.core.io.ClassPathResource;
+import org.springframework.test.context.junit.jupiter.SpringExtension;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Files;
+
+import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
+
+/** Runs {@link InvisibleElementService} on {@code files/InvisibleText.pdf} and writes both results to the temporary directory. */
+@ExtendWith(SpringExtension.class)
+@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT //
+        , properties = {"pdftron.ocrmodule.path=/YourOCRModulePath"})
+@Import(OcrServiceIntegrationTest.TestConfiguration.class)
+public class InvisibleElementServiceTest {
+
+    @Autowired
+    private InvisibleElementService invisibleElementService;
+
+    @MockBean
+    protected RabbitTemplate rabbitTemplate;
+
+    /** Calls the service once with the flag {@code false} and once with {@code true}; makes no assertions, only prints the output paths. */
+    @Test
+    @SneakyThrows
+    public void testRemoveInvisibleText() {
+        String fileName = "InvisibleText";
+        ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf");
+        var initialFileBytes = Files.readAllBytes(pdfFileResource.getFile().toPath());
+
+        var fileWithoutInvisibleText = invisibleElementService.removeInvisibleTextOrPathElements(initialFileBytes, false);
+        var deltaFile = invisibleElementService.removeInvisibleTextOrPathElements(initialFileBytes, true);
+
+        String fileWithoutInvisibleTextLocation = getTemporaryDirectory() + "/" + fileName + ".pdf";
+        String deltaFileLocation = getTemporaryDirectory() + "/" + fileName + "_delta.pdf";
+
+        saveToFile(fileWithoutInvisibleTextLocation, fileWithoutInvisibleText);
+        saveToFile(deltaFileLocation, deltaFile);
+
+        System.out.println("File:" + fileWithoutInvisibleTextLocation);
+        System.out.println("File:" + deltaFileLocation);
+    }
+
+    /** Writes {@code fileBytes} to {@code location}; an {@link IOException} is rethrown unchecked with the original cause attached. */
+    private void saveToFile(String location, byte[] fileBytes) {
+        try (var out = FileUtils.openOutputStream(new File(location))) {
+            out.write(fileBytes);
+        } catch (IOException e) {
+            throw new RuntimeException("File location: " + location + " could not be opened, no file will be saved", e);
+        }
+    }
+}
diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java
index 9e4e9f5..ceb9db3 100644
--- a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java
+++ b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java
@@ -1,10 +1,13 @@
package com.iqser.red.service.ocr.v1.server;
-import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
-import static org.assertj.core.api.Assertions.assertThat;
-
-import java.io.File;
-
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.iqser.red.service.ocr.v1.server.service.FileStorageService;
+import com.iqser.red.service.ocr.v1.server.service.OCRService;
+import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService;
+import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType;
+import com.iqser.red.storage.commons.StorageAutoConfiguration;
+import com.iqser.red.storage.commons.service.StorageService;
+import lombok.SneakyThrows;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.AfterEach;
@@ -24,15 +27,10 @@ import org.springframework.context.annotation.Primary;
import org.springframework.core.io.ClassPathResource;
import org.springframework.test.context.junit.jupiter.SpringExtension;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService;
-import com.iqser.red.service.ocr.v1.server.service.OCRService;
-import com.iqser.red.service.ocr.v1.server.service.FileStorageService;
-import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType;
-import com.iqser.red.storage.commons.StorageAutoConfiguration;
-import com.iqser.red.storage.commons.service.StorageService;
+import java.io.File;
-import lombok.SneakyThrows;
+import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
+import static org.assertj.core.api.Assertions.assertThat;
@ExtendWith(SpringExtension.class)
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT //
@@ -86,10 +84,10 @@ public class OcrServiceIntegrationTest {
@SneakyThrows
public void testRemoveInvisibleText() {
- String fileName = "InvisiblePathElements";
+ String fileName = "ocr/OCR Docs/MK244 - Fitness of Analytical Method - Physical-Chemical Pro";
// String fileName = "InvisiblePathElements";
- ClassPathResource imageInfoResource = new ClassPathResource("files/" + fileName + ".IMAGE_INFO.json");
+ ClassPathResource imageInfoResource = new ClassPathResource("files/InvisibleText.IMAGE_INFO.json");
ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf");
var originId = FileStorageService.getStorageId("dossier", "file", FileType.ORIGIN);
@@ -107,8 +105,6 @@ public class OcrServiceIntegrationTest {
}
-
-
@SneakyThrows
public void dummyTest() {
diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/FooterText.pdf b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/FooterText.pdf
new file mode 100644
index 0000000..93452da
Binary files /dev/null and b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/FooterText.pdf differ
diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/LargeTableCoveringText.pdf b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/LargeTableCoveringText.pdf
new file mode 100644
index 0000000..e6d9a07
Binary files /dev/null and b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/LargeTableCoveringText.pdf differ
diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/testDmitryK.pdf b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/testDmitryK.pdf
new file mode 100644
index 0000000..b15b478
Binary files /dev/null and b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/testDmitryK.pdf differ
diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/testFailed.pdf b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/testFailed.pdf
new file mode 100644
index 0000000..1b653ab
Binary files /dev/null and b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/testFailed.pdf differ