diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ClippingPathStack.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ClippingPathStack.java index 1c3b1cd..04a30dc 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ClippingPathStack.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ClippingPathStack.java @@ -1,58 +1,53 @@ package com.iqser.red.service.ocr.v1.server.model; -import java.awt.geom.AffineTransform; -import java.awt.geom.Area; -import java.awt.geom.GeneralPath; -import java.util.ArrayDeque; -import java.util.Deque; - -import com.pdftron.common.Matrix2D; import com.pdftron.pdf.Rect; - import lombok.Data; import lombok.SneakyThrows; +import java.awt.geom.Area; +import java.awt.geom.GeneralPath; +import java.awt.geom.Rectangle2D; +import java.util.ArrayDeque; +import java.util.Deque; + @Data public class ClippingPathStack { private Deque stack = new ArrayDeque<>(); @SneakyThrows - public ClippingPathStack(Rect rectangle) - { - GeneralPath path = new GeneralPath(); - path.moveTo(rectangle.getX1(), rectangle.getY1()); - path.lineTo(rectangle.getX2(), rectangle.getY1()); - path.lineTo(rectangle.getX2(), rectangle.getY2()); - path.lineTo(rectangle.getX1(), rectangle.getY2()); - path.closePath(); - stack.push(new Area(path)); + public ClippingPathStack(Rect rectangle) { + stack.push(new Area(new Rectangle2D.Double(rectangle.getX1(), rectangle.getY1(), rectangle.getWidth(), rectangle.getHeight()).getBounds2D())); } @SneakyThrows - public void intersectClippingPath(GeneralPath path, Matrix2D ctm){ - var affineTransform = new AffineTransform(ctm.getA(), ctm.getB(), ctm.getC(), ctm.getD(), ctm.getH(), ctm.getV()); - path.transform(affineTransform); -// var area = getCurrentClippingPath(); -// area.transform(affineTransform); + public void intersectClippingPath(GeneralPath path) { getCurrentClippingPath().intersect(new Area(path)); } + public boolean almostIntersects(double x, double y, double width, double height) { + // Height or width are zero for straight lines, even though they are being rendered. Therefore, height or width must be at minimum >0. + double tolerance = 1e-3; + double x_with_tolerance = x > 0 ? x - tolerance : x + tolerance; + double y_with_tolerance = y > 0 ? y - tolerance : y + tolerance; + double width_with_tolerance = width + 2 * tolerance; + double height_with_tolerance = height + 2 * tolerance; + return getCurrentClippingPath().intersects(x_with_tolerance, y_with_tolerance, width_with_tolerance, height_with_tolerance); + } public Area getCurrentClippingPath() { - return stack.peek(); } - public void enterNewGState(){ + public void enterNewGState() { Area current = stack.peek(); Area cloned = new Area(); cloned.add(current); stack.push(cloned); } - public void leaveGState(){ + public void leaveGState() { stack.pop(); } diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ElementFeatures.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ElementFeatures.java new file mode 100644 index 0000000..f150973 --- /dev/null +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ElementFeatures.java @@ -0,0 +1,149 @@ +package com.iqser.red.service.ocr.v1.server.model; + +import com.pdftron.common.PDFNetException; +import com.pdftron.pdf.Element; +import com.pdftron.pdf.Rect; +import lombok.*; +import lombok.experimental.SuperBuilder; + +import java.awt.geom.Rectangle2D; + +@Data +@SuperBuilder +@NoArgsConstructor +@AllArgsConstructor +public abstract class ElementFeatures { + private int elementType; + private Rectangle2D boundingBox; + + public boolean almostMatches(Element element) throws PDFNetException { + if (element.getType() != elementType) return false; + if (element.getBBox() == null) return false; + return rectsAlmostMatch(element.getBBox()); + } + + protected boolean almostEqual(double a, double b) { + double tolerance = 1e-3; + return Math.abs(a - b) < tolerance; + } + + @SneakyThrows + private boolean rectsAlmostMatch(Rect bBox) { + if (!almostEqual(bBox.getX1(), boundingBox.getX())) return false; + if (!almostEqual(bBox.getY1(), boundingBox.getY())) return false; + if (!almostEqual(bBox.getWidth(), boundingBox.getWidth())) return false; + return almostEqual(bBox.getHeight(), boundingBox.getHeight()); + } + + @EqualsAndHashCode(callSuper = true) + @Data + @SuperBuilder + @NoArgsConstructor + @AllArgsConstructor + public static class Text extends ElementFeatures { + private String text; + private int font; + private double fontsize; + + @Override + public boolean almostMatches(Element element) throws PDFNetException { + if (!super.almostMatches(element)) return false; + if (!text.equals(element.getTextString())) return false; + if (font != element.getGState().getFont().getType()) return false; + return almostEqual(fontsize, element.getGState().getFontSize()); + } + + } + + @EqualsAndHashCode(callSuper = true) + @Data + @SuperBuilder + @NoArgsConstructor + @AllArgsConstructor + public static class Path extends ElementFeatures { + private boolean isClippingPath; + private boolean isClipWindingFill; + private boolean isStroked; + private boolean isFilled; + private boolean isWindingFill; + + @Override + public boolean almostMatches(Element element) throws PDFNetException { + if (!super.almostMatches(element)) return false; + if (isClippingPath != element.isClippingPath()) return false; + if (isClipWindingFill != element.isClipWindingFill()) return false; + if (isStroked != element.isStroked()) return false; + if (isFilled != element.isFilled()) return false; + if (isWindingFill != element.isWindingFill()) return false; + + return true; + } + } + + @EqualsAndHashCode(callSuper = true) + @Data + @SuperBuilder + @NoArgsConstructor + public static class Image extends ElementFeatures { + private int dataSize; + private int height; + private int width; + private int renderingIntent; + private int componentNum; + private int bitsPerComponent; + + @Override + public boolean almostMatches(Element element) throws PDFNetException { + if (!super.almostMatches(element)) return false; + if (dataSize != element.getImageDataSize()) return false; + if (height != element.getImageHeight()) return false; + if (width != element.getImageWidth()) return false; + if (renderingIntent != element.getImageRenderingIntent()) return false; + if (componentNum != element.getComponentNum()) return false; + if (bitsPerComponent != element.getBitsPerComponent()) return false; + return true; + } + } + + public static ElementFeatures extractFeatures(Element element) throws PDFNetException { + switch (element.getType()) { + case Element.e_path: + return ElementFeatures.Path.builder() + .elementType(element.getType()) + .boundingBox(toRectangle2D(element.getBBox())) + .isClippingPath(element.isClippingPath()) + .isClipWindingFill(element.isClipWindingFill()) + .isStroked(element.isStroked()) + .isFilled(element.isFilled()) + .isWindingFill(element.isWindingFill()) + .build(); + case Element.e_text: + return ElementFeatures.Text.builder() + .elementType(element.getType()) + .boundingBox(toRectangle2D(element.getBBox())) + .text(element.getTextString()) + .font(element.getGState().getFont().getType()) + .fontsize(element.getGState().getFontSize()) + .build(); + case Element.e_image: + case Element.e_inline_image: + return Image.builder() + .elementType(element.getType()) + .boundingBox(toRectangle2D(element.getBBox())) + .dataSize(element.getImageDataSize()) + .height(element.getImageHeight()) + .width(element.getImageWidth()) + .renderingIntent(element.getImageRenderingIntent()) + .componentNum(element.getComponentNum()) + .bitsPerComponent(element.getBitsPerComponent()) + .build(); + default: + throw new UnsupportedOperationException("Feature Extraction is not supported for PDFTron.Element with type: " + element.getType()); + } + } + + private static Rectangle2D toRectangle2D(Rect rect) throws PDFNetException { + return new Rectangle2D.Double(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight()); + } +} + diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementService.java new file mode 100644 index 0000000..7b9ff3d --- /dev/null +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementService.java @@ -0,0 +1,463 @@ +package com.iqser.red.service.ocr.v1.server.service; + +import com.iqser.red.service.ocr.v1.server.model.ClippingPathStack; +import com.iqser.red.service.ocr.v1.server.model.ElementFeatures; +import com.pdftron.common.Matrix2D; +import com.pdftron.common.PDFNetException; +import com.pdftron.pdf.*; +import com.pdftron.sdf.Obj; +import com.pdftron.sdf.SDFDoc; +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; +import org.apache.commons.io.FileUtils; +import org.springframework.stereotype.Service; + +import java.awt.*; +import java.awt.geom.AffineTransform; +import java.awt.geom.GeneralPath; +import java.awt.geom.Rectangle2D; +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Set; +import java.util.TreeSet; +import java.util.stream.Collectors; + +@Slf4j +@Service +public class InvisibleElementService { + + /* + handled cases: + Text or Path outside of clipping path + Text which is transparent or is set to not render + Text or Path that have been painted over by visible and filled Paths + unhandled cases: + Text covered by widely stroked path + Text same color as background + Any Text set to clipping with its many interactions with other elements + */ + @SneakyThrows + public byte[] removeInvisibleTextOrPathElements(byte[] pdfFile, boolean delta) { + PDFDoc pdfDoc = new PDFDoc(pdfFile); + + ElementWriter writer = new ElementWriter(); + ElementReader reader = new ElementReader(); + Set visited = new TreeSet<>(); + + for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) { + Page page = iterator.next(); + List overlappedElements = removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, reader, writer, visited, delta); + visited.clear(); + removeOverlappedElements(page, reader, writer, visited, overlappedElements, delta); + } + + if (delta) { + debugSave(pdfDoc.save(SDFDoc.SaveMode.LINEARIZED, null)); + } + + return pdfDoc.save(SDFDoc.SaveMode.LINEARIZED, null); + } + + private List removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(Page page, ElementReader reader, ElementWriter writer, Set visited, boolean delta) + throws PDFNetException { + var overlappedElements = new ArrayList(); + var visibleElements = new ArrayList(); + ClippingPathStack clippingPathStack = new ClippingPathStack(page.getMediaBox()); + visited.add((int) page.getSDFObj().getObjNum()); + reader.begin(page); + writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict()); + processElements(reader, writer, visited, clippingPathStack, delta, overlappedElements, visibleElements); + writer.end(); + reader.end(); + return overlappedElements; + } + + private void processElements(ElementReader reader, ElementWriter writer, Set visited, ClippingPathStack clippingPathStack, Boolean delta, + List coveredElements, List visibleElements) + throws PDFNetException { + + for (Element element = reader.next(); element != null; element = reader.next()) + switch (element.getType()) { + + case Element.e_image: + case Element.e_inline_image: + processImages(element, writer, clippingPathStack, delta, visibleElements); + break; + + case Element.e_text: + processText(element, writer, clippingPathStack, delta, visibleElements); + break; + + case Element.e_path: + processPath(element, writer, clippingPathStack, delta, coveredElements, visibleElements); + break; + + case Element.e_form: + processForm(reader, writer, element, visited, clippingPathStack, delta, coveredElements, visibleElements); + break; + + case Element.e_group_begin: + clippingPathStack.enterNewGState(); + writer.writeElement(element); + break; + + case Element.e_group_end: + clippingPathStack.leaveGState(); + writer.writeElement(element); + break; + + default: + writer.writeElement(element); + } + } + + private void processImages(Element imageElement, ElementWriter writer, ClippingPathStack clippingPathStack, Boolean delta, List visibleElements) + throws PDFNetException { + + Rect rect = imageElement.getBBox(); + + if (rect == null) { + return; + } + + boolean inClippingPath = clippingPathStack.almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight()); + + if (!delta && inClippingPath) { + visibleElements.add(ElementFeatures.extractFeatures(imageElement)); + writer.writeElement(imageElement); + } + + if (delta && !inClippingPath) { + writer.writeElement(imageElement); + } + } + + private void processText(Element textElement, ElementWriter writer, ClippingPathStack clippingPathStack, + Boolean delta, List visibleElements) + throws PDFNetException { + + Rect rect = textElement.getBBox(); + + if (rect == null) { + writer.writeElement(textElement); + return; + } + + GState gState = textElement.getGState(); + + boolean inClippingPath = clippingPathStack.almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight()); + + boolean isTextVisible = isTextRenderedVisibly(gState); + + if (inClippingPath && isTextVisible) { + visibleElements.add(ElementFeatures.extractFeatures(textElement)); + } + if (!delta) { + if (inClippingPath && isTextVisible) { + writer.writeElement(textElement); + } else if (textElement.hasTextMatrix()) { + /* + PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element. + hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands. + Therefore, the position of a following Tj is affected by not writing the first Element. + This is why, we write only the Tm command: + */ + writer.writeGStateChanges(textElement); + } + } else { + if (!inClippingPath) { + gState.setFillColorSpace(ColorSpace.createDeviceRGB()); + gState.setFillColor(new ColorPt(1, 0, 0)); + writer.writeElement(textElement); + } + if (!isTextVisible) { + gState.setFillColorSpace(ColorSpace.createDeviceRGB()); + gState.setFillColor(new ColorPt(0, 0, 1)); + gState.setTextRenderMode(GState.e_fill_text); + gState.setFillOpacity(1); + writer.writeElement(textElement); + } + } + } + + private void processForm(ElementReader reader, ElementWriter writer, Element element, Set visited, ClippingPathStack clippingPathStack, Boolean delta, + List coveredElements, List allElements) + throws PDFNetException { + + writer.writeElement(element); + Obj formObj = element.getXObject(); + + if (!visited.contains((int) formObj.getObjNum())) { + visited.add((int) formObj.getObjNum()); + ElementWriter new_writer = new ElementWriter(); + reader.formBegin(); + new_writer.begin(formObj); + + reader.clearChangeList(); + new_writer.setDefaultGState(reader); + + processElements(reader, new_writer, visited, clippingPathStack, delta, coveredElements, allElements); + new_writer.end(); + reader.end(); + } + } + + private void processPath(Element pathElement, ElementWriter writer, ClippingPathStack clippingPathStack, Boolean delta, + List overlappedElements, List visibleElements) + throws PDFNetException { + + GeneralPath linePath = convertToGeneralPath(pathElement.getPathData()); + + //transform path to initial user space + var ctm = pathElement.getCTM(); + var affineTransform = getAffineTransform(ctm); + linePath.transform(affineTransform); + + var rect = linePath.getBounds2D(); + + boolean inClippingPath = clippingPathStack.almostIntersects(rect.getX(), rect.getY(), rect.getWidth(), rect.getHeight()); + + if (pathElement.isClippingPath()) { + if (pathElement.isClipWindingFill()) { + linePath.setWindingRule(GeneralPath.WIND_NON_ZERO); + } else { + linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD); + } + + clippingPathStack.intersectClippingPath(linePath); + pathElement.setPathClip(!delta); + writer.writeElement(pathElement); + + } else { + if (inClippingPath) { + if (isFilledAndNonTransparent(pathElement)) { + List currentOverlappedElements = visibleElements.stream() + .filter(features -> almostContains(linePath, features.getBoundingBox())) + .collect(Collectors.toList()); + overlappedElements.addAll(currentOverlappedElements); + visibleElements.removeAll(currentOverlappedElements); + } + visibleElements.add(ElementFeatures.extractFeatures(pathElement)); + if (!delta) { + writer.writeElement(pathElement); + } + } + if (delta && !inClippingPath) { + pathElement.getGState().setFillColorSpace(ColorSpace.createDeviceRGB()); + pathElement.getGState().setFillColor(new ColorPt(1, 0, 0)); + pathElement.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB()); + pathElement.getGState().setStrokeColor(new ColorPt(1, 0, 0)); + writer.writeElement(pathElement); + } + } + } + + private static AffineTransform getAffineTransform(Matrix2D ctm) throws PDFNetException { + return new AffineTransform(ctm.getA(), ctm.getB(), ctm.getC(), ctm.getD(), ctm.getH(), ctm.getV()); + } + + private void removeOverlappedElements(Page page, ElementReader reader, ElementWriter writer, Set visited, List overlappedElements, boolean delta) + throws PDFNetException { + reader.begin(page); + writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict()); + if (delta) { + overlappedElements.forEach(feature -> drawBBox(writer, feature.getBoundingBox(), "#00FF00")); + overlappedElements.clear(); + } + processOverlappedElements(reader, writer, visited, overlappedElements, delta); + writer.end(); + reader.end(); + + if (overlappedElements.size() > 0) { + log.warn(overlappedElements.size() + " overlapped elements have not been found and removed"); + } + } + + private void processOverlappedElements(ElementReader reader, ElementWriter writer, Set visited, List coveredElements, boolean delta) + throws PDFNetException { + for (Element element = reader.next(); element != null; element = reader.next()) { + switch (element.getType()) { + case Element.e_form: + processFormOverlappedElements(reader, writer, element, visited, coveredElements, delta); + break; + case Element.e_path: + case Element.e_image: + case Element.e_inline_image: + case Element.e_text: + boolean anyMatch = false; + for (ElementFeatures elementToRemove : coveredElements) { + if (elementToRemove.almostMatches(element)) { + coveredElements.remove(elementToRemove); + anyMatch = true; + break; + } + } + if (!anyMatch) { + writer.writeElement(element); + } else if (element.getType() == 3 && element.hasTextMatrix()) { + /* + PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element. + hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands. + Therefore, the position of a following Tj is affected by not writing the first Element. + This is why, we write only the Tm command: + */ + writer.writeGStateChanges(element); + } + break; + + default: + writer.writeElement(element); + + } + } + } + + private void processFormOverlappedElements(ElementReader reader, ElementWriter writer, Element element, Set visited, List elementsToRemove, boolean delta) + throws PDFNetException { + + writer.writeElement(element); + Obj formObj = element.getXObject(); + + if (!visited.contains((int) formObj.getObjNum())) { + visited.add((int) formObj.getObjNum()); + ElementWriter new_writer = new ElementWriter(); + reader.formBegin(); + new_writer.begin(formObj); + + reader.clearChangeList(); + new_writer.setDefaultGState(reader); + + processOverlappedElements(reader, new_writer, visited, elementsToRemove, delta); + new_writer.end(); + reader.end(); + } + } + + private boolean isTextRenderedVisibly(GState gState) throws PDFNetException { + if (gState.getTextRenderMode() == GState.e_invisible_text) return false; + if (gState.getTextRenderMode() == GState.e_fill_text && gState.getFillOpacity() == 0) return false; + if (gState.getTextRenderMode() == GState.e_stroke_text && gState.getStrokeOpacity() == 0) return false; + if (gState.getTextRenderMode() == GState.e_fill_stroke_text && gState.getFillOpacity() == 0 && gState.getStrokeOpacity() == 0) return false; + return true; + } + + private GeneralPath convertToGeneralPath(PathData pathData) throws PDFNetException { + GeneralPath linePath = new GeneralPath(); + + double[] dataPoints = pathData.getPoints(); + byte[] opr = pathData.getOperators(); + + double x1; + double y1; + double x2; + double y2; + double x3; + double y3; + + int data_index = 0; + for (int opr_index = 0; opr_index < opr.length; ++opr_index) { + switch (opr[opr_index]) { + case PathData.e_moveto: + x1 = dataPoints[data_index]; + ++data_index; + y1 = dataPoints[data_index]; + ++data_index; + + linePath.moveTo(x1, y1); + break; + case PathData.e_lineto: + x1 = dataPoints[data_index]; + ++data_index; + y1 = dataPoints[data_index]; + ++data_index; + + linePath.lineTo(x1, y1); + break; + case PathData.e_cubicto: + x1 = dataPoints[data_index]; + ++data_index; + y1 = dataPoints[data_index]; + ++data_index; + x2 = dataPoints[data_index]; + ++data_index; + y2 = dataPoints[data_index]; + ++data_index; + x3 = dataPoints[data_index]; + ++data_index; + y3 = dataPoints[data_index]; + ++data_index; + + linePath.curveTo(x1, y1, x2, y2, x3, y3); + break; + case PathData.e_rect: + x1 = dataPoints[data_index]; + ++data_index; + y1 = dataPoints[data_index]; + ++data_index; + double w = dataPoints[data_index]; + ++data_index; + double h = dataPoints[data_index]; + ++data_index; + x2 = x1 + w; + y2 = y1; + x3 = x2; + y3 = y1 + h; + double x4 = x1; + double y4 = y3; + + linePath.moveTo(x1, y1); + linePath.lineTo(x2, y2); + linePath.lineTo(x3, y3); + linePath.lineTo(x4, y4); + break; + case PathData.e_closepath: + linePath.closePath(); + break; + default: + throw new PDFNetException("Invalid Element Type", 0, "", "", ""); + } + } + return linePath; + } + + private boolean almostContains(Shape outer, Rectangle2D inner) { + double tolerance = 1e-3; + + double x_with_tolerance = inner.getX() >= 0 ? inner.getX() + tolerance : inner.getX() - tolerance; + double y_with_tolerance = inner.getY() >= 0 ? inner.getY() + tolerance : inner.getY() - tolerance; + double height_with_tolerance = inner.getHeight() - (2 * tolerance); + double width_with_tolerance = inner.getWidth() - (2 * tolerance); + Rectangle2D innerRect = new Rectangle2D.Double(x_with_tolerance, y_with_tolerance, width_with_tolerance, height_with_tolerance); + + return outer.contains(innerRect); + } + + private boolean isFilledAndNonTransparent(Element element) throws PDFNetException { + return element.isFilled() && element.getGState().getFillOpacity() == 1; + } + + + private void debugSave(byte[] pdfFile) { + String fileLocation = "/tmp/delta.pdf"; + try (var f_out = FileUtils.openOutputStream(new File(fileLocation))) { + f_out.write(pdfFile); + } catch (IOException e) { + throw new RuntimeException("File location: " + fileLocation + "could not be openend, no file will be saved"); + } + } + + @SneakyThrows + private void drawBBox(ElementWriter writer, Rectangle2D r, String hexcolor) { + ColorPt colorPt = new ColorPt(Integer.valueOf(hexcolor.substring(1, 3), 16) / 255d, + Integer.valueOf(hexcolor.substring(3, 5), 16) / 255d, + Integer.valueOf(hexcolor.substring(5, 7), 16) / 255d); + ElementBuilder eb = new ElementBuilder(); + Element rect = eb.createRect(r.getX(), r.getY(), r.getWidth(), r.getHeight()); + rect.setPathStroke(true); + rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB()); + rect.getGState().setStrokeColor(colorPt); + writer.writePlacedElement(rect); + } +} diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java index 4a3f15f..49d1b42 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java @@ -1,63 +1,29 @@ package com.iqser.red.service.ocr.v1.server.service; -import static com.pdftron.pdf.TextExtractor.e_no_invisible_text; -import static com.pdftron.pdf.TextExtractor.e_remove_hidden_text; - -import java.awt.geom.AffineTransform; -import java.awt.geom.GeneralPath; -import java.awt.geom.Point2D; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.InputStream; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.TreeSet; - -import org.apache.commons.io.IOUtils; -import org.springframework.amqp.rabbit.core.RabbitTemplate; -import org.springframework.stereotype.Service; - import com.fasterxml.jackson.databind.ObjectMapper; import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse; import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration; -import com.iqser.red.service.ocr.v1.server.model.ClippingPathStack; import com.iqser.red.service.ocr.v1.server.model.ImagePosition; import com.iqser.red.service.ocr.v1.server.model.image.ImageServiceResponse; import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings; import com.iqser.red.service.persistence.service.v1.api.utils.SuppressFBWarnings; import com.iqser.red.service.redaction.v1.model.Point; import com.iqser.red.service.redaction.v1.model.Rectangle; -import com.pdftron.common.Matrix2D; -import com.pdftron.common.PDFNetException; -import com.pdftron.pdf.CharData; -import com.pdftron.pdf.CharIterator; -import com.pdftron.pdf.Element; -import com.pdftron.pdf.ElementReader; -import com.pdftron.pdf.ElementWriter; -import com.pdftron.pdf.Font; -import com.pdftron.pdf.GSChangesIterator; -import com.pdftron.pdf.OCRModule; -import com.pdftron.pdf.OCROptions; -import com.pdftron.pdf.Optimizer; -import com.pdftron.pdf.PDFDoc; -import com.pdftron.pdf.Page; -import com.pdftron.pdf.PageIterator; -import com.pdftron.pdf.PathData; -import com.pdftron.pdf.Rect; -import com.pdftron.pdf.RectCollection; -import com.pdftron.pdf.TextExtractor; -import com.pdftron.sdf.Obj; +import com.pdftron.pdf.*; import com.pdftron.sdf.SDFDoc; - import io.micrometer.core.annotation.Timed; import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; +import org.apache.commons.io.IOUtils; +import org.springframework.amqp.rabbit.core.RabbitTemplate; +import org.springframework.stereotype.Service; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.InputStream; +import java.util.*; + @Slf4j @Service @@ -73,6 +39,7 @@ public class OCRService { private final ObjectMapper objectMapper; + private final InvisibleElementService invisibleElementService; @Timed("redactmanager_PDFTron-ocrDocument") @SneakyThrows @@ -95,9 +62,11 @@ public class OCRService { private byte[] ocr(byte[] file, String fileId, ImageServiceResponse imageServiceResponse) { PDFDoc pdfDoc = null; + + var fileWithoutInvisibleText = invisibleElementService.removeInvisibleTextOrPathElements(file, false); + try (ByteArrayOutputStream out = new ByteArrayOutputStream()) { - pdfDoc = new PDFDoc(file); - removeInvisibleText(pdfDoc); + pdfDoc = new PDFDoc(fileWithoutInvisibleText); Map> pages = new HashMap<>(); @@ -128,7 +97,6 @@ public class OCRService { ocrDoc.close(); } - Optimizer.optimize(pdfDoc); pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null); pdfDoc.close(); @@ -206,308 +174,4 @@ public class OCRService { } } - - - /** - * There are 2 possibilities to have invisible Text in pdfs. - * 1. gState is set to invisible, this is ocr text. - * 2. Filled Path elements in front of the text. - */ - @SneakyThrows - private void removeInvisibleText(PDFDoc pdfDoc) { - - ElementWriter writer = new ElementWriter(); - ElementReader reader = new ElementReader(); - Set visited = new TreeSet<>(); - - for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) { - Page page = iterator.next(); - removeOverlapText(page, reader, writer, visited); - } - } - - - @SneakyThrows - private void removeOverlapText(Page page, ElementReader reader, ElementWriter writer, Set visited) { - - ClippingPathStack clippingPathStack = new ClippingPathStack(page.getMediaBox()); - visited.add((int) page.getSDFObj().getObjNum()); - reader.begin(page); - writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict()); - processElements(reader, writer, visited, clippingPathStack); - writer.end(); - reader.end(); - } - - - @SneakyThrows - private void processElements(ElementReader reader, ElementWriter writer, Set visited, ClippingPathStack clippingPathStack) { - - - GSChangesIterator gs_itr = reader.getChangesIterator(); - while (gs_itr.hasNext()) { - System.out.println("---->" + gs_itr); - } - - - for (Element element = reader.next(); element != null; element = reader.next()) - - - switch (element.getType()) { - - case Element.e_text: - processText(element, writer, clippingPathStack); - break; - - case Element.e_path: - processPath(element,reader, writer, clippingPathStack); - break; - - case Element.e_form: - processForm(reader, writer, element, visited, clippingPathStack); - break; - case Element.e_group_begin: - clippingPathStack.enterNewGState(); - writer.writeElement(element); - break; - case Element.e_group_end: - clippingPathStack.leaveGState(); - writer.writeElement(element); - break; - default: - writer.writeElement(element); - } - } - - - - @SneakyThrows - private void processText(Element textElement, ElementWriter writer, ClippingPathStack clippingPathStack) { - - -// var gState = element.getGState(); -// -// -// //See PDF Reference 5.3 Text rendering modes, 3 = Invisible, however this ocr does not use it. -// if (!filledRectangleIntersection && gState.getTextRenderMode() != 3) { - Rect rect = textElement.getBBox(); - - System.out.println(clippingPathStack.getStack().size() + " -> " +textElement.getTextString()); - - -// Matrix textMatrix = text.getTextMatrix(); -//// Vector start = textMatrix.transform(new Vector(0, 0)); -//// Vector end = new Vector(start.getX() + text.getWidth(), start.getY()); -// var textMatrix = textElement.getTextMatrix(); -// -// var start = textMatrix.multPoint(0,0); -//// Vector start = textMatrix.transform(new Vector(0, 0)); -// var end = new Point2D.Double(start.getX() + textElement.getBBox().getWidth(), start.getY()); - -// Matrix2D ctm = textElement.getCTM(); -// -// // To get the exact character positioning information you need to -// // concatenate current text matrix with CTM and then multiply -// // relative positioning coordinates with the resulting matrix. -// // -// Matrix2D mtx = ctm.multiply(textMatrix); -// java.awt.geom.Point2D.Double p1 = mtx.multPoint(rect.getX1(), rect.getY1()); -// java.awt.geom.Point2D.Double p2 = mtx.multPoint(rect.getX1(), rect.getY2()); -// java.awt.geom.Point2D.Double p3 = mtx.multPoint(rect.getX2(), rect.getY2()); -// java.awt.geom.Point2D.Double p4 = mtx.multPoint(rect.getX2(), rect.getY1()); - - -// -// double x, y; -// long char_code; -// -// for (CharIterator itr = textElement.getCharIterator(); itr.hasNext(); ) { -// CharData data = itr.next(); -// char_code = data.getCharCode(); -// //System.out.print("Character code: "); -// -// System.out.print(String.valueOf(char_code)); -// -// x = data.getGlyphX(); // character positioning information -// y = data.getGlyphY(); -// -// // Use element.getCTM() if you are interested in the CTM -// // (current transformation matrix). -// Matrix2D ctm = textElement.getCTM(); -// -// var inverse = ctm.inverse(); -// -// -// -// // To get the exact character positioning information you need to -// // concatenate current text matrix with CTM and then multiply -// // relative positioning coordinates with the resulting matrix. -// // -// Matrix2D mtx = ctm.multiply(textElement.getTextMatrix()); -// java.awt.geom.Point2D.Double t = mtx.multPoint(x, y); -// x = t.x; -// y = t.y; -// System.out.println(" Position: x=" + x + " y=" + y ); -// } - -// var in = textElement.getCTM().inverse(); -// -// var p1 = in.multPoint(rect.getX1(), rect.getY1()); -// var p4 = in.multPoint(rect.getX2(), rect.getY2()); - - - - - -// Vector start = textMatrix.transform(new Vector(0, 0)); -// Vector end = new Vector(start.getX() + text.getWidth(), start.getY()); - - if(clippingPathStack.getCurrentClippingPath().contains(rect.getX1(), rect.getY1(),rect.getWidth(), rect.getHeight())) { - writer.writeElement(textElement); - } - - -// } - } - - - @SneakyThrows - private void processPath(Element pathElement,ElementReader reader, ElementWriter writer, ClippingPathStack clippingPathStack) { - -// System.out.println("New Path"); - -// System.out.println("ClippingPath: " + pathElement.isClippingPath()); -// System.out.println("ClipWindingFill: " + pathElement.isClipWindingFill()); -// System.out.println("WindingFill: " + pathElement.isWindingFill()); -// System.out.println("Stroke: " + pathElement.isStroked()); -// System.out.println("Filled: " + pathElement.isFilled()); - - GeneralPath linePath = new GeneralPath(); - - PathData pathData = pathElement.getPathData(); - double[] dataPoints = pathData.getPoints(); - byte[] opr = pathData.getOperators(); - - double x1, y1, x2, y2, x3, y3; - - int data_index = 0; - for (int opr_index = 0; opr_index < opr.length; ++opr_index) { - switch (opr[opr_index]) { - case PathData.e_moveto: - x1 = dataPoints[data_index]; - ++data_index; - y1 = dataPoints[data_index]; - ++data_index; -// System.out.println(" M" + x1 + " " + y1); - - linePath.moveTo(x1, y1); - - - break; - case PathData.e_lineto: - x1 = dataPoints[data_index]; - ++data_index; - y1 = dataPoints[data_index]; - ++data_index; -// System.out.println(" L" + x1 + " " + y1); - - linePath.lineTo(x1, y1); - break; - case PathData.e_cubicto: - x1 = dataPoints[data_index]; - ++data_index; - y1 = dataPoints[data_index]; - ++data_index; - x2 = dataPoints[data_index]; - ++data_index; - y2 = dataPoints[data_index]; - ++data_index; - x3 = dataPoints[data_index]; - ++data_index; - y3 = dataPoints[data_index]; - ++data_index; -// System.out.println(" CU P1 " + x1 + " " + y1 + " P2 " + x2 + " " + y2 + " P3 " + x3 + " " + y3); - - linePath.curveTo(x1, y1, x2, y2, x3, y3); - - break; - case PathData.e_rect: { - x1 = dataPoints[data_index]; - ++data_index; - y1 = dataPoints[data_index]; - ++data_index; - double w = dataPoints[data_index]; - ++data_index; - double h = dataPoints[data_index]; - ++data_index; - x2 = x1 + w; - y2 = y1; - x3 = x2; - y3 = y1 + h; - double x4 = x1; - double y4 = y3; -// System.out.println(" RE P1 " + x1 + " " + y1 + " P2 " + x2 + " " + y2 + " P3 " + x3 + " " + y3 + " P4 " + x4 + " " + y4); - - - linePath.moveTo(x1, y1); - linePath.lineTo(x2, y2); - linePath.lineTo(x3, y3); - linePath.lineTo(x4, y4); - - } - - break; - case PathData.e_closepath: - linePath.closePath(); - break; - default: - throw new PDFNetException("Invalid Element Type", 0, "", "", ""); - } - - - } - - - - // ClipWindingFill = true W = non-zero - // ClipWindingFill = false W* = even-odd - - - if(pathElement.isClippingPath()){ - if(pathElement.isClipWindingFill()){ - linePath.setWindingRule(GeneralPath.WIND_NON_ZERO); - } else { - linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD); - } - clippingPathStack.intersectClippingPath(linePath, pathElement.getCTM()); - } - - - - writer.writeElement(pathElement); - } - - - @SneakyThrows - private void processForm(ElementReader reader, ElementWriter writer, Element element, Set visited, ClippingPathStack clippingPathStack) { - - writer.writeElement(element); - Obj formObj = element.getXObject(); - - if (!visited.contains((int) formObj.getObjNum())) { - visited.add((int) formObj.getObjNum()); - System.out.println("Form num:" +(int) formObj.getObjNum()); - ElementWriter new_writer = new ElementWriter(); - reader.formBegin(); - new_writer.begin(formObj); - - reader.clearChangeList(); - new_writer.setDefaultGState(reader); - - processElements(reader, new_writer, visited, clippingPathStack); - new_writer.end(); - reader.end(); - } - } - } diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/InvisibleElementServiceTest.java b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/InvisibleElementServiceTest.java new file mode 100644 index 0000000..709ea4c --- /dev/null +++ b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/InvisibleElementServiceTest.java @@ -0,0 +1,64 @@ +package com.iqser.red.service.ocr.v1.server; + +import com.iqser.red.service.ocr.v1.server.service.InvisibleElementService; +import lombok.SneakyThrows; +import org.apache.commons.io.FileUtils; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.springframework.amqp.rabbit.core.RabbitTemplate; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; +import org.springframework.boot.test.mock.mockito.MockBean; +import org.springframework.context.annotation.Import; +import org.springframework.core.io.ClassPathResource; +import org.springframework.test.context.junit.jupiter.SpringExtension; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; + +import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory; + +@ExtendWith(SpringExtension.class) +@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT // + , properties = {"pdftron.ocrmodule.path=/YourOCRModulePath"}) +@Import(OcrServiceIntegrationTest.TestConfiguration.class) +public class InvisibleElementServiceTest { + + @Autowired + private InvisibleElementService invisibleElementService; + + @MockBean + protected RabbitTemplate rabbitTemplate; + + @Test + @SneakyThrows + public void testRemoveInvisibleText() { + String fileName = "InvisibleText"; + + ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf"); + + var initialFileBytes = Files.readAllBytes(pdfFileResource.getFile().toPath()); + + var fileWithoutInvisibleText = invisibleElementService.removeInvisibleTextOrPathElements(initialFileBytes, false); + var deltaFile = invisibleElementService.removeInvisibleTextOrPathElements(initialFileBytes, true); + + String fileWithoutInvisibleTextLocation = getTemporaryDirectory() + "/" + fileName + ".pdf"; + String deltaFileLocation = getTemporaryDirectory() + "/" + fileName + "_delta.pdf"; + + saveToFile(fileWithoutInvisibleTextLocation, fileWithoutInvisibleText); + saveToFile(deltaFileLocation, deltaFile); + + System.out.println("File:" + fileWithoutInvisibleTextLocation); + System.out.println("File:" + deltaFileLocation); + } + + private void saveToFile(String location, byte[] fileBytes) { + try (var f_out = FileUtils.openOutputStream(new File(location))) { + f_out.write(fileBytes); + } catch (IOException e) { + throw new RuntimeException("File location: " + location + "could not be openend, no file will be saved"); + } + + } +} diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java index 9e4e9f5..ceb9db3 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java +++ b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java @@ -1,10 +1,13 @@ package com.iqser.red.service.ocr.v1.server; -import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; - +import com.fasterxml.jackson.databind.ObjectMapper; +import com.iqser.red.service.ocr.v1.server.service.FileStorageService; +import com.iqser.red.service.ocr.v1.server.service.OCRService; +import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService; +import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType; +import com.iqser.red.storage.commons.StorageAutoConfiguration; +import com.iqser.red.storage.commons.service.StorageService; +import lombok.SneakyThrows; import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.junit.jupiter.api.AfterEach; @@ -24,15 +27,10 @@ import org.springframework.context.annotation.Primary; import org.springframework.core.io.ClassPathResource; import org.springframework.test.context.junit.jupiter.SpringExtension; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.iqser.red.service.ocr.v1.server.utils.FileSystemBackedStorageService; -import com.iqser.red.service.ocr.v1.server.service.OCRService; -import com.iqser.red.service.ocr.v1.server.service.FileStorageService; -import com.iqser.red.service.persistence.service.v1.api.model.dossiertemplate.dossier.file.FileType; -import com.iqser.red.storage.commons.StorageAutoConfiguration; -import com.iqser.red.storage.commons.service.StorageService; +import java.io.File; -import lombok.SneakyThrows; +import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory; +import static org.assertj.core.api.Assertions.assertThat; @ExtendWith(SpringExtension.class) @SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT // @@ -86,10 +84,10 @@ public class OcrServiceIntegrationTest { @SneakyThrows public void testRemoveInvisibleText() { - String fileName = "InvisiblePathElements"; + String fileName = "ocr/OCR Docs/MK244 - Fitness of Analytical Method - Physical-Chemical Pro"; // String fileName = "InvisiblePathElements"; - ClassPathResource imageInfoResource = new ClassPathResource("files/" + fileName + ".IMAGE_INFO.json"); + ClassPathResource imageInfoResource = new ClassPathResource("files/InvisibleText.IMAGE_INFO.json"); ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf"); var originId = FileStorageService.getStorageId("dossier", "file", FileType.ORIGIN); @@ -107,8 +105,6 @@ public class OcrServiceIntegrationTest { } - - @SneakyThrows public void dummyTest() { diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/FooterText.pdf b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/FooterText.pdf new file mode 100644 index 0000000..93452da Binary files /dev/null and b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/FooterText.pdf differ diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/LargeTableCoveringText.pdf b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/LargeTableCoveringText.pdf new file mode 100644 index 0000000..e6d9a07 Binary files /dev/null and b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/LargeTableCoveringText.pdf differ diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/testDmitryK.pdf b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/testDmitryK.pdf new file mode 100644 index 0000000..b15b478 Binary files /dev/null and b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/testDmitryK.pdf differ diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/testFailed.pdf b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/testFailed.pdf new file mode 100644 index 0000000..1b653ab Binary files /dev/null and b/ocr-service-v1/ocr-service-server-v1/src/test/resources/files/testFailed.pdf differ