diff --git a/ocr-service-v1/ocr-service-server-v1/pom.xml b/ocr-service-v1/ocr-service-server-v1/pom.xml index 8085f1e..61e0ec0 100644 --- a/ocr-service-v1/ocr-service-server-v1/pom.xml +++ b/ocr-service-v1/ocr-service-server-v1/pom.xml @@ -86,16 +86,6 @@ - - org.apache.maven.plugins - maven-compiler-plugin - - - lombok.launch.AnnotationProcessorHider$AnnotationProcessor - com.dslplatform.json.processor.CompiledJsonAnnotationProcessor - - - pl.project13.maven diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ClippingPathStack.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ClippingPathStack.java index 2b84b66..5e3c36a 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ClippingPathStack.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ClippingPathStack.java @@ -1,53 +1,67 @@ package com.iqser.red.service.ocr.v1.server.model; -import com.pdftron.pdf.Rect; -import lombok.Data; -import lombok.SneakyThrows; +import static com.iqser.red.service.ocr.v1.server.service.InvisibleElementRemovalService.TOLERANCE; import java.awt.geom.Area; import java.awt.geom.GeneralPath; import java.awt.geom.Rectangle2D; -import java.util.ArrayDeque; import java.util.Deque; +import java.util.LinkedList; + +import com.pdftron.pdf.Rect; + +import lombok.Data; +import lombok.SneakyThrows; @Data public class ClippingPathStack { - private Deque stack = new ArrayDeque<>(); + private Deque stack = new LinkedList<>(); + @SneakyThrows public ClippingPathStack(Rect rectangle) { + stack.push(new Area(new Rectangle2D.Double(rectangle.getX1(), rectangle.getY1(), rectangle.getWidth(), rectangle.getHeight()).getBounds2D())); } @SneakyThrows public void intersectClippingPath(GeneralPath path) { + getCurrentClippingPath().intersect(new Area(path)); } + public boolean almostIntersects(double x, double y, double width, double height) { + // To address inconsistencies in the calculation of the bounding box we slightly increase the rectangle // Height or width are zero for straight lines, even though they are being rendered. Therefore, height or width must be at minimum >0. - double tolerance = 1e-3; - double x_with_tolerance = x > 0 ? x - tolerance : x + tolerance; - double y_with_tolerance = y > 0 ? y - tolerance : y + tolerance; - double width_with_tolerance = width + 2 * tolerance; - double height_with_tolerance = height + 2 * tolerance; + + double x_with_tolerance = x > 0 ? x - TOLERANCE : x + TOLERANCE; + double y_with_tolerance = y > 0 ? y - TOLERANCE : y + TOLERANCE; + double width_with_tolerance = width + (2 * TOLERANCE); + double height_with_tolerance = height + (2 * TOLERANCE); return getCurrentClippingPath().intersects(x_with_tolerance, y_with_tolerance, width_with_tolerance, height_with_tolerance); } + public Area getCurrentClippingPath() { + return stack.peek(); } + public void enterNewGState() { + Area current = stack.peek(); Area cloned = new Area(); cloned.add(current); stack.push(cloned); } + public void leaveGState() { + stack.pop(); } diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ElementFeatures.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ElementFeatures.java index 322f226..87c625c 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ElementFeatures.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ElementFeatures.java @@ -1,148 +1,170 @@ package com.iqser.red.service.ocr.v1.server.model; +import static com.iqser.red.service.ocr.v1.server.service.InvisibleElementRemovalService.TOLERANCE; + +import java.awt.geom.Rectangle2D; + import com.pdftron.common.PDFNetException; import com.pdftron.pdf.Element; import com.pdftron.pdf.Rect; -import lombok.*; + +import lombok.AccessLevel; +import lombok.EqualsAndHashCode; +import lombok.Getter; +import lombok.SneakyThrows; +import lombok.experimental.FieldDefaults; import lombok.experimental.SuperBuilder; -import java.awt.geom.Rectangle2D; - -@Data +@Getter @SuperBuilder -@NoArgsConstructor -@AllArgsConstructor -public abstract class ElementFeatures { - private int elementType; - private Rectangle2D boundingBox; +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) +public class ElementFeatures { + + int elementType; + Rectangle2D boundingBox; + public boolean almostMatches(Element element) throws PDFNetException { - if (element.getType() != elementType) return false; - if (element.getBBox() == null) return false; - return rectsAlmostMatch(element.getBBox()); + + return element.getType() == elementType && // + element.getBBox() != null && // + rectsAlmostMatch(element.getBBox()); } + protected boolean almostEqual(double a, double b) { - double tolerance = 1e-3; - return Math.abs(a - b) < tolerance; + + return Math.abs(a - b) < TOLERANCE; } + @SneakyThrows private boolean rectsAlmostMatch(Rect bBox) { - if (!almostEqual(bBox.getX1(), boundingBox.getX())) return false; - if (!almostEqual(bBox.getY1(), boundingBox.getY())) return false; - if (!almostEqual(bBox.getWidth(), boundingBox.getWidth())) return false; - return almostEqual(bBox.getHeight(), boundingBox.getHeight()); + // To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance + + return almostEqual(bBox.getX1(), boundingBox.getX()) && // + almostEqual(bBox.getY1(), boundingBox.getY()) && // + almostEqual(bBox.getWidth(), boundingBox.getWidth()) && // + almostEqual(bBox.getHeight(), boundingBox.getHeight()); } + @EqualsAndHashCode(callSuper = true) - @Data + @Getter @SuperBuilder - @NoArgsConstructor - @AllArgsConstructor - public static class Text extends ElementFeatures { - private String text; - private int font; - private double fontsize; + @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) + private static class Text extends ElementFeatures { + + String text; + int font; + double fontsize; + @Override public boolean almostMatches(Element element) throws PDFNetException { - if (!super.almostMatches(element)) return false; - if (!text.equals(element.getTextString())) return false; - if (font != element.getGState().getFont().getType()) return false; - return almostEqual(fontsize, element.getGState().getFontSize()); + + return super.almostMatches(element) && // + text.equals(element.getTextString()) && // + font == element.getGState().getFont().getType() && // + almostEqual(fontsize, element.getGState().getFontSize()); } } @EqualsAndHashCode(callSuper = true) - @Data + @Getter @SuperBuilder - @NoArgsConstructor - @AllArgsConstructor - public static class Path extends ElementFeatures { - private boolean isClippingPath; - private boolean isClipWindingFill; - private boolean isStroked; - private boolean isFilled; - private boolean isWindingFill; + @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) + private static class Path extends ElementFeatures { + + boolean isClippingPath; + boolean isClipWindingFill; + boolean isStroked; + boolean isFilled; + boolean isWindingFill; + @Override public boolean almostMatches(Element element) throws PDFNetException { - if (!super.almostMatches(element)) return false; - if (isClippingPath != element.isClippingPath()) return false; - if (isClipWindingFill != element.isClipWindingFill()) return false; - if (isStroked != element.isStroked()) return false; - if (isFilled != element.isFilled()) return false; - if (isWindingFill != element.isWindingFill()) return false; - return true; + return super.almostMatches(element) && // + isClippingPath == element.isClippingPath() && // + isClipWindingFill == element.isClipWindingFill() && // + isStroked == element.isStroked() && // + isFilled == element.isFilled() && // + isWindingFill == element.isWindingFill(); + } + } @EqualsAndHashCode(callSuper = true) - @Data + @Getter @SuperBuilder - @NoArgsConstructor - public static class Image extends ElementFeatures { - private int dataSize; - private int height; - private int width; - private int renderingIntent; - private int componentNum; - private int bitsPerComponent; + @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) + private static class Image extends ElementFeatures { + + int dataSize; + int height; + int width; + int renderingIntent; + int componentNum; + int bitsPerComponent; + @Override public boolean almostMatches(Element element) throws PDFNetException { - if (!super.almostMatches(element)) return false; - if (dataSize != element.getImageDataSize()) return false; - if (height != element.getImageHeight()) return false; - if (width != element.getImageWidth()) return false; - if (renderingIntent != element.getImageRenderingIntent()) return false; - if (componentNum != element.getComponentNum()) return false; - if (bitsPerComponent != element.getBitsPerComponent()) return false; - return true; + + return super.almostMatches(element) && // + dataSize == element.getImageDataSize() && // + height == element.getImageHeight() && // + width == element.getImageWidth() && // + renderingIntent == element.getImageRenderingIntent() && // + componentNum == element.getComponentNum() && // + bitsPerComponent == element.getBitsPerComponent(); } + } + public static ElementFeatures extractFeatures(Element element) throws PDFNetException { - switch (element.getType()) { - case Element.e_path: - return ElementFeatures.Path.builder() - .elementType(element.getType()) - .boundingBox(toRectangle2D(element.getBBox())) - .isClippingPath(element.isClippingPath()) - .isClipWindingFill(element.isClipWindingFill()) - .isStroked(element.isStroked()) - .isFilled(element.isFilled()) - .isWindingFill(element.isWindingFill()) - .build(); - case Element.e_text: - return ElementFeatures.Text.builder() - .elementType(element.getType()) - .boundingBox(toRectangle2D(element.getBBox())) - .text(element.getTextString()) - .font(element.getGState().getFont().getType()) - .fontsize(element.getGState().getFontSize()) - .build(); - case Element.e_image: - case Element.e_inline_image: - return Image.builder() - .elementType(element.getType()) - .boundingBox(toRectangle2D(element.getBBox())) - .dataSize(element.getImageDataSize()) - .height(element.getImageHeight()) - .width(element.getImageWidth()) - .renderingIntent(element.getImageRenderingIntent()) - .componentNum(element.getComponentNum()) - .bitsPerComponent(element.getBitsPerComponent()) - .build(); - default: - throw new UnsupportedOperationException("Feature Extraction is not supported for PDFTron.Element with type: " + element.getType()); - } + + return switch (element.getType()) { + case Element.e_path -> Path.builder() + .elementType(element.getType()) + .boundingBox(toRectangle2D(element.getBBox())) + .isClippingPath(element.isClippingPath()) + .isClipWindingFill(element.isClipWindingFill()) + .isStroked(element.isStroked()) + .isFilled(element.isFilled()) + .isWindingFill(element.isWindingFill()) + .build(); + case Element.e_text -> Text.builder() + .elementType(element.getType()) + .boundingBox(toRectangle2D(element.getBBox())) + .text(element.getTextString()) + .font(element.getGState().getFont().getType()) + .fontsize(element.getGState().getFontSize()) + .build(); + case Element.e_image, Element.e_inline_image -> Image.builder() + .elementType(element.getType()) + .boundingBox(toRectangle2D(element.getBBox())) + .dataSize(element.getImageDataSize()) + .height(element.getImageHeight()) + .width(element.getImageWidth()) + .renderingIntent(element.getImageRenderingIntent()) + .componentNum(element.getComponentNum()) + .bitsPerComponent(element.getBitsPerComponent()) + .build(); + // This technically should never happen, it's a safetynet + default -> throw new RuntimeException("Feature Extraction is not supported for PDFTron.Element with type: " + element.getType()); + }; } + private static Rectangle2D toRectangle2D(Rect rect) throws PDFNetException { + return new Rectangle2D.Double(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight()); } + } diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/InvisibleElementRemovalDto.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/InvisibleElementRemovalDto.java new file mode 100644 index 0000000..6d7f044 --- /dev/null +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/InvisibleElementRemovalDto.java @@ -0,0 +1,25 @@ +package com.iqser.red.service.ocr.v1.server.model; + +import java.util.List; +import java.util.Set; + +import com.pdftron.pdf.ElementReader; + +import lombok.AccessLevel; +import lombok.Builder; +import lombok.Data; +import lombok.experimental.FieldDefaults; + +@Data +@Builder +@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true) +public class InvisibleElementRemovalDto { + + boolean delta; + ElementReader reader; + ClippingPathStack clippingPathStack; + List overlappedElements; + List visibleElements; + Set visitedXObjIds; + +} diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java new file mode 100644 index 0000000..6252b8c --- /dev/null +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java @@ -0,0 +1,419 @@ +package com.iqser.red.service.ocr.v1.server.service; + +import java.awt.Shape; +import java.awt.geom.AffineTransform; +import java.awt.geom.GeneralPath; +import java.awt.geom.Rectangle2D; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Set; +import java.util.TreeSet; + +import org.springframework.stereotype.Service; + +import com.google.common.primitives.Bytes; +import com.google.common.primitives.Doubles; +import com.iqser.red.service.ocr.v1.server.model.ClippingPathStack; +import com.iqser.red.service.ocr.v1.server.model.ElementFeatures; +import com.iqser.red.service.ocr.v1.server.model.InvisibleElementRemovalDto; +import com.pdftron.common.Matrix2D; +import com.pdftron.common.PDFNetException; +import com.pdftron.pdf.ColorPt; +import com.pdftron.pdf.ColorSpace; +import com.pdftron.pdf.Element; +import com.pdftron.pdf.ElementBuilder; +import com.pdftron.pdf.ElementReader; +import com.pdftron.pdf.ElementWriter; +import com.pdftron.pdf.GState; +import com.pdftron.pdf.PDFDoc; +import com.pdftron.pdf.Page; +import com.pdftron.pdf.PageIterator; +import com.pdftron.pdf.PathData; +import com.pdftron.pdf.Rect; +import com.pdftron.sdf.Obj; +import com.pdftron.sdf.SDFDoc; + +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; + +@Slf4j +@Service +public class InvisibleElementRemovalService { + + static public final double TOLERANCE = 1e-3; + + + /** + * Removes all hidden Text, Path and Image Elements from a PDF Document. + * handled cases: + * -Text which is transparent or is set to not render + * -Elements outside of clipping path + * -Elements that have been painted over by visible and filled Paths + * unhandled cases: + * -Elements covered by widely stroked path + * -Elements with the same color as background + * -Any Text set to clipping with its many interactions with other elements + * + * @param pdfFile The PDF file to process + * @param delta If this flag is set only the removed Elements will be written to the output file. + * The Elements are red if they are removed by clipping path, blue for transparency, and a green bounding box for overlap. + * @return The resulting PDF File as bytes. + **/ + @SneakyThrows + public byte[] removeInvisibleElements(InputStream pdfFile, boolean delta) { + + PDFDoc pdfDoc = new PDFDoc(pdfFile); + + ElementWriter writer = new ElementWriter(); + ElementReader reader = new ElementReader(); + Set visitedXObjIds = new TreeSet<>(); + + for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) { + + Page page = iterator.next(); + + visitedXObjIds.add(page.getSDFObj().getObjNum()); + InvisibleElementRemovalDto dto = InvisibleElementRemovalDto.builder() + .reader(reader) + .clippingPathStack(new ClippingPathStack(page.getMediaBox())) + .delta(delta) + .overlappedElements(new ArrayList<>()) + .visibleElements(new ArrayList<>()) + .visitedXObjIds(visitedXObjIds) + .build(); + + removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, dto); + + dto.getVisitedXObjIds().clear(); + + removeOverlappedElements(page, writer, dto); + } + return pdfDoc.save(SDFDoc.SaveMode.REMOVE_UNUSED, null); + } + + + private void removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(Page page, ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException { + + dto.getReader().begin(page); + writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict()); + processElements(writer, dto); + writer.end(); + dto.getReader().end(); + } + + + private void processElements(ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException { + + for (Element element = dto.getReader().next(); element != null; element = dto.getReader().next()) + switch (element.getType()) { + case Element.e_image, Element.e_inline_image -> processImages(element, writer, dto); + case Element.e_text -> processText(element, writer, dto); + case Element.e_path -> processPath(element, writer, dto); + case Element.e_form -> processForm(element, writer, dto); + case Element.e_group_begin -> { + dto.getClippingPathStack().enterNewGState(); + writer.writeElement(element); + } + case Element.e_group_end -> { + dto.getClippingPathStack().leaveGState(); + writer.writeElement(element); + } + default -> writer.writeElement(element); + } + } + + + private void processImages(Element imageElement, ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException { + + Rect rect = imageElement.getBBox(); + + if (rect == null) { + return; + } + + boolean inClippingPath = dto.getClippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight()); + + if (!dto.isDelta() && inClippingPath) { + dto.getVisibleElements().add(ElementFeatures.extractFeatures(imageElement)); + } + + if (dto.isDelta() ^ inClippingPath) { + writer.writeElement(imageElement); + } + } + + + private void processText(Element textElement, ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException { + + Rect rect = textElement.getBBox(); + + if (rect == null) { + writer.writeElement(textElement); + return; + } + + GState gState = textElement.getGState(); + + boolean inClippingPath = dto.getClippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight()); + + boolean isTextVisible = isTextRenderedVisibly(gState); + + if (inClippingPath && isTextVisible) { + dto.getVisibleElements().add(ElementFeatures.extractFeatures(textElement)); + } + if (!dto.isDelta()) { + if (inClippingPath && isTextVisible) { + writer.writeElement(textElement); + } else if (textElement.hasTextMatrix()) { + /* + PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element. + hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands. + Therefore, the position of a following Tj is affected by not writing the first Element. + This is why, we write only the Tm command: + */ + writer.writeGStateChanges(textElement); + } + } else { + if (!inClippingPath) { + gState.setFillColorSpace(ColorSpace.createDeviceRGB()); + gState.setFillColor(new ColorPt(1, 0, 0)); + writer.writeElement(textElement); + } + if (!isTextVisible) { + gState.setFillColorSpace(ColorSpace.createDeviceRGB()); + gState.setFillColor(new ColorPt(0, 0, 1)); + gState.setTextRenderMode(GState.e_fill_text); + gState.setFillOpacity(1); + writer.writeElement(textElement); + } + } + } + + + private void processForm(Element formElement, ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException { + + writer.writeElement(formElement); + Obj formObj = formElement.getXObject(); + + if (!dto.getVisitedXObjIds().contains(formObj.getObjNum())) { + dto.getVisitedXObjIds().add(formObj.getObjNum()); + // writer needs to be newly initialized when entering a new content stream + // see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest) + ElementWriter formWriter = new ElementWriter(); + dto.getReader().formBegin(); + formWriter.begin(formObj); + + dto.getReader().clearChangeList(); + formWriter.setDefaultGState(dto.getReader()); + + processElements(formWriter, dto); + formWriter.end(); + dto.getReader().end(); + } + } + + + private void processPath(Element pathElement, ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException { + + GeneralPath linePath = convertToGeneralPath(pathElement.getPathData()); + + //transform path to initial user space + var ctm = pathElement.getCTM(); + var affineTransform = getAffineTransform(ctm); + linePath.transform(affineTransform); + + var rect = linePath.getBounds2D(); + + boolean inClippingPath = dto.getClippingPathStack().almostIntersects(rect.getX(), rect.getY(), rect.getWidth(), rect.getHeight()); + + if (pathElement.isClippingPath()) { + if (pathElement.isClipWindingFill()) { + linePath.setWindingRule(GeneralPath.WIND_NON_ZERO); + } else { + linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD); + } + + dto.getClippingPathStack().intersectClippingPath(linePath); + pathElement.setPathClip(!dto.isDelta()); + writer.writeElement(pathElement); + + } else { + if (inClippingPath) { + // TODO: WINDING RULE + if (isFilledAndNonTransparent(pathElement)) { + List currentOverlappedElements = dto.getVisibleElements() + .stream() + .filter(features -> almostContains(linePath, features.getBoundingBox())) + .toList(); + dto.getOverlappedElements().addAll(currentOverlappedElements); + dto.getVisibleElements().removeAll(currentOverlappedElements); + } + dto.getVisibleElements().add(ElementFeatures.extractFeatures(pathElement)); + if (!dto.isDelta()) { + writer.writeElement(pathElement); + } + } + if (dto.isDelta() && !inClippingPath) { + pathElement.getGState().setFillColorSpace(ColorSpace.createDeviceRGB()); + pathElement.getGState().setFillColor(new ColorPt(1, 0, 0)); + pathElement.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB()); + pathElement.getGState().setStrokeColor(new ColorPt(1, 0, 0)); + writer.writeElement(pathElement); + } + } + } + + + private static AffineTransform getAffineTransform(Matrix2D ctm) throws PDFNetException { + + return new AffineTransform(ctm.getA(), ctm.getB(), ctm.getC(), ctm.getD(), ctm.getH(), ctm.getV()); + } + + + private void removeOverlappedElements(Page page, ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException { + + dto.getReader().begin(page); + writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict()); + if (dto.isDelta()) { + dto.getOverlappedElements().forEach(feature -> drawBBox(writer, feature.getBoundingBox(), "#00FF00")); + dto.getOverlappedElements().clear(); + } + processOverlappedElements(writer, dto); + writer.end(); + dto.getReader().end(); + + if (dto.getOverlappedElements().size() > 0) { + log.warn(dto.getOverlappedElements().size() + " overlapped elements have not been found or removed"); + } + } + + + private void processOverlappedElements(ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException { + + for (Element element = dto.getReader().next(); element != null; element = dto.getReader().next()) { + switch (element.getType()) { + case Element.e_form -> processFormOverlappedElements(writer, element, dto); + case Element.e_path, Element.e_image, Element.e_inline_image, Element.e_text -> { + boolean anyMatch = false; + for (ElementFeatures elementToRemove : dto.getOverlappedElements()) { + if (elementToRemove.almostMatches(element)) { + dto.getOverlappedElements().remove(elementToRemove); + anyMatch = true; + break; + } + } + if (!anyMatch) { + writer.writeElement(element); + } else if (element.getType() == 3 && element.hasTextMatrix()) { + /* + PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element. + hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands. + Therefore, the position of a following Tj is affected by not writing the first Element. + This is why, we write only the Tm command: + */ + writer.writeGStateChanges(element); + } + } + default -> writer.writeElement(element); + } + } + } + + + private void processFormOverlappedElements(ElementWriter writer, Element formElement, InvisibleElementRemovalDto dto) throws PDFNetException { + + writer.writeElement(formElement); + Obj formObj = formElement.getXObject(); + + if (!dto.getVisitedXObjIds().contains(formObj.getObjNum())) { + dto.getVisitedXObjIds().add(formObj.getObjNum()); + // writer needs to be newly initialized when entering a new content stream + // see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest) + ElementWriter formWriter = new ElementWriter(); + dto.getReader().formBegin(); + formWriter.begin(formObj); + + dto.getReader().clearChangeList(); + formWriter.setDefaultGState(dto.getReader()); + + processOverlappedElements(formWriter, dto); + formWriter.end(); + dto.getReader().end(); + } + } + + + private boolean isTextRenderedVisibly(GState gState) throws PDFNetException { + + return gState.getTextRenderMode() != GState.e_invisible_text && // + !(gState.getTextRenderMode() == GState.e_fill_text && gState.getFillOpacity() == 0) && // + !(gState.getTextRenderMode() == GState.e_stroke_text && gState.getStrokeOpacity() == 0) && // + !(gState.getTextRenderMode() == GState.e_fill_stroke_text && gState.getFillOpacity() == 0 && gState.getStrokeOpacity() == 0); + } + + + private GeneralPath convertToGeneralPath(PathData pathData) throws PDFNetException { + + GeneralPath linePath = new GeneralPath(); + Iterator points = Doubles.asList(pathData.getPoints()).iterator(); + Iterable operators = Bytes.asList(pathData.getOperators()); + for (var operator : operators) { + switch (operator) { + case PathData.e_moveto -> linePath.moveTo(points.next(), points.next()); + case PathData.e_lineto -> linePath.lineTo(points.next(), points.next()); + case PathData.e_cubicto -> linePath.curveTo(points.next(), points.next(), points.next(), points.next(), points.next(), points.next()); + case PathData.e_closepath -> linePath.closePath(); + case PathData.e_rect -> { + double x = points.next(); + double y = points.next(); + double w = points.next(); + double h = points.next(); + linePath.moveTo(x, y); + linePath.lineTo(x + w, y); + linePath.lineTo(x + w, y + h); + linePath.lineTo(x, y + h); + linePath.closePath(); + } + default -> throw new PDFNetException("Invalid Element Type", 0, "", "", ""); + } + } + return linePath; + } + + + private boolean almostContains(Shape outer, Rectangle2D inner) { + //To address inconsistencies in the calculation of the bounding box we slightly shrink the inner rectangle + + double x_with_tolerance = inner.getX() >= 0 ? inner.getX() + TOLERANCE : inner.getX() - TOLERANCE; + double y_with_tolerance = inner.getY() >= 0 ? inner.getY() + TOLERANCE : inner.getY() - TOLERANCE; + double height_with_tolerance = inner.getHeight() - (2 * TOLERANCE); + double width_with_tolerance = inner.getWidth() - (2 * TOLERANCE); + Rectangle2D innerRect = new Rectangle2D.Double(x_with_tolerance, y_with_tolerance, width_with_tolerance, height_with_tolerance); + + return outer.contains(innerRect); + } + + + private boolean isFilledAndNonTransparent(Element element) throws PDFNetException { + + return element.isFilled() && element.getGState().getFillOpacity() == 1; + } + + + @SneakyThrows + private void drawBBox(ElementWriter writer, Rectangle2D r, String hexcolor) { + + ColorPt colorPt = new ColorPt(Integer.valueOf(hexcolor.substring(1, 3), 16) / 255d, + Integer.valueOf(hexcolor.substring(3, 5), 16) / 255d, + Integer.valueOf(hexcolor.substring(5, 7), 16) / 255d); + ElementBuilder eb = new ElementBuilder(); + Element rect = eb.createRect(r.getX(), r.getY(), r.getWidth(), r.getHeight()); + rect.setPathStroke(true); + rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB()); + rect.getGState().setStrokeColor(colorPt); + writer.writePlacedElement(rect); + } + +} \ No newline at end of file diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementService.java deleted file mode 100644 index 14d0be0..0000000 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementService.java +++ /dev/null @@ -1,448 +0,0 @@ -package com.iqser.red.service.ocr.v1.server.service; - -import com.iqser.red.service.ocr.v1.server.model.ClippingPathStack; -import com.iqser.red.service.ocr.v1.server.model.ElementFeatures; -import com.pdftron.common.Matrix2D; -import com.pdftron.common.PDFNetException; -import com.pdftron.pdf.*; -import com.pdftron.sdf.Obj; -import com.pdftron.sdf.SDFDoc; -import lombok.SneakyThrows; -import lombok.extern.slf4j.Slf4j; -import org.springframework.stereotype.Service; - -import java.awt.*; -import java.awt.geom.AffineTransform; -import java.awt.geom.GeneralPath; -import java.awt.geom.Rectangle2D; -import java.util.ArrayList; -import java.util.List; -import java.util.Set; -import java.util.TreeSet; -import java.util.stream.Collectors; - -@Slf4j -@Service -public class InvisibleElementService { - - /* - handled cases: - Text which is transparent or is set to not render - Text or Path or Images outside of clipping path - Text or Path or Images that have been painted over by visible and filled Paths - unhandled cases: - Text covered by widely stroked path - Text same color as background - Any Text set to clipping with its many interactions with other elements - */ - @SneakyThrows - public byte[] removeInvisibleElements(byte[] pdfFile, boolean delta) { - PDFDoc pdfDoc = new PDFDoc(pdfFile); - - ElementWriter writer = new ElementWriter(); - ElementReader reader = new ElementReader(); - Set visited = new TreeSet<>(); - - for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) { - Page page = iterator.next(); - List overlappedElements = removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, reader, writer, visited, delta); - visited.clear(); - removeOverlappedElements(page, reader, writer, visited, overlappedElements, delta); - } - - return pdfDoc.save(SDFDoc.SaveMode.REMOVE_UNUSED, null); - } - - - private List removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(Page page, ElementReader reader, ElementWriter writer, Set visited, boolean delta) - throws PDFNetException { - var overlappedElements = new ArrayList(); - var visibleElements = new ArrayList(); - ClippingPathStack clippingPathStack = new ClippingPathStack(page.getMediaBox()); - visited.add((int) page.getSDFObj().getObjNum()); - reader.begin(page); - writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict()); - processElements(reader, writer, visited, clippingPathStack, delta, overlappedElements, visibleElements); - writer.end(); - reader.end(); - return overlappedElements; - } - - private void processElements(ElementReader reader, ElementWriter writer, Set visited, ClippingPathStack clippingPathStack, Boolean delta, - List coveredElements, List visibleElements) - throws PDFNetException { - - for (Element element = reader.next(); element != null; element = reader.next()) - switch (element.getType()) { - - case Element.e_image: - case Element.e_inline_image: - processImages(element, writer, clippingPathStack, delta, visibleElements); - break; - - case Element.e_text: - processText(element, writer, clippingPathStack, delta, visibleElements); - break; - - case Element.e_path: - processPath(element, writer, clippingPathStack, delta, coveredElements, visibleElements); - break; - - case Element.e_form: - processForm(reader, writer, element, visited, clippingPathStack, delta, coveredElements, visibleElements); - break; - - case Element.e_group_begin: - clippingPathStack.enterNewGState(); - writer.writeElement(element); - break; - - case Element.e_group_end: - clippingPathStack.leaveGState(); - writer.writeElement(element); - break; - - default: - writer.writeElement(element); - } - } - - private void processImages(Element imageElement, ElementWriter writer, ClippingPathStack clippingPathStack, Boolean delta, List visibleElements) - throws PDFNetException { - - Rect rect = imageElement.getBBox(); - - if (rect == null) { - return; - } - - boolean inClippingPath = clippingPathStack.almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight()); - - if (!delta && inClippingPath) { - visibleElements.add(ElementFeatures.extractFeatures(imageElement)); - writer.writeElement(imageElement); - } - - if (delta && !inClippingPath) { - writer.writeElement(imageElement); - } - } - - private void processText(Element textElement, ElementWriter writer, ClippingPathStack clippingPathStack, - Boolean delta, List visibleElements) - throws PDFNetException { - - Rect rect = textElement.getBBox(); - - if (rect == null) { - writer.writeElement(textElement); - return; - } - - GState gState = textElement.getGState(); - - boolean inClippingPath = clippingPathStack.almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight()); - - boolean isTextVisible = isTextRenderedVisibly(gState); - - if (inClippingPath && isTextVisible) { - visibleElements.add(ElementFeatures.extractFeatures(textElement)); - } - if (!delta) { - if (inClippingPath && isTextVisible) { - writer.writeElement(textElement); - } else if (textElement.hasTextMatrix()) { - /* - PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element. - hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands. - Therefore, the position of a following Tj is affected by not writing the first Element. - This is why, we write only the Tm command: - */ - writer.writeGStateChanges(textElement); - } - } else { - if (!inClippingPath) { - gState.setFillColorSpace(ColorSpace.createDeviceRGB()); - gState.setFillColor(new ColorPt(1, 0, 0)); - writer.writeElement(textElement); - } - if (!isTextVisible) { - gState.setFillColorSpace(ColorSpace.createDeviceRGB()); - gState.setFillColor(new ColorPt(0, 0, 1)); - gState.setTextRenderMode(GState.e_fill_text); - gState.setFillOpacity(1); - writer.writeElement(textElement); - } - } - } - - private void processForm(ElementReader reader, ElementWriter writer, Element element, Set visited, ClippingPathStack clippingPathStack, Boolean delta, - List coveredElements, List allElements) - throws PDFNetException { - - writer.writeElement(element); - Obj formObj = element.getXObject(); - - if (!visited.contains((int) formObj.getObjNum())) { - visited.add((int) formObj.getObjNum()); - ElementWriter new_writer = new ElementWriter(); - reader.formBegin(); - new_writer.begin(formObj); - - reader.clearChangeList(); - new_writer.setDefaultGState(reader); - - processElements(reader, new_writer, visited, clippingPathStack, delta, coveredElements, allElements); - new_writer.end(); - reader.end(); - } - } - - private void processPath(Element pathElement, ElementWriter writer, ClippingPathStack clippingPathStack, Boolean delta, - List overlappedElements, List visibleElements) - throws PDFNetException { - - GeneralPath linePath = convertToGeneralPath(pathElement.getPathData()); - - //transform path to initial user space - var ctm = pathElement.getCTM(); - var affineTransform = getAffineTransform(ctm); - linePath.transform(affineTransform); - - var rect = linePath.getBounds2D(); - - boolean inClippingPath = clippingPathStack.almostIntersects(rect.getX(), rect.getY(), rect.getWidth(), rect.getHeight()); - - if (pathElement.isClippingPath()) { - if (pathElement.isClipWindingFill()) { - linePath.setWindingRule(GeneralPath.WIND_NON_ZERO); - } else { - linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD); - } - - clippingPathStack.intersectClippingPath(linePath); - pathElement.setPathClip(!delta); - writer.writeElement(pathElement); - - } else { - if (inClippingPath) { - if (isFilledAndNonTransparent(pathElement)) { - List currentOverlappedElements = visibleElements.stream() - .filter(features -> almostContains(linePath, features.getBoundingBox())) - .collect(Collectors.toList()); - overlappedElements.addAll(currentOverlappedElements); - visibleElements.removeAll(currentOverlappedElements); - } - visibleElements.add(ElementFeatures.extractFeatures(pathElement)); - if (!delta) { - writer.writeElement(pathElement); - } - } - if (delta && !inClippingPath) { - pathElement.getGState().setFillColorSpace(ColorSpace.createDeviceRGB()); - pathElement.getGState().setFillColor(new ColorPt(1, 0, 0)); - pathElement.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB()); - pathElement.getGState().setStrokeColor(new ColorPt(1, 0, 0)); - writer.writeElement(pathElement); - } - } - } - - private static AffineTransform getAffineTransform(Matrix2D ctm) throws PDFNetException { - return new AffineTransform(ctm.getA(), ctm.getB(), ctm.getC(), ctm.getD(), ctm.getH(), ctm.getV()); - } - - private void removeOverlappedElements(Page page, ElementReader reader, ElementWriter writer, Set visited, List overlappedElements, boolean delta) - throws PDFNetException { - reader.begin(page); - writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict()); - if (delta) { - overlappedElements.forEach(feature -> drawBBox(writer, feature.getBoundingBox(), "#00FF00")); - overlappedElements.clear(); - } - processOverlappedElements(reader, writer, visited, overlappedElements, delta); - writer.end(); - reader.end(); - - if (overlappedElements.size() > 0) { - log.warn(overlappedElements.size() + " overlapped elements have not been found and removed"); - } - } - - private void processOverlappedElements(ElementReader reader, ElementWriter writer, Set visited, List coveredElements, boolean delta) - throws PDFNetException { - for (Element element = reader.next(); element != null; element = reader.next()) { - switch (element.getType()) { - case Element.e_form: - processFormOverlappedElements(reader, writer, element, visited, coveredElements, delta); - break; - case Element.e_path: - case Element.e_image: - case Element.e_inline_image: - case Element.e_text: - boolean anyMatch = false; - for (ElementFeatures elementToRemove : coveredElements) { - if (elementToRemove.almostMatches(element)) { - coveredElements.remove(elementToRemove); - anyMatch = true; - break; - } - } - if (!anyMatch) { - writer.writeElement(element); - } else if (element.getType() == 3 && element.hasTextMatrix()) { - /* - PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element. - hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands. - Therefore, the position of a following Tj is affected by not writing the first Element. - This is why, we write only the Tm command: - */ - writer.writeGStateChanges(element); - } - break; - - default: - writer.writeElement(element); - - } - } - } - - private void processFormOverlappedElements(ElementReader reader, ElementWriter writer, Element element, Set visited, List elementsToRemove, boolean delta) - throws PDFNetException { - - writer.writeElement(element); - Obj formObj = element.getXObject(); - - if (!visited.contains((int) formObj.getObjNum())) { - visited.add((int) formObj.getObjNum()); - ElementWriter new_writer = new ElementWriter(); - reader.formBegin(); - new_writer.begin(formObj); - - reader.clearChangeList(); - new_writer.setDefaultGState(reader); - - processOverlappedElements(reader, new_writer, visited, elementsToRemove, delta); - new_writer.end(); - reader.end(); - } - } - - private boolean isTextRenderedVisibly(GState gState) throws PDFNetException { - if (gState.getTextRenderMode() == GState.e_invisible_text) return false; - if (gState.getTextRenderMode() == GState.e_fill_text && gState.getFillOpacity() == 0) return false; - if (gState.getTextRenderMode() == GState.e_stroke_text && gState.getStrokeOpacity() == 0) return false; - if (gState.getTextRenderMode() == GState.e_fill_stroke_text && gState.getFillOpacity() == 0 && gState.getStrokeOpacity() == 0) return false; - return true; - } - - private GeneralPath convertToGeneralPath(PathData pathData) throws PDFNetException { - GeneralPath linePath = new GeneralPath(); - - double[] dataPoints = pathData.getPoints(); - byte[] opr = pathData.getOperators(); - - double x1; - double y1; - double x2; - double y2; - double x3; - double y3; - - int data_index = 0; - for (int opr_index = 0; opr_index < opr.length; ++opr_index) { - switch (opr[opr_index]) { - case PathData.e_moveto: - x1 = dataPoints[data_index]; - ++data_index; - y1 = dataPoints[data_index]; - ++data_index; - - linePath.moveTo(x1, y1); - break; - case PathData.e_lineto: - x1 = dataPoints[data_index]; - ++data_index; - y1 = dataPoints[data_index]; - ++data_index; - - linePath.lineTo(x1, y1); - break; - case PathData.e_cubicto: - x1 = dataPoints[data_index]; - ++data_index; - y1 = dataPoints[data_index]; - ++data_index; - x2 = dataPoints[data_index]; - ++data_index; - y2 = dataPoints[data_index]; - ++data_index; - x3 = dataPoints[data_index]; - ++data_index; - y3 = dataPoints[data_index]; - ++data_index; - - linePath.curveTo(x1, y1, x2, y2, x3, y3); - break; - case PathData.e_rect: - x1 = dataPoints[data_index]; - ++data_index; - y1 = dataPoints[data_index]; - ++data_index; - double w = dataPoints[data_index]; - ++data_index; - double h = dataPoints[data_index]; - ++data_index; - x2 = x1 + w; - y2 = y1; - x3 = x2; - y3 = y1 + h; - double x4 = x1; - double y4 = y3; - - linePath.moveTo(x1, y1); - linePath.lineTo(x2, y2); - linePath.lineTo(x3, y3); - linePath.lineTo(x4, y4); - break; - case PathData.e_closepath: - linePath.closePath(); - break; - default: - throw new PDFNetException("Invalid Element Type", 0, "", "", ""); - } - } - return linePath; - } - - private boolean almostContains(Shape outer, Rectangle2D inner) { - double tolerance = 1e-3; - - double x_with_tolerance = inner.getX() >= 0 ? inner.getX() + tolerance : inner.getX() - tolerance; - double y_with_tolerance = inner.getY() >= 0 ? inner.getY() + tolerance : inner.getY() - tolerance; - double height_with_tolerance = inner.getHeight() - (2 * tolerance); - double width_with_tolerance = inner.getWidth() - (2 * tolerance); - Rectangle2D innerRect = new Rectangle2D.Double(x_with_tolerance, y_with_tolerance, width_with_tolerance, height_with_tolerance); - - return outer.contains(innerRect); - } - - private boolean isFilledAndNonTransparent(Element element) throws PDFNetException { - return element.isFilled() && element.getGState().getFillOpacity() == 1; - } - - - @SneakyThrows - private void drawBBox(ElementWriter writer, Rectangle2D r, String hexcolor) { - ColorPt colorPt = new ColorPt(Integer.valueOf(hexcolor.substring(1, 3), 16) / 255d, - Integer.valueOf(hexcolor.substring(3, 5), 16) / 255d, - Integer.valueOf(hexcolor.substring(5, 7), 16) / 255d); - ElementBuilder eb = new ElementBuilder(); - Element rect = eb.createRect(r.getX(), r.getY(), r.getWidth(), r.getHeight()); - rect.setPathStroke(true); - rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB()); - rect.getGState().setStrokeColor(colorPt); - writer.writePlacedElement(rect); - } -} \ No newline at end of file diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java index a14d51a..964a507 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java @@ -1,5 +1,17 @@ package com.iqser.red.service.ocr.v1.server.service; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.springframework.amqp.rabbit.core.RabbitTemplate; +import org.springframework.stereotype.Service; + import com.fasterxml.jackson.databind.ObjectMapper; import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse; import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration; @@ -8,19 +20,17 @@ import com.iqser.red.service.ocr.v1.server.model.image.ImageServiceResponse; import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings; import com.iqser.red.service.redaction.v1.model.Point; import com.iqser.red.service.redaction.v1.model.Rectangle; -import com.pdftron.pdf.*; +import com.pdftron.pdf.OCRModule; +import com.pdftron.pdf.OCROptions; +import com.pdftron.pdf.Optimizer; +import com.pdftron.pdf.PDFDoc; +import com.pdftron.pdf.Page; +import com.pdftron.pdf.RectCollection; import com.pdftron.sdf.SDFDoc; + import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; -import org.apache.commons.io.IOUtils; -import org.springframework.amqp.rabbit.core.RabbitTemplate; -import org.springframework.stereotype.Service; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.InputStream; -import java.util.*; @Slf4j @Service @@ -36,20 +46,18 @@ public class OCRService { private final ObjectMapper objectMapper; - private final InvisibleElementService invisibleElementService; + private final InvisibleElementRemovalService invisibleElementRemovalService; @SneakyThrows public InputStream ocrDocument(String dossierId, String fileId) { - var fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId); - var imageServiceResponse = fileStorageService.getImageServiceResponse(dossierId, fileId); + InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId); + ImageServiceResponse imageServiceResponse = fileStorageService.getImageServiceResponse(dossierId, fileId); - var fileBytes = IOUtils.toByteArray(fileStream); + byte[] fileWithoutInvisibleTextStream = invisibleElementRemovalService.removeInvisibleElements(fileStream, false); - byte[] fileWithoutInvisibleTextBytes = invisibleElementService.removeInvisibleElements(fileBytes, false); - - var ocrBytes = ocr(fileWithoutInvisibleTextBytes, fileId, imageServiceResponse); + byte[] ocrBytes = ocr(fileWithoutInvisibleTextStream, fileId, imageServiceResponse); return new ByteArrayInputStream(ocrBytes); diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/InvisibleElementServiceTest.java b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/InvisibleElementRemovalServiceTest.java similarity index 75% rename from ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/InvisibleElementServiceTest.java rename to ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/InvisibleElementRemovalServiceTest.java index d76f6b0..8517775 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/InvisibleElementServiceTest.java +++ b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/InvisibleElementRemovalServiceTest.java @@ -1,12 +1,12 @@ package com.iqser.red.service.ocr.v1.server; -import com.iqser.red.service.ocr.v1.server.service.InvisibleElementService; -import com.pdftron.pdf.PDFDoc; -import com.pdftron.pdf.Page; -import com.pdftron.pdf.PageIterator; -import com.pdftron.pdf.TextExtractor; -import lombok.SneakyThrows; -import org.apache.commons.io.FileUtils; +import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory; +import static org.assertj.core.api.AssertionsForClassTypes.assertThat; + +import java.io.FileOutputStream; +import java.io.IOException; +import java.nio.file.Files; + import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; import org.springframework.amqp.rabbit.core.RabbitTemplate; @@ -17,62 +17,69 @@ import org.springframework.context.annotation.Import; import org.springframework.core.io.ClassPathResource; import org.springframework.test.context.junit.jupiter.SpringExtension; -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; +import com.iqser.red.service.ocr.v1.server.service.InvisibleElementRemovalService; +import com.pdftron.pdf.PDFDoc; +import com.pdftron.pdf.Page; +import com.pdftron.pdf.PageIterator; +import com.pdftron.pdf.TextExtractor; -import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory; -import static org.assertj.core.api.AssertionsForClassTypes.assertThat; +import lombok.SneakyThrows; @ExtendWith(SpringExtension.class) @SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT // , properties = {"pdftron.ocrmodule.path=/YourOCRModulePath"}) @Import(OcrServiceIntegrationTest.TestConfiguration.class) -public class InvisibleElementServiceTest { +public class InvisibleElementRemovalServiceTest { @Autowired - private InvisibleElementService invisibleElementService; + private InvisibleElementRemovalService invisibleElementRemovalService; @MockBean protected RabbitTemplate rabbitTemplate; + @Test @SneakyThrows public void testRemoveInvisibleText() { + String fileName = "InvisibleText"; ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf"); - var initialFileBytes = Files.readAllBytes(pdfFileResource.getFile().toPath()); + var initialFileStream = Files.newInputStream(pdfFileResource.getFile().toPath()); + var fileWithoutInvisibleElements = invisibleElementRemovalService.removeInvisibleElements(initialFileStream, false); - var fileWithoutInvisibleText = invisibleElementService.removeInvisibleElements(initialFileBytes, false); - var deltaFile = invisibleElementService.removeInvisibleElements(initialFileBytes, true); + initialFileStream = Files.newInputStream(pdfFileResource.getFile().toPath()); + var deltaFile = invisibleElementRemovalService.removeInvisibleElements(initialFileStream, true); String fileWithoutInvisibleTextLocation = getTemporaryDirectory() + "/" + fileName + ".pdf"; String deltaFileLocation = getTemporaryDirectory() + "/" + fileName + "_delta.pdf"; - saveToFile(fileWithoutInvisibleTextLocation, fileWithoutInvisibleText); + saveToFile(fileWithoutInvisibleTextLocation, fileWithoutInvisibleElements); saveToFile(deltaFileLocation, deltaFile); System.out.println("Output File without invisible elements: " + fileWithoutInvisibleTextLocation); System.out.println("Output Delta File: " + deltaFileLocation); TextExtractor extractor = new TextExtractor(); - PDFDoc pdfDoc = new PDFDoc(fileWithoutInvisibleText); + PDFDoc pdfDoc = new PDFDoc(fileWithoutInvisibleElements); PageIterator iterator = pdfDoc.getPageIterator(); while (iterator.hasNext()) { Page page = iterator.next(); extractor.begin(page); String[] text = extractor.getAsText().split("\n"); - assertThat(text).containsAnyOf("APPENDIX 16 Pathology Report", "Amendment 1", "Page 255 of 260"); + assertThat(text).contains("APPENDIX 16 Pathology Report", "Amendment 1", "Page 255 of 260"); } } + private void saveToFile(String location, byte[] fileBytes) { - try (var f_out = FileUtils.openOutputStream(new File(location))) { + + try (var f_out = new FileOutputStream(location)) { f_out.write(fileBytes); } catch (IOException e) { throw new RuntimeException("File location: " + location + "could not be openend, no file will be saved"); } } + } \ No newline at end of file diff --git a/ocr-service-v1/pom.xml b/ocr-service-v1/pom.xml index b9e5d2e..9b40cce 100644 --- a/ocr-service-v1/pom.xml +++ b/ocr-service-v1/pom.xml @@ -7,7 +7,7 @@ com.iqser.red platform-dependency - 1.14.0 + RED-6114-1