diff --git a/ocr-service-v1/ocr-service-server-v1/pom.xml b/ocr-service-v1/ocr-service-server-v1/pom.xml index 38dd26f..052cade 100644 --- a/ocr-service-v1/ocr-service-server-v1/pom.xml +++ b/ocr-service-v1/ocr-service-server-v1/pom.xml @@ -23,6 +23,12 @@ com.iqser.red.commons storage-commons + + com.iqser.red.commons + pdftron-logic-commons + 1.1.0 + + com.iqser.red.commons spring-commons diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/Application.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/Application.java index 3d65e1b..bfa6c70 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/Application.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/Application.java @@ -10,6 +10,7 @@ import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Import; import org.springframework.scheduling.annotation.EnableAsync; +import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService; import com.iqser.red.service.ocr.v1.server.client.FileStatusProcessingUpdateClient; import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration; import com.iqser.red.service.ocr.v1.server.multitenancy.AsyncConfig; @@ -44,4 +45,11 @@ public class Application { return new TimedAspect(registry); } + + @Bean + public InvisibleElementRemovalService invisibleElementRemovalService() { + + return new InvisibleElementRemovalService(); + } + } diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ClippingPathStack.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ClippingPathStack.java deleted file mode 100644 index 5e3c36a..0000000 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ClippingPathStack.java +++ /dev/null @@ -1,68 +0,0 @@ -package com.iqser.red.service.ocr.v1.server.model; - -import static com.iqser.red.service.ocr.v1.server.service.InvisibleElementRemovalService.TOLERANCE; - -import java.awt.geom.Area; -import java.awt.geom.GeneralPath; -import java.awt.geom.Rectangle2D; -import java.util.Deque; -import java.util.LinkedList; - -import com.pdftron.pdf.Rect; - -import lombok.Data; -import lombok.SneakyThrows; - -@Data -public class ClippingPathStack { - - private Deque stack = new LinkedList<>(); - - - @SneakyThrows - public ClippingPathStack(Rect rectangle) { - - stack.push(new Area(new Rectangle2D.Double(rectangle.getX1(), rectangle.getY1(), rectangle.getWidth(), rectangle.getHeight()).getBounds2D())); - } - - - @SneakyThrows - public void intersectClippingPath(GeneralPath path) { - - getCurrentClippingPath().intersect(new Area(path)); - } - - - public boolean almostIntersects(double x, double y, double width, double height) { - // To address inconsistencies in the calculation of the bounding box we slightly increase the rectangle - // Height or width are zero for straight lines, even though they are being rendered. Therefore, height or width must be at minimum >0. - - double x_with_tolerance = x > 0 ? x - TOLERANCE : x + TOLERANCE; - double y_with_tolerance = y > 0 ? y - TOLERANCE : y + TOLERANCE; - double width_with_tolerance = width + (2 * TOLERANCE); - double height_with_tolerance = height + (2 * TOLERANCE); - return getCurrentClippingPath().intersects(x_with_tolerance, y_with_tolerance, width_with_tolerance, height_with_tolerance); - } - - - public Area getCurrentClippingPath() { - - return stack.peek(); - } - - - public void enterNewGState() { - - Area current = stack.peek(); - Area cloned = new Area(); - cloned.add(current); - stack.push(cloned); - } - - - public void leaveGState() { - - stack.pop(); - } - -} \ No newline at end of file diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ElementFeatures.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ElementFeatures.java deleted file mode 100644 index 87c625c..0000000 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ElementFeatures.java +++ /dev/null @@ -1,170 +0,0 @@ -package com.iqser.red.service.ocr.v1.server.model; - -import static com.iqser.red.service.ocr.v1.server.service.InvisibleElementRemovalService.TOLERANCE; - -import java.awt.geom.Rectangle2D; - -import com.pdftron.common.PDFNetException; -import com.pdftron.pdf.Element; -import com.pdftron.pdf.Rect; - -import lombok.AccessLevel; -import lombok.EqualsAndHashCode; -import lombok.Getter; -import lombok.SneakyThrows; -import lombok.experimental.FieldDefaults; -import lombok.experimental.SuperBuilder; - -@Getter -@SuperBuilder -@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) -public class ElementFeatures { - - int elementType; - Rectangle2D boundingBox; - - - public boolean almostMatches(Element element) throws PDFNetException { - - return element.getType() == elementType && // - element.getBBox() != null && // - rectsAlmostMatch(element.getBBox()); - } - - - protected boolean almostEqual(double a, double b) { - - return Math.abs(a - b) < TOLERANCE; - } - - - @SneakyThrows - private boolean rectsAlmostMatch(Rect bBox) { - // To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance - - return almostEqual(bBox.getX1(), boundingBox.getX()) && // - almostEqual(bBox.getY1(), boundingBox.getY()) && // - almostEqual(bBox.getWidth(), boundingBox.getWidth()) && // - almostEqual(bBox.getHeight(), boundingBox.getHeight()); - } - - - @EqualsAndHashCode(callSuper = true) - @Getter - @SuperBuilder - @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) - private static class Text extends ElementFeatures { - - String text; - int font; - double fontsize; - - - @Override - public boolean almostMatches(Element element) throws PDFNetException { - - return super.almostMatches(element) && // - text.equals(element.getTextString()) && // - font == element.getGState().getFont().getType() && // - almostEqual(fontsize, element.getGState().getFontSize()); - } - - } - - @EqualsAndHashCode(callSuper = true) - @Getter - @SuperBuilder - @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) - private static class Path extends ElementFeatures { - - boolean isClippingPath; - boolean isClipWindingFill; - boolean isStroked; - boolean isFilled; - boolean isWindingFill; - - - @Override - public boolean almostMatches(Element element) throws PDFNetException { - - return super.almostMatches(element) && // - isClippingPath == element.isClippingPath() && // - isClipWindingFill == element.isClipWindingFill() && // - isStroked == element.isStroked() && // - isFilled == element.isFilled() && // - isWindingFill == element.isWindingFill(); - - } - - } - - @EqualsAndHashCode(callSuper = true) - @Getter - @SuperBuilder - @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) - private static class Image extends ElementFeatures { - - int dataSize; - int height; - int width; - int renderingIntent; - int componentNum; - int bitsPerComponent; - - - @Override - public boolean almostMatches(Element element) throws PDFNetException { - - return super.almostMatches(element) && // - dataSize == element.getImageDataSize() && // - height == element.getImageHeight() && // - width == element.getImageWidth() && // - renderingIntent == element.getImageRenderingIntent() && // - componentNum == element.getComponentNum() && // - bitsPerComponent == element.getBitsPerComponent(); - } - - } - - - public static ElementFeatures extractFeatures(Element element) throws PDFNetException { - - return switch (element.getType()) { - case Element.e_path -> Path.builder() - .elementType(element.getType()) - .boundingBox(toRectangle2D(element.getBBox())) - .isClippingPath(element.isClippingPath()) - .isClipWindingFill(element.isClipWindingFill()) - .isStroked(element.isStroked()) - .isFilled(element.isFilled()) - .isWindingFill(element.isWindingFill()) - .build(); - case Element.e_text -> Text.builder() - .elementType(element.getType()) - .boundingBox(toRectangle2D(element.getBBox())) - .text(element.getTextString()) - .font(element.getGState().getFont().getType()) - .fontsize(element.getGState().getFontSize()) - .build(); - case Element.e_image, Element.e_inline_image -> Image.builder() - .elementType(element.getType()) - .boundingBox(toRectangle2D(element.getBBox())) - .dataSize(element.getImageDataSize()) - .height(element.getImageHeight()) - .width(element.getImageWidth()) - .renderingIntent(element.getImageRenderingIntent()) - .componentNum(element.getComponentNum()) - .bitsPerComponent(element.getBitsPerComponent()) - .build(); - // This technically should never happen, it's a safetynet - default -> throw new RuntimeException("Feature Extraction is not supported for PDFTron.Element with type: " + element.getType()); - }; - } - - - private static Rectangle2D toRectangle2D(Rect rect) throws PDFNetException { - - return new Rectangle2D.Double(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight()); - } - -} diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java deleted file mode 100644 index e64c1fd..0000000 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java +++ /dev/null @@ -1,466 +0,0 @@ -package com.iqser.red.service.ocr.v1.server.service; - -import java.awt.Shape; -import java.awt.geom.AffineTransform; -import java.awt.geom.GeneralPath; -import java.awt.geom.Rectangle2D; -import java.io.InputStream; -import java.io.OutputStream; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.Set; -import java.util.TreeSet; - -import org.springframework.stereotype.Service; - -import com.google.common.primitives.Bytes; -import com.google.common.primitives.Doubles; -import com.iqser.red.service.ocr.v1.server.model.ClippingPathStack; -import com.iqser.red.service.ocr.v1.server.model.ElementFeatures; -import com.pdftron.common.Matrix2D; -import com.pdftron.common.PDFNetException; -import com.pdftron.pdf.ColorPt; -import com.pdftron.pdf.ColorSpace; -import com.pdftron.pdf.Element; -import com.pdftron.pdf.ElementBuilder; -import com.pdftron.pdf.ElementReader; -import com.pdftron.pdf.ElementWriter; -import com.pdftron.pdf.GState; -import com.pdftron.pdf.PDFDoc; -import com.pdftron.pdf.Page; -import com.pdftron.pdf.PageIterator; -import com.pdftron.pdf.PathData; -import com.pdftron.pdf.Rect; -import com.pdftron.sdf.Obj; -import com.pdftron.sdf.SDFDoc; - -import lombok.Builder; -import lombok.SneakyThrows; -import lombok.extern.slf4j.Slf4j; - -@Slf4j -@Service -public class InvisibleElementRemovalService { - - static public final double TOLERANCE = 1e-3; - - - /** - * Removes all hidden Text, Path and Image Elements from a PDF Document. - * handled cases: - * -Text which is transparent or is set to not render - * -Elements outside of clipping path - * -Elements that have been painted over by visible and filled Paths - * unhandled cases: - * -Elements covered by widely stroked path - * -Elements with the same color as background - * -Any Text set to clipping with its many interactions with other elements - * - * @param pdfFile The PDF file to process - * @param delta If this flag is set only the removed Elements will be written to the output file. - * The Elements are red if they are removed by clipping path, blue for transparency, and a green bounding box for overlap. - * @param out OutputStream to write the resulting file to - **/ - @SneakyThrows - public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta) { - - PDFDoc pdfDoc = new PDFDoc(pdfFile); - - ElementWriter writer = new ElementWriter(); - ElementReader reader = new ElementReader(); - Set visitedXObjIds = new TreeSet<>(); - - for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) { - - Page page = iterator.next(); - - visitedXObjIds.add(page.getSDFObj().getObjNum()); - - - InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder() - .reader(reader) - .clippingPathStack(new ClippingPathStack(page.getMediaBox())) - .delta(delta) - .overlappedElements(new ArrayList<>()) - .visibleElements(new ArrayList<>()) - .visitedXObjIds(visitedXObjIds) - .build(); - - removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context); - - context.visitedXObjIds().clear(); - - removeOverlappedElements(page, writer, context); - } - - try { - pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null); - } catch (Exception e) { - log.error("File could not be saved after invisible element removal"); - throw new RuntimeException(e); - } - - writer.destroy(); - reader.destroy(); - pdfDoc.close(); - } - - - private void removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(Page page, - ElementWriter writer, - InvisibleElementRemovalContext context) throws PDFNetException { - - context.reader().begin(page); - writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict()); - processElements(writer, context); - writer.end(); - context.reader().end(); - } - - - private void processElements(ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { - - for (Element element = context.reader().next(); element != null; element = context.reader().next()) - switch (element.getType()) { - case Element.e_image, Element.e_inline_image -> processImages(element, writer, context); - case Element.e_text -> processText(element, writer, context); - case Element.e_path -> processPath(element, writer, context); - case Element.e_form -> processForm(element, writer, context); - case Element.e_group_begin -> { - context.clippingPathStack().enterNewGState(); - writer.writeElement(element); - } - case Element.e_group_end -> { - context.clippingPathStack().leaveGState(); - writer.writeElement(element); - } - default -> writer.writeElement(element); - } - } - - - private void processImages(Element imageElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { - - Rect rect = imageElement.getBBox(); - - if (rect == null) { - return; - } - - boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight()); - - if (!context.delta() && inClippingPath) { - context.visibleElements().add(ElementFeatures.extractFeatures(imageElement)); - } - - if (context.delta() ^ inClippingPath) { - writer.writeElement(imageElement); - } - } - - - private void processText(Element textElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { - - Rect rect = textElement.getBBox(); - - if (rect == null) { - writer.writeElement(textElement); - return; - } - - GState gState = textElement.getGState(); - - boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight()); - - boolean isTextVisible = isTextRenderedVisibly(gState); - - if (inClippingPath && isTextVisible) { - context.visibleElements().add(ElementFeatures.extractFeatures(textElement)); - } - if (!context.delta()) { - if (inClippingPath && isTextVisible) { - writer.writeElement(textElement); - } else if (textElement.hasTextMatrix()) { - /* - PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element. - hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands. - Therefore, the position of a following Tj is affected by not writing the first Element. - This is why, we write only the Tm command: - */ - writer.writeGStateChanges(textElement); - } - } else { - if (!inClippingPath) { - gState.setFillColorSpace(ColorSpace.createDeviceRGB()); - // red for elements removed by clipping path - gState.setFillColor(new ColorPt(1, 0, 0)); - writer.writeElement(textElement); - } - if (!isTextVisible) { - gState.setFillColorSpace(ColorSpace.createDeviceRGB()); - // blue for elements removed due to transparency or not rendered - gState.setFillColor(new ColorPt(0, 0, 1)); - gState.setTextRenderMode(GState.e_fill_text); - gState.setFillOpacity(1); - writer.writeElement(textElement); - } - } - } - - - private void processForm(Element formElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { - - writer.writeElement(formElement); - Obj formObj = formElement.getXObject(); - - if (!context.visitedXObjIds().contains(formObj.getObjNum())) { - context.visitedXObjIds().add(formObj.getObjNum()); - // writer needs to be newly initialized when entering a new content stream - // see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest) - ElementWriter formWriter = new ElementWriter(); - context.reader().formBegin(); - formWriter.begin(formObj); - - context.reader().clearChangeList(); - formWriter.setDefaultGState(context.reader()); - - processElements(formWriter, context); - formWriter.end(); - formWriter.destroy(); - context.reader().end(); - } - } - - - private void processPath(Element pathElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { - - PathData pathData = pathElement.getPathData(); - - if (pathData.getOperators().length == 0 && pathData.getPoints().length == 0) { - writer.writeGStateChanges(pathElement); - return; - } - - GeneralPath linePath = convertToGeneralPath(pathData); - - //transform path to initial user space - var ctm = pathElement.getCTM(); - var affineTransform = toAffineTransform(ctm); - linePath.transform(affineTransform); - - var rect = linePath.getBounds2D(); - - boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX(), rect.getY(), rect.getWidth(), rect.getHeight()); - - if (pathElement.isClippingPath()) { - if (pathElement.isClipWindingFill()) { - linePath.setWindingRule(GeneralPath.WIND_NON_ZERO); - } else { - linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD); - } - - context.clippingPathStack().intersectClippingPath(linePath); - pathElement.setPathClip(!context.delta()); - writer.writeElement(pathElement); - - } else { - if (pathElement.isWindingFill()) { - linePath.setWindingRule(GeneralPath.WIND_NON_ZERO); - } else { - linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD); - } - - if (inClippingPath) { - if (isFilledAndNonTransparent(pathElement)) { - List currentOverlappedElements = context.visibleElements() - .stream() - .filter(features -> almostContains(linePath, features.getBoundingBox())) - .toList(); - context.overlappedElements().addAll(currentOverlappedElements); - context.visibleElements().removeAll(currentOverlappedElements); - } - context.visibleElements().add(ElementFeatures.extractFeatures(pathElement)); - if (!context.delta()) { - writer.writeElement(pathElement); - } - } - if (context.delta() && !inClippingPath) { - pathElement.getGState().setFillColorSpace(ColorSpace.createDeviceRGB()); - pathElement.getGState().setFillColor(new ColorPt(1, 0, 0)); - pathElement.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB()); - pathElement.getGState().setStrokeColor(new ColorPt(1, 0, 0)); - writer.writeElement(pathElement); - } - } - } - - - private void removeOverlappedElements(Page page, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { - - context.reader().begin(page); - writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict()); - if (context.delta()) { - // green for element removed due to overlapping - context.overlappedElements().forEach(feature -> drawBBox(writer, feature.getBoundingBox(), "#00FF00")); - context.overlappedElements().clear(); - } - processOverlappedElements(writer, context); - writer.end(); - context.reader().end(); - - if (context.overlappedElements().size() > 0) { - log.warn(context.overlappedElements().size() + " overlapped elements have not been found or removed"); - } - } - - - private void processOverlappedElements(ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { - - for (Element element = context.reader().next(); element != null; element = context.reader().next()) { - switch (element.getType()) { - case Element.e_form -> processFormOverlappedElements(writer, element, context); - case Element.e_path, Element.e_image, Element.e_inline_image, Element.e_text -> { - boolean anyMatch = false; - for (ElementFeatures elementToRemove : context.overlappedElements()) { - if (elementToRemove.almostMatches(element)) { - context.overlappedElements().remove(elementToRemove); - anyMatch = true; - break; - } - } - if (!anyMatch) { - writer.writeElement(element); - } else if (element.getType() == 3 && element.hasTextMatrix()) { - /* - PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element. - hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands. - Therefore, the position of a following Tj is affected by not writing the first Element. - This is why, we write only the Tm command: - */ - writer.writeGStateChanges(element); - } - } - default -> writer.writeElement(element); - } - } - } - - - private void processFormOverlappedElements(ElementWriter writer, Element formElement, InvisibleElementRemovalContext context) throws PDFNetException { - - writer.writeElement(formElement); - Obj formObj = formElement.getXObject(); - - if (!context.visitedXObjIds().contains(formObj.getObjNum())) { - context.visitedXObjIds().add(formObj.getObjNum()); - // writer needs to be newly initialized when entering a new content stream - // see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest) - ElementWriter formWriter = new ElementWriter(); - context.reader().formBegin(); - formWriter.begin(formObj); - - context.reader().clearChangeList(); - formWriter.setDefaultGState(context.reader()); - - processOverlappedElements(formWriter, context); - formWriter.end(); - formWriter.destroy(); - context.reader().end(); - } - } - - - private boolean isTextRenderedVisibly(GState gState) throws PDFNetException { - - return gState.getTextRenderMode() != GState.e_invisible_text && // - !(gState.getTextRenderMode() == GState.e_fill_text && gState.getFillOpacity() == 0) && // - !(gState.getTextRenderMode() == GState.e_stroke_text && gState.getStrokeOpacity() == 0) && // - !(gState.getTextRenderMode() == GState.e_fill_stroke_text && gState.getFillOpacity() == 0 && gState.getStrokeOpacity() == 0); - } - - - private GeneralPath convertToGeneralPath(PathData pathData) throws PDFNetException { - - GeneralPath linePath = new GeneralPath(); - Iterator points = Doubles.asList(pathData.getPoints()).iterator(); - Iterable operators = Bytes.asList(pathData.getOperators()); - for (var operator : operators) { - switch (operator) { - case PathData.e_moveto -> linePath.moveTo(points.next(), points.next()); - case PathData.e_lineto -> linePath.lineTo(points.next(), points.next()); - case PathData.e_cubicto -> linePath.curveTo(points.next(), points.next(), points.next(), points.next(), points.next(), points.next()); - case PathData.e_closepath -> linePath.closePath(); - case PathData.e_rect -> { - double x = points.next(); - double y = points.next(); - double w = points.next(); - double h = points.next(); - linePath.moveTo(x, y); - linePath.lineTo(x + w, y); - linePath.lineTo(x + w, y + h); - linePath.lineTo(x, y + h); - linePath.closePath(); - } - default -> throw new PDFNetException("Invalid Element Type", 0, "", "", ""); - } - } - return linePath; - } - - - private boolean almostContains(Shape outer, Rectangle2D inner) { - //To address inconsistencies in the calculation of the bounding box we slightly shrink the inner rectangle - - double x_with_tolerance = inner.getX() >= 0 ? inner.getX() + TOLERANCE : inner.getX() - TOLERANCE; - double y_with_tolerance = inner.getY() >= 0 ? inner.getY() + TOLERANCE : inner.getY() - TOLERANCE; - double height_with_tolerance = inner.getHeight() - (2 * TOLERANCE); - double width_with_tolerance = inner.getWidth() - (2 * TOLERANCE); - Rectangle2D innerRect = new Rectangle2D.Double(x_with_tolerance, y_with_tolerance, width_with_tolerance, height_with_tolerance); - - return outer.contains(innerRect); - } - - - private boolean isFilledAndNonTransparent(Element element) throws PDFNetException { - - return element.isFilled() && element.getGState().getFillOpacity() == 1; - } - - - @SneakyThrows - private void drawBBox(ElementWriter writer, Rectangle2D r, String hexcolor) { - - ColorPt colorPt = new ColorPt(Integer.valueOf(hexcolor.substring(1, 3), 16) / 255d, - Integer.valueOf(hexcolor.substring(3, 5), 16) / 255d, - Integer.valueOf(hexcolor.substring(5, 7), 16) / 255d); - ElementBuilder eb = new ElementBuilder(); - Element rect = eb.createRect(r.getX(), r.getY(), r.getWidth(), r.getHeight()); - rect.setPathStroke(true); - rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB()); - rect.getGState().setStrokeColor(colorPt); - writer.writePlacedElement(rect); - - colorPt.destroy(); - eb.destroy(); - } - - - private static AffineTransform toAffineTransform(Matrix2D ctm) throws PDFNetException { - - return new AffineTransform(ctm.getA(), ctm.getB(), ctm.getC(), ctm.getD(), ctm.getH(), ctm.getV()); - } - - - @Builder - private record InvisibleElementRemovalContext( - boolean delta, - ElementReader reader, - ClippingPathStack clippingPathStack, - List overlappedElements, - List visibleElements, - Set visitedXObjIds) { - - } - -} \ No newline at end of file diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java index 2c5fe31..0b91cb0 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java @@ -14,6 +14,7 @@ import org.springframework.amqp.rabbit.core.RabbitTemplate; import org.springframework.stereotype.Service; import com.fasterxml.jackson.databind.ObjectMapper; +import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService; import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse; import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration; import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings; @@ -68,14 +69,7 @@ public class OCRService { try (ByteArrayOutputStream transferOutputStream = new ByteArrayOutputStream()) { try (InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId)) { - long removalStart = System.currentTimeMillis(); - log.debug("Start invisible element removal for file with dossierId {} and fileId {}", dossierId, fileId); invisibleElementRemovalService.removeInvisibleElements(fileStream, transferOutputStream, false); - long removalEnd = System.currentTimeMillis(); - log.debug("Invisible element removal successful for file with dossierId {} and fileId {}, took {}s", - dossierId, - fileId, - format("%.1f", (removalEnd - removalStart) / 1000.0)); } try (InputStream transferInputStream = new ByteArrayInputStream(transferOutputStream.toByteArray())) { long ocrStart = System.currentTimeMillis(); diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java index e719877..6d0f5f8 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java +++ b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java @@ -1,7 +1,7 @@ package com.iqser.red.service.ocr.v1.server; +import static com.iqser.red.pdftronlogic.commons.PdfTextExtraction.extractAllTextFromDocument; import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory; -import static com.iqser.red.service.ocr.v1.server.utils.PdfTextExtraction.extractAllTextFromDocument; import static org.assertj.core.api.Assertions.assertThat; import java.io.FileInputStream; diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalServiceTest.java b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalServiceTest.java index ce6e3a1..32d8875 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalServiceTest.java +++ b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalServiceTest.java @@ -1,7 +1,7 @@ package com.iqser.red.service.ocr.v1.server.service; +import static com.iqser.red.pdftronlogic.commons.PdfTextExtraction.extractAllTextFromDocument; import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory; -import static com.iqser.red.service.ocr.v1.server.utils.PdfTextExtraction.extractAllTextFromDocument; import static org.assertj.core.api.AssertionsForClassTypes.assertThat; import java.io.FileInputStream; @@ -9,16 +9,18 @@ import java.io.FileOutputStream; import org.junit.jupiter.api.Test; import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.context.annotation.Bean; import org.springframework.core.io.ClassPathResource; +import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService; import com.iqser.red.service.ocr.v1.server.AbstractTest; import lombok.SneakyThrows; public class InvisibleElementRemovalServiceTest extends AbstractTest { - @Autowired - private InvisibleElementRemovalService invisibleElementRemovalService; + @Autowired + private InvisibleElementRemovalService invisibleElementRemovalService; @Test @@ -44,5 +46,6 @@ public class InvisibleElementRemovalServiceTest extends AbstractTest { String[] text = extractAllTextFromDocument(fileStream).split("\n"); assertThat(text).contains("APPENDIX 16 Pathology Report", "Amendment 1", "Page 255 of 260"); } + } } \ No newline at end of file diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/utils/PdfTextExtraction.java b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/utils/PdfTextExtraction.java deleted file mode 100644 index c3f195d..0000000 --- a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/utils/PdfTextExtraction.java +++ /dev/null @@ -1,35 +0,0 @@ -package com.iqser.red.service.ocr.v1.server.utils; - -import java.io.IOException; -import java.io.InputStream; -import java.util.ArrayList; -import java.util.List; - -import com.pdftron.common.PDFNetException; -import com.pdftron.pdf.PDFDoc; -import com.pdftron.pdf.Page; -import com.pdftron.pdf.PageIterator; -import com.pdftron.pdf.TextExtractor; - - -public class PdfTextExtraction { - - public static String extractAllTextFromDocument(InputStream fileStream) throws IOException, PDFNetException { - - PDFDoc pdfDoc = new PDFDoc(fileStream); - TextExtractor extractor = new TextExtractor(); - List texts = new ArrayList<>(); - - PageIterator iterator = pdfDoc.getPageIterator(); - while (iterator.hasNext()) { - Page page = iterator.next(); - extractor.begin(page); - texts.add(extractor.getAsText()); - } - - extractor.destroy(); - pdfDoc.close(); - return String.join("\n", texts); - } - -}