From 143538fa407654b381469347fa9a943211b644ed Mon Sep 17 00:00:00 2001 From: Thomas Beyer Date: Fri, 17 Mar 2023 10:33:48 +0100 Subject: [PATCH 1/8] RED-4875 - call logic of new repo pdftron-logic-commons instead of local one --- ocr-service-v1/ocr-service-server-v1/pom.xml | 6 + .../service/ocr/v1/server/Application.java | 8 + .../v1/server/model/ClippingPathStack.java | 136 +-- .../ocr/v1/server/model/ElementFeatures.java | 340 +++---- .../InvisibleElementRemovalService.java | 932 +++++++++--------- .../ocr/v1/server/service/OCRService.java | 5 +- .../InvisibleElementRemovalServiceTest.java | 6 +- 7 files changed, 725 insertions(+), 708 deletions(-) diff --git a/ocr-service-v1/ocr-service-server-v1/pom.xml b/ocr-service-v1/ocr-service-server-v1/pom.xml index 38dd26f..2005266 100644 --- a/ocr-service-v1/ocr-service-server-v1/pom.xml +++ b/ocr-service-v1/ocr-service-server-v1/pom.xml @@ -23,6 +23,12 @@ com.iqser.red.commons storage-commons + + com.iqser.red.commons + pdftron-logic-commons + dev_red4875_2_4dc4d + + com.iqser.red.commons spring-commons diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/Application.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/Application.java index 3d65e1b..bfa6c70 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/Application.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/Application.java @@ -10,6 +10,7 @@ import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Import; import org.springframework.scheduling.annotation.EnableAsync; +import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService; import com.iqser.red.service.ocr.v1.server.client.FileStatusProcessingUpdateClient; import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration; import com.iqser.red.service.ocr.v1.server.multitenancy.AsyncConfig; @@ -44,4 +45,11 @@ public class Application { return new TimedAspect(registry); } + + @Bean + public InvisibleElementRemovalService invisibleElementRemovalService() { + + return new InvisibleElementRemovalService(); + } + } diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ClippingPathStack.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ClippingPathStack.java index 5e3c36a..fec3727 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ClippingPathStack.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ClippingPathStack.java @@ -1,68 +1,68 @@ -package com.iqser.red.service.ocr.v1.server.model; - -import static com.iqser.red.service.ocr.v1.server.service.InvisibleElementRemovalService.TOLERANCE; - -import java.awt.geom.Area; -import java.awt.geom.GeneralPath; -import java.awt.geom.Rectangle2D; -import java.util.Deque; -import java.util.LinkedList; - -import com.pdftron.pdf.Rect; - -import lombok.Data; -import lombok.SneakyThrows; - -@Data -public class ClippingPathStack { - - private Deque stack = new LinkedList<>(); - - - @SneakyThrows - public ClippingPathStack(Rect rectangle) { - - stack.push(new Area(new Rectangle2D.Double(rectangle.getX1(), rectangle.getY1(), rectangle.getWidth(), rectangle.getHeight()).getBounds2D())); - } - - - @SneakyThrows - public void intersectClippingPath(GeneralPath path) { - - getCurrentClippingPath().intersect(new Area(path)); - } - - - public boolean almostIntersects(double x, double y, double width, double height) { - // To address inconsistencies in the calculation of the bounding box we slightly increase the rectangle - // Height or width are zero for straight lines, even though they are being rendered. Therefore, height or width must be at minimum >0. - - double x_with_tolerance = x > 0 ? x - TOLERANCE : x + TOLERANCE; - double y_with_tolerance = y > 0 ? y - TOLERANCE : y + TOLERANCE; - double width_with_tolerance = width + (2 * TOLERANCE); - double height_with_tolerance = height + (2 * TOLERANCE); - return getCurrentClippingPath().intersects(x_with_tolerance, y_with_tolerance, width_with_tolerance, height_with_tolerance); - } - - - public Area getCurrentClippingPath() { - - return stack.peek(); - } - - - public void enterNewGState() { - - Area current = stack.peek(); - Area cloned = new Area(); - cloned.add(current); - stack.push(cloned); - } - - - public void leaveGState() { - - stack.pop(); - } - -} \ No newline at end of file +//package com.iqser.red.service.ocr.v1.server.model; +// +//import static com.iqser.red.service.ocr.v1.server.service.InvisibleElementRemovalService.TOLERANCE; +// +//import java.awt.geom.Area; +//import java.awt.geom.GeneralPath; +//import java.awt.geom.Rectangle2D; +//import java.util.Deque; +//import java.util.LinkedList; +// +//import com.pdftron.pdf.Rect; +// +//import lombok.Data; +//import lombok.SneakyThrows; +// +//@Data +//public class ClippingPathStack { +// +// private Deque stack = new LinkedList<>(); +// +// +// @SneakyThrows +// public ClippingPathStack(Rect rectangle) { +// +// stack.push(new Area(new Rectangle2D.Double(rectangle.getX1(), rectangle.getY1(), rectangle.getWidth(), rectangle.getHeight()).getBounds2D())); +// } +// +// +// @SneakyThrows +// public void intersectClippingPath(GeneralPath path) { +// +// getCurrentClippingPath().intersect(new Area(path)); +// } +// +// +// public boolean almostIntersects(double x, double y, double width, double height) { +// // To address inconsistencies in the calculation of the bounding box we slightly increase the rectangle +// // Height or width are zero for straight lines, even though they are being rendered. Therefore, height or width must be at minimum >0. +// +// double x_with_tolerance = x > 0 ? x - TOLERANCE : x + TOLERANCE; +// double y_with_tolerance = y > 0 ? y - TOLERANCE : y + TOLERANCE; +// double width_with_tolerance = width + (2 * TOLERANCE); +// double height_with_tolerance = height + (2 * TOLERANCE); +// return getCurrentClippingPath().intersects(x_with_tolerance, y_with_tolerance, width_with_tolerance, height_with_tolerance); +// } +// +// +// public Area getCurrentClippingPath() { +// +// return stack.peek(); +// } +// +// +// public void enterNewGState() { +// +// Area current = stack.peek(); +// Area cloned = new Area(); +// cloned.add(current); +// stack.push(cloned); +// } +// +// +// public void leaveGState() { +// +// stack.pop(); +// } +// +//} \ No newline at end of file diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ElementFeatures.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ElementFeatures.java index 87c625c..d6e24cf 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ElementFeatures.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ElementFeatures.java @@ -1,170 +1,170 @@ -package com.iqser.red.service.ocr.v1.server.model; - -import static com.iqser.red.service.ocr.v1.server.service.InvisibleElementRemovalService.TOLERANCE; - -import java.awt.geom.Rectangle2D; - -import com.pdftron.common.PDFNetException; -import com.pdftron.pdf.Element; -import com.pdftron.pdf.Rect; - -import lombok.AccessLevel; -import lombok.EqualsAndHashCode; -import lombok.Getter; -import lombok.SneakyThrows; -import lombok.experimental.FieldDefaults; -import lombok.experimental.SuperBuilder; - -@Getter -@SuperBuilder -@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) -public class ElementFeatures { - - int elementType; - Rectangle2D boundingBox; - - - public boolean almostMatches(Element element) throws PDFNetException { - - return element.getType() == elementType && // - element.getBBox() != null && // - rectsAlmostMatch(element.getBBox()); - } - - - protected boolean almostEqual(double a, double b) { - - return Math.abs(a - b) < TOLERANCE; - } - - - @SneakyThrows - private boolean rectsAlmostMatch(Rect bBox) { - // To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance - - return almostEqual(bBox.getX1(), boundingBox.getX()) && // - almostEqual(bBox.getY1(), boundingBox.getY()) && // - almostEqual(bBox.getWidth(), boundingBox.getWidth()) && // - almostEqual(bBox.getHeight(), boundingBox.getHeight()); - } - - - @EqualsAndHashCode(callSuper = true) - @Getter - @SuperBuilder - @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) - private static class Text extends ElementFeatures { - - String text; - int font; - double fontsize; - - - @Override - public boolean almostMatches(Element element) throws PDFNetException { - - return super.almostMatches(element) && // - text.equals(element.getTextString()) && // - font == element.getGState().getFont().getType() && // - almostEqual(fontsize, element.getGState().getFontSize()); - } - - } - - @EqualsAndHashCode(callSuper = true) - @Getter - @SuperBuilder - @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) - private static class Path extends ElementFeatures { - - boolean isClippingPath; - boolean isClipWindingFill; - boolean isStroked; - boolean isFilled; - boolean isWindingFill; - - - @Override - public boolean almostMatches(Element element) throws PDFNetException { - - return super.almostMatches(element) && // - isClippingPath == element.isClippingPath() && // - isClipWindingFill == element.isClipWindingFill() && // - isStroked == element.isStroked() && // - isFilled == element.isFilled() && // - isWindingFill == element.isWindingFill(); - - } - - } - - @EqualsAndHashCode(callSuper = true) - @Getter - @SuperBuilder - @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) - private static class Image extends ElementFeatures { - - int dataSize; - int height; - int width; - int renderingIntent; - int componentNum; - int bitsPerComponent; - - - @Override - public boolean almostMatches(Element element) throws PDFNetException { - - return super.almostMatches(element) && // - dataSize == element.getImageDataSize() && // - height == element.getImageHeight() && // - width == element.getImageWidth() && // - renderingIntent == element.getImageRenderingIntent() && // - componentNum == element.getComponentNum() && // - bitsPerComponent == element.getBitsPerComponent(); - } - - } - - - public static ElementFeatures extractFeatures(Element element) throws PDFNetException { - - return switch (element.getType()) { - case Element.e_path -> Path.builder() - .elementType(element.getType()) - .boundingBox(toRectangle2D(element.getBBox())) - .isClippingPath(element.isClippingPath()) - .isClipWindingFill(element.isClipWindingFill()) - .isStroked(element.isStroked()) - .isFilled(element.isFilled()) - .isWindingFill(element.isWindingFill()) - .build(); - case Element.e_text -> Text.builder() - .elementType(element.getType()) - .boundingBox(toRectangle2D(element.getBBox())) - .text(element.getTextString()) - .font(element.getGState().getFont().getType()) - .fontsize(element.getGState().getFontSize()) - .build(); - case Element.e_image, Element.e_inline_image -> Image.builder() - .elementType(element.getType()) - .boundingBox(toRectangle2D(element.getBBox())) - .dataSize(element.getImageDataSize()) - .height(element.getImageHeight()) - .width(element.getImageWidth()) - .renderingIntent(element.getImageRenderingIntent()) - .componentNum(element.getComponentNum()) - .bitsPerComponent(element.getBitsPerComponent()) - .build(); - // This technically should never happen, it's a safetynet - default -> throw new RuntimeException("Feature Extraction is not supported for PDFTron.Element with type: " + element.getType()); - }; - } - - - private static Rectangle2D toRectangle2D(Rect rect) throws PDFNetException { - - return new Rectangle2D.Double(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight()); - } - -} +//package com.iqser.red.service.ocr.v1.server.model; +// +//import static com.iqser.red.service.ocr.v1.server.service.InvisibleElementRemovalService.TOLERANCE; +// +//import java.awt.geom.Rectangle2D; +// +//import com.pdftron.common.PDFNetException; +//import com.pdftron.pdf.Element; +//import com.pdftron.pdf.Rect; +// +//import lombok.AccessLevel; +//import lombok.EqualsAndHashCode; +//import lombok.Getter; +//import lombok.SneakyThrows; +//import lombok.experimental.FieldDefaults; +//import lombok.experimental.SuperBuilder; +// +//@Getter +//@SuperBuilder +//@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) +//public class ElementFeatures { +// +// int elementType; +// Rectangle2D boundingBox; +// +// +// public boolean almostMatches(Element element) throws PDFNetException { +// +// return element.getType() == elementType && // +// element.getBBox() != null && // +// rectsAlmostMatch(element.getBBox()); +// } +// +// +// protected boolean almostEqual(double a, double b) { +// +// return Math.abs(a - b) < TOLERANCE; +// } +// +// +// @SneakyThrows +// private boolean rectsAlmostMatch(Rect bBox) { +// // To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance +// +// return almostEqual(bBox.getX1(), boundingBox.getX()) && // +// almostEqual(bBox.getY1(), boundingBox.getY()) && // +// almostEqual(bBox.getWidth(), boundingBox.getWidth()) && // +// almostEqual(bBox.getHeight(), boundingBox.getHeight()); +// } +// +// +// @EqualsAndHashCode(callSuper = true) +// @Getter +// @SuperBuilder +// @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) +// private static class Text extends ElementFeatures { +// +// String text; +// int font; +// double fontsize; +// +// +// @Override +// public boolean almostMatches(Element element) throws PDFNetException { +// +// return super.almostMatches(element) && // +// text.equals(element.getTextString()) && // +// font == element.getGState().getFont().getType() && // +// almostEqual(fontsize, element.getGState().getFontSize()); +// } +// +// } +// +// @EqualsAndHashCode(callSuper = true) +// @Getter +// @SuperBuilder +// @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) +// private static class Path extends ElementFeatures { +// +// boolean isClippingPath; +// boolean isClipWindingFill; +// boolean isStroked; +// boolean isFilled; +// boolean isWindingFill; +// +// +// @Override +// public boolean almostMatches(Element element) throws PDFNetException { +// +// return super.almostMatches(element) && // +// isClippingPath == element.isClippingPath() && // +// isClipWindingFill == element.isClipWindingFill() && // +// isStroked == element.isStroked() && // +// isFilled == element.isFilled() && // +// isWindingFill == element.isWindingFill(); +// +// } +// +// } +// +// @EqualsAndHashCode(callSuper = true) +// @Getter +// @SuperBuilder +// @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) +// private static class Image extends ElementFeatures { +// +// int dataSize; +// int height; +// int width; +// int renderingIntent; +// int componentNum; +// int bitsPerComponent; +// +// +// @Override +// public boolean almostMatches(Element element) throws PDFNetException { +// +// return super.almostMatches(element) && // +// dataSize == element.getImageDataSize() && // +// height == element.getImageHeight() && // +// width == element.getImageWidth() && // +// renderingIntent == element.getImageRenderingIntent() && // +// componentNum == element.getComponentNum() && // +// bitsPerComponent == element.getBitsPerComponent(); +// } +// +// } +// +// +// public static ElementFeatures extractFeatures(Element element) throws PDFNetException { +// +// return switch (element.getType()) { +// case Element.e_path -> Path.builder() +// .elementType(element.getType()) +// .boundingBox(toRectangle2D(element.getBBox())) +// .isClippingPath(element.isClippingPath()) +// .isClipWindingFill(element.isClipWindingFill()) +// .isStroked(element.isStroked()) +// .isFilled(element.isFilled()) +// .isWindingFill(element.isWindingFill()) +// .build(); +// case Element.e_text -> Text.builder() +// .elementType(element.getType()) +// .boundingBox(toRectangle2D(element.getBBox())) +// .text(element.getTextString()) +// .font(element.getGState().getFont().getType()) +// .fontsize(element.getGState().getFontSize()) +// .build(); +// case Element.e_image, Element.e_inline_image -> Image.builder() +// .elementType(element.getType()) +// .boundingBox(toRectangle2D(element.getBBox())) +// .dataSize(element.getImageDataSize()) +// .height(element.getImageHeight()) +// .width(element.getImageWidth()) +// .renderingIntent(element.getImageRenderingIntent()) +// .componentNum(element.getComponentNum()) +// .bitsPerComponent(element.getBitsPerComponent()) +// .build(); +// // This technically should never happen, it's a safetynet +// default -> throw new RuntimeException("Feature Extraction is not supported for PDFTron.Element with type: " + element.getType()); +// }; +// } +// +// +// private static Rectangle2D toRectangle2D(Rect rect) throws PDFNetException { +// +// return new Rectangle2D.Double(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight()); +// } +// +//} diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java index e64c1fd..9b49b6e 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java @@ -1,466 +1,466 @@ -package com.iqser.red.service.ocr.v1.server.service; - -import java.awt.Shape; -import java.awt.geom.AffineTransform; -import java.awt.geom.GeneralPath; -import java.awt.geom.Rectangle2D; -import java.io.InputStream; -import java.io.OutputStream; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.Set; -import java.util.TreeSet; - -import org.springframework.stereotype.Service; - -import com.google.common.primitives.Bytes; -import com.google.common.primitives.Doubles; -import com.iqser.red.service.ocr.v1.server.model.ClippingPathStack; -import com.iqser.red.service.ocr.v1.server.model.ElementFeatures; -import com.pdftron.common.Matrix2D; -import com.pdftron.common.PDFNetException; -import com.pdftron.pdf.ColorPt; -import com.pdftron.pdf.ColorSpace; -import com.pdftron.pdf.Element; -import com.pdftron.pdf.ElementBuilder; -import com.pdftron.pdf.ElementReader; -import com.pdftron.pdf.ElementWriter; -import com.pdftron.pdf.GState; -import com.pdftron.pdf.PDFDoc; -import com.pdftron.pdf.Page; -import com.pdftron.pdf.PageIterator; -import com.pdftron.pdf.PathData; -import com.pdftron.pdf.Rect; -import com.pdftron.sdf.Obj; -import com.pdftron.sdf.SDFDoc; - -import lombok.Builder; -import lombok.SneakyThrows; -import lombok.extern.slf4j.Slf4j; - -@Slf4j -@Service -public class InvisibleElementRemovalService { - - static public final double TOLERANCE = 1e-3; - - - /** - * Removes all hidden Text, Path and Image Elements from a PDF Document. - * handled cases: - * -Text which is transparent or is set to not render - * -Elements outside of clipping path - * -Elements that have been painted over by visible and filled Paths - * unhandled cases: - * -Elements covered by widely stroked path - * -Elements with the same color as background - * -Any Text set to clipping with its many interactions with other elements - * - * @param pdfFile The PDF file to process - * @param delta If this flag is set only the removed Elements will be written to the output file. - * The Elements are red if they are removed by clipping path, blue for transparency, and a green bounding box for overlap. - * @param out OutputStream to write the resulting file to - **/ - @SneakyThrows - public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta) { - - PDFDoc pdfDoc = new PDFDoc(pdfFile); - - ElementWriter writer = new ElementWriter(); - ElementReader reader = new ElementReader(); - Set visitedXObjIds = new TreeSet<>(); - - for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) { - - Page page = iterator.next(); - - visitedXObjIds.add(page.getSDFObj().getObjNum()); - - - InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder() - .reader(reader) - .clippingPathStack(new ClippingPathStack(page.getMediaBox())) - .delta(delta) - .overlappedElements(new ArrayList<>()) - .visibleElements(new ArrayList<>()) - .visitedXObjIds(visitedXObjIds) - .build(); - - removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context); - - context.visitedXObjIds().clear(); - - removeOverlappedElements(page, writer, context); - } - - try { - pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null); - } catch (Exception e) { - log.error("File could not be saved after invisible element removal"); - throw new RuntimeException(e); - } - - writer.destroy(); - reader.destroy(); - pdfDoc.close(); - } - - - private void removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(Page page, - ElementWriter writer, - InvisibleElementRemovalContext context) throws PDFNetException { - - context.reader().begin(page); - writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict()); - processElements(writer, context); - writer.end(); - context.reader().end(); - } - - - private void processElements(ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { - - for (Element element = context.reader().next(); element != null; element = context.reader().next()) - switch (element.getType()) { - case Element.e_image, Element.e_inline_image -> processImages(element, writer, context); - case Element.e_text -> processText(element, writer, context); - case Element.e_path -> processPath(element, writer, context); - case Element.e_form -> processForm(element, writer, context); - case Element.e_group_begin -> { - context.clippingPathStack().enterNewGState(); - writer.writeElement(element); - } - case Element.e_group_end -> { - context.clippingPathStack().leaveGState(); - writer.writeElement(element); - } - default -> writer.writeElement(element); - } - } - - - private void processImages(Element imageElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { - - Rect rect = imageElement.getBBox(); - - if (rect == null) { - return; - } - - boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight()); - - if (!context.delta() && inClippingPath) { - context.visibleElements().add(ElementFeatures.extractFeatures(imageElement)); - } - - if (context.delta() ^ inClippingPath) { - writer.writeElement(imageElement); - } - } - - - private void processText(Element textElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { - - Rect rect = textElement.getBBox(); - - if (rect == null) { - writer.writeElement(textElement); - return; - } - - GState gState = textElement.getGState(); - - boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight()); - - boolean isTextVisible = isTextRenderedVisibly(gState); - - if (inClippingPath && isTextVisible) { - context.visibleElements().add(ElementFeatures.extractFeatures(textElement)); - } - if (!context.delta()) { - if (inClippingPath && isTextVisible) { - writer.writeElement(textElement); - } else if (textElement.hasTextMatrix()) { - /* - PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element. - hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands. - Therefore, the position of a following Tj is affected by not writing the first Element. - This is why, we write only the Tm command: - */ - writer.writeGStateChanges(textElement); - } - } else { - if (!inClippingPath) { - gState.setFillColorSpace(ColorSpace.createDeviceRGB()); - // red for elements removed by clipping path - gState.setFillColor(new ColorPt(1, 0, 0)); - writer.writeElement(textElement); - } - if (!isTextVisible) { - gState.setFillColorSpace(ColorSpace.createDeviceRGB()); - // blue for elements removed due to transparency or not rendered - gState.setFillColor(new ColorPt(0, 0, 1)); - gState.setTextRenderMode(GState.e_fill_text); - gState.setFillOpacity(1); - writer.writeElement(textElement); - } - } - } - - - private void processForm(Element formElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { - - writer.writeElement(formElement); - Obj formObj = formElement.getXObject(); - - if (!context.visitedXObjIds().contains(formObj.getObjNum())) { - context.visitedXObjIds().add(formObj.getObjNum()); - // writer needs to be newly initialized when entering a new content stream - // see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest) - ElementWriter formWriter = new ElementWriter(); - context.reader().formBegin(); - formWriter.begin(formObj); - - context.reader().clearChangeList(); - formWriter.setDefaultGState(context.reader()); - - processElements(formWriter, context); - formWriter.end(); - formWriter.destroy(); - context.reader().end(); - } - } - - - private void processPath(Element pathElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { - - PathData pathData = pathElement.getPathData(); - - if (pathData.getOperators().length == 0 && pathData.getPoints().length == 0) { - writer.writeGStateChanges(pathElement); - return; - } - - GeneralPath linePath = convertToGeneralPath(pathData); - - //transform path to initial user space - var ctm = pathElement.getCTM(); - var affineTransform = toAffineTransform(ctm); - linePath.transform(affineTransform); - - var rect = linePath.getBounds2D(); - - boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX(), rect.getY(), rect.getWidth(), rect.getHeight()); - - if (pathElement.isClippingPath()) { - if (pathElement.isClipWindingFill()) { - linePath.setWindingRule(GeneralPath.WIND_NON_ZERO); - } else { - linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD); - } - - context.clippingPathStack().intersectClippingPath(linePath); - pathElement.setPathClip(!context.delta()); - writer.writeElement(pathElement); - - } else { - if (pathElement.isWindingFill()) { - linePath.setWindingRule(GeneralPath.WIND_NON_ZERO); - } else { - linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD); - } - - if (inClippingPath) { - if (isFilledAndNonTransparent(pathElement)) { - List currentOverlappedElements = context.visibleElements() - .stream() - .filter(features -> almostContains(linePath, features.getBoundingBox())) - .toList(); - context.overlappedElements().addAll(currentOverlappedElements); - context.visibleElements().removeAll(currentOverlappedElements); - } - context.visibleElements().add(ElementFeatures.extractFeatures(pathElement)); - if (!context.delta()) { - writer.writeElement(pathElement); - } - } - if (context.delta() && !inClippingPath) { - pathElement.getGState().setFillColorSpace(ColorSpace.createDeviceRGB()); - pathElement.getGState().setFillColor(new ColorPt(1, 0, 0)); - pathElement.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB()); - pathElement.getGState().setStrokeColor(new ColorPt(1, 0, 0)); - writer.writeElement(pathElement); - } - } - } - - - private void removeOverlappedElements(Page page, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { - - context.reader().begin(page); - writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict()); - if (context.delta()) { - // green for element removed due to overlapping - context.overlappedElements().forEach(feature -> drawBBox(writer, feature.getBoundingBox(), "#00FF00")); - context.overlappedElements().clear(); - } - processOverlappedElements(writer, context); - writer.end(); - context.reader().end(); - - if (context.overlappedElements().size() > 0) { - log.warn(context.overlappedElements().size() + " overlapped elements have not been found or removed"); - } - } - - - private void processOverlappedElements(ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { - - for (Element element = context.reader().next(); element != null; element = context.reader().next()) { - switch (element.getType()) { - case Element.e_form -> processFormOverlappedElements(writer, element, context); - case Element.e_path, Element.e_image, Element.e_inline_image, Element.e_text -> { - boolean anyMatch = false; - for (ElementFeatures elementToRemove : context.overlappedElements()) { - if (elementToRemove.almostMatches(element)) { - context.overlappedElements().remove(elementToRemove); - anyMatch = true; - break; - } - } - if (!anyMatch) { - writer.writeElement(element); - } else if (element.getType() == 3 && element.hasTextMatrix()) { - /* - PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element. - hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands. - Therefore, the position of a following Tj is affected by not writing the first Element. - This is why, we write only the Tm command: - */ - writer.writeGStateChanges(element); - } - } - default -> writer.writeElement(element); - } - } - } - - - private void processFormOverlappedElements(ElementWriter writer, Element formElement, InvisibleElementRemovalContext context) throws PDFNetException { - - writer.writeElement(formElement); - Obj formObj = formElement.getXObject(); - - if (!context.visitedXObjIds().contains(formObj.getObjNum())) { - context.visitedXObjIds().add(formObj.getObjNum()); - // writer needs to be newly initialized when entering a new content stream - // see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest) - ElementWriter formWriter = new ElementWriter(); - context.reader().formBegin(); - formWriter.begin(formObj); - - context.reader().clearChangeList(); - formWriter.setDefaultGState(context.reader()); - - processOverlappedElements(formWriter, context); - formWriter.end(); - formWriter.destroy(); - context.reader().end(); - } - } - - - private boolean isTextRenderedVisibly(GState gState) throws PDFNetException { - - return gState.getTextRenderMode() != GState.e_invisible_text && // - !(gState.getTextRenderMode() == GState.e_fill_text && gState.getFillOpacity() == 0) && // - !(gState.getTextRenderMode() == GState.e_stroke_text && gState.getStrokeOpacity() == 0) && // - !(gState.getTextRenderMode() == GState.e_fill_stroke_text && gState.getFillOpacity() == 0 && gState.getStrokeOpacity() == 0); - } - - - private GeneralPath convertToGeneralPath(PathData pathData) throws PDFNetException { - - GeneralPath linePath = new GeneralPath(); - Iterator points = Doubles.asList(pathData.getPoints()).iterator(); - Iterable operators = Bytes.asList(pathData.getOperators()); - for (var operator : operators) { - switch (operator) { - case PathData.e_moveto -> linePath.moveTo(points.next(), points.next()); - case PathData.e_lineto -> linePath.lineTo(points.next(), points.next()); - case PathData.e_cubicto -> linePath.curveTo(points.next(), points.next(), points.next(), points.next(), points.next(), points.next()); - case PathData.e_closepath -> linePath.closePath(); - case PathData.e_rect -> { - double x = points.next(); - double y = points.next(); - double w = points.next(); - double h = points.next(); - linePath.moveTo(x, y); - linePath.lineTo(x + w, y); - linePath.lineTo(x + w, y + h); - linePath.lineTo(x, y + h); - linePath.closePath(); - } - default -> throw new PDFNetException("Invalid Element Type", 0, "", "", ""); - } - } - return linePath; - } - - - private boolean almostContains(Shape outer, Rectangle2D inner) { - //To address inconsistencies in the calculation of the bounding box we slightly shrink the inner rectangle - - double x_with_tolerance = inner.getX() >= 0 ? inner.getX() + TOLERANCE : inner.getX() - TOLERANCE; - double y_with_tolerance = inner.getY() >= 0 ? inner.getY() + TOLERANCE : inner.getY() - TOLERANCE; - double height_with_tolerance = inner.getHeight() - (2 * TOLERANCE); - double width_with_tolerance = inner.getWidth() - (2 * TOLERANCE); - Rectangle2D innerRect = new Rectangle2D.Double(x_with_tolerance, y_with_tolerance, width_with_tolerance, height_with_tolerance); - - return outer.contains(innerRect); - } - - - private boolean isFilledAndNonTransparent(Element element) throws PDFNetException { - - return element.isFilled() && element.getGState().getFillOpacity() == 1; - } - - - @SneakyThrows - private void drawBBox(ElementWriter writer, Rectangle2D r, String hexcolor) { - - ColorPt colorPt = new ColorPt(Integer.valueOf(hexcolor.substring(1, 3), 16) / 255d, - Integer.valueOf(hexcolor.substring(3, 5), 16) / 255d, - Integer.valueOf(hexcolor.substring(5, 7), 16) / 255d); - ElementBuilder eb = new ElementBuilder(); - Element rect = eb.createRect(r.getX(), r.getY(), r.getWidth(), r.getHeight()); - rect.setPathStroke(true); - rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB()); - rect.getGState().setStrokeColor(colorPt); - writer.writePlacedElement(rect); - - colorPt.destroy(); - eb.destroy(); - } - - - private static AffineTransform toAffineTransform(Matrix2D ctm) throws PDFNetException { - - return new AffineTransform(ctm.getA(), ctm.getB(), ctm.getC(), ctm.getD(), ctm.getH(), ctm.getV()); - } - - - @Builder - private record InvisibleElementRemovalContext( - boolean delta, - ElementReader reader, - ClippingPathStack clippingPathStack, - List overlappedElements, - List visibleElements, - Set visitedXObjIds) { - - } - -} \ No newline at end of file +//package com.iqser.red.service.ocr.v1.server.service; +// +//import java.awt.Shape; +//import java.awt.geom.AffineTransform; +//import java.awt.geom.GeneralPath; +//import java.awt.geom.Rectangle2D; +//import java.io.InputStream; +//import java.io.OutputStream; +//import java.util.ArrayList; +//import java.util.Iterator; +//import java.util.List; +//import java.util.Set; +//import java.util.TreeSet; +// +//import org.springframework.stereotype.Service; +// +//import com.google.common.primitives.Bytes; +//import com.google.common.primitives.Doubles; +//import com.iqser.red.service.ocr.v1.server.model.ClippingPathStack; +//import com.iqser.red.service.ocr.v1.server.model.ElementFeatures; +//import com.pdftron.common.Matrix2D; +//import com.pdftron.common.PDFNetException; +//import com.pdftron.pdf.ColorPt; +//import com.pdftron.pdf.ColorSpace; +//import com.pdftron.pdf.Element; +//import com.pdftron.pdf.ElementBuilder; +//import com.pdftron.pdf.ElementReader; +//import com.pdftron.pdf.ElementWriter; +//import com.pdftron.pdf.GState; +//import com.pdftron.pdf.PDFDoc; +//import com.pdftron.pdf.Page; +//import com.pdftron.pdf.PageIterator; +//import com.pdftron.pdf.PathData; +//import com.pdftron.pdf.Rect; +//import com.pdftron.sdf.Obj; +//import com.pdftron.sdf.SDFDoc; +// +//import lombok.Builder; +//import lombok.SneakyThrows; +//import lombok.extern.slf4j.Slf4j; +// +//@Slf4j +//@Service +//public class InvisibleElementRemovalService { +// +// static public final double TOLERANCE = 1e-3; +// +// +// /** +// * Removes all hidden Text, Path and Image Elements from a PDF Document. +// * handled cases: +// * -Text which is transparent or is set to not render +// * -Elements outside of clipping path +// * -Elements that have been painted over by visible and filled Paths +// * unhandled cases: +// * -Elements covered by widely stroked path +// * -Elements with the same color as background +// * -Any Text set to clipping with its many interactions with other elements +// * +// * @param pdfFile The PDF file to process +// * @param delta If this flag is set only the removed Elements will be written to the output file. +// * The Elements are red if they are removed by clipping path, blue for transparency, and a green bounding box for overlap. +// * @param out OutputStream to write the resulting file to +// **/ +// @SneakyThrows +// public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta) { +// +// PDFDoc pdfDoc = new PDFDoc(pdfFile); +// +// ElementWriter writer = new ElementWriter(); +// ElementReader reader = new ElementReader(); +// Set visitedXObjIds = new TreeSet<>(); +// +// for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) { +// +// Page page = iterator.next(); +// +// visitedXObjIds.add(page.getSDFObj().getObjNum()); +// +// +// InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder() +// .reader(reader) +// .clippingPathStack(new ClippingPathStack(page.getMediaBox())) +// .delta(delta) +// .overlappedElements(new ArrayList<>()) +// .visibleElements(new ArrayList<>()) +// .visitedXObjIds(visitedXObjIds) +// .build(); +// +// removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context); +// +// context.visitedXObjIds().clear(); +// +// removeOverlappedElements(page, writer, context); +// } +// +// try { +// pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null); +// } catch (Exception e) { +// log.error("File could not be saved after invisible element removal"); +// throw new RuntimeException(e); +// } +// +// writer.destroy(); +// reader.destroy(); +// pdfDoc.close(); +// } +// +// +// private void removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(Page page, +// ElementWriter writer, +// InvisibleElementRemovalContext context) throws PDFNetException { +// +// context.reader().begin(page); +// writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict()); +// processElements(writer, context); +// writer.end(); +// context.reader().end(); +// } +// +// +// private void processElements(ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { +// +// for (Element element = context.reader().next(); element != null; element = context.reader().next()) +// switch (element.getType()) { +// case Element.e_image, Element.e_inline_image -> processImages(element, writer, context); +// case Element.e_text -> processText(element, writer, context); +// case Element.e_path -> processPath(element, writer, context); +// case Element.e_form -> processForm(element, writer, context); +// case Element.e_group_begin -> { +// context.clippingPathStack().enterNewGState(); +// writer.writeElement(element); +// } +// case Element.e_group_end -> { +// context.clippingPathStack().leaveGState(); +// writer.writeElement(element); +// } +// default -> writer.writeElement(element); +// } +// } +// +// +// private void processImages(Element imageElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { +// +// Rect rect = imageElement.getBBox(); +// +// if (rect == null) { +// return; +// } +// +// boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight()); +// +// if (!context.delta() && inClippingPath) { +// context.visibleElements().add(ElementFeatures.extractFeatures(imageElement)); +// } +// +// if (context.delta() ^ inClippingPath) { +// writer.writeElement(imageElement); +// } +// } +// +// +// private void processText(Element textElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { +// +// Rect rect = textElement.getBBox(); +// +// if (rect == null) { +// writer.writeElement(textElement); +// return; +// } +// +// GState gState = textElement.getGState(); +// +// boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight()); +// +// boolean isTextVisible = isTextRenderedVisibly(gState); +// +// if (inClippingPath && isTextVisible) { +// context.visibleElements().add(ElementFeatures.extractFeatures(textElement)); +// } +// if (!context.delta()) { +// if (inClippingPath && isTextVisible) { +// writer.writeElement(textElement); +// } else if (textElement.hasTextMatrix()) { +// /* +// PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element. +// hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands. +// Therefore, the position of a following Tj is affected by not writing the first Element. +// This is why, we write only the Tm command: +// */ +// writer.writeGStateChanges(textElement); +// } +// } else { +// if (!inClippingPath) { +// gState.setFillColorSpace(ColorSpace.createDeviceRGB()); +// // red for elements removed by clipping path +// gState.setFillColor(new ColorPt(1, 0, 0)); +// writer.writeElement(textElement); +// } +// if (!isTextVisible) { +// gState.setFillColorSpace(ColorSpace.createDeviceRGB()); +// // blue for elements removed due to transparency or not rendered +// gState.setFillColor(new ColorPt(0, 0, 1)); +// gState.setTextRenderMode(GState.e_fill_text); +// gState.setFillOpacity(1); +// writer.writeElement(textElement); +// } +// } +// } +// +// +// private void processForm(Element formElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { +// +// writer.writeElement(formElement); +// Obj formObj = formElement.getXObject(); +// +// if (!context.visitedXObjIds().contains(formObj.getObjNum())) { +// context.visitedXObjIds().add(formObj.getObjNum()); +// // writer needs to be newly initialized when entering a new content stream +// // see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest) +// ElementWriter formWriter = new ElementWriter(); +// context.reader().formBegin(); +// formWriter.begin(formObj); +// +// context.reader().clearChangeList(); +// formWriter.setDefaultGState(context.reader()); +// +// processElements(formWriter, context); +// formWriter.end(); +// formWriter.destroy(); +// context.reader().end(); +// } +// } +// +// +// private void processPath(Element pathElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { +// +// PathData pathData = pathElement.getPathData(); +// +// if (pathData.getOperators().length == 0 && pathData.getPoints().length == 0) { +// writer.writeGStateChanges(pathElement); +// return; +// } +// +// GeneralPath linePath = convertToGeneralPath(pathData); +// +// //transform path to initial user space +// var ctm = pathElement.getCTM(); +// var affineTransform = toAffineTransform(ctm); +// linePath.transform(affineTransform); +// +// var rect = linePath.getBounds2D(); +// +// boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX(), rect.getY(), rect.getWidth(), rect.getHeight()); +// +// if (pathElement.isClippingPath()) { +// if (pathElement.isClipWindingFill()) { +// linePath.setWindingRule(GeneralPath.WIND_NON_ZERO); +// } else { +// linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD); +// } +// +// context.clippingPathStack().intersectClippingPath(linePath); +// pathElement.setPathClip(!context.delta()); +// writer.writeElement(pathElement); +// +// } else { +// if (pathElement.isWindingFill()) { +// linePath.setWindingRule(GeneralPath.WIND_NON_ZERO); +// } else { +// linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD); +// } +// +// if (inClippingPath) { +// if (isFilledAndNonTransparent(pathElement)) { +// List currentOverlappedElements = context.visibleElements() +// .stream() +// .filter(features -> almostContains(linePath, features.getBoundingBox())) +// .toList(); +// context.overlappedElements().addAll(currentOverlappedElements); +// context.visibleElements().removeAll(currentOverlappedElements); +// } +// context.visibleElements().add(ElementFeatures.extractFeatures(pathElement)); +// if (!context.delta()) { +// writer.writeElement(pathElement); +// } +// } +// if (context.delta() && !inClippingPath) { +// pathElement.getGState().setFillColorSpace(ColorSpace.createDeviceRGB()); +// pathElement.getGState().setFillColor(new ColorPt(1, 0, 0)); +// pathElement.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB()); +// pathElement.getGState().setStrokeColor(new ColorPt(1, 0, 0)); +// writer.writeElement(pathElement); +// } +// } +// } +// +// +// private void removeOverlappedElements(Page page, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { +// +// context.reader().begin(page); +// writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict()); +// if (context.delta()) { +// // green for element removed due to overlapping +// context.overlappedElements().forEach(feature -> drawBBox(writer, feature.getBoundingBox(), "#00FF00")); +// context.overlappedElements().clear(); +// } +// processOverlappedElements(writer, context); +// writer.end(); +// context.reader().end(); +// +// if (context.overlappedElements().size() > 0) { +// log.warn(context.overlappedElements().size() + " overlapped elements have not been found or removed"); +// } +// } +// +// +// private void processOverlappedElements(ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { +// +// for (Element element = context.reader().next(); element != null; element = context.reader().next()) { +// switch (element.getType()) { +// case Element.e_form -> processFormOverlappedElements(writer, element, context); +// case Element.e_path, Element.e_image, Element.e_inline_image, Element.e_text -> { +// boolean anyMatch = false; +// for (ElementFeatures elementToRemove : context.overlappedElements()) { +// if (elementToRemove.almostMatches(element)) { +// context.overlappedElements().remove(elementToRemove); +// anyMatch = true; +// break; +// } +// } +// if (!anyMatch) { +// writer.writeElement(element); +// } else if (element.getType() == 3 && element.hasTextMatrix()) { +// /* +// PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element. +// hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands. +// Therefore, the position of a following Tj is affected by not writing the first Element. +// This is why, we write only the Tm command: +// */ +// writer.writeGStateChanges(element); +// } +// } +// default -> writer.writeElement(element); +// } +// } +// } +// +// +// private void processFormOverlappedElements(ElementWriter writer, Element formElement, InvisibleElementRemovalContext context) throws PDFNetException { +// +// writer.writeElement(formElement); +// Obj formObj = formElement.getXObject(); +// +// if (!context.visitedXObjIds().contains(formObj.getObjNum())) { +// context.visitedXObjIds().add(formObj.getObjNum()); +// // writer needs to be newly initialized when entering a new content stream +// // see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest) +// ElementWriter formWriter = new ElementWriter(); +// context.reader().formBegin(); +// formWriter.begin(formObj); +// +// context.reader().clearChangeList(); +// formWriter.setDefaultGState(context.reader()); +// +// processOverlappedElements(formWriter, context); +// formWriter.end(); +// formWriter.destroy(); +// context.reader().end(); +// } +// } +// +// +// private boolean isTextRenderedVisibly(GState gState) throws PDFNetException { +// +// return gState.getTextRenderMode() != GState.e_invisible_text && // +// !(gState.getTextRenderMode() == GState.e_fill_text && gState.getFillOpacity() == 0) && // +// !(gState.getTextRenderMode() == GState.e_stroke_text && gState.getStrokeOpacity() == 0) && // +// !(gState.getTextRenderMode() == GState.e_fill_stroke_text && gState.getFillOpacity() == 0 && gState.getStrokeOpacity() == 0); +// } +// +// +// private GeneralPath convertToGeneralPath(PathData pathData) throws PDFNetException { +// +// GeneralPath linePath = new GeneralPath(); +// Iterator points = Doubles.asList(pathData.getPoints()).iterator(); +// Iterable operators = Bytes.asList(pathData.getOperators()); +// for (var operator : operators) { +// switch (operator) { +// case PathData.e_moveto -> linePath.moveTo(points.next(), points.next()); +// case PathData.e_lineto -> linePath.lineTo(points.next(), points.next()); +// case PathData.e_cubicto -> linePath.curveTo(points.next(), points.next(), points.next(), points.next(), points.next(), points.next()); +// case PathData.e_closepath -> linePath.closePath(); +// case PathData.e_rect -> { +// double x = points.next(); +// double y = points.next(); +// double w = points.next(); +// double h = points.next(); +// linePath.moveTo(x, y); +// linePath.lineTo(x + w, y); +// linePath.lineTo(x + w, y + h); +// linePath.lineTo(x, y + h); +// linePath.closePath(); +// } +// default -> throw new PDFNetException("Invalid Element Type", 0, "", "", ""); +// } +// } +// return linePath; +// } +// +// +// private boolean almostContains(Shape outer, Rectangle2D inner) { +// //To address inconsistencies in the calculation of the bounding box we slightly shrink the inner rectangle +// +// double x_with_tolerance = inner.getX() >= 0 ? inner.getX() + TOLERANCE : inner.getX() - TOLERANCE; +// double y_with_tolerance = inner.getY() >= 0 ? inner.getY() + TOLERANCE : inner.getY() - TOLERANCE; +// double height_with_tolerance = inner.getHeight() - (2 * TOLERANCE); +// double width_with_tolerance = inner.getWidth() - (2 * TOLERANCE); +// Rectangle2D innerRect = new Rectangle2D.Double(x_with_tolerance, y_with_tolerance, width_with_tolerance, height_with_tolerance); +// +// return outer.contains(innerRect); +// } +// +// +// private boolean isFilledAndNonTransparent(Element element) throws PDFNetException { +// +// return element.isFilled() && element.getGState().getFillOpacity() == 1; +// } +// +// +// @SneakyThrows +// private void drawBBox(ElementWriter writer, Rectangle2D r, String hexcolor) { +// +// ColorPt colorPt = new ColorPt(Integer.valueOf(hexcolor.substring(1, 3), 16) / 255d, +// Integer.valueOf(hexcolor.substring(3, 5), 16) / 255d, +// Integer.valueOf(hexcolor.substring(5, 7), 16) / 255d); +// ElementBuilder eb = new ElementBuilder(); +// Element rect = eb.createRect(r.getX(), r.getY(), r.getWidth(), r.getHeight()); +// rect.setPathStroke(true); +// rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB()); +// rect.getGState().setStrokeColor(colorPt); +// writer.writePlacedElement(rect); +// +// colorPt.destroy(); +// eb.destroy(); +// } +// +// +// private static AffineTransform toAffineTransform(Matrix2D ctm) throws PDFNetException { +// +// return new AffineTransform(ctm.getA(), ctm.getB(), ctm.getC(), ctm.getD(), ctm.getH(), ctm.getV()); +// } +// +// +// @Builder +// private record InvisibleElementRemovalContext( +// boolean delta, +// ElementReader reader, +// ClippingPathStack clippingPathStack, +// List overlappedElements, +// List visibleElements, +// Set visitedXObjIds) { +// +// } +// +//} \ No newline at end of file diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java index 2c5fe31..69cbca5 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java @@ -14,6 +14,7 @@ import org.springframework.amqp.rabbit.core.RabbitTemplate; import org.springframework.stereotype.Service; import com.fasterxml.jackson.databind.ObjectMapper; +import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService; import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse; import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration; import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings; @@ -69,10 +70,10 @@ public class OCRService { try (ByteArrayOutputStream transferOutputStream = new ByteArrayOutputStream()) { try (InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId)) { long removalStart = System.currentTimeMillis(); - log.debug("Start invisible element removal for file with dossierId {} and fileId {}", dossierId, fileId); + log.info("Start invisible element removal for file with dossierId {} and fileId {}", dossierId, fileId); invisibleElementRemovalService.removeInvisibleElements(fileStream, transferOutputStream, false); long removalEnd = System.currentTimeMillis(); - log.debug("Invisible element removal successful for file with dossierId {} and fileId {}, took {}s", + log.info("Invisible element removal successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, format("%.1f", (removalEnd - removalStart) / 1000.0)); diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalServiceTest.java b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalServiceTest.java index ce6e3a1..39d979e 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalServiceTest.java +++ b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalServiceTest.java @@ -9,16 +9,18 @@ import java.io.FileOutputStream; import org.junit.jupiter.api.Test; import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.context.annotation.Bean; import org.springframework.core.io.ClassPathResource; +import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService; import com.iqser.red.service.ocr.v1.server.AbstractTest; import lombok.SneakyThrows; public class InvisibleElementRemovalServiceTest extends AbstractTest { - @Autowired - private InvisibleElementRemovalService invisibleElementRemovalService; + @Autowired + private InvisibleElementRemovalService invisibleElementRemovalService; @Test From 142e8cf9573c7de8cf87b0c3ae1d96429bb57aba Mon Sep 17 00:00:00 2001 From: Thomas Beyer Date: Fri, 17 Mar 2023 17:25:52 +0100 Subject: [PATCH 2/8] RED-4875 - update version of pdftron-logic-commons to newest --- ocr-service-v1/ocr-service-server-v1/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocr-service-v1/ocr-service-server-v1/pom.xml b/ocr-service-v1/ocr-service-server-v1/pom.xml index 2005266..1854007 100644 --- a/ocr-service-v1/ocr-service-server-v1/pom.xml +++ b/ocr-service-v1/ocr-service-server-v1/pom.xml @@ -26,7 +26,7 @@ com.iqser.red.commons pdftron-logic-commons - dev_red4875_2_4dc4d + dev_red4875_a3a2a From fd9241989540e9056d4882376e1b583f5649cd23 Mon Sep 17 00:00:00 2001 From: Thomas Beyer Date: Mon, 20 Mar 2023 10:01:33 +0100 Subject: [PATCH 3/8] RED-4875 - set version of common pdftron logics to newest and move PdfTextExtraction to this new repo --- ocr-service-v1/ocr-service-server-v1/pom.xml | 2 +- .../v1/server/OcrServiceIntegrationTest.java | 2 +- .../InvisibleElementRemovalServiceTest.java | 3 +- .../v1/server/utils/PdfTextExtraction.java | 35 ------------------- 4 files changed, 4 insertions(+), 38 deletions(-) delete mode 100644 ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/utils/PdfTextExtraction.java diff --git a/ocr-service-v1/ocr-service-server-v1/pom.xml b/ocr-service-v1/ocr-service-server-v1/pom.xml index 1854007..44641c5 100644 --- a/ocr-service-v1/ocr-service-server-v1/pom.xml +++ b/ocr-service-v1/ocr-service-server-v1/pom.xml @@ -26,7 +26,7 @@ com.iqser.red.commons pdftron-logic-commons - dev_red4875_a3a2a + dev_red4875_e8b89 diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java index e719877..6d0f5f8 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java +++ b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java @@ -1,7 +1,7 @@ package com.iqser.red.service.ocr.v1.server; +import static com.iqser.red.pdftronlogic.commons.PdfTextExtraction.extractAllTextFromDocument; import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory; -import static com.iqser.red.service.ocr.v1.server.utils.PdfTextExtraction.extractAllTextFromDocument; import static org.assertj.core.api.Assertions.assertThat; import java.io.FileInputStream; diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalServiceTest.java b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalServiceTest.java index 39d979e..32d8875 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalServiceTest.java +++ b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalServiceTest.java @@ -1,7 +1,7 @@ package com.iqser.red.service.ocr.v1.server.service; +import static com.iqser.red.pdftronlogic.commons.PdfTextExtraction.extractAllTextFromDocument; import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory; -import static com.iqser.red.service.ocr.v1.server.utils.PdfTextExtraction.extractAllTextFromDocument; import static org.assertj.core.api.AssertionsForClassTypes.assertThat; import java.io.FileInputStream; @@ -46,5 +46,6 @@ public class InvisibleElementRemovalServiceTest extends AbstractTest { String[] text = extractAllTextFromDocument(fileStream).split("\n"); assertThat(text).contains("APPENDIX 16 Pathology Report", "Amendment 1", "Page 255 of 260"); } + } } \ No newline at end of file diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/utils/PdfTextExtraction.java b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/utils/PdfTextExtraction.java deleted file mode 100644 index c3f195d..0000000 --- a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/utils/PdfTextExtraction.java +++ /dev/null @@ -1,35 +0,0 @@ -package com.iqser.red.service.ocr.v1.server.utils; - -import java.io.IOException; -import java.io.InputStream; -import java.util.ArrayList; -import java.util.List; - -import com.pdftron.common.PDFNetException; -import com.pdftron.pdf.PDFDoc; -import com.pdftron.pdf.Page; -import com.pdftron.pdf.PageIterator; -import com.pdftron.pdf.TextExtractor; - - -public class PdfTextExtraction { - - public static String extractAllTextFromDocument(InputStream fileStream) throws IOException, PDFNetException { - - PDFDoc pdfDoc = new PDFDoc(fileStream); - TextExtractor extractor = new TextExtractor(); - List texts = new ArrayList<>(); - - PageIterator iterator = pdfDoc.getPageIterator(); - while (iterator.hasNext()) { - Page page = iterator.next(); - extractor.begin(page); - texts.add(extractor.getAsText()); - } - - extractor.destroy(); - pdfDoc.close(); - return String.join("\n", texts); - } - -} From 5efa0e96a8bdbd8339488070a9ef0f5e3d6cedc5 Mon Sep 17 00:00:00 2001 From: Thomas Beyer Date: Mon, 20 Mar 2023 11:25:38 +0100 Subject: [PATCH 4/8] RED-4875 - update version of pdftron logic commons to newest --- ocr-service-v1/ocr-service-server-v1/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocr-service-v1/ocr-service-server-v1/pom.xml b/ocr-service-v1/ocr-service-server-v1/pom.xml index 44641c5..1be9a09 100644 --- a/ocr-service-v1/ocr-service-server-v1/pom.xml +++ b/ocr-service-v1/ocr-service-server-v1/pom.xml @@ -26,7 +26,7 @@ com.iqser.red.commons pdftron-logic-commons - dev_red4875_e8b89 + dev_red4875_392db From b0c4c25bec2816cd3f61a6c60c829de5901db5f2 Mon Sep 17 00:00:00 2001 From: Thomas Beyer Date: Tue, 21 Mar 2023 12:28:57 +0100 Subject: [PATCH 5/8] RED-4875 - update version of pdftron-commons to newest --- ocr-service-v1/ocr-service-server-v1/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocr-service-v1/ocr-service-server-v1/pom.xml b/ocr-service-v1/ocr-service-server-v1/pom.xml index 1be9a09..328ab9d 100644 --- a/ocr-service-v1/ocr-service-server-v1/pom.xml +++ b/ocr-service-v1/ocr-service-server-v1/pom.xml @@ -26,7 +26,7 @@ com.iqser.red.commons pdftron-logic-commons - dev_red4875_392db + dev_red4875_c6ed6 From dd12611fdc342e3ce17dba19863ea41f0078a68a Mon Sep 17 00:00:00 2001 From: Thomas Beyer Date: Tue, 21 Mar 2023 13:18:41 +0100 Subject: [PATCH 6/8] RED-4875 - removed duration log --- .../red/service/ocr/v1/server/service/OCRService.java | 7 ------- 1 file changed, 7 deletions(-) diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java index 69cbca5..0b91cb0 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java @@ -69,14 +69,7 @@ public class OCRService { try (ByteArrayOutputStream transferOutputStream = new ByteArrayOutputStream()) { try (InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId)) { - long removalStart = System.currentTimeMillis(); - log.info("Start invisible element removal for file with dossierId {} and fileId {}", dossierId, fileId); invisibleElementRemovalService.removeInvisibleElements(fileStream, transferOutputStream, false); - long removalEnd = System.currentTimeMillis(); - log.info("Invisible element removal successful for file with dossierId {} and fileId {}, took {}s", - dossierId, - fileId, - format("%.1f", (removalEnd - removalStart) / 1000.0)); } try (InputStream transferInputStream = new ByteArrayInputStream(transferOutputStream.toByteArray())) { long ocrStart = System.currentTimeMillis(); From 6ac71f6d948176f6ddb14d286e5da0eecd48d0c6 Mon Sep 17 00:00:00 2001 From: Thomas Beyer Date: Tue, 21 Mar 2023 17:00:41 +0100 Subject: [PATCH 7/8] RED-4875 - set version of pdftron-common-logics to newest (release) --- ocr-service-v1/ocr-service-server-v1/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocr-service-v1/ocr-service-server-v1/pom.xml b/ocr-service-v1/ocr-service-server-v1/pom.xml index 328ab9d..052cade 100644 --- a/ocr-service-v1/ocr-service-server-v1/pom.xml +++ b/ocr-service-v1/ocr-service-server-v1/pom.xml @@ -26,7 +26,7 @@ com.iqser.red.commons pdftron-logic-commons - dev_red4875_c6ed6 + 1.1.0 From 036203c24a1f5eb1588e945eb020a666cac5dba2 Mon Sep 17 00:00:00 2001 From: Thomas Beyer Date: Tue, 21 Mar 2023 18:20:26 +0100 Subject: [PATCH 8/8] RED-4875 - delete commented out classes --- .../v1/server/model/ClippingPathStack.java | 68 --- .../ocr/v1/server/model/ElementFeatures.java | 170 ------- .../InvisibleElementRemovalService.java | 466 ------------------ 3 files changed, 704 deletions(-) delete mode 100644 ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ClippingPathStack.java delete mode 100644 ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ElementFeatures.java delete mode 100644 ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ClippingPathStack.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ClippingPathStack.java deleted file mode 100644 index fec3727..0000000 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ClippingPathStack.java +++ /dev/null @@ -1,68 +0,0 @@ -//package com.iqser.red.service.ocr.v1.server.model; -// -//import static com.iqser.red.service.ocr.v1.server.service.InvisibleElementRemovalService.TOLERANCE; -// -//import java.awt.geom.Area; -//import java.awt.geom.GeneralPath; -//import java.awt.geom.Rectangle2D; -//import java.util.Deque; -//import java.util.LinkedList; -// -//import com.pdftron.pdf.Rect; -// -//import lombok.Data; -//import lombok.SneakyThrows; -// -//@Data -//public class ClippingPathStack { -// -// private Deque stack = new LinkedList<>(); -// -// -// @SneakyThrows -// public ClippingPathStack(Rect rectangle) { -// -// stack.push(new Area(new Rectangle2D.Double(rectangle.getX1(), rectangle.getY1(), rectangle.getWidth(), rectangle.getHeight()).getBounds2D())); -// } -// -// -// @SneakyThrows -// public void intersectClippingPath(GeneralPath path) { -// -// getCurrentClippingPath().intersect(new Area(path)); -// } -// -// -// public boolean almostIntersects(double x, double y, double width, double height) { -// // To address inconsistencies in the calculation of the bounding box we slightly increase the rectangle -// // Height or width are zero for straight lines, even though they are being rendered. Therefore, height or width must be at minimum >0. -// -// double x_with_tolerance = x > 0 ? x - TOLERANCE : x + TOLERANCE; -// double y_with_tolerance = y > 0 ? y - TOLERANCE : y + TOLERANCE; -// double width_with_tolerance = width + (2 * TOLERANCE); -// double height_with_tolerance = height + (2 * TOLERANCE); -// return getCurrentClippingPath().intersects(x_with_tolerance, y_with_tolerance, width_with_tolerance, height_with_tolerance); -// } -// -// -// public Area getCurrentClippingPath() { -// -// return stack.peek(); -// } -// -// -// public void enterNewGState() { -// -// Area current = stack.peek(); -// Area cloned = new Area(); -// cloned.add(current); -// stack.push(cloned); -// } -// -// -// public void leaveGState() { -// -// stack.pop(); -// } -// -//} \ No newline at end of file diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ElementFeatures.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ElementFeatures.java deleted file mode 100644 index d6e24cf..0000000 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ElementFeatures.java +++ /dev/null @@ -1,170 +0,0 @@ -//package com.iqser.red.service.ocr.v1.server.model; -// -//import static com.iqser.red.service.ocr.v1.server.service.InvisibleElementRemovalService.TOLERANCE; -// -//import java.awt.geom.Rectangle2D; -// -//import com.pdftron.common.PDFNetException; -//import com.pdftron.pdf.Element; -//import com.pdftron.pdf.Rect; -// -//import lombok.AccessLevel; -//import lombok.EqualsAndHashCode; -//import lombok.Getter; -//import lombok.SneakyThrows; -//import lombok.experimental.FieldDefaults; -//import lombok.experimental.SuperBuilder; -// -//@Getter -//@SuperBuilder -//@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) -//public class ElementFeatures { -// -// int elementType; -// Rectangle2D boundingBox; -// -// -// public boolean almostMatches(Element element) throws PDFNetException { -// -// return element.getType() == elementType && // -// element.getBBox() != null && // -// rectsAlmostMatch(element.getBBox()); -// } -// -// -// protected boolean almostEqual(double a, double b) { -// -// return Math.abs(a - b) < TOLERANCE; -// } -// -// -// @SneakyThrows -// private boolean rectsAlmostMatch(Rect bBox) { -// // To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance -// -// return almostEqual(bBox.getX1(), boundingBox.getX()) && // -// almostEqual(bBox.getY1(), boundingBox.getY()) && // -// almostEqual(bBox.getWidth(), boundingBox.getWidth()) && // -// almostEqual(bBox.getHeight(), boundingBox.getHeight()); -// } -// -// -// @EqualsAndHashCode(callSuper = true) -// @Getter -// @SuperBuilder -// @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) -// private static class Text extends ElementFeatures { -// -// String text; -// int font; -// double fontsize; -// -// -// @Override -// public boolean almostMatches(Element element) throws PDFNetException { -// -// return super.almostMatches(element) && // -// text.equals(element.getTextString()) && // -// font == element.getGState().getFont().getType() && // -// almostEqual(fontsize, element.getGState().getFontSize()); -// } -// -// } -// -// @EqualsAndHashCode(callSuper = true) -// @Getter -// @SuperBuilder -// @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) -// private static class Path extends ElementFeatures { -// -// boolean isClippingPath; -// boolean isClipWindingFill; -// boolean isStroked; -// boolean isFilled; -// boolean isWindingFill; -// -// -// @Override -// public boolean almostMatches(Element element) throws PDFNetException { -// -// return super.almostMatches(element) && // -// isClippingPath == element.isClippingPath() && // -// isClipWindingFill == element.isClipWindingFill() && // -// isStroked == element.isStroked() && // -// isFilled == element.isFilled() && // -// isWindingFill == element.isWindingFill(); -// -// } -// -// } -// -// @EqualsAndHashCode(callSuper = true) -// @Getter -// @SuperBuilder -// @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) -// private static class Image extends ElementFeatures { -// -// int dataSize; -// int height; -// int width; -// int renderingIntent; -// int componentNum; -// int bitsPerComponent; -// -// -// @Override -// public boolean almostMatches(Element element) throws PDFNetException { -// -// return super.almostMatches(element) && // -// dataSize == element.getImageDataSize() && // -// height == element.getImageHeight() && // -// width == element.getImageWidth() && // -// renderingIntent == element.getImageRenderingIntent() && // -// componentNum == element.getComponentNum() && // -// bitsPerComponent == element.getBitsPerComponent(); -// } -// -// } -// -// -// public static ElementFeatures extractFeatures(Element element) throws PDFNetException { -// -// return switch (element.getType()) { -// case Element.e_path -> Path.builder() -// .elementType(element.getType()) -// .boundingBox(toRectangle2D(element.getBBox())) -// .isClippingPath(element.isClippingPath()) -// .isClipWindingFill(element.isClipWindingFill()) -// .isStroked(element.isStroked()) -// .isFilled(element.isFilled()) -// .isWindingFill(element.isWindingFill()) -// .build(); -// case Element.e_text -> Text.builder() -// .elementType(element.getType()) -// .boundingBox(toRectangle2D(element.getBBox())) -// .text(element.getTextString()) -// .font(element.getGState().getFont().getType()) -// .fontsize(element.getGState().getFontSize()) -// .build(); -// case Element.e_image, Element.e_inline_image -> Image.builder() -// .elementType(element.getType()) -// .boundingBox(toRectangle2D(element.getBBox())) -// .dataSize(element.getImageDataSize()) -// .height(element.getImageHeight()) -// .width(element.getImageWidth()) -// .renderingIntent(element.getImageRenderingIntent()) -// .componentNum(element.getComponentNum()) -// .bitsPerComponent(element.getBitsPerComponent()) -// .build(); -// // This technically should never happen, it's a safetynet -// default -> throw new RuntimeException("Feature Extraction is not supported for PDFTron.Element with type: " + element.getType()); -// }; -// } -// -// -// private static Rectangle2D toRectangle2D(Rect rect) throws PDFNetException { -// -// return new Rectangle2D.Double(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight()); -// } -// -//} diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java deleted file mode 100644 index 9b49b6e..0000000 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java +++ /dev/null @@ -1,466 +0,0 @@ -//package com.iqser.red.service.ocr.v1.server.service; -// -//import java.awt.Shape; -//import java.awt.geom.AffineTransform; -//import java.awt.geom.GeneralPath; -//import java.awt.geom.Rectangle2D; -//import java.io.InputStream; -//import java.io.OutputStream; -//import java.util.ArrayList; -//import java.util.Iterator; -//import java.util.List; -//import java.util.Set; -//import java.util.TreeSet; -// -//import org.springframework.stereotype.Service; -// -//import com.google.common.primitives.Bytes; -//import com.google.common.primitives.Doubles; -//import com.iqser.red.service.ocr.v1.server.model.ClippingPathStack; -//import com.iqser.red.service.ocr.v1.server.model.ElementFeatures; -//import com.pdftron.common.Matrix2D; -//import com.pdftron.common.PDFNetException; -//import com.pdftron.pdf.ColorPt; -//import com.pdftron.pdf.ColorSpace; -//import com.pdftron.pdf.Element; -//import com.pdftron.pdf.ElementBuilder; -//import com.pdftron.pdf.ElementReader; -//import com.pdftron.pdf.ElementWriter; -//import com.pdftron.pdf.GState; -//import com.pdftron.pdf.PDFDoc; -//import com.pdftron.pdf.Page; -//import com.pdftron.pdf.PageIterator; -//import com.pdftron.pdf.PathData; -//import com.pdftron.pdf.Rect; -//import com.pdftron.sdf.Obj; -//import com.pdftron.sdf.SDFDoc; -// -//import lombok.Builder; -//import lombok.SneakyThrows; -//import lombok.extern.slf4j.Slf4j; -// -//@Slf4j -//@Service -//public class InvisibleElementRemovalService { -// -// static public final double TOLERANCE = 1e-3; -// -// -// /** -// * Removes all hidden Text, Path and Image Elements from a PDF Document. -// * handled cases: -// * -Text which is transparent or is set to not render -// * -Elements outside of clipping path -// * -Elements that have been painted over by visible and filled Paths -// * unhandled cases: -// * -Elements covered by widely stroked path -// * -Elements with the same color as background -// * -Any Text set to clipping with its many interactions with other elements -// * -// * @param pdfFile The PDF file to process -// * @param delta If this flag is set only the removed Elements will be written to the output file. -// * The Elements are red if they are removed by clipping path, blue for transparency, and a green bounding box for overlap. -// * @param out OutputStream to write the resulting file to -// **/ -// @SneakyThrows -// public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta) { -// -// PDFDoc pdfDoc = new PDFDoc(pdfFile); -// -// ElementWriter writer = new ElementWriter(); -// ElementReader reader = new ElementReader(); -// Set visitedXObjIds = new TreeSet<>(); -// -// for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) { -// -// Page page = iterator.next(); -// -// visitedXObjIds.add(page.getSDFObj().getObjNum()); -// -// -// InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder() -// .reader(reader) -// .clippingPathStack(new ClippingPathStack(page.getMediaBox())) -// .delta(delta) -// .overlappedElements(new ArrayList<>()) -// .visibleElements(new ArrayList<>()) -// .visitedXObjIds(visitedXObjIds) -// .build(); -// -// removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context); -// -// context.visitedXObjIds().clear(); -// -// removeOverlappedElements(page, writer, context); -// } -// -// try { -// pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null); -// } catch (Exception e) { -// log.error("File could not be saved after invisible element removal"); -// throw new RuntimeException(e); -// } -// -// writer.destroy(); -// reader.destroy(); -// pdfDoc.close(); -// } -// -// -// private void removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(Page page, -// ElementWriter writer, -// InvisibleElementRemovalContext context) throws PDFNetException { -// -// context.reader().begin(page); -// writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict()); -// processElements(writer, context); -// writer.end(); -// context.reader().end(); -// } -// -// -// private void processElements(ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { -// -// for (Element element = context.reader().next(); element != null; element = context.reader().next()) -// switch (element.getType()) { -// case Element.e_image, Element.e_inline_image -> processImages(element, writer, context); -// case Element.e_text -> processText(element, writer, context); -// case Element.e_path -> processPath(element, writer, context); -// case Element.e_form -> processForm(element, writer, context); -// case Element.e_group_begin -> { -// context.clippingPathStack().enterNewGState(); -// writer.writeElement(element); -// } -// case Element.e_group_end -> { -// context.clippingPathStack().leaveGState(); -// writer.writeElement(element); -// } -// default -> writer.writeElement(element); -// } -// } -// -// -// private void processImages(Element imageElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { -// -// Rect rect = imageElement.getBBox(); -// -// if (rect == null) { -// return; -// } -// -// boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight()); -// -// if (!context.delta() && inClippingPath) { -// context.visibleElements().add(ElementFeatures.extractFeatures(imageElement)); -// } -// -// if (context.delta() ^ inClippingPath) { -// writer.writeElement(imageElement); -// } -// } -// -// -// private void processText(Element textElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { -// -// Rect rect = textElement.getBBox(); -// -// if (rect == null) { -// writer.writeElement(textElement); -// return; -// } -// -// GState gState = textElement.getGState(); -// -// boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight()); -// -// boolean isTextVisible = isTextRenderedVisibly(gState); -// -// if (inClippingPath && isTextVisible) { -// context.visibleElements().add(ElementFeatures.extractFeatures(textElement)); -// } -// if (!context.delta()) { -// if (inClippingPath && isTextVisible) { -// writer.writeElement(textElement); -// } else if (textElement.hasTextMatrix()) { -// /* -// PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element. -// hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands. -// Therefore, the position of a following Tj is affected by not writing the first Element. -// This is why, we write only the Tm command: -// */ -// writer.writeGStateChanges(textElement); -// } -// } else { -// if (!inClippingPath) { -// gState.setFillColorSpace(ColorSpace.createDeviceRGB()); -// // red for elements removed by clipping path -// gState.setFillColor(new ColorPt(1, 0, 0)); -// writer.writeElement(textElement); -// } -// if (!isTextVisible) { -// gState.setFillColorSpace(ColorSpace.createDeviceRGB()); -// // blue for elements removed due to transparency or not rendered -// gState.setFillColor(new ColorPt(0, 0, 1)); -// gState.setTextRenderMode(GState.e_fill_text); -// gState.setFillOpacity(1); -// writer.writeElement(textElement); -// } -// } -// } -// -// -// private void processForm(Element formElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { -// -// writer.writeElement(formElement); -// Obj formObj = formElement.getXObject(); -// -// if (!context.visitedXObjIds().contains(formObj.getObjNum())) { -// context.visitedXObjIds().add(formObj.getObjNum()); -// // writer needs to be newly initialized when entering a new content stream -// // see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest) -// ElementWriter formWriter = new ElementWriter(); -// context.reader().formBegin(); -// formWriter.begin(formObj); -// -// context.reader().clearChangeList(); -// formWriter.setDefaultGState(context.reader()); -// -// processElements(formWriter, context); -// formWriter.end(); -// formWriter.destroy(); -// context.reader().end(); -// } -// } -// -// -// private void processPath(Element pathElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { -// -// PathData pathData = pathElement.getPathData(); -// -// if (pathData.getOperators().length == 0 && pathData.getPoints().length == 0) { -// writer.writeGStateChanges(pathElement); -// return; -// } -// -// GeneralPath linePath = convertToGeneralPath(pathData); -// -// //transform path to initial user space -// var ctm = pathElement.getCTM(); -// var affineTransform = toAffineTransform(ctm); -// linePath.transform(affineTransform); -// -// var rect = linePath.getBounds2D(); -// -// boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX(), rect.getY(), rect.getWidth(), rect.getHeight()); -// -// if (pathElement.isClippingPath()) { -// if (pathElement.isClipWindingFill()) { -// linePath.setWindingRule(GeneralPath.WIND_NON_ZERO); -// } else { -// linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD); -// } -// -// context.clippingPathStack().intersectClippingPath(linePath); -// pathElement.setPathClip(!context.delta()); -// writer.writeElement(pathElement); -// -// } else { -// if (pathElement.isWindingFill()) { -// linePath.setWindingRule(GeneralPath.WIND_NON_ZERO); -// } else { -// linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD); -// } -// -// if (inClippingPath) { -// if (isFilledAndNonTransparent(pathElement)) { -// List currentOverlappedElements = context.visibleElements() -// .stream() -// .filter(features -> almostContains(linePath, features.getBoundingBox())) -// .toList(); -// context.overlappedElements().addAll(currentOverlappedElements); -// context.visibleElements().removeAll(currentOverlappedElements); -// } -// context.visibleElements().add(ElementFeatures.extractFeatures(pathElement)); -// if (!context.delta()) { -// writer.writeElement(pathElement); -// } -// } -// if (context.delta() && !inClippingPath) { -// pathElement.getGState().setFillColorSpace(ColorSpace.createDeviceRGB()); -// pathElement.getGState().setFillColor(new ColorPt(1, 0, 0)); -// pathElement.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB()); -// pathElement.getGState().setStrokeColor(new ColorPt(1, 0, 0)); -// writer.writeElement(pathElement); -// } -// } -// } -// -// -// private void removeOverlappedElements(Page page, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { -// -// context.reader().begin(page); -// writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict()); -// if (context.delta()) { -// // green for element removed due to overlapping -// context.overlappedElements().forEach(feature -> drawBBox(writer, feature.getBoundingBox(), "#00FF00")); -// context.overlappedElements().clear(); -// } -// processOverlappedElements(writer, context); -// writer.end(); -// context.reader().end(); -// -// if (context.overlappedElements().size() > 0) { -// log.warn(context.overlappedElements().size() + " overlapped elements have not been found or removed"); -// } -// } -// -// -// private void processOverlappedElements(ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { -// -// for (Element element = context.reader().next(); element != null; element = context.reader().next()) { -// switch (element.getType()) { -// case Element.e_form -> processFormOverlappedElements(writer, element, context); -// case Element.e_path, Element.e_image, Element.e_inline_image, Element.e_text -> { -// boolean anyMatch = false; -// for (ElementFeatures elementToRemove : context.overlappedElements()) { -// if (elementToRemove.almostMatches(element)) { -// context.overlappedElements().remove(elementToRemove); -// anyMatch = true; -// break; -// } -// } -// if (!anyMatch) { -// writer.writeElement(element); -// } else if (element.getType() == 3 && element.hasTextMatrix()) { -// /* -// PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element. -// hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands. -// Therefore, the position of a following Tj is affected by not writing the first Element. -// This is why, we write only the Tm command: -// */ -// writer.writeGStateChanges(element); -// } -// } -// default -> writer.writeElement(element); -// } -// } -// } -// -// -// private void processFormOverlappedElements(ElementWriter writer, Element formElement, InvisibleElementRemovalContext context) throws PDFNetException { -// -// writer.writeElement(formElement); -// Obj formObj = formElement.getXObject(); -// -// if (!context.visitedXObjIds().contains(formObj.getObjNum())) { -// context.visitedXObjIds().add(formObj.getObjNum()); -// // writer needs to be newly initialized when entering a new content stream -// // see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest) -// ElementWriter formWriter = new ElementWriter(); -// context.reader().formBegin(); -// formWriter.begin(formObj); -// -// context.reader().clearChangeList(); -// formWriter.setDefaultGState(context.reader()); -// -// processOverlappedElements(formWriter, context); -// formWriter.end(); -// formWriter.destroy(); -// context.reader().end(); -// } -// } -// -// -// private boolean isTextRenderedVisibly(GState gState) throws PDFNetException { -// -// return gState.getTextRenderMode() != GState.e_invisible_text && // -// !(gState.getTextRenderMode() == GState.e_fill_text && gState.getFillOpacity() == 0) && // -// !(gState.getTextRenderMode() == GState.e_stroke_text && gState.getStrokeOpacity() == 0) && // -// !(gState.getTextRenderMode() == GState.e_fill_stroke_text && gState.getFillOpacity() == 0 && gState.getStrokeOpacity() == 0); -// } -// -// -// private GeneralPath convertToGeneralPath(PathData pathData) throws PDFNetException { -// -// GeneralPath linePath = new GeneralPath(); -// Iterator points = Doubles.asList(pathData.getPoints()).iterator(); -// Iterable operators = Bytes.asList(pathData.getOperators()); -// for (var operator : operators) { -// switch (operator) { -// case PathData.e_moveto -> linePath.moveTo(points.next(), points.next()); -// case PathData.e_lineto -> linePath.lineTo(points.next(), points.next()); -// case PathData.e_cubicto -> linePath.curveTo(points.next(), points.next(), points.next(), points.next(), points.next(), points.next()); -// case PathData.e_closepath -> linePath.closePath(); -// case PathData.e_rect -> { -// double x = points.next(); -// double y = points.next(); -// double w = points.next(); -// double h = points.next(); -// linePath.moveTo(x, y); -// linePath.lineTo(x + w, y); -// linePath.lineTo(x + w, y + h); -// linePath.lineTo(x, y + h); -// linePath.closePath(); -// } -// default -> throw new PDFNetException("Invalid Element Type", 0, "", "", ""); -// } -// } -// return linePath; -// } -// -// -// private boolean almostContains(Shape outer, Rectangle2D inner) { -// //To address inconsistencies in the calculation of the bounding box we slightly shrink the inner rectangle -// -// double x_with_tolerance = inner.getX() >= 0 ? inner.getX() + TOLERANCE : inner.getX() - TOLERANCE; -// double y_with_tolerance = inner.getY() >= 0 ? inner.getY() + TOLERANCE : inner.getY() - TOLERANCE; -// double height_with_tolerance = inner.getHeight() - (2 * TOLERANCE); -// double width_with_tolerance = inner.getWidth() - (2 * TOLERANCE); -// Rectangle2D innerRect = new Rectangle2D.Double(x_with_tolerance, y_with_tolerance, width_with_tolerance, height_with_tolerance); -// -// return outer.contains(innerRect); -// } -// -// -// private boolean isFilledAndNonTransparent(Element element) throws PDFNetException { -// -// return element.isFilled() && element.getGState().getFillOpacity() == 1; -// } -// -// -// @SneakyThrows -// private void drawBBox(ElementWriter writer, Rectangle2D r, String hexcolor) { -// -// ColorPt colorPt = new ColorPt(Integer.valueOf(hexcolor.substring(1, 3), 16) / 255d, -// Integer.valueOf(hexcolor.substring(3, 5), 16) / 255d, -// Integer.valueOf(hexcolor.substring(5, 7), 16) / 255d); -// ElementBuilder eb = new ElementBuilder(); -// Element rect = eb.createRect(r.getX(), r.getY(), r.getWidth(), r.getHeight()); -// rect.setPathStroke(true); -// rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB()); -// rect.getGState().setStrokeColor(colorPt); -// writer.writePlacedElement(rect); -// -// colorPt.destroy(); -// eb.destroy(); -// } -// -// -// private static AffineTransform toAffineTransform(Matrix2D ctm) throws PDFNetException { -// -// return new AffineTransform(ctm.getA(), ctm.getB(), ctm.getC(), ctm.getD(), ctm.getH(), ctm.getV()); -// } -// -// -// @Builder -// private record InvisibleElementRemovalContext( -// boolean delta, -// ElementReader reader, -// ClippingPathStack clippingPathStack, -// List overlappedElements, -// List visibleElements, -// Set visitedXObjIds) { -// -// } -// -//} \ No newline at end of file