diff --git a/ocr-service-v1/ocr-service-server-v1/pom.xml b/ocr-service-v1/ocr-service-server-v1/pom.xml
index 38dd26f..052cade 100644
--- a/ocr-service-v1/ocr-service-server-v1/pom.xml
+++ b/ocr-service-v1/ocr-service-server-v1/pom.xml
@@ -23,6 +23,12 @@
com.iqser.red.commons
storage-commons
+
+ com.iqser.red.commons
+ pdftron-logic-commons
+ 1.1.0
+
+
com.iqser.red.commons
spring-commons
diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/Application.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/Application.java
index 3d65e1b..bfa6c70 100644
--- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/Application.java
+++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/Application.java
@@ -10,6 +10,7 @@ import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Import;
import org.springframework.scheduling.annotation.EnableAsync;
+import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService;
import com.iqser.red.service.ocr.v1.server.client.FileStatusProcessingUpdateClient;
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
import com.iqser.red.service.ocr.v1.server.multitenancy.AsyncConfig;
@@ -44,4 +45,11 @@ public class Application {
return new TimedAspect(registry);
}
+
+ @Bean
+ public InvisibleElementRemovalService invisibleElementRemovalService() {
+
+ return new InvisibleElementRemovalService();
+ }
+
}
diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ClippingPathStack.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ClippingPathStack.java
deleted file mode 100644
index 5e3c36a..0000000
--- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ClippingPathStack.java
+++ /dev/null
@@ -1,68 +0,0 @@
-package com.iqser.red.service.ocr.v1.server.model;
-
-import static com.iqser.red.service.ocr.v1.server.service.InvisibleElementRemovalService.TOLERANCE;
-
-import java.awt.geom.Area;
-import java.awt.geom.GeneralPath;
-import java.awt.geom.Rectangle2D;
-import java.util.Deque;
-import java.util.LinkedList;
-
-import com.pdftron.pdf.Rect;
-
-import lombok.Data;
-import lombok.SneakyThrows;
-
-@Data
-public class ClippingPathStack {
-
- private Deque stack = new LinkedList<>();
-
-
- @SneakyThrows
- public ClippingPathStack(Rect rectangle) {
-
- stack.push(new Area(new Rectangle2D.Double(rectangle.getX1(), rectangle.getY1(), rectangle.getWidth(), rectangle.getHeight()).getBounds2D()));
- }
-
-
- @SneakyThrows
- public void intersectClippingPath(GeneralPath path) {
-
- getCurrentClippingPath().intersect(new Area(path));
- }
-
-
- public boolean almostIntersects(double x, double y, double width, double height) {
- // To address inconsistencies in the calculation of the bounding box we slightly increase the rectangle
- // Height or width are zero for straight lines, even though they are being rendered. Therefore, height or width must be at minimum >0.
-
- double x_with_tolerance = x > 0 ? x - TOLERANCE : x + TOLERANCE;
- double y_with_tolerance = y > 0 ? y - TOLERANCE : y + TOLERANCE;
- double width_with_tolerance = width + (2 * TOLERANCE);
- double height_with_tolerance = height + (2 * TOLERANCE);
- return getCurrentClippingPath().intersects(x_with_tolerance, y_with_tolerance, width_with_tolerance, height_with_tolerance);
- }
-
-
- public Area getCurrentClippingPath() {
-
- return stack.peek();
- }
-
-
- public void enterNewGState() {
-
- Area current = stack.peek();
- Area cloned = new Area();
- cloned.add(current);
- stack.push(cloned);
- }
-
-
- public void leaveGState() {
-
- stack.pop();
- }
-
-}
\ No newline at end of file
diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ElementFeatures.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ElementFeatures.java
deleted file mode 100644
index 87c625c..0000000
--- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/ElementFeatures.java
+++ /dev/null
@@ -1,170 +0,0 @@
-package com.iqser.red.service.ocr.v1.server.model;
-
-import static com.iqser.red.service.ocr.v1.server.service.InvisibleElementRemovalService.TOLERANCE;
-
-import java.awt.geom.Rectangle2D;
-
-import com.pdftron.common.PDFNetException;
-import com.pdftron.pdf.Element;
-import com.pdftron.pdf.Rect;
-
-import lombok.AccessLevel;
-import lombok.EqualsAndHashCode;
-import lombok.Getter;
-import lombok.SneakyThrows;
-import lombok.experimental.FieldDefaults;
-import lombok.experimental.SuperBuilder;
-
-@Getter
-@SuperBuilder
-@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
-public class ElementFeatures {
-
- int elementType;
- Rectangle2D boundingBox;
-
-
- public boolean almostMatches(Element element) throws PDFNetException {
-
- return element.getType() == elementType && //
- element.getBBox() != null && //
- rectsAlmostMatch(element.getBBox());
- }
-
-
- protected boolean almostEqual(double a, double b) {
-
- return Math.abs(a - b) < TOLERANCE;
- }
-
-
- @SneakyThrows
- private boolean rectsAlmostMatch(Rect bBox) {
- // To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance
-
- return almostEqual(bBox.getX1(), boundingBox.getX()) && //
- almostEqual(bBox.getY1(), boundingBox.getY()) && //
- almostEqual(bBox.getWidth(), boundingBox.getWidth()) && //
- almostEqual(bBox.getHeight(), boundingBox.getHeight());
- }
-
-
- @EqualsAndHashCode(callSuper = true)
- @Getter
- @SuperBuilder
- @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
- private static class Text extends ElementFeatures {
-
- String text;
- int font;
- double fontsize;
-
-
- @Override
- public boolean almostMatches(Element element) throws PDFNetException {
-
- return super.almostMatches(element) && //
- text.equals(element.getTextString()) && //
- font == element.getGState().getFont().getType() && //
- almostEqual(fontsize, element.getGState().getFontSize());
- }
-
- }
-
- @EqualsAndHashCode(callSuper = true)
- @Getter
- @SuperBuilder
- @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
- private static class Path extends ElementFeatures {
-
- boolean isClippingPath;
- boolean isClipWindingFill;
- boolean isStroked;
- boolean isFilled;
- boolean isWindingFill;
-
-
- @Override
- public boolean almostMatches(Element element) throws PDFNetException {
-
- return super.almostMatches(element) && //
- isClippingPath == element.isClippingPath() && //
- isClipWindingFill == element.isClipWindingFill() && //
- isStroked == element.isStroked() && //
- isFilled == element.isFilled() && //
- isWindingFill == element.isWindingFill();
-
- }
-
- }
-
- @EqualsAndHashCode(callSuper = true)
- @Getter
- @SuperBuilder
- @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
- private static class Image extends ElementFeatures {
-
- int dataSize;
- int height;
- int width;
- int renderingIntent;
- int componentNum;
- int bitsPerComponent;
-
-
- @Override
- public boolean almostMatches(Element element) throws PDFNetException {
-
- return super.almostMatches(element) && //
- dataSize == element.getImageDataSize() && //
- height == element.getImageHeight() && //
- width == element.getImageWidth() && //
- renderingIntent == element.getImageRenderingIntent() && //
- componentNum == element.getComponentNum() && //
- bitsPerComponent == element.getBitsPerComponent();
- }
-
- }
-
-
- public static ElementFeatures extractFeatures(Element element) throws PDFNetException {
-
- return switch (element.getType()) {
- case Element.e_path -> Path.builder()
- .elementType(element.getType())
- .boundingBox(toRectangle2D(element.getBBox()))
- .isClippingPath(element.isClippingPath())
- .isClipWindingFill(element.isClipWindingFill())
- .isStroked(element.isStroked())
- .isFilled(element.isFilled())
- .isWindingFill(element.isWindingFill())
- .build();
- case Element.e_text -> Text.builder()
- .elementType(element.getType())
- .boundingBox(toRectangle2D(element.getBBox()))
- .text(element.getTextString())
- .font(element.getGState().getFont().getType())
- .fontsize(element.getGState().getFontSize())
- .build();
- case Element.e_image, Element.e_inline_image -> Image.builder()
- .elementType(element.getType())
- .boundingBox(toRectangle2D(element.getBBox()))
- .dataSize(element.getImageDataSize())
- .height(element.getImageHeight())
- .width(element.getImageWidth())
- .renderingIntent(element.getImageRenderingIntent())
- .componentNum(element.getComponentNum())
- .bitsPerComponent(element.getBitsPerComponent())
- .build();
- // This technically should never happen, it's a safetynet
- default -> throw new RuntimeException("Feature Extraction is not supported for PDFTron.Element with type: " + element.getType());
- };
- }
-
-
- private static Rectangle2D toRectangle2D(Rect rect) throws PDFNetException {
-
- return new Rectangle2D.Double(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
- }
-
-}
diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java
deleted file mode 100644
index e64c1fd..0000000
--- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java
+++ /dev/null
@@ -1,466 +0,0 @@
-package com.iqser.red.service.ocr.v1.server.service;
-
-import java.awt.Shape;
-import java.awt.geom.AffineTransform;
-import java.awt.geom.GeneralPath;
-import java.awt.geom.Rectangle2D;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Set;
-import java.util.TreeSet;
-
-import org.springframework.stereotype.Service;
-
-import com.google.common.primitives.Bytes;
-import com.google.common.primitives.Doubles;
-import com.iqser.red.service.ocr.v1.server.model.ClippingPathStack;
-import com.iqser.red.service.ocr.v1.server.model.ElementFeatures;
-import com.pdftron.common.Matrix2D;
-import com.pdftron.common.PDFNetException;
-import com.pdftron.pdf.ColorPt;
-import com.pdftron.pdf.ColorSpace;
-import com.pdftron.pdf.Element;
-import com.pdftron.pdf.ElementBuilder;
-import com.pdftron.pdf.ElementReader;
-import com.pdftron.pdf.ElementWriter;
-import com.pdftron.pdf.GState;
-import com.pdftron.pdf.PDFDoc;
-import com.pdftron.pdf.Page;
-import com.pdftron.pdf.PageIterator;
-import com.pdftron.pdf.PathData;
-import com.pdftron.pdf.Rect;
-import com.pdftron.sdf.Obj;
-import com.pdftron.sdf.SDFDoc;
-
-import lombok.Builder;
-import lombok.SneakyThrows;
-import lombok.extern.slf4j.Slf4j;
-
-@Slf4j
-@Service
-public class InvisibleElementRemovalService {
-
- static public final double TOLERANCE = 1e-3;
-
-
- /**
- * Removes all hidden Text, Path and Image Elements from a PDF Document.
- * handled cases:
- * -Text which is transparent or is set to not render
- * -Elements outside of clipping path
- * -Elements that have been painted over by visible and filled Paths
- * unhandled cases:
- * -Elements covered by widely stroked path
- * -Elements with the same color as background
- * -Any Text set to clipping with its many interactions with other elements
- *
- * @param pdfFile The PDF file to process
- * @param delta If this flag is set only the removed Elements will be written to the output file.
- * The Elements are red if they are removed by clipping path, blue for transparency, and a green bounding box for overlap.
- * @param out OutputStream to write the resulting file to
- **/
- @SneakyThrows
- public void removeInvisibleElements(InputStream pdfFile, OutputStream out, boolean delta) {
-
- PDFDoc pdfDoc = new PDFDoc(pdfFile);
-
- ElementWriter writer = new ElementWriter();
- ElementReader reader = new ElementReader();
- Set visitedXObjIds = new TreeSet<>();
-
- for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
-
- Page page = iterator.next();
-
- visitedXObjIds.add(page.getSDFObj().getObjNum());
-
-
- InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder()
- .reader(reader)
- .clippingPathStack(new ClippingPathStack(page.getMediaBox()))
- .delta(delta)
- .overlappedElements(new ArrayList<>())
- .visibleElements(new ArrayList<>())
- .visitedXObjIds(visitedXObjIds)
- .build();
-
- removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context);
-
- context.visitedXObjIds().clear();
-
- removeOverlappedElements(page, writer, context);
- }
-
- try {
- pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
- } catch (Exception e) {
- log.error("File could not be saved after invisible element removal");
- throw new RuntimeException(e);
- }
-
- writer.destroy();
- reader.destroy();
- pdfDoc.close();
- }
-
-
- private void removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(Page page,
- ElementWriter writer,
- InvisibleElementRemovalContext context) throws PDFNetException {
-
- context.reader().begin(page);
- writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
- processElements(writer, context);
- writer.end();
- context.reader().end();
- }
-
-
- private void processElements(ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
-
- for (Element element = context.reader().next(); element != null; element = context.reader().next())
- switch (element.getType()) {
- case Element.e_image, Element.e_inline_image -> processImages(element, writer, context);
- case Element.e_text -> processText(element, writer, context);
- case Element.e_path -> processPath(element, writer, context);
- case Element.e_form -> processForm(element, writer, context);
- case Element.e_group_begin -> {
- context.clippingPathStack().enterNewGState();
- writer.writeElement(element);
- }
- case Element.e_group_end -> {
- context.clippingPathStack().leaveGState();
- writer.writeElement(element);
- }
- default -> writer.writeElement(element);
- }
- }
-
-
- private void processImages(Element imageElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
-
- Rect rect = imageElement.getBBox();
-
- if (rect == null) {
- return;
- }
-
- boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
-
- if (!context.delta() && inClippingPath) {
- context.visibleElements().add(ElementFeatures.extractFeatures(imageElement));
- }
-
- if (context.delta() ^ inClippingPath) {
- writer.writeElement(imageElement);
- }
- }
-
-
- private void processText(Element textElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
-
- Rect rect = textElement.getBBox();
-
- if (rect == null) {
- writer.writeElement(textElement);
- return;
- }
-
- GState gState = textElement.getGState();
-
- boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
-
- boolean isTextVisible = isTextRenderedVisibly(gState);
-
- if (inClippingPath && isTextVisible) {
- context.visibleElements().add(ElementFeatures.extractFeatures(textElement));
- }
- if (!context.delta()) {
- if (inClippingPath && isTextVisible) {
- writer.writeElement(textElement);
- } else if (textElement.hasTextMatrix()) {
- /*
- PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element.
- hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands.
- Therefore, the position of a following Tj is affected by not writing the first Element.
- This is why, we write only the Tm command:
- */
- writer.writeGStateChanges(textElement);
- }
- } else {
- if (!inClippingPath) {
- gState.setFillColorSpace(ColorSpace.createDeviceRGB());
- // red for elements removed by clipping path
- gState.setFillColor(new ColorPt(1, 0, 0));
- writer.writeElement(textElement);
- }
- if (!isTextVisible) {
- gState.setFillColorSpace(ColorSpace.createDeviceRGB());
- // blue for elements removed due to transparency or not rendered
- gState.setFillColor(new ColorPt(0, 0, 1));
- gState.setTextRenderMode(GState.e_fill_text);
- gState.setFillOpacity(1);
- writer.writeElement(textElement);
- }
- }
- }
-
-
- private void processForm(Element formElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
-
- writer.writeElement(formElement);
- Obj formObj = formElement.getXObject();
-
- if (!context.visitedXObjIds().contains(formObj.getObjNum())) {
- context.visitedXObjIds().add(formObj.getObjNum());
- // writer needs to be newly initialized when entering a new content stream
- // see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
- ElementWriter formWriter = new ElementWriter();
- context.reader().formBegin();
- formWriter.begin(formObj);
-
- context.reader().clearChangeList();
- formWriter.setDefaultGState(context.reader());
-
- processElements(formWriter, context);
- formWriter.end();
- formWriter.destroy();
- context.reader().end();
- }
- }
-
-
- private void processPath(Element pathElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
-
- PathData pathData = pathElement.getPathData();
-
- if (pathData.getOperators().length == 0 && pathData.getPoints().length == 0) {
- writer.writeGStateChanges(pathElement);
- return;
- }
-
- GeneralPath linePath = convertToGeneralPath(pathData);
-
- //transform path to initial user space
- var ctm = pathElement.getCTM();
- var affineTransform = toAffineTransform(ctm);
- linePath.transform(affineTransform);
-
- var rect = linePath.getBounds2D();
-
- boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX(), rect.getY(), rect.getWidth(), rect.getHeight());
-
- if (pathElement.isClippingPath()) {
- if (pathElement.isClipWindingFill()) {
- linePath.setWindingRule(GeneralPath.WIND_NON_ZERO);
- } else {
- linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD);
- }
-
- context.clippingPathStack().intersectClippingPath(linePath);
- pathElement.setPathClip(!context.delta());
- writer.writeElement(pathElement);
-
- } else {
- if (pathElement.isWindingFill()) {
- linePath.setWindingRule(GeneralPath.WIND_NON_ZERO);
- } else {
- linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD);
- }
-
- if (inClippingPath) {
- if (isFilledAndNonTransparent(pathElement)) {
- List currentOverlappedElements = context.visibleElements()
- .stream()
- .filter(features -> almostContains(linePath, features.getBoundingBox()))
- .toList();
- context.overlappedElements().addAll(currentOverlappedElements);
- context.visibleElements().removeAll(currentOverlappedElements);
- }
- context.visibleElements().add(ElementFeatures.extractFeatures(pathElement));
- if (!context.delta()) {
- writer.writeElement(pathElement);
- }
- }
- if (context.delta() && !inClippingPath) {
- pathElement.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
- pathElement.getGState().setFillColor(new ColorPt(1, 0, 0));
- pathElement.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
- pathElement.getGState().setStrokeColor(new ColorPt(1, 0, 0));
- writer.writeElement(pathElement);
- }
- }
- }
-
-
- private void removeOverlappedElements(Page page, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
-
- context.reader().begin(page);
- writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
- if (context.delta()) {
- // green for element removed due to overlapping
- context.overlappedElements().forEach(feature -> drawBBox(writer, feature.getBoundingBox(), "#00FF00"));
- context.overlappedElements().clear();
- }
- processOverlappedElements(writer, context);
- writer.end();
- context.reader().end();
-
- if (context.overlappedElements().size() > 0) {
- log.warn(context.overlappedElements().size() + " overlapped elements have not been found or removed");
- }
- }
-
-
- private void processOverlappedElements(ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
-
- for (Element element = context.reader().next(); element != null; element = context.reader().next()) {
- switch (element.getType()) {
- case Element.e_form -> processFormOverlappedElements(writer, element, context);
- case Element.e_path, Element.e_image, Element.e_inline_image, Element.e_text -> {
- boolean anyMatch = false;
- for (ElementFeatures elementToRemove : context.overlappedElements()) {
- if (elementToRemove.almostMatches(element)) {
- context.overlappedElements().remove(elementToRemove);
- anyMatch = true;
- break;
- }
- }
- if (!anyMatch) {
- writer.writeElement(element);
- } else if (element.getType() == 3 && element.hasTextMatrix()) {
- /*
- PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element.
- hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands.
- Therefore, the position of a following Tj is affected by not writing the first Element.
- This is why, we write only the Tm command:
- */
- writer.writeGStateChanges(element);
- }
- }
- default -> writer.writeElement(element);
- }
- }
- }
-
-
- private void processFormOverlappedElements(ElementWriter writer, Element formElement, InvisibleElementRemovalContext context) throws PDFNetException {
-
- writer.writeElement(formElement);
- Obj formObj = formElement.getXObject();
-
- if (!context.visitedXObjIds().contains(formObj.getObjNum())) {
- context.visitedXObjIds().add(formObj.getObjNum());
- // writer needs to be newly initialized when entering a new content stream
- // see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
- ElementWriter formWriter = new ElementWriter();
- context.reader().formBegin();
- formWriter.begin(formObj);
-
- context.reader().clearChangeList();
- formWriter.setDefaultGState(context.reader());
-
- processOverlappedElements(formWriter, context);
- formWriter.end();
- formWriter.destroy();
- context.reader().end();
- }
- }
-
-
- private boolean isTextRenderedVisibly(GState gState) throws PDFNetException {
-
- return gState.getTextRenderMode() != GState.e_invisible_text && //
- !(gState.getTextRenderMode() == GState.e_fill_text && gState.getFillOpacity() == 0) && //
- !(gState.getTextRenderMode() == GState.e_stroke_text && gState.getStrokeOpacity() == 0) && //
- !(gState.getTextRenderMode() == GState.e_fill_stroke_text && gState.getFillOpacity() == 0 && gState.getStrokeOpacity() == 0);
- }
-
-
- private GeneralPath convertToGeneralPath(PathData pathData) throws PDFNetException {
-
- GeneralPath linePath = new GeneralPath();
- Iterator points = Doubles.asList(pathData.getPoints()).iterator();
- Iterable operators = Bytes.asList(pathData.getOperators());
- for (var operator : operators) {
- switch (operator) {
- case PathData.e_moveto -> linePath.moveTo(points.next(), points.next());
- case PathData.e_lineto -> linePath.lineTo(points.next(), points.next());
- case PathData.e_cubicto -> linePath.curveTo(points.next(), points.next(), points.next(), points.next(), points.next(), points.next());
- case PathData.e_closepath -> linePath.closePath();
- case PathData.e_rect -> {
- double x = points.next();
- double y = points.next();
- double w = points.next();
- double h = points.next();
- linePath.moveTo(x, y);
- linePath.lineTo(x + w, y);
- linePath.lineTo(x + w, y + h);
- linePath.lineTo(x, y + h);
- linePath.closePath();
- }
- default -> throw new PDFNetException("Invalid Element Type", 0, "", "", "");
- }
- }
- return linePath;
- }
-
-
- private boolean almostContains(Shape outer, Rectangle2D inner) {
- //To address inconsistencies in the calculation of the bounding box we slightly shrink the inner rectangle
-
- double x_with_tolerance = inner.getX() >= 0 ? inner.getX() + TOLERANCE : inner.getX() - TOLERANCE;
- double y_with_tolerance = inner.getY() >= 0 ? inner.getY() + TOLERANCE : inner.getY() - TOLERANCE;
- double height_with_tolerance = inner.getHeight() - (2 * TOLERANCE);
- double width_with_tolerance = inner.getWidth() - (2 * TOLERANCE);
- Rectangle2D innerRect = new Rectangle2D.Double(x_with_tolerance, y_with_tolerance, width_with_tolerance, height_with_tolerance);
-
- return outer.contains(innerRect);
- }
-
-
- private boolean isFilledAndNonTransparent(Element element) throws PDFNetException {
-
- return element.isFilled() && element.getGState().getFillOpacity() == 1;
- }
-
-
- @SneakyThrows
- private void drawBBox(ElementWriter writer, Rectangle2D r, String hexcolor) {
-
- ColorPt colorPt = new ColorPt(Integer.valueOf(hexcolor.substring(1, 3), 16) / 255d,
- Integer.valueOf(hexcolor.substring(3, 5), 16) / 255d,
- Integer.valueOf(hexcolor.substring(5, 7), 16) / 255d);
- ElementBuilder eb = new ElementBuilder();
- Element rect = eb.createRect(r.getX(), r.getY(), r.getWidth(), r.getHeight());
- rect.setPathStroke(true);
- rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
- rect.getGState().setStrokeColor(colorPt);
- writer.writePlacedElement(rect);
-
- colorPt.destroy();
- eb.destroy();
- }
-
-
- private static AffineTransform toAffineTransform(Matrix2D ctm) throws PDFNetException {
-
- return new AffineTransform(ctm.getA(), ctm.getB(), ctm.getC(), ctm.getD(), ctm.getH(), ctm.getV());
- }
-
-
- @Builder
- private record InvisibleElementRemovalContext(
- boolean delta,
- ElementReader reader,
- ClippingPathStack clippingPathStack,
- List overlappedElements,
- List visibleElements,
- Set visitedXObjIds) {
-
- }
-
-}
\ No newline at end of file
diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java
index 2c5fe31..0b91cb0 100644
--- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java
+++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/OCRService.java
@@ -14,6 +14,7 @@ import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.stereotype.Service;
import com.fasterxml.jackson.databind.ObjectMapper;
+import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService;
import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse;
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings;
@@ -68,14 +69,7 @@ public class OCRService {
try (ByteArrayOutputStream transferOutputStream = new ByteArrayOutputStream()) {
try (InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId)) {
- long removalStart = System.currentTimeMillis();
- log.debug("Start invisible element removal for file with dossierId {} and fileId {}", dossierId, fileId);
invisibleElementRemovalService.removeInvisibleElements(fileStream, transferOutputStream, false);
- long removalEnd = System.currentTimeMillis();
- log.debug("Invisible element removal successful for file with dossierId {} and fileId {}, took {}s",
- dossierId,
- fileId,
- format("%.1f", (removalEnd - removalStart) / 1000.0));
}
try (InputStream transferInputStream = new ByteArrayInputStream(transferOutputStream.toByteArray())) {
long ocrStart = System.currentTimeMillis();
diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java
index e719877..6d0f5f8 100644
--- a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java
+++ b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/OcrServiceIntegrationTest.java
@@ -1,7 +1,7 @@
package com.iqser.red.service.ocr.v1.server;
+import static com.iqser.red.pdftronlogic.commons.PdfTextExtraction.extractAllTextFromDocument;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
-import static com.iqser.red.service.ocr.v1.server.utils.PdfTextExtraction.extractAllTextFromDocument;
import static org.assertj.core.api.Assertions.assertThat;
import java.io.FileInputStream;
diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalServiceTest.java b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalServiceTest.java
index ce6e3a1..32d8875 100644
--- a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalServiceTest.java
+++ b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalServiceTest.java
@@ -1,7 +1,7 @@
package com.iqser.red.service.ocr.v1.server.service;
+import static com.iqser.red.pdftronlogic.commons.PdfTextExtraction.extractAllTextFromDocument;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
-import static com.iqser.red.service.ocr.v1.server.utils.PdfTextExtraction.extractAllTextFromDocument;
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
import java.io.FileInputStream;
@@ -9,16 +9,18 @@ import java.io.FileOutputStream;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.context.annotation.Bean;
import org.springframework.core.io.ClassPathResource;
+import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService;
import com.iqser.red.service.ocr.v1.server.AbstractTest;
import lombok.SneakyThrows;
public class InvisibleElementRemovalServiceTest extends AbstractTest {
- @Autowired
- private InvisibleElementRemovalService invisibleElementRemovalService;
+ @Autowired
+ private InvisibleElementRemovalService invisibleElementRemovalService;
@Test
@@ -44,5 +46,6 @@ public class InvisibleElementRemovalServiceTest extends AbstractTest {
String[] text = extractAllTextFromDocument(fileStream).split("\n");
assertThat(text).contains("APPENDIX 16 Pathology Report", "Amendment 1", "Page 255 of 260");
}
+
}
}
\ No newline at end of file
diff --git a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/utils/PdfTextExtraction.java b/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/utils/PdfTextExtraction.java
deleted file mode 100644
index c3f195d..0000000
--- a/ocr-service-v1/ocr-service-server-v1/src/test/java/com/iqser/red/service/ocr/v1/server/utils/PdfTextExtraction.java
+++ /dev/null
@@ -1,35 +0,0 @@
-package com.iqser.red.service.ocr.v1.server.utils;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.List;
-
-import com.pdftron.common.PDFNetException;
-import com.pdftron.pdf.PDFDoc;
-import com.pdftron.pdf.Page;
-import com.pdftron.pdf.PageIterator;
-import com.pdftron.pdf.TextExtractor;
-
-
-public class PdfTextExtraction {
-
- public static String extractAllTextFromDocument(InputStream fileStream) throws IOException, PDFNetException {
-
- PDFDoc pdfDoc = new PDFDoc(fileStream);
- TextExtractor extractor = new TextExtractor();
- List texts = new ArrayList<>();
-
- PageIterator iterator = pdfDoc.getPageIterator();
- while (iterator.hasNext()) {
- Page page = iterator.next();
- extractor.begin(page);
- texts.add(extractor.getAsText());
- }
-
- extractor.destroy();
- pdfDoc.close();
- return String.join("\n", texts);
- }
-
-}