From a96260f77fd5b546a5d27d84f34861742f13ddff Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Thu, 2 Feb 2023 13:01:58 +0100 Subject: [PATCH] RED-6019: Remove hidden text when processing OCR *moved InvisibleElementRemovalDto to private inner record of InvisibleElementRemovalService *added comments for color choices --- .../model/InvisibleElementRemovalDto.java | 25 --- .../InvisibleElementRemovalService.java | 148 ++++++++++-------- 2 files changed, 82 insertions(+), 91 deletions(-) delete mode 100644 ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/InvisibleElementRemovalDto.java diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/InvisibleElementRemovalDto.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/InvisibleElementRemovalDto.java deleted file mode 100644 index 6d7f044..0000000 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/model/InvisibleElementRemovalDto.java +++ /dev/null @@ -1,25 +0,0 @@ -package com.iqser.red.service.ocr.v1.server.model; - -import java.util.List; -import java.util.Set; - -import com.pdftron.pdf.ElementReader; - -import lombok.AccessLevel; -import lombok.Builder; -import lombok.Data; -import lombok.experimental.FieldDefaults; - -@Data -@Builder -@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true) -public class InvisibleElementRemovalDto { - - boolean delta; - ElementReader reader; - ClippingPathStack clippingPathStack; - List overlappedElements; - List visibleElements; - Set visitedXObjIds; - -} diff --git a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java index 6252b8c..a473a10 100644 --- a/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java +++ b/ocr-service-v1/ocr-service-server-v1/src/main/java/com/iqser/red/service/ocr/v1/server/service/InvisibleElementRemovalService.java @@ -17,7 +17,6 @@ import com.google.common.primitives.Bytes; import com.google.common.primitives.Doubles; import com.iqser.red.service.ocr.v1.server.model.ClippingPathStack; import com.iqser.red.service.ocr.v1.server.model.ElementFeatures; -import com.iqser.red.service.ocr.v1.server.model.InvisibleElementRemovalDto; import com.pdftron.common.Matrix2D; import com.pdftron.common.PDFNetException; import com.pdftron.pdf.ColorPt; @@ -35,6 +34,7 @@ import com.pdftron.pdf.Rect; import com.pdftron.sdf.Obj; import com.pdftron.sdf.SDFDoc; +import lombok.Builder; import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; @@ -75,7 +75,7 @@ public class InvisibleElementRemovalService { Page page = iterator.next(); visitedXObjIds.add(page.getSDFObj().getObjNum()); - InvisibleElementRemovalDto dto = InvisibleElementRemovalDto.builder() + InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder() .reader(reader) .clippingPathStack(new ClippingPathStack(page.getMediaBox())) .delta(delta) @@ -84,40 +84,42 @@ public class InvisibleElementRemovalService { .visitedXObjIds(visitedXObjIds) .build(); - removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, dto); + removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context); - dto.getVisitedXObjIds().clear(); + context.visitedXObjIds().clear(); - removeOverlappedElements(page, writer, dto); + removeOverlappedElements(page, writer, context); } return pdfDoc.save(SDFDoc.SaveMode.REMOVE_UNUSED, null); } - private void removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(Page page, ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException { + private void removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(Page page, + ElementWriter writer, + InvisibleElementRemovalContext context) throws PDFNetException { - dto.getReader().begin(page); + context.reader().begin(page); writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict()); - processElements(writer, dto); + processElements(writer, context); writer.end(); - dto.getReader().end(); + context.reader().end(); } - private void processElements(ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException { + private void processElements(ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { - for (Element element = dto.getReader().next(); element != null; element = dto.getReader().next()) + for (Element element = context.reader().next(); element != null; element = context.reader().next()) switch (element.getType()) { - case Element.e_image, Element.e_inline_image -> processImages(element, writer, dto); - case Element.e_text -> processText(element, writer, dto); - case Element.e_path -> processPath(element, writer, dto); - case Element.e_form -> processForm(element, writer, dto); + case Element.e_image, Element.e_inline_image -> processImages(element, writer, context); + case Element.e_text -> processText(element, writer, context); + case Element.e_path -> processPath(element, writer, context); + case Element.e_form -> processForm(element, writer, context); case Element.e_group_begin -> { - dto.getClippingPathStack().enterNewGState(); + context.clippingPathStack().enterNewGState(); writer.writeElement(element); } case Element.e_group_end -> { - dto.getClippingPathStack().leaveGState(); + context.clippingPathStack().leaveGState(); writer.writeElement(element); } default -> writer.writeElement(element); @@ -125,7 +127,7 @@ public class InvisibleElementRemovalService { } - private void processImages(Element imageElement, ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException { + private void processImages(Element imageElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { Rect rect = imageElement.getBBox(); @@ -133,19 +135,19 @@ public class InvisibleElementRemovalService { return; } - boolean inClippingPath = dto.getClippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight()); + boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight()); - if (!dto.isDelta() && inClippingPath) { - dto.getVisibleElements().add(ElementFeatures.extractFeatures(imageElement)); + if (!context.delta() && inClippingPath) { + context.visibleElements().add(ElementFeatures.extractFeatures(imageElement)); } - if (dto.isDelta() ^ inClippingPath) { + if (context.delta() ^ inClippingPath) { writer.writeElement(imageElement); } } - private void processText(Element textElement, ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException { + private void processText(Element textElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { Rect rect = textElement.getBBox(); @@ -156,14 +158,14 @@ public class InvisibleElementRemovalService { GState gState = textElement.getGState(); - boolean inClippingPath = dto.getClippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight()); + boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight()); boolean isTextVisible = isTextRenderedVisibly(gState); if (inClippingPath && isTextVisible) { - dto.getVisibleElements().add(ElementFeatures.extractFeatures(textElement)); + context.visibleElements().add(ElementFeatures.extractFeatures(textElement)); } - if (!dto.isDelta()) { + if (!context.delta()) { if (inClippingPath && isTextVisible) { writer.writeElement(textElement); } else if (textElement.hasTextMatrix()) { @@ -178,11 +180,13 @@ public class InvisibleElementRemovalService { } else { if (!inClippingPath) { gState.setFillColorSpace(ColorSpace.createDeviceRGB()); + // red for elements removed by clipping path gState.setFillColor(new ColorPt(1, 0, 0)); writer.writeElement(textElement); } if (!isTextVisible) { gState.setFillColorSpace(ColorSpace.createDeviceRGB()); + // blue for elements removed due to transparency or not rendered gState.setFillColor(new ColorPt(0, 0, 1)); gState.setTextRenderMode(GState.e_fill_text); gState.setFillOpacity(1); @@ -192,30 +196,30 @@ public class InvisibleElementRemovalService { } - private void processForm(Element formElement, ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException { + private void processForm(Element formElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { writer.writeElement(formElement); Obj formObj = formElement.getXObject(); - if (!dto.getVisitedXObjIds().contains(formObj.getObjNum())) { - dto.getVisitedXObjIds().add(formObj.getObjNum()); + if (!context.visitedXObjIds().contains(formObj.getObjNum())) { + context.visitedXObjIds().add(formObj.getObjNum()); // writer needs to be newly initialized when entering a new content stream // see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest) ElementWriter formWriter = new ElementWriter(); - dto.getReader().formBegin(); + context.reader().formBegin(); formWriter.begin(formObj); - dto.getReader().clearChangeList(); - formWriter.setDefaultGState(dto.getReader()); + context.reader().clearChangeList(); + formWriter.setDefaultGState(context.reader()); - processElements(formWriter, dto); + processElements(formWriter, context); formWriter.end(); - dto.getReader().end(); + context.reader().end(); } } - private void processPath(Element pathElement, ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException { + private void processPath(Element pathElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { GeneralPath linePath = convertToGeneralPath(pathElement.getPathData()); @@ -226,7 +230,7 @@ public class InvisibleElementRemovalService { var rect = linePath.getBounds2D(); - boolean inClippingPath = dto.getClippingPathStack().almostIntersects(rect.getX(), rect.getY(), rect.getWidth(), rect.getHeight()); + boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX(), rect.getY(), rect.getWidth(), rect.getHeight()); if (pathElement.isClippingPath()) { if (pathElement.isClipWindingFill()) { @@ -235,27 +239,27 @@ public class InvisibleElementRemovalService { linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD); } - dto.getClippingPathStack().intersectClippingPath(linePath); - pathElement.setPathClip(!dto.isDelta()); + context.clippingPathStack().intersectClippingPath(linePath); + pathElement.setPathClip(!context.delta()); writer.writeElement(pathElement); } else { if (inClippingPath) { // TODO: WINDING RULE if (isFilledAndNonTransparent(pathElement)) { - List currentOverlappedElements = dto.getVisibleElements() + List currentOverlappedElements = context.visibleElements() .stream() .filter(features -> almostContains(linePath, features.getBoundingBox())) .toList(); - dto.getOverlappedElements().addAll(currentOverlappedElements); - dto.getVisibleElements().removeAll(currentOverlappedElements); + context.overlappedElements().addAll(currentOverlappedElements); + context.visibleElements().removeAll(currentOverlappedElements); } - dto.getVisibleElements().add(ElementFeatures.extractFeatures(pathElement)); - if (!dto.isDelta()) { + context.visibleElements().add(ElementFeatures.extractFeatures(pathElement)); + if (!context.delta()) { writer.writeElement(pathElement); } } - if (dto.isDelta() && !inClippingPath) { + if (context.delta() && !inClippingPath) { pathElement.getGState().setFillColorSpace(ColorSpace.createDeviceRGB()); pathElement.getGState().setFillColor(new ColorPt(1, 0, 0)); pathElement.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB()); @@ -272,34 +276,35 @@ public class InvisibleElementRemovalService { } - private void removeOverlappedElements(Page page, ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException { + private void removeOverlappedElements(Page page, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { - dto.getReader().begin(page); + context.reader().begin(page); writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict()); - if (dto.isDelta()) { - dto.getOverlappedElements().forEach(feature -> drawBBox(writer, feature.getBoundingBox(), "#00FF00")); - dto.getOverlappedElements().clear(); + if (context.delta()) { + // green for element removed due to overlapping + context.overlappedElements().forEach(feature -> drawBBox(writer, feature.getBoundingBox(), "#00FF00")); + context.overlappedElements().clear(); } - processOverlappedElements(writer, dto); + processOverlappedElements(writer, context); writer.end(); - dto.getReader().end(); + context.reader().end(); - if (dto.getOverlappedElements().size() > 0) { - log.warn(dto.getOverlappedElements().size() + " overlapped elements have not been found or removed"); + if (context.overlappedElements().size() > 0) { + log.warn(context.overlappedElements().size() + " overlapped elements have not been found or removed"); } } - private void processOverlappedElements(ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException { + private void processOverlappedElements(ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException { - for (Element element = dto.getReader().next(); element != null; element = dto.getReader().next()) { + for (Element element = context.reader().next(); element != null; element = context.reader().next()) { switch (element.getType()) { - case Element.e_form -> processFormOverlappedElements(writer, element, dto); + case Element.e_form -> processFormOverlappedElements(writer, element, context); case Element.e_path, Element.e_image, Element.e_inline_image, Element.e_text -> { boolean anyMatch = false; - for (ElementFeatures elementToRemove : dto.getOverlappedElements()) { + for (ElementFeatures elementToRemove : context.overlappedElements()) { if (elementToRemove.almostMatches(element)) { - dto.getOverlappedElements().remove(elementToRemove); + context.overlappedElements().remove(elementToRemove); anyMatch = true; break; } @@ -322,25 +327,25 @@ public class InvisibleElementRemovalService { } - private void processFormOverlappedElements(ElementWriter writer, Element formElement, InvisibleElementRemovalDto dto) throws PDFNetException { + private void processFormOverlappedElements(ElementWriter writer, Element formElement, InvisibleElementRemovalContext context) throws PDFNetException { writer.writeElement(formElement); Obj formObj = formElement.getXObject(); - if (!dto.getVisitedXObjIds().contains(formObj.getObjNum())) { - dto.getVisitedXObjIds().add(formObj.getObjNum()); + if (!context.visitedXObjIds().contains(formObj.getObjNum())) { + context.visitedXObjIds().add(formObj.getObjNum()); // writer needs to be newly initialized when entering a new content stream // see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest) ElementWriter formWriter = new ElementWriter(); - dto.getReader().formBegin(); + context.reader().formBegin(); formWriter.begin(formObj); - dto.getReader().clearChangeList(); - formWriter.setDefaultGState(dto.getReader()); + context.reader().clearChangeList(); + formWriter.setDefaultGState(context.reader()); - processOverlappedElements(formWriter, dto); + processOverlappedElements(formWriter, context); formWriter.end(); - dto.getReader().end(); + context.reader().end(); } } @@ -416,4 +421,15 @@ public class InvisibleElementRemovalService { writer.writePlacedElement(rect); } + + @Builder + private record InvisibleElementRemovalContext(boolean delta, // + ElementReader reader, // + ClippingPathStack clippingPathStack, // + List overlappedElements, // + List visibleElements, // + Set visitedXObjIds) { + + } + } \ No newline at end of file