RED-6019: Remove hidden text when processing OCR

* Moved InvisibleElementRemovalDto into InvisibleElementRemovalService as the private nested record InvisibleElementRemovalContext
* Added comments explaining the color choices
This commit is contained in:
Kilian Schuettler 2023-02-02 13:01:58 +01:00
parent 12fbdbee50
commit a96260f77f
2 changed files with 82 additions and 91 deletions

View File

@ -1,25 +0,0 @@
package com.iqser.red.service.ocr.v1.server.model;
import java.util.List;
import java.util.Set;
import com.pdftron.pdf.ElementReader;
import lombok.AccessLevel;
import lombok.Builder;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * Bundle of working state handed to the element-processing methods of
 * InvisibleElementRemovalService, which read it through the Lombok-generated
 * accessors (e.g. {@code getReader()}, {@code isDelta()}, {@code getVisibleElements()}).
 * <p>
 * {@code @FieldDefaults} makes every field {@code private final}; the collections
 * themselves are still modified in place by the service.
 */
@Data
@Builder
@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
public class InvisibleElementRemovalDto {
// when true, the service writes the elements it would otherwise drop, marked in color, instead of the cleaned content
boolean delta;
// reader positioned on the page or form content stream currently being processed
ElementReader reader;
// clipping state; the service initializes it from the page's media box and intersects it with each clipping path
ClippingPathStack clippingPathStack;
// features of elements found to be covered by a filled, non-transparent path; consumed by the second pass
List<ElementFeatures> overlappedElements;
// features of elements considered visible so far in the first pass
List<ElementFeatures> visibleElements;
// object numbers of the page and of form XObjects already entered, so each content stream is processed once
Set<Long> visitedXObjIds;
}

View File

@ -17,7 +17,6 @@ import com.google.common.primitives.Bytes;
import com.google.common.primitives.Doubles;
import com.iqser.red.service.ocr.v1.server.model.ClippingPathStack;
import com.iqser.red.service.ocr.v1.server.model.ElementFeatures;
import com.iqser.red.service.ocr.v1.server.model.InvisibleElementRemovalDto;
import com.pdftron.common.Matrix2D;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.ColorPt;
@ -35,6 +34,7 @@ import com.pdftron.pdf.Rect;
import com.pdftron.sdf.Obj;
import com.pdftron.sdf.SDFDoc;
import lombok.Builder;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@ -75,7 +75,7 @@ public class InvisibleElementRemovalService {
Page page = iterator.next();
visitedXObjIds.add(page.getSDFObj().getObjNum());
InvisibleElementRemovalDto dto = InvisibleElementRemovalDto.builder()
InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder()
.reader(reader)
.clippingPathStack(new ClippingPathStack(page.getMediaBox()))
.delta(delta)
@ -84,40 +84,42 @@ public class InvisibleElementRemovalService {
.visitedXObjIds(visitedXObjIds)
.build();
removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, dto);
removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context);
dto.getVisitedXObjIds().clear();
context.visitedXObjIds().clear();
removeOverlappedElements(page, writer, dto);
removeOverlappedElements(page, writer, context);
}
return pdfDoc.save(SDFDoc.SaveMode.REMOVE_UNUSED, null);
}
/**
 * First pass over a page: opens the reader on the page and the writer in
 * replacement mode, lets {@code processElements} decide element by element what
 * is written back, then closes writer and reader.
 *
 * @param page    the page whose content is read and rewritten
 * @param writer  the writer that receives the elements that are kept
 * @param context shared per-page state (reader, clipping stack, element lists)
 * @throws PDFNetException if a PDFTron call fails
 */
private void removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(Page page, ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException {
private void removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(Page page,
ElementWriter writer,
InvisibleElementRemovalContext context) throws PDFNetException {
dto.getReader().begin(page);
context.reader().begin(page);
// e_replacement: presumably the written elements replace the existing page content - confirm against the PDFTron ElementWriter docs
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
processElements(writer, dto);
processElements(writer, context);
writer.end();
dto.getReader().end();
context.reader().end();
}
private void processElements(ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException {
private void processElements(ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
for (Element element = dto.getReader().next(); element != null; element = dto.getReader().next())
for (Element element = context.reader().next(); element != null; element = context.reader().next())
switch (element.getType()) {
case Element.e_image, Element.e_inline_image -> processImages(element, writer, dto);
case Element.e_text -> processText(element, writer, dto);
case Element.e_path -> processPath(element, writer, dto);
case Element.e_form -> processForm(element, writer, dto);
case Element.e_image, Element.e_inline_image -> processImages(element, writer, context);
case Element.e_text -> processText(element, writer, context);
case Element.e_path -> processPath(element, writer, context);
case Element.e_form -> processForm(element, writer, context);
case Element.e_group_begin -> {
dto.getClippingPathStack().enterNewGState();
context.clippingPathStack().enterNewGState();
writer.writeElement(element);
}
case Element.e_group_end -> {
dto.getClippingPathStack().leaveGState();
context.clippingPathStack().leaveGState();
writer.writeElement(element);
}
default -> writer.writeElement(element);
@ -125,7 +127,7 @@ public class InvisibleElementRemovalService {
}
private void processImages(Element imageElement, ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException {
private void processImages(Element imageElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
Rect rect = imageElement.getBBox();
@ -133,19 +135,19 @@ public class InvisibleElementRemovalService {
return;
}
boolean inClippingPath = dto.getClippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
if (!dto.isDelta() && inClippingPath) {
dto.getVisibleElements().add(ElementFeatures.extractFeatures(imageElement));
if (!context.delta() && inClippingPath) {
context.visibleElements().add(ElementFeatures.extractFeatures(imageElement));
}
if (dto.isDelta() ^ inClippingPath) {
if (context.delta() ^ inClippingPath) {
writer.writeElement(imageElement);
}
}
private void processText(Element textElement, ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException {
private void processText(Element textElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
Rect rect = textElement.getBBox();
@ -156,14 +158,14 @@ public class InvisibleElementRemovalService {
GState gState = textElement.getGState();
boolean inClippingPath = dto.getClippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
boolean isTextVisible = isTextRenderedVisibly(gState);
if (inClippingPath && isTextVisible) {
dto.getVisibleElements().add(ElementFeatures.extractFeatures(textElement));
context.visibleElements().add(ElementFeatures.extractFeatures(textElement));
}
if (!dto.isDelta()) {
if (!context.delta()) {
if (inClippingPath && isTextVisible) {
writer.writeElement(textElement);
} else if (textElement.hasTextMatrix()) {
@ -178,11 +180,13 @@ public class InvisibleElementRemovalService {
} else {
if (!inClippingPath) {
gState.setFillColorSpace(ColorSpace.createDeviceRGB());
// red for elements removed by clipping path
gState.setFillColor(new ColorPt(1, 0, 0));
writer.writeElement(textElement);
}
if (!isTextVisible) {
gState.setFillColorSpace(ColorSpace.createDeviceRGB());
// blue for elements removed due to transparency or not rendered
gState.setFillColor(new ColorPt(0, 0, 1));
gState.setTextRenderMode(GState.e_fill_text);
gState.setFillOpacity(1);
@ -192,30 +196,30 @@ public class InvisibleElementRemovalService {
}
/**
 * Handles a form element in the first pass. The form element itself is always
 * written to {@code writer}; if the form's XObject has not been visited yet, its
 * object number is recorded and its content stream is processed recursively via
 * {@code processElements} using a separate writer.
 *
 * @param formElement the form element just read
 * @param writer      the writer of the enclosing content stream
 * @param context     shared state; provides the reader and the visited-XObject set
 * @throws PDFNetException if a PDFTron call fails
 */
private void processForm(Element formElement, ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException {
private void processForm(Element formElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
writer.writeElement(formElement);
Obj formObj = formElement.getXObject();
// the visited check keeps a form XObject that is referenced several times from being processed more than once
if (!dto.getVisitedXObjIds().contains(formObj.getObjNum())) {
dto.getVisitedXObjIds().add(formObj.getObjNum());
if (!context.visitedXObjIds().contains(formObj.getObjNum())) {
context.visitedXObjIds().add(formObj.getObjNum());
// writer needs to be newly initialized when entering a new content stream
// see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
ElementWriter formWriter = new ElementWriter();
dto.getReader().formBegin();
context.reader().formBegin();
formWriter.begin(formObj);
dto.getReader().clearChangeList();
formWriter.setDefaultGState(dto.getReader());
context.reader().clearChangeList();
formWriter.setDefaultGState(context.reader());
processElements(formWriter, dto);
processElements(formWriter, context);
formWriter.end();
// closes the reader scope opened by formBegin() above
dto.getReader().end();
context.reader().end();
}
}
private void processPath(Element pathElement, ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException {
private void processPath(Element pathElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
GeneralPath linePath = convertToGeneralPath(pathElement.getPathData());
@ -226,7 +230,7 @@ public class InvisibleElementRemovalService {
var rect = linePath.getBounds2D();
boolean inClippingPath = dto.getClippingPathStack().almostIntersects(rect.getX(), rect.getY(), rect.getWidth(), rect.getHeight());
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX(), rect.getY(), rect.getWidth(), rect.getHeight());
if (pathElement.isClippingPath()) {
if (pathElement.isClipWindingFill()) {
@ -235,27 +239,27 @@ public class InvisibleElementRemovalService {
linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD);
}
dto.getClippingPathStack().intersectClippingPath(linePath);
pathElement.setPathClip(!dto.isDelta());
context.clippingPathStack().intersectClippingPath(linePath);
pathElement.setPathClip(!context.delta());
writer.writeElement(pathElement);
} else {
if (inClippingPath) {
// TODO: WINDING RULE
if (isFilledAndNonTransparent(pathElement)) {
List<ElementFeatures> currentOverlappedElements = dto.getVisibleElements()
List<ElementFeatures> currentOverlappedElements = context.visibleElements()
.stream()
.filter(features -> almostContains(linePath, features.getBoundingBox()))
.toList();
dto.getOverlappedElements().addAll(currentOverlappedElements);
dto.getVisibleElements().removeAll(currentOverlappedElements);
context.overlappedElements().addAll(currentOverlappedElements);
context.visibleElements().removeAll(currentOverlappedElements);
}
dto.getVisibleElements().add(ElementFeatures.extractFeatures(pathElement));
if (!dto.isDelta()) {
context.visibleElements().add(ElementFeatures.extractFeatures(pathElement));
if (!context.delta()) {
writer.writeElement(pathElement);
}
}
if (dto.isDelta() && !inClippingPath) {
if (context.delta() && !inClippingPath) {
pathElement.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
pathElement.getGState().setFillColor(new ColorPt(1, 0, 0));
pathElement.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
@ -272,34 +276,35 @@ public class InvisibleElementRemovalService {
}
/**
 * Second pass over a page: re-reads the page and hands every element to
 * {@code processOverlappedElements}, which matches elements against the
 * overlapped-element list collected in the first pass.
 * <p>
 * In delta mode the overlapped elements are first drawn as bounding boxes and the
 * list is cleared, so the following traversal has nothing left to match.
 * After the pass, a warning is logged if entries remain in the list.
 *
 * @param page    the page to rewrite
 * @param writer  the writer that receives the page content
 * @param context shared state; provides the reader, the delta flag and the overlapped-element list
 * @throws PDFNetException if a PDFTron call fails
 */
private void removeOverlappedElements(Page page, ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException {
private void removeOverlappedElements(Page page, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
dto.getReader().begin(page);
context.reader().begin(page);
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
if (dto.isDelta()) {
dto.getOverlappedElements().forEach(feature -> drawBBox(writer, feature.getBoundingBox(), "#00FF00"));
dto.getOverlappedElements().clear();
if (context.delta()) {
// green for element removed due to overlapping
context.overlappedElements().forEach(feature -> drawBBox(writer, feature.getBoundingBox(), "#00FF00"));
context.overlappedElements().clear();
}
processOverlappedElements(writer, dto);
processOverlappedElements(writer, context);
writer.end();
dto.getReader().end();
context.reader().end();
// entries are removed from the list as they are matched, so leftovers are elements the pass did not find again
if (dto.getOverlappedElements().size() > 0) {
log.warn(dto.getOverlappedElements().size() + " overlapped elements have not been found or removed");
if (context.overlappedElements().size() > 0) {
log.warn(context.overlappedElements().size() + " overlapped elements have not been found or removed");
}
}
private void processOverlappedElements(ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException {
private void processOverlappedElements(ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
for (Element element = dto.getReader().next(); element != null; element = dto.getReader().next()) {
for (Element element = context.reader().next(); element != null; element = context.reader().next()) {
switch (element.getType()) {
case Element.e_form -> processFormOverlappedElements(writer, element, dto);
case Element.e_form -> processFormOverlappedElements(writer, element, context);
case Element.e_path, Element.e_image, Element.e_inline_image, Element.e_text -> {
boolean anyMatch = false;
for (ElementFeatures elementToRemove : dto.getOverlappedElements()) {
for (ElementFeatures elementToRemove : context.overlappedElements()) {
if (elementToRemove.almostMatches(element)) {
dto.getOverlappedElements().remove(elementToRemove);
context.overlappedElements().remove(elementToRemove);
anyMatch = true;
break;
}
@ -322,25 +327,25 @@ public class InvisibleElementRemovalService {
}
/**
 * Handles a form element in the second pass. The form element itself is always
 * written to {@code writer}; if the form's XObject has not been visited yet in
 * this pass, its object number is recorded and its content stream is traversed
 * recursively via {@code processOverlappedElements} using a separate writer.
 *
 * @param writer      the writer of the enclosing content stream
 * @param formElement the form element just read
 * @param context     shared state; provides the reader and the visited-XObject set
 * @throws PDFNetException if a PDFTron call fails
 */
private void processFormOverlappedElements(ElementWriter writer, Element formElement, InvisibleElementRemovalDto dto) throws PDFNetException {
private void processFormOverlappedElements(ElementWriter writer, Element formElement, InvisibleElementRemovalContext context) throws PDFNetException {
writer.writeElement(formElement);
Obj formObj = formElement.getXObject();
// the caller clears the visited set between the two passes, so forms seen in the first pass are entered again here
if (!dto.getVisitedXObjIds().contains(formObj.getObjNum())) {
dto.getVisitedXObjIds().add(formObj.getObjNum());
if (!context.visitedXObjIds().contains(formObj.getObjNum())) {
context.visitedXObjIds().add(formObj.getObjNum());
// writer needs to be newly initialized when entering a new content stream
// see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
ElementWriter formWriter = new ElementWriter();
dto.getReader().formBegin();
context.reader().formBegin();
formWriter.begin(formObj);
dto.getReader().clearChangeList();
formWriter.setDefaultGState(dto.getReader());
context.reader().clearChangeList();
formWriter.setDefaultGState(context.reader());
processOverlappedElements(formWriter, dto);
processOverlappedElements(formWriter, context);
formWriter.end();
// closes the reader scope opened by formBegin() above
dto.getReader().end();
context.reader().end();
}
}
@ -416,4 +421,15 @@ public class InvisibleElementRemovalService {
writer.writePlacedElement(rect);
}
/**
 * Working state shared by the element-processing methods of this service; built
 * through the Lombok builder and read through the record accessors
 * (e.g. {@code context.reader()}, {@code context.delta()}).
 * The record components are final, but the collections are modified in place
 * while the passes run.
 *
 * @param delta              when true, the elements that would otherwise be dropped are written,
 *                           marked in color, instead of the cleaned content
 * @param reader             reader positioned on the page or form content stream being processed
 * @param clippingPathStack  clipping state, created from the page's media box and intersected
 *                           with each clipping path encountered
 * @param overlappedElements features of elements covered by a filled, non-transparent path;
 *                           consumed by the second pass
 * @param visibleElements    features of elements considered visible so far in the first pass
 * @param visitedXObjIds     object numbers of the page and of form XObjects already entered
 */
@Builder
private record InvisibleElementRemovalContext(boolean delta, //
ElementReader reader, //
ClippingPathStack clippingPathStack, //
List<ElementFeatures> overlappedElements, //
List<ElementFeatures> visibleElements, //
Set<Long> visitedXObjIds) {
}
}