RED-6019: Remove hidden text when processing OCR
* Moved InvisibleElementRemovalDto into InvisibleElementRemovalService as the private nested record InvisibleElementRemovalContext. * Added comments explaining the color choices.
This commit is contained in:
parent
12fbdbee50
commit
a96260f77f
@ -1,25 +0,0 @@
|
||||
package com.iqser.red.service.ocr.v1.server.model;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.pdftron.pdf.ElementReader;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
/**
 * Holder for the state that InvisibleElementRemovalService builds once per page (via the Lombok
 * builder) and passes to each of its element-processing methods.
 *
 * All fields are made private and final by {@code @FieldDefaults}; the collection fields are still
 * mutated in place by the service (elements are added to, removed from and cleared on them).
 *
 * This class is the version deleted by this commit; the service now declares an equivalent
 * private record named InvisibleElementRemovalContext.
 */
@Data
|
||||
@Builder
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
|
||||
public class InvisibleElementRemovalDto {
|
||||
|
||||
// false: visible elements are written back; true: the service writes the removed elements instead, recolored
boolean delta;
|
||||
// reader the service uses to iterate page and form content
ElementReader reader;
|
||||
// clipping state; the service updates it on group begin/end and on clipping paths
ClippingPathStack clippingPathStack;
|
||||
// features of elements found to be covered by a later path; matched and dropped in the second traversal
List<ElementFeatures> overlappedElements;
|
||||
// features of elements recorded as visible so far
List<ElementFeatures> visibleElements;
|
||||
// object numbers of the page and of form XObjects already traversed
Set<Long> visitedXObjIds;
|
||||
|
||||
}
|
||||
@ -17,7 +17,6 @@ import com.google.common.primitives.Bytes;
|
||||
import com.google.common.primitives.Doubles;
|
||||
import com.iqser.red.service.ocr.v1.server.model.ClippingPathStack;
|
||||
import com.iqser.red.service.ocr.v1.server.model.ElementFeatures;
|
||||
import com.iqser.red.service.ocr.v1.server.model.InvisibleElementRemovalDto;
|
||||
import com.pdftron.common.Matrix2D;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.ColorPt;
|
||||
@ -35,6 +34,7 @@ import com.pdftron.pdf.Rect;
|
||||
import com.pdftron.sdf.Obj;
|
||||
import com.pdftron.sdf.SDFDoc;
|
||||
|
||||
import lombok.Builder;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@ -75,7 +75,7 @@ public class InvisibleElementRemovalService {
|
||||
Page page = iterator.next();
|
||||
|
||||
visitedXObjIds.add(page.getSDFObj().getObjNum());
|
||||
InvisibleElementRemovalDto dto = InvisibleElementRemovalDto.builder()
|
||||
InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder()
|
||||
.reader(reader)
|
||||
.clippingPathStack(new ClippingPathStack(page.getMediaBox()))
|
||||
.delta(delta)
|
||||
@ -84,40 +84,42 @@ public class InvisibleElementRemovalService {
|
||||
.visitedXObjIds(visitedXObjIds)
|
||||
.build();
|
||||
|
||||
removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, dto);
|
||||
removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context);
|
||||
|
||||
dto.getVisitedXObjIds().clear();
|
||||
context.visitedXObjIds().clear();
|
||||
|
||||
removeOverlappedElements(page, writer, dto);
|
||||
removeOverlappedElements(page, writer, context);
|
||||
}
|
||||
return pdfDoc.save(SDFDoc.SaveMode.REMOVE_UNUSED, null);
|
||||
}
|
||||
|
||||
|
||||
/**
 * First traversal of a page: starts the reader and the writer on {@code page}, hands every element
 * to {@code processElements}, then ends the writer and the reader.
 *
 * The writer is begun with {@code ElementWriter.e_replacement} and the page's resource dictionary.
 *
 * Diff rendering note: each statement using {@code dto} is the removed version of the
 * {@code context} statement that follows it.
 *
 * @param page    page whose content is read and rewritten
 * @param writer  writer that receives the elements to keep
 * @param context per-page state (reader, clipping stack, element lists) shared with processElements
 * @throws PDFNetException if a PDFTron reader or writer call fails
 */
private void removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(Page page, ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException {
|
||||
private void removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(Page page,
|
||||
ElementWriter writer,
|
||||
InvisibleElementRemovalContext context) throws PDFNetException {
|
||||
|
||||
dto.getReader().begin(page);
|
||||
context.reader().begin(page);
|
||||
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
|
||||
processElements(writer, dto);
|
||||
processElements(writer, context);
|
||||
writer.end();
|
||||
dto.getReader().end();
|
||||
context.reader().end();
|
||||
}
|
||||
|
||||
|
||||
private void processElements(ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException {
|
||||
private void processElements(ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
||||
|
||||
for (Element element = dto.getReader().next(); element != null; element = dto.getReader().next())
|
||||
for (Element element = context.reader().next(); element != null; element = context.reader().next())
|
||||
switch (element.getType()) {
|
||||
case Element.e_image, Element.e_inline_image -> processImages(element, writer, dto);
|
||||
case Element.e_text -> processText(element, writer, dto);
|
||||
case Element.e_path -> processPath(element, writer, dto);
|
||||
case Element.e_form -> processForm(element, writer, dto);
|
||||
case Element.e_image, Element.e_inline_image -> processImages(element, writer, context);
|
||||
case Element.e_text -> processText(element, writer, context);
|
||||
case Element.e_path -> processPath(element, writer, context);
|
||||
case Element.e_form -> processForm(element, writer, context);
|
||||
case Element.e_group_begin -> {
|
||||
dto.getClippingPathStack().enterNewGState();
|
||||
context.clippingPathStack().enterNewGState();
|
||||
writer.writeElement(element);
|
||||
}
|
||||
case Element.e_group_end -> {
|
||||
dto.getClippingPathStack().leaveGState();
|
||||
context.clippingPathStack().leaveGState();
|
||||
writer.writeElement(element);
|
||||
}
|
||||
default -> writer.writeElement(element);
|
||||
@ -125,7 +127,7 @@ public class InvisibleElementRemovalService {
|
||||
}
|
||||
|
||||
|
||||
private void processImages(Element imageElement, ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException {
|
||||
private void processImages(Element imageElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
||||
|
||||
Rect rect = imageElement.getBBox();
|
||||
|
||||
@ -133,19 +135,19 @@ public class InvisibleElementRemovalService {
|
||||
return;
|
||||
}
|
||||
|
||||
boolean inClippingPath = dto.getClippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
|
||||
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
|
||||
|
||||
if (!dto.isDelta() && inClippingPath) {
|
||||
dto.getVisibleElements().add(ElementFeatures.extractFeatures(imageElement));
|
||||
if (!context.delta() && inClippingPath) {
|
||||
context.visibleElements().add(ElementFeatures.extractFeatures(imageElement));
|
||||
}
|
||||
|
||||
if (dto.isDelta() ^ inClippingPath) {
|
||||
if (context.delta() ^ inClippingPath) {
|
||||
writer.writeElement(imageElement);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void processText(Element textElement, ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException {
|
||||
private void processText(Element textElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
||||
|
||||
Rect rect = textElement.getBBox();
|
||||
|
||||
@ -156,14 +158,14 @@ public class InvisibleElementRemovalService {
|
||||
|
||||
GState gState = textElement.getGState();
|
||||
|
||||
boolean inClippingPath = dto.getClippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
|
||||
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
|
||||
|
||||
boolean isTextVisible = isTextRenderedVisibly(gState);
|
||||
|
||||
if (inClippingPath && isTextVisible) {
|
||||
dto.getVisibleElements().add(ElementFeatures.extractFeatures(textElement));
|
||||
context.visibleElements().add(ElementFeatures.extractFeatures(textElement));
|
||||
}
|
||||
if (!dto.isDelta()) {
|
||||
if (!context.delta()) {
|
||||
if (inClippingPath && isTextVisible) {
|
||||
writer.writeElement(textElement);
|
||||
} else if (textElement.hasTextMatrix()) {
|
||||
@ -178,11 +180,13 @@ public class InvisibleElementRemovalService {
|
||||
} else {
|
||||
if (!inClippingPath) {
|
||||
gState.setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||
// red for elements removed by clipping path
|
||||
gState.setFillColor(new ColorPt(1, 0, 0));
|
||||
writer.writeElement(textElement);
|
||||
}
|
||||
if (!isTextVisible) {
|
||||
gState.setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||
// blue for elements removed due to transparency or not rendered
|
||||
gState.setFillColor(new ColorPt(0, 0, 1));
|
||||
gState.setTextRenderMode(GState.e_fill_text);
|
||||
gState.setFillOpacity(1);
|
||||
@ -192,30 +196,30 @@ public class InvisibleElementRemovalService {
|
||||
}
|
||||
|
||||
|
||||
/**
 * Handles a form XObject element during the first traversal.
 *
 * The form element itself is always written to {@code writer}. The form's own content stream is
 * only traversed (with {@code processElements} and a fresh writer) the first time its object
 * number is seen; the number is then recorded in the visited set so a form referenced again is
 * not processed twice.
 *
 * Diff rendering note: each statement using {@code dto} is the removed version of the
 * {@code context} statement that follows it.
 *
 * @param formElement the form element returned by the reader
 * @param writer      writer of the enclosing content stream
 * @param context     per-page state; supplies the reader and the visited-object set
 * @throws PDFNetException if a PDFTron reader or writer call fails
 */
private void processForm(Element formElement, ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException {
|
||||
private void processForm(Element formElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
||||
|
||||
writer.writeElement(formElement);
|
||||
Obj formObj = formElement.getXObject();
|
||||
|
||||
if (!dto.getVisitedXObjIds().contains(formObj.getObjNum())) {
|
||||
dto.getVisitedXObjIds().add(formObj.getObjNum());
|
||||
if (!context.visitedXObjIds().contains(formObj.getObjNum())) {
|
||||
context.visitedXObjIds().add(formObj.getObjNum());
|
||||
// writer needs to be newly initialized when entering a new content stream
|
||||
// see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
|
||||
ElementWriter formWriter = new ElementWriter();
|
||||
dto.getReader().formBegin();
|
||||
context.reader().formBegin();
|
||||
formWriter.begin(formObj);
|
||||
|
||||
dto.getReader().clearChangeList();
|
||||
formWriter.setDefaultGState(dto.getReader());
|
||||
context.reader().clearChangeList();
|
||||
formWriter.setDefaultGState(context.reader());
|
||||
|
||||
// recurse: the shared reader now yields the form's elements until it is exhausted
processElements(formWriter, dto);
|
||||
processElements(formWriter, context);
|
||||
formWriter.end();
|
||||
// closes the form scope opened by formBegin() above
dto.getReader().end();
|
||||
context.reader().end();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void processPath(Element pathElement, ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException {
|
||||
private void processPath(Element pathElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
||||
|
||||
GeneralPath linePath = convertToGeneralPath(pathElement.getPathData());
|
||||
|
||||
@ -226,7 +230,7 @@ public class InvisibleElementRemovalService {
|
||||
|
||||
var rect = linePath.getBounds2D();
|
||||
|
||||
boolean inClippingPath = dto.getClippingPathStack().almostIntersects(rect.getX(), rect.getY(), rect.getWidth(), rect.getHeight());
|
||||
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX(), rect.getY(), rect.getWidth(), rect.getHeight());
|
||||
|
||||
if (pathElement.isClippingPath()) {
|
||||
if (pathElement.isClipWindingFill()) {
|
||||
@ -235,27 +239,27 @@ public class InvisibleElementRemovalService {
|
||||
linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD);
|
||||
}
|
||||
|
||||
dto.getClippingPathStack().intersectClippingPath(linePath);
|
||||
pathElement.setPathClip(!dto.isDelta());
|
||||
context.clippingPathStack().intersectClippingPath(linePath);
|
||||
pathElement.setPathClip(!context.delta());
|
||||
writer.writeElement(pathElement);
|
||||
|
||||
} else {
|
||||
if (inClippingPath) {
|
||||
// TODO: WINDING RULE
|
||||
if (isFilledAndNonTransparent(pathElement)) {
|
||||
List<ElementFeatures> currentOverlappedElements = dto.getVisibleElements()
|
||||
List<ElementFeatures> currentOverlappedElements = context.visibleElements()
|
||||
.stream()
|
||||
.filter(features -> almostContains(linePath, features.getBoundingBox()))
|
||||
.toList();
|
||||
dto.getOverlappedElements().addAll(currentOverlappedElements);
|
||||
dto.getVisibleElements().removeAll(currentOverlappedElements);
|
||||
context.overlappedElements().addAll(currentOverlappedElements);
|
||||
context.visibleElements().removeAll(currentOverlappedElements);
|
||||
}
|
||||
dto.getVisibleElements().add(ElementFeatures.extractFeatures(pathElement));
|
||||
if (!dto.isDelta()) {
|
||||
context.visibleElements().add(ElementFeatures.extractFeatures(pathElement));
|
||||
if (!context.delta()) {
|
||||
writer.writeElement(pathElement);
|
||||
}
|
||||
}
|
||||
if (dto.isDelta() && !inClippingPath) {
|
||||
if (context.delta() && !inClippingPath) {
|
||||
pathElement.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||
pathElement.getGState().setFillColor(new ColorPt(1, 0, 0));
|
||||
pathElement.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
||||
@ -272,34 +276,35 @@ public class InvisibleElementRemovalService {
|
||||
}
|
||||
|
||||
|
||||
/**
 * Second traversal of a page: re-reads the page and passes its elements to
 * {@code processOverlappedElements}, which matches them against the remembered overlapped elements.
 *
 * In delta mode, {@code drawBBox} is called with "#00FF00" for the bounding box of every
 * remembered overlapped element, and the list is cleared before the traversal starts, so the
 * traversal has nothing left to match.
 *
 * After the traversal, any entries still in the list are reported with a warning: they were
 * remembered in the first traversal but never matched in this one.
 *
 * Diff rendering note: each statement using {@code dto} is the removed version of the
 * {@code context} statement that follows it.
 *
 * @param page    page whose content is read and rewritten
 * @param writer  writer that receives the elements to keep
 * @param context per-page state; supplies the reader, the delta flag and the overlapped-element list
 * @throws PDFNetException if a PDFTron reader or writer call fails
 */
private void removeOverlappedElements(Page page, ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException {
|
||||
private void removeOverlappedElements(Page page, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
||||
|
||||
dto.getReader().begin(page);
|
||||
context.reader().begin(page);
|
||||
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
|
||||
if (dto.isDelta()) {
|
||||
dto.getOverlappedElements().forEach(feature -> drawBBox(writer, feature.getBoundingBox(), "#00FF00"));
|
||||
dto.getOverlappedElements().clear();
|
||||
if (context.delta()) {
|
||||
// green for element removed due to overlapping
|
||||
context.overlappedElements().forEach(feature -> drawBBox(writer, feature.getBoundingBox(), "#00FF00"));
|
||||
context.overlappedElements().clear();
|
||||
}
|
||||
processOverlappedElements(writer, dto);
|
||||
processOverlappedElements(writer, context);
|
||||
writer.end();
|
||||
dto.getReader().end();
|
||||
context.reader().end();
|
||||
|
||||
// leftovers are elements remembered as overlapped that no element of the second traversal matched
if (dto.getOverlappedElements().size() > 0) {
|
||||
log.warn(dto.getOverlappedElements().size() + " overlapped elements have not been found or removed");
|
||||
if (context.overlappedElements().size() > 0) {
|
||||
log.warn(context.overlappedElements().size() + " overlapped elements have not been found or removed");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void processOverlappedElements(ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException {
|
||||
private void processOverlappedElements(ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
|
||||
|
||||
for (Element element = dto.getReader().next(); element != null; element = dto.getReader().next()) {
|
||||
for (Element element = context.reader().next(); element != null; element = context.reader().next()) {
|
||||
switch (element.getType()) {
|
||||
case Element.e_form -> processFormOverlappedElements(writer, element, dto);
|
||||
case Element.e_form -> processFormOverlappedElements(writer, element, context);
|
||||
case Element.e_path, Element.e_image, Element.e_inline_image, Element.e_text -> {
|
||||
boolean anyMatch = false;
|
||||
for (ElementFeatures elementToRemove : dto.getOverlappedElements()) {
|
||||
for (ElementFeatures elementToRemove : context.overlappedElements()) {
|
||||
if (elementToRemove.almostMatches(element)) {
|
||||
dto.getOverlappedElements().remove(elementToRemove);
|
||||
context.overlappedElements().remove(elementToRemove);
|
||||
anyMatch = true;
|
||||
break;
|
||||
}
|
||||
@ -322,25 +327,25 @@ public class InvisibleElementRemovalService {
|
||||
}
|
||||
|
||||
|
||||
/**
 * Handles a form XObject element during the second (overlap-removal) traversal.
 *
 * Mirrors {@code processForm}: the form element is always written to {@code writer}, and the
 * form's content stream is traversed with {@code processOverlappedElements} and a fresh writer
 * only the first time its object number is seen in this traversal.
 *
 * Diff rendering note: each statement using {@code dto} is the removed version of the
 * {@code context} statement that follows it.
 *
 * @param writer      writer of the enclosing content stream
 * @param formElement the form element returned by the reader
 * @param context     per-page state; supplies the reader and the visited-object set
 * @throws PDFNetException if a PDFTron reader or writer call fails
 */
private void processFormOverlappedElements(ElementWriter writer, Element formElement, InvisibleElementRemovalDto dto) throws PDFNetException {
|
||||
private void processFormOverlappedElements(ElementWriter writer, Element formElement, InvisibleElementRemovalContext context) throws PDFNetException {
|
||||
|
||||
writer.writeElement(formElement);
|
||||
Obj formObj = formElement.getXObject();
|
||||
|
||||
if (!dto.getVisitedXObjIds().contains(formObj.getObjNum())) {
|
||||
dto.getVisitedXObjIds().add(formObj.getObjNum());
|
||||
if (!context.visitedXObjIds().contains(formObj.getObjNum())) {
|
||||
context.visitedXObjIds().add(formObj.getObjNum());
|
||||
// writer needs to be newly initialized when entering a new content stream
|
||||
// see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
|
||||
ElementWriter formWriter = new ElementWriter();
|
||||
dto.getReader().formBegin();
|
||||
context.reader().formBegin();
|
||||
formWriter.begin(formObj);
|
||||
|
||||
dto.getReader().clearChangeList();
|
||||
formWriter.setDefaultGState(dto.getReader());
|
||||
context.reader().clearChangeList();
|
||||
formWriter.setDefaultGState(context.reader());
|
||||
|
||||
// recurse: the shared reader now yields the form's elements until it is exhausted
processOverlappedElements(formWriter, dto);
|
||||
processOverlappedElements(formWriter, context);
|
||||
formWriter.end();
|
||||
// closes the form scope opened by formBegin() above
dto.getReader().end();
|
||||
context.reader().end();
|
||||
}
|
||||
}
|
||||
|
||||
@ -416,4 +421,15 @@ public class InvisibleElementRemovalService {
|
||||
writer.writePlacedElement(rect);
|
||||
}
|
||||
|
||||
|
||||
/**
 * Per-page state shared by the element-processing methods of this service. Built once per page
 * with the Lombok builder and passed down through both traversals.
 *
 * The record components are fixed references, but the collections and the clipping stack they
 * point to are mutated in place while a page is processed.
 *
 * @param delta              false: elements judged visible are written back; true: the removed
 *                           elements are written instead, recolored (red, blue) or outlined (green)
 * @param reader             reader used to iterate the page content and, via formBegin(), form content
 * @param clippingPathStack  clipping state; entered and left on group begin/end, intersected with
 *                           each clipping path, and queried with almostIntersects for every element
 * @param overlappedElements features of visible elements that a later path, for which
 *                           isFilledAndNonTransparent is true, almost contains; matched and removed
 *                           in the second traversal
 * @param visibleElements    features of the elements recorded as visible so far
 * @param visitedXObjIds     object numbers of the page and of form XObjects already traversed;
 *                           cleared between the first and the second traversal
 */
@Builder
|
||||
private record InvisibleElementRemovalContext(boolean delta, //
|
||||
ElementReader reader, //
|
||||
ClippingPathStack clippingPathStack, //
|
||||
List<ElementFeatures> overlappedElements, //
|
||||
List<ElementFeatures> visibleElements, //
|
||||
Set<Long> visitedXObjIds) {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user