RED-6019: Remove hidden text when processing OCR

handled cases:
     Text which is transparent or is set to not render
     Elements outside of clipping path
     Elements that have been painted over by visible and filled Paths
unhandled cases:
     Elements covered by widely stroked path
     Elements same color as background
     Any Text set to clipping with its many interactions with other elements
This commit is contained in:
Kilian Schuettler 2023-01-30 16:13:51 +01:00
parent 265fac8099
commit fd7ec6e7aa
7 changed files with 747 additions and 100 deletions

View File

@ -0,0 +1,54 @@
package com.iqser.red.service.ocr.v1.server.model;
import com.pdftron.pdf.Rect;
import lombok.Data;
import lombok.SneakyThrows;
import java.awt.geom.Area;
import java.awt.geom.GeneralPath;
import java.awt.geom.Rectangle2D;
import java.util.ArrayDeque;
import java.util.Deque;
/**
 * Stack of clipping regions mirroring nested graphics states.
 *
 * <p>The top of the stack is the currently effective clipping region. A new frame is a copy of
 * the current region, so intersections applied inside a nested graphics state are discarded
 * again when that state is left.
 */
@Data
public class ClippingPathStack {

    private Deque<Area> stack = new ArrayDeque<>();

    /**
     * Creates a stack whose base clipping region is the given rectangle.
     *
     * @param rectangle initial clipping region (x1/y1 origin plus width/height are used)
     */
    @SneakyThrows
    public ClippingPathStack(Rect rectangle) {
        stack.push(new Area(new Rectangle2D.Double(rectangle.getX1(), rectangle.getY1(), rectangle.getWidth(), rectangle.getHeight())));
    }

    /**
     * Narrows the current clipping region to its intersection with {@code path}.
     *
     * @param path additional clipping path, already in the same coordinate space as the stack
     */
    @SneakyThrows
    public void intersectClippingPath(GeneralPath path) {
        getCurrentClippingPath().intersect(new Area(path));
    }

    /**
     * Tests whether the given rectangle, widened by a small tolerance on every side, intersects
     * the current clipping region.
     *
     * <p>Height or width are zero for straight lines, even though they are being rendered.
     * Therefore the tested rectangle is always given a strictly positive extent.
     *
     * @return {@code true} if the widened rectangle intersects the current clipping region
     */
    public boolean almostIntersects(double x, double y, double width, double height) {
        double tolerance = 1e-3;
        // Widening must always move the origin down/left, independent of the sign of the
        // coordinate; otherwise rectangles at x <= 0 or y <= 0 are shifted instead of widened.
        return getCurrentClippingPath().intersects(x - tolerance, y - tolerance, width + 2 * tolerance, height + 2 * tolerance);
    }

    /**
     * @return the clipping region on top of the stack
     */
    public Area getCurrentClippingPath() {
        return stack.peek();
    }

    /**
     * Pushes a copy of the current clipping region so that later intersections only affect the
     * new frame.
     */
    public void enterNewGState() {
        stack.push(new Area(stack.peek()));
    }

    /**
     * Drops the top frame and thereby restores the clipping region of the enclosing state.
     *
     * <p>The base frame is never removed: an unbalanced "leave" would otherwise empty the stack
     * and make every following clipping query fail on a {@code null} region.
     */
    public void leaveGState() {
        if (stack.size() > 1) {
            stack.pop();
        }
    }
}

View File

@ -0,0 +1,148 @@
package com.iqser.red.service.ocr.v1.server.model;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.Rect;
import lombok.*;
import lombok.experimental.SuperBuilder;
import java.awt.geom.Rectangle2D;
/**
 * Snapshot of the properties of a content element that are used to recognise the same element
 * again in a later pass over the page.
 *
 * <p>Instances are created by {@link #extractFeatures(Element)} and compared against live
 * elements with {@link #almostMatches(Element)}. Coordinates are compared with a small
 * tolerance; all other properties must be equal.
 */
@Data
@SuperBuilder
@NoArgsConstructor
@AllArgsConstructor
public abstract class ElementFeatures {

    private int elementType;
    private Rectangle2D boundingBox;

    /**
     * Checks whether {@code element} has the recorded type and (almost) the recorded bounding box.
     *
     * @param element element to compare against this snapshot
     * @return {@code false} if the type differs, either bounding box is missing, or the boxes differ
     * @throws PDFNetException if reading a property from {@code element} fails
     */
    public boolean almostMatches(Element element) throws PDFNetException {
        if (element.getType() != elementType) {
            return false;
        }
        // Read the bounding box once and reuse it for the null check and the comparison.
        Rect bBox = element.getBBox();
        if (bBox == null || boundingBox == null) {
            return false;
        }
        return rectsAlmostMatch(bBox);
    }

    /**
     * @return {@code true} if {@code a} and {@code b} differ by less than 1e-3
     */
    protected boolean almostEqual(double a, double b) {
        double tolerance = 1e-3;
        return Math.abs(a - b) < tolerance;
    }

    /** Compares origin, width and height of {@code bBox} with the recorded bounding box. */
    private boolean rectsAlmostMatch(Rect bBox) throws PDFNetException {
        return almostEqual(bBox.getX1(), boundingBox.getX())
                && almostEqual(bBox.getY1(), boundingBox.getY())
                && almostEqual(bBox.getWidth(), boundingBox.getWidth())
                && almostEqual(bBox.getHeight(), boundingBox.getHeight());
    }

    /** Features of a text element: the text string, the font type and the font size. */
    @EqualsAndHashCode(callSuper = true)
    @Data
    @SuperBuilder
    @NoArgsConstructor
    @AllArgsConstructor
    public static class Text extends ElementFeatures {

        private String text;
        private int font;
        private double fontsize;

        @Override
        public boolean almostMatches(Element element) throws PDFNetException {
            if (!super.almostMatches(element)) {
                return false;
            }
            // Null-safe comparison: a snapshot without text only matches an element without text.
            String elementText = element.getTextString();
            if (text == null ? elementText != null : !text.equals(elementText)) {
                return false;
            }
            if (font != element.getGState().getFont().getType()) {
                return false;
            }
            return almostEqual(fontsize, element.getGState().getFontSize());
        }
    }

    /** Features of a path element: its clipping, stroke and fill flags. */
    @EqualsAndHashCode(callSuper = true)
    @Data
    @SuperBuilder
    @NoArgsConstructor
    @AllArgsConstructor
    public static class Path extends ElementFeatures {

        private boolean isClippingPath;
        private boolean isClipWindingFill;
        private boolean isStroked;
        private boolean isFilled;
        private boolean isWindingFill;

        @Override
        public boolean almostMatches(Element element) throws PDFNetException {
            if (!super.almostMatches(element)) {
                return false;
            }
            if (isClippingPath != element.isClippingPath()) {
                return false;
            }
            if (isClipWindingFill != element.isClipWindingFill()) {
                return false;
            }
            if (isStroked != element.isStroked()) {
                return false;
            }
            if (isFilled != element.isFilled()) {
                return false;
            }
            return isWindingFill == element.isWindingFill();
        }
    }

    /** Features of an image or inline image element: data size, dimensions and colour layout. */
    @EqualsAndHashCode(callSuper = true)
    @Data
    @SuperBuilder
    @NoArgsConstructor
    public static class Image extends ElementFeatures {

        private int dataSize;
        private int height;
        private int width;
        private int renderingIntent;
        private int componentNum;
        private int bitsPerComponent;

        @Override
        public boolean almostMatches(Element element) throws PDFNetException {
            if (!super.almostMatches(element)) {
                return false;
            }
            if (dataSize != element.getImageDataSize()) {
                return false;
            }
            if (height != element.getImageHeight()) {
                return false;
            }
            if (width != element.getImageWidth()) {
                return false;
            }
            if (renderingIntent != element.getImageRenderingIntent()) {
                return false;
            }
            if (componentNum != element.getComponentNum()) {
                return false;
            }
            return bitsPerComponent == element.getBitsPerComponent();
        }
    }

    /**
     * Builds the feature snapshot for a path, text, image or inline image element.
     *
     * @param element element to describe
     * @return the snapshot matching the element's type
     * @throws PDFNetException if reading a property from {@code element} fails
     * @throws UnsupportedOperationException for any other element type
     */
    public static ElementFeatures extractFeatures(Element element) throws PDFNetException {
        switch (element.getType()) {
            case Element.e_path:
                return ElementFeatures.Path.builder()
                        .elementType(element.getType())
                        .boundingBox(toRectangle2D(element.getBBox()))
                        .isClippingPath(element.isClippingPath())
                        .isClipWindingFill(element.isClipWindingFill())
                        .isStroked(element.isStroked())
                        .isFilled(element.isFilled())
                        .isWindingFill(element.isWindingFill())
                        .build();
            case Element.e_text:
                return ElementFeatures.Text.builder()
                        .elementType(element.getType())
                        .boundingBox(toRectangle2D(element.getBBox()))
                        .text(element.getTextString())
                        .font(element.getGState().getFont().getType())
                        .fontsize(element.getGState().getFontSize())
                        .build();
            case Element.e_image:
            case Element.e_inline_image:
                return Image.builder()
                        .elementType(element.getType())
                        .boundingBox(toRectangle2D(element.getBBox()))
                        .dataSize(element.getImageDataSize())
                        .height(element.getImageHeight())
                        .width(element.getImageWidth())
                        .renderingIntent(element.getImageRenderingIntent())
                        .componentNum(element.getComponentNum())
                        .bitsPerComponent(element.getBitsPerComponent())
                        .build();
            default:
                throw new UnsupportedOperationException("Feature Extraction is not supported for PDFTron.Element with type: " + element.getType());
        }
    }

    /**
     * Converts a PDFTron rectangle to an AWT rectangle.
     *
     * <p>A missing bounding box yields an empty rectangle instead of a NullPointerException, so
     * that feature extraction does not abort processing for elements without a bounding box.
     */
    private static Rectangle2D toRectangle2D(Rect rect) throws PDFNetException {
        if (rect == null) {
            return new Rectangle2D.Double();
        }
        return new Rectangle2D.Double(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
    }
}

View File

@ -0,0 +1,448 @@
package com.iqser.red.service.ocr.v1.server.service;
import com.iqser.red.service.ocr.v1.server.model.ClippingPathStack;
import com.iqser.red.service.ocr.v1.server.model.ElementFeatures;
import com.pdftron.common.Matrix2D;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.*;
import com.pdftron.sdf.Obj;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import java.awt.*;
import java.awt.geom.AffineTransform;
import java.awt.geom.GeneralPath;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.stream.Collectors;
@Slf4j
@Service
public class InvisibleElementService {
/*
handled cases:
Text which is transparent or is set to not render
Text or Path or Images outside of clipping path
Text or Path or Images that have been painted over by visible and filled Paths
unhandled cases:
Text covered by widely stroked path
Text same color as background
Any Text set to clipping with its many interactions with other elements
*/
/**
 * Rewrites every page of the given PDF in two passes: the first pass handles clipped elements
 * and invisible text and collects elements painted over by later paths, the second pass handles
 * those collected elements.
 *
 * @param pdfFile the PDF document as bytes
 * @param delta   {@code false} to remove the hidden elements; {@code true} to produce the
 *                inverse document in which the hidden elements are kept and highlighted
 * @return the rewritten document as bytes
 */
@SneakyThrows
public byte[] removeInvisibleElements(byte[] pdfFile, boolean delta) {
    PDFDoc pdfDoc = new PDFDoc(pdfFile);
    try {
        ElementWriter writer = new ElementWriter();
        ElementReader reader = new ElementReader();
        try {
            Set<Integer> visited = new TreeSet<>();
            for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
                Page page = iterator.next();
                List<ElementFeatures> overlappedElements = removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, reader, writer, visited, delta);
                // The second pass has to descend into the same form XObjects again.
                visited.clear();
                removeOverlappedElements(page, reader, writer, visited, overlappedElements, delta);
            }
            return pdfDoc.save(SDFDoc.SaveMode.REMOVE_UNUSED, null);
        } finally {
            // Release the reader/writer explicitly instead of waiting for garbage collection.
            reader.destroy();
            writer.destroy();
        }
    } finally {
        pdfDoc.close();
    }
}
/**
 * First pass over a page: rewrites the page content through {@code processElements} and returns
 * the elements that were found to be painted over by a later path.
 *
 * @param page    page whose content is replaced
 * @param reader  reader used to iterate the page content
 * @param writer  writer used to emit the replacement content
 * @param visited object numbers already processed; the page's own object number is added
 * @param delta   forwarded to the element handlers
 * @return features of the overlapped elements, to be handled by the second pass
 */
private List<ElementFeatures> removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(Page page, ElementReader reader, ElementWriter writer, Set<Integer> visited, boolean delta)
        throws PDFNetException {
    List<ElementFeatures> covered = new ArrayList<>();
    List<ElementFeatures> stillVisible = new ArrayList<>();
    // The clipping stack starts with the page's media box as the base region.
    ClippingPathStack clipStack = new ClippingPathStack(page.getMediaBox());

    int pageObjNum = (int) page.getSDFObj().getObjNum();
    visited.add(pageObjNum);

    reader.begin(page);
    writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
    processElements(reader, writer, visited, clipStack, delta, covered, stillVisible);
    writer.end();
    reader.end();

    return covered;
}
/**
 * Reads elements from {@code reader} until it is exhausted and dispatches each one by type.
 *
 * <p>Images, text, paths and forms go to their dedicated handlers. Group begin/end elements
 * push/pop a frame on the clipping stack and are written unchanged. Every other element is
 * written unchanged.
 */
private void processElements(ElementReader reader, ElementWriter writer, Set<Integer> visited, ClippingPathStack clippingPathStack, Boolean delta,
                             List<ElementFeatures> coveredElements, List<ElementFeatures> visibleElements)
        throws PDFNetException {
    Element current = reader.next();
    while (current != null) {
        switch (current.getType()) {
            case Element.e_image:
            case Element.e_inline_image:
                processImages(current, writer, clippingPathStack, delta, visibleElements);
                break;
            case Element.e_text:
                processText(current, writer, clippingPathStack, delta, visibleElements);
                break;
            case Element.e_path:
                processPath(current, writer, clippingPathStack, delta, coveredElements, visibleElements);
                break;
            case Element.e_form:
                processForm(reader, writer, current, visited, clippingPathStack, delta, coveredElements, visibleElements);
                break;
            case Element.e_group_begin:
                clippingPathStack.enterNewGState();
                writer.writeElement(current);
                break;
            case Element.e_group_end:
                clippingPathStack.leaveGState();
                writer.writeElement(current);
                break;
            default:
                writer.writeElement(current);
        }
        current = reader.next();
    }
}
/**
 * Handles an image or inline image element.
 *
 * <p>Normal mode: an image intersecting the clipping region is recorded as visible and written;
 * any other image is dropped. Delta mode: only images outside the clipping region are written.
 *
 * <p>NOTE(review): an image without a bounding box is dropped in both modes, whereas text
 * without a bounding box is written unchanged - confirm this difference is intended.
 */
private void processImages(Element imageElement, ElementWriter writer, ClippingPathStack clippingPathStack, Boolean delta, List<ElementFeatures> visibleElements)
        throws PDFNetException {
    Rect bbox = imageElement.getBBox();
    if (bbox == null) {
        return;
    }

    boolean insideClip = clippingPathStack.almostIntersects(bbox.getX1(), bbox.getY1(), bbox.getWidth(), bbox.getHeight());

    if (delta) {
        if (!insideClip) {
            writer.writeElement(imageElement);
        }
        return;
    }

    if (insideClip) {
        visibleElements.add(ElementFeatures.extractFeatures(imageElement));
        writer.writeElement(imageElement);
    }
}
/**
 * Handles a text element.
 *
 * <p>Text counts as shown when its bounding box intersects the clipping region and its render
 * mode/opacity make it visible. Shown text is always recorded in {@code visibleElements}.
 * <ul>
 *   <li>Normal mode writes shown text and drops hidden text.</li>
 *   <li>Delta mode writes hidden text exactly once, recoloured: blue and forced to opaque fill
 *       if it was rendered invisibly, red if it was only clipped. Shown text is dropped.</li>
 * </ul>
 * Text without a bounding box is written unchanged.
 *
 * @throws PDFNetException if reading from or writing the element fails
 */
private void processText(Element textElement, ElementWriter writer, ClippingPathStack clippingPathStack,
                         Boolean delta, List<ElementFeatures> visibleElements)
        throws PDFNetException {
    Rect rect = textElement.getBBox();
    if (rect == null) {
        writer.writeElement(textElement);
        return;
    }

    GState gState = textElement.getGState();
    boolean inClippingPath = clippingPathStack.almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
    boolean isTextVisible = isTextRenderedVisibly(gState);
    boolean shown = inClippingPath && isTextVisible;

    if (shown) {
        visibleElements.add(ElementFeatures.extractFeatures(textElement));
    }

    boolean keep = delta ? !shown : shown;
    if (keep) {
        if (delta) {
            gState.setFillColorSpace(ColorSpace.createDeviceRGB());
            if (!isTextVisible) {
                // Invisible text is made visible in blue. This takes precedence over the
                // "clipped" marker so that the element is written only once: writing the same
                // element twice would advance the text position twice.
                gState.setFillColor(new ColorPt(0, 0, 1));
                gState.setTextRenderMode(GState.e_fill_text);
                gState.setFillOpacity(1);
            } else {
                gState.setFillColor(new ColorPt(1, 0, 0));
            }
        }
        writer.writeElement(textElement);
    } else if (textElement.hasTextMatrix()) {
        /*
        PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element.
        hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands.
        Therefore, the position of a following Tj is affected by not writing the first Element.
        This is why, we write only the Tm command (in both modes):
         */
        writer.writeGStateChanges(textElement);
    }
}
/**
 * Writes a form XObject reference and, the first time this form is seen, rewrites the form's
 * own content through {@code processElements}.
 *
 * <p>The form content is processed inside its own clipping-stack frame. Painting a form XObject
 * saves the graphics state before and restores it after the form's content, so clipping paths
 * set inside the form must not affect elements that follow the form.
 *
 * @param visited object numbers of already processed content; the form's number is added
 * @throws PDFNetException if reading or writing the form content fails
 */
private void processForm(ElementReader reader, ElementWriter writer, Element element, Set<Integer> visited, ClippingPathStack clippingPathStack, Boolean delta,
                         List<ElementFeatures> coveredElements, List<ElementFeatures> allElements)
        throws PDFNetException {
    writer.writeElement(element);

    Obj formObj = element.getXObject();
    // Set.add returns false when the form was already processed.
    if (!visited.add((int) formObj.getObjNum())) {
        return;
    }

    ElementWriter formWriter = new ElementWriter();
    reader.formBegin();
    formWriter.begin(formObj);
    reader.clearChangeList();
    formWriter.setDefaultGState(reader);

    clippingPathStack.enterNewGState();
    try {
        processElements(reader, formWriter, visited, clippingPathStack, delta, coveredElements, allElements);
    } finally {
        clippingPathStack.leaveGState();
    }

    formWriter.end();
    reader.end();
}
/**
 * Handles a path element.
 *
 * <p>A clipping path narrows the current clipping region and is written with its clip flag set
 * to {@code !delta}. For any other path:
 * <ul>
 *   <li>inside the clipping region: if the path is filled and fully opaque, every currently
 *       visible element whose bounding box it (almost) contains is moved to
 *       {@code overlappedElements}; the path itself is then recorded as visible and, in normal
 *       mode, written;</li>
 *   <li>outside the clipping region: written only in delta mode, recoloured red.</li>
 * </ul>
 */
private void processPath(Element pathElement, ElementWriter writer, ClippingPathStack clippingPathStack, Boolean delta,
                         List<ElementFeatures> overlappedElements, List<ElementFeatures> visibleElements)
        throws PDFNetException {
    GeneralPath shape = convertToGeneralPath(pathElement.getPathData());
    // transform path to initial user space
    shape.transform(getAffineTransform(pathElement.getCTM()));

    if (pathElement.isClippingPath()) {
        int windingRule = pathElement.isClipWindingFill() ? GeneralPath.WIND_NON_ZERO : GeneralPath.WIND_EVEN_ODD;
        shape.setWindingRule(windingRule);
        clippingPathStack.intersectClippingPath(shape);
        pathElement.setPathClip(!delta);
        writer.writeElement(pathElement);
        return;
    }

    Rectangle2D bounds = shape.getBounds2D();
    boolean insideClip = clippingPathStack.almostIntersects(bounds.getX(), bounds.getY(), bounds.getWidth(), bounds.getHeight());

    if (!insideClip) {
        if (delta) {
            // Highlight the clipped-away path in red, both fill and stroke.
            pathElement.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
            pathElement.getGState().setFillColor(new ColorPt(1, 0, 0));
            pathElement.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
            pathElement.getGState().setStrokeColor(new ColorPt(1, 0, 0));
            writer.writeElement(pathElement);
        }
        return;
    }

    if (isFilledAndNonTransparent(pathElement)) {
        List<ElementFeatures> nowCovered = new ArrayList<>();
        for (ElementFeatures candidate : visibleElements) {
            if (almostContains(shape, candidate.getBoundingBox())) {
                nowCovered.add(candidate);
            }
        }
        overlappedElements.addAll(nowCovered);
        visibleElements.removeAll(nowCovered);
    }
    visibleElements.add(ElementFeatures.extractFeatures(pathElement));
    if (!delta) {
        writer.writeElement(pathElement);
    }
}
/**
 * Converts a PDFTron matrix into the equivalent AWT transform, passing the components
 * a, b, c, d, h, v in that order to the six-argument {@link AffineTransform} constructor.
 */
private static AffineTransform getAffineTransform(Matrix2D ctm) throws PDFNetException {
    double a = ctm.getA();
    double b = ctm.getB();
    double c = ctm.getC();
    double d = ctm.getD();
    double h = ctm.getH();
    double v = ctm.getV();
    return new AffineTransform(a, b, c, d, h, v);
}
/**
 * Second pass over a page: rewrites the content and leaves out the elements recorded as
 * overlapped by the first pass.
 *
 * <p>In delta mode the overlapped elements are not removed; a green rectangle is drawn around
 * each recorded bounding box and the list is cleared before the content is rewritten. Elements
 * still left in the list after the pass are reported with a warning.
 */
private void removeOverlappedElements(Page page, ElementReader reader, ElementWriter writer, Set<Integer> visited, List<ElementFeatures> overlappedElements, boolean delta)
        throws PDFNetException {
    reader.begin(page);
    writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());

    if (delta) {
        for (ElementFeatures feature : overlappedElements) {
            drawBBox(writer, feature.getBoundingBox(), "#00FF00");
        }
        overlappedElements.clear();
    }

    processOverlappedElements(reader, writer, visited, overlappedElements, delta);

    writer.end();
    reader.end();

    if (!overlappedElements.isEmpty()) {
        log.warn(overlappedElements.size() + " overlapped elements have not been found and removed");
    }
}
/**
 * Reads elements from {@code reader} until it is exhausted and writes them back, leaving out
 * every path, image or text element that matches an entry of {@code coveredElements}.
 *
 * <p>Each entry of {@code coveredElements} removes at most one element: the entry is deleted
 * from the list as soon as it has matched. Forms are descended into; all other element types
 * are written unchanged.
 *
 * @param coveredElements features of the elements to leave out; matched entries are removed
 * @throws PDFNetException if reading or writing an element fails
 */
private void processOverlappedElements(ElementReader reader, ElementWriter writer, Set<Integer> visited, List<ElementFeatures> coveredElements, boolean delta)
        throws PDFNetException {
    for (Element element = reader.next(); element != null; element = reader.next()) {
        switch (element.getType()) {
            case Element.e_form:
                processFormOverlappedElements(reader, writer, element, visited, coveredElements, delta);
                break;
            case Element.e_path:
            case Element.e_image:
            case Element.e_inline_image:
            case Element.e_text:
                boolean anyMatch = false;
                // Index-based removal of the first matching entry; the loop stops right after.
                for (int i = 0; i < coveredElements.size(); i++) {
                    if (coveredElements.get(i).almostMatches(element)) {
                        coveredElements.remove(i);
                        anyMatch = true;
                        break;
                    }
                }
                if (!anyMatch) {
                    writer.writeElement(element);
                } else if (element.getType() == Element.e_text && element.hasTextMatrix()) {
                    /*
                    PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element.
                    hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands.
                    Therefore, the position of a following Tj is affected by not writing the first Element.
                    This is why, we write only the Tm command:
                     */
                    writer.writeGStateChanges(element);
                }
                break;
            default:
                writer.writeElement(element);
        }
    }
}
/**
 * Writes a form XObject reference and, the first time this form is seen in the current pass,
 * rewrites the form's own content through {@code processOverlappedElements}.
 *
 * @param visited object numbers of already processed content; the form's number is added
 */
private void processFormOverlappedElements(ElementReader reader, ElementWriter writer, Element element, Set<Integer> visited, List<ElementFeatures> elementsToRemove, boolean delta)
        throws PDFNetException {
    writer.writeElement(element);

    Obj formObj = element.getXObject();
    int formObjNum = (int) formObj.getObjNum();
    // Set.add returns false when the form was already processed.
    if (!visited.add(formObjNum)) {
        return;
    }

    ElementWriter formWriter = new ElementWriter();
    reader.formBegin();
    formWriter.begin(formObj);
    reader.clearChangeList();
    formWriter.setDefaultGState(reader);
    processOverlappedElements(reader, formWriter, visited, elementsToRemove, delta);
    formWriter.end();
    reader.end();
}
/**
 * Decides from the text render mode and the opacities whether text would be painted visibly.
 *
 * <p>Invisible mode is never visible. Fill mode needs a non-zero fill opacity, stroke mode a
 * non-zero stroke opacity, and fill+stroke mode at least one of the two. Every other render
 * mode is treated as visible.
 */
private boolean isTextRenderedVisibly(GState gState) throws PDFNetException {
    int renderMode = gState.getTextRenderMode();
    if (renderMode == GState.e_invisible_text) {
        return false;
    }
    if (renderMode == GState.e_fill_text) {
        return !(gState.getFillOpacity() == 0);
    }
    if (renderMode == GState.e_stroke_text) {
        return !(gState.getStrokeOpacity() == 0);
    }
    if (renderMode == GState.e_fill_stroke_text) {
        return !(gState.getFillOpacity() == 0 && gState.getStrokeOpacity() == 0);
    }
    return true;
}
/**
 * Converts PDFTron path data into an AWT {@link GeneralPath}.
 *
 * <p>The operator array is walked in order; each operator consumes its coordinates from the
 * point array (2 for moveto/lineto, 6 for cubicto, 4 for rect, 0 for closepath). A rectangle is
 * emitted as a closed sub-path, as the PDF {@code re} operator defines it, so that a following
 * segment starts from the rectangle's origin rather than from its last corner.
 *
 * @param pathData operators and points of a path element
 * @return the equivalent path in the same coordinate space as {@code pathData}
 * @throws PDFNetException if an operator other than the five listed above is encountered
 */
private GeneralPath convertToGeneralPath(PathData pathData) throws PDFNetException {
    GeneralPath linePath = new GeneralPath();
    double[] pts = pathData.getPoints();
    byte[] operators = pathData.getOperators();
    int p = 0;
    for (byte operator : operators) {
        switch (operator) {
            case PathData.e_moveto: {
                double x = pts[p++];
                double y = pts[p++];
                linePath.moveTo(x, y);
                break;
            }
            case PathData.e_lineto: {
                double x = pts[p++];
                double y = pts[p++];
                linePath.lineTo(x, y);
                break;
            }
            case PathData.e_cubicto: {
                double x1 = pts[p++];
                double y1 = pts[p++];
                double x2 = pts[p++];
                double y2 = pts[p++];
                double x3 = pts[p++];
                double y3 = pts[p++];
                linePath.curveTo(x1, y1, x2, y2, x3, y3);
                break;
            }
            case PathData.e_rect: {
                double x = pts[p++];
                double y = pts[p++];
                double w = pts[p++];
                double h = pts[p++];
                linePath.moveTo(x, y);
                linePath.lineTo(x + w, y);
                linePath.lineTo(x + w, y + h);
                linePath.lineTo(x, y + h);
                // A rectangle is a complete, closed sub-path.
                linePath.closePath();
                break;
            }
            case PathData.e_closepath:
                linePath.closePath();
                break;
            default:
                throw new PDFNetException("Invalid Element Type", 0, "", "", "");
        }
    }
    return linePath;
}
/**
 * Tests whether {@code outer} contains {@code inner} after {@code inner} has been shrunk by a
 * small tolerance (1e-3) on every side.
 *
 * <p>A rectangle whose width or height does not exceed twice the tolerance shrinks to an empty
 * rectangle; the result then depends on how {@code outer.contains} treats empty rectangles.
 *
 * @param outer shape that may cover the rectangle
 * @param inner rectangle to test
 * @return the result of {@code outer.contains} for the shrunk rectangle
 */
private boolean almostContains(Shape outer, Rectangle2D inner) {
    double tolerance = 1e-3;
    // Shrinking must always move the origin up/right, independent of the sign of the
    // coordinate; otherwise rectangles at negative x or y grow on that side instead of shrinking.
    Rectangle2D innerRect = new Rectangle2D.Double(
            inner.getX() + tolerance,
            inner.getY() + tolerance,
            inner.getWidth() - (2 * tolerance),
            inner.getHeight() - (2 * tolerance));
    return outer.contains(innerRect);
}
/**
 * @return {@code true} only if the element is filled and its fill opacity is exactly 1
 */
private boolean isFilledAndNonTransparent(Element element) throws PDFNetException {
    if (!element.isFilled()) {
        return false;
    }
    return element.getGState().getFillOpacity() == 1;
}
/**
 * Writes a stroked rectangle with the given outline colour.
 *
 * @param writer   writer receiving the rectangle
 * @param r        rectangle to draw
 * @param hexcolor colour in the form {@code #RRGGBB}
 */
@SneakyThrows
private void drawBBox(ElementWriter writer, Rectangle2D r, String hexcolor) {
    // Each two-digit hex channel is scaled from 0..255 to 0..1.
    double red = Integer.parseInt(hexcolor.substring(1, 3), 16) / 255d;
    double green = Integer.parseInt(hexcolor.substring(3, 5), 16) / 255d;
    double blue = Integer.parseInt(hexcolor.substring(5, 7), 16) / 255d;
    ColorPt strokeColor = new ColorPt(red, green, blue);

    ElementBuilder builder = new ElementBuilder();
    Element outline = builder.createRect(r.getX(), r.getY(), r.getWidth(), r.getHeight());
    outline.setPathStroke(true);
    outline.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
    outline.getGState().setStrokeColor(strokeColor);
    writer.writePlacedElement(outline);
}
}

View File

@ -1,42 +1,26 @@
package com.iqser.red.service.ocr.v1.server.service;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.io.IOUtils;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.stereotype.Service;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse;
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
import com.iqser.red.service.ocr.v1.server.model.ImagePosition;
import com.iqser.red.service.ocr.v1.server.model.image.ImageServiceResponse;
import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings;
import com.iqser.red.service.persistence.service.v1.api.utils.SuppressFBWarnings;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.ContentReplacer;
import com.pdftron.pdf.OCRModule;
import com.pdftron.pdf.OCROptions;
import com.pdftron.pdf.Optimizer;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.Rect;
import com.pdftron.pdf.RectCollection;
import com.pdftron.pdf.TextExtractor;
import com.pdftron.pdf.*;
import com.pdftron.sdf.SDFDoc;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.IOUtils;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.stereotype.Service;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.util.*;
@Slf4j
@Service
@ -44,7 +28,6 @@ import lombok.extern.slf4j.Slf4j;
public class OCRService {
public static final String ENGLISH = "eng";
public static final String REPLACEMENT_TEXT = "";
private final FileStorageService fileStorageService;
private final OcrServiceSettings settings;
@ -53,6 +36,8 @@ public class OCRService {
private final ObjectMapper objectMapper;
private final InvisibleElementService invisibleElementService;
@SneakyThrows
public InputStream ocrDocument(String dossierId, String fileId) {
@ -62,16 +47,16 @@ public class OCRService {
var fileBytes = IOUtils.toByteArray(fileStream);
var ocrBytes = ocr(fileBytes, fileId, imageServiceResponse);
byte[] fileWithoutInvisibleTextBytes = invisibleElementService.removeInvisibleElements(fileBytes, false);
var ocrBytes = ocr(fileWithoutInvisibleTextBytes, fileId, imageServiceResponse);
return new ByteArrayInputStream(ocrBytes);
}
@SuppressFBWarnings("REC_CATCH_EXCEPTION")
private byte[] ocr(byte[] file, String fileId, ImageServiceResponse imageServiceResponse) {
PDFDoc pdfDoc = null;
try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
pdfDoc = new PDFDoc(file);
@ -86,30 +71,11 @@ public class OCRService {
imageMetadata.getPosition().getPageNumber()), imageMetadata.isAlpha())));
Map<Integer, PDFDoc> pdfDocMap = Collections.synchronizedMap(new HashMap<>());
Map<Integer, Integer> wordCountPerPage = Collections.synchronizedMap(new HashMap<>());
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder().fileId(fileId).numberOfPagesToOCR(pages.keySet().size()).build()));
ocrPages(pdfDoc, fileId, pages, pdfDocMap, wordCountPerPage);
for (var entry : pdfDocMap.entrySet()) {
var ocrDoc = entry.getValue();
var page = entry.getKey();
Page ocrPage = ocrDoc.getPageIterator(1).next();
TextExtractor txt = new TextExtractor();
txt.begin(ocrPage);
int wordCount = txt.getWordCount();
if (wordCount >= wordCountPerPage.get(page)) {
pdfDoc.pageInsert(pdfDoc.getPageIterator(page), ocrPage);
pdfDoc.pageRemove(pdfDoc.getPageIterator(page + 1));
}
ocrDoc.close();
}
ocrPages(pdfDoc, fileId, pages, pdfDocMap);
Optimizer.optimize(pdfDoc);
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
@ -139,7 +105,7 @@ public class OCRService {
@SneakyThrows
private void ocrPages(PDFDoc pdfDoc, String fileId, Map<Integer, List<ImagePosition>> pages, Map<Integer, PDFDoc> pdfDocMap, Map<Integer, Integer> wordCountPerPage) {
private void ocrPages(PDFDoc pdfDoc, String fileId, Map<Integer, List<ImagePosition>> pages, Map<Integer, PDFDoc> pdfDocMap) {
int numberOfOCRedPages = 0;
for (var pageEntry : pages.entrySet()) {
@ -149,35 +115,20 @@ public class OCRService {
var page = pageEntry.getKey();
var areasToRemoveInOcrDoc = new ArrayList<Rect>();
Page pdfPage = pdfDoc.getPageIterator(page).next();
pdfPage.setMediaBox(pdfPage.getCropBox());
TextExtractor txt = new TextExtractor();
txt.begin(pdfPage);
int wordCount = txt.getWordCount();
wordCountPerPage.put(page, wordCount);
for (ImagePosition imagePosition : pageEntry.getValue()) {
Rectangle rectangle = imagePosition.getRectangle();
Rect rect = convert(rectangle, pdfPage.getCropBox(), pdfPage.getMediaBox());
// Warning coordinate system is different in this call macOs/Linux
double y = -rectangle.getTopLeft().getY() + pdfPage.getCropBox().getY2() - rectangle.getHeight();
rectCollection.addRect(rectangle.getTopLeft().getX(), y, rectangle.getTopLeft().getX() + rectangle.getWidth(), y + rectangle.getHeight());
if (!imagePosition.isHasTransparency()) {
areasToRemoveInOcrDoc.add(rect);
}
}
rectCollection.clear();
PDFDoc ocrDoc = new PDFDoc();
ocrDoc.pagePushBack(pdfPage);
removeTextFromOCRPage(areasToRemoveInOcrDoc, ocrDoc);
pdfDocMap.put(pageEntry.getKey(), ocrDoc);
OCROptions options = new OCROptions();
@ -186,6 +137,8 @@ public class OCRService {
options.addDPI(settings.getOcrDPI());
OCRModule.processPDF(ocrDoc, options);
rectCollection.clear();
} catch (Exception e) {
log.warn("Failed to process PDF page {}", pageEntry.getKey());
}
@ -201,39 +154,4 @@ public class OCRService {
}
}
@SneakyThrows
private void removeTextFromOCRPage(List<Rect> areasToRemoveInOcrDoc, PDFDoc ocrDoc) {
Page ocrPage = ocrDoc.getPage(1);
for (var rect : areasToRemoveInOcrDoc) {
try {
ContentReplacer replacer = new ContentReplacer(); // Reinitialize is needed in loop.
replacer.addText(rect, REPLACEMENT_TEXT);
replacer.process(ocrPage);
} catch (Exception e) {
log.warn("Skipping removing text behind image because of: {}", e.getMessage());
break;
}
}
}
public Rect convert(Rectangle rectangle, Rect cropBox, Rect mediaBox) {
try {
var offset = 0.01;
var x1 = rectangle.getTopLeft().getX() + cropBox.getX1() - mediaBox.getX1() + (cropBox.equals(mediaBox) ? cropBox.getX1() : 0f) - offset;
var y1 = rectangle.getTopLeft().getY() + rectangle.getHeight() - (mediaBox.getY1() - cropBox.getY1()) + (cropBox.equals(mediaBox) ? cropBox.getY1() : 0f) + offset;
var x2 = rectangle.getTopLeft().getX() + rectangle.getWidth() + cropBox.getX1() - mediaBox.getX1() + (cropBox.equals(mediaBox) ? cropBox.getX1() : 0f) + offset;
var y2 = rectangle.getTopLeft().getY() - (mediaBox.getY1() - cropBox.getY1()) + (cropBox.equals(mediaBox) ? cropBox.getY1() : 0f) - offset;
// Rect is specified by lower-left and upperright corner.
return new Rect(x1, y1, x2, y2);
} catch (PDFNetException e) {
throw new RuntimeException(e);
}
}
}

View File

@ -0,0 +1,78 @@
package com.iqser.red.service.ocr.v1.server;
import com.iqser.red.service.ocr.v1.server.service.InvisibleElementService;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
import com.pdftron.pdf.TextExtractor;
import lombok.SneakyThrows;
import org.apache.commons.io.FileUtils;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.context.annotation.Import;
import org.springframework.core.io.ClassPathResource;
import org.springframework.test.context.junit.jupiter.SpringExtension;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
/**
 * Integration test for {@link InvisibleElementService}: processes a sample document in normal
 * and in delta mode, stores both results in the temporary directory for manual inspection and
 * checks the text that remains on each page of the cleaned document.
 */
@ExtendWith(SpringExtension.class)
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT //
        , properties = {"pdftron.ocrmodule.path=/YourOCRModulePath"})
@Import(OcrServiceIntegrationTest.TestConfiguration.class)
public class InvisibleElementServiceTest {

    @Autowired
    private InvisibleElementService invisibleElementService;

    @MockBean
    protected RabbitTemplate rabbitTemplate;

    @Test
    @SneakyThrows
    public void testRemoveInvisibleText() {
        String fileName = "InvisibleText";
        ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf");
        var initialFileBytes = Files.readAllBytes(pdfFileResource.getFile().toPath());

        var fileWithoutInvisibleText = invisibleElementService.removeInvisibleElements(initialFileBytes, false);
        var deltaFile = invisibleElementService.removeInvisibleElements(initialFileBytes, true);

        String fileWithoutInvisibleTextLocation = getTemporaryDirectory() + "/" + fileName + ".pdf";
        String deltaFileLocation = getTemporaryDirectory() + "/" + fileName + "_delta.pdf";
        saveToFile(fileWithoutInvisibleTextLocation, fileWithoutInvisibleText);
        saveToFile(deltaFileLocation, deltaFile);
        System.out.println("Output File without invisible elements: " + fileWithoutInvisibleTextLocation);
        System.out.println("Output Delta File: " + deltaFileLocation);

        TextExtractor extractor = new TextExtractor();
        PDFDoc pdfDoc = new PDFDoc(fileWithoutInvisibleText);
        try {
            PageIterator iterator = pdfDoc.getPageIterator();
            while (iterator.hasNext()) {
                Page page = iterator.next();
                extractor.begin(page);
                String[] text = extractor.getAsText().split("\n");
                assertThat(text).containsAnyOf("APPENDIX 16 Pathology Report", "Amendment 1", "Page 255 of 260");
            }
        } finally {
            // Release the document even when an assertion fails.
            pdfDoc.close();
        }
    }

    /**
     * Writes {@code fileBytes} to {@code location}.
     *
     * @throws RuntimeException wrapping the original {@link IOException} if the file cannot be written
     */
    private void saveToFile(String location, byte[] fileBytes) {
        try (var f_out = FileUtils.openOutputStream(new File(location))) {
            f_out.write(fileBytes);
        } catch (IOException e) {
            // Keep the cause so the underlying I/O problem stays visible in the test report.
            throw new RuntimeException("File location: " + location + " could not be opened, no file will be saved", e);
        }
    }
}

View File

@ -0,0 +1 @@
{"dossierId": "c8553cbd-409f-4e1a-baf4-34b11d49deac", "fileId": "bd6f93ed896dd0e2f641b0568f13ddf1", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": [{"classification": {"label": "other", "probabilities": {"other": 0.9999, "logo": 0.0001, "formula": 0.0, "signature": 0.0}}, "representation": "FFFFFEFBF7EFCFFFFFFFFFFFF", "position": {"x1": -3, "x2": 795, "y1": 0, "y2": 612, "pageNumber": 1}, "geometry": {"width": 798, "height": 612}, "alpha": false, "filters": {"geometry": {"imageSize": {"quotient": 1.0038, "tooLarge": true, "tooSmall": false}, "imageFormat": {"quotient": 1.3039, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "logo", "probabilities": {"logo": 0.9966, "other": 0.0025, "signature": 0.0005, "formula": 0.0003}}, "representation": "FFC33D3C323CCF3390C1F8C72", "position": {"x1": 120, "x2": 131, "y1": 264, "y2": 380, "pageNumber": 1}, "geometry": {"width": 11, "height": 116}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0513, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.0948, "tooTall": true, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "other", "probabilities": {"other": 1.0, "formula": 0.0, "logo": 0.0, "signature": 0.0}}, "representation": "EF8FF6381060800318F0E187", "position": {"x1": 152, "x2": 205, "y1": 115, "y2": 533, "pageNumber": 1}, "geometry": {"width": 53, "height": 418}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.2138, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.1268, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": true}}, {"classification": {"label": "other", "probabilities": {"other": 0.5967, "logo": 0.1756, "signature": 0.1218, "formula": 0.106}}, "representation": "EC9377C9170E1070C3070C30F", "position": {"x1": 
196, "x2": 221, "y1": 121, "y2": 245, "pageNumber": 1}, "geometry": {"width": 25, "height": 124}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.08, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.2016, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": true}}, {"classification": {"label": "logo", "probabilities": {"logo": 1.0, "formula": 0.0, "other": 0.0, "signature": 0.0}}, "representation": "CF1F1F70F140036860F1441B5", "position": {"x1": 707, "x2": 718, "y1": 481, "y2": 531, "pageNumber": 1}, "geometry": {"width": 11, "height": 50}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0337, "tooLarge": false, "tooSmall": true}, "imageFormat": {"quotient": 0.22, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "logo", "probabilities": {"logo": 0.9704, "other": 0.0223, "formula": 0.0044, "signature": 0.0029}}, "representation": "CF0F1C70F1C7090081F7CF073", "position": {"x1": 732, "x2": 744, "y1": 118, "y2": 241, "pageNumber": 1}, "geometry": {"width": 12, "height": 123}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0552, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.0976, "tooTall": true, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}], "dataCV": []}