Pull request #4: RED-6019: Remove hidden text when processing OCR

Merge in RED/ocr-service from RED-6019 to master

* commit 'a96260f77fd5b546a5d27d84f34861742f13ddff':
  RED-6019: Remove hidden text when processing OCR *moved InvisibleElementRemovalDto to private inner record of InvisibleElementRemovalService *added comments for color choices
  RED-6019: Remove hidden text when processing OCR *moved to release version of platform-dependencies *restored annotationProcessors
  RED-6019: Remove hidden text when processing OCR *code refactor *upgrade to java 17
  RED-6019: Remove hidden text when processing OCR handled cases:      Text which is transparent or is set to not render      Elements outside of clipping path      Elements that have been painted over by visible and filled Paths unhandled cases:      Elements covered by widely stroked path      Elements same color as background      Any Text set to clipping with its many interactions with other elements
This commit is contained in:
Kilian Schuettler 2023-02-02 13:05:03 +01:00
commit b37ec5afc9
9 changed files with 780 additions and 95 deletions

View File

@ -86,16 +86,6 @@
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<annotationProcessors>
<annotationProcessor>lombok.launch.AnnotationProcessorHider$AnnotationProcessor</annotationProcessor>
<annotationProcessor>com.dslplatform.json.processor.CompiledJsonAnnotationProcessor</annotationProcessor>
</annotationProcessors>
</configuration>
</plugin>
<plugin>
<!-- generate git.properties for exposure in /info -->
<groupId>pl.project13.maven</groupId>
@ -114,6 +104,16 @@
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<annotationProcessors>
<annotationProcessor>lombok.launch.AnnotationProcessorHider$AnnotationProcessor</annotationProcessor>
<annotationProcessor>com.dslplatform.json.processor.CompiledJsonAnnotationProcessor</annotationProcessor>
</annotationProcessors>
</configuration>
</plugin>
<plugin>
<!-- repackages the generated jar into a runnable fat-jar and makes it
executable -->

View File

@ -0,0 +1,68 @@
package com.iqser.red.service.ocr.v1.server.model;
import static com.iqser.red.service.ocr.v1.server.service.InvisibleElementRemovalService.TOLERANCE;
import java.awt.geom.Area;
import java.awt.geom.GeneralPath;
import java.awt.geom.Rectangle2D;
import java.util.Deque;
import java.util.LinkedList;
import com.pdftron.pdf.Rect;
import lombok.Data;
import lombok.SneakyThrows;
@Data
public class ClippingPathStack {
private Deque<Area> stack = new LinkedList<>();
@SneakyThrows
public ClippingPathStack(Rect rectangle) {
stack.push(new Area(new Rectangle2D.Double(rectangle.getX1(), rectangle.getY1(), rectangle.getWidth(), rectangle.getHeight()).getBounds2D()));
}
@SneakyThrows
public void intersectClippingPath(GeneralPath path) {
getCurrentClippingPath().intersect(new Area(path));
}
public boolean almostIntersects(double x, double y, double width, double height) {
// To address inconsistencies in the calculation of the bounding box we slightly increase the rectangle
// Height or width are zero for straight lines, even though they are being rendered. Therefore, height or width must be at minimum >0.
double x_with_tolerance = x > 0 ? x - TOLERANCE : x + TOLERANCE;
double y_with_tolerance = y > 0 ? y - TOLERANCE : y + TOLERANCE;
double width_with_tolerance = width + (2 * TOLERANCE);
double height_with_tolerance = height + (2 * TOLERANCE);
return getCurrentClippingPath().intersects(x_with_tolerance, y_with_tolerance, width_with_tolerance, height_with_tolerance);
}
public Area getCurrentClippingPath() {
return stack.peek();
}
public void enterNewGState() {
Area current = stack.peek();
Area cloned = new Area();
cloned.add(current);
stack.push(cloned);
}
public void leaveGState() {
stack.pop();
}
}

View File

@ -0,0 +1,170 @@
package com.iqser.red.service.ocr.v1.server.model;
import static com.iqser.red.service.ocr.v1.server.service.InvisibleElementRemovalService.TOLERANCE;
import java.awt.geom.Rectangle2D;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.Rect;
import lombok.AccessLevel;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
@Getter
@SuperBuilder
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ElementFeatures {
int elementType;
Rectangle2D boundingBox;
public boolean almostMatches(Element element) throws PDFNetException {
return element.getType() == elementType && //
element.getBBox() != null && //
rectsAlmostMatch(element.getBBox());
}
protected boolean almostEqual(double a, double b) {
return Math.abs(a - b) < TOLERANCE;
}
@SneakyThrows
private boolean rectsAlmostMatch(Rect bBox) {
// To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance
return almostEqual(bBox.getX1(), boundingBox.getX()) && //
almostEqual(bBox.getY1(), boundingBox.getY()) && //
almostEqual(bBox.getWidth(), boundingBox.getWidth()) && //
almostEqual(bBox.getHeight(), boundingBox.getHeight());
}
@EqualsAndHashCode(callSuper = true)
@Getter
@SuperBuilder
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
private static class Text extends ElementFeatures {
String text;
int font;
double fontsize;
@Override
public boolean almostMatches(Element element) throws PDFNetException {
return super.almostMatches(element) && //
text.equals(element.getTextString()) && //
font == element.getGState().getFont().getType() && //
almostEqual(fontsize, element.getGState().getFontSize());
}
}
@EqualsAndHashCode(callSuper = true)
@Getter
@SuperBuilder
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
private static class Path extends ElementFeatures {
boolean isClippingPath;
boolean isClipWindingFill;
boolean isStroked;
boolean isFilled;
boolean isWindingFill;
@Override
public boolean almostMatches(Element element) throws PDFNetException {
return super.almostMatches(element) && //
isClippingPath == element.isClippingPath() && //
isClipWindingFill == element.isClipWindingFill() && //
isStroked == element.isStroked() && //
isFilled == element.isFilled() && //
isWindingFill == element.isWindingFill();
}
}
@EqualsAndHashCode(callSuper = true)
@Getter
@SuperBuilder
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
private static class Image extends ElementFeatures {
int dataSize;
int height;
int width;
int renderingIntent;
int componentNum;
int bitsPerComponent;
@Override
public boolean almostMatches(Element element) throws PDFNetException {
return super.almostMatches(element) && //
dataSize == element.getImageDataSize() && //
height == element.getImageHeight() && //
width == element.getImageWidth() && //
renderingIntent == element.getImageRenderingIntent() && //
componentNum == element.getComponentNum() && //
bitsPerComponent == element.getBitsPerComponent();
}
}
public static ElementFeatures extractFeatures(Element element) throws PDFNetException {
return switch (element.getType()) {
case Element.e_path -> Path.builder()
.elementType(element.getType())
.boundingBox(toRectangle2D(element.getBBox()))
.isClippingPath(element.isClippingPath())
.isClipWindingFill(element.isClipWindingFill())
.isStroked(element.isStroked())
.isFilled(element.isFilled())
.isWindingFill(element.isWindingFill())
.build();
case Element.e_text -> Text.builder()
.elementType(element.getType())
.boundingBox(toRectangle2D(element.getBBox()))
.text(element.getTextString())
.font(element.getGState().getFont().getType())
.fontsize(element.getGState().getFontSize())
.build();
case Element.e_image, Element.e_inline_image -> Image.builder()
.elementType(element.getType())
.boundingBox(toRectangle2D(element.getBBox()))
.dataSize(element.getImageDataSize())
.height(element.getImageHeight())
.width(element.getImageWidth())
.renderingIntent(element.getImageRenderingIntent())
.componentNum(element.getComponentNum())
.bitsPerComponent(element.getBitsPerComponent())
.build();
// This technically should never happen, it's a safetynet
default -> throw new RuntimeException("Feature Extraction is not supported for PDFTron.Element with type: " + element.getType());
};
}
private static Rectangle2D toRectangle2D(Rect rect) throws PDFNetException {
return new Rectangle2D.Double(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
}
}

View File

@ -0,0 +1,435 @@
package com.iqser.red.service.ocr.v1.server.service;
import java.awt.Shape;
import java.awt.geom.AffineTransform;
import java.awt.geom.GeneralPath;
import java.awt.geom.Rectangle2D;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import org.springframework.stereotype.Service;
import com.google.common.primitives.Bytes;
import com.google.common.primitives.Doubles;
import com.iqser.red.service.ocr.v1.server.model.ClippingPathStack;
import com.iqser.red.service.ocr.v1.server.model.ElementFeatures;
import com.pdftron.common.Matrix2D;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.ColorPt;
import com.pdftron.pdf.ColorSpace;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementBuilder;
import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.GState;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
import com.pdftron.pdf.PathData;
import com.pdftron.pdf.Rect;
import com.pdftron.sdf.Obj;
import com.pdftron.sdf.SDFDoc;
import lombok.Builder;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
public class InvisibleElementRemovalService {
static public final double TOLERANCE = 1e-3;
/**
* Removes all hidden Text, Path and Image Elements from a PDF Document.
* handled cases:
* -Text which is transparent or is set to not render
* -Elements outside of clipping path
* -Elements that have been painted over by visible and filled Paths
* unhandled cases:
* -Elements covered by widely stroked path
* -Elements with the same color as background
* -Any Text set to clipping with its many interactions with other elements
*
* @param pdfFile The PDF file to process
* @param delta If this flag is set only the removed Elements will be written to the output file.
* The Elements are red if they are removed by clipping path, blue for transparency, and a green bounding box for overlap.
* @return The resulting PDF File as bytes.
**/
@SneakyThrows
public byte[] removeInvisibleElements(InputStream pdfFile, boolean delta) {
PDFDoc pdfDoc = new PDFDoc(pdfFile);
ElementWriter writer = new ElementWriter();
ElementReader reader = new ElementReader();
Set<Long> visitedXObjIds = new TreeSet<>();
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
Page page = iterator.next();
visitedXObjIds.add(page.getSDFObj().getObjNum());
InvisibleElementRemovalContext context = InvisibleElementRemovalContext.builder()
.reader(reader)
.clippingPathStack(new ClippingPathStack(page.getMediaBox()))
.delta(delta)
.overlappedElements(new ArrayList<>())
.visibleElements(new ArrayList<>())
.visitedXObjIds(visitedXObjIds)
.build();
removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context);
context.visitedXObjIds().clear();
removeOverlappedElements(page, writer, context);
}
return pdfDoc.save(SDFDoc.SaveMode.REMOVE_UNUSED, null);
}
private void removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(Page page,
ElementWriter writer,
InvisibleElementRemovalContext context) throws PDFNetException {
context.reader().begin(page);
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
processElements(writer, context);
writer.end();
context.reader().end();
}
private void processElements(ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
for (Element element = context.reader().next(); element != null; element = context.reader().next())
switch (element.getType()) {
case Element.e_image, Element.e_inline_image -> processImages(element, writer, context);
case Element.e_text -> processText(element, writer, context);
case Element.e_path -> processPath(element, writer, context);
case Element.e_form -> processForm(element, writer, context);
case Element.e_group_begin -> {
context.clippingPathStack().enterNewGState();
writer.writeElement(element);
}
case Element.e_group_end -> {
context.clippingPathStack().leaveGState();
writer.writeElement(element);
}
default -> writer.writeElement(element);
}
}
private void processImages(Element imageElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
Rect rect = imageElement.getBBox();
if (rect == null) {
return;
}
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
if (!context.delta() && inClippingPath) {
context.visibleElements().add(ElementFeatures.extractFeatures(imageElement));
}
if (context.delta() ^ inClippingPath) {
writer.writeElement(imageElement);
}
}
private void processText(Element textElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
Rect rect = textElement.getBBox();
if (rect == null) {
writer.writeElement(textElement);
return;
}
GState gState = textElement.getGState();
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
boolean isTextVisible = isTextRenderedVisibly(gState);
if (inClippingPath && isTextVisible) {
context.visibleElements().add(ElementFeatures.extractFeatures(textElement));
}
if (!context.delta()) {
if (inClippingPath && isTextVisible) {
writer.writeElement(textElement);
} else if (textElement.hasTextMatrix()) {
/*
PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element.
hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands.
Therefore, the position of a following Tj is affected by not writing the first Element.
This is why, we write only the Tm command:
*/
writer.writeGStateChanges(textElement);
}
} else {
if (!inClippingPath) {
gState.setFillColorSpace(ColorSpace.createDeviceRGB());
// red for elements removed by clipping path
gState.setFillColor(new ColorPt(1, 0, 0));
writer.writeElement(textElement);
}
if (!isTextVisible) {
gState.setFillColorSpace(ColorSpace.createDeviceRGB());
// blue for elements removed due to transparency or not rendered
gState.setFillColor(new ColorPt(0, 0, 1));
gState.setTextRenderMode(GState.e_fill_text);
gState.setFillOpacity(1);
writer.writeElement(textElement);
}
}
}
private void processForm(Element formElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
writer.writeElement(formElement);
Obj formObj = formElement.getXObject();
if (!context.visitedXObjIds().contains(formObj.getObjNum())) {
context.visitedXObjIds().add(formObj.getObjNum());
// writer needs to be newly initialized when entering a new content stream
// see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
ElementWriter formWriter = new ElementWriter();
context.reader().formBegin();
formWriter.begin(formObj);
context.reader().clearChangeList();
formWriter.setDefaultGState(context.reader());
processElements(formWriter, context);
formWriter.end();
context.reader().end();
}
}
private void processPath(Element pathElement, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
GeneralPath linePath = convertToGeneralPath(pathElement.getPathData());
//transform path to initial user space
var ctm = pathElement.getCTM();
var affineTransform = getAffineTransform(ctm);
linePath.transform(affineTransform);
var rect = linePath.getBounds2D();
boolean inClippingPath = context.clippingPathStack().almostIntersects(rect.getX(), rect.getY(), rect.getWidth(), rect.getHeight());
if (pathElement.isClippingPath()) {
if (pathElement.isClipWindingFill()) {
linePath.setWindingRule(GeneralPath.WIND_NON_ZERO);
} else {
linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD);
}
context.clippingPathStack().intersectClippingPath(linePath);
pathElement.setPathClip(!context.delta());
writer.writeElement(pathElement);
} else {
if (inClippingPath) {
// TODO: WINDING RULE
if (isFilledAndNonTransparent(pathElement)) {
List<ElementFeatures> currentOverlappedElements = context.visibleElements()
.stream()
.filter(features -> almostContains(linePath, features.getBoundingBox()))
.toList();
context.overlappedElements().addAll(currentOverlappedElements);
context.visibleElements().removeAll(currentOverlappedElements);
}
context.visibleElements().add(ElementFeatures.extractFeatures(pathElement));
if (!context.delta()) {
writer.writeElement(pathElement);
}
}
if (context.delta() && !inClippingPath) {
pathElement.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
pathElement.getGState().setFillColor(new ColorPt(1, 0, 0));
pathElement.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
pathElement.getGState().setStrokeColor(new ColorPt(1, 0, 0));
writer.writeElement(pathElement);
}
}
}
private static AffineTransform getAffineTransform(Matrix2D ctm) throws PDFNetException {
return new AffineTransform(ctm.getA(), ctm.getB(), ctm.getC(), ctm.getD(), ctm.getH(), ctm.getV());
}
private void removeOverlappedElements(Page page, ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
context.reader().begin(page);
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
if (context.delta()) {
// green for element removed due to overlapping
context.overlappedElements().forEach(feature -> drawBBox(writer, feature.getBoundingBox(), "#00FF00"));
context.overlappedElements().clear();
}
processOverlappedElements(writer, context);
writer.end();
context.reader().end();
if (context.overlappedElements().size() > 0) {
log.warn(context.overlappedElements().size() + " overlapped elements have not been found or removed");
}
}
private void processOverlappedElements(ElementWriter writer, InvisibleElementRemovalContext context) throws PDFNetException {
for (Element element = context.reader().next(); element != null; element = context.reader().next()) {
switch (element.getType()) {
case Element.e_form -> processFormOverlappedElements(writer, element, context);
case Element.e_path, Element.e_image, Element.e_inline_image, Element.e_text -> {
boolean anyMatch = false;
for (ElementFeatures elementToRemove : context.overlappedElements()) {
if (elementToRemove.almostMatches(element)) {
context.overlappedElements().remove(elementToRemove);
anyMatch = true;
break;
}
}
if (!anyMatch) {
writer.writeElement(element);
} else if (element.getType() == 3 && element.hasTextMatrix()) {
/*
PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element.
hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands.
Therefore, the position of a following Tj is affected by not writing the first Element.
This is why, we write only the Tm command:
*/
writer.writeGStateChanges(element);
}
}
default -> writer.writeElement(element);
}
}
}
private void processFormOverlappedElements(ElementWriter writer, Element formElement, InvisibleElementRemovalContext context) throws PDFNetException {
writer.writeElement(formElement);
Obj formObj = formElement.getXObject();
if (!context.visitedXObjIds().contains(formObj.getObjNum())) {
context.visitedXObjIds().add(formObj.getObjNum());
// writer needs to be newly initialized when entering a new content stream
// see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
ElementWriter formWriter = new ElementWriter();
context.reader().formBegin();
formWriter.begin(formObj);
context.reader().clearChangeList();
formWriter.setDefaultGState(context.reader());
processOverlappedElements(formWriter, context);
formWriter.end();
context.reader().end();
}
}
private boolean isTextRenderedVisibly(GState gState) throws PDFNetException {
return gState.getTextRenderMode() != GState.e_invisible_text && //
!(gState.getTextRenderMode() == GState.e_fill_text && gState.getFillOpacity() == 0) && //
!(gState.getTextRenderMode() == GState.e_stroke_text && gState.getStrokeOpacity() == 0) && //
!(gState.getTextRenderMode() == GState.e_fill_stroke_text && gState.getFillOpacity() == 0 && gState.getStrokeOpacity() == 0);
}
private GeneralPath convertToGeneralPath(PathData pathData) throws PDFNetException {
GeneralPath linePath = new GeneralPath();
Iterator<Double> points = Doubles.asList(pathData.getPoints()).iterator();
Iterable<Byte> operators = Bytes.asList(pathData.getOperators());
for (var operator : operators) {
switch (operator) {
case PathData.e_moveto -> linePath.moveTo(points.next(), points.next());
case PathData.e_lineto -> linePath.lineTo(points.next(), points.next());
case PathData.e_cubicto -> linePath.curveTo(points.next(), points.next(), points.next(), points.next(), points.next(), points.next());
case PathData.e_closepath -> linePath.closePath();
case PathData.e_rect -> {
double x = points.next();
double y = points.next();
double w = points.next();
double h = points.next();
linePath.moveTo(x, y);
linePath.lineTo(x + w, y);
linePath.lineTo(x + w, y + h);
linePath.lineTo(x, y + h);
linePath.closePath();
}
default -> throw new PDFNetException("Invalid Element Type", 0, "", "", "");
}
}
return linePath;
}
private boolean almostContains(Shape outer, Rectangle2D inner) {
//To address inconsistencies in the calculation of the bounding box we slightly shrink the inner rectangle
double x_with_tolerance = inner.getX() >= 0 ? inner.getX() + TOLERANCE : inner.getX() - TOLERANCE;
double y_with_tolerance = inner.getY() >= 0 ? inner.getY() + TOLERANCE : inner.getY() - TOLERANCE;
double height_with_tolerance = inner.getHeight() - (2 * TOLERANCE);
double width_with_tolerance = inner.getWidth() - (2 * TOLERANCE);
Rectangle2D innerRect = new Rectangle2D.Double(x_with_tolerance, y_with_tolerance, width_with_tolerance, height_with_tolerance);
return outer.contains(innerRect);
}
private boolean isFilledAndNonTransparent(Element element) throws PDFNetException {
return element.isFilled() && element.getGState().getFillOpacity() == 1;
}
@SneakyThrows
private void drawBBox(ElementWriter writer, Rectangle2D r, String hexcolor) {
ColorPt colorPt = new ColorPt(Integer.valueOf(hexcolor.substring(1, 3), 16) / 255d,
Integer.valueOf(hexcolor.substring(3, 5), 16) / 255d,
Integer.valueOf(hexcolor.substring(5, 7), 16) / 255d);
ElementBuilder eb = new ElementBuilder();
Element rect = eb.createRect(r.getX(), r.getY(), r.getWidth(), r.getHeight());
rect.setPathStroke(true);
rect.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
rect.getGState().setStrokeColor(colorPt);
writer.writePlacedElement(rect);
}
@Builder
private record InvisibleElementRemovalContext(boolean delta, //
ElementReader reader, //
ClippingPathStack clippingPathStack, //
List<ElementFeatures> overlappedElements, //
List<ElementFeatures> visibleElements, //
Set<Long> visitedXObjIds) {
}
}

View File

@ -9,7 +9,6 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.io.IOUtils;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.stereotype.Service;
@ -19,19 +18,14 @@ import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
import com.iqser.red.service.ocr.v1.server.model.ImagePosition;
import com.iqser.red.service.ocr.v1.server.model.image.ImageServiceResponse;
import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings;
import com.iqser.red.service.persistence.service.v1.api.utils.SuppressFBWarnings;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.ContentReplacer;
import com.pdftron.pdf.OCRModule;
import com.pdftron.pdf.OCROptions;
import com.pdftron.pdf.Optimizer;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.Rect;
import com.pdftron.pdf.RectCollection;
import com.pdftron.pdf.TextExtractor;
import com.pdftron.sdf.SDFDoc;
import lombok.RequiredArgsConstructor;
@ -44,7 +38,6 @@ import lombok.extern.slf4j.Slf4j;
public class OCRService {
public static final String ENGLISH = "eng";
public static final String REPLACEMENT_TEXT = "";
private final FileStorageService fileStorageService;
private final OcrServiceSettings settings;
@ -53,25 +46,25 @@ public class OCRService {
private final ObjectMapper objectMapper;
private final InvisibleElementRemovalService invisibleElementRemovalService;
@SneakyThrows
public InputStream ocrDocument(String dossierId, String fileId) {
var fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId);
var imageServiceResponse = fileStorageService.getImageServiceResponse(dossierId, fileId);
InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId);
ImageServiceResponse imageServiceResponse = fileStorageService.getImageServiceResponse(dossierId, fileId);
var fileBytes = IOUtils.toByteArray(fileStream);
byte[] fileWithoutInvisibleTextStream = invisibleElementRemovalService.removeInvisibleElements(fileStream, false);
var ocrBytes = ocr(fileBytes, fileId, imageServiceResponse);
byte[] ocrBytes = ocr(fileWithoutInvisibleTextStream, fileId, imageServiceResponse);
return new ByteArrayInputStream(ocrBytes);
}
@SuppressFBWarnings("REC_CATCH_EXCEPTION")
private byte[] ocr(byte[] file, String fileId, ImageServiceResponse imageServiceResponse) {
PDFDoc pdfDoc = null;
try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
pdfDoc = new PDFDoc(file);
@ -86,30 +79,11 @@ public class OCRService {
imageMetadata.getPosition().getPageNumber()), imageMetadata.isAlpha())));
Map<Integer, PDFDoc> pdfDocMap = Collections.synchronizedMap(new HashMap<>());
Map<Integer, Integer> wordCountPerPage = Collections.synchronizedMap(new HashMap<>());
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder().fileId(fileId).numberOfPagesToOCR(pages.keySet().size()).build()));
ocrPages(pdfDoc, fileId, pages, pdfDocMap, wordCountPerPage);
for (var entry : pdfDocMap.entrySet()) {
var ocrDoc = entry.getValue();
var page = entry.getKey();
Page ocrPage = ocrDoc.getPageIterator(1).next();
TextExtractor txt = new TextExtractor();
txt.begin(ocrPage);
int wordCount = txt.getWordCount();
if (wordCount >= wordCountPerPage.get(page)) {
pdfDoc.pageInsert(pdfDoc.getPageIterator(page), ocrPage);
pdfDoc.pageRemove(pdfDoc.getPageIterator(page + 1));
}
ocrDoc.close();
}
ocrPages(pdfDoc, fileId, pages, pdfDocMap);
Optimizer.optimize(pdfDoc);
pdfDoc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
@ -139,7 +113,7 @@ public class OCRService {
@SneakyThrows
private void ocrPages(PDFDoc pdfDoc, String fileId, Map<Integer, List<ImagePosition>> pages, Map<Integer, PDFDoc> pdfDocMap, Map<Integer, Integer> wordCountPerPage) {
private void ocrPages(PDFDoc pdfDoc, String fileId, Map<Integer, List<ImagePosition>> pages, Map<Integer, PDFDoc> pdfDocMap) {
int numberOfOCRedPages = 0;
for (var pageEntry : pages.entrySet()) {
@ -149,35 +123,20 @@ public class OCRService {
var page = pageEntry.getKey();
var areasToRemoveInOcrDoc = new ArrayList<Rect>();
Page pdfPage = pdfDoc.getPageIterator(page).next();
pdfPage.setMediaBox(pdfPage.getCropBox());
TextExtractor txt = new TextExtractor();
txt.begin(pdfPage);
int wordCount = txt.getWordCount();
wordCountPerPage.put(page, wordCount);
for (ImagePosition imagePosition : pageEntry.getValue()) {
Rectangle rectangle = imagePosition.getRectangle();
Rect rect = convert(rectangle, pdfPage.getCropBox(), pdfPage.getMediaBox());
// Warning coordinate system is different in this call macOs/Linux
double y = -rectangle.getTopLeft().getY() + pdfPage.getCropBox().getY2() - rectangle.getHeight();
rectCollection.addRect(rectangle.getTopLeft().getX(), y, rectangle.getTopLeft().getX() + rectangle.getWidth(), y + rectangle.getHeight());
if (!imagePosition.isHasTransparency()) {
areasToRemoveInOcrDoc.add(rect);
}
}
rectCollection.clear();
PDFDoc ocrDoc = new PDFDoc();
ocrDoc.pagePushBack(pdfPage);
removeTextFromOCRPage(areasToRemoveInOcrDoc, ocrDoc);
pdfDocMap.put(pageEntry.getKey(), ocrDoc);
OCROptions options = new OCROptions();
@ -186,6 +145,8 @@ public class OCRService {
options.addDPI(settings.getOcrDPI());
OCRModule.processPDF(ocrDoc, options);
rectCollection.clear();
} catch (Exception e) {
log.warn("Failed to process PDF page {}", pageEntry.getKey());
}
@ -201,39 +162,4 @@ public class OCRService {
}
}
@SneakyThrows
private void removeTextFromOCRPage(List<Rect> areasToRemoveInOcrDoc, PDFDoc ocrDoc) {
Page ocrPage = ocrDoc.getPage(1);
for (var rect : areasToRemoveInOcrDoc) {
try {
ContentReplacer replacer = new ContentReplacer(); // Reinitialize is needed in loop.
replacer.addText(rect, REPLACEMENT_TEXT);
replacer.process(ocrPage);
} catch (Exception e) {
log.warn("Skipping removing text behind image because of: {}", e.getMessage());
break;
}
}
}
public Rect convert(Rectangle rectangle, Rect cropBox, Rect mediaBox) {
try {
var offset = 0.01;
var x1 = rectangle.getTopLeft().getX() + cropBox.getX1() - mediaBox.getX1() + (cropBox.equals(mediaBox) ? cropBox.getX1() : 0f) - offset;
var y1 = rectangle.getTopLeft().getY() + rectangle.getHeight() - (mediaBox.getY1() - cropBox.getY1()) + (cropBox.equals(mediaBox) ? cropBox.getY1() : 0f) + offset;
var x2 = rectangle.getTopLeft().getX() + rectangle.getWidth() + cropBox.getX1() - mediaBox.getX1() + (cropBox.equals(mediaBox) ? cropBox.getX1() : 0f) + offset;
var y2 = rectangle.getTopLeft().getY() - (mediaBox.getY1() - cropBox.getY1()) + (cropBox.equals(mediaBox) ? cropBox.getY1() : 0f) - offset;
// Rect is specified by lower-left and upperright corner.
return new Rect(x1, y1, x2, y2);
} catch (PDFNetException e) {
throw new RuntimeException(e);
}
}
}

View File

@ -0,0 +1,85 @@
package com.iqser.red.service.ocr.v1.server;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.file.Files;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.boot.test.mock.mockito.MockBean;
import org.springframework.context.annotation.Import;
import org.springframework.core.io.ClassPathResource;
import org.springframework.test.context.junit.jupiter.SpringExtension;
import com.iqser.red.service.ocr.v1.server.service.InvisibleElementRemovalService;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
import com.pdftron.pdf.TextExtractor;
import lombok.SneakyThrows;
@ExtendWith(SpringExtension.class)
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT //
, properties = {"pdftron.ocrmodule.path=/YourOCRModulePath"})
@Import(OcrServiceIntegrationTest.TestConfiguration.class)
public class InvisibleElementRemovalServiceTest {
@Autowired
private InvisibleElementRemovalService invisibleElementRemovalService;
@MockBean
protected RabbitTemplate rabbitTemplate;
@Test
@SneakyThrows
public void testRemoveInvisibleText() {
String fileName = "InvisibleText";
ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf");
var initialFileStream = Files.newInputStream(pdfFileResource.getFile().toPath());
var fileWithoutInvisibleElements = invisibleElementRemovalService.removeInvisibleElements(initialFileStream, false);
initialFileStream = Files.newInputStream(pdfFileResource.getFile().toPath());
var deltaFile = invisibleElementRemovalService.removeInvisibleElements(initialFileStream, true);
String fileWithoutInvisibleTextLocation = getTemporaryDirectory() + "/" + fileName + ".pdf";
String deltaFileLocation = getTemporaryDirectory() + "/" + fileName + "_delta.pdf";
saveToFile(fileWithoutInvisibleTextLocation, fileWithoutInvisibleElements);
saveToFile(deltaFileLocation, deltaFile);
System.out.println("Output File without invisible elements: " + fileWithoutInvisibleTextLocation);
System.out.println("Output Delta File: " + deltaFileLocation);
TextExtractor extractor = new TextExtractor();
PDFDoc pdfDoc = new PDFDoc(fileWithoutInvisibleElements);
PageIterator iterator = pdfDoc.getPageIterator();
while (iterator.hasNext()) {
Page page = iterator.next();
extractor.begin(page);
String[] text = extractor.getAsText().split("\n");
assertThat(text).contains("APPENDIX 16 Pathology Report", "Amendment 1", "Page 255 of 260");
}
}
private void saveToFile(String location, byte[] fileBytes) {
try (var f_out = new FileOutputStream(location)) {
f_out.write(fileBytes);
} catch (IOException e) {
throw new RuntimeException("File location: " + location + "could not be openend, no file will be saved");
}
}
}

View File

@ -0,0 +1 @@
{"dossierId": "c8553cbd-409f-4e1a-baf4-34b11d49deac", "fileId": "bd6f93ed896dd0e2f641b0568f13ddf1", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": [{"classification": {"label": "other", "probabilities": {"other": 0.9999, "logo": 0.0001, "formula": 0.0, "signature": 0.0}}, "representation": "FFFFFEFBF7EFCFFFFFFFFFFFF", "position": {"x1": -3, "x2": 795, "y1": 0, "y2": 612, "pageNumber": 1}, "geometry": {"width": 798, "height": 612}, "alpha": false, "filters": {"geometry": {"imageSize": {"quotient": 1.0038, "tooLarge": true, "tooSmall": false}, "imageFormat": {"quotient": 1.3039, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "logo", "probabilities": {"logo": 0.9966, "other": 0.0025, "signature": 0.0005, "formula": 0.0003}}, "representation": "FFC33D3C323CCF3390C1F8C72", "position": {"x1": 120, "x2": 131, "y1": 264, "y2": 380, "pageNumber": 1}, "geometry": {"width": 11, "height": 116}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0513, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.0948, "tooTall": true, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "other", "probabilities": {"other": 1.0, "formula": 0.0, "logo": 0.0, "signature": 0.0}}, "representation": "EF8FF6381060800318F0E187", "position": {"x1": 152, "x2": 205, "y1": 115, "y2": 533, "pageNumber": 1}, "geometry": {"width": 53, "height": 418}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.2138, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.1268, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": true}}, {"classification": {"label": "other", "probabilities": {"other": 0.5967, "logo": 0.1756, "signature": 0.1218, "formula": 0.106}}, "representation": "EC9377C9170E1070C3070C30F", "position": {"x1": 196, "x2": 221, "y1": 121, "y2": 245, "pageNumber": 1}, "geometry": {"width": 25, "height": 124}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.08, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.2016, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": true}}, {"classification": {"label": "logo", "probabilities": {"logo": 1.0, "formula": 0.0, "other": 0.0, "signature": 0.0}}, "representation": "CF1F1F70F140036860F1441B5", "position": {"x1": 707, "x2": 718, "y1": 481, "y2": 531, "pageNumber": 1}, "geometry": {"width": 11, "height": 50}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0337, "tooLarge": false, "tooSmall": true}, "imageFormat": {"quotient": 0.22, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "logo", "probabilities": {"logo": 0.9704, "other": 0.0223, "formula": 0.0044, "signature": 0.0029}}, "representation": "CF0F1C70F1C7090081F7CF073", "position": {"x1": 732, "x2": 744, "y1": 118, "y2": 241, "pageNumber": 1}, "geometry": {"width": 12, "height": 123}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0552, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.0976, "tooTall": true, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}], "dataCV": []}

View File

@ -7,7 +7,7 @@
<parent>
<groupId>com.iqser.red</groupId>
<artifactId>platform-dependency</artifactId>
<version>1.14.0</version>
<version>1.16.0</version>
<relativePath/>
</parent>