RED-6019: Remove hidden text when processing OCR

* Code refactoring
* Upgrade to Java 17
This commit is contained in:
Kilian Schuettler 2023-02-02 10:27:01 +01:00
parent fd7ec6e7aa
commit 99a0cb51d0
9 changed files with 639 additions and 602 deletions

View File

@ -86,16 +86,6 @@
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<annotationProcessors>
<annotationProcessor>lombok.launch.AnnotationProcessorHider$AnnotationProcessor</annotationProcessor>
<annotationProcessor>com.dslplatform.json.processor.CompiledJsonAnnotationProcessor</annotationProcessor>
</annotationProcessors>
</configuration>
</plugin>
<plugin>
<!-- generate git.properties for exposure in /info -->
<groupId>pl.project13.maven</groupId>

View File

@ -1,53 +1,67 @@
package com.iqser.red.service.ocr.v1.server.model;
import com.pdftron.pdf.Rect;
import lombok.Data;
import lombok.SneakyThrows;
import static com.iqser.red.service.ocr.v1.server.service.InvisibleElementRemovalService.TOLERANCE;
import java.awt.geom.Area;
import java.awt.geom.GeneralPath;
import java.awt.geom.Rectangle2D;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.LinkedList;
import com.pdftron.pdf.Rect;
import lombok.Data;
import lombok.SneakyThrows;
@Data
public class ClippingPathStack {
private Deque<Area> stack = new ArrayDeque<>();
private Deque<Area> stack = new LinkedList<>();
@SneakyThrows
public ClippingPathStack(Rect rectangle) {
stack.push(new Area(new Rectangle2D.Double(rectangle.getX1(), rectangle.getY1(), rectangle.getWidth(), rectangle.getHeight()).getBounds2D()));
}
/**
 * Reduces the clipping area on top of the stack to its intersection with the given path.
 *
 * @param path path to intersect the current clipping area with
 */
@SneakyThrows
public void intersectClippingPath(GeneralPath path) {
    Area activeClip = getCurrentClippingPath();
    Area pathArea = new Area(path);
    activeClip.intersect(pathArea);
}
/**
 * Checks whether the given rectangle, enlarged by {@code TOLERANCE} on every side, intersects the current clipping path.
 *
 * @param x      x coordinate of the rectangle's origin
 * @param y      y coordinate of the rectangle's origin
 * @param width  width of the rectangle
 * @param height height of the rectangle
 * @return {@code true} if the enlarged rectangle intersects the clipping area on top of the stack
 */
public boolean almostIntersects(double x, double y, double width, double height) {
    // To address inconsistencies in the calculation of the bounding box we slightly increase the rectangle.
    // Height or width are zero for straight lines, even though they are being rendered. Therefore, height or width must be at minimum >0.
    // The origin always moves by -TOLERANCE, independent of its sign: adding the tolerance for non-positive
    // coordinates would shift the rectangle instead of growing it symmetrically.
    double xWithTolerance = x - TOLERANCE;
    double yWithTolerance = y - TOLERANCE;
    double widthWithTolerance = width + (2 * TOLERANCE);
    double heightWithTolerance = height + (2 * TOLERANCE);
    return getCurrentClippingPath().intersects(xWithTolerance, yWithTolerance, widthWithTolerance, heightWithTolerance);
}
/**
 * Returns the clipping area on top of the stack without removing it.
 *
 * @return the current clipping area, or {@code null} if the stack is empty
 */
public Area getCurrentClippingPath() {
    return stack.peekFirst();
}
/**
 * Pushes a copy of the current clipping area, so that changes made in the new
 * graphics state do not affect the area that was active before.
 */
public void enterNewGState() {
    Area active = stack.peekFirst();
    Area snapshot = new Area();
    snapshot.add(active);
    stack.addFirst(snapshot);
}
/**
 * Discards the clipping area on top of the stack, making the previously pushed one current again.
 */
public void leaveGState() {
    stack.removeFirst();
}

View File

@ -1,148 +1,170 @@
package com.iqser.red.service.ocr.v1.server.model;
import static com.iqser.red.service.ocr.v1.server.service.InvisibleElementRemovalService.TOLERANCE;
import java.awt.geom.Rectangle2D;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.Rect;
import lombok.*;
import lombok.AccessLevel;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.experimental.SuperBuilder;
import java.awt.geom.Rectangle2D;
@Data
@Getter
@SuperBuilder
@NoArgsConstructor
@AllArgsConstructor
public abstract class ElementFeatures {
private int elementType;
private Rectangle2D boundingBox;
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class ElementFeatures {
int elementType;
Rectangle2D boundingBox;
public boolean almostMatches(Element element) throws PDFNetException {
if (element.getType() != elementType) return false;
if (element.getBBox() == null) return false;
return rectsAlmostMatch(element.getBBox());
return element.getType() == elementType && //
element.getBBox() != null && //
rectsAlmostMatch(element.getBBox());
}
protected boolean almostEqual(double a, double b) {
double tolerance = 1e-3;
return Math.abs(a - b) < tolerance;
return Math.abs(a - b) < TOLERANCE;
}
@SneakyThrows
private boolean rectsAlmostMatch(Rect bBox) {
if (!almostEqual(bBox.getX1(), boundingBox.getX())) return false;
if (!almostEqual(bBox.getY1(), boundingBox.getY())) return false;
if (!almostEqual(bBox.getWidth(), boundingBox.getWidth())) return false;
return almostEqual(bBox.getHeight(), boundingBox.getHeight());
// To address the inconsistencies in the calculation of the bounding box we check equality with a tolerance
return almostEqual(bBox.getX1(), boundingBox.getX()) && //
almostEqual(bBox.getY1(), boundingBox.getY()) && //
almostEqual(bBox.getWidth(), boundingBox.getWidth()) && //
almostEqual(bBox.getHeight(), boundingBox.getHeight());
}
@EqualsAndHashCode(callSuper = true)
@Data
@Getter
@SuperBuilder
@NoArgsConstructor
@AllArgsConstructor
public static class Text extends ElementFeatures {
private String text;
private int font;
private double fontsize;
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
private static class Text extends ElementFeatures {
String text;
int font;
double fontsize;
@Override
public boolean almostMatches(Element element) throws PDFNetException {
if (!super.almostMatches(element)) return false;
if (!text.equals(element.getTextString())) return false;
if (font != element.getGState().getFont().getType()) return false;
return almostEqual(fontsize, element.getGState().getFontSize());
return super.almostMatches(element) && //
text.equals(element.getTextString()) && //
font == element.getGState().getFont().getType() && //
almostEqual(fontsize, element.getGState().getFontSize());
}
}
@EqualsAndHashCode(callSuper = true)
@Data
@Getter
@SuperBuilder
@NoArgsConstructor
@AllArgsConstructor
public static class Path extends ElementFeatures {
private boolean isClippingPath;
private boolean isClipWindingFill;
private boolean isStroked;
private boolean isFilled;
private boolean isWindingFill;
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
private static class Path extends ElementFeatures {
boolean isClippingPath;
boolean isClipWindingFill;
boolean isStroked;
boolean isFilled;
boolean isWindingFill;
@Override
public boolean almostMatches(Element element) throws PDFNetException {
if (!super.almostMatches(element)) return false;
if (isClippingPath != element.isClippingPath()) return false;
if (isClipWindingFill != element.isClipWindingFill()) return false;
if (isStroked != element.isStroked()) return false;
if (isFilled != element.isFilled()) return false;
if (isWindingFill != element.isWindingFill()) return false;
return true;
return super.almostMatches(element) && //
isClippingPath == element.isClippingPath() && //
isClipWindingFill == element.isClipWindingFill() && //
isStroked == element.isStroked() && //
isFilled == element.isFilled() && //
isWindingFill == element.isWindingFill();
}
}
@EqualsAndHashCode(callSuper = true)
@Data
@Getter
@SuperBuilder
@NoArgsConstructor
public static class Image extends ElementFeatures {
private int dataSize;
private int height;
private int width;
private int renderingIntent;
private int componentNum;
private int bitsPerComponent;
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
private static class Image extends ElementFeatures {
int dataSize;
int height;
int width;
int renderingIntent;
int componentNum;
int bitsPerComponent;
@Override
public boolean almostMatches(Element element) throws PDFNetException {
if (!super.almostMatches(element)) return false;
if (dataSize != element.getImageDataSize()) return false;
if (height != element.getImageHeight()) return false;
if (width != element.getImageWidth()) return false;
if (renderingIntent != element.getImageRenderingIntent()) return false;
if (componentNum != element.getComponentNum()) return false;
if (bitsPerComponent != element.getBitsPerComponent()) return false;
return true;
return super.almostMatches(element) && //
dataSize == element.getImageDataSize() && //
height == element.getImageHeight() && //
width == element.getImageWidth() && //
renderingIntent == element.getImageRenderingIntent() && //
componentNum == element.getComponentNum() && //
bitsPerComponent == element.getBitsPerComponent();
}
}
public static ElementFeatures extractFeatures(Element element) throws PDFNetException {
switch (element.getType()) {
case Element.e_path:
return ElementFeatures.Path.builder()
.elementType(element.getType())
.boundingBox(toRectangle2D(element.getBBox()))
.isClippingPath(element.isClippingPath())
.isClipWindingFill(element.isClipWindingFill())
.isStroked(element.isStroked())
.isFilled(element.isFilled())
.isWindingFill(element.isWindingFill())
.build();
case Element.e_text:
return ElementFeatures.Text.builder()
.elementType(element.getType())
.boundingBox(toRectangle2D(element.getBBox()))
.text(element.getTextString())
.font(element.getGState().getFont().getType())
.fontsize(element.getGState().getFontSize())
.build();
case Element.e_image:
case Element.e_inline_image:
return Image.builder()
.elementType(element.getType())
.boundingBox(toRectangle2D(element.getBBox()))
.dataSize(element.getImageDataSize())
.height(element.getImageHeight())
.width(element.getImageWidth())
.renderingIntent(element.getImageRenderingIntent())
.componentNum(element.getComponentNum())
.bitsPerComponent(element.getBitsPerComponent())
.build();
default:
throw new UnsupportedOperationException("Feature Extraction is not supported for PDFTron.Element with type: " + element.getType());
}
return switch (element.getType()) {
case Element.e_path -> Path.builder()
.elementType(element.getType())
.boundingBox(toRectangle2D(element.getBBox()))
.isClippingPath(element.isClippingPath())
.isClipWindingFill(element.isClipWindingFill())
.isStroked(element.isStroked())
.isFilled(element.isFilled())
.isWindingFill(element.isWindingFill())
.build();
case Element.e_text -> Text.builder()
.elementType(element.getType())
.boundingBox(toRectangle2D(element.getBBox()))
.text(element.getTextString())
.font(element.getGState().getFont().getType())
.fontsize(element.getGState().getFontSize())
.build();
case Element.e_image, Element.e_inline_image -> Image.builder()
.elementType(element.getType())
.boundingBox(toRectangle2D(element.getBBox()))
.dataSize(element.getImageDataSize())
.height(element.getImageHeight())
.width(element.getImageWidth())
.renderingIntent(element.getImageRenderingIntent())
.componentNum(element.getComponentNum())
.bitsPerComponent(element.getBitsPerComponent())
.build();
// This technically should never happen, it's a safetynet
default -> throw new RuntimeException("Feature Extraction is not supported for PDFTron.Element with type: " + element.getType());
};
}
private static Rectangle2D toRectangle2D(Rect rect) throws PDFNetException {
return new Rectangle2D.Double(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
}
}

View File

@ -0,0 +1,25 @@
package com.iqser.red.service.ocr.v1.server.model;
import java.util.List;
import java.util.Set;
import com.pdftron.pdf.ElementReader;
import lombok.AccessLevel;
import lombok.Builder;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * Bundles the state that is handed through the element processing of one page.
 * All fields are private and final (see {@code @FieldDefaults}); the referenced collections themselves stay mutable.
 */
@Data
@Builder
@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
public class InvisibleElementRemovalDto {
// if set, only the removed elements are written to the output
boolean delta;
// reader used to iterate over the elements of the content stream
ElementReader reader;
// clipping areas of the currently open graphics states
ClippingPathStack clippingPathStack;
// features of elements that were found to be overlapped
List<ElementFeatures> overlappedElements;
// features of elements that are considered visible so far
List<ElementFeatures> visibleElements;
// object numbers of the page / form XObjects that have already been visited
Set<Long> visitedXObjIds;
}

View File

@ -0,0 +1,419 @@
package com.iqser.red.service.ocr.v1.server.service;
import java.awt.Shape;
import java.awt.geom.AffineTransform;
import java.awt.geom.GeneralPath;
import java.awt.geom.Rectangle2D;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import org.springframework.stereotype.Service;
import com.google.common.primitives.Bytes;
import com.google.common.primitives.Doubles;
import com.iqser.red.service.ocr.v1.server.model.ClippingPathStack;
import com.iqser.red.service.ocr.v1.server.model.ElementFeatures;
import com.iqser.red.service.ocr.v1.server.model.InvisibleElementRemovalDto;
import com.pdftron.common.Matrix2D;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.ColorPt;
import com.pdftron.pdf.ColorSpace;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementBuilder;
import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.GState;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
import com.pdftron.pdf.PathData;
import com.pdftron.pdf.Rect;
import com.pdftron.sdf.Obj;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
public class InvisibleElementRemovalService {
/** Absolute tolerance applied when bounding boxes are compared, enlarged or shrunk. */
static public final double TOLERANCE = 1e-3;
/**
 * Removes all hidden Text, Path and Image Elements from a PDF Document.
 * <p>
 * Handled cases:
 * <ul>
 * <li>Text which is transparent or is set to not render</li>
 * <li>Elements outside of clipping path</li>
 * <li>Elements that have been painted over by visible and filled Paths</li>
 * </ul>
 * Unhandled cases:
 * <ul>
 * <li>Elements covered by widely stroked path</li>
 * <li>Elements with the same color as background</li>
 * <li>Any Text set to clipping with its many interactions with other elements</li>
 * </ul>
 *
 * @param pdfFile The PDF file to process. The stream is not closed by this method.
 * @param delta   If this flag is set only the removed Elements will be written to the output file.
 *                The Elements are red if they are removed by clipping path, blue for transparency, and a green bounding box for overlap.
 * @return The resulting PDF File as bytes.
 **/
@SneakyThrows
public byte[] removeInvisibleElements(InputStream pdfFile, boolean delta) {

    PDFDoc pdfDoc = new PDFDoc(pdfFile);
    ElementWriter writer = new ElementWriter();
    ElementReader reader = new ElementReader();
    try {
        Set<Long> visitedXObjIds = new TreeSet<>();

        for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {

            Page page = iterator.next();

            visitedXObjIds.add(page.getSDFObj().getObjNum());
            InvisibleElementRemovalDto dto = InvisibleElementRemovalDto.builder()
                    .reader(reader)
                    .clippingPathStack(new ClippingPathStack(page.getMediaBox()))
                    .delta(delta)
                    .overlappedElements(new ArrayList<>())
                    .visibleElements(new ArrayList<>())
                    .visitedXObjIds(visitedXObjIds)
                    .build();

            removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, dto);

            // the second pass has to descend into the same form XObjects again
            dto.getVisitedXObjIds().clear();

            removeOverlappedElements(page, writer, dto);
        }

        return pdfDoc.save(SDFDoc.SaveMode.REMOVE_UNUSED, null);
    } finally {
        // PDFTron objects wrap native resources; release them explicitly instead of waiting for finalization.
        reader.destroy();
        writer.destroy();
        pdfDoc.close();
    }
}
/**
 * First pass over a page: replaces the page content with the output of {@code processElements}.
 *
 * @param page   page to process
 * @param writer writer used to replace the page content
 * @param dto    processing state of the page
 * @throws PDFNetException if PDFTron fails to read or write the page
 */
private void removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(Page page, ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException {

    ElementReader pageReader = dto.getReader();

    pageReader.begin(page);
    writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());

    processElements(writer, dto);

    writer.end();
    pageReader.end();
}
/**
 * Reads all elements of the currently opened content stream and dispatches them by element type.
 * Group begin / end elements additionally push / pop an entry on the clipping path stack.
 *
 * @param writer writer of the content stream that is being rewritten
 * @param dto    processing state of the page
 * @throws PDFNetException if PDFTron fails to read or write an element
 */
private void processElements(ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException {

    Element current;
    while ((current = dto.getReader().next()) != null) {
        switch (current.getType()) {
            case Element.e_image, Element.e_inline_image -> processImages(current, writer, dto);
            case Element.e_text -> processText(current, writer, dto);
            case Element.e_path -> processPath(current, writer, dto);
            case Element.e_form -> processForm(current, writer, dto);
            case Element.e_group_begin -> {
                dto.getClippingPathStack().enterNewGState();
                writer.writeElement(current);
            }
            case Element.e_group_end -> {
                dto.getClippingPathStack().leaveGState();
                writer.writeElement(current);
            }
            default -> writer.writeElement(current);
        }
    }
}
/**
 * Handles an image or inline image element.
 * Outside of delta mode the image is written (and remembered as visible) only if it intersects the clipping path;
 * in delta mode only images outside of the clipping path are written.
 * NOTE(review): an image without a bounding box is never written, in either mode - confirm this is intended.
 *
 * @param imageElement image element to process
 * @param writer       writer of the content stream that is being rewritten
 * @param dto          processing state of the page
 * @throws PDFNetException if PDFTron fails to read or write the element
 */
private void processImages(Element imageElement, ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException {

    Rect bBox = imageElement.getBBox();
    if (bBox == null) {
        return;
    }

    boolean insideClip = dto.getClippingPathStack().almostIntersects(bBox.getX1(), bBox.getY1(), bBox.getWidth(), bBox.getHeight());
    boolean deltaMode = dto.isDelta();

    if (insideClip && !deltaMode) {
        dto.getVisibleElements().add(ElementFeatures.extractFeatures(imageElement));
    }

    // write when exactly one of the two flags is set
    if (deltaMode != insideClip) {
        writer.writeElement(imageElement);
    }
}
/**
 * Handles a text element.
 * Outside of delta mode the text is written only if it intersects the clipping path and is rendered visibly.
 * In delta mode only text that fails one of these checks is written, recolored to mark the reason.
 *
 * @param textElement text element to process
 * @param writer      writer of the content stream that is being rewritten
 * @param dto         processing state of the page
 * @throws PDFNetException if PDFTron fails to read or write the element
 */
private void processText(Element textElement, ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException {
Rect rect = textElement.getBBox();
// without a bounding box no visibility check is possible, the element is passed through unchanged
if (rect == null) {
writer.writeElement(textElement);
return;
}
GState gState = textElement.getGState();
boolean inClippingPath = dto.getClippingPathStack().almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
boolean isTextVisible = isTextRenderedVisibly(gState);
// visible text is remembered (also in delta mode) so that a later filled path can mark it as overlapped
if (inClippingPath && isTextVisible) {
dto.getVisibleElements().add(ElementFeatures.extractFeatures(textElement));
}
if (!dto.isDelta()) {
if (inClippingPath && isTextVisible) {
writer.writeElement(textElement);
} else if (textElement.hasTextMatrix()) {
/*
PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element.
hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands.
Therefore, the position of a following Tj is affected by not writing the first Element.
This is why, we write only the Tm command:
*/
writer.writeGStateChanges(textElement);
}
} else {
// delta mode: text removed because of the clipping path is written in red
if (!inClippingPath) {
gState.setFillColorSpace(ColorSpace.createDeviceRGB());
gState.setFillColor(new ColorPt(1, 0, 0));
writer.writeElement(textElement);
}
// delta mode: text removed because it is not rendered visibly is written in blue, filled and fully opaque
// NOTE(review): an element failing both checks is written twice (red, then blue) - confirm this is intended
if (!isTextVisible) {
gState.setFillColorSpace(ColorSpace.createDeviceRGB());
gState.setFillColor(new ColorPt(0, 0, 1));
gState.setTextRenderMode(GState.e_fill_text);
gState.setFillOpacity(1);
writer.writeElement(textElement);
}
}
}
/**
 * Writes the form element and, if its XObject has not been visited yet, processes the elements inside the form.
 *
 * @param formElement form element to process
 * @param writer      writer of the enclosing content stream
 * @param dto         processing state of the page
 * @throws PDFNetException if PDFTron fails to read or write an element
 */
private void processForm(Element formElement, ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException {

    writer.writeElement(formElement);
    Obj formObj = formElement.getXObject();

    // add() reports whether the id was new, which replaces the separate contains() check
    boolean firstVisit = dto.getVisitedXObjIds().add(formObj.getObjNum());
    if (!firstVisit) {
        return;
    }

    // writer needs to be newly initialized when entering a new content stream
    // see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
    ElementWriter formWriter = new ElementWriter();
    ElementReader reader = dto.getReader();

    reader.formBegin();
    formWriter.begin(formObj);

    reader.clearChangeList();
    formWriter.setDefaultGState(reader);

    processElements(formWriter, dto);

    formWriter.end();
    reader.end();
}
/**
 * Handles a path element.
 * A clipping path narrows the current clipping area and is always written.
 * Any other path is written if it intersects the clipping area (or, in delta mode, if it does not);
 * a filled, fully opaque path additionally moves every visible element whose bounding box it contains to the overlapped elements.
 *
 * @param pathElement path element to process
 * @param writer      writer of the content stream that is being rewritten
 * @param dto         processing state of the page
 * @throws PDFNetException if PDFTron fails to read or write the element
 */
private void processPath(Element pathElement, ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException {
GeneralPath linePath = convertToGeneralPath(pathElement.getPathData());
//transform path to initial user space
var ctm = pathElement.getCTM();
var affineTransform = getAffineTransform(ctm);
linePath.transform(affineTransform);
var rect = linePath.getBounds2D();
// evaluated against the clipping area as it is before this path is (possibly) intersected with it
boolean inClippingPath = dto.getClippingPathStack().almostIntersects(rect.getX(), rect.getY(), rect.getWidth(), rect.getHeight());
if (pathElement.isClippingPath()) {
// the winding rule decides which regions of a self-intersecting path belong to the clip
if (pathElement.isClipWindingFill()) {
linePath.setWindingRule(GeneralPath.WIND_NON_ZERO);
} else {
linePath.setWindingRule(GeneralPath.WIND_EVEN_ODD);
}
dto.getClippingPathStack().intersectClippingPath(linePath);
// in delta mode the clip flag is switched off - presumably so that the clipped-away elements stay visible in the delta output
pathElement.setPathClip(!dto.isDelta());
writer.writeElement(pathElement);
} else {
if (inClippingPath) {
// TODO: WINDING RULE
if (isFilledAndNonTransparent(pathElement)) {
// everything drawn earlier whose bounding box lies inside this path is painted over by it
List<ElementFeatures> currentOverlappedElements = dto.getVisibleElements()
.stream()
.filter(features -> almostContains(linePath, features.getBoundingBox()))
.toList();
dto.getOverlappedElements().addAll(currentOverlappedElements);
dto.getVisibleElements().removeAll(currentOverlappedElements);
}
// the path itself is visible and may be painted over by a later path
dto.getVisibleElements().add(ElementFeatures.extractFeatures(pathElement));
if (!dto.isDelta()) {
writer.writeElement(pathElement);
}
}
// delta mode: paths outside of the clipping area are written with red fill and stroke color
if (dto.isDelta() && !inClippingPath) {
pathElement.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
pathElement.getGState().setFillColor(new ColorPt(1, 0, 0));
pathElement.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
pathElement.getGState().setStrokeColor(new ColorPt(1, 0, 0));
writer.writeElement(pathElement);
}
}
}
/**
 * Converts a PDFTron matrix into the equivalent {@link AffineTransform}.
 *
 * @param ctm matrix to convert
 * @return transform with the same six coefficients
 * @throws PDFNetException if PDFTron fails to read the matrix
 */
private static AffineTransform getAffineTransform(Matrix2D ctm) throws PDFNetException {

    // AffineTransform expects its coefficients in the order m00, m10, m01, m11, m02, m12
    double m00 = ctm.getA();
    double m10 = ctm.getB();
    double m01 = ctm.getC();
    double m11 = ctm.getD();
    double m02 = ctm.getH();
    double m12 = ctm.getV();
    return new AffineTransform(m00, m10, m01, m11, m02, m12);
}
/**
 * Second pass over a page: rewrites the page content without the elements remembered as overlapped.
 * In delta mode a green bounding box is drawn for every overlapped element instead and nothing is removed.
 *
 * @param page   page to process
 * @param writer writer used to replace the page content
 * @param dto    processing state of the page, holding the overlapped elements found in the first pass
 * @throws PDFNetException if PDFTron fails to read or write the page
 */
private void removeOverlappedElements(Page page, ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException {

    dto.getReader().begin(page);
    writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
    if (dto.isDelta()) {
        dto.getOverlappedElements().forEach(feature -> drawBBox(writer, feature.getBoundingBox(), "#00FF00"));
        // with an empty list the following pass writes every element unchanged
        dto.getOverlappedElements().clear();
    }
    processOverlappedElements(writer, dto);
    writer.end();
    dto.getReader().end();

    if (!dto.getOverlappedElements().isEmpty()) {
        // parameterized logging: the message is only assembled if the warn level is enabled
        log.warn("{} overlapped elements have not been found or removed", dto.getOverlappedElements().size());
    }
}
/**
 * Reads all elements of the currently opened content stream and writes every element that does not match
 * one of the remembered overlapped elements. Each remembered entry removes at most one element.
 *
 * @param writer writer of the content stream that is being rewritten
 * @param dto    processing state of the page
 * @throws PDFNetException if PDFTron fails to read or write an element
 */
private void processOverlappedElements(ElementWriter writer, InvisibleElementRemovalDto dto) throws PDFNetException {

    for (Element element = dto.getReader().next(); element != null; element = dto.getReader().next()) {
        switch (element.getType()) {
            case Element.e_form -> processFormOverlappedElements(writer, element, dto);
            case Element.e_path, Element.e_image, Element.e_inline_image, Element.e_text -> {
                boolean overlapped = removeFirstMatchingFeatures(dto.getOverlappedElements(), element);
                if (!overlapped) {
                    writer.writeElement(element);
                } else if (element.getType() == Element.e_text && element.hasTextMatrix()) {
                    /*
                    PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element.
                    hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands.
                    Therefore, the position of a following Tj is affected by not writing the first Element.
                    This is why, we write only the Tm command:
                    */
                    writer.writeGStateChanges(element);
                }
            }
            default -> writer.writeElement(element);
        }
    }
}


/**
 * Removes the first entry of {@code candidates} that almost matches the element.
 *
 * @return {@code true} if an entry matched and was removed
 */
private boolean removeFirstMatchingFeatures(List<ElementFeatures> candidates, Element element) throws PDFNetException {

    // explicit iterator: removal during iteration stays valid even if the loop is changed to continue afterwards
    for (Iterator<ElementFeatures> iterator = candidates.iterator(); iterator.hasNext(); ) {
        if (iterator.next().almostMatches(element)) {
            iterator.remove();
            return true;
        }
    }
    return false;
}
/**
 * Writes the form element and, if its XObject has not been visited yet, removes the overlapped elements inside the form.
 *
 * @param writer      writer of the enclosing content stream
 * @param formElement form element to process
 * @param dto         processing state of the page
 * @throws PDFNetException if PDFTron fails to read or write an element
 */
private void processFormOverlappedElements(ElementWriter writer, Element formElement, InvisibleElementRemovalDto dto) throws PDFNetException {

    writer.writeElement(formElement);
    Obj formObj = formElement.getXObject();

    // add() reports whether the id was new, which replaces the separate contains() check
    boolean firstVisit = dto.getVisitedXObjIds().add(formObj.getObjNum());
    if (!firstVisit) {
        return;
    }

    // writer needs to be newly initialized when entering a new content stream
    // see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
    ElementWriter formWriter = new ElementWriter();
    ElementReader reader = dto.getReader();

    reader.formBegin();
    formWriter.begin(formObj);

    reader.clearChangeList();
    formWriter.setDefaultGState(reader);

    processOverlappedElements(formWriter, dto);

    formWriter.end();
    reader.end();
}
/**
 * Decides from the graphics state whether text is rendered visibly.
 * Text counts as invisible if the render mode is invisible, or if every opacity used by a fill, stroke or
 * fill-and-stroke render mode is zero. All other render modes count as visible.
 *
 * @param gState graphics state of the text element
 * @return {@code true} if the text is considered visible
 * @throws PDFNetException if PDFTron fails to read the graphics state
 */
private boolean isTextRenderedVisibly(GState gState) throws PDFNetException {

    int renderMode = gState.getTextRenderMode();

    if (renderMode == GState.e_invisible_text) {
        return false;
    }
    if (renderMode == GState.e_fill_text) {
        return gState.getFillOpacity() != 0;
    }
    if (renderMode == GState.e_stroke_text) {
        return gState.getStrokeOpacity() != 0;
    }
    if (renderMode == GState.e_fill_stroke_text) {
        return gState.getFillOpacity() != 0 || gState.getStrokeOpacity() != 0;
    }
    return true;
}
/**
 * Converts PDFTron path data into a {@link GeneralPath}.
 * The operators are replayed in order, each one consuming its coordinates from the shared point list.
 *
 * @param pathData operators and points of a path element
 * @return the path, in the coordinate space of the path data
 * @throws PDFNetException if the path data contains an operator that is not handled here
 */
private GeneralPath convertToGeneralPath(PathData pathData) throws PDFNetException {
GeneralPath linePath = new GeneralPath();
// single iterator over all points; every operator takes the next 0, 2, 4 or 6 values from it
Iterator<Double> points = Doubles.asList(pathData.getPoints()).iterator();
Iterable<Byte> operators = Bytes.asList(pathData.getOperators());
for (var operator : operators) {
switch (operator) {
case PathData.e_moveto -> linePath.moveTo(points.next(), points.next());
case PathData.e_lineto -> linePath.lineTo(points.next(), points.next());
case PathData.e_cubicto -> linePath.curveTo(points.next(), points.next(), points.next(), points.next(), points.next(), points.next());
case PathData.e_closepath -> linePath.closePath();
case PathData.e_rect -> {
// x, y, width, height: drawn as a closed sub path with four corners
double x = points.next();
double y = points.next();
double w = points.next();
double h = points.next();
linePath.moveTo(x, y);
linePath.lineTo(x + w, y);
linePath.lineTo(x + w, y + h);
linePath.lineTo(x, y + h);
linePath.closePath();
}
// NOTE(review): any further PathData operator ends up here - confirm that none can occur in practice
default -> throw new PDFNetException("Invalid Element Type", 0, "", "", "");
}
}
return linePath;
}
/**
 * Checks whether the shape contains the rectangle after the rectangle has been shrunk by {@code TOLERANCE} on every side.
 *
 * @param outer shape that possibly covers the rectangle
 * @param inner rectangle to test
 * @return {@code true} if the shrunk rectangle lies entirely inside the shape
 */
private boolean almostContains(Shape outer, Rectangle2D inner) {
    //To address inconsistencies in the calculation of the bounding box we slightly shrink the inner rectangle
    // The origin always moves by +TOLERANCE, independent of its sign: subtracting the tolerance for negative
    // coordinates would push the left / bottom edge outside of the original rectangle.
    double xWithTolerance = inner.getX() + TOLERANCE;
    double yWithTolerance = inner.getY() + TOLERANCE;
    double widthWithTolerance = inner.getWidth() - (2 * TOLERANCE);
    double heightWithTolerance = inner.getHeight() - (2 * TOLERANCE);
    Rectangle2D innerRect = new Rectangle2D.Double(xWithTolerance, yWithTolerance, widthWithTolerance, heightWithTolerance);
    return outer.contains(innerRect);
}
/**
 * Checks whether the element is filled with a fill opacity of exactly 1.
 *
 * @param element element to inspect
 * @return {@code true} if the element is filled and fully opaque
 * @throws PDFNetException if PDFTron fails to read the element
 */
private boolean isFilledAndNonTransparent(Element element) throws PDFNetException {

    if (!element.isFilled()) {
        return false;
    }
    return element.getGState().getFillOpacity() == 1;
}
/**
 * Writes a stroked rectangle with the given color.
 *
 * @param writer   writer the rectangle is written to
 * @param r        position and size of the rectangle
 * @param hexcolor stroke color in the form {@code #RRGGBB}
 */
@SneakyThrows
private void drawBBox(ElementWriter writer, Rectangle2D r, String hexcolor) {

    // split "#RRGGBB" into its three channels and scale each to the range 0..1
    double red = Integer.parseInt(hexcolor.substring(1, 3), 16) / 255d;
    double green = Integer.parseInt(hexcolor.substring(3, 5), 16) / 255d;
    double blue = Integer.parseInt(hexcolor.substring(5, 7), 16) / 255d;
    ColorPt strokeColor = new ColorPt(red, green, blue);

    ElementBuilder builder = new ElementBuilder();
    Element box = builder.createRect(r.getX(), r.getY(), r.getWidth(), r.getHeight());
    box.setPathStroke(true);

    GState boxState = box.getGState();
    boxState.setStrokeColorSpace(ColorSpace.createDeviceRGB());
    boxState.setStrokeColor(strokeColor);

    writer.writePlacedElement(box);
}
}

View File

@ -1,448 +0,0 @@
package com.iqser.red.service.ocr.v1.server.service;
import com.iqser.red.service.ocr.v1.server.model.ClippingPathStack;
import com.iqser.red.service.ocr.v1.server.model.ElementFeatures;
import com.pdftron.common.Matrix2D;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.*;
import com.pdftron.sdf.Obj;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import java.awt.*;
import java.awt.geom.AffineTransform;
import java.awt.geom.GeneralPath;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.stream.Collectors;
@Slf4j
@Service
public class InvisibleElementService {
/*
handled cases:
Text which is transparent or is set to not render
Text or Path or Images outside of clipping path
Text or Path or Images that have been painted over by visible and filled Paths
unhandled cases:
Text covered by widely stroked path
Text same color as background
Any Text set to clipping with its many interactions with other elements
*/
@SneakyThrows
public byte[] removeInvisibleElements(byte[] pdfFile, boolean delta) {
PDFDoc pdfDoc = new PDFDoc(pdfFile);
ElementWriter writer = new ElementWriter();
ElementReader reader = new ElementReader();
Set<Integer> visited = new TreeSet<>();
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
Page page = iterator.next();
List<ElementFeatures> overlappedElements = removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, reader, writer, visited, delta);
visited.clear();
removeOverlappedElements(page, reader, writer, visited, overlappedElements, delta);
}
return pdfDoc.save(SDFDoc.SaveMode.REMOVE_UNUSED, null);
}
private List<ElementFeatures> removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(Page page, ElementReader reader, ElementWriter writer, Set<Integer> visited, boolean delta)
throws PDFNetException {
var overlappedElements = new ArrayList<ElementFeatures>();
var visibleElements = new ArrayList<ElementFeatures>();
ClippingPathStack clippingPathStack = new ClippingPathStack(page.getMediaBox());
visited.add((int) page.getSDFObj().getObjNum());
reader.begin(page);
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
processElements(reader, writer, visited, clippingPathStack, delta, overlappedElements, visibleElements);
writer.end();
reader.end();
return overlappedElements;
}
private void processElements(ElementReader reader, ElementWriter writer, Set<Integer> visited, ClippingPathStack clippingPathStack, Boolean delta,
List<ElementFeatures> coveredElements, List<ElementFeatures> visibleElements)
throws PDFNetException {
for (Element element = reader.next(); element != null; element = reader.next())
switch (element.getType()) {
case Element.e_image:
case Element.e_inline_image:
processImages(element, writer, clippingPathStack, delta, visibleElements);
break;
case Element.e_text:
processText(element, writer, clippingPathStack, delta, visibleElements);
break;
case Element.e_path:
processPath(element, writer, clippingPathStack, delta, coveredElements, visibleElements);
break;
case Element.e_form:
processForm(reader, writer, element, visited, clippingPathStack, delta, coveredElements, visibleElements);
break;
case Element.e_group_begin:
clippingPathStack.enterNewGState();
writer.writeElement(element);
break;
case Element.e_group_end:
clippingPathStack.leaveGState();
writer.writeElement(element);
break;
default:
writer.writeElement(element);
}
}
private void processImages(Element imageElement, ElementWriter writer, ClippingPathStack clippingPathStack, Boolean delta, List<ElementFeatures> visibleElements)
throws PDFNetException {
Rect rect = imageElement.getBBox();
if (rect == null) {
return;
}
boolean inClippingPath = clippingPathStack.almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
if (!delta && inClippingPath) {
visibleElements.add(ElementFeatures.extractFeatures(imageElement));
writer.writeElement(imageElement);
}
if (delta && !inClippingPath) {
writer.writeElement(imageElement);
}
}
private void processText(Element textElement, ElementWriter writer, ClippingPathStack clippingPathStack,
Boolean delta, List<ElementFeatures> visibleElements)
throws PDFNetException {
Rect rect = textElement.getBBox();
if (rect == null) {
writer.writeElement(textElement);
return;
}
GState gState = textElement.getGState();
boolean inClippingPath = clippingPathStack.almostIntersects(rect.getX1(), rect.getY1(), rect.getWidth(), rect.getHeight());
boolean isTextVisible = isTextRenderedVisibly(gState);
if (inClippingPath && isTextVisible) {
visibleElements.add(ElementFeatures.extractFeatures(textElement));
}
if (!delta) {
if (inClippingPath && isTextVisible) {
writer.writeElement(textElement);
} else if (textElement.hasTextMatrix()) {
/*
PDFTron Element with type "text" refers to a Tj command. If a Tm command is just above it in the pdf file, PDFTron will join the two commands and treat them as one Element.
hasTextMatrix() checks for this case specifically. Also, Tm changes the position for a whole BT/ET segment, possibly containing multiple Tj commands.
Therefore, the position of a following Tj is affected by not writing the first Element.
This is why, we write only the Tm command:
*/
writer.writeGStateChanges(textElement);
}
} else {
if (!inClippingPath) {
gState.setFillColorSpace(ColorSpace.createDeviceRGB());
gState.setFillColor(new ColorPt(1, 0, 0));
writer.writeElement(textElement);
}
if (!isTextVisible) {
gState.setFillColorSpace(ColorSpace.createDeviceRGB());
gState.setFillColor(new ColorPt(0, 0, 1));
gState.setTextRenderMode(GState.e_fill_text);
gState.setFillOpacity(1);
writer.writeElement(textElement);
}
}
}
/**
 * Writes a form XObject reference and, the first time the form is seen, filters the form's
 * own content stream through {@code processElements}.
 *
 * @param reader            the reader positioned on the form element
 * @param writer            the writer of the enclosing content stream
 * @param element           the form element
 * @param visited           object numbers of forms that were already processed
 * @param clippingPathStack the stack holding the currently active clipping path
 * @param delta             passed through to {@code processElements}
 * @param coveredElements   passed through to {@code processElements}
 * @param allElements       passed through to {@code processElements}
 * @throws PDFNetException if a PDFTron call fails
 */
private void processForm(ElementReader reader, ElementWriter writer, Element element, Set<Integer> visited, ClippingPathStack clippingPathStack, Boolean delta,
        List<ElementFeatures> coveredElements, List<ElementFeatures> allElements)
        throws PDFNetException {
    writer.writeElement(element);

    Obj formObj = element.getXObject();
    int formObjNum = (int) formObj.getObjNum();
    if (!visited.add(formObjNum)) {
        // This form was already rewritten through an earlier reference.
        return;
    }

    ElementWriter formWriter = new ElementWriter();
    reader.formBegin();
    formWriter.begin(formObj);

    reader.clearChangeList();
    formWriter.setDefaultGState(reader);

    processElements(reader, formWriter, visited, clippingPathStack, delta, coveredElements, allElements);

    formWriter.end();
    reader.end();
}
/**
 * Handles a single path element while copying page content.
 * <p>
 * A clipping path narrows the current clipping area and is always written. Any other path is
 * kept when its transformed bounds almost intersect the clipping area; a kept path that is
 * filled and fully opaque moves every previously visible element it almost contains from
 * {@code visibleElements} to {@code overlappedElements}. In delta mode only the paths outside
 * the clipping area are written, recolored red.
 *
 * @param pathElement        the path element read from the page
 * @param writer             the writer receiving the filtered content
 * @param clippingPathStack  the stack holding the currently active clipping path
 * @param delta              {@code true} to write only the removed paths (highlighted)
 * @param overlappedElements collects elements that are covered by this path
 * @param visibleElements    elements considered visible so far; updated in place
 * @throws PDFNetException if a PDFTron call fails
 */
private void processPath(Element pathElement, ElementWriter writer, ClippingPathStack clippingPathStack, Boolean delta,
        List<ElementFeatures> overlappedElements, List<ElementFeatures> visibleElements)
        throws PDFNetException {
    GeneralPath shape = convertToGeneralPath(pathElement.getPathData());

    // Bring the path into the initial user space before comparing it with anything else.
    shape.transform(getAffineTransform(pathElement.getCTM()));

    var bounds = shape.getBounds2D();
    boolean insideClip = clippingPathStack.almostIntersects(bounds.getX(), bounds.getY(), bounds.getWidth(), bounds.getHeight());

    if (pathElement.isClippingPath()) {
        int windingRule = pathElement.isClipWindingFill() ? GeneralPath.WIND_NON_ZERO : GeneralPath.WIND_EVEN_ODD;
        shape.setWindingRule(windingRule);
        clippingPathStack.intersectClippingPath(shape);
        pathElement.setPathClip(!delta);
        writer.writeElement(pathElement);
        return;
    }

    if (insideClip) {
        if (isFilledAndNonTransparent(pathElement)) {
            List<ElementFeatures> nowCovered = visibleElements.stream()
                    .filter(candidate -> almostContains(shape, candidate.getBoundingBox()))
                    .collect(Collectors.toList());
            overlappedElements.addAll(nowCovered);
            visibleElements.removeAll(nowCovered);
        }
        visibleElements.add(ElementFeatures.extractFeatures(pathElement));
        if (!delta) {
            writer.writeElement(pathElement);
        }
    } else if (delta) {
        // Delta output: paint the path that would be removed in red (fill and stroke).
        GState graphicsState = pathElement.getGState();
        graphicsState.setFillColorSpace(ColorSpace.createDeviceRGB());
        graphicsState.setFillColor(new ColorPt(1, 0, 0));
        graphicsState.setStrokeColorSpace(ColorSpace.createDeviceRGB());
        graphicsState.setStrokeColor(new ColorPt(1, 0, 0));
        writer.writeElement(pathElement);
    }
}
/**
 * Converts a PDFTron matrix into the equivalent {@link AffineTransform}.
 *
 * @param ctm the matrix to convert
 * @return a transform built from the matrix components a, b, c, d, h and v, in that order
 * @throws PDFNetException if reading a matrix component fails
 */
private static AffineTransform getAffineTransform(Matrix2D ctm) throws PDFNetException {
    double a = ctm.getA();
    double b = ctm.getB();
    double c = ctm.getC();
    double d = ctm.getD();
    double h = ctm.getH();
    double v = ctm.getV();
    return new AffineTransform(a, b, c, d, h, v);
}
/**
 * Rewrites the content of a page without the given overlapped elements.
 * <p>
 * In delta mode the bounding box of every overlapped element is outlined in green and the list
 * is cleared, so the following pass copies the page content without removing anything.
 * Overlapped elements that are still in the list after the pass are reported with a warning.
 *
 * @param page               the page to rewrite
 * @param reader             the reader used to iterate the page content
 * @param writer             the writer replacing the page content
 * @param visited            object numbers of forms that were already processed
 * @param overlappedElements the elements to remove; matched entries are removed from this list
 * @param delta              {@code true} to only mark the overlapped elements
 * @throws PDFNetException if a PDFTron call fails
 */
private void removeOverlappedElements(Page page, ElementReader reader, ElementWriter writer, Set<Integer> visited, List<ElementFeatures> overlappedElements, boolean delta)
        throws PDFNetException {
    reader.begin(page);
    writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());

    if (delta) {
        for (ElementFeatures overlapped : overlappedElements) {
            drawBBox(writer, overlapped.getBoundingBox(), "#00FF00");
        }
        overlappedElements.clear();
    }

    processOverlappedElements(reader, writer, visited, overlappedElements, delta);

    writer.end();
    reader.end();

    int notFound = overlappedElements.size();
    if (notFound > 0) {
        log.warn(notFound + " overlapped elements have not been found and removed");
    }
}
/**
 * Copies all elements from the reader to the writer, dropping those that match an entry of
 * {@code coveredElements}.
 * <p>
 * Form elements are handled by {@code processFormOverlappedElements}. Path, image, inline
 * image and text elements are compared against the covered elements; the first matching entry
 * is removed from the list and the element is not written. Every other element type is copied
 * unchanged.
 *
 * @param reader          the reader to pull elements from
 * @param writer          the writer receiving the remaining elements
 * @param visited         object numbers of forms that were already processed
 * @param coveredElements the elements to drop; matched entries are removed from this list
 * @param delta           not evaluated here, only passed on to the form handling
 * @throws PDFNetException if a PDFTron call fails
 */
private void processOverlappedElements(ElementReader reader, ElementWriter writer, Set<Integer> visited, List<ElementFeatures> coveredElements, boolean delta)
        throws PDFNetException {
    for (Element element = reader.next(); element != null; element = reader.next()) {
        switch (element.getType()) {
            case Element.e_form:
                processFormOverlappedElements(reader, writer, element, visited, coveredElements, delta);
                break;
            case Element.e_path:
            case Element.e_image:
            case Element.e_inline_image:
            case Element.e_text:
                boolean anyMatch = false;
                for (ElementFeatures elementToRemove : coveredElements) {
                    if (elementToRemove.almostMatches(element)) {
                        // Removing during iteration is safe only because the loop is left immediately.
                        coveredElements.remove(elementToRemove);
                        anyMatch = true;
                        break;
                    }
                }
                if (!anyMatch) {
                    writer.writeElement(element);
                } else if (element.getType() == Element.e_text && element.hasTextMatrix()) {
                    /*
                    A PDFTron "text" Element corresponds to a Tj command. When a Tm command directly precedes it
                    in the PDF, PDFTron merges both into one Element; hasTextMatrix() detects exactly that case.
                    Tm positions the whole BT/ET segment, which may contain further Tj commands, so dropping the
                    merged Element would move the text that follows. Hence only the Tm command is written here:
                     */
                    writer.writeGStateChanges(element);
                }
                break;
            default:
                writer.writeElement(element);
        }
    }
}
/**
 * Writes a form XObject reference and, the first time the form is seen, removes the overlapped
 * elements from the form's own content stream via {@code processOverlappedElements}.
 *
 * @param reader           the reader positioned on the form element
 * @param writer           the writer of the enclosing content stream
 * @param element          the form element
 * @param visited          object numbers of forms that were already processed
 * @param elementsToRemove the elements to drop; matched entries are removed from this list
 * @param delta            passed through to {@code processOverlappedElements}
 * @throws PDFNetException if a PDFTron call fails
 */
private void processFormOverlappedElements(ElementReader reader, ElementWriter writer, Element element, Set<Integer> visited, List<ElementFeatures> elementsToRemove, boolean delta)
        throws PDFNetException {
    writer.writeElement(element);

    Obj formObj = element.getXObject();
    int formObjNum = (int) formObj.getObjNum();
    if (!visited.add(formObjNum)) {
        // This form was already rewritten through an earlier reference.
        return;
    }

    ElementWriter formWriter = new ElementWriter();
    reader.formBegin();
    formWriter.begin(formObj);

    reader.clearChangeList();
    formWriter.setDefaultGState(reader);

    processOverlappedElements(reader, formWriter, visited, elementsToRemove, delta);

    formWriter.end();
    reader.end();
}
/**
 * Decides from the graphics state whether text would be rendered visibly.
 * <p>
 * Text is treated as not visible when the render mode is invisible, or when the opacity of
 * every paint operation the mode uses (fill, stroke, or both) is zero. All other render modes
 * are treated as visible.
 *
 * @param gState the graphics state of the text element
 * @return {@code false} if the text is considered invisible, {@code true} otherwise
 * @throws PDFNetException if reading the graphics state fails
 */
private boolean isTextRenderedVisibly(GState gState) throws PDFNetException {
    int renderMode = gState.getTextRenderMode();
    boolean hidden;
    if (renderMode == GState.e_invisible_text) {
        hidden = true;
    } else if (renderMode == GState.e_fill_text) {
        hidden = gState.getFillOpacity() == 0;
    } else if (renderMode == GState.e_stroke_text) {
        hidden = gState.getStrokeOpacity() == 0;
    } else if (renderMode == GState.e_fill_stroke_text) {
        hidden = gState.getFillOpacity() == 0 && gState.getStrokeOpacity() == 0;
    } else {
        hidden = false;
    }
    return !hidden;
}
/**
 * Converts PDFTron path data into a {@link GeneralPath}.
 * <p>
 * The operator array is walked in order and each operator consumes its coordinates from the
 * point array: two values for moveto and lineto, six for cubicto, four (x, y, width, height)
 * for rect and none for closepath. A rect is emitted as a complete, closed subpath, matching
 * the definition of the PDF {@code re} operator, so that a segment following the rectangle
 * without its own moveto starts at the rectangle's origin.
 *
 * @param pathData the path data of a path element
 * @return the equivalent path, in the same coordinate space as the input points
 * @throws PDFNetException if the path data cannot be read or contains an unsupported operator
 */
private GeneralPath convertToGeneralPath(PathData pathData) throws PDFNetException {
    GeneralPath linePath = new GeneralPath();
    double[] points = pathData.getPoints();
    byte[] operators = pathData.getOperators();

    int next = 0; // index of the first unread coordinate in points
    for (byte operator : operators) {
        switch (operator) {
            case PathData.e_moveto:
                linePath.moveTo(points[next], points[next + 1]);
                next += 2;
                break;
            case PathData.e_lineto:
                linePath.lineTo(points[next], points[next + 1]);
                next += 2;
                break;
            case PathData.e_cubicto:
                linePath.curveTo(points[next], points[next + 1],
                        points[next + 2], points[next + 3],
                        points[next + 4], points[next + 5]);
                next += 6;
                break;
            case PathData.e_rect: {
                double x = points[next];
                double y = points[next + 1];
                double w = points[next + 2];
                double h = points[next + 3];
                next += 4;
                linePath.moveTo(x, y);
                linePath.lineTo(x + w, y);
                linePath.lineTo(x + w, y + h);
                linePath.lineTo(x, y + h);
                // "re" is a closed subpath; closing also resets the current point to (x, y).
                linePath.closePath();
                break;
            }
            case PathData.e_closepath:
                linePath.closePath();
                break;
            default:
                throw new PDFNetException("Invalid Element Type", 0, "", "", "");
        }
    }
    return linePath;
}
/**
 * Checks whether {@code outer} contains {@code inner}, ignoring a small margin at the border
 * of {@code inner}.
 * <p>
 * The inner rectangle is shrunk by the tolerance on every side before the containment test.
 * The shrink always moves the origin inward (towards larger x and y), independent of the sign
 * of the coordinates, so rectangles at negative coordinates are treated exactly like those at
 * positive coordinates.
 *
 * @param outer the covering shape
 * @param inner the bounding box that may be covered
 * @return {@code true} if the shrunken inner rectangle lies entirely inside {@code outer}
 */
private boolean almostContains(Shape outer, Rectangle2D inner) {
    double tolerance = 1e-3;
    Rectangle2D shrunkenInner = new Rectangle2D.Double(
            inner.getX() + tolerance,
            inner.getY() + tolerance,
            inner.getWidth() - (2 * tolerance),
            inner.getHeight() - (2 * tolerance));
    return outer.contains(shrunkenInner);
}
/**
 * Tells whether an element is filled with a fill opacity of exactly 1.
 *
 * @param element the element to inspect
 * @return {@code true} if the element is filled and its fill opacity equals 1
 * @throws PDFNetException if reading the element or its graphics state fails
 */
private boolean isFilledAndNonTransparent(Element element) throws PDFNetException {
    if (!element.isFilled()) {
        return false;
    }
    return element.getGState().getFillOpacity() == 1;
}
/**
 * Draws the outline of a rectangle in the given color.
 *
 * @param writer   the writer to draw with
 * @param r        the rectangle to outline
 * @param hexcolor the stroke color as {@code #RRGGBB}
 */
@SneakyThrows
private void drawBBox(ElementWriter writer, Rectangle2D r, String hexcolor) {
    // Each two-digit hex channel is scaled from 0..255 to 0..1.
    double red = Integer.parseInt(hexcolor.substring(1, 3), 16) / 255d;
    double green = Integer.parseInt(hexcolor.substring(3, 5), 16) / 255d;
    double blue = Integer.parseInt(hexcolor.substring(5, 7), 16) / 255d;
    ColorPt strokeColor = new ColorPt(red, green, blue);

    ElementBuilder builder = new ElementBuilder();
    Element outline = builder.createRect(r.getX(), r.getY(), r.getWidth(), r.getHeight());
    outline.setPathStroke(true);
    outline.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
    outline.getGState().setStrokeColor(strokeColor);
    writer.writePlacedElement(outline);
}
}

View File

@ -1,5 +1,17 @@
package com.iqser.red.service.ocr.v1.server.service;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.stereotype.Service;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.ocr.v1.api.model.OCRStatusUpdateResponse;
import com.iqser.red.service.ocr.v1.server.configuration.MessagingConfiguration;
@ -8,19 +20,17 @@ import com.iqser.red.service.ocr.v1.server.model.image.ImageServiceResponse;
import com.iqser.red.service.ocr.v1.server.settings.OcrServiceSettings;
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.pdftron.pdf.*;
import com.pdftron.pdf.OCRModule;
import com.pdftron.pdf.OCROptions;
import com.pdftron.pdf.Optimizer;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.RectCollection;
import com.pdftron.sdf.SDFDoc;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.IOUtils;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.stereotype.Service;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.util.*;
@Slf4j
@Service
@ -36,20 +46,18 @@ public class OCRService {
private final ObjectMapper objectMapper;
private final InvisibleElementService invisibleElementService;
private final InvisibleElementRemovalService invisibleElementRemovalService;
@SneakyThrows
public InputStream ocrDocument(String dossierId, String fileId) {
var fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId);
var imageServiceResponse = fileStorageService.getImageServiceResponse(dossierId, fileId);
InputStream fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId);
ImageServiceResponse imageServiceResponse = fileStorageService.getImageServiceResponse(dossierId, fileId);
var fileBytes = IOUtils.toByteArray(fileStream);
byte[] fileWithoutInvisibleTextStream = invisibleElementRemovalService.removeInvisibleElements(fileStream, false);
byte[] fileWithoutInvisibleTextBytes = invisibleElementService.removeInvisibleElements(fileBytes, false);
var ocrBytes = ocr(fileWithoutInvisibleTextBytes, fileId, imageServiceResponse);
byte[] ocrBytes = ocr(fileWithoutInvisibleTextStream, fileId, imageServiceResponse);
return new ByteArrayInputStream(ocrBytes);

View File

@ -1,12 +1,12 @@
package com.iqser.red.service.ocr.v1.server;
import com.iqser.red.service.ocr.v1.server.service.InvisibleElementService;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
import com.pdftron.pdf.TextExtractor;
import lombok.SneakyThrows;
import org.apache.commons.io.FileUtils;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.file.Files;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
@ -17,62 +17,69 @@ import org.springframework.context.annotation.Import;
import org.springframework.core.io.ClassPathResource;
import org.springframework.test.context.junit.jupiter.SpringExtension;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import com.iqser.red.service.ocr.v1.server.service.InvisibleElementRemovalService;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
import com.pdftron.pdf.TextExtractor;
import static com.iqser.red.service.ocr.v1.server.utils.OsUtils.getTemporaryDirectory;
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
import lombok.SneakyThrows;
@ExtendWith(SpringExtension.class)
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT //
, properties = {"pdftron.ocrmodule.path=/YourOCRModulePath"})
@Import(OcrServiceIntegrationTest.TestConfiguration.class)
public class InvisibleElementServiceTest {
public class InvisibleElementRemovalServiceTest {
@Autowired
private InvisibleElementService invisibleElementService;
private InvisibleElementRemovalService invisibleElementRemovalService;
@MockBean
protected RabbitTemplate rabbitTemplate;
@Test
@SneakyThrows
public void testRemoveInvisibleText() {
String fileName = "InvisibleText";
ClassPathResource pdfFileResource = new ClassPathResource("files/" + fileName + ".pdf");
var initialFileBytes = Files.readAllBytes(pdfFileResource.getFile().toPath());
var initialFileStream = Files.newInputStream(pdfFileResource.getFile().toPath());
var fileWithoutInvisibleElements = invisibleElementRemovalService.removeInvisibleElements(initialFileStream, false);
var fileWithoutInvisibleText = invisibleElementService.removeInvisibleElements(initialFileBytes, false);
var deltaFile = invisibleElementService.removeInvisibleElements(initialFileBytes, true);
initialFileStream = Files.newInputStream(pdfFileResource.getFile().toPath());
var deltaFile = invisibleElementRemovalService.removeInvisibleElements(initialFileStream, true);
String fileWithoutInvisibleTextLocation = getTemporaryDirectory() + "/" + fileName + ".pdf";
String deltaFileLocation = getTemporaryDirectory() + "/" + fileName + "_delta.pdf";
saveToFile(fileWithoutInvisibleTextLocation, fileWithoutInvisibleText);
saveToFile(fileWithoutInvisibleTextLocation, fileWithoutInvisibleElements);
saveToFile(deltaFileLocation, deltaFile);
System.out.println("Output File without invisible elements: " + fileWithoutInvisibleTextLocation);
System.out.println("Output Delta File: " + deltaFileLocation);
TextExtractor extractor = new TextExtractor();
PDFDoc pdfDoc = new PDFDoc(fileWithoutInvisibleText);
PDFDoc pdfDoc = new PDFDoc(fileWithoutInvisibleElements);
PageIterator iterator = pdfDoc.getPageIterator();
while (iterator.hasNext()) {
Page page = iterator.next();
extractor.begin(page);
String[] text = extractor.getAsText().split("\n");
assertThat(text).containsAnyOf("APPENDIX 16 Pathology Report", "Amendment 1", "Page 255 of 260");
assertThat(text).contains("APPENDIX 16 Pathology Report", "Amendment 1", "Page 255 of 260");
}
}
private void saveToFile(String location, byte[] fileBytes) {
try (var f_out = FileUtils.openOutputStream(new File(location))) {
try (var f_out = new FileOutputStream(location)) {
f_out.write(fileBytes);
} catch (IOException e) {
throw new RuntimeException("File location: " + location + "could not be openend, no file will be saved");
}
}
}

View File

@ -7,7 +7,7 @@
<parent>
<groupId>com.iqser.red</groupId>
<artifactId>platform-dependency</artifactId>
<version>1.14.0</version>
<version>RED-6114-1</version>
<relativePath/>
</parent>