RED-5911: Reverted to the old OCR logic that uses ContentReplacer/TextExtractor to remove text behind images

This commit is contained in:
deiflaender 2023-01-17 12:15:34 +01:00
parent e535861da8
commit 7a4c5c2f89
3 changed files with 51 additions and 136 deletions

View File

@ -6,11 +6,8 @@ import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import org.apache.commons.io.IOUtils;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
@ -26,21 +23,17 @@ import com.iqser.red.service.persistence.service.v1.api.utils.SuppressFBWarnings
import com.iqser.red.service.redaction.v1.model.Point;
import com.iqser.red.service.redaction.v1.model.Rectangle;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.ContentReplacer;
import com.pdftron.pdf.OCRModule;
import com.pdftron.pdf.OCROptions;
import com.pdftron.pdf.Optimizer;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
import com.pdftron.pdf.Rect;
import com.pdftron.pdf.RectCollection;
import com.pdftron.sdf.Obj;
import com.pdftron.pdf.TextExtractor;
import com.pdftron.sdf.SDFDoc;
import io.micrometer.core.annotation.Timed;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@ -51,6 +44,7 @@ import lombok.extern.slf4j.Slf4j;
public class OCRService {
public static final String ENGLISH = "eng";
public static final String REPLACEMENT_TEXT = "";
private final FileStorageService fileStorageService;
private final OcrServiceSettings settings;
@ -60,12 +54,10 @@ public class OCRService {
private final ObjectMapper objectMapper;
@Timed("redactmanager_PDFTron-ocrDocument")
@SneakyThrows
public InputStream ocrDocument(String dossierId, String fileId) {
var fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId);
var imageServiceResponse = fileStorageService.getImageServiceResponse(dossierId, fileId);
var fileBytes = IOUtils.toByteArray(fileStream);
@ -83,12 +75,9 @@ public class OCRService {
PDFDoc pdfDoc = null;
try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
pdfDoc = new PDFDoc(file);
removeInvisibleText(pdfDoc);
Map<Integer, List<ImagePosition>> pages = new HashMap<>();
// TODO take logic to ignore small and combine images from image-service.
// TODO Then replace logic so ocr-service is independent from image-service.
imageServiceResponse.getData()
.forEach(imageMetadata -> pages.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
.add(new ImagePosition(new Rectangle(new Point(imageMetadata.getPosition().getX1(), imageMetadata.getPosition().getY1()),
@ -97,11 +86,12 @@ public class OCRService {
imageMetadata.getPosition().getPageNumber()), imageMetadata.isAlpha())));
Map<Integer, PDFDoc> pdfDocMap = Collections.synchronizedMap(new HashMap<>());
Map<Integer, Integer> wordCountPerPage = Collections.synchronizedMap(new HashMap<>());
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder().fileId(fileId).numberOfPagesToOCR(pages.keySet().size()).build()));
ocrPages(pdfDoc, fileId, pages, pdfDocMap);
ocrPages(pdfDoc, fileId, pages, pdfDocMap, wordCountPerPage);
for (var entry : pdfDocMap.entrySet()) {
@ -109,8 +99,14 @@ public class OCRService {
var page = entry.getKey();
Page ocrPage = ocrDoc.getPageIterator(1).next();
pdfDoc.pageInsert(pdfDoc.getPageIterator(page), ocrPage);
pdfDoc.pageRemove(pdfDoc.getPageIterator(page + 1));
TextExtractor txt = new TextExtractor();
txt.begin(ocrPage);
int wordCount = txt.getWordCount();
if (wordCount >= wordCountPerPage.get(page)) {
pdfDoc.pageInsert(pdfDoc.getPageIterator(page), ocrPage);
pdfDoc.pageRemove(pdfDoc.getPageIterator(page + 1));
}
ocrDoc.close();
}
@ -143,7 +139,7 @@ public class OCRService {
@SneakyThrows
private void ocrPages(PDFDoc pdfDoc, String fileId, Map<Integer, List<ImagePosition>> pages, Map<Integer, PDFDoc> pdfDocMap) {
private void ocrPages(PDFDoc pdfDoc, String fileId, Map<Integer, List<ImagePosition>> pages, Map<Integer, PDFDoc> pdfDocMap, Map<Integer, Integer> wordCountPerPage) {
int numberOfOCRedPages = 0;
for (var pageEntry : pages.entrySet()) {
@ -153,20 +149,35 @@ public class OCRService {
var page = pageEntry.getKey();
var areasToRemoveInOcrDoc = new ArrayList<Rect>();
Page pdfPage = pdfDoc.getPageIterator(page).next();
pdfPage.setMediaBox(pdfPage.getCropBox());
TextExtractor txt = new TextExtractor();
txt.begin(pdfPage);
int wordCount = txt.getWordCount();
wordCountPerPage.put(page, wordCount);
for (ImagePosition imagePosition : pageEntry.getValue()) {
Rectangle rectangle = imagePosition.getRectangle();
Rect rect = convert(rectangle, pdfPage.getCropBox(), pdfPage.getMediaBox());
// Warning coordinate system is different in this call macOs/Linux
double y = -rectangle.getTopLeft().getY() + pdfPage.getCropBox().getY2() - rectangle.getHeight();
rectCollection.addRect(rectangle.getTopLeft().getX(), y, rectangle.getTopLeft().getX() + rectangle.getWidth(), y + rectangle.getHeight());
if (!imagePosition.isHasTransparency()) {
areasToRemoveInOcrDoc.add(rect);
}
}
rectCollection.clear();
PDFDoc ocrDoc = new PDFDoc();
ocrDoc.pagePushBack(pdfPage);
removeTextFromOCRPage(areasToRemoveInOcrDoc, ocrDoc);
pdfDocMap.put(pageEntry.getKey(), ocrDoc);
OCROptions options = new OCROptions();
@ -175,10 +186,8 @@ public class OCRService {
options.addDPI(settings.getOcrDPI());
OCRModule.processPDF(ocrDoc, options);
rectCollection.clear();
} catch (Exception e) {
log.warn("Failed to process PDF page {} - {}", pageEntry.getKey(), e);
log.warn("Failed to process PDF page {}", pageEntry.getKey());
}
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
@ -194,131 +203,36 @@ public class OCRService {
}
/**
* There are two ways a PDF can contain invisible text:
* 1. The text render mode in the graphics state is set to invisible; this is OCR text.
* 2. Filled path elements are drawn in front of the text.
*/
@SneakyThrows
private void removeInvisibleText(PDFDoc pdfDoc) {
private void removeTextFromOCRPage(List<Rect> areasToRemoveInOcrDoc, PDFDoc ocrDoc) {
ElementWriter writer = new ElementWriter();
ElementReader reader = new ElementReader();
Set<Integer> visited = new TreeSet<>();
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
Page page = iterator.next();
removeOverlapText(page, reader, writer, visited);
}
}
/**
 * Rewrites the content stream of one page by passing each of its elements
 * through {@code processElements}.
 *
 * @param page    page whose content is replaced in place
 * @param reader  element reader used to iterate over the page content
 * @param writer  element writer that receives the elements to keep
 * @param visited object numbers already handled; the page's own object
 *                number is added before processing starts
 */
@SneakyThrows
private void removeOverlapText(Page page, ElementReader reader, ElementWriter writer, Set<Integer> visited) {

    // Mark this page as handled in the shared set before walking its content.
    int pageObjNum = (int) page.getSDFObj().getObjNum();
    visited.add(pageObjNum);

    reader.begin(page);

    // e_replacement: the written elements replace the page's existing content.
    var pageResources = page.getResourceDict();
    writer.begin(page, ElementWriter.e_replacement, false, true, pageResources);

    // Top-level page content, so isInForm is false.
    processElements(reader, writer, visited, false);

    // Close the writer before the reader, mirroring the begin order in reverse.
    writer.end();
    reader.end();
}
/**
 * Walks every element the reader yields and dispatches it by element type
 * to the matching handler; element types without a dedicated handler are
 * written through unchanged.
 *
 * @param reader   source of elements; read until {@code next()} returns null
 * @param writer   destination for elements that are kept
 * @param visited  object numbers already processed, forwarded to the form handler
 * @param isInForm whether the elements being read belong to a form;
 *                 forwarded to the image handler
 */
@SneakyThrows
private void processElements(ElementReader reader, ElementWriter writer, Set<Integer> visited, boolean isInForm) {

    // Fresh per invocation: shared between the path and text handlers of this content stream only.
    Set<Rect> filledRectangles = new HashSet<>();

    Element current;
    while ((current = reader.next()) != null) {
        int type = current.getType();

        if (type == Element.e_image || type == Element.e_inline_image) {
            processImage(current, writer, isInForm);
        } else if (type == Element.e_text) {
            processText(current, writer, filledRectangles);
        } else if (type == Element.e_path) {
            processPath(current, writer, filledRectangles);
        } else if (type == Element.e_form) {
            processForm(reader, writer, current, visited);
        } else {
            // Anything else is copied as-is.
            writer.writeElement(current);
        }
    }
}
/**
 * Writes an image element to the output unless it is inside a form and the
 * {@code removeWatermark} setting is enabled, in which case it is dropped.
 *
 * @param element  the image (or inline image) element
 * @param writer   destination for the element when it is kept
 * @param isInForm whether the element was read from within a form
 */
@SneakyThrows
private void processImage(Element element, ElementWriter writer, boolean isInForm) {

    // The setting is only consulted for images inside forms.
    if (isInForm && settings.isRemoveWatermark()) {
        return;
    }
    writer.writeElement(element);
}
@SneakyThrows
private void processText(Element element, ElementWriter writer, Set<Rect> filledRectangles) {
if (element.getBBox() == null) {
writer.writeElement(element);
return;
}
double x = element.getBBox().getX1();
double y = element.getBBox().getY1();
boolean filledRectangleIntersection = filledRectangles.stream().anyMatch(r -> {
Page ocrPage = ocrDoc.getPage(1);
for (var rect : areasToRemoveInOcrDoc) {
try {
return r.contains(x, y);
} catch (PDFNetException e) {
throw new RuntimeException("Internal pdftron error during removal of overlap text", e);
ContentReplacer replacer = new ContentReplacer(); // Reinitialize is needed in loop.
replacer.addText(rect, REPLACEMENT_TEXT);
replacer.process(ocrPage);
} catch (Exception e) {
log.warn("Skipping removing text behind image because of: {}", e.getMessage());
break;
}
});
var gState = element.getGState();
//See PDF Reference 5.3 Text rendering modes, 3 = Invisible, however this ocr does not use it.
if (!filledRectangleIntersection && gState.getTextRenderMode() != 3 || filledRectangleIntersection && gState.getTextRenderMode() == 0) {
writer.writeElement(element);
}
}
@SneakyThrows
private void processPath(Element element, ElementWriter writer, Set<Rect> filledRectangles) {
public Rect convert(Rectangle rectangle, Rect cropBox, Rect mediaBox) {
writer.writeElement(element);
if (element.getPathData() != null && element.getPathData().getPoints().length > 4) {
filledRectangles.add(element.getBBox());
}
}
try {
var offset = 0.01;
var x1 = rectangle.getTopLeft().getX() + cropBox.getX1() - mediaBox.getX1() + (cropBox.equals(mediaBox) ? cropBox.getX1() : 0f) - offset;
var y1 = rectangle.getTopLeft().getY() + rectangle.getHeight() - (mediaBox.getY1() - cropBox.getY1()) + (cropBox.equals(mediaBox) ? cropBox.getY1() : 0f) + offset;
var x2 = rectangle.getTopLeft().getX() + rectangle.getWidth() + cropBox.getX1() - mediaBox.getX1() + (cropBox.equals(mediaBox) ? cropBox.getX1() : 0f) + offset;
var y2 = rectangle.getTopLeft().getY() - (mediaBox.getY1() - cropBox.getY1()) + (cropBox.equals(mediaBox) ? cropBox.getY1() : 0f) - offset;
@SneakyThrows
private void processForm(ElementReader reader, ElementWriter writer, Element element, Set<Integer> visited) {
writer.writeElement(element);
Obj formObj = element.getXObject();
if (!visited.contains((int) formObj.getObjNum())) {
visited.add((int) formObj.getObjNum());
ElementWriter new_writer = new ElementWriter();
reader.formBegin();
new_writer.begin(formObj);
reader.clearChangeList();
new_writer.setDefaultGState(reader);
processElements(reader, new_writer, visited, true);
new_writer.end();
reader.end();
// Rect is specified by its lower-left and upper-right corners.
return new Rect(x1, y1, x2, y2);
} catch (PDFNetException e) {
throw new RuntimeException(e);
}
}

View File

@ -0,0 +1 @@
{"dossierId": "c8553cbd-409f-4e1a-baf4-34b11d49deac", "fileId": "bd6f93ed896dd0e2f641b0568f13ddf1", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": [{"classification": {"label": "other", "probabilities": {"other": 0.9999, "logo": 0.0001, "formula": 0.0, "signature": 0.0}}, "representation": "FFFFFEFBF7EFCFFFFFFFFFFFF", "position": {"x1": -3, "x2": 795, "y1": 0, "y2": 612, "pageNumber": 1}, "geometry": {"width": 798, "height": 612}, "alpha": false, "filters": {"geometry": {"imageSize": {"quotient": 1.0038, "tooLarge": true, "tooSmall": false}, "imageFormat": {"quotient": 1.3039, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "logo", "probabilities": {"logo": 0.9966, "other": 0.0025, "signature": 0.0005, "formula": 0.0003}}, "representation": "FFC33D3C323CCF3390C1F8C72", "position": {"x1": 120, "x2": 131, "y1": 264, "y2": 380, "pageNumber": 1}, "geometry": {"width": 11, "height": 116}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0513, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.0948, "tooTall": true, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "other", "probabilities": {"other": 1.0, "formula": 0.0, "logo": 0.0, "signature": 0.0}}, "representation": "EF8FF6381060800318F0E187", "position": {"x1": 152, "x2": 205, "y1": 115, "y2": 533, "pageNumber": 1}, "geometry": {"width": 53, "height": 418}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.2138, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.1268, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": true}}, {"classification": {"label": "other", "probabilities": {"other": 0.5967, "logo": 0.1756, "signature": 0.1218, "formula": 0.106}}, "representation": "EC9377C9170E1070C3070C30F", "position": {"x1": 
196, "x2": 221, "y1": 121, "y2": 245, "pageNumber": 1}, "geometry": {"width": 25, "height": 124}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.08, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.2016, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": true}}, {"classification": {"label": "logo", "probabilities": {"logo": 1.0, "formula": 0.0, "other": 0.0, "signature": 0.0}}, "representation": "CF1F1F70F140036860F1441B5", "position": {"x1": 707, "x2": 718, "y1": 481, "y2": 531, "pageNumber": 1}, "geometry": {"width": 11, "height": 50}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0337, "tooLarge": false, "tooSmall": true}, "imageFormat": {"quotient": 0.22, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "logo", "probabilities": {"logo": 0.9704, "other": 0.0223, "formula": 0.0044, "signature": 0.0029}}, "representation": "CF0F1C70F1C7090081F7CF073", "position": {"x1": 732, "x2": 744, "y1": 118, "y2": 241, "pageNumber": 1}, "geometry": {"width": 12, "height": 123}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0552, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.0976, "tooTall": true, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}], "dataCV": []}