RED-5911: Reverted to the old OCR logic that uses ContentReplacer/TextExtractor to remove text behind images
This commit is contained in:
parent
e535861da8
commit
7a4c5c2f89
@ -6,11 +6,8 @@ import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||
@ -26,21 +23,17 @@ import com.iqser.red.service.persistence.service.v1.api.utils.SuppressFBWarnings
|
||||
import com.iqser.red.service.redaction.v1.model.Point;
|
||||
import com.iqser.red.service.redaction.v1.model.Rectangle;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.ElementReader;
|
||||
import com.pdftron.pdf.ElementWriter;
|
||||
import com.pdftron.pdf.ContentReplacer;
|
||||
import com.pdftron.pdf.OCRModule;
|
||||
import com.pdftron.pdf.OCROptions;
|
||||
import com.pdftron.pdf.Optimizer;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.pdf.PageIterator;
|
||||
import com.pdftron.pdf.Rect;
|
||||
import com.pdftron.pdf.RectCollection;
|
||||
import com.pdftron.sdf.Obj;
|
||||
import com.pdftron.pdf.TextExtractor;
|
||||
import com.pdftron.sdf.SDFDoc;
|
||||
|
||||
import io.micrometer.core.annotation.Timed;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
@ -51,6 +44,7 @@ import lombok.extern.slf4j.Slf4j;
|
||||
public class OCRService {
|
||||
|
||||
public static final String ENGLISH = "eng";
|
||||
public static final String REPLACEMENT_TEXT = "";
|
||||
|
||||
private final FileStorageService fileStorageService;
|
||||
private final OcrServiceSettings settings;
|
||||
@ -60,12 +54,10 @@ public class OCRService {
|
||||
private final ObjectMapper objectMapper;
|
||||
|
||||
|
||||
@Timed("redactmanager_PDFTron-ocrDocument")
|
||||
@SneakyThrows
|
||||
public InputStream ocrDocument(String dossierId, String fileId) {
|
||||
|
||||
var fileStream = fileStorageService.getOriginalFileAsStream(dossierId, fileId);
|
||||
|
||||
var imageServiceResponse = fileStorageService.getImageServiceResponse(dossierId, fileId);
|
||||
|
||||
var fileBytes = IOUtils.toByteArray(fileStream);
|
||||
@ -83,12 +75,9 @@ public class OCRService {
|
||||
PDFDoc pdfDoc = null;
|
||||
try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
|
||||
pdfDoc = new PDFDoc(file);
|
||||
removeInvisibleText(pdfDoc);
|
||||
|
||||
Map<Integer, List<ImagePosition>> pages = new HashMap<>();
|
||||
|
||||
// TODO take logic to ignore small and combine images from image-service.
|
||||
// TODO Then replace logic so ocr-service is independent from image-service.
|
||||
imageServiceResponse.getData()
|
||||
.forEach(imageMetadata -> pages.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
|
||||
.add(new ImagePosition(new Rectangle(new Point(imageMetadata.getPosition().getX1(), imageMetadata.getPosition().getY1()),
|
||||
@ -97,11 +86,12 @@ public class OCRService {
|
||||
imageMetadata.getPosition().getPageNumber()), imageMetadata.isAlpha())));
|
||||
|
||||
Map<Integer, PDFDoc> pdfDocMap = Collections.synchronizedMap(new HashMap<>());
|
||||
Map<Integer, Integer> wordCountPerPage = Collections.synchronizedMap(new HashMap<>());
|
||||
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
|
||||
objectMapper.writeValueAsString(OCRStatusUpdateResponse.builder().fileId(fileId).numberOfPagesToOCR(pages.keySet().size()).build()));
|
||||
|
||||
ocrPages(pdfDoc, fileId, pages, pdfDocMap);
|
||||
ocrPages(pdfDoc, fileId, pages, pdfDocMap, wordCountPerPage);
|
||||
|
||||
for (var entry : pdfDocMap.entrySet()) {
|
||||
|
||||
@ -109,8 +99,14 @@ public class OCRService {
|
||||
var page = entry.getKey();
|
||||
|
||||
Page ocrPage = ocrDoc.getPageIterator(1).next();
|
||||
pdfDoc.pageInsert(pdfDoc.getPageIterator(page), ocrPage);
|
||||
pdfDoc.pageRemove(pdfDoc.getPageIterator(page + 1));
|
||||
|
||||
TextExtractor txt = new TextExtractor();
|
||||
txt.begin(ocrPage);
|
||||
int wordCount = txt.getWordCount();
|
||||
if (wordCount >= wordCountPerPage.get(page)) {
|
||||
pdfDoc.pageInsert(pdfDoc.getPageIterator(page), ocrPage);
|
||||
pdfDoc.pageRemove(pdfDoc.getPageIterator(page + 1));
|
||||
}
|
||||
|
||||
ocrDoc.close();
|
||||
}
|
||||
@ -143,7 +139,7 @@ public class OCRService {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void ocrPages(PDFDoc pdfDoc, String fileId, Map<Integer, List<ImagePosition>> pages, Map<Integer, PDFDoc> pdfDocMap) {
|
||||
private void ocrPages(PDFDoc pdfDoc, String fileId, Map<Integer, List<ImagePosition>> pages, Map<Integer, PDFDoc> pdfDocMap, Map<Integer, Integer> wordCountPerPage) {
|
||||
|
||||
int numberOfOCRedPages = 0;
|
||||
for (var pageEntry : pages.entrySet()) {
|
||||
@ -153,20 +149,35 @@ public class OCRService {
|
||||
|
||||
var page = pageEntry.getKey();
|
||||
|
||||
var areasToRemoveInOcrDoc = new ArrayList<Rect>();
|
||||
|
||||
Page pdfPage = pdfDoc.getPageIterator(page).next();
|
||||
|
||||
pdfPage.setMediaBox(pdfPage.getCropBox());
|
||||
|
||||
TextExtractor txt = new TextExtractor();
|
||||
txt.begin(pdfPage);
|
||||
int wordCount = txt.getWordCount();
|
||||
wordCountPerPage.put(page, wordCount);
|
||||
|
||||
for (ImagePosition imagePosition : pageEntry.getValue()) {
|
||||
Rectangle rectangle = imagePosition.getRectangle();
|
||||
Rect rect = convert(rectangle, pdfPage.getCropBox(), pdfPage.getMediaBox());
|
||||
|
||||
// Warning coordinate system is different in this call macOs/Linux
|
||||
double y = -rectangle.getTopLeft().getY() + pdfPage.getCropBox().getY2() - rectangle.getHeight();
|
||||
rectCollection.addRect(rectangle.getTopLeft().getX(), y, rectangle.getTopLeft().getX() + rectangle.getWidth(), y + rectangle.getHeight());
|
||||
|
||||
if (!imagePosition.isHasTransparency()) {
|
||||
areasToRemoveInOcrDoc.add(rect);
|
||||
}
|
||||
}
|
||||
|
||||
rectCollection.clear();
|
||||
|
||||
PDFDoc ocrDoc = new PDFDoc();
|
||||
ocrDoc.pagePushBack(pdfPage);
|
||||
removeTextFromOCRPage(areasToRemoveInOcrDoc, ocrDoc);
|
||||
pdfDocMap.put(pageEntry.getKey(), ocrDoc);
|
||||
|
||||
OCROptions options = new OCROptions();
|
||||
@ -175,10 +186,8 @@ public class OCRService {
|
||||
options.addDPI(settings.getOcrDPI());
|
||||
OCRModule.processPDF(ocrDoc, options);
|
||||
|
||||
rectCollection.clear();
|
||||
|
||||
} catch (Exception e) {
|
||||
log.warn("Failed to process PDF page {} - {}", pageEntry.getKey(), e);
|
||||
log.warn("Failed to process PDF page {}", pageEntry.getKey());
|
||||
}
|
||||
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
|
||||
@ -194,131 +203,36 @@ public class OCRService {
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* There are two ways a PDF can contain invisible text:
|
||||
* 1. The gState is set to invisible; this is OCR text.
|
||||
* 2. Filled path elements are drawn in front of the text.
|
||||
*/
|
||||
@SneakyThrows
|
||||
private void removeInvisibleText(PDFDoc pdfDoc) {
|
||||
private void removeTextFromOCRPage(List<Rect> areasToRemoveInOcrDoc, PDFDoc ocrDoc) {
|
||||
|
||||
ElementWriter writer = new ElementWriter();
|
||||
ElementReader reader = new ElementReader();
|
||||
Set<Integer> visited = new TreeSet<>();
|
||||
|
||||
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
|
||||
Page page = iterator.next();
|
||||
removeOverlapText(page, reader, writer, visited);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void removeOverlapText(Page page, ElementReader reader, ElementWriter writer, Set<Integer> visited) {
|
||||
|
||||
visited.add((int) page.getSDFObj().getObjNum());
|
||||
reader.begin(page);
|
||||
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
|
||||
processElements(reader, writer, visited, false);
|
||||
writer.end();
|
||||
reader.end();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void processElements(ElementReader reader, ElementWriter writer, Set<Integer> visited, boolean isInForm) {
|
||||
|
||||
Set<Rect> filledRectangles = new HashSet<>();
|
||||
for (Element element = reader.next(); element != null; element = reader.next())
|
||||
|
||||
switch (element.getType()) {
|
||||
case Element.e_image:
|
||||
case Element.e_inline_image:
|
||||
processImage(element, writer, isInForm);
|
||||
break;
|
||||
|
||||
case Element.e_text:
|
||||
processText(element, writer, filledRectangles);
|
||||
break;
|
||||
|
||||
case Element.e_path:
|
||||
processPath(element, writer, filledRectangles);
|
||||
break;
|
||||
|
||||
case Element.e_form:
|
||||
processForm(reader, writer, element, visited);
|
||||
break;
|
||||
|
||||
default:
|
||||
writer.writeElement(element);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void processImage(Element element, ElementWriter writer, boolean isInForm) {
|
||||
|
||||
if (!isInForm || !settings.isRemoveWatermark()) {
|
||||
writer.writeElement(element);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void processText(Element element, ElementWriter writer, Set<Rect> filledRectangles) {
|
||||
|
||||
if (element.getBBox() == null) {
|
||||
writer.writeElement(element);
|
||||
return;
|
||||
}
|
||||
|
||||
double x = element.getBBox().getX1();
|
||||
double y = element.getBBox().getY1();
|
||||
boolean filledRectangleIntersection = filledRectangles.stream().anyMatch(r -> {
|
||||
Page ocrPage = ocrDoc.getPage(1);
|
||||
for (var rect : areasToRemoveInOcrDoc) {
|
||||
try {
|
||||
return r.contains(x, y);
|
||||
} catch (PDFNetException e) {
|
||||
throw new RuntimeException("Internal pdftron error during removal of overlap text", e);
|
||||
ContentReplacer replacer = new ContentReplacer(); // Reinitialize is needed in loop.
|
||||
replacer.addText(rect, REPLACEMENT_TEXT);
|
||||
replacer.process(ocrPage);
|
||||
} catch (Exception e) {
|
||||
log.warn("Skipping removing text behind image because of: {}", e.getMessage());
|
||||
break;
|
||||
}
|
||||
});
|
||||
|
||||
var gState = element.getGState();
|
||||
|
||||
//See PDF Reference 5.3 Text rendering modes, 3 = Invisible, however this ocr does not use it.
|
||||
if (!filledRectangleIntersection && gState.getTextRenderMode() != 3 || filledRectangleIntersection && gState.getTextRenderMode() == 0) {
|
||||
writer.writeElement(element);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void processPath(Element element, ElementWriter writer, Set<Rect> filledRectangles) {
|
||||
public Rect convert(Rectangle rectangle, Rect cropBox, Rect mediaBox) {
|
||||
|
||||
writer.writeElement(element);
|
||||
if (element.getPathData() != null && element.getPathData().getPoints().length > 4) {
|
||||
filledRectangles.add(element.getBBox());
|
||||
}
|
||||
}
|
||||
try {
|
||||
var offset = 0.01;
|
||||
var x1 = rectangle.getTopLeft().getX() + cropBox.getX1() - mediaBox.getX1() + (cropBox.equals(mediaBox) ? cropBox.getX1() : 0f) - offset;
|
||||
var y1 = rectangle.getTopLeft().getY() + rectangle.getHeight() - (mediaBox.getY1() - cropBox.getY1()) + (cropBox.equals(mediaBox) ? cropBox.getY1() : 0f) + offset;
|
||||
var x2 = rectangle.getTopLeft().getX() + rectangle.getWidth() + cropBox.getX1() - mediaBox.getX1() + (cropBox.equals(mediaBox) ? cropBox.getX1() : 0f) + offset;
|
||||
var y2 = rectangle.getTopLeft().getY() - (mediaBox.getY1() - cropBox.getY1()) + (cropBox.equals(mediaBox) ? cropBox.getY1() : 0f) - offset;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void processForm(ElementReader reader, ElementWriter writer, Element element, Set<Integer> visited) {
|
||||
|
||||
writer.writeElement(element);
|
||||
Obj formObj = element.getXObject();
|
||||
|
||||
if (!visited.contains((int) formObj.getObjNum())) {
|
||||
visited.add((int) formObj.getObjNum());
|
||||
ElementWriter new_writer = new ElementWriter();
|
||||
reader.formBegin();
|
||||
new_writer.begin(formObj);
|
||||
|
||||
reader.clearChangeList();
|
||||
new_writer.setDefaultGState(reader);
|
||||
|
||||
processElements(reader, new_writer, visited, true);
|
||||
new_writer.end();
|
||||
reader.end();
|
||||
// Rect is specified by its lower-left and upper-right corners.
|
||||
return new Rect(x1, y1, x2, y2);
|
||||
} catch (PDFNetException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -0,0 +1 @@
|
||||
{"dossierId": "c8553cbd-409f-4e1a-baf4-34b11d49deac", "fileId": "bd6f93ed896dd0e2f641b0568f13ddf1", "targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz", "data": [{"classification": {"label": "other", "probabilities": {"other": 0.9999, "logo": 0.0001, "formula": 0.0, "signature": 0.0}}, "representation": "FFFFFEFBF7EFCFFFFFFFFFFFF", "position": {"x1": -3, "x2": 795, "y1": 0, "y2": 612, "pageNumber": 1}, "geometry": {"width": 798, "height": 612}, "alpha": false, "filters": {"geometry": {"imageSize": {"quotient": 1.0038, "tooLarge": true, "tooSmall": false}, "imageFormat": {"quotient": 1.3039, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "logo", "probabilities": {"logo": 0.9966, "other": 0.0025, "signature": 0.0005, "formula": 0.0003}}, "representation": "FFC33D3C323CCF3390C1F8C72", "position": {"x1": 120, "x2": 131, "y1": 264, "y2": 380, "pageNumber": 1}, "geometry": {"width": 11, "height": 116}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0513, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.0948, "tooTall": true, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "other", "probabilities": {"other": 1.0, "formula": 0.0, "logo": 0.0, "signature": 0.0}}, "representation": "EF8FF6381060800318F0E187", "position": {"x1": 152, "x2": 205, "y1": 115, "y2": 533, "pageNumber": 1}, "geometry": {"width": 53, "height": 418}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.2138, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.1268, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": true}}, {"classification": {"label": "other", "probabilities": {"other": 0.5967, "logo": 0.1756, "signature": 0.1218, "formula": 0.106}}, "representation": "EC9377C9170E1070C3070C30F", "position": {"x1": 
196, "x2": 221, "y1": 121, "y2": 245, "pageNumber": 1}, "geometry": {"width": 25, "height": 124}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.08, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.2016, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": true}}, {"classification": {"label": "logo", "probabilities": {"logo": 1.0, "formula": 0.0, "other": 0.0, "signature": 0.0}}, "representation": "CF1F1F70F140036860F1441B5", "position": {"x1": 707, "x2": 718, "y1": 481, "y2": 531, "pageNumber": 1}, "geometry": {"width": 11, "height": 50}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0337, "tooLarge": false, "tooSmall": true}, "imageFormat": {"quotient": 0.22, "tooTall": false, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}, {"classification": {"label": "logo", "probabilities": {"logo": 0.9704, "other": 0.0223, "formula": 0.0044, "signature": 0.0029}}, "representation": "CF0F1C70F1C7090081F7CF073", "position": {"x1": 732, "x2": 744, "y1": 118, "y2": 241, "pageNumber": 1}, "geometry": {"width": 12, "height": 123}, "alpha": true, "filters": {"geometry": {"imageSize": {"quotient": 0.0552, "tooLarge": false, "tooSmall": false}, "imageFormat": {"quotient": 0.0976, "tooTall": true, "tooWide": false}}, "probability": {"unconfident": false}, "allPassed": false}}], "dataCV": []}
|
||||
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user