RED-8156: add debug layers to viewer document
* wip, fonts need to be created in the original document
This commit is contained in:
parent
724bb58969
commit
2aaa53f441
@ -25,5 +25,6 @@ dependencies {
|
||||
api("com.amazonaws:aws-java-sdk-kms:1.12.440")
|
||||
api("com.google.guava:guava:31.1-jre")
|
||||
api("com.iqser.red.commons:pdftron-logic-commons:2.23.0")
|
||||
api("com.knecon.fforesight:viewer-doc-processor:0.3.0")
|
||||
testImplementation("org.junit.jupiter:junit-jupiter:5.8.1")
|
||||
}
|
||||
|
||||
@ -1,14 +1,26 @@
|
||||
package com.knecon.fforesight.service.ocr.processor;
|
||||
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.boot.context.properties.EnableConfigurationProperties;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.ComponentScan;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;
|
||||
|
||||
import io.micrometer.observation.ObservationRegistry;
|
||||
|
||||
@Configuration
|
||||
@ComponentScan
|
||||
@EnableConfigurationProperties(OcrServiceSettings.class)
|
||||
public class OcrServiceProcessorConfiguration {
|
||||
|
||||
@Bean
|
||||
@Autowired
|
||||
public ViewerDocumentService viewerDocumentService(ObservationRegistry registry) {
|
||||
|
||||
return new ViewerDocumentService(registry);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,13 +1,11 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Paths;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType;
|
||||
@ -31,47 +29,38 @@ public class FileStorageService {
|
||||
return dossierId + "/" + fileId + "." + fileType.name() + fileType.getExtension();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public byte[] getOriginalFile(String dossierId, String fileId) {
|
||||
|
||||
try (InputStream inputStream = getInputStream(getStorageId(dossierId, fileId, FileType.ORIGIN))) {
|
||||
return IOUtils.toByteArray(inputStream);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public InputStream getOriginalFileAsStream(String dossierId, String fileId) {
|
||||
|
||||
return getInputStream(getStorageId(dossierId, fileId, FileType.ORIGIN));
|
||||
}
|
||||
|
||||
|
||||
public void storeOriginalFile(String dossierId, String fileId, InputStream stream) {
|
||||
|
||||
storageService.storeObject(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.ORIGIN), stream);
|
||||
}
|
||||
|
||||
|
||||
public boolean untouchedFileExists(String dossierId, String fileId) {
|
||||
|
||||
return storageService.objectExists(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.UNTOUCHED));
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void storeFiles(String dossierId, String fileId, File documentFile, File viewerDocumentFile) {
|
||||
|
||||
public void storeUntouchedFile(String dossierId, String fileId, byte[] data) {
|
||||
|
||||
storageService.storeObject(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.UNTOUCHED), new ByteArrayInputStream(data));
|
||||
try (var in = new FileInputStream(documentFile)) {
|
||||
storageService.storeObject(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.ORIGIN), in);
|
||||
}
|
||||
try (var in = new FileInputStream(viewerDocumentFile)) {
|
||||
storageService.storeObject(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.VIEWER_DOCUMENT), in);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private InputStream getInputStream(String storageId) {
|
||||
public void downloadFiles(String dossierId, String fileId, File documentFile, File viewerDocumentFile) {
|
||||
|
||||
File tempFile = File.createTempFile("temp", ".data");
|
||||
storageService.downloadTo(TenantContext.getTenantId(), storageId, tempFile);
|
||||
return Files.newInputStream(Paths.get(tempFile.getPath()), StandardOpenOption.DELETE_ON_CLOSE);
|
||||
storageService.downloadTo(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.ORIGIN), documentFile);
|
||||
if (storageService.objectExists(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.VIEWER_DOCUMENT))) {
|
||||
storageService.downloadTo(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.VIEWER_DOCUMENT), viewerDocumentFile);
|
||||
} else {
|
||||
Files.copy(documentFile.toPath(), viewerDocumentFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
|
||||
}
|
||||
|
||||
if (!untouchedFileExists(dossierId, fileId)) {
|
||||
try (var in = new FileInputStream(documentFile)) {
|
||||
storageService.storeObject(TenantContext.getTenantId(), getStorageId(dossierId, fileId, FileType.UNTOUCHED), in);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,12 +1,12 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
@ -27,6 +27,7 @@ import com.knecon.fforesight.service.ocr.processor.model.OcrResult;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.scriptdetection.FontStyleDetector;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.threads.OCRThread;
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
|
||||
import io.micrometer.observation.ObservationRegistry;
|
||||
import io.micrometer.observation.annotation.Observed;
|
||||
@ -58,55 +59,66 @@ public class OCRService {
|
||||
* looking for stitchedImages (if so converting the current page to an image with ghostscript and work on this instead),
|
||||
* perform tesseract-ocr on these images (via threads) and write the generated ocr-text as invisible elements.
|
||||
*
|
||||
* @param dossierId Id of dossier
|
||||
* @param fileId Id of file
|
||||
* @param out OutputStream where to write to
|
||||
* @param dossierId Id of dossier
|
||||
* @param fileId Id of file
|
||||
* @param tmpDir working directory for all files
|
||||
* @param documentFile the file to perform ocr on, results are written invisibly
|
||||
* @param viewerDocumentFile debugging file, results are written visibly in an optional content group
|
||||
*/
|
||||
@Observed(name = "OCRService", contextualName = "run-ocr-on-document")
|
||||
@SneakyThrows
|
||||
public void runOcrOnDocument(String dossierId, String fileId, OutputStream out) {
|
||||
|
||||
try (InputStream fileStream = removeWatermarkIfEnabled(dossierId, fileId); ByteArrayOutputStream transferOutputStream = new ByteArrayOutputStream()) {
|
||||
|
||||
invisibleElementRemovalService.removeInvisibleElements(fileStream, transferOutputStream, false, false);
|
||||
|
||||
try (InputStream transferInputStream = new ByteArrayInputStream(transferOutputStream.toByteArray())) {
|
||||
log.info("Starting OCR for file {}", fileId);
|
||||
long ocrStart = System.currentTimeMillis();
|
||||
Statistics stats = runOcr(transferInputStream, out, fileId, dossierId);
|
||||
long ocrEnd = System.currentTimeMillis();
|
||||
log.info("ocr successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, String.format("%.1f", (ocrEnd - ocrStart) / 1000.0));
|
||||
log.info("Runtime breakdown: {}", stats);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private InputStream removeWatermarkIfEnabled(String dossierId, String fileId) throws IOException {
|
||||
public void runOcrOnDocument(String dossierId, String fileId, Path tmpDir, File documentFile, File viewerDocumentFile) {
|
||||
|
||||
if (settings.isRemoveWatermark()) {
|
||||
try (var in = fileStorageService.getOriginalFileAsStream(dossierId, fileId); var transferOutputStream = new ByteArrayOutputStream()) {
|
||||
watermarkRemovalService.removeWatermarks(in, transferOutputStream);
|
||||
return new ByteArrayInputStream(transferOutputStream.toByteArray());
|
||||
}
|
||||
removeWatermarkIfEnabled(documentFile);
|
||||
}
|
||||
return fileStorageService.getOriginalFileAsStream(dossierId, fileId);
|
||||
removeInvisibleElements(documentFile);
|
||||
|
||||
log.info("Starting OCR for file {}", fileId);
|
||||
long ocrStart = System.currentTimeMillis();
|
||||
Statistics stats = runOcr(tmpDir, documentFile, viewerDocumentFile, fileId, dossierId);
|
||||
long ocrEnd = System.currentTimeMillis();
|
||||
log.info("ocr successful for file with dossierId {} and fileId {}, took {}s", dossierId, fileId, String.format("%.1f", (ocrEnd - ocrStart) / 1000.0));
|
||||
log.info("Runtime breakdown: {}", stats);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public Statistics runOcr(InputStream in, OutputStream out, String fileId, String dossierId) {
|
||||
private void removeInvisibleElements(File originFile) {
|
||||
|
||||
Path tmpFile = Files.createTempFile("invisibleElements", ".pdf");
|
||||
try (var in = new FileInputStream(originFile); var out = new FileOutputStream(tmpFile.toFile())) {
|
||||
invisibleElementRemovalService.removeInvisibleElements(in, out, false, false);
|
||||
}
|
||||
Files.copy(tmpFile, originFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
|
||||
assert tmpFile.toFile().delete();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void removeWatermarkIfEnabled(File originFile) {
|
||||
|
||||
Path tmpFile = Files.createTempFile("removeWatermarks", ".pdf");
|
||||
try (var in = new FileInputStream(originFile); var out = new FileOutputStream(tmpFile.toFile())) {
|
||||
watermarkRemovalService.removeWatermarks(in, out);
|
||||
}
|
||||
Files.copy(tmpFile, originFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
|
||||
assert tmpFile.toFile().delete();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public Statistics runOcr(Path tmpDir, File documentFile, File viewerDocumentFile, String fileId, String dossierId) {
|
||||
|
||||
long timestamp;
|
||||
Path tmpDir = Path.of(OsUtils.getTemporaryDirectory()).resolve(dossierId + "-" + fileId);
|
||||
|
||||
Path tmpImageDir = tmpDir.resolve("images");
|
||||
Path tesseractOutputDir = tmpDir.resolve("tesseract_output");
|
||||
|
||||
tesseractOutputDir.toFile().mkdirs();
|
||||
tmpImageDir.toFile().mkdirs();
|
||||
|
||||
File documentFile = OsUtils.writeFileToTmpFolder(in, tmpDir);
|
||||
|
||||
Statistics stats;
|
||||
try (PDDocument document = Loader.loadPDF(documentFile)) {
|
||||
OcrProgressLogger logger = new OcrProgressLogger(document.getNumberOfPages(), ocrMessageSender, fileId);
|
||||
@ -150,12 +162,11 @@ public class OCRService {
|
||||
stats.increaseFontStyleDetectionDuration(System.currentTimeMillis() - timestamp);
|
||||
|
||||
timestamp = System.currentTimeMillis();
|
||||
var dictionariesToUpdate = ocrResultWriter.drawOcrResultsToPdf(document, imageWithTextPositionsPerPage);
|
||||
ocrResultWriter.drawOcrResultsToPdf(documentFile, viewerDocumentFile, imageWithTextPositionsPerPage);
|
||||
|
||||
log.info("Saving document");
|
||||
document.saveIncremental(out, dictionariesToUpdate);
|
||||
stats.increaseWritingTextDuration(System.currentTimeMillis() - timestamp);
|
||||
|
||||
FileSystemUtils.deleteRecursively(tmpDir);
|
||||
logger.sendFinished();
|
||||
return stats;
|
||||
}
|
||||
|
||||
@ -1,29 +1,29 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.io.File;
|
||||
import java.util.Collection;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.pdfbox.cos.COSDictionary;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
||||
import org.apache.pdfbox.pdmodel.PDResources;
|
||||
import org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentGroup;
|
||||
import org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentProperties;
|
||||
import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.OcrResultToWrite;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.QuadPoint;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.fonts.FontStyle;
|
||||
import com.knecon.fforesight.service.ocr.processor.settings.OcrServiceSettings;
|
||||
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.PlacedText;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage;
|
||||
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
@ -37,180 +37,97 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class OcrResultWriter {
|
||||
|
||||
static String ocrLayerName = "knecon OCR";
|
||||
OcrServiceSettings settings;
|
||||
ViewerDocumentService viewerDocumentService;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public Set<COSDictionary> drawOcrResultsToPdf(PDDocument document, Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {
|
||||
public void drawOcrResultsToPdf(File document, File viewerDocument, Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {
|
||||
|
||||
Set<COSDictionary> dictionariesToUpdate = new HashSet<>();
|
||||
imagesWithResultsPerPage.keySet().forEach(pageNumber -> drawResultsPerPage(document, pageNumber, imagesWithResultsPerPage.get(pageNumber), dictionariesToUpdate));
|
||||
dictionariesToUpdate.add(document.getDocumentInformation().getCOSObject());
|
||||
return dictionariesToUpdate;
|
||||
List<VisualizationsOnPage> ocrVisualizationsOnPages = createVisualizations(imagesWithResultsPerPage);
|
||||
List<VisualizationsOnPage> ocrTextDebugVisualizationsOnPages = createDebugTextVisualizations(imagesWithResultsPerPage);
|
||||
List<VisualizationsOnPage> ocrBBoxDebugVisualizationsOnPages = createDebugBBoxVisualizations(imagesWithResultsPerPage);
|
||||
viewerDocumentService.addVisualizationsOnPage(document, document, ContentStreams.KNECON_OCR, ocrVisualizationsOnPages, false);
|
||||
viewerDocumentService.addVisualizationsOnPage(viewerDocument, viewerDocument, ContentStreams.KNECON_OCR, ocrVisualizationsOnPages, false);
|
||||
viewerDocumentService.addVisualizationsOnPage(viewerDocument, viewerDocument, ContentStreams.KNECON_OCR_TEXT_DEBUG, ocrTextDebugVisualizationsOnPages, false);
|
||||
viewerDocumentService.addVisualizationsOnPage(viewerDocument, viewerDocument, ContentStreams.KNECON_OCR_BBOX_DEBUG, ocrBBoxDebugVisualizationsOnPages, false);
|
||||
}
|
||||
|
||||
|
||||
private List<VisualizationsOnPage> createVisualizations(Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {
|
||||
|
||||
return imagesWithResultsPerPage.keySet().stream().map(pageNumber -> createVisualizations(pageNumber, imagesWithResultsPerPage.get(pageNumber))).toList();
|
||||
}
|
||||
|
||||
|
||||
private VisualizationsOnPage createVisualizations(Integer pageNumber, List<OcrResultToWrite> ocrResultsToWrite) {
|
||||
|
||||
List<TextPositionInImage> words = ocrResultsToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList();
|
||||
List<PlacedText> placedTexts = words.stream()
|
||||
.map(word -> new PlacedText(word.getText(),
|
||||
null,
|
||||
Color.BLACK,
|
||||
(float) word.getFontSize(),
|
||||
word.getFont(),
|
||||
Optional.of(word.getTextMatrix()),
|
||||
Optional.of(RenderingMode.NEITHER)))
|
||||
.toList();
|
||||
return VisualizationsOnPage.builder().pageNumber(pageNumber - 1).placedTexts(placedTexts).build();
|
||||
}
|
||||
|
||||
|
||||
private List<VisualizationsOnPage> createDebugTextVisualizations(Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {
|
||||
|
||||
return imagesWithResultsPerPage.keySet().stream().map(pageNumber -> createDebugTextVisualizations(pageNumber, imagesWithResultsPerPage.get(pageNumber))).toList();
|
||||
}
|
||||
|
||||
|
||||
private VisualizationsOnPage createDebugTextVisualizations(Integer pageNumber, List<OcrResultToWrite> ocrResultsToWrite) {
|
||||
|
||||
List<TextPositionInImage> words = ocrResultsToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList();
|
||||
List<PlacedText> placedTexts = words.stream()
|
||||
.map(word -> new PlacedText(word.getText(),
|
||||
null,
|
||||
word.getFontStyle().equals(FontStyle.REGULAR) ? Color.BLUE : Color.RED,
|
||||
(float) word.getFontSize(),
|
||||
word.getFont(),
|
||||
Optional.of(word.getTextMatrix()),
|
||||
Optional.of(RenderingMode.FILL)))
|
||||
.toList();
|
||||
return VisualizationsOnPage.builder().pageNumber(pageNumber).placedTexts(placedTexts).build();
|
||||
}
|
||||
|
||||
|
||||
private List<VisualizationsOnPage> createDebugBBoxVisualizations(Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {
|
||||
|
||||
return imagesWithResultsPerPage.keySet().stream().map(pageNumber -> createDebugBBoxVisualizations(pageNumber, imagesWithResultsPerPage.get(pageNumber))).toList();
|
||||
}
|
||||
|
||||
|
||||
private VisualizationsOnPage createDebugBBoxVisualizations(Integer pageNumber, List<OcrResultToWrite> ocrResultsToWrite) {
|
||||
|
||||
List<TextPositionInImage> words = ocrResultsToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList();
|
||||
List<ColoredLine> coloredLines = Stream.concat(//
|
||||
words.stream().map(TextPositionInImage::getTransformedTextBBox).map(this::quadPointAsLines),//
|
||||
ocrResultsToWrite.stream().map(OcrResultToWrite::imageBoundingBox).map(this::createGrid)//
|
||||
).flatMap(Collection::stream).toList();
|
||||
return VisualizationsOnPage.builder().pageNumber(pageNumber).coloredLines(coloredLines).build();
|
||||
}
|
||||
|
||||
|
||||
private List<ColoredLine> quadPointAsLines(QuadPoint rect) {
|
||||
|
||||
return List.of(new ColoredLine(new Line2D.Double(rect.a(), rect.b()), Color.ORANGE, 1),
|
||||
new ColoredLine(new Line2D.Double(rect.b(), rect.c()), Color.BLUE, 1),
|
||||
new ColoredLine(new Line2D.Double(rect.c(), rect.d()), Color.GREEN, 1),
|
||||
new ColoredLine(new Line2D.Double(rect.d(), rect.a()), Color.MAGENTA, 1));
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void drawResultsPerPage(PDDocument document, Integer pageNumber, List<OcrResultToWrite> ocrResultToWrite, Set<COSDictionary> dictionariesToUpdate) {
|
||||
private List<ColoredLine> createGrid(QuadPoint rect) {
|
||||
|
||||
var pdPage = document.getPage(pageNumber - 1);
|
||||
List<ColoredLine> lines = new LinkedList<>(quadPointAsLines(rect));
|
||||
|
||||
PDOptionalContentGroup textDebugLayer = new PDOptionalContentGroup(ocrLayerName);
|
||||
PDOptionalContentGroup bBoxDebugLayer = new PDOptionalContentGroup(ocrLayerName + "BBox");
|
||||
if (settings.isDebug()) {
|
||||
textDebugLayer = addOptionalGroup(ocrLayerName, document, pdPage, dictionariesToUpdate);
|
||||
bBoxDebugLayer = addOptionalGroup(ocrLayerName + " BBox", document, pdPage, dictionariesToUpdate);
|
||||
}
|
||||
|
||||
escapeContentStreams(document, pdPage);
|
||||
|
||||
List<TextPositionInImage> words = ocrResultToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList();
|
||||
try (var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, true)) {
|
||||
|
||||
// write invisible ocr text inside tagged content
|
||||
contentStream.beginMarkedContent(settings.getOcrMarkedContentTag());
|
||||
contentStream.saveGraphicsState();
|
||||
contentStream.setNonStrokingColor(Color.BLUE);
|
||||
contentStream.setStrokingColor(Color.BLUE);
|
||||
contentStream.setLineWidth(1);
|
||||
words.forEach(word -> drawInvisibleWord(word, contentStream));
|
||||
contentStream.restoreGraphicsState();
|
||||
contentStream.endMarkedContent();
|
||||
|
||||
if (settings.isDebug()) { // must not be written, as it will interfere with layout parsing
|
||||
// write visible ocr text inside optional group
|
||||
contentStream.beginMarkedContent(COSName.OC, textDebugLayer);
|
||||
contentStream.saveGraphicsState();
|
||||
words.forEach(word -> drawVisibleWord(word, contentStream));
|
||||
contentStream.restoreGraphicsState();
|
||||
contentStream.endMarkedContent();
|
||||
|
||||
// write word bounding boxes (tesseract output) inside optional group
|
||||
contentStream.beginMarkedContent(COSName.OC, bBoxDebugLayer);
|
||||
contentStream.saveGraphicsState();
|
||||
ocrResultToWrite.stream()
|
||||
.map(OcrResultToWrite::imageBoundingBox)
|
||||
.forEach(imagePosition -> drawGrid(contentStream, imagePosition));
|
||||
words.stream().map(TextPositionInImage::getTransformedTextBBox).forEach(word -> drawRectangle(contentStream, word));
|
||||
contentStream.restoreGraphicsState();
|
||||
contentStream.endMarkedContent();
|
||||
}
|
||||
}
|
||||
dictionariesToUpdate.add(pdPage.getCOSObject());
|
||||
dictionariesToUpdate.add(pdPage.getResources().getCOSObject());
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private static void escapeContentStreams(PDDocument document, PDPage pdPage) {
|
||||
// We need to append to the contentstream, otherwise the content could be overlapped by images
|
||||
// But we also need to save the graphics state before, such that our appended content cannot be affected by previous contentstreams with side-effects, such as not escaped matrix transformations
|
||||
try (var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.PREPEND, false)) {
|
||||
contentStream.saveGraphicsState();
|
||||
}
|
||||
try (var contentStream = new PDPageContentStream(document, pdPage, PDPageContentStream.AppendMode.APPEND, false)) {
|
||||
contentStream.restoreGraphicsState();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private PDOptionalContentGroup addOptionalGroup(String ocrLayerName, PDDocument document, PDPage pdPage, Set<COSDictionary> dictionariesToUpdate) {
|
||||
|
||||
PDDocumentCatalog catalog = document.getDocumentCatalog();
|
||||
PDOptionalContentProperties ocprops = catalog.getOCProperties();
|
||||
if (ocprops == null) {
|
||||
ocprops = new PDOptionalContentProperties();
|
||||
catalog.setOCProperties(ocprops);
|
||||
}
|
||||
PDOptionalContentGroup layer = null;
|
||||
if (ocprops.hasGroup(ocrLayerName)) {
|
||||
layer = ocprops.getGroup(ocrLayerName);
|
||||
} else {
|
||||
layer = new PDOptionalContentGroup(ocrLayerName);
|
||||
ocprops.addGroup(layer);
|
||||
}
|
||||
|
||||
// enable debug layers by default only when DEBUG flag is set.
|
||||
ocprops.setGroupEnabled(layer, settings.isDebug());
|
||||
PDResources resources = pdPage.getResources();
|
||||
if (resources == null) {
|
||||
resources = new PDResources();
|
||||
pdPage.setResources(resources);
|
||||
}
|
||||
dictionariesToUpdate.add(catalog.getCOSObject());
|
||||
return layer;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void drawRectangle(PDPageContentStream contentStream, QuadPoint rect) {
|
||||
|
||||
contentStream.saveGraphicsState();
|
||||
contentStream.setLineWidth(1);
|
||||
contentStream.moveTo((float) rect.a().getX(), (float) rect.a().getY());
|
||||
contentStream.lineTo((float) rect.b().getX(), (float) rect.b().getY());
|
||||
contentStream.setStrokingColor(Color.ORANGE);
|
||||
contentStream.stroke();
|
||||
contentStream.moveTo((float) rect.b().getX(), (float) rect.b().getY());
|
||||
contentStream.lineTo((float) rect.c().getX(), (float) rect.c().getY());
|
||||
contentStream.setStrokingColor(Color.BLUE);
|
||||
contentStream.stroke();
|
||||
contentStream.moveTo((float) rect.c().getX(), (float) rect.c().getY());
|
||||
contentStream.lineTo((float) rect.d().getX(), (float) rect.d().getY());
|
||||
contentStream.setStrokingColor(Color.GREEN);
|
||||
contentStream.stroke();
|
||||
contentStream.moveTo((float) rect.d().getX(), (float) rect.d().getY());
|
||||
contentStream.lineTo((float) rect.a().getX(), (float) rect.a().getY());
|
||||
contentStream.setStrokingColor(Color.MAGENTA);
|
||||
contentStream.stroke();
|
||||
contentStream.restoreGraphicsState();
|
||||
}
|
||||
|
||||
|
||||
private void drawInvisibleWord(TextPositionInImage word, PDPageContentStream contentStream) {
|
||||
|
||||
drawWord(word, contentStream, RenderingMode.NEITHER);
|
||||
}
|
||||
|
||||
|
||||
private void drawVisibleWord(TextPositionInImage word, PDPageContentStream contentStream) {
|
||||
|
||||
drawWord(word, contentStream, RenderingMode.FILL);
|
||||
}
|
||||
|
||||
|
||||
// @SneakyThrows
|
||||
private void drawWord(TextPositionInImage position, PDPageContentStream contentStream, RenderingMode renderingMode) {
|
||||
|
||||
try {
|
||||
contentStream.setNonStrokingColor(switch (position.getFontStyle()) {
|
||||
case BOLD -> Color.RED;
|
||||
case ITALIC -> Color.GREEN;
|
||||
default -> Color.BLUE;
|
||||
});
|
||||
contentStream.beginText();
|
||||
contentStream.setRenderingMode(renderingMode);
|
||||
contentStream.setFont(position.getFont(), (float) position.getFontSize());
|
||||
contentStream.setTextMatrix(position.getTextMatrix());
|
||||
contentStream.showText(position.getText());
|
||||
contentStream.endText();
|
||||
|
||||
} catch (Exception e) {
|
||||
log.error("Failed to write text {}", position.getText());
|
||||
log.error(e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void drawGrid(PDPageContentStream contentStream, QuadPoint rect) {
|
||||
|
||||
drawRectangle(contentStream, rect);
|
||||
|
||||
contentStream.saveGraphicsState();
|
||||
contentStream.setStrokingColor(Color.BLACK);
|
||||
contentStream.setLineWidth(0.2F);
|
||||
int nRows = 8;
|
||||
int nCols = 8;
|
||||
|
||||
@ -218,7 +135,7 @@ public class OcrResultWriter {
|
||||
Point2D start = add(rect.a(), abStep);
|
||||
Point2D end = add(rect.d(), abStep);
|
||||
for (int row = 0; row < nRows; ++row) {
|
||||
drawLine(start, end, contentStream);
|
||||
lines.add(new ColoredLine(new Line2D.Double(start, end), Color.BLACK, 0.2f));
|
||||
start = add(start, abStep);
|
||||
end = add(end, abStep);
|
||||
}
|
||||
@ -226,21 +143,12 @@ public class OcrResultWriter {
|
||||
start = add(rect.a(), adStep);
|
||||
end = add(rect.b(), adStep);
|
||||
for (int col = 0; col < nCols; ++col) {
|
||||
drawLine(start, end, contentStream);
|
||||
lines.add(new ColoredLine(new Line2D.Double(start, end), Color.BLACK, 0.2f));
|
||||
start = add(start, adStep);
|
||||
end = add(end, adStep);
|
||||
}
|
||||
contentStream.restoreGraphicsState();
|
||||
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void drawLine(Point2D a, Point2D b, PDPageContentStream contentStream) {
|
||||
|
||||
contentStream.moveTo((float) a.getX(), (float) a.getY());
|
||||
contentStream.lineTo((float) b.getX(), (float) b.getY());
|
||||
contentStream.stroke();
|
||||
return lines;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -2,18 +2,23 @@ package com.knecon.fforesight.service.ocr.v1.server.queue;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.time.temporal.ChronoUnit;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.springframework.amqp.AmqpRejectAndDontRequeueException;
|
||||
import org.springframework.amqp.core.Message;
|
||||
import org.springframework.amqp.rabbit.annotation.RabbitHandler;
|
||||
import org.springframework.amqp.rabbit.annotation.RabbitListener;
|
||||
import org.springframework.http.HttpStatus;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.springframework.util.FileSystemUtils;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.OsUtils;
|
||||
import com.knecon.fforesight.service.ocr.v1.server.client.FileStatusProcessingUpdateClient;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.FileStorageService;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.OCRService;
|
||||
@ -21,7 +26,6 @@ import com.knecon.fforesight.service.ocr.v1.api.model.DocumentRequest;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileErrorInfo;
|
||||
|
||||
import feign.FeignException;
|
||||
import io.micrometer.observation.annotation.Observed;
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
@ -33,10 +37,10 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class OcrMessageReceiver {
|
||||
|
||||
FileStorageService fileStorageService;
|
||||
ObjectMapper objectMapper;
|
||||
FileStatusProcessingUpdateClient fileStatusProcessingUpdateClient;
|
||||
OCRService ocrService;
|
||||
FileStorageService fileStorageService;
|
||||
ObjectMapper objectMapper;
|
||||
FileStatusProcessingUpdateClient fileStatusProcessingUpdateClient;
|
||||
OCRService ocrService;
|
||||
|
||||
|
||||
@RabbitHandler
|
||||
@ -44,33 +48,33 @@ public class OcrMessageReceiver {
|
||||
/**
 * Handles an incoming OCR request message.
 *
 * NOTE(review): this span is a rendered diff hunk. Removed and added lines are interleaved
 * without +/- markers, and the bare pipe lines are residue of the diff viewer, not Java.
 * Presumably the ByteArrayOutputStream-based lines are the removed version and the
 * tmpDir/File-based lines the added one -- confirm against the repository before relying on it.
 *
 * Visible flow: the message body is deserialized into a DocumentRequest, the status is set via
 * setStatusOcrProcessing, OCR is run through ocrService, the result is stored through
 * fileStorageService, and success is reported through fileStatusProcessingUpdateClient.
 * Any exception is logged, its message and a timestamp are written to the message headers,
 * and it is rethrown wrapped in a RuntimeException.
 *
 * @param in the AMQP message whose body is read as a DocumentRequest
 * @throws IOException declared by the signature; objectMapper.readValue is called outside the try block
 */
public void receiveOcr(Message in) throws IOException {
|
||||
|
||||
DocumentRequest ocrRequestMessage = objectMapper.readValue(in.getBody(), DocumentRequest.class);
|
||||
log.info("--------------------------------------------------------------------------");
|
||||
log.info("Start ocr for file with dossierId {} and fileId {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
|
||||
String dossierId = ocrRequestMessage.getDossierId();
|
||||
String fileId = ocrRequestMessage.getFileId();
|
||||
// Per-request working directory: <temporary directory>/<dossierId>-<fileId>.
Path tmpDir = Path.of(OsUtils.getTemporaryDirectory()).resolve(dossierId + "-" + fileId);
|
||||
|
||||
try {
|
||||
setStatusOcrProcessing(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
|
||||
log.info("--------------------------------------------------------------------------");
|
||||
log.info("Start ocr for file with dossierId {} and fileId {}", dossierId, fileId);
|
||||
|
||||
// Stores the current original file as the "untouched" file when no untouched file exists yet.
if (!fileStorageService.untouchedFileExists(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId())) {
|
||||
byte[] originalFile = fileStorageService.getOriginalFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
|
||||
fileStorageService.storeUntouchedFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), originalFile);
|
||||
}
|
||||
setStatusOcrProcessing(dossierId, fileId);
|
||||
|
||||
// In-memory variant: the OCR output is written to a byte stream and stored as the original file.
try (var transferStream = new ByteArrayOutputStream()) {
|
||||
ocrService.runOcrOnDocument(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), transferStream);
|
||||
try (var inputStream = new ByteArrayInputStream(transferStream.toByteArray())) {
|
||||
fileStorageService.storeOriginalFile(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId(), inputStream);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
log.error("Failed to store file with dossierId {} and fileId {}", ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
// File-based variant: document and viewer document live as files inside tmpDir.
File documentFile = tmpDir.resolve("document.pdf").toFile();
|
||||
File viewerDocumentFile = tmpDir.resolve("viewerDocument.pdf").toFile();
|
||||
|
||||
fileStatusProcessingUpdateClient.ocrSuccessful(ocrRequestMessage.getDossierId(), ocrRequestMessage.getFileId());
|
||||
fileStorageService.downloadFiles(dossierId, fileId, documentFile, viewerDocumentFile);
|
||||
|
||||
ocrService.runOcrOnDocument(dossierId, fileId, tmpDir, documentFile, viewerDocumentFile);
|
||||
|
||||
fileStorageService.storeFiles(dossierId, fileId, documentFile, viewerDocumentFile);
|
||||
|
||||
fileStatusProcessingUpdateClient.ocrSuccessful(dossierId, fileId);
|
||||
} catch (Exception e) {
|
||||
// Only the exception message is logged here; the exception itself is passed on as the cause below.
log.warn("An exception occurred in ocr file stage: {}", e.getMessage());
|
||||
// Error details go into the message headers; receiveOcrDLQ reads these two headers back.
in.getMessageProperties().getHeaders().put(MessagingConfiguration.X_ERROR_INFO_HEADER, e.getMessage());
|
||||
in.getMessageProperties().getHeaders().put(MessagingConfiguration.X_ERROR_INFO_TIMESTAMP_HEADER, OffsetDateTime.now().truncatedTo(ChronoUnit.MILLIS));
|
||||
throw new RuntimeException(e);
|
||||
} finally {
|
||||
// Runs on both the success and the failure path to remove the working directory.
FileSystemUtils.deleteRecursively(tmpDir);
|
||||
}
|
||||
}
|
||||
|
||||
@ -80,6 +84,7 @@ public class OcrMessageReceiver {
|
||||
public void receiveOcrDLQ(Message failedMessage) throws IOException {
|
||||
|
||||
DocumentRequest ocrRequestMessage = objectMapper.readValue(failedMessage.getBody(), DocumentRequest.class);
|
||||
|
||||
log.info("OCR DQL received: {}", ocrRequestMessage);
|
||||
String errorMessage = failedMessage.getMessageProperties().getHeader(MessagingConfiguration.X_ERROR_INFO_HEADER);
|
||||
OffsetDateTime timestamp = failedMessage.getMessageProperties().getHeader(MessagingConfiguration.X_ERROR_INFO_TIMESTAMP_HEADER);
|
||||
|
||||
@ -9,6 +9,7 @@ import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
@ -25,13 +26,14 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.FileStorageService;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.OCRService;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.OsUtils;
|
||||
import com.knecon.fforesight.tenantcommons.TenantContext;
|
||||
|
||||
import io.micrometer.prometheus.PrometheusMeterRegistry;
|
||||
import io.micrometer.prometheus.PrometheusTimer;
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@Disabled // Ghostscript/Tesseract/Leptonica is not available on build server. If you want to run the test install these dependencies. See README.md for help.
|
||||
//@Disabled // Ghostscript/Tesseract/Leptonica is not available on build server. If you want to run the test install these dependencies. See README.md for help.
|
||||
@SpringBootTest()
|
||||
public class OcrServiceIntegrationTest extends AbstractTest {
|
||||
|
||||
@ -64,7 +66,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
||||
// Runs testOCR on a single PDF resource from the "files/" resource folder.
// NOTE(review): rendered diff hunk -- the two `String text` declarations are the removed and
// the added side of the same line (presumably the 402Study.pdf one is the new one; confirm).
// The bare pipe lines are diff-viewer residue, not Java.
@SneakyThrows
|
||||
public void testOcr() {
|
||||
|
||||
String text = testOCR("files/UNAPPROVED_VV-331155 (1).pdf");
|
||||
String text = testOCR("files/402Study.pdf");
|
||||
}
|
||||
|
||||
|
||||
@ -116,18 +118,17 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
||||
/**
 * Runs OCR on a classpath PDF and returns the text extracted from the resulting document.
 *
 * NOTE(review): rendered diff hunk with removed and added lines interleaved and no +/- markers;
 * the bare pipe lines are diff-viewer residue. Presumably the storageService/FileOutputStream
 * lines are the removed version and the tmpDir/document.pdf lines the added one -- confirm.
 *
 * @param fileName classpath location of the PDF, loaded through ClassPathResource
 * @return the value returned by extractAllTextFromDocument for the OCR output file
 */
private String testOCR(String fileName) {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource(fileName);
|
||||
var originId = FileStorageService.getStorageId(TEST_DOSSIER_ID, "file", FileType.ORIGIN);
|
||||
try (var fileStream = pdfFileResource.getInputStream()) {
|
||||
storageService.storeObject(TenantContext.getTenantId(), originId, fileStream);
|
||||
}
|
||||
// Working directory: <temporary directory>/OCR_TEST/<file name of the resource>.
Path tmpDir = Path.of(OsUtils.getTemporaryDirectory()).resolve("OCR_TEST").resolve(Path.of(fileName).getFileName());
|
||||
tmpDir.toFile().mkdirs();
|
||||
var documentFile = tmpDir.resolve(Path.of("document.pdf"));
|
||||
var viewerDocumentFile = tmpDir.resolve(Path.of("viewerDocument.pdf"));
|
||||
// Both target files start out as copies of the same input PDF.
Files.copy(pdfFileResource.getFile().toPath(), documentFile, StandardCopyOption.REPLACE_EXISTING);
|
||||
Files.copy(pdfFileResource.getFile().toPath(), viewerDocumentFile, StandardCopyOption.REPLACE_EXISTING);
|
||||
|
||||
Path tmpFileName = Path.of(getTemporaryDirectory()).resolve(Path.of(fileName).getFileName());
|
||||
try (var out = new FileOutputStream(tmpFileName.toFile())) {
|
||||
ocrService.runOcrOnDocument(TEST_DOSSIER_ID, "file", out);
|
||||
System.out.println("File:" + tmpFileName);
|
||||
}
|
||||
ocrService.runOcrOnDocument(TEST_DOSSIER_ID, "file", tmpDir, documentFile.toFile(), viewerDocumentFile.toFile());
|
||||
System.out.println("File:" + documentFile);
|
||||
|
||||
try (var fileStream = new FileInputStream(tmpFileName.toFile())) {
|
||||
try (var fileStream = new FileInputStream(documentFile.toFile())) {
|
||||
return extractAllTextFromDocument(fileStream);
|
||||
}
|
||||
}
|
||||
@ -166,20 +167,18 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
 * Runs OCR on the given file and prints the path of the output file; nothing is returned or asserted.
 *
 * NOTE(review): rendered diff hunk with removed and added lines interleaved and no +/- markers;
 * the bare pipe lines are diff-viewer residue. Presumably the storageService/FileOutputStream
 * lines are the removed version and the tmpDir/document.pdf lines the added one -- confirm.
 *
 * @param file the input PDF on the local file system
 */
@SneakyThrows
|
||||
private void testOCRForFile(File file) {
|
||||
|
||||
var originId = FileStorageService.getStorageId(TEST_DOSSIER_ID, "file", FileType.ORIGIN);
|
||||
try (var fileStream = new FileInputStream(file)) {
|
||||
storageService.storeObject(TenantContext.getTenantId(), originId, fileStream);
|
||||
}
|
||||
// Working directory: <temporary directory>/OCR_TEST/<file name of the input>.
Path tmpDir = Path.of(OsUtils.getTemporaryDirectory()).resolve("OCR_TEST").resolve(file.toPath().getFileName());
|
||||
tmpDir.toFile().mkdirs();
|
||||
var documentFile = tmpDir.resolve(Path.of("document.pdf"));
|
||||
var viewerDocumentFile = tmpDir.resolve(Path.of("viewerDocument.pdf"));
|
||||
// Both target files start out as copies of the same input file.
Files.copy(file.toPath(), documentFile, StandardCopyOption.REPLACE_EXISTING);
|
||||
Files.copy(file.toPath(), viewerDocumentFile, StandardCopyOption.REPLACE_EXISTING);
|
||||
|
||||
Path tmpFileName = Path.of(getTemporaryDirectory()).resolve(Path.of(file.getAbsolutePath()).getFileName());
|
||||
try (var out = new FileOutputStream(tmpFileName.toFile())) {
|
||||
ocrService.runOcrOnDocument(TEST_DOSSIER_ID, "file", out);
|
||||
System.out.println("File:" + tmpFileName);
|
||||
}
|
||||
ocrService.runOcrOnDocument(TEST_DOSSIER_ID, "file", tmpDir, documentFile.toFile(), viewerDocumentFile.toFile());
|
||||
System.out.println("File:" + documentFile);
|
||||
System.out.println("\n\n");
|
||||
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user