Compare commits

...

3 Commits

Author SHA1 Message Date
Kilian Schuettler
220b3da42d RED-8156: add ocr debug layers to viewer document 2024-05-08 09:42:53 +02:00
Kilian Schuettler
dbbeac7e64 RED-8156: add ocr debug layers to viewer document 2024-05-06 14:15:35 +02:00
Kilian Schuettler
26f5395d0a certificate: adjust some magic numbers 2024-05-03 12:49:50 +02:00
10 changed files with 338 additions and 80 deletions

View File

@ -25,6 +25,6 @@ dependencies {
api("com.amazonaws:aws-java-sdk-kms:1.12.440")
api("com.google.guava:guava:31.1-jre")
api("com.iqser.red.commons:pdftron-logic-commons:2.27.0")
api("com.knecon.fforesight:viewer-doc-processor:0.89.0")
api("com.knecon.fforesight:viewer-doc-processor:0.124.0")
testImplementation("org.junit.jupiter:junit-jupiter:5.8.1")
}

View File

@ -6,12 +6,15 @@ import com.sun.jna.NativeLibrary;
import jakarta.annotation.PostConstruct;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
@Slf4j
@Component
@RequiredArgsConstructor
public class PDFNetInitializer {
public class NativeLibrariesInitializer {
@Value("${pdftron.license:}")
private String pdftronLicense;
@ -22,8 +25,21 @@ public class PDFNetInitializer {
// Do not change back to application runner, if it is application runner it takes messages from the queue before PDFNet is initialized, that leads to UnsatisfiedLinkError.
public void init() {
log.info("Initializing Native Libraries");
log.info("Setting pdftron license: {}", pdftronLicense);
PDFNet.setTempPath("/tmp/pdftron");
PDFNet.initialize(pdftronLicense);
log.info("Setting jna.library.path: {}", System.getenv("VCPKG_DYNAMIC_LIB"));
System.setProperty("jna.library.path", System.getenv("VCPKG_DYNAMIC_LIB"));
log.info("Asserting Native Libraries loaded");
NativeLibrary leptonicaLib = NativeLibrary.getInstance("leptonica");
assert leptonicaLib != null;
log.info("Leptonica library loaded from {}", leptonicaLib.getFile().getAbsolutePath());
NativeLibrary tesseractLib = NativeLibrary.getInstance("tesseract");
assert tesseractLib != null;
log.info("Tesseract library loaded from {}", leptonicaLib.getFile().getAbsolutePath());
}
}

View File

@ -93,7 +93,7 @@ public interface OcrImage {
if (getWidth() < 200 || getHeight() < 200) {
return ITessAPI.TessPageSegMode.PSM_SINGLE_BLOCK;
}
return ITessAPI.TessPageSegMode.PSM_AUTO;
return ITessAPI.TessPageSegMode.PSM_SPARSE_TEXT;
} // TODO: evaluate if PSM can be dynamically chosen to increase performance

View File

@ -1,5 +1,6 @@
package com.knecon.fforesight.service.ocr.processor.model;
import java.awt.Rectangle;
import java.awt.geom.AffineTransform;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
@ -34,6 +35,16 @@ public record QuadPoint(Point2D a, Point2D b, Point2D c, Point2D d) {
new Point2D.Double(bounds.getRight(), bounds.getBottom()));
}
public Rectangle2D getBounds2D() {
double minX = Math.min(Math.min(Math.min(a.getX(), b.getX()), c.getX()), d.getX());
double minY = Math.min(Math.min(Math.min(a.getY(), b.getY()), c.getY()), d.getY());
double maxX = Math.max(Math.max(Math.max(a.getX(), b.getX()), c.getX()), d.getX());
double maxY = Math.max(Math.max(Math.max(a.getY(), b.getY()), c.getY()), d.getY());
return new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY);
}
public QuadPoint getTransformed(AffineTransform at) {

View File

@ -3,8 +3,12 @@ package com.knecon.fforesight.service.ocr.processor.service;
import java.awt.Color;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
@ -16,6 +20,7 @@ import java.util.stream.Stream;
import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode;
import org.springframework.stereotype.Service;
import com.iqser.red.pdftronlogic.commons.Converter;
import com.knecon.fforesight.service.ocr.processor.model.OcrResultToWrite;
import com.knecon.fforesight.service.ocr.processor.model.QuadPoint;
import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage;
@ -26,6 +31,9 @@ import com.knecon.fforesight.service.viewerdoc.model.PlacedText;
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage;
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.TextExtractor;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
@ -39,32 +47,62 @@ import lombok.extern.slf4j.Slf4j;
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class OcrResultWriter {
public static final Color REGULAR_TEXT_COLOR = Color.BLUE;
public static final Color BOLD_TEXT_COLOR = Color.RED;
public static final Color REGULAR_TEXT_IN_IGNORE_ZONE = Color.ORANGE;
public static final Color BOLD_TEXT_IN_IGNORE_ZONE = Color.MAGENTA;
ViewerDocumentService viewerDocumentService;
@SneakyThrows
public void drawOcrResultsToPdf(File document, File viewerDocument, Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {
Map<Integer, VisualizationsOnPage> ocrVisualizationsOnPages = createVisualizations(imagesWithResultsPerPage);
Map<Integer, VisualizationsOnPage> ocrTextDebugVisualizationsOnPages = createDebugTextVisualizations(imagesWithResultsPerPage);
Map<Integer, VisualizationsOnPage> ocrBBoxDebugVisualizationsOnPages = createDebugBBoxVisualizations(imagesWithResultsPerPage);
Map<Integer, VisualizationsOnPage> ocrVisualizationsOnPages = new HashMap<>();
Map<Integer, VisualizationsOnPage> ocrTextDebugVisualizationsOnPages = new HashMap<>();
Map<Integer, VisualizationsOnPage> ocrBBoxDebugVisualizationsOnPages = new HashMap<>();
try (var in = new FileInputStream(document); PDFDoc doc = new PDFDoc(in)) {
for (Integer pageNumber : imagesWithResultsPerPage.keySet()) {
List<Rectangle2D> textBBoxes = getTextBBoxes(doc.getPage(pageNumber));
ocrVisualizationsOnPages.put(pageNumber - 1, createVisualizations(imagesWithResultsPerPage.get(pageNumber), textBBoxes));
ocrTextDebugVisualizationsOnPages.put(pageNumber - 1, createDebugTextVisualizations(imagesWithResultsPerPage.get(pageNumber), textBBoxes));
ocrBBoxDebugVisualizationsOnPages.put(pageNumber - 1, createDebugBBoxVisualizations(imagesWithResultsPerPage.get(pageNumber)));
}
}
Visualizations visualizations = new Visualizations(ContentStreams.KNECON_OCR, ocrVisualizationsOnPages, false);
List<Visualizations> debugVisualizations = List.of(visualizations,
new Visualizations(ContentStreams.KNECON_OCR_TEXT_DEBUG, ocrTextDebugVisualizationsOnPages, false),
new Visualizations(ContentStreams.KNECON_OCR_BBOX_DEBUG, ocrBBoxDebugVisualizationsOnPages, false));
new Visualizations(ContentStreams.KNECON_OCR_TEXT_DEBUG, ocrTextDebugVisualizationsOnPages, false),
new Visualizations(ContentStreams.KNECON_OCR_BBOX_DEBUG, ocrBBoxDebugVisualizationsOnPages, false));
viewerDocumentService.addVisualizationsOnPage(document, document, visualizations);
viewerDocumentService.addVisualizationsOnPage(document, document, List.of(visualizations));
viewerDocumentService.addVisualizationsOnPage(viewerDocument, viewerDocument, debugVisualizations);
}
private Map<Integer, VisualizationsOnPage> createVisualizations(Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {
private List<Rectangle2D> getTextBBoxes(Page page) {
return imagesWithResultsPerPage.keySet()
.stream()
.collect(Collectors.toMap(pageNumber1IdxTo0IdxMapper(), pageNumber -> createVisualizations(imagesWithResultsPerPage.get(pageNumber))));
List<Rectangle2D> textBBoxes = new ArrayList<>();
try (var textExtractor = new TextExtractor()) {
textExtractor.begin(page);
try {
for (TextExtractor.Line line = textExtractor.getFirstLine(); line.isValid(); line = line.getNextLine()) {
for (var word = line.getFirstWord(); word.isValid(); word = word.getNextWord()) {
textBBoxes.add(Converter.toRectangle2D(word.getBBox()));
}
}
} catch (Exception e) {
log.warn("Could not get word dimension, {}", e.getMessage());
}
return textBBoxes;
}
}
@ -74,71 +112,97 @@ public class OcrResultWriter {
}
private VisualizationsOnPage createVisualizations(List<OcrResultToWrite> ocrResultsToWrite) {
private VisualizationsOnPage createVisualizations(List<OcrResultToWrite> ocrResultsToWrite, List<Rectangle2D> ignoreZones) {
List<TextPositionInImage> words = ocrResultsToWrite.stream()
.map(OcrResultToWrite::textPositionInImage)
.flatMap(Collection::stream)
.filter(word -> ignoreZones.stream()
.noneMatch(ignoreZone -> word.getTransformedTextBBox().getBounds2D().intersects(ignoreZone)))
.toList();
List<TextPositionInImage> words = ocrResultsToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList();
List<PlacedText> placedTexts = words.stream()
.map(word -> new PlacedText(word.getText(),
null,
Color.BLACK,
(float) word.getFontSize(),
word.getFontMetricsFactory(),
Optional.of(word.getTextMatrix()),
Optional.of(RenderingMode.NEITHER)))
null,
Color.BLACK,
(float) word.getFontSize(),
word.getFontMetricsFactory(),
Optional.of(word.getTextMatrix()),
Optional.of(RenderingMode.NEITHER)))
.toList();
return VisualizationsOnPage.builder().placedTexts(placedTexts).build();
}
private Map<Integer, VisualizationsOnPage> createDebugTextVisualizations(Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {
return imagesWithResultsPerPage.keySet()
.stream()
.collect(Collectors.toMap(pageNumber1IdxTo0IdxMapper(), pageNumber -> createDebugTextVisualizations(imagesWithResultsPerPage.get(pageNumber))));
}
private VisualizationsOnPage createDebugTextVisualizations(List<OcrResultToWrite> ocrResultsToWrite, List<Rectangle2D> textBBoxes) {
List<TextPositionInImage> wordsToDraw = new ArrayList<>();
List<TextPositionInImage> ignoredWords = new ArrayList<>();
private VisualizationsOnPage createDebugTextVisualizations(List<OcrResultToWrite> ocrResultsToWrite) {
List<TextPositionInImage> words = ocrResultsToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList();
List<PlacedText> placedTexts = words.stream()
for (OcrResultToWrite ocrResultToWrite : ocrResultsToWrite) {
for (TextPositionInImage textPositionInImage : ocrResultToWrite.textPositionInImage()) {
if (textBBoxes.stream()
.anyMatch(ignoreZone -> textPositionInImage.getTransformedTextBBox().getBounds2D().intersects(ignoreZone))) {
ignoredWords.add(textPositionInImage);
} else {
wordsToDraw.add(textPositionInImage);
}
}
}
Stream<PlacedText> placedTexts = wordsToDraw.stream()
.map(word -> new PlacedText(word.getText(),
null,
word.getFontStyle().equals(FontStyle.REGULAR) ? Color.BLUE : Color.RED,
(float) word.getFontSize(),
word.getFontMetricsFactory(),
Optional.of(word.getTextMatrix()),
Optional.of(RenderingMode.FILL)))
.toList();
return VisualizationsOnPage.builder().placedTexts(placedTexts).build();
null,
word.getFontStyle().equals(FontStyle.REGULAR) ? REGULAR_TEXT_COLOR : BOLD_TEXT_COLOR,
(float) word.getFontSize(),
word.getFontMetricsFactory(),
Optional.of(word.getTextMatrix()),
Optional.of(RenderingMode.FILL)));
Stream<PlacedText> placedTexts2 = ignoredWords.stream()
.map(word -> new PlacedText(word.getText(),
null,
word.getFontStyle().equals(FontStyle.REGULAR) ? REGULAR_TEXT_IN_IGNORE_ZONE : BOLD_TEXT_IN_IGNORE_ZONE,
(float) word.getFontSize(),
word.getFontMetricsFactory(),
Optional.of(word.getTextMatrix()),
Optional.of(RenderingMode.FILL)));
return VisualizationsOnPage.builder()
.placedTexts(Stream.of(placedTexts, placedTexts2)
.flatMap(Function.identity())
.toList())
.build();
}
private Map<Integer, VisualizationsOnPage> createDebugBBoxVisualizations(Map<Integer, List<OcrResultToWrite>> imagesWithResultsPerPage) {
return imagesWithResultsPerPage.keySet()
.stream()
.collect(Collectors.toMap(pageNumber1IdxTo0IdxMapper(), pageNumber -> createDebugBBoxVisualizations(imagesWithResultsPerPage.get(pageNumber))));
}
private VisualizationsOnPage createDebugBBoxVisualizations(List<OcrResultToWrite> ocrResultsToWrite) {
List<TextPositionInImage> words = ocrResultsToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList();
List<TextPositionInImage> words = ocrResultsToWrite.stream()
.map(OcrResultToWrite::textPositionInImage)
.flatMap(Collection::stream)
.toList();
List<ColoredLine> coloredLines = Stream.concat(//
words.stream().map(TextPositionInImage::getTransformedTextBBox).map(this::quadPointAsLines),//
ocrResultsToWrite.stream().map(OcrResultToWrite::imageBoundingBox).map(this::createGrid)//
).flatMap(Collection::stream).toList();
words.stream()
.map(TextPositionInImage::getTransformedTextBBox)
.map(this::quadPointAsLines),//
ocrResultsToWrite.stream()
.map(OcrResultToWrite::imageBoundingBox)
.map(this::createGrid)//
)
.flatMap(Collection::stream)
.toList();
return VisualizationsOnPage.builder().coloredLines(coloredLines).build();
}
private List<ColoredLine> quadPointAsLines(QuadPoint rect) {
return List.of(new ColoredLine(new Line2D.Double(rect.a(), rect.b()), Color.ORANGE, 1),
new ColoredLine(new Line2D.Double(rect.b(), rect.c()), Color.BLUE, 1),
new ColoredLine(new Line2D.Double(rect.c(), rect.d()), Color.GREEN, 1),
new ColoredLine(new Line2D.Double(rect.d(), rect.a()), Color.MAGENTA, 1));
return List.of(new ColoredLine(new Line2D.Double(rect.a(), rect.b()), REGULAR_TEXT_IN_IGNORE_ZONE, 1),
new ColoredLine(new Line2D.Double(rect.b(), rect.c()), REGULAR_TEXT_COLOR, 1),
new ColoredLine(new Line2D.Double(rect.c(), rect.d()), Color.GREEN, 1),
new ColoredLine(new Line2D.Double(rect.d(), rect.a()), BOLD_TEXT_IN_IGNORE_ZONE, 1));
}

View File

@ -45,7 +45,7 @@ public class ImageProcessingThread extends Thread {
final BlockingQueue<UnprocessedImage> imageInputQueue;
final BlockingQueue<OcrImage> imageOutputQueue;
final ITessAPI.TessBaseAPI detectionScriptHandle = initDetectionScriptHandle();
final L_Kernel gaussianKernel = Leptonica1.makeGaussianKernel(2, 2, 1.0f, 1);
final L_Kernel gaussianKernel = Leptonica1.makeGaussianKernel(2, 2, 0.7f, 1);
final Statistics stats;
final OcrServiceSettings settings;
final PDDocument document;
@ -227,7 +227,7 @@ public class ImageProcessingThread extends Thread {
if (pix.w < 100 || pix.h < 100) {
binarized = Leptonica1.pixThresholdToBinary(gaussian, 170);
} else {
binarized = Leptonica1.pixOtsuThreshOnBackgroundNorm(gaussian, null, 50, 50, 165, 10, 100, 5, 5, 0.2f, null);
binarized = Leptonica1.pixOtsuThreshOnBackgroundNorm(gaussian, null, 50, 50, 165, 10, 100, 5, 5, 0.0f, null);
if (binarized == null) { // Sometimes Otsu just fails, then we binarize directly
binarized = Leptonica1.pixThresholdToBinary(gaussian, 170);
}

View File

@ -6,7 +6,7 @@
"overrides": [
{
"name": "tesseract",
"version": "5.3.3"
"version": "5.3.4"
},
{
"name": "leptonica",

View File

@ -24,10 +24,11 @@ import org.springframework.context.annotation.Primary;
import org.springframework.test.context.junit.jupiter.SpringExtension;
import com.iqser.red.commons.jackson.ObjectMapperFactory;
import com.knecon.fforesight.service.ocr.processor.initializer.PDFNetInitializer;
import com.knecon.fforesight.service.ocr.processor.initializer.NativeLibrariesInitializer;
import com.iqser.red.storage.commons.StorageAutoConfiguration;
import com.iqser.red.storage.commons.service.StorageService;
import com.iqser.red.storage.commons.utils.FileSystemBackedStorageService;
import com.knecon.fforesight.service.ocr.v1.server.queue.OcrMessageSender;
import com.knecon.fforesight.tenantcommons.TenantsClient;
import com.pdftron.pdf.PDFNet;
@ -36,7 +37,7 @@ import lombok.SneakyThrows;
@ExtendWith({SpringExtension.class, MockitoExtension.class})
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
@Import({AbstractTest.TestConfiguration.class, PDFNetInitializer.class})
@Import({AbstractTest.TestConfiguration.class, NativeLibrariesInitializer.class})
@AutoConfigureObservability
public class AbstractTest {
@ -46,6 +47,9 @@ public class AbstractTest {
@MockBean
private TenantsClient tenantsClient;
@MockBean
private OcrMessageSender ocrMessageSender;
@Autowired
protected StorageService storageService;

View File

@ -10,30 +10,46 @@ import java.io.FileOutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.core.io.ClassPathResource;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.knecon.fforesight.service.ocr.processor.service.FileStorageService;
import com.iqser.red.pdftronlogic.commons.ClippingPathStack;
import com.iqser.red.pdftronlogic.commons.ElementFeatures;
import com.iqser.red.pdftronlogic.commons.MarkedContentStack;
import com.knecon.fforesight.service.ocr.processor.service.OCRService;
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType;
import com.knecon.fforesight.service.ocr.processor.service.OsUtils;
import com.knecon.fforesight.tenantcommons.TenantContext;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.ColorPt;
import com.pdftron.pdf.ColorSpace;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.GState;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
import com.pdftron.pdf.Rect;
import com.pdftron.sdf.Obj;
import com.pdftron.sdf.SDFDoc;
import io.micrometer.prometheus.PrometheusMeterRegistry;
import io.micrometer.prometheus.PrometheusTimer;
import lombok.Builder;
import lombok.SneakyThrows;
@Disabled // Ghostscript/Tesseract/Leptonica is not available on build server. If you want to run the test install these dependencies. See README.md for help.
//@Disabled // Ghostscript/Tesseract/Leptonica is not available on build server. If you want to run the test install these dependencies. See README.md for help.
@SpringBootTest()
public class OcrServiceIntegrationTest extends AbstractTest {
@ -54,7 +70,10 @@ public class OcrServiceIntegrationTest extends AbstractTest {
testOCR("files/Watermark.pdf");
testOCR("files/Watermark.pdf");
var ocrOnDocumentMeter = registry.getMeters().stream().filter(m -> m.getId().getName().equalsIgnoreCase("redactmanager_runOcrOnDocument")).findAny();
var ocrOnDocumentMeter = registry.getMeters()
.stream()
.filter(m -> m.getId().getName().equalsIgnoreCase("redactmanager_runOcrOnDocument"))
.findAny();
assertThat(ocrOnDocumentMeter.isPresent()).isTrue();
PrometheusTimer timer = (PrometheusTimer) ocrOnDocumentMeter.get();
assertThat(timer.count()).isEqualTo(3);
@ -66,7 +85,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
@SneakyThrows
public void testOcr() {
String text = testOCR("files/402Study.pdf");
String text = testOCR("files/13485 cert example 2.pdf");
}
@ -83,18 +102,18 @@ public class OcrServiceIntegrationTest extends AbstractTest {
// check visually for most of the images containing text, the resulting text is kind of nonsense, just ensure it is there
String text = testOCR("files/merge_images.pdf");
assertThat(text).contains("Bodyweight change of dams with live young - group mean values",
"Control",
"mg/g day",
"10 mg/kg/day",
"20 mg/kg/",
"Days",
"50",
"-200",
"—250",
"150",
"200",
"250",
"—150");
"Control",
"mg/g day",
"10 mg/kg/day",
"20 mg/kg/",
"Days",
"50",
"-200",
"—250",
"150",
"200",
"250",
"—150");
}
@ -138,7 +157,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
@SneakyThrows
public void testOcrForAllDMFiles() {
String dir = "/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/";
String dir = "/home/kschuettler/Dokumente/TestFiles/certificates/no-ocr";
List<File> foundFiles = Files.walk(Path.of(dir))
.sorted(Comparator.comparingLong(this::getFileSize))
.map(Path::toFile)
@ -148,7 +167,9 @@ public class OcrServiceIntegrationTest extends AbstractTest {
int fileCount = foundFiles.size();
AtomicInteger processedCount = new AtomicInteger();
System.out.printf("Found %s files, starting OCR for each.%n%n", fileCount);
foundFiles.stream().peek(file -> System.out.printf("%s/%s: %s%n", processedCount.getAndIncrement(), fileCount, file)).forEach(this::testOCRForFile);
foundFiles.stream()
.peek(file -> System.out.printf("%s/%s: %s%n", processedCount.getAndIncrement(), fileCount, file))
.forEach(this::testOCRForFile);
}
@ -182,4 +203,146 @@ public class OcrServiceIntegrationTest extends AbstractTest {
System.out.println("\n\n");
}
@SneakyThrows
@Test
public void testMakeTextVisible() {
File file = new File("/home/kschuettler/Downloads/18-Curacron_ToxicidadeOcularInVitro.pdf");
PDFDoc doc;
try (var in = new FileInputStream(file)) {
doc = new PDFDoc(in);
}
execute(doc, false, false, Collections.emptySet());
try (var out = new FileOutputStream("/tmp/test.pdf")) {
doc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
}
}
@SneakyThrows
private void execute(PDFDoc pdfDoc, boolean delta, boolean removePaths, Set<String> markedContentToIgnore) {
ElementWriter writer = new ElementWriter();
ElementReader reader = new ElementReader();
Set<Long> visitedXObjIds = new TreeSet<>();
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
Page page = iterator.next();
visitedXObjIds.add(page.getSDFObj().getObjNum());
Context context = Context.builder()
.reader(reader)
.clippingPathStack(new ClippingPathStack(page.getMediaBox()))
.markedContentStack(new MarkedContentStack())
.removePaths(removePaths)
.delta(delta)
.overlappedElements(new ArrayList<>())
.visibleElements(new ArrayList<>())
.visitedXObjIds(visitedXObjIds)
.markedContentToIgnore(markedContentToIgnore)
.build();
removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context);
}
writer.destroy();
reader.destroy();
}
private void removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(Page page, ElementWriter writer, Context context) throws PDFNetException {
context.reader().begin(page);
context.markedContentStack().clear();
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
processElements(writer, context);
writer.end();
context.reader().end();
}
private void processElements(ElementWriter writer, Context context) throws PDFNetException {
for (Element element = context.reader().next(); element != null; element = context.reader().next()) {
if (context.markedContentStack().currentMarkedContentContainsAny(context.markedContentToIgnore()) && element.getType() != Element.e_marked_content_end) {
writer.writeElement(element);
continue;
}
switch (element.getType()) {
case Element.e_text -> processText(element, writer, context);
case Element.e_form -> processForm(element, writer, context);
default -> writer.writeElement(element);
}
}
}
private void processForm(Element formElement, ElementWriter writer, Context context) throws PDFNetException {
writer.writeElement(formElement);
Obj formObj = formElement.getXObject();
if (!context.visitedXObjIds().contains(formObj.getObjNum())) {
context.visitedXObjIds().add(formObj.getObjNum());
// writer needs to be newly initialized when entering a new content stream
// see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
ElementWriter formWriter = new ElementWriter();
context.reader().formBegin();
formWriter.begin(formObj);
context.reader().clearChangeList();
formWriter.setDefaultGState(context.reader());
processElements(formWriter, context);
formWriter.end();
formWriter.destroy();
context.reader().end();
}
}
private void processText(Element textElement, ElementWriter writer, Context context) throws PDFNetException {
Rect textBBox = textElement.getBBox();
if (textBBox == null) {
writer.writeElement(textElement);
return;
}
GState gState = textElement.getGState();
gState.setFillColorSpace(ColorSpace.createDeviceRGB());
// blue for elements removed due to transparency or not rendered or same color as background
gState.setFillColor(new ColorPt(0, 0, 1));
gState.setTextRenderMode(GState.e_fill_text);
gState.setFillOpacity(1);
writer.writeElement(textElement);
}
@Builder
record Context(
boolean removePaths,
boolean delta,
ElementReader reader,
ClippingPathStack clippingPathStack,
MarkedContentStack markedContentStack,
List<ElementFeatures> overlappedElements,
List<ElementFeatures> visibleElements,
Set<Long> visitedXObjIds,
Set<String> markedContentToIgnore
) {
}
}