From c61f71871ed7d0299918de0d7ee1659805a0c1b6 Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Wed, 8 May 2024 10:54:25 +0200 Subject: [PATCH 1/4] RED-7669: improve ocr * decrease otsu-scorefract slightly for thin lines * don't write text that is overlapped with existing text --- ...r.java => NativeLibrariesInitializer.java} | 22 ++- .../ocr/processor/model/QuadPoint.java | 11 ++ .../processor/service/OcrResultWriter.java | 165 ++++++++++++------ .../threads/ImageProcessingThread.java | 2 +- .../service/ocr/v1/server/AbstractTest.java | 4 +- 5 files changed, 147 insertions(+), 57 deletions(-) rename ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/initializer/{PDFNetInitializer.java => NativeLibrariesInitializer.java} (53%) diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/initializer/PDFNetInitializer.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/initializer/NativeLibrariesInitializer.java similarity index 53% rename from ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/initializer/PDFNetInitializer.java rename to ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/initializer/NativeLibrariesInitializer.java index fbd71a9..8158ebc 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/initializer/PDFNetInitializer.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/initializer/NativeLibrariesInitializer.java @@ -1,17 +1,20 @@ package com.knecon.fforesight.service.ocr.processor.initializer; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.stereotype.Component; + import com.pdftron.pdf.PDFNet; import com.sun.jna.NativeLibrary; import jakarta.annotation.PostConstruct; import lombok.RequiredArgsConstructor; import lombok.SneakyThrows; -import org.springframework.beans.factory.annotation.Value; -import org.springframework.stereotype.Component; +import lombok.extern.slf4j.Slf4j; +@Slf4j @Component @RequiredArgsConstructor -public class PDFNetInitializer { +public class NativeLibrariesInitializer { @Value("${pdftron.license:}") private String pdftronLicense; @@ -22,8 +25,21 @@ public class PDFNetInitializer { // Do not change back to application runner, if it is application runner it takes messages from the queue before PDFNet is initialized, that leads to UnsatisfiedLinkError. public void init() { + log.info("Initializing Native Libraries"); + log.info("Setting pdftron license: {}", pdftronLicense); PDFNet.setTempPath("/tmp/pdftron"); PDFNet.initialize(pdftronLicense); + + log.info("Setting jna.library.path: {}", System.getenv("VCPKG_DYNAMIC_LIB")); System.setProperty("jna.library.path", System.getenv("VCPKG_DYNAMIC_LIB")); + + log.info("Asserting Native Libraries loaded"); + NativeLibrary leptonicaLib = NativeLibrary.getInstance("leptonica"); + assert leptonicaLib != null; + log.info("Leptonica library loaded from {}", leptonicaLib.getFile().getAbsolutePath()); + NativeLibrary tesseractLib = NativeLibrary.getInstance("tesseract"); + assert tesseractLib != null; + log.info("Tesseract library loaded from {}", tesseractLib.getFile().getAbsolutePath()); } + } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/QuadPoint.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/QuadPoint.java index c40aa1d..139398a 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/QuadPoint.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/model/QuadPoint.java @@ -1,5 +1,6 @@ package com.knecon.fforesight.service.ocr.processor.model; +import java.awt.Rectangle; import java.awt.geom.AffineTransform; import java.awt.geom.Line2D; import java.awt.geom.Point2D; @@ -34,6 +35,16 @@ public record QuadPoint(Point2D a, Point2D b, Point2D c, Point2D d) { new Point2D.Double(bounds.getRight(), bounds.getBottom())); } + public Rectangle2D getBounds2D() { + + double minX = Math.min(Math.min(Math.min(a.getX(), b.getX()), c.getX()), d.getX()); + double minY = Math.min(Math.min(Math.min(a.getY(), b.getY()), c.getY()), d.getY()); + double maxX = Math.max(Math.max(Math.max(a.getX(), b.getX()), c.getX()), d.getX()); + double maxY = Math.max(Math.max(Math.max(a.getY(), b.getY()), c.getY()), d.getY()); + + return new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY); + } + public QuadPoint getTransformed(AffineTransform at) { diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultWriter.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultWriter.java index 4bae7b4..0083112 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultWriter.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultWriter.java @@ -3,19 +3,23 @@ package com.knecon.fforesight.service.ocr.processor.service; import java.awt.Color; import java.awt.geom.Line2D; import java.awt.geom.Point2D; +import java.awt.geom.Rectangle2D; import java.io.File; +import java.io.FileInputStream; +import java.util.ArrayList; import java.util.Collection; +import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Optional; import java.util.function.Function; -import java.util.stream.Collectors; import java.util.stream.Stream; import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode; import org.springframework.stereotype.Service; +import com.iqser.red.pdftronlogic.commons.Converter; import com.knecon.fforesight.service.ocr.processor.model.OcrResultToWrite; import com.knecon.fforesight.service.ocr.processor.model.QuadPoint; import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage; @@ -26,6 +30,9 @@ import com.knecon.fforesight.service.viewerdoc.model.PlacedText; import com.knecon.fforesight.service.viewerdoc.model.Visualizations; import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage; import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService; +import com.pdftron.pdf.PDFDoc; +import com.pdftron.pdf.Page; +import com.pdftron.pdf.TextExtractor; import lombok.AccessLevel; import lombok.RequiredArgsConstructor; @@ -39,32 +46,62 @@ import lombok.extern.slf4j.Slf4j; @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) public class OcrResultWriter { + public static final Color REGULAR_TEXT_COLOR = Color.BLUE; + public static final Color BOLD_TEXT_COLOR = Color.CYAN; + + public static final Color REGULAR_TEXT_IN_IGNORE_ZONE = Color.RED; + public static final Color BOLD_TEXT_IN_IGNORE_ZONE = Color.RED; + ViewerDocumentService viewerDocumentService; @SneakyThrows public void drawOcrResultsToPdf(File document, File viewerDocument, Map> imagesWithResultsPerPage) { - Map ocrVisualizationsOnPages = createVisualizations(imagesWithResultsPerPage); - Map ocrTextDebugVisualizationsOnPages = createDebugTextVisualizations(imagesWithResultsPerPage); - Map ocrBBoxDebugVisualizationsOnPages = createDebugBBoxVisualizations(imagesWithResultsPerPage); + Map ocrVisualizationsOnPages = new HashMap<>(); + Map ocrTextDebugVisualizationsOnPages = new HashMap<>(); + Map ocrBBoxDebugVisualizationsOnPages = new HashMap<>(); + + try (var in = new FileInputStream(document); PDFDoc doc = new PDFDoc(in)) { + + for (Integer pageNumber : imagesWithResultsPerPage.keySet()) { + + List textBBoxes = getTextBBoxes(doc.getPage(pageNumber)); + + ocrVisualizationsOnPages.put(pageNumber - 1, createVisualizations(imagesWithResultsPerPage.get(pageNumber), textBBoxes)); + ocrTextDebugVisualizationsOnPages.put(pageNumber - 1, createDebugTextVisualizations(imagesWithResultsPerPage.get(pageNumber), textBBoxes)); + ocrBBoxDebugVisualizationsOnPages.put(pageNumber - 1, createDebugBBoxVisualizations(imagesWithResultsPerPage.get(pageNumber))); + } + } Visualizations visualizations = new Visualizations(ContentStreams.KNECON_OCR, ocrVisualizationsOnPages, false); List debugVisualizations = List.of(visualizations, - new Visualizations(ContentStreams.KNECON_OCR_TEXT_DEBUG, ocrTextDebugVisualizationsOnPages, false), - new Visualizations(ContentStreams.KNECON_OCR_BBOX_DEBUG, ocrBBoxDebugVisualizationsOnPages, false)); + new Visualizations(ContentStreams.KNECON_OCR_TEXT_DEBUG, ocrTextDebugVisualizationsOnPages, false), + new Visualizations(ContentStreams.KNECON_OCR_BBOX_DEBUG, ocrBBoxDebugVisualizationsOnPages, false)); - viewerDocumentService.addVisualizationsOnPage(document, document, visualizations); + viewerDocumentService.addVisualizationsOnPage(document, document, List.of(visualizations)); viewerDocumentService.addVisualizationsOnPage(viewerDocument, viewerDocument, debugVisualizations); } - private Map createVisualizations(Map> imagesWithResultsPerPage) { + private List getTextBBoxes(Page page) { - return imagesWithResultsPerPage.keySet() - .stream() - .collect(Collectors.toMap(pageNumber1IdxTo0IdxMapper(), pageNumber -> createVisualizations(imagesWithResultsPerPage.get(pageNumber)))); + List textBBoxes = new ArrayList<>(); + try (var textExtractor = new TextExtractor()) { + textExtractor.begin(page); + + try { + for (TextExtractor.Line line = textExtractor.getFirstLine(); line.isValid(); line = line.getNextLine()) { + for (var word = line.getFirstWord(); word.isValid(); word = word.getNextWord()) { + textBBoxes.add(Converter.toRectangle2D(word.getBBox())); + } + } + } catch (Exception e) { + log.warn("Could not get word dimension, {}", e.getMessage()); + } + return textBBoxes; + } } @@ -74,71 +111,97 @@ public class OcrResultWriter { } - private VisualizationsOnPage createVisualizations(List ocrResultsToWrite) { + private VisualizationsOnPage createVisualizations(List ocrResultsToWrite, List ignoreZones) { + + List words = ocrResultsToWrite.stream() + .map(OcrResultToWrite::textPositionInImage) + .flatMap(Collection::stream) + .filter(word -> ignoreZones.stream() + .noneMatch(ignoreZone -> word.getTransformedTextBBox().getBounds2D().intersects(ignoreZone))) + .toList(); - List words = ocrResultsToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList(); List placedTexts = words.stream() .map(word -> new PlacedText(word.getText(), - null, - Color.BLACK, - (float) word.getFontSize(), - word.getFontMetricsFactory(), - Optional.of(word.getTextMatrix()), - Optional.of(RenderingMode.NEITHER))) + null, + Color.BLACK, + (float) word.getFontSize(), + word.getFontMetricsFactory(), + Optional.of(word.getTextMatrix()), + Optional.of(RenderingMode.NEITHER))) .toList(); return VisualizationsOnPage.builder().placedTexts(placedTexts).build(); } - private Map createDebugTextVisualizations(Map> imagesWithResultsPerPage) { - return imagesWithResultsPerPage.keySet() - .stream() - .collect(Collectors.toMap(pageNumber1IdxTo0IdxMapper(), pageNumber -> createDebugTextVisualizations(imagesWithResultsPerPage.get(pageNumber)))); - } + private VisualizationsOnPage createDebugTextVisualizations(List ocrResultsToWrite, List textBBoxes) { + List wordsToDraw = new ArrayList<>(); + List ignoredWords = new ArrayList<>(); - private VisualizationsOnPage createDebugTextVisualizations(List ocrResultsToWrite) { - - List words = ocrResultsToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList(); - List placedTexts = words.stream() + for (OcrResultToWrite ocrResultToWrite : ocrResultsToWrite) { + for (TextPositionInImage textPositionInImage : ocrResultToWrite.textPositionInImage()) { + if (textBBoxes.stream() + .anyMatch(ignoreZone -> textPositionInImage.getTransformedTextBBox().getBounds2D().intersects(ignoreZone))) { + ignoredWords.add(textPositionInImage); + } else { + wordsToDraw.add(textPositionInImage); + } + } + } + Stream placedTexts = wordsToDraw.stream() .map(word -> new PlacedText(word.getText(), - null, - word.getFontStyle().equals(FontStyle.REGULAR) ? Color.BLUE : Color.RED, - (float) word.getFontSize(), - word.getFontMetricsFactory(), - Optional.of(word.getTextMatrix()), - Optional.of(RenderingMode.FILL))) - .toList(); - return VisualizationsOnPage.builder().placedTexts(placedTexts).build(); + null, + word.getFontStyle().equals(FontStyle.REGULAR) ? REGULAR_TEXT_COLOR : BOLD_TEXT_COLOR, + (float) word.getFontSize(), + word.getFontMetricsFactory(), + Optional.of(word.getTextMatrix()), + Optional.of(RenderingMode.FILL))); + + Stream placedTexts2 = ignoredWords.stream() + .map(word -> new PlacedText(word.getText(), + null, + word.getFontStyle().equals(FontStyle.REGULAR) ? REGULAR_TEXT_IN_IGNORE_ZONE : BOLD_TEXT_IN_IGNORE_ZONE, + (float) word.getFontSize(), + word.getFontMetricsFactory(), + Optional.of(word.getTextMatrix()), + Optional.of(RenderingMode.FILL))); + + return VisualizationsOnPage.builder() + .placedTexts(Stream.of(placedTexts, placedTexts2) + .flatMap(Function.identity()) + .toList()) + .build(); } - private Map createDebugBBoxVisualizations(Map> imagesWithResultsPerPage) { - - return imagesWithResultsPerPage.keySet() - .stream() - .collect(Collectors.toMap(pageNumber1IdxTo0IdxMapper(), pageNumber -> createDebugBBoxVisualizations(imagesWithResultsPerPage.get(pageNumber)))); - } - private VisualizationsOnPage createDebugBBoxVisualizations(List ocrResultsToWrite) { - List words = ocrResultsToWrite.stream().map(OcrResultToWrite::textPositionInImage).flatMap(Collection::stream).toList(); + List words = ocrResultsToWrite.stream() + .map(OcrResultToWrite::textPositionInImage) + .flatMap(Collection::stream) + .toList(); List coloredLines = Stream.concat(// - words.stream().map(TextPositionInImage::getTransformedTextBBox).map(this::quadPointAsLines),// - ocrResultsToWrite.stream().map(OcrResultToWrite::imageBoundingBox).map(this::createGrid)// - ).flatMap(Collection::stream).toList(); + words.stream() + .map(TextPositionInImage::getTransformedTextBBox) + .map(this::quadPointAsLines),// + ocrResultsToWrite.stream() + .map(OcrResultToWrite::imageBoundingBox) + .map(this::createGrid)// + ) + .flatMap(Collection::stream) + .toList(); return VisualizationsOnPage.builder().coloredLines(coloredLines).build(); } private List quadPointAsLines(QuadPoint rect) { - return List.of(new ColoredLine(new Line2D.Double(rect.a(), rect.b()), Color.ORANGE, 1), - new ColoredLine(new Line2D.Double(rect.b(), rect.c()), Color.BLUE, 1), - new ColoredLine(new Line2D.Double(rect.c(), rect.d()), Color.GREEN, 1), - new ColoredLine(new Line2D.Double(rect.d(), rect.a()), Color.MAGENTA, 1)); + return List.of(new ColoredLine(new Line2D.Double(rect.a(), rect.b()), REGULAR_TEXT_IN_IGNORE_ZONE, 1), + new ColoredLine(new Line2D.Double(rect.b(), rect.c()), REGULAR_TEXT_COLOR, 1), + new ColoredLine(new Line2D.Double(rect.c(), rect.d()), Color.GREEN, 1), + new ColoredLine(new Line2D.Double(rect.d(), rect.a()), BOLD_TEXT_IN_IGNORE_ZONE, 1)); } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageProcessingThread.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageProcessingThread.java index 1f233e5..11d0841 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageProcessingThread.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/threads/ImageProcessingThread.java @@ -227,7 +227,7 @@ public class ImageProcessingThread extends Thread { if (pix.w < 100 || pix.h < 100) { binarized = Leptonica1.pixThresholdToBinary(gaussian, 170); } else { - binarized = Leptonica1.pixOtsuThreshOnBackgroundNorm(gaussian, null, 50, 50, 165, 10, 100, 5, 5, 0.2f, null); + binarized = Leptonica1.pixOtsuThreshOnBackgroundNorm(gaussian, null, 50, 50, 165, 10, 100, 5, 5, 0.1f, null); if (binarized == null) { // Sometimes Otsu just fails, then we binarize directly binarized = Leptonica1.pixThresholdToBinary(gaussian, 170); } diff --git a/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/AbstractTest.java b/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/AbstractTest.java index affca4e..e3ce1e2 100644 --- a/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/AbstractTest.java +++ b/ocr-service-v1/ocr-service-server/src/test/java/com/knecon/fforesight/service/ocr/v1/server/AbstractTest.java @@ -24,10 +24,10 @@ import org.springframework.context.annotation.Primary; import org.springframework.test.context.junit.jupiter.SpringExtension; import com.iqser.red.commons.jackson.ObjectMapperFactory; -import com.knecon.fforesight.service.ocr.processor.initializer.PDFNetInitializer; import com.iqser.red.storage.commons.StorageAutoConfiguration; import com.iqser.red.storage.commons.service.StorageService; import com.iqser.red.storage.commons.utils.FileSystemBackedStorageService; +import com.knecon.fforesight.service.ocr.processor.initializer.NativeLibrariesInitializer; import com.knecon.fforesight.tenantcommons.TenantsClient; import com.pdftron.pdf.PDFNet; @@ -36,7 +36,7 @@ import lombok.SneakyThrows; @ExtendWith({SpringExtension.class, MockitoExtension.class}) @SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT) -@Import({AbstractTest.TestConfiguration.class, PDFNetInitializer.class}) +@Import({AbstractTest.TestConfiguration.class, NativeLibrariesInitializer.class}) @AutoConfigureObservability public class AbstractTest { From 18ba1daaef121a1dfa5fe3636b709068d34d0998 Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Wed, 8 May 2024 10:55:38 +0200 Subject: [PATCH 2/4] RED-7669: improve ocr * decrease otsu-scorefract slightly for thin lines * don't write text that is overlapped with existing text --- .../service/ocr/processor/service/OcrResultWriter.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultWriter.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultWriter.java index 0083112..e730ede 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultWriter.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultWriter.java @@ -198,10 +198,10 @@ public class OcrResultWriter { private List quadPointAsLines(QuadPoint rect) { - return List.of(new ColoredLine(new Line2D.Double(rect.a(), rect.b()), REGULAR_TEXT_IN_IGNORE_ZONE, 1), - new ColoredLine(new Line2D.Double(rect.b(), rect.c()), REGULAR_TEXT_COLOR, 1), - new ColoredLine(new Line2D.Double(rect.c(), rect.d()), Color.GREEN, 1), - new ColoredLine(new Line2D.Double(rect.d(), rect.a()), BOLD_TEXT_IN_IGNORE_ZONE, 1)); + return List.of(new ColoredLine(new Line2D.Double(rect.a(), rect.b()), Color.ORANGE, 1), + new ColoredLine(new Line2D.Double(rect.b(), rect.c()), Color.BLUE, 1), + new ColoredLine(new Line2D.Double(rect.c(), rect.d()), Color.GREEN, 1), + new ColoredLine(new Line2D.Double(rect.d(), rect.a()), Color.MAGENTA, 1)); } From 7b5a175440a7d8cd16894562c5d3598d5b151a94 Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Mon, 13 May 2024 11:35:57 +0200 Subject: [PATCH 3/4] RED-7669: improve ocr * fix pmd --- .../NativeLibrariesInitializer.java | 16 ++++++---- .../processor/service/OcrResultWriter.java | 31 ++++++++++++------- 2 files changed, 30 insertions(+), 17 deletions(-) diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/initializer/NativeLibrariesInitializer.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/initializer/NativeLibrariesInitializer.java index 8158ebc..387a7a1 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/initializer/NativeLibrariesInitializer.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/initializer/NativeLibrariesInitializer.java @@ -34,12 +34,16 @@ public class NativeLibrariesInitializer { System.setProperty("jna.library.path", System.getenv("VCPKG_DYNAMIC_LIB")); log.info("Asserting Native Libraries loaded"); - NativeLibrary leptonicaLib = NativeLibrary.getInstance("leptonica"); - assert leptonicaLib != null; - log.info("Leptonica library loaded from {}", leptonicaLib.getFile().getAbsolutePath()); - NativeLibrary tesseractLib = NativeLibrary.getInstance("tesseract"); - assert tesseractLib != null; - log.info("Tesseract library loaded from {}", tesseractLib.getFile().getAbsolutePath()); + + try (NativeLibrary leptonicaLib = NativeLibrary.getInstance("leptonica")) { + assert leptonicaLib != null; + log.info("Leptonica library loaded from {}", leptonicaLib.getFile().getAbsolutePath()); + } + + try (NativeLibrary tesseractLib = NativeLibrary.getInstance("tesseract")) { + assert tesseractLib != null; + log.info("Tesseract library loaded from {}", tesseractLib.getFile().getAbsolutePath()); + } } } diff --git a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultWriter.java b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultWriter.java index e730ede..1895775 100644 --- a/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultWriter.java +++ b/ocr-service-v1/ocr-service-processor/src/main/java/com/knecon/fforesight/service/ocr/processor/service/OcrResultWriter.java @@ -85,15 +85,16 @@ public class OcrResultWriter { } + @SuppressWarnings("PMD") private List getTextBBoxes(Page page) { List textBBoxes = new ArrayList<>(); try (var textExtractor = new TextExtractor()) { textExtractor.begin(page); - try { - for (TextExtractor.Line line = textExtractor.getFirstLine(); line.isValid(); line = line.getNextLine()) { - for (var word = line.getFirstWord(); word.isValid(); word = word.getNextWord()) { + + for (TextExtractor.Line line = textExtractor.getFirstLine(); line.isValid(); line = getNextLine(line)) { + for (TextExtractor.Word word = line.getFirstWord(); word.isValid(); word = getNextWord(word)) { textBBoxes.add(Converter.toRectangle2D(word.getBBox())); } } @@ -105,9 +106,19 @@ public class OcrResultWriter { } - private static Function pageNumber1IdxTo0IdxMapper() { - // PDFBox uses a 0-based index for page numbers internally, while we use a 1-based index - return p -> p - 1; + private static TextExtractor.Word getNextWord(TextExtractor.Word word) { + + TextExtractor.Word nextWord = word.getNextWord(); + word.close(); + return nextWord; + } + + + private static TextExtractor.Line getNextLine(TextExtractor.Line line) { + + TextExtractor.Line newLine = line.getNextLine(); + line.close(); + return newLine; } @@ -133,7 +144,6 @@ public class OcrResultWriter { } - private VisualizationsOnPage createDebugTextVisualizations(List ocrResultsToWrite, List textBBoxes) { List wordsToDraw = new ArrayList<>(); @@ -175,7 +185,6 @@ public class OcrResultWriter { } - private VisualizationsOnPage createDebugBBoxVisualizations(List ocrResultsToWrite) { List words = ocrResultsToWrite.stream() @@ -199,9 +208,9 @@ public class OcrResultWriter { private List quadPointAsLines(QuadPoint rect) { return List.of(new ColoredLine(new Line2D.Double(rect.a(), rect.b()), Color.ORANGE, 1), - new ColoredLine(new Line2D.Double(rect.b(), rect.c()), Color.BLUE, 1), - new ColoredLine(new Line2D.Double(rect.c(), rect.d()), Color.GREEN, 1), - new ColoredLine(new Line2D.Double(rect.d(), rect.a()), Color.MAGENTA, 1)); + new ColoredLine(new Line2D.Double(rect.b(), rect.c()), Color.BLUE, 1), + new ColoredLine(new Line2D.Double(rect.c(), rect.d()), Color.GREEN, 1), + new ColoredLine(new Line2D.Double(rect.d(), rect.a()), Color.MAGENTA, 1)); } From 61b1010e24035a3ed90b45549be806c3b7d13972 Mon Sep 17 00:00:00 2001 From: Kilian Schuettler Date: Mon, 13 May 2024 12:59:40 +0200 Subject: [PATCH 4/4] RED-7669: improve ocr * fix pmd --- ocr-service-v1/ocr-service-processor/build.gradle.kts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocr-service-v1/ocr-service-processor/build.gradle.kts b/ocr-service-v1/ocr-service-processor/build.gradle.kts index 1c36b6d..9b20264 100644 --- a/ocr-service-v1/ocr-service-processor/build.gradle.kts +++ b/ocr-service-v1/ocr-service-processor/build.gradle.kts @@ -25,6 +25,6 @@ dependencies { api("com.amazonaws:aws-java-sdk-kms:1.12.440") api("com.google.guava:guava:31.1-jre") api("com.iqser.red.commons:pdftron-logic-commons:2.27.0") - api("com.knecon.fforesight:viewer-doc-processor:0.89.0") + api("com.knecon.fforesight:viewer-doc-processor:0.125.0") testImplementation("org.junit.jupiter:junit-jupiter:5.8.1") }