Compare commits

..

30 Commits

Author SHA1 Message Date
Kilian Schüttler
5fca39728b Merge branch 'RED-10365' into 'master'
RED-10365: update pdftron logic commons to avoid crash for specific file

See merge request fforesight/ocr-service!59
2024-11-06 09:25:26 +01:00
Kilian Schuettler
cd6390fde1 RED-10365: update pdftron logic commons to avoid crash for specific file 2024-11-06 09:09:50 +01:00
Kilian Schüttler
bc459ee966 Merge branch 'RED-9864' into 'master'
RED-9864: sped up invisible element removal, fixed crash

See merge request fforesight/ocr-service!58
2024-08-26 15:27:07 +02:00
Kilian Schuettler
47e7f8b297 RED-9864: sped up invisible element removal, fixed crash 2024-08-26 15:23:11 +02:00
Kilian Schüttler
22392e083d Merge branch 'RED-9746' into 'master'
RED-9746: update pdftron-logic-commons version

See merge request fforesight/ocr-service!57
2024-08-20 09:43:58 +02:00
Kilian Schuettler
52a1fb4a05 RED-9746: update pdftron-logic-commons version
* fix build
2024-08-19 13:41:46 +02:00
Kilian Schüttler
378436cb2f Merge branch 'RED-8800' into 'master'
RED-8800: adjust coords to cropbox

See merge request fforesight/ocr-service!55
2024-07-15 17:51:06 +02:00
Kilian Schuettler
f1204acc60 RED-8800: adjust coords to cropbox 2024-07-15 17:46:50 +02:00
Andrei Isvoran
998755c3e3 Merge branch 'RED-9496' into 'master'
RED-9496 - Implement graceful shutdown

See merge request fforesight/ocr-service!54
2024-07-04 12:35:01 +02:00
Andrei Isvoran
c598f62633 RED-9496 - Implement graceful shutdown 2024-07-04 12:17:12 +03:00
Corina Olariu
2e25ee2155 Merge branch 'RED-8701-deletefile' into 'master'
RED-8701 - Move files to customer data repositories

See merge request fforesight/ocr-service!53
2024-05-17 09:56:29 +02:00
Corina Olariu
7f04fb3c6f RED-8701 - Move files to customer data repositories
- remove one customer file (single page)
2024-05-17 10:48:10 +03:00
Andrei Isvoran
ff32f016eb Merge branch 'RED-9157-tracing' into 'master'
RED-9157 - Update tracing

See merge request fforesight/ocr-service!52
2024-05-15 09:59:00 +02:00
Andrei Isvoran
821ef265fe RED-9157 - Update tracing 2024-05-15 10:40:31 +03:00
Kilian Schüttler
7fcb6652ef Merge branch 'RED-7669' into 'master'
RED-7669: improve ocr

See merge request fforesight/ocr-service!51
2024-05-13 15:03:06 +02:00
Kilian Schuettler
61b1010e24 RED-7669: improve ocr
* fix pmd
2024-05-13 12:59:40 +02:00
Kilian Schuettler
7b5a175440 RED-7669: improve ocr
* fix pmd
2024-05-13 11:35:57 +02:00
Kilian Schuettler
18ba1daaef RED-7669: improve ocr
* decrease otsu-scorefract slightly for thin lines
* don't write text that is overlapped with existing text
2024-05-08 10:55:38 +02:00
Kilian Schuettler
c61f71871e RED-7669: improve ocr
* decrease otsu-scorefract slightly for thin lines
* don't write text that is overlapped with existing text
2024-05-08 10:54:25 +02:00
Timo Bejan
cc2937d0d2 Merge branch 'RED-8701-upgrade2' into 'master'
RED-8701 - Move files to customer data repositories

See merge request fforesight/ocr-service!50
2024-04-29 08:56:26 +02:00
Corina Olariu
71255d9fc9 RED-8701 - Move files to customer data repositories
- update springBootStarterVersion and org.springframework.cloud:spring-cloud-starter-openfeign
2024-04-26 15:01:48 +03:00
Kilian Schüttler
1f9dac17e3 Merge branch 'RED-8701-upgrade' into 'master'
RED-8701 - Move files to customer data repositories

See merge request fforesight/ocr-service!49
2024-04-25 12:56:28 +02:00
Corina Olariu
5712292698 RED-8701 - Move files to customer data repositories
- update fagiani_apt buildpack
2024-04-25 13:37:59 +03:00
Kevin Tumma
1395318e18 Merge branch 'RED-8701-spring-version' into 'master'
RED-8701 - Move files to customer data repositories

See merge request fforesight/ocr-service!48
2024-04-25 11:11:53 +02:00
Corina Olariu
842b794153 RED-8701 - Move files to customer data repositories
- update "org.springframework.boot" version to 3.2.3
2024-04-25 12:00:00 +03:00
Yannik Hampe
4b3ccc28e2 Merge branch 'RED-8701' into 'master'
RED-8701 - Move files to customer data repositories

See merge request fforesight/ocr-service!47
2024-04-25 09:06:53 +02:00
Corina Olariu
b469ea4174 RED-8701 - Move files to customer data repositories
- update syngenta submodule
2024-04-23 14:56:57 +03:00
Corina Olariu
253bb70519 RED-8701 - Move files to customer data repositories
- update syngenta submodule
2024-04-23 10:44:48 +03:00
Corina Olariu
d55f245c5e RED-8701 - Move files to customer data repositories
- update unit tests with the new path to submodules for customer files
- remove customer files from project
2024-04-22 14:06:56 +03:00
Corina Olariu
7ed1632c6f RED-8701 - Move files to customer data repositories
- use git lfs to store customer files
2024-04-18 20:58:35 +03:00
20 changed files with 143 additions and 237 deletions

View File

@ -1,3 +1,7 @@
variables:
# SONAR_PROJECT_KEY: 'ocr-service:ocr-service-server'
GIT_SUBMODULE_STRATEGY: recursive
GIT_SUBMODULE_FORCE_HTTPS: 'true'
include:
- project: 'gitlab/gitlab'
ref: 'main'

8
.gitmodules vendored Normal file
View File

@ -0,0 +1,8 @@
[submodule "ocr-service-v1/ocr-service-server/src/test/resources/files/syngenta"]
path = ocr-service-v1/ocr-service-server/src/test/resources/files/syngenta
url = https://gitlab.knecon.com/fforesight/documents/syngenta.git
update = merge
[submodule "ocr-service-v1/ocr-service-server/src/test/resources/files/basf"]
path = ocr-service-v1/ocr-service-server/src/test/resources/files/basf
url = https://gitlab.knecon.com/fforesight/documents/basf.git
update = merge

View File

@ -15,6 +15,7 @@ dependencies {
api("com.iqser.red.commons:metric-commons:2.1.0")
api("com.iqser.red.commons:storage-commons:2.45.0")
api("com.knecon.fforesight:tenant-commons:0.21.0")
api("com.knecon.fforesight:lifecycle-commons:0.6.0")
api("com.pdftron:PDFNet:10.5.0")
api("org.apache.pdfbox:pdfbox:3.0.0")
api("org.apache.pdfbox:jbig2-imageio:3.0.4")
@ -24,7 +25,7 @@ dependencies {
api("io.github.karols:hocr4j:0.2.0")
api("com.amazonaws:aws-java-sdk-kms:1.12.440")
api("com.google.guava:guava:31.1-jre")
api("com.iqser.red.commons:pdftron-logic-commons:2.27.0")
api("com.knecon.fforesight:viewer-doc-processor:0.124.0")
api("com.iqser.red.commons:pdftron-logic-commons:2.32.0")
api("com.knecon.fforesight:viewer-doc-processor:0.125.0")
testImplementation("org.junit.jupiter:junit-jupiter:5.8.1")
}

View File

@ -1,5 +1,8 @@
package com.knecon.fforesight.service.ocr.processor.initializer;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import com.pdftron.pdf.PDFNet;
import com.sun.jna.NativeLibrary;
@ -8,9 +11,6 @@ import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
@Slf4j
@Component
@RequiredArgsConstructor
@ -34,12 +34,16 @@ public class NativeLibrariesInitializer {
System.setProperty("jna.library.path", System.getenv("VCPKG_DYNAMIC_LIB"));
log.info("Asserting Native Libraries loaded");
NativeLibrary leptonicaLib = NativeLibrary.getInstance("leptonica");
assert leptonicaLib != null;
log.info("Leptonica library loaded from {}", leptonicaLib.getFile().getAbsolutePath());
NativeLibrary tesseractLib = NativeLibrary.getInstance("tesseract");
assert tesseractLib != null;
log.info("Tesseract library loaded from {}", leptonicaLib.getFile().getAbsolutePath());
try (NativeLibrary leptonicaLib = NativeLibrary.getInstance("leptonica")) {
assert leptonicaLib != null;
log.info("Leptonica library loaded from {}", leptonicaLib.getFile().getAbsolutePath());
}
try (NativeLibrary tesseractLib = NativeLibrary.getInstance("tesseract")) {
assert tesseractLib != null;
log.info("Tesseract library loaded from {}", tesseractLib.getFile().getAbsolutePath());
}
}
}

View File

@ -93,7 +93,7 @@ public interface OcrImage {
if (getWidth() < 200 || getHeight() < 200) {
return ITessAPI.TessPageSegMode.PSM_SINGLE_BLOCK;
}
return ITessAPI.TessPageSegMode.PSM_SPARSE_TEXT;
return ITessAPI.TessPageSegMode.PSM_AUTO;
} // TODO: evaluate if PSM can be dynamically chosen to increase performance

View File

@ -24,7 +24,7 @@ public class RenderedPageOcrImage implements OcrImage {
public AffineTransform getImageCTM() {
double scalingFactor = calculateScalingFactor();
AffineTransform imageToCropBoxScaling = new AffineTransform(scalingFactor, 0, 0, scalingFactor, -pageInformation.minX(), -pageInformation.minY());
AffineTransform imageToCropBoxScaling = new AffineTransform(scalingFactor, 0, 0, scalingFactor, 0, 0);
AffineTransform mirrorMatrix = new AffineTransform(1, 0, 0, -1, 0, pageInformation.height());

View File

@ -14,7 +14,6 @@ import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode;
@ -48,10 +47,10 @@ import lombok.extern.slf4j.Slf4j;
public class OcrResultWriter {
public static final Color REGULAR_TEXT_COLOR = Color.BLUE;
public static final Color BOLD_TEXT_COLOR = Color.RED;
public static final Color BOLD_TEXT_COLOR = Color.CYAN;
public static final Color REGULAR_TEXT_IN_IGNORE_ZONE = Color.ORANGE;
public static final Color BOLD_TEXT_IN_IGNORE_ZONE = Color.MAGENTA;
public static final Color REGULAR_TEXT_IN_IGNORE_ZONE = Color.RED;
public static final Color BOLD_TEXT_IN_IGNORE_ZONE = Color.RED;
ViewerDocumentService viewerDocumentService;
@ -86,15 +85,16 @@ public class OcrResultWriter {
}
@SuppressWarnings("PMD")
private List<Rectangle2D> getTextBBoxes(Page page) {
List<Rectangle2D> textBBoxes = new ArrayList<>();
try (var textExtractor = new TextExtractor()) {
textExtractor.begin(page);
try {
for (TextExtractor.Line line = textExtractor.getFirstLine(); line.isValid(); line = line.getNextLine()) {
for (var word = line.getFirstWord(); word.isValid(); word = word.getNextWord()) {
for (TextExtractor.Line line = textExtractor.getFirstLine(); line.isValid(); line = getNextLine(line)) {
for (TextExtractor.Word word = line.getFirstWord(); word.isValid(); word = getNextWord(word)) {
textBBoxes.add(Converter.toRectangle2D(word.getBBox()));
}
}
@ -106,9 +106,19 @@ public class OcrResultWriter {
}
private static Function<Integer, Integer> pageNumber1IdxTo0IdxMapper() {
// PDFBox uses a 0-based index for page numbers internally, while we use a 1-based index
return p -> p - 1;
private static TextExtractor.Word getNextWord(TextExtractor.Word word) {
TextExtractor.Word nextWord = word.getNextWord();
word.close();
return nextWord;
}
private static TextExtractor.Line getNextLine(TextExtractor.Line line) {
TextExtractor.Line newLine = line.getNextLine();
line.close();
return newLine;
}
@ -134,7 +144,6 @@ public class OcrResultWriter {
}
private VisualizationsOnPage createDebugTextVisualizations(List<OcrResultToWrite> ocrResultsToWrite, List<Rectangle2D> textBBoxes) {
List<TextPositionInImage> wordsToDraw = new ArrayList<>();
@ -176,7 +185,6 @@ public class OcrResultWriter {
}
private VisualizationsOnPage createDebugBBoxVisualizations(List<OcrResultToWrite> ocrResultsToWrite) {
List<TextPositionInImage> words = ocrResultsToWrite.stream()
@ -199,10 +207,10 @@ public class OcrResultWriter {
private List<ColoredLine> quadPointAsLines(QuadPoint rect) {
return List.of(new ColoredLine(new Line2D.Double(rect.a(), rect.b()), REGULAR_TEXT_IN_IGNORE_ZONE, 1),
new ColoredLine(new Line2D.Double(rect.b(), rect.c()), REGULAR_TEXT_COLOR, 1),
return List.of(new ColoredLine(new Line2D.Double(rect.a(), rect.b()), Color.ORANGE, 1),
new ColoredLine(new Line2D.Double(rect.b(), rect.c()), Color.BLUE, 1),
new ColoredLine(new Line2D.Double(rect.c(), rect.d()), Color.GREEN, 1),
new ColoredLine(new Line2D.Double(rect.d(), rect.a()), BOLD_TEXT_IN_IGNORE_ZONE, 1));
new ColoredLine(new Line2D.Double(rect.d(), rect.a()), Color.MAGENTA, 1));
}

View File

@ -45,7 +45,7 @@ public class ImageProcessingThread extends Thread {
final BlockingQueue<UnprocessedImage> imageInputQueue;
final BlockingQueue<OcrImage> imageOutputQueue;
final ITessAPI.TessBaseAPI detectionScriptHandle = initDetectionScriptHandle();
final L_Kernel gaussianKernel = Leptonica1.makeGaussianKernel(2, 2, 0.7f, 1);
final L_Kernel gaussianKernel = Leptonica1.makeGaussianKernel(2, 2, 1.0f, 1);
final Statistics stats;
final OcrServiceSettings settings;
final PDDocument document;
@ -227,7 +227,7 @@ public class ImageProcessingThread extends Thread {
if (pix.w < 100 || pix.h < 100) {
binarized = Leptonica1.pixThresholdToBinary(gaussian, 170);
} else {
binarized = Leptonica1.pixOtsuThreshOnBackgroundNorm(gaussian, null, 50, 50, 165, 10, 100, 5, 5, 0.0f, null);
binarized = Leptonica1.pixOtsuThreshOnBackgroundNorm(gaussian, null, 50, 50, 165, 10, 100, 5, 5, 0.1f, null);
if (binarized == null) { // Sometimes Otsu just fails, then we binarize directly
binarized = Leptonica1.pixThresholdToBinary(gaussian, 170);
}

View File

@ -3,7 +3,7 @@ import org.springframework.boot.gradle.tasks.bundling.BootBuildImage
plugins {
application
id("com.iqser.red.service.java-conventions")
id("org.springframework.boot") version "3.1.5"
id("org.springframework.boot") version "3.2.3"
id("io.spring.dependency-management") version "1.1.3"
id("org.sonarqube") version "4.3.0.3225"
id("io.freefair.lombok") version "8.4"
@ -17,14 +17,14 @@ configurations {
}
}
val springBootStarterVersion = "3.1.5"
val springBootStarterVersion = "3.2.3"
dependencies {
implementation(project(":ocr-service-processor"))
implementation(project(":ocr-service-api"))
implementation("com.knecon.fforesight:tracing-commons:0.5.0")
implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.0.4")
implementation("com.knecon.fforesight:tracing-commons:0.7.0")
implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.1.1")
implementation("org.springframework.boot:spring-boot-starter-amqp:${springBootStarterVersion}")
implementation("net.logstash.logback:logstash-logback-encoder:7.4")
@ -39,7 +39,7 @@ tasks.named<BootBuildImage>("bootBuildImage") {
environment.put("BPE_DELIM_JAVA_TOOL_OPTIONS", " ")
environment.put("BPE_APPEND_JAVA_TOOL_OPTIONS", "-Dfile.encoding=UTF-8")
environment.put("BPE_GS_LIB", "/layers/fagiani_apt/apt/usr/share/ghostscript/9.26/Resource/Init/") // set ghostscript lib path
environment.put("BPE_GS_LIB", "/layers/fagiani_apt/apt/usr/share/ghostscript/9.55.0/Resource/Init/") // set ghostscript lib path, version in path must match version in Aptfile
environment.put("BPE_FONTCONFIG_PATH", "/layers/fagiani_apt/apt/etc/fonts/") // set ghostscript fontconfig path
var aptfile = layout.projectDirectory.file("src/main/resources/Aptfile").toString()
@ -53,7 +53,7 @@ tasks.named<BootBuildImage>("bootBuildImage") {
buildpacks.set(
listOf(
"ghcr.io/fagiani/buildpacks/fagiani_apt@sha256:6471c8c70f32b749e29f65ae562ac0339fecad26aa9217628c00a6c31f197dae",
"ghcr.io/knsita/buildpacks/fagiani_apt@sha256:9771d4d27d8050aee62769490b8882fffc794745c129fb98e1f33196e2c93504",
"ghcr.io/kschuettler/knecon-vcpkg@sha256:ba5e967b124de4865ff7e8f565684f752dd6e97b302e2dcf651283f6a19b98b9",
"ghcr.io/kschuettler/knecon-tessdata@sha256:9062f728aa0340ac963bcdd6f5e740d683823a81d3f480db894da15bff72691a",
"urn:cnb:builder:paketo-buildpacks/java"

View File

@ -6,23 +6,27 @@ import org.springframework.boot.autoconfigure.ImportAutoConfiguration;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.boot.autoconfigure.security.servlet.SecurityAutoConfiguration;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.EnableAspectJAutoProxy;
import org.springframework.context.annotation.Import;
import org.springframework.scheduling.annotation.EnableAsync;
import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService;
import com.iqser.red.pdftronlogic.commons.WatermarkRemovalService;
import com.knecon.fforesight.lifecyclecommons.LifecycleAutoconfiguration;
import com.knecon.fforesight.service.ocr.processor.OcrServiceProcessorConfiguration;
import com.knecon.fforesight.service.ocr.v1.server.queue.MessagingConfiguration;
import com.iqser.red.storage.commons.StorageAutoConfiguration;
import com.knecon.fforesight.tenantcommons.MultiTenancyAutoConfiguration;
import com.knecon.fforesight.tracing.OpenTelemetryConfig;
import io.micrometer.core.aop.TimedAspect;
import io.micrometer.core.instrument.MeterRegistry;
@EnableAsync
@ImportAutoConfiguration({MultiTenancyAutoConfiguration.class})
@ImportAutoConfiguration({MultiTenancyAutoConfiguration.class, LifecycleAutoconfiguration.class})
@SpringBootApplication(exclude = {SecurityAutoConfiguration.class, ManagementWebSecurityAutoConfiguration.class})
@Import({MessagingConfiguration.class, StorageAutoConfiguration.class, OcrServiceProcessorConfiguration.class})
@Import({MessagingConfiguration.class, StorageAutoConfiguration.class, OcrServiceProcessorConfiguration.class, OpenTelemetryConfig.class})
@EnableAspectJAutoProxy
public class Application {
/**

View File

@ -1,5 +1,5 @@
# you can list packages
ghostscript
ghostscript=9.55.0~dfsg1-0ubuntu5.9
pkg-config
zip
unzip
@ -11,6 +11,7 @@ libk5crypto3
libkrb5support0
libkeyutils1
libkrb5-3
libbrotli1
# or include links to specific .deb files
# http://ftp.debian.org/debian/pool/contrib/m/msttcorefonts/ttf-mscorefonts-installer_3.8_all.deb

View File

@ -12,6 +12,9 @@ project.version: 1.0-SNAPSHOT
server:
port: 8080
lifecycle:
base-package: com.knecon.fforesight.service.ocr
spring:
application:
name: ocr-service

View File

@ -6,7 +6,7 @@
"overrides": [
{
"name": "tesseract",
"version": "5.3.4"
"version": "5.3.3"
},
{
"name": "leptonica",

View File

@ -24,11 +24,10 @@ import org.springframework.context.annotation.Primary;
import org.springframework.test.context.junit.jupiter.SpringExtension;
import com.iqser.red.commons.jackson.ObjectMapperFactory;
import com.knecon.fforesight.service.ocr.processor.initializer.NativeLibrariesInitializer;
import com.iqser.red.storage.commons.StorageAutoConfiguration;
import com.iqser.red.storage.commons.service.StorageService;
import com.iqser.red.storage.commons.utils.FileSystemBackedStorageService;
import com.knecon.fforesight.service.ocr.v1.server.queue.OcrMessageSender;
import com.knecon.fforesight.service.ocr.processor.initializer.NativeLibrariesInitializer;
import com.knecon.fforesight.tenantcommons.TenantsClient;
import com.pdftron.pdf.PDFNet;
@ -47,9 +46,6 @@ public class AbstractTest {
@MockBean
private TenantsClient tenantsClient;
@MockBean
private OcrMessageSender ocrMessageSender;
@Autowired
protected StorageService storageService;

View File

@ -10,46 +10,30 @@ import java.io.FileOutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.core.io.ClassPathResource;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.pdftronlogic.commons.ClippingPathStack;
import com.iqser.red.pdftronlogic.commons.ElementFeatures;
import com.iqser.red.pdftronlogic.commons.MarkedContentStack;
import com.knecon.fforesight.service.ocr.processor.service.FileStorageService;
import com.knecon.fforesight.service.ocr.processor.service.OCRService;
import com.iqser.red.service.persistence.service.v1.api.shared.model.dossiertemplate.dossier.file.FileType;
import com.knecon.fforesight.service.ocr.processor.service.OsUtils;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.ColorPt;
import com.pdftron.pdf.ColorSpace;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.GState;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
import com.pdftron.pdf.Rect;
import com.pdftron.sdf.Obj;
import com.pdftron.sdf.SDFDoc;
import com.knecon.fforesight.tenantcommons.TenantContext;
import io.micrometer.prometheus.PrometheusMeterRegistry;
import io.micrometer.prometheus.PrometheusTimer;
import lombok.Builder;
import lombok.SneakyThrows;
//@Disabled // Ghostscript/Tesseract/Leptonica is not available on build server. If you want to run the test install these dependencies. See README.md for help.
@Disabled // Ghostscript/Tesseract/Leptonica is not available on build server. If you want to run the test install these dependencies. See README.md for help.
@SpringBootTest()
public class OcrServiceIntegrationTest extends AbstractTest {
@ -66,14 +50,11 @@ public class OcrServiceIntegrationTest extends AbstractTest {
@Test
public void testOCRMetrics() {
testOCR("files/Watermark.pdf");
testOCR("files/Watermark.pdf");
testOCR("files/Watermark.pdf");
testOCR("files/syngenta/CustomerFiles/SinglePages/Watermark_Page1_10.SYN524464 FS (A16148C) - Absorção cutânea.pdf");
testOCR("files/syngenta/CustomerFiles/SinglePages/Watermark_Page1_10.SYN524464 FS (A16148C) - Absorção cutânea.pdf");
testOCR("files/syngenta/CustomerFiles/SinglePages/Watermark_Page1_10.SYN524464 FS (A16148C) - Absorção cutânea.pdf");
var ocrOnDocumentMeter = registry.getMeters()
.stream()
.filter(m -> m.getId().getName().equalsIgnoreCase("redactmanager_runOcrOnDocument"))
.findAny();
var ocrOnDocumentMeter = registry.getMeters().stream().filter(m -> m.getId().getName().equalsIgnoreCase("redactmanager_runOcrOnDocument")).findAny();
assertThat(ocrOnDocumentMeter.isPresent()).isTrue();
PrometheusTimer timer = (PrometheusTimer) ocrOnDocumentMeter.get();
assertThat(timer.count()).isEqualTo(3);
@ -85,7 +66,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
@SneakyThrows
public void testOcr() {
String text = testOCR("files/13485 cert example 2.pdf");
String text = testOCR("files/402Study.pdf");
}
@ -100,27 +81,27 @@ public class OcrServiceIntegrationTest extends AbstractTest {
@Test
public void testMergeImages() {
// check visually for most of the images containing text, the resulting text is kind of nonsense, just ensure it is there
String text = testOCR("files/merge_images.pdf");
String text = testOCR("files/syngenta/CustomerFiles/SinglePages/merge_images - Page241_18 Chlorothalonil RAR 08 Volume 3CA B 6a Oct 2017.pdf");
assertThat(text).contains("Bodyweight change of dams with live young - group mean values",
"Control",
"mg/g day",
"10 mg/kg/day",
"20 mg/kg/",
"Days",
"50",
"-200",
"—250",
"150",
"200",
"250",
"—150");
"Control",
"mg/g day",
"10 mg/kg/day",
"20 mg/kg/",
"Days",
"50",
"-200",
"—250",
"150",
"200",
"250",
"—150");
}
@Test
public void testOCRWatermark() {
assertThat(testOCR("files/Watermark.pdf")).contains("syngenta");
assertThat(testOCR("files/syngenta/CustomerFiles/SinglePages/Watermark_Page1_10.SYN524464 FS (A16148C) - Absorção cutânea.pdf")).contains("syngenta");
}
@ -157,7 +138,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
@SneakyThrows
public void testOcrForAllDMFiles() {
String dir = "/home/kschuettler/Dokumente/TestFiles/certificates/no-ocr";
String dir = "/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/";
List<File> foundFiles = Files.walk(Path.of(dir))
.sorted(Comparator.comparingLong(this::getFileSize))
.map(Path::toFile)
@ -167,9 +148,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
int fileCount = foundFiles.size();
AtomicInteger processedCount = new AtomicInteger();
System.out.printf("Found %s files, starting OCR for each.%n%n", fileCount);
foundFiles.stream()
.peek(file -> System.out.printf("%s/%s: %s%n", processedCount.getAndIncrement(), fileCount, file))
.forEach(this::testOCRForFile);
foundFiles.stream().peek(file -> System.out.printf("%s/%s: %s%n", processedCount.getAndIncrement(), fileCount, file)).forEach(this::testOCRForFile);
}
@ -203,146 +182,4 @@ public class OcrServiceIntegrationTest extends AbstractTest {
System.out.println("\n\n");
}
@SneakyThrows
@Test
public void testMakeTextVisible() {
File file = new File("/home/kschuettler/Downloads/18-Curacron_ToxicidadeOcularInVitro.pdf");
PDFDoc doc;
try (var in = new FileInputStream(file)) {
doc = new PDFDoc(in);
}
execute(doc, false, false, Collections.emptySet());
try (var out = new FileOutputStream("/tmp/test.pdf")) {
doc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
}
}
@SneakyThrows
private void execute(PDFDoc pdfDoc, boolean delta, boolean removePaths, Set<String> markedContentToIgnore) {
ElementWriter writer = new ElementWriter();
ElementReader reader = new ElementReader();
Set<Long> visitedXObjIds = new TreeSet<>();
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); ) {
Page page = iterator.next();
visitedXObjIds.add(page.getSDFObj().getObjNum());
Context context = Context.builder()
.reader(reader)
.clippingPathStack(new ClippingPathStack(page.getMediaBox()))
.markedContentStack(new MarkedContentStack())
.removePaths(removePaths)
.delta(delta)
.overlappedElements(new ArrayList<>())
.visibleElements(new ArrayList<>())
.visitedXObjIds(visitedXObjIds)
.markedContentToIgnore(markedContentToIgnore)
.build();
removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(page, writer, context);
}
writer.destroy();
reader.destroy();
}
private void removeClippedElementsAndInvisibleTextAndRememberOverlappedElements(Page page, ElementWriter writer, Context context) throws PDFNetException {
context.reader().begin(page);
context.markedContentStack().clear();
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
processElements(writer, context);
writer.end();
context.reader().end();
}
private void processElements(ElementWriter writer, Context context) throws PDFNetException {
for (Element element = context.reader().next(); element != null; element = context.reader().next()) {
if (context.markedContentStack().currentMarkedContentContainsAny(context.markedContentToIgnore()) && element.getType() != Element.e_marked_content_end) {
writer.writeElement(element);
continue;
}
switch (element.getType()) {
case Element.e_text -> processText(element, writer, context);
case Element.e_form -> processForm(element, writer, context);
default -> writer.writeElement(element);
}
}
}
private void processForm(Element formElement, ElementWriter writer, Context context) throws PDFNetException {
writer.writeElement(formElement);
Obj formObj = formElement.getXObject();
if (!context.visitedXObjIds().contains(formObj.getObjNum())) {
context.visitedXObjIds().add(formObj.getObjNum());
// writer needs to be newly initialized when entering a new content stream
// see ElementEditTest in PDFTron (https://www.pdftron.com/documentation/samples/android/java/ElementEditTest)
ElementWriter formWriter = new ElementWriter();
context.reader().formBegin();
formWriter.begin(formObj);
context.reader().clearChangeList();
formWriter.setDefaultGState(context.reader());
processElements(formWriter, context);
formWriter.end();
formWriter.destroy();
context.reader().end();
}
}
private void processText(Element textElement, ElementWriter writer, Context context) throws PDFNetException {
Rect textBBox = textElement.getBBox();
if (textBBox == null) {
writer.writeElement(textElement);
return;
}
GState gState = textElement.getGState();
gState.setFillColorSpace(ColorSpace.createDeviceRGB());
// blue for elements removed due to transparency or not rendered or same color as background
gState.setFillColor(new ColorPt(0, 0, 1));
gState.setTextRenderMode(GState.e_fill_text);
gState.setFillOpacity(1);
writer.writeElement(textElement);
}
@Builder
record Context(
boolean removePaths,
boolean delta,
ElementReader reader,
ClippingPathStack clippingPathStack,
MarkedContentStack markedContentStack,
List<ElementFeatures> overlappedElements,
List<ElementFeatures> visibleElements,
Set<Long> visitedXObjIds,
Set<String> markedContentToIgnore
) {
}
}

View File

@ -15,3 +15,10 @@ management:
health.enabled: true
endpoints.web.exposure.include: prometheus, health, metrics
metrics.export.prometheus.enabled: true
tracing:
enabled: ${TRACING_ENABLED:false}
sampling:
probability: ${TRACING_PROBABILITY:1.0}
otlp:
tracing:
endpoint: ${OTLP_ENDPOINT:http://otel-collector-opentelemetry-collector.otel-collector:4318/v1/traces}

@ -0,0 +1 @@
Subproject commit 9dc6c2337dea32e63aef53271dba0692537c6605

@ -0,0 +1 @@
Subproject commit 21fefb64bf27ca2b3329a6c69d90a27450b17930

View File

@ -1,5 +1,9 @@
#!/bin/bash
set -e
dir=${PWD##*/}
gradle assemble
# Get the current Git branch
@ -11,5 +15,32 @@ commit_hash=$(git rev-parse --short=5 HEAD)
# Combine branch and commit hash
buildName="${USER}-${branch}-${commit_hash}"
gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=$buildName
echo "nexus.knecon.com:5001/ff/${dir}-server:$buildName"
gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=${buildName}
newImageName="nexus.knecon.com:5001/ff/ocr-service-server:$buildName"
echo "full image name:"
echo ${newImageName}
echo ""
if [ -z "$1" ]; then
exit 0
fi
namespace=${1}
deployment_name="ocr-service-v1"
echo "deploying to ${namespace}"
oldImageName=$(rancher kubectl -n ${namespace} get deployment ${deployment_name} -o=jsonpath='{.spec.template.spec.containers[*].image}')
if [ "${newImageName}" = "${oldImageName}" ]; then
echo "Image tag did not change, redeploying..."
rancher kubectl rollout restart deployment ${deployment_name} -n ${namespace}
else
echo "upgrading the image tag..."
rancher kubectl set image deployment/${deployment_name} ${deployment_name}=${newImageName} -n ${namespace}
fi
rancher kubectl rollout status deployment ${deployment_name} -n ${namespace}
echo "Built ${deployment_name}:${buildName} and deployed to ${namespace}"