Compare commits

..

1 Commits

Author SHA1 Message Date
Kilian Schuettler
189bd8e979 RED-9353: adjust deployment name 2024-07-26 11:48:33 +02:00
76 changed files with 935 additions and 3111 deletions

View File

@ -22,5 +22,4 @@ deploy:
rules:
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
- if: $CI_COMMIT_BRANCH =~ /^release/
- if: $CI_COMMIT_BRANCH =~ /^feature/
- if: $CI_COMMIT_TAG

View File

@ -70,7 +70,7 @@ int concurrency = 8;
int batchSize = 128;
boolean debug; // writes the ocr layer visibly to the viewer doc pdf
boolean idpEnabled; // Enables table detection, paragraph classification, section detection, key-value detection.
boolean drawTablesAsLines; // writes the tables to the PDF as invisible lines.
boolean tableDetection; // writes the tables to the PDF as invisible lines.
boolean processAllPages; // if this parameter is set, ocr will be performed on any page, regardless if it has images or not
boolean fontStyleDetection; // Enables bold detection using ghostscript and leptonica
String contentFormat; // Either markdown or text. But, for whatever reason, with markdown enabled, key-values are not written by azure....

View File

@ -0,0 +1,25 @@
package com.knecon.fforesight.service.ocr.v1.api.model;
import java.util.ArrayList;
import java.util.List;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
@Getter
@Builder
@AllArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class AzureAnalyzeResult {
@Builder.Default
List<KeyValuePair> keyValuePairs = new ArrayList<>();
@Builder.Default
List<TextRegion> handWrittenText = new ArrayList<>();
@Builder.Default
List<Figure> figures = new ArrayList<>();
}

View File

@ -1,11 +0,0 @@
package com.knecon.fforesight.service.ocr.v1.api.model;
public enum AzureOcrFeature {
ROTATION_CORRECTION,
IDP,
FONT_STYLE_DETECTION,
ALL_PAGES,
REMOVE_WATERMARKS
}

View File

@ -1,8 +1,6 @@
package com.knecon.fforesight.service.ocr.v1.api.model;
import java.util.Collections;
import java.util.Optional;
import java.util.Set;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
@ -20,13 +18,12 @@ public class DocumentRequest {
String dossierId;
String fileId;
boolean removeWatermark;
String originDocumentId;
String viewerDocId;
String idpResultId;
Set<AzureOcrFeature> features;
boolean removeWatermarks;
public DocumentRequest(String dossierId, String fileId) {
@ -36,23 +33,18 @@ public class DocumentRequest {
originDocumentId = null;
viewerDocId = null;
idpResultId = null;
features = Collections.emptySet();
removeWatermarks = false;
}
// needed for backwards compatibility
public DocumentRequest(String dossierId, String fileId, boolean removeWatermark) {
public DocumentRequest(String dossierId, String fileId, boolean removeWatermarks) {
this.dossierId = dossierId;
this.fileId = fileId;
this.removeWatermarks = removeWatermarks;
originDocumentId = null;
viewerDocId = null;
idpResultId = null;
if (removeWatermark) {
features = Set.of(AzureOcrFeature.REMOVE_WATERMARKS);
} else {
features = Collections.emptySet();
}
}
@ -73,10 +65,4 @@ public class DocumentRequest {
return Optional.ofNullable(originDocumentId);
}
public Set<AzureOcrFeature> getFeatures() {
return features == null ? Collections.emptySet() : features;
}
}

View File

@ -1,11 +1,10 @@
package com.knecon.fforesight.service.ocr.v1.api.model;
import java.util.List;
import java.util.Optional;
import lombok.Builder;
@Builder
public record Figure(TextRegion caption, Region image, List<TextRegion> footnotes) {
public record Figure(Optional<TextRegion> caption, Region image) {
}

View File

@ -1,23 +0,0 @@
package com.knecon.fforesight.service.ocr.v1.api.model;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
public record IdpResult(List<KeyValuePair> keyValuePairs, List<TextRegion> handWrittenText, List<Figure> figures, List<Table> tables) {
public static IdpResult initSynchronized() {
return new IdpResult(Collections.synchronizedList(new LinkedList<>()),
Collections.synchronizedList(new LinkedList<>()),
Collections.synchronizedList(new LinkedList<>()),
Collections.synchronizedList(new LinkedList<>()));
}
public static IdpResult empty() {
return new IdpResult(Collections.emptyList(), Collections.emptyList(), Collections.emptyList(), Collections.emptyList());
}
}

View File

@ -1,8 +1,5 @@
package com.knecon.fforesight.service.ocr.v1.api.model;
import java.util.Collections;
import java.util.Set;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
@ -15,16 +12,9 @@ import lombok.NoArgsConstructor;
public class OCRStatusUpdateResponse {
private String fileId;
private Set<AzureOcrFeature> features;
private int numberOfPagesToOCR;
private int numberOfOCRedPages;
private boolean ocrFinished;
private boolean ocrStarted;
public Set<AzureOcrFeature> getFeatures() {
return features == null ? Collections.emptySet() : features;
}
}

View File

@ -5,107 +5,29 @@ import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.List;
import java.util.Objects;
import java.util.stream.Stream;
import lombok.Getter;
public record QuadPoint(Point2D a, Point2D b, Point2D c, Point2D d) {
public final class QuadPoint {
public enum Direction {
RIGHT,
/*
B _____ C
| |
A|_____|D
*/
DOWN,
/*
* A _____ B
* | |
* D|_____|C
*/
LEFT,
/*
* D _____ A
* | |
* C|_____|B
* */
UP,
/*
* C _____ D
* | |
* B|_____|A
*/
NONE
/*
* ? _____ ?
* | |
* ?|_____|?
*/
}
private static final double THRESHOLD_ANGLE = Math.toRadians(5); // QuadPoint is considered straight, when its angles are below this threshold.
private final Point2D a;
private final Point2D b;
private final Point2D c;
private final Point2D d;
@Getter
private final Direction direction;
// This constructor assumes, the points form a convex polygon, I will omit the assertion for performance reasons.
public QuadPoint(Point2D a, Point2D b, Point2D c, Point2D d) {
this.a = a;
this.b = b;
this.c = c;
this.d = d;
this.direction = calculateDirection();
}
private Direction calculateDirection() {
if (isHorizontal()) {
return a.getX() < d.getX() ? Direction.RIGHT : Direction.LEFT;
}
if (isVertical()) {
return a.getY() < d.getY() ? Direction.UP : Direction.DOWN;
}
return Direction.NONE;
}
/*
B _____ C
| |
A|_____|D
*/
public static QuadPoint fromRectangle2D(Rectangle2D rectangle2D) {
return fromRectangle2D(rectangle2D, Direction.NONE);
}
public static QuadPoint fromRectangle2D(Rectangle2D rectangle2D, Direction direction) {
var lowerLeft = new Point2D.Double(rectangle2D.getX(), rectangle2D.getY());
var upperLeft = new Point2D.Double(rectangle2D.getX(), rectangle2D.getMaxY());
var upperRight = new Point2D.Double(rectangle2D.getMaxX(), rectangle2D.getMaxY());
var lowerRight = new Point2D.Double(rectangle2D.getMaxX(), rectangle2D.getY());
return switch (direction) {
case DOWN -> new QuadPoint(upperLeft, upperRight, lowerRight, lowerLeft);
case LEFT -> new QuadPoint(upperRight, lowerRight, lowerLeft, upperLeft);
case UP -> new QuadPoint(lowerRight, lowerLeft, upperLeft, upperRight);
default -> new QuadPoint(lowerLeft, upperLeft, upperRight, lowerRight);
};
return new QuadPoint(new Point2D.Double(rectangle2D.getX(), rectangle2D.getY()),
new Point2D.Double(rectangle2D.getX(), rectangle2D.getMaxY()),
new Point2D.Double(rectangle2D.getMaxX(), rectangle2D.getMaxY()),
new Point2D.Double(rectangle2D.getMaxX(), rectangle2D.getY()));
}
public static QuadPoint fromPolygons(List<Double> polygon) {
if (polygon.size() != 8) {
throw new AssertionError();
}
assert polygon.size() == 8;
return new QuadPoint(new Point2D.Double(polygon.get(0), polygon.get(1)),
new Point2D.Double(polygon.get(6), polygon.get(7)),
new Point2D.Double(polygon.get(4), polygon.get(5)),
@ -134,23 +56,6 @@ public final class QuadPoint {
}
public boolean isHorizontal() {
double angle = calculateAngle(a, d);
double angle2 = calculateAngle(b, c);
return Math.abs(angle) <= THRESHOLD_ANGLE || Math.abs(angle2) <= THRESHOLD_ANGLE;
}
public boolean isVertical() {
double rightAngle = Math.PI / 2;
double angle = calculateAngle(a, d);
double angle2 = calculateAngle(b, c);
return Math.abs(rightAngle - Math.abs(angle)) <= THRESHOLD_ANGLE || Math.abs(rightAngle - Math.abs(angle2)) <= THRESHOLD_ANGLE;
}
public Stream<Line2D> asLines() {
return Stream.of(new Line2D.Double(a(), b()), new Line2D.Double(b(), c()), new Line2D.Double(c(), d()), new Line2D.Double(d(), a()));
@ -158,7 +63,7 @@ public final class QuadPoint {
}
public QuadPointData toData() {
public QuadPointData data() {
return new QuadPointData(new float[]{(float) a.getX(), (float) a.getY(), (float) b.getX(), (float) b.getY(), (float) c.getX(), (float) c.getY(), (float) d.getX(), (float) d.getY()});
}
@ -170,142 +75,6 @@ public final class QuadPoint {
}
public boolean contains(double x, double y) {
// split into two triangles, test if either contains the point, assumes the QuadPoint is convex and created correctly. More specifically, the points must be in the correct order.
return triangleContains(a, b, c, x, y) || triangleContains(a, c, d, x, y);
}
/*
checks if a triangle contains a point by converting the point to barycentric coordinates using cramer's rule and then checking if the linear combination is within the bounds of the triangle.
https://en.wikipedia.org/wiki/Barycentric_coordinate_system#Barycentric_coordinates_on_triangles
*/
private boolean triangleContains(Point2D a, Point2D b, Point2D c, double x, double y) {
// area of the triangle
double denominator = ((b.getY() - c.getY()) * (a.getX() - c.getX()) + (c.getX() - b.getX()) * (a.getY() - c.getY()));
double invertedDenominator = 1.0 / denominator;
double alpha = ((b.getY() - c.getY()) * (x - c.getX()) + (c.getX() - b.getX()) * (y - c.getY())) * invertedDenominator;
double beta = ((c.getY() - a.getY()) * (x - c.getX()) + (a.getX() - c.getX()) * (y - c.getY())) * invertedDenominator;
return alpha >= 0 && beta >= 0 && alpha + beta <= 1;
}
public boolean contains(Point2D p) {
return contains(p.getX(), p.getY());
}
public boolean contains(Rectangle2D r) {
double x = r.getX();
double y = r.getY();
double maxY = r.getMaxY();
double maxX = r.getMaxX();
Point2D p1 = new Point2D.Double(x, y);
Point2D p2 = new Point2D.Double(x, maxY);
Point2D p3 = new Point2D.Double(maxX, maxY);
Point2D p4 = new Point2D.Double(maxX, y);
return contains(p1) && contains(p2) && contains(p3) && contains(p4);
}
public double getCenterX() {
return (a.getX() + b.getX() + c.getX() + d.getX()) / 4;
}
public double getCenterY() {
return (a.getY() + b.getY() + c.getY() + d.getY()) / 4;
}
public Point2D getCenter() {
return new Point2D.Double(getCenterX(), getCenterY());
}
public boolean intersects(Line2D line) {
return contains(line.getP1()) || contains(line.getP2()) || asLines().anyMatch(qLine -> qLine.intersectsLine(line));
}
public Line2D getRightLine() {
return new Line2D.Double(getTopRight(), getLowerRight());
}
public Line2D getLeftLine() {
return new Line2D.Double(getTopLeft(), getLowerLeft());
}
public Line2D getBottomLine() {
return new Line2D.Double(getLowerLeft(), getLowerRight());
}
public Line2D getTopLine() {
return new Line2D.Double(getTopLeft(), getTopRight());
}
public Point2D getTopLeft() {
return switch (direction) {
case DOWN -> a;
case LEFT -> d;
case UP -> c;
default -> b;
};
}
public Point2D getTopRight() {
return switch (direction) {
case DOWN -> b;
case LEFT -> a;
case UP -> d;
default -> c;
};
}
public Point2D getLowerRight() {
return switch (direction) {
case DOWN -> c;
case LEFT -> b;
case UP -> a;
default -> d;
};
}
public Point2D getLowerLeft() {
return switch (direction) {
case DOWN -> d;
case LEFT -> c;
case UP -> b;
default -> a;
};
}
/**
* Determines if the given QuadPoint aligns with this QuadPoint within a given threshold.
* It does os by trying every possible combination of aligning sides. It starts with the most likely combination of ab and cd.
@ -365,56 +134,17 @@ public final class QuadPoint {
}
public double getRectangularSize() {
public double size() {
return a().distance(b()) * a().distance(d());
}
public double getAngle() {
return calculateAngle(a, d);
}
private static double calculateAngle(Point2D a, Point2D d) {
public double angle() {
double deltaY = d.getY() - a.getY();
double deltaX = d.getX() - a.getX();
return Math.atan2(deltaY, deltaX);
}
public Point2D a() {return a;}
public Point2D b() {return b;}
public Point2D c() {return c;}
public Point2D d() {return d;}
@Override
public boolean equals(Object obj) {
if (obj == this) {
return true;
}
if (obj == null || obj.getClass() != this.getClass()) {
return false;
}
var that = (QuadPoint) obj;
return Objects.equals(this.a, that.a) && Objects.equals(this.b, that.b) && Objects.equals(this.c, that.c) && Objects.equals(this.d, that.d);
}
@Override
public int hashCode() {
return Objects.hash(a, b, c, d);
}
}

View File

@ -5,9 +5,4 @@ import lombok.Builder;
@Builder
public record QuadPointData(float[] values) {
public QuadPoint get() {
return QuadPoint.fromData(this);
}
}

View File

@ -1,7 +0,0 @@
package com.knecon.fforesight.service.ocr.v1.api.model;
import java.util.List;
public record Table(TextRegion caption, int numberOfCols, int numberOfRows, List<TableCell> cells, List<TextRegion> footnotes, List<Region> bboxes) {
}

View File

@ -1,5 +0,0 @@
package com.knecon.fforesight.service.ocr.v1.api.model;
public record TableCell(TextRegion textRegion, int row, int col, TableCellType kind) {
}

View File

@ -1,5 +0,0 @@
package com.knecon.fforesight.service.ocr.v1.api.model;
public enum TableCellType {
ROW_HEADER, COLUMN_HEADER, CONTENT, STUB_HEAD, DESCRIPTION
}

View File

@ -10,18 +10,19 @@ configurations {
}
dependencies {
implementation(project(":azure-ocr-service-api"))
implementation("net.sourceforge.tess4j:tess4j:5.8.0")
implementation("com.iqser.red.commons:metric-commons:2.1.0")
implementation("com.pdftron:PDFNet:11.0.0")
implementation("org.apache.pdfbox:pdfbox:3.0.0")
implementation("org.apache.commons:commons-math3:3.6.1")
implementation("com.amazonaws:aws-java-sdk-kms:1.12.440")
implementation("com.google.guava:guava:31.1-jre")
implementation("com.knecon.fforesight:viewer-doc-processor:0.193.0")
implementation("com.azure:azure-ai-documentintelligence:1.0.0")
implementation("com.iqser.red.commons:pdftron-logic-commons:2.32.0")
api(project(":azure-ocr-service-api"))
api("com.iqser.red.service:persistence-service-internal-api-v1:2.224.0")
api("net.sourceforge.tess4j:tess4j:5.8.0")
api("com.iqser.red.commons:metric-commons:2.1.0")
api("com.iqser.red.commons:storage-commons:2.49.0")
api("com.knecon.fforesight:tenant-commons:0.21.0")
api("com.pdftron:PDFNet:10.7.0")
api("org.apache.pdfbox:pdfbox:3.0.0")
api("org.apache.commons:commons-math3:3.6.1")
api("com.amazonaws:aws-java-sdk-kms:1.12.440")
api("com.google.guava:guava:31.1-jre")
api("com.iqser.red.commons:pdftron-logic-commons:2.27.0")
api("com.knecon.fforesight:viewer-doc-processor:0.148.0")
api("com.azure:azure-ai-documentintelligence:1.0.0-beta.3")
testImplementation("org.junit.jupiter:junit-jupiter:5.8.1")
}

View File

@ -6,8 +6,6 @@ import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.ComponentScan;
import org.springframework.context.annotation.Configuration;
import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService;
import com.iqser.red.pdftronlogic.commons.WatermarkRemovalService;
import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService;
import io.micrometer.observation.ObservationRegistry;
@ -24,18 +22,4 @@ public class OcrServiceProcessorConfiguration {
return new PDFTronViewerDocumentService(registry);
}
@Bean
public InvisibleElementRemovalService invisibleElementRemovalService() {
return new InvisibleElementRemovalService();
}
@Bean
public WatermarkRemovalService watermarkRemovalService() {
return new WatermarkRemovalService();
}
}

View File

@ -11,16 +11,16 @@ import lombok.experimental.FieldDefaults;
@FieldDefaults(level = AccessLevel.PRIVATE)
public class OcrServiceSettings {
// Limits the number of concurrent calls to azure
int concurrency = 2;
// Limits the number of concurrent calls to the azure API. In my very rudimentary testing, azure starts throwing "too many requests" errors at around 80/s. Higher numbers greatly improve the speed.
int concurrency = 8;
// Limits the number of pages per call.
int batchSize = 32;
int batchSize = 128;
boolean debug; // writes the ocr layer visibly to the viewer doc pdf
boolean drawTablesAsLines; // writes the tables to the PDF as invisible lines.
boolean snuggify = true; // attempts to shrink the word boxes returned by azure to fit the actual word pixels snug
boolean useCaches; // skips azure api, pdf rendering and image processing, when the files are already present
boolean azureFontStyleDetection; // omits all image processing and uses azures FONT_STYLE feature (costs 0.6ct per page)
boolean idpEnabled; // Enables table detection, paragraph classification, section detection, key-value detection.
boolean tableDetection; // writes the tables to the PDF as invisible lines.
boolean processAllPages; // if this parameter is set, ocr will be performed on any page, regardless if it has images or not
boolean fontStyleDetection; // Enables bold detection using ghostscript and leptonica
String contentFormat; // Either markdown or text. But, for whatever reason, with markdown enabled, key-values are not written by azure....
}

View File

@ -7,7 +7,6 @@ import com.pdftron.pdf.PDFNet;
import com.sun.jna.NativeLibrary;
import jakarta.annotation.PostConstruct;
import lombok.AllArgsConstructor;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@ -15,14 +14,11 @@ import lombok.extern.slf4j.Slf4j;
@Slf4j
@Component
@RequiredArgsConstructor
@AllArgsConstructor
public class NativeLibrariesInitializer {
@Value("${pdftron.license:}")
private String pdftronLicense;
@Value("${native-libs.path:}")
private String nativeLibsPath;
@SneakyThrows
@PostConstruct
@ -34,8 +30,8 @@ public class NativeLibrariesInitializer {
PDFNet.setTempPath("/tmp/pdftron");
PDFNet.initialize(pdftronLicense);
log.info("Setting jna.library.path: {}", nativeLibsPath);
System.setProperty("jna.library.path", nativeLibsPath);
log.info("Setting jna.library.path: {}", System.getenv("VCPKG_DYNAMIC_LIB"));
System.setProperty("jna.library.path", System.getenv("VCPKG_DYNAMIC_LIB"));
log.info("Asserting Native Libraries loaded");

View File

@ -1,102 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.model;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.stream.Collectors;
import com.azure.ai.documentintelligence.models.AnalyzeResult;
import com.azure.ai.documentintelligence.models.DocumentPage;
import com.azure.ai.documentintelligence.models.DocumentSpan;
import com.azure.ai.documentintelligence.models.DocumentWord;
public class DocumentSpanLookup {
List<PageSpanLookup> documentWordLookup;
public DocumentSpanLookup(AnalyzeResult analyzeResult) {
documentWordLookup = new ArrayList<>(analyzeResult.getPages().size());
int offset = 0;
for (DocumentPage page : analyzeResult.getPages()) {
if (page.getWords() == null || page.getWords().isEmpty()) {
documentWordLookup.add(new PageSpanLookup(offset, offset, null));
}
int start = page.getWords()
.get(0).getSpan().getOffset();
DocumentSpan span = page.getWords()
.get(page.getWords().size() - 1).getSpan();
int end = span.getOffset() + span.getLength();
SpanLookup<DocumentWord> pageWords = new SpanLookup<>(page.getWords()
.stream(), DocumentWord::getSpan);
documentWordLookup.add(new PageSpanLookup(start, end, pageWords));
offset = end + 1;
}
}
public List<WordOnPage> findWordsOnPages(DocumentSpan documentSpan) {
if (documentSpan == null) {
return Collections.emptyList();
}
int firstSmallerIdx = findIdxOfFirstSmallerObject(documentSpan);
PageSpanLookup firstPage = documentWordLookup.get(firstSmallerIdx);
List<WordOnPage> wordsOnPages = new ArrayList<>();
for (int pageNumber = firstSmallerIdx; pageNumber < documentWordLookup.size(); pageNumber++) {
PageSpanLookup page = documentWordLookup.get(pageNumber);
if (page.end >= documentSpan.getOffset()) {
break;
}
firstPage.wordSpanLookup.findElementsContainedInSpan(documentSpan)
.stream()
.map(documentWord -> new WordOnPage(documentWord, firstSmallerIdx))
.forEach(wordsOnPages::add);
}
return wordsOnPages;
}
private int findIdxOfFirstSmallerObject(DocumentSpan documentSpan) {
int idx = Collections.binarySearch(documentWordLookup, new PageSpanLookup(documentSpan.getOffset(), -1, null), Comparator.comparing(PageSpanLookup::start));
if (idx >= 0) {
return idx;
} else {
int insertionPoint = -(idx + 1);
if (insertionPoint == 0) {
return -1;
}
var lastSmaller = documentWordLookup.get(insertionPoint - 1);
for (int resultIdx = insertionPoint - 2; resultIdx >= 0; resultIdx--) {
if (documentWordLookup.get(resultIdx).compareTo(lastSmaller) == 0) {
return resultIdx + 1;
}
}
return 0;
}
}
public record WordOnPage(DocumentWord documentWord, int pageNumber) {
}
private record PageSpanLookup(int start, int end, SpanLookup<DocumentWord> wordSpanLookup) implements Comparable<PageSpanLookup> {
@Override
public int compareTo(PageSpanLookup o) {
return Integer.compare(start, o.start);
}
}
}

View File

@ -1,7 +1,5 @@
package com.knecon.fforesight.service.ocr.processor.model;
import java.io.File;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
@ -12,10 +10,4 @@ public record ImageFile(int pageNumber, String absoluteFilePath) {
return Leptonica1.pixRead(absoluteFilePath);
}
public boolean exists() {
return new File(absoluteFilePath).exists();
}
}

View File

@ -2,129 +2,29 @@ package com.knecon.fforesight.service.ocr.processor.model;
import static com.knecon.fforesight.service.ocr.processor.utils.ListSplittingUtils.formatIntervals;
import java.io.File;
import java.io.FileInputStream;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Consumer;
import com.azure.ai.documentintelligence.models.AnalyzeResult;
import com.azure.core.util.BinaryData;
import com.azure.json.JsonOptions;
import com.azure.json.JsonReader;
import com.azure.json.implementation.DefaultJsonReader;
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.GhostScriptService;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.NonNull;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public final class PageBatch implements Comparable<PageBatch> {
@Getter
int index;
@NonNull
List<Integer> batchPageToOriginPageLookup;
@NonNull
@Getter
Path batchDoc;
@NonNull
@Getter
Path batchDir;
@SneakyThrows
public AnalyzeResult getAzureResultCache() {
try (var in = new FileInputStream(getAzureResultCacheFile()); JsonReader reader = DefaultJsonReader.fromStream(in, new JsonOptions());) {
return AnalyzeResult.fromJson(reader);
}
}
@SneakyThrows
public File getAzureResultCacheFile() {
return batchDir.resolve("analyzeResult.json").toFile();
}
public List<ImageFile> getRenderedImageFiles() {
List<ImageFile> renderedImageFiles = new ArrayList<>();
for (int i = 0; i < batchPageToOriginPageLookup.size(); i++) {
renderedImageFiles.add(getRenderedImageFile(batchPageToOriginPageLookup.get(i), i + 1));
}
return renderedImageFiles;
}
public ImageFile getRenderedImageFile(int pageNumber, int numberInBatch) {
return new ImageFile(pageNumber, getRenderedImageNameFormat().formatted(numberInBatch));
}
public ImageFile getProcessedImageFile(int pageNumber, int numberInBatch) {
return new ImageFile(pageNumber, getProcessedImageNameFormat().formatted(numberInBatch));
}
public List<ImageFile> getProcessedImageFiles() {
List<ImageFile> processedImageFiles = new ArrayList<>();
for (int i = 0; i < batchPageToOriginPageLookup.size(); i++) {
processedImageFiles.add(getProcessedImageFile(batchPageToOriginPageLookup.get(i), i + 1));
}
return processedImageFiles;
}
public String getRenderedImageNameFormat() {
return getRenderedImageDir().resolve(getImageFormat()).toFile().toString();
}
public String getProcessedImageNameFormat() {
return getProcessedImageDir().resolve(getImageFormat()).toFile().toString();
}
private String getImageFormat() {
return "output_" + index + ".%04d" + GhostScriptService.FORMAT;
}
public Path getRenderedImageDir() {
return batchDir.resolve("rendered");
}
public Path getProcessedImageDir() {
return batchDir.resolve("processed");
}
List<Integer> lookup = new ArrayList<>();
@Override
public String toString() {
if (size() == 1) {
return String.format("%d", batchPageToOriginPageLookup.get(0));
return String.format("%d", lookup.get(0));
}
List<String> intervals = formatIntervals(batchPageToOriginPageLookup);
List<String> intervals = formatIntervals(lookup);
if (intervals.size() > 4) {
intervals = intervals.subList(0, 4);
intervals.add("...");
@ -134,54 +34,54 @@ public final class PageBatch implements Comparable<PageBatch> {
}
public void add(Integer pageNumber) {
lookup.add(pageNumber);
}
public void forEach(Consumer<? super Integer> consumer) {
batchPageToOriginPageLookup.forEach(consumer);
lookup.forEach(consumer);
}
public List<Integer> getAllPageNumbers() {
return batchPageToOriginPageLookup;
return lookup;
}
public int size() {
return batchPageToOriginPageLookup.size();
return lookup.size();
}
public boolean isEmpty() {
return batchPageToOriginPageLookup.isEmpty();
return lookup.isEmpty();
}
public int getPageNumber(int pageNumber) {
return batchPageToOriginPageLookup.get(pageNumber - 1);
return lookup.get(pageNumber - 1);
}
@Override
public int compareTo(PageBatch o) {
if (batchPageToOriginPageLookup.isEmpty() && o.batchPageToOriginPageLookup.isEmpty()) {
if (lookup.isEmpty() && o.lookup.isEmpty()) {
return 0;
} else if (batchPageToOriginPageLookup.isEmpty()) {
} else if (lookup.isEmpty()) {
return 1;
} else if (o.batchPageToOriginPageLookup.isEmpty()) {
} else if (o.lookup.isEmpty()) {
return -1;
}
return Integer.compare(batchPageToOriginPageLookup.get(0), o.batchPageToOriginPageLookup.get(0));
}
public BinaryData render() {
return BinaryData.fromFile(batchDoc);
return Integer.compare(lookup.get(0), o.lookup.get(0));
}
}

View File

@ -13,19 +13,17 @@ import com.pdftron.pdf.Rect;
import lombok.SneakyThrows;
public record PageInformation(Rectangle2D mediabox, Rectangle2D cropBox, int number, int rotationDegrees, List<Rectangle2D> wordBBoxes) {
public record PageInformation(Rectangle2D mediabox, int number, int rotationDegrees, List<Rectangle2D> wordBBoxes) {
@SneakyThrows
public static Map<Integer, PageInformation> fromPDFDoc(PDFDoc pdfDoc) {
ConcurrentHashMap<Integer, PageInformation> pageInformationMap = new ConcurrentHashMap<>();
int pageNumber = 1;
try (PageIterator iterator = pdfDoc.getPageIterator()) {
while (iterator.hasNext()) {
Page page = iterator.next();
pageInformationMap.put(pageNumber, PageInformation.fromPage(pageNumber, page));
pageNumber++;
}
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); pageNumber++) {
Page page = iterator.next();
pageInformationMap.put(pageNumber, PageInformation.fromPage(pageNumber, page));
}
return pageInformationMap;
}
@ -34,9 +32,8 @@ public record PageInformation(Rectangle2D mediabox, Rectangle2D cropBox, int num
@SneakyThrows
public static PageInformation fromPage(int pageNum, Page page) {
try (Rect mediaBox = page.getCropBox(); Rect cropBox = page.getCropBox()) {
try (Rect mediaBox = page.getCropBox()) {
return new PageInformation(new Rectangle2D.Double(mediaBox.getX1(), mediaBox.getY1(), mediaBox.getWidth(), mediaBox.getHeight()),
new Rectangle2D.Double(cropBox.getX1(), cropBox.getY1(), cropBox.getWidth(), cropBox.getHeight()),
pageNum,
page.getRotation() * 90,
DocumentTextExtractor.getTextBBoxes(page));

View File

@ -162,7 +162,7 @@ public class Statistics {
return batchStats.values()
.stream()
.mapToLong(BatchStats::getMappingResultDuration)
.mapToLong(BatchStats::getWritingTextDuration)
.toArray();
}

View File

@ -3,6 +3,7 @@ package com.knecon.fforesight.service.ocr.processor.model;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import com.azure.ai.documentintelligence.models.DocumentWord;
import com.knecon.fforesight.service.ocr.processor.visualizations.fonts.FontMetrics;
import com.knecon.fforesight.service.ocr.processor.visualizations.fonts.FontMetricsProvider;
import com.knecon.fforesight.service.ocr.processor.visualizations.fonts.FontStyle;
@ -19,8 +20,7 @@ public class TextPositionInImage {
final QuadPoint position;
final String text;
final AffineTransform resultToPageTransform;
final boolean snugBBox;
final AffineTransform imageCTM;
@Setter
boolean overlapsIgnoreZone;
@ -30,34 +30,33 @@ public class TextPositionInImage {
FontStyle fontStyle;
public TextPositionInImage(QuadPoint position, String text, AffineTransform resultToPageTransform, FontMetricsProvider fontMetricsProvider, FontStyle fontStyle, boolean snugBBox) {
public TextPositionInImage(DocumentWord word, AffineTransform imageCTM, FontMetricsProvider fontMetricsProvider, FontStyle fontStyle) {
this.position = position;
this.text = text;
this.resultToPageTransform = resultToPageTransform;
this.position = QuadPoint.fromPolygons(word.getPolygon());
this.text = word.getContent();
this.imageCTM = imageCTM;
this.fontMetricsProvider = fontMetricsProvider;
this.fontStyle = fontStyle;
this.snugBBox = snugBBox;
}
public QuadPoint getTransformedTextBBox() {
return position.getTransformed(resultToPageTransform);
return position.getTransformed(imageCTM);
}
public AffineTransform getTextMatrix() {
FontMetrics metrics = getMetrics();
FontMetrics metrics = fontMetricsProvider.calculateMetrics(text, getTransformedWidth(), getTransformedHeight());
// Matrix multiplication is from right to left:
// convert to image coords -> subtract descent -> scale height -> reverse imageCTM scaling -> translate to coordinates in image -> convert to pdf coords
// width must not be set, since it is scaled with the fontsize attribute
double rotation = position.getAngle();
double rotation = position.angle();
Point2D anchor = new Point2D.Double(position.b().getX(), position.b().getY());
AffineTransform ctm = new AffineTransform();
ctm.concatenate(resultToPageTransform);
ctm.concatenate(imageCTM);
ctm.translate(anchor.getX(), anchor.getY());
ctm.scale(getWidth() / getTransformedWidth(),
getHeight() / getTransformedHeight()); // scale with transformation coefficient, such that fontsize may be set with transformed width.
@ -70,15 +69,6 @@ public class TextPositionInImage {
}
private FontMetrics getMetrics() {
if (snugBBox) {
return fontMetricsProvider.calculateMetricsForTightBBox(text, getTransformedWidth(), getTransformedHeight());
}
return fontMetricsProvider.calculateMetricsForAzureBBox(text, getTransformedWidth(), getTransformedHeight());
}
public double getFontSize() {
// The fontsize as estimated by the word width
return fontMetricsProvider.calculateFontSize(text, getTransformedWidth());
@ -105,7 +95,7 @@ public class TextPositionInImage {
public double getFontSizeByHeight() {
// The fontsize as estimated by the word height, only used for font style detection
var metrics = getMetrics();
var metrics = fontMetricsProvider.calculateMetrics(text, getTransformedWidth(), getTransformedHeight());
return fontMetricsProvider.calculateFontSize(text, getTransformedWidth()) * metrics.getHeightScaling();
}
@ -118,25 +108,25 @@ public class TextPositionInImage {
public Point2D transformedA() {
return resultToPageTransform.transform(position.a(), null);
return imageCTM.transform(position.a(), null);
}
public Point2D transformedB() {
return resultToPageTransform.transform(position.b(), null);
return imageCTM.transform(position.b(), null);
}
public Point2D transformedC() {
return resultToPageTransform.transform(position.c(), null);
return imageCTM.transform(position.c(), null);
}
public Point2D transformedD() {
return resultToPageTransform.transform(position.d(), null);
return imageCTM.transform(position.d(), null);
}
}

View File

@ -1,25 +1,23 @@
package com.knecon.fforesight.service.ocr.processor.service;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.function.Supplier;
import org.slf4j.MDC;
import org.springframework.stereotype.Service;
import com.azure.ai.documentintelligence.models.AnalyzeResult;
import com.azure.core.util.BinaryData;
import com.azure.core.util.polling.LongRunningOperationStatus;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.knecon.fforesight.service.ocr.processor.OcrServiceSettings;
import com.knecon.fforesight.service.ocr.processor.model.PageBatch;
import com.knecon.fforesight.service.ocr.processor.model.PageInformation;
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.ImageProcessingPipeline;
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.ImageProcessingSupervisor;
import com.knecon.fforesight.service.ocr.processor.visualizations.layers.LayerFactory;
import com.knecon.fforesight.service.ocr.processor.visualizations.layers.OcrResult;
import com.knecon.fforesight.service.ocr.v1.api.model.AzureOcrFeature;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.sdf.SDFDoc;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
@ -36,13 +34,16 @@ public class AsyncOcrService {
AzureOcrResource azureOcrResource;
OcrServiceSettings settings;
ImageProcessingPipeline imageProcessingPipeline;
ObjectMapper mapper;
public OcrResult awaitOcr(PDFDoc pdfDoc, OcrExecutionSupervisor supervisor, Set<AzureOcrFeature> features, List<PageBatch> batches) throws InterruptedException {
public OcrResult awaitOcr(PDFDoc pdfDoc,
OcrExecutionSupervisor supervisor,
Set<Integer> pagesWithImages,
ImageProcessingSupervisor imageSupervisor) throws InterruptedException, PDFNetException {
LayerFactory layerFactory = new LayerFactory(settings, features, supervisor, PageInformation.fromPDFDoc(pdfDoc), imageProcessingPipeline);
LayerFactory layerFactory = new LayerFactory(settings, supervisor, imageSupervisor, PageInformation.fromPDFDoc(pdfDoc));
List<PageBatch> batches = splitIntoBatches(pdfDoc, supervisor, pagesWithImages);
for (PageBatch batch : batches) {
@ -55,10 +56,12 @@ public class AsyncOcrService {
supervisor.requireNoErrors();
batchContext.batchStats().start();
BinaryData data = batch.render();
BinaryData data = renderBatch(pdfDoc, batch);
batchContext.batchStats().batchRenderFinished();
beginAnalysis(data, batchContext, features);
beginAnalysis(data, batchContext);
}
supervisor.awaitAllPagesProcessed();
@ -67,21 +70,44 @@ public class AsyncOcrService {
}
private void beginAnalysis(BinaryData data, BatchContext batchContext, Set<AzureOcrFeature> features) throws InterruptedException {
private static BinaryData renderBatch(PDFDoc pdfDoc, PageBatch batch) throws PDFNetException {
if (settings.isUseCaches() && batchContext.batch().getAzureResultCacheFile().exists()) {
handleCached(batchContext);
BinaryData docData;
try (var smallerDoc = extractBatchDocument(pdfDoc, batch)) {
docData = BinaryData.fromBytes(smallerDoc.save(SDFDoc.SaveMode.LINEARIZED, null));
}
return docData;
}
private List<PageBatch> splitIntoBatches(PDFDoc pdfDoc, OcrExecutionSupervisor supervisor, Set<Integer> pagesWithImages) throws PDFNetException {
List<PageBatch> batches = new ArrayList<>();
PageBatch currentBatch = new PageBatch();
batches.add(currentBatch);
for (int pageNumber = 1; pageNumber <= pdfDoc.getPageCount(); pageNumber++) {
if (!settings.isProcessAllPages() && !pagesWithImages.contains(pageNumber)) {
supervisor.logPageSkipped(pageNumber);
continue;
}
currentBatch.add(pageNumber);
if (currentBatch.size() == settings.getBatchSize()) {
currentBatch = new PageBatch();
batches.add(currentBatch);
}
}
return batches;
}
private void beginAnalysis(BinaryData data, BatchContext batchContext) throws InterruptedException {
batchContext.supervisor.enterConcurrency(batchContext.batch);
batchContext.supervisor.logUploadStart(batchContext.batch, data.getLength());
var mdcContext = MDC.getCopyOfContextMap();
azureOcrResource.callAzureAsync(data, features)
azureOcrResource.callAzureAsync(data)
.flatMap(response -> {
MDC.setContextMap(mdcContext);
if (response.getStatus().equals(LongRunningOperationStatus.IN_PROGRESS)) {
batchContext.supervisor.logInProgress(batchContext.batch);
}
@ -91,62 +117,54 @@ public class AsyncOcrService {
if (LongRunningOperationStatus.SUCCESSFULLY_COMPLETED == response.getStatus()) {
return response.getFinalResult();
}
String message = "Polling completed unsuccessfully with status: " + response.getStatus();
log.error(message);
return Mono.error(new IllegalStateException(message));
return Mono.error(new IllegalStateException("Polling completed unsuccessfully with status: " + response.getStatus()));
}).subscribe(finalResult -> handleSuccessful(finalResult, batchContext),//
ex -> handleError(ex, batchContext),//
() -> handleCompleted(batchContext));
}
@SneakyThrows
private static void handleCached(BatchContext batchContext) {
var mdcContext = MDC.getCopyOfContextMap();
Thread thread = new Thread(() -> {
MDC.setContextMap(mdcContext);
log.info("Batch {}: Using cached ocr result", batchContext.batch.getIndex());
batchContext.batchStats().finishUpload();
batchContext.batchStats().finishApiWait();
batchContext.supervisor.logPageSuccess(batchContext.batch());
try {
batchContext.layerFactory.processAnalyzeResult(batchContext.batch(), batchContext.batch().getAzureResultCache());
} catch (InterruptedException e) {
batchContext.supervisor.logPageError(batchContext.batch, e);
}
});
thread.start();
}
private static void handleCompleted(BatchContext batchContext) {
log.info("Batch {}: Completed with pages {}", batchContext.batch.getIndex(), batchContext.batch);
batchContext.supervisor.leaveConcurrency(batchContext.batch);
}
private void handleError(Throwable ex, BatchContext batchContext) {
batchContext.supervisor.leaveConcurrency(batchContext.batch);
batchContext.supervisor.logPageError(batchContext.batch, ex);
}
private void handleSuccessful(AnalyzeResult finalResult, BatchContext batchContext) {
batchContext.supervisor.leaveConcurrency(batchContext.batch);
try {
mapper.writeValue(batchContext.batch().getAzureResultCacheFile(), finalResult);
batchContext.supervisor.logPageSuccess(batchContext.batch());
batchContext.layerFactory.processAnalyzeResult(batchContext.batch(), finalResult);
batchContext.layerFactory.addAnalyzeResult(batchContext.batch, finalResult);
batchContext.supervisor.logPageSuccess(batchContext.batch);
} catch (Exception e) {
handleError(e, batchContext);
}
}
private static PDFDoc extractBatchDocument(PDFDoc pdfDoc, PageBatch pageBatch) throws PDFNetException {
if (pageBatch.size() < 0) {
throw new IllegalArgumentException();
}
PDFDoc singlePagePdfDoc = new PDFDoc();
pageBatch.forEach(pageNumber -> addPageToNewDoc(pageNumber, pdfDoc, singlePagePdfDoc));
return singlePagePdfDoc;
}
@SneakyThrows
private static void addPageToNewDoc(Integer pageNumber, PDFDoc pdfDoc, PDFDoc singlePagePdfDoc) {
singlePagePdfDoc.pagePushBack(pdfDoc.getPage(pageNumber));
}
private record BatchContext(LayerFactory layerFactory, OcrExecutionSupervisor supervisor, PageBatch batch) {
BatchStats batchStats() {

View File

@ -2,25 +2,23 @@ package com.knecon.fforesight.service.ocr.processor.service;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import com.azure.ai.documentintelligence.DocumentIntelligenceAsyncClient;
import com.azure.ai.documentintelligence.DocumentIntelligenceClientBuilder;
import com.azure.ai.documentintelligence.models.AnalyzeDocumentOptions;
import com.azure.ai.documentintelligence.models.AnalyzeOperationDetails;
import com.azure.ai.documentintelligence.models.AnalyzeDocumentRequest;
import com.azure.ai.documentintelligence.models.AnalyzeResult;
import com.azure.ai.documentintelligence.models.AnalyzeResultOperation;
import com.azure.ai.documentintelligence.models.ContentFormat;
import com.azure.ai.documentintelligence.models.DocumentAnalysisFeature;
import com.azure.ai.documentintelligence.models.DocumentContentFormat;
import com.azure.ai.documentintelligence.models.StringIndexType;
import com.azure.core.credential.AzureKeyCredential;
import com.azure.core.util.BinaryData;
import com.azure.core.util.polling.PollerFlux;
import com.google.common.base.Objects;
import com.knecon.fforesight.service.ocr.processor.OcrServiceSettings;
import com.knecon.fforesight.service.ocr.v1.api.model.AzureOcrFeature;
import lombok.AccessLevel;
import lombok.SneakyThrows;
@ -44,48 +42,43 @@ public class AzureOcrResource {
@SneakyThrows
public PollerFlux<AnalyzeOperationDetails, AnalyzeResult> callAzureAsync(BinaryData data, Set<AzureOcrFeature> features) {
public PollerFlux<AnalyzeResultOperation, AnalyzeResult> callAzureAsync(BinaryData data) {
AnalyzeDocumentOptions analyzeDocumentOptions = new AnalyzeDocumentOptions(data.toBytes());
analyzeDocumentOptions.setStringIndexType(StringIndexType.UTF16_CODE_UNIT);
analyzeDocumentOptions.setDocumentAnalysisFeatures(buildFeatures(features));
analyzeDocumentOptions.setOutputContentFormat(buildContentFormat());
return asyncClient.beginAnalyzeDocument(getModelId(features), analyzeDocumentOptions);
AnalyzeDocumentRequest analyzeRequest = new AnalyzeDocumentRequest().setBase64Source(data.toBytes());
return asyncClient.beginAnalyzeDocument(getModelId(), null, null, StringIndexType.UTF16CODE_UNIT, buildFeatures(), null, buildContentFormat(), analyzeRequest);
}
private DocumentContentFormat buildContentFormat() {
private ContentFormat buildContentFormat() {
if (Objects.equal(settings.getContentFormat(), "markdown")) {
return DocumentContentFormat.MARKDOWN;
return ContentFormat.MARKDOWN;
}
return DocumentContentFormat.TEXT;
return ContentFormat.TEXT;
}
private String getModelId(Set<AzureOcrFeature> features) {
private String getModelId() {
if (features.contains(AzureOcrFeature.IDP)) {
if (settings.isIdpEnabled()) {
return "prebuilt-layout";
}
return "prebuilt-read";
}
private List<DocumentAnalysisFeature> buildFeatures(Set<AzureOcrFeature> features) {
private List<DocumentAnalysisFeature> buildFeatures() {
var azureFeatures = new ArrayList<DocumentAnalysisFeature>();
var features = new ArrayList<DocumentAnalysisFeature>();
if (features.contains(AzureOcrFeature.IDP)) {
azureFeatures.add(DocumentAnalysisFeature.KEY_VALUE_PAIRS);
if (settings.isIdpEnabled()) {
features.add(DocumentAnalysisFeature.KEY_VALUE_PAIRS);
}
if (settings.isAzureFontStyleDetection() && features.contains(AzureOcrFeature.FONT_STYLE_DETECTION)) {
azureFeatures.add(DocumentAnalysisFeature.STYLE_FONT);
}
azureFeatures.add(DocumentAnalysisFeature.BARCODES);
features.add(DocumentAnalysisFeature.BARCODES);
return azureFeatures;
return features;
}
}

View File

@ -1,144 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.service;
import static com.knecon.fforesight.service.ocr.processor.utils.ListSplittingUtils.formatIntervals;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.ocr.processor.OcrServiceSettings;
import com.knecon.fforesight.service.ocr.processor.model.PageBatch;
import com.knecon.fforesight.service.ocr.v1.api.model.AzureOcrFeature;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.Optimizer;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.sdf.SDFDoc;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class BatchFactory {
OcrServiceSettings settings;
@SneakyThrows
public List<PageBatch> splitIntoBatches(PDFDoc pdfDoc, OcrExecutionSupervisor supervisor, Set<AzureOcrFeature> features, Path runDir) {
Set<Integer> pagesToProcess = findPagesToProcess(pdfDoc, features);
supervisor.logImageExtractionFinished(pdfDoc.getPageCount(), pagesToProcess.size());
List<PageBatch> batches = buildBatches(pdfDoc, supervisor, features, runDir, pagesToProcess);
if (batches.size() > 1) {
log.info("Split {} pages to process into {} batches", pagesToProcess.size(), batches.size());
}
return batches;
}
@SneakyThrows
public Set<Integer> findPagesToProcess(PDFDoc pdfDoc, Set<AzureOcrFeature> features) {
if (features.contains(AzureOcrFeature.ALL_PAGES)) {
Set<Integer> pages = new HashSet<>();
for (int i = 1; i <= pdfDoc.getPageCount(); i++) {
pages.add(i);
}
return Collections.unmodifiableSet(pages);
}
return ImageDetectionService.findPagesWithImages(pdfDoc);
}
public List<PageBatch> buildBatches(PDFDoc pdfDoc,
OcrExecutionSupervisor supervisor,
Set<AzureOcrFeature> features,
Path runDir,
Set<Integer> pagesWithImages) throws PDFNetException {
List<PageBatch> batches = new ArrayList<>();
List<Integer> numbersForCurrentBatch = new ArrayList<>();
for (int pageNumber = 1; pageNumber <= pdfDoc.getPageCount(); pageNumber++) {
if (!features.contains(AzureOcrFeature.ALL_PAGES) && !pagesWithImages.contains(pageNumber)) {
supervisor.logPageSkipped(pageNumber);
continue;
}
numbersForCurrentBatch.add(pageNumber);
if (numbersForCurrentBatch.size() == settings.getBatchSize()) {
batches.add(create(batches.size(), pdfDoc, numbersForCurrentBatch, runDir));
numbersForCurrentBatch = new ArrayList<>();
}
}
if (!numbersForCurrentBatch.isEmpty()) {
batches.add(create(batches.size(), pdfDoc, numbersForCurrentBatch, runDir));
}
return batches;
}
@SneakyThrows
public static PageBatch create(int number, PDFDoc pdfDoc, List<Integer> pageNumbers, Path runDir) {
if (pageNumbers.isEmpty()) {
throw new IllegalArgumentException("pageNumbers must not be empty");
}
Path batchDir = formatBatchDir(number, pageNumbers, runDir);
Files.createDirectories(batchDir);
Path batchDocPath = batchDir.resolve("batch.pdf");
try (var batchDoc = extractBatchDocument(pdfDoc, pageNumbers)) {
Optimizer.optimize(batchDoc);
batchDoc.save(batchDocPath.toFile().toString(), SDFDoc.SaveMode.LINEARIZED, null);
}
PageBatch batch = new PageBatch(number, pageNumbers, batchDocPath, batchDir);
Files.createDirectories(batch.getRenderedImageDir());
Files.createDirectories(batch.getProcessedImageDir());
return batch;
}
private static Path formatBatchDir(int number, List<Integer> pageNumbers, Path runDir) {
List<String> intervals = formatIntervals(pageNumbers);
if (intervals.size() > 4) {
intervals = intervals.subList(0, 4);
intervals.add("...");
}
String batchName = String.join(", ", intervals);
return runDir.resolve("batch_%04d_%s".formatted(number, batchName));
}
private static PDFDoc extractBatchDocument(PDFDoc pdfDoc, List<Integer> pageBatch) throws PDFNetException {
if (pageBatch.isEmpty()) {
throw new IllegalArgumentException();
}
PDFDoc batchDoc = new PDFDoc();
pageBatch.forEach(pageNumber -> addPageToNewDoc(pageNumber, pdfDoc, batchDoc));
return batchDoc;
}
@SneakyThrows
private static void addPageToNewDoc(Integer pageNumber, PDFDoc pdfDoc, PDFDoc batchDoc) {
batchDoc.pagePushBack(pdfDoc.getPage(pageNumber));
}
}

View File

@ -10,7 +10,7 @@ public class BatchStats {
private long apiWaitTimestamp = -1;
private long imageUploadTimestamp = -1;
private long mappingResultTimestamp = -1;
private long writingTextTimestamp = -1;
private long batchRenderTimestamp = -1;
@ -38,9 +38,9 @@ public class BatchStats {
}
public void finishMappingResult() {
public void finishWritingText() {
mappingResultTimestamp = System.currentTimeMillis();
writingTextTimestamp = System.currentTimeMillis();
}
@ -50,33 +50,15 @@ public class BatchStats {
}
public boolean isApiWaitFinished() {
return apiWaitTimestamp > 0;
}
public boolean isMappingResultFinished() {
return mappingResultTimestamp > 0;
}
public boolean isBatchRenderFinished() {
return batchRenderTimestamp > 0;
}
public long getApiWaitDuration() {return this.apiWaitTimestamp - imageUploadTimestamp;}
public long getImageUploadDuration() {return this.imageUploadTimestamp - batchRenderTimestamp;}
public long getMappingResultDuration() {return this.mappingResultTimestamp - apiWaitTimestamp;}
public long getWritingTextDuration() {return this.writingTextTimestamp - apiWaitTimestamp;}
public long getBatchRenderDuration() {return startTimestamp - this.batchRenderTimestamp;}
public long getBatchRenderDuration() {return this.batchRenderTimestamp - startTimestamp;}
}

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.ocr.v1.server;
package com.knecon.fforesight.service.ocr.processor.service;
import java.io.File;
import java.io.FileInputStream;
@ -33,23 +33,20 @@ public class FileStorageService {
public void storeFiles(DocumentRequest request, File documentFile, File viewerDocumentFile, File analyzeResultFile) {
try (var in = new FileInputStream(viewerDocumentFile)) {
if (request.optionalViewerDocumentId()
.isPresent()) {
if (request.optionalViewerDocumentId().isPresent()) {
storageService.storeObject(TenantContext.getTenantId(), request.getViewerDocId(), in);
} else {
storageService.storeObject(TenantContext.getTenantId(), getStorageId(request.getDossierId(), request.getFileId(), FileType.VIEWER_DOCUMENT), in);
}
}
try (var in = new FileInputStream(documentFile)) {
if (request.optionalOriginDocumentId()
.isPresent()) {
if (request.optionalOriginDocumentId().isPresent()) {
storageService.storeObject(TenantContext.getTenantId(), request.getOriginDocumentId(), in);
} else {
storageService.storeObject(TenantContext.getTenantId(), getStorageId(request.getDossierId(), request.getFileId(), FileType.ORIGIN), in);
}
}
if (request.optionalIdpResultId()
.isPresent() && analyzeResultFile.exists()) {
if (request.optionalIdpResultId().isPresent()) {
try (var in = new FileInputStream(analyzeResultFile)) {
storageService.storeObject(TenantContext.getTenantId(), request.getIdpResultId(), in);
}
@ -62,8 +59,7 @@ public class FileStorageService {
Files.createDirectories(documentFile.getParentFile().toPath());
String originDocumentId = request.optionalOriginDocumentId()
.orElse(getStorageId(request.getDossierId(), request.getFileId(), FileType.ORIGIN));
String originDocumentId = request.optionalOriginDocumentId().orElse(getStorageId(request.getDossierId(), request.getFileId(), FileType.ORIGIN));
storageService.downloadTo(TenantContext.getTenantId(), originDocumentId, documentFile);

View File

@ -1,24 +1,16 @@
package com.knecon.fforesight.service.ocr.processor.service;
import java.util.Set;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.ocr.v1.api.model.AzureOcrFeature;
import com.knecon.fforesight.service.ocr.v1.api.model.DocumentRequest;
@Service
public interface IOcrMessageSender {
void sendUpdate(String fileId, int finishedImages, int totalImages, Set<AzureOcrFeature> features);
void sendUpdate(String fileId, int finishedImages, int totalImages);
void sendOCRStarted(String fileId);
void sendOCRStarted(String fileId, Set<AzureOcrFeature> features);
void sendOcrFinished(String fileId, int totalImages);
void sendOcrFinished(String fileId, int totalImages, Set<AzureOcrFeature> features);
void sendOcrResponse(DocumentRequest request);
void sendOcrResponse(String dossierId, String fileId);
}

View File

@ -7,24 +7,40 @@ import java.util.Set;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.ocr.processor.OcrServiceSettings;
import com.knecon.fforesight.service.ocr.v1.api.model.AzureOcrFeature;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.PDFDoc;
import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;
@UtilityClass
@Service
public class ImageDetectionService {
// any image with smaller height and width than this gets thrown out, see everyPointInDashedLineIsImage.pdf
private static final int PIXEL_THRESHOLD = 0;
private final OcrServiceSettings ocrServiceSettings;
public ImageDetectionService(OcrServiceSettings ocrServiceSettings) {this.ocrServiceSettings = ocrServiceSettings;}
@SneakyThrows
public Set<Integer> findPagesWithImages(PDFDoc pdfDoc) {
public Set<Integer> findPagesToProcess(PDFDoc pdfDoc) {
if (ocrServiceSettings.isProcessAllPages()) {
Set<Integer> pages = new HashSet<>();
for (int i = 1; i <= pdfDoc.getPageCount(); i++) {
pages.add(i);
}
return Collections.unmodifiableSet(pages);
}
return findPagesWithImages(pdfDoc);
}
private Set<Integer> findPagesWithImages(PDFDoc pdfDoc) throws PDFNetException {
Set<Integer> pagesWithImages = new HashSet<>();
try (ElementReader reader = new ElementReader()) {
@ -56,11 +72,8 @@ public class ImageDetectionService {
}
case Element.e_form -> {
reader.formBegin();
var found = findImagePositionsOnPage(reader);
findImagePositionsOnPage(reader);
reader.end();
if (found) {
return true;
}
}
}
}

View File

@ -5,25 +5,21 @@ import static com.knecon.fforesight.service.ocr.processor.model.Statistics.human
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.List;
import java.util.Set;
import org.springframework.stereotype.Service;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService;
import com.iqser.red.pdftronlogic.commons.OCGWatermarkRemovalService;
import com.iqser.red.pdftronlogic.commons.WatermarkRemovalService;
import com.knecon.fforesight.service.ocr.processor.OcrServiceSettings;
import com.knecon.fforesight.service.ocr.processor.model.PageBatch;
import com.knecon.fforesight.service.ocr.processor.model.Statistics;
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.ImageProcessingPipeline;
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.ImageProcessingSupervisor;
import com.knecon.fforesight.service.ocr.processor.visualizations.layers.OcrResult;
import com.knecon.fforesight.service.ocr.processor.visualizations.utils.RotationCorrectionUtility;
import com.knecon.fforesight.service.ocr.v1.api.model.AzureOcrFeature;
import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService;
import com.pdftron.pdf.PDFDoc;
@ -44,10 +40,10 @@ public class OCRService {
WatermarkRemovalService watermarkRemovalService;
InvisibleElementRemovalService invisibleElementRemovalService;
PDFTronViewerDocumentService viewerDocumentService;
BatchFactory batchFactory;
ImageDetectionService imageDetectionService;
AsyncOcrService asyncOcrService;
OcrServiceSettings settings;
ObjectMapper mapper;
ImageProcessingPipeline imageProcessingPipeline;
/**
@ -60,23 +56,24 @@ public class OCRService {
* @param tmpDir working directory for all files
* @param documentFile the file to perform ocr on, results are written invisibly
* @param viewerDocumentFile debugging file, results are written visibly in an optional content group
* @param idpResultFile result file with additional information
* @param analyzeResultFile result file with additional information
*/
@Observed(name = "OCRService", contextualName = "run-ocr-on-document")
public void runOcrOnDocument(String dossierId, String fileId, Set<AzureOcrFeature> features, Path tmpDir, File documentFile, File viewerDocumentFile, File idpResultFile) {
public void runOcrOnDocument(String dossierId, String fileId, boolean removeWatermark, Path tmpDir, File documentFile, File viewerDocumentFile, File analyzeResultFile) {
if (features.contains(AzureOcrFeature.REMOVE_WATERMARKS)) {
if (removeWatermark) {
removeWatermark(documentFile);
}
removeInvisibleElements(documentFile);
log.info("Starting OCR for file {}", fileId);
long ocrStart = System.currentTimeMillis();
Statistics stats = runOcr(tmpDir, documentFile, viewerDocumentFile, fileId, dossierId, idpResultFile, features).getStatistics();
Statistics stats = runOcr(tmpDir, documentFile, viewerDocumentFile, fileId, dossierId, analyzeResultFile).getStatistics();
long ocrEnd = System.currentTimeMillis();
log.info("OCR successful, took {}", humanizeDuration(ocrEnd - ocrStart));
log.info("ocr successful for file with dossierId {} and fileId {}, took {}", dossierId, fileId, humanizeDuration(ocrEnd - ocrStart));
if (settings.isDebug()) {
logRuntimeBreakdown(ocrEnd, ocrStart, stats);
@ -120,37 +117,34 @@ public class OCRService {
@SneakyThrows
public OcrExecutionSupervisor runOcr(Path runDir,
File documentFile,
File viewerDocumentFile,
String fileId,
String dossierId,
File idpResultFile,
Set<AzureOcrFeature> features) {
public OcrExecutionSupervisor runOcr(Path tmpDir, File documentFile, File viewerDocumentFile, String fileId, String dossierId, File analyzeResultFile) {
Path tmpImageDir = tmpDir.resolve("images");
Path azureOutputDir = tmpDir.resolve("azure_output");
Files.createDirectories(azureOutputDir);
Files.createDirectories(tmpImageDir);
try (var in = new FileInputStream(documentFile); PDFDoc pdfDoc = new PDFDoc(in)) {
OCGWatermarkRemovalService.removeWatermarks(pdfDoc);
OcrExecutionSupervisor supervisor = new OcrExecutionSupervisor(pdfDoc.getPageCount(), ocrMessageSender, fileId, settings, features);
OcrExecutionSupervisor supervisor = new OcrExecutionSupervisor(pdfDoc.getPageCount(), ocrMessageSender, fileId, settings);
supervisor.getStatistics().setStart();
List<PageBatch> batches = batchFactory.splitIntoBatches(pdfDoc, supervisor, features, runDir);
Set<Integer> pagesWithImages = imageDetectionService.findPagesToProcess(pdfDoc);
ImageProcessingSupervisor imageSupervisor = null;
if (settings.isFontStyleDetection()) {
imageSupervisor = imageProcessingPipeline.run(pagesWithImages, tmpImageDir, documentFile);
}
OcrResult ocrResult = asyncOcrService.awaitOcr(pdfDoc, supervisor, features, batches);
supervisor.logImageExtractionFinished(pdfDoc.getPageCount(), pagesWithImages.size());
OcrResult ocrResult = asyncOcrService.awaitOcr(pdfDoc, supervisor, pagesWithImages, imageSupervisor);
viewerDocumentService.addLayerGroups(documentFile, documentFile, ocrResult.regularLayers());
viewerDocumentService.addLayerGroups(documentFile, viewerDocumentFile, ocrResult.debugLayers());
if (features.contains(AzureOcrFeature.ROTATION_CORRECTION)) {
RotationCorrectionUtility.rotatePages(documentFile.toPath(), documentFile.toPath(), ocrResult.anglesPerPage());
RotationCorrectionUtility.rotatePages(viewerDocumentFile.toPath(), viewerDocumentFile.toPath(), ocrResult.anglesPerPage());
}
if (features.contains(AzureOcrFeature.IDP)) {
saveIdpResultFile(idpResultFile, ocrResult);
}
supervisor.getStatistics().drawingPdfFinished();
supervisor.sendFinished();
@ -160,12 +154,4 @@ public class OCRService {
}
private void saveIdpResultFile(File idpResultFile, OcrResult ocrResult) throws IOException {
try (var out = new FileOutputStream(idpResultFile)) {
mapper.writeValue(out, ocrResult.idpResult());
}
}
}

View File

@ -5,7 +5,6 @@ import static com.knecon.fforesight.service.ocr.processor.model.Statistics.human
import java.util.Collections;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
@ -14,7 +13,6 @@ import java.util.concurrent.CountDownLatch;
import com.knecon.fforesight.service.ocr.processor.OcrServiceSettings;
import com.knecon.fforesight.service.ocr.processor.model.PageBatch;
import com.knecon.fforesight.service.ocr.processor.model.Statistics;
import com.knecon.fforesight.service.ocr.v1.api.model.AzureOcrFeature;
import lombok.AccessLevel;
import lombok.Getter;
@ -40,15 +38,12 @@ public class OcrExecutionSupervisor {
String fileId;
Set<AzureOcrFeature> features;
public OcrExecutionSupervisor(int totalPageCount, IOcrMessageSender ocrMessageSender, String fileId, OcrServiceSettings settings, Set<AzureOcrFeature> features) {
public OcrExecutionSupervisor(int totalPageCount, IOcrMessageSender ocrMessageSender, String fileId, OcrServiceSettings settings) {
this.totalPageCount = totalPageCount;
this.ocrMessageSender = ocrMessageSender;
this.fileId = fileId;
this.features = features;
this.errorPages = Collections.synchronizedSet(new HashSet<>());
this.countDownPagesToProcess = new CountDownLatch(totalPageCount);
this.statistics = new Statistics();
@ -70,16 +65,16 @@ public class OcrExecutionSupervisor {
}
public void logImageExtractionFinished(int numberOfPages, int numberOfPagesToProcess) {
public void logImageExtractionFinished(int numberOfPages, int numberOfImages) {
statistics.imageExtractionFinished();
log.info("Images found on {}/{} pages in {}", numberOfPagesToProcess, numberOfPages, humanizeDuration(statistics.getImageExtractionDuration()));
log.info("Images found on {}/{} pages in {}", numberOfImages, numberOfPages, humanizeDuration(statistics.getImageExtractionDuration()));
}
public void logUploadStart(PageBatch pageRange, long bytes) {
log.info("Batch {}: Start uploading pages {} with {}", pageRange.getIndex(), pageRange, humanizeBytes(bytes));
log.info("Start uploading pages {} with {}", pageRange, humanizeBytes(bytes));
statistics.getBatchStats(pageRange).start();
statistics.increaseTotalBytes(pageRange, bytes);
}
@ -88,28 +83,27 @@ public class OcrExecutionSupervisor {
public void logInProgress(PageBatch pageRange) {
if (!statistics.getBatchStats(pageRange).isUploadFinished()) {
log.info("Batch {}: Pages {} is in progress", pageRange.getIndex(), pageRange);
log.info("Pages {} is in progress", pageRange);
statistics.getBatchStats(pageRange).finishUpload();
ocrMessageSender.sendUpdate(fileId, processedPages(), getTotalPageCount(), features);
ocrMessageSender.sendUpdate(fileId, processedPages(), getTotalPageCount());
} else {
log.debug("Batch {}: Pages {} still in progress", pageRange.getIndex(), pageRange);
log.debug("Pages {} still in progress", pageRange);
}
}
public void finishMappingResult(PageBatch batch) {
public void finishMappingResult(PageBatch pageRange) {
batch.forEach(pageIndex -> countDownPagesToProcess.countDown());
statistics.getBatchStats(batch).finishMappingResult();
ocrMessageSender.sendUpdate(fileId, this.processedPages(), getTotalPageCount(), features);
log.info("Batch {}: Finished mapping result with pages {}", batch.getIndex(), batch);
pageRange.forEach(pageIndex -> countDownPagesToProcess.countDown());
statistics.getBatchStats(pageRange).finishWritingText();
ocrMessageSender.sendUpdate(fileId, this.processedPages(), getTotalPageCount());
}
public void logPageSkipped(Integer pageIndex) {
this.countDownPagesToProcess.countDown();
ocrMessageSender.sendUpdate(fileId, this.processedPages(), getTotalPageCount(), features);
ocrMessageSender.sendUpdate(fileId, this.processedPages(), getTotalPageCount());
log.debug("{}/{}: No images to ocr on page {}", processedPages(), getTotalPageCount(), pageIndex);
}
@ -119,43 +113,21 @@ public class OcrExecutionSupervisor {
this.errorPages.add(batch);
batch.forEach(pageIndex -> this.countDownPagesToProcess.countDown());
ocrMessageSender.sendUpdate(fileId, this.processedPages(), getTotalPageCount(), features);
log.error("{}/{}: Error occurred in batch {} with pages {}", processedPages(), getTotalPageCount(), batch.getIndex(), batch, e);
ocrMessageSender.sendUpdate(fileId, this.processedPages(), getTotalPageCount());
log.error("{}/{}: Error occurred on pages {}", processedPages(), getTotalPageCount(), batch, e);
}
public void logPageSuccess(PageBatch batch) {
statistics.getBatchStats(batch).finishApiWait();
log.info("{}/{}: Finished OCR in batch {} with pages {}", processedPages(), getTotalPageCount(), batch.getIndex(), batch);
log.info("{}/{}: Finished OCR on pages {}", processedPages(), getTotalPageCount(), batch);
}
private int processedPages() {
if (countDownPagesToProcess.getCount() == 0) {
return totalPageCount;
}
int processedPages = 0;
for (Map.Entry<PageBatch, BatchStats> entry : statistics.getBatchStats().entrySet()) {
PageBatch pageBatch = entry.getKey();
BatchStats batchStats = entry.getValue();
float percentage = 0;
if (batchStats.isBatchRenderFinished()) {
percentage += 0.1f;
}
if (batchStats.isUploadFinished()) {
percentage += 0.3f;
}
if (batchStats.isApiWaitFinished()) {
percentage += 0.3f;
}
if (batchStats.isMappingResultFinished()) {
percentage += 0.3f;
}
processedPages += (int) (pageBatch.size() * percentage);
}
return processedPages;
return (int) (totalPageCount - countDownPagesToProcess.getCount());
}
@ -172,7 +144,7 @@ public class OcrExecutionSupervisor {
requireNoErrors();
log.info("{}/{}: Finished OCR on all pages", getTotalPageCount(), getTotalPageCount());
ocrMessageSender.sendOcrFinished(fileId, getTotalPageCount(), features);
ocrMessageSender.sendOcrFinished(fileId, getTotalPageCount());
}

View File

@ -1,510 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.service;
import java.awt.geom.AffineTransform;
import java.awt.geom.Line2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.azure.ai.documentintelligence.models.AnalyzeResult;
import com.azure.ai.documentintelligence.models.BoundingRegion;
import com.azure.ai.documentintelligence.models.DocumentFontStyle;
import com.azure.ai.documentintelligence.models.DocumentPage;
import com.azure.ai.documentintelligence.models.DocumentSpan;
import com.azure.ai.documentintelligence.models.DocumentStyle;
import com.azure.ai.documentintelligence.models.DocumentTable;
import com.azure.ai.documentintelligence.models.DocumentTableCell;
import com.azure.ai.documentintelligence.models.DocumentWord;
import com.google.common.base.Functions;
import com.knecon.fforesight.service.ocr.processor.OcrServiceSettings;
import com.knecon.fforesight.service.ocr.processor.model.ImageFile;
import com.knecon.fforesight.service.ocr.processor.model.PageBatch;
import com.knecon.fforesight.service.ocr.processor.model.PageInformation;
import com.knecon.fforesight.service.ocr.processor.model.SpanLookup;
import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage;
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.BBoxSnuggificationService;
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.FontStyleDetector;
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.ImageProcessingPipeline;
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.ImageProcessingSupervisor;
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.StrokeWidthCalculator;
import com.knecon.fforesight.service.ocr.processor.visualizations.WritableOcrResult;
import com.knecon.fforesight.service.ocr.processor.visualizations.fonts.FontMetricsProvider;
import com.knecon.fforesight.service.ocr.processor.visualizations.fonts.FontStyle;
import com.knecon.fforesight.service.ocr.processor.visualizations.fonts.Type0FontMetricsProvider;
import com.knecon.fforesight.service.ocr.processor.visualizations.utils.RotationCorrectionUtility;
import com.knecon.fforesight.service.ocr.v1.api.model.AzureOcrFeature;
import com.knecon.fforesight.service.ocr.v1.api.model.QuadPoint;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
import net.sourceforge.lept4j.Box;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.util.LeptUtils;
@Slf4j
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class OcrResultPostProcessingPipeline {

    // Per-page transform from Azure OCR result coordinates to PDF page coordinates,
    // keyed by page number. Backed by a synchronized map (see constructor).
    @Getter
    Map<Integer, AffineTransform> resultToPageTransforms;

    // Page metadata (number, width/height, rotation, ignore-zone word boxes), keyed by page number.
    Map<Integer, PageInformation> pageInformation;

    // Renders page images when snuggification or local font-style detection requires them.
    ImageProcessingPipeline imageProcessingPipeline;

    OcrServiceSettings settings;

    // Features requested for this OCR run (e.g. FONT_STYLE_DETECTION).
    Set<AzureOcrFeature> features;

    @SneakyThrows
    public OcrResultPostProcessingPipeline(Map<Integer, PageInformation> pageInformation,
                                           ImageProcessingPipeline imageProcessingPipeline,
                                           OcrServiceSettings settings,
                                           Set<AzureOcrFeature> features) {
        this.imageProcessingPipeline = imageProcessingPipeline;
        this.pageInformation = pageInformation;
        // Synchronized: pages of different batches may be post-processed concurrently.
        resultToPageTransforms = Collections.synchronizedMap(new HashMap<>());
        this.settings = settings;
        this.features = features;
    }

    /**
     * Maps one Azure {@link AnalyzeResult} covering the given page batch into
     * {@link WritableOcrResult}s expressed in PDF page coordinates.
     * Optionally renders page images first (for snuggification / bold detection)
     * and, if configured, collects table cell outlines as lines.
     *
     * @param analyzeResult Azure Document Intelligence result for the batch
     * @param batch         the pages this result belongs to (maps result page numbers to document page numbers)
     * @return one writable result per page in the batch
     * @throws InterruptedException if waiting for a rendered page image is interrupted
     */
    public List<WritableOcrResult> processAnalyzeResult(AnalyzeResult analyzeResult, PageBatch batch) throws InterruptedException {
        ImageProcessingSupervisor imageSupervisor = renderImagesIfNecessary(analyzeResult, batch);
        List<WritableOcrResult> writableOcrResultList = new ArrayList<>();
        Lookups lookups = getLookups(analyzeResult);
        for (DocumentPage resultPage : analyzeResult.getPages()) {
            PageInformation pageInformation = getPageInformation(getPageNumber(batch, resultPage));
            AffineTransform resultToPageTransform = buildResultToPageTransform(pageInformation, resultPage.getWidth());
            resultToPageTransforms.put(getPageNumber(batch, resultPage), resultToPageTransform);
            List<TextPositionInImage> words = buildTextPositionsInImage(batch, resultPage, resultToPageTransform, lookups, pageInformation, imageSupervisor);
            // Azure reports a clockwise skew angle; the writer expects the inverse correction.
            var builder = WritableOcrResult.builder().pageNumber(pageInformation.number()).textPositionInImage(words).angle(-resultPage.getAngle());
            if (settings.isDrawTablesAsLines()) {
                builder.tableLines(getTableLines(analyzeResult, pageInformation, resultToPageTransform));
            }
            writableOcrResultList.add(builder.build());
        }
        log.debug("Batch {}: finished post-processing.", batch.getIndex());
        return writableOcrResultList;
    }

    /**
     * Kicks off image rendering for the batch when rendered page images are needed
     * (see {@link #useRenderedImages()}). Pages are rotation-corrected in the batch
     * document first, so rendered images match the deskewed OCR coordinates.
     *
     * @return the supervisor to await rendered pages from, or {@code null} when no rendering is needed
     */
    private ImageProcessingSupervisor renderImagesIfNecessary(AnalyzeResult analyzeResult, PageBatch batch) {
        ImageProcessingSupervisor imageSupervisor = null;
        if (useRenderedImages()) {
            Map<Integer, Double> anglesPerPage = analyzeResult.getPages()
                                                              .stream()
                                                              .collect(Collectors.toMap(DocumentPage::getPageNumber, documentPage -> -documentPage.getAngle()));
            RotationCorrectionUtility.rotatePages(batch.getBatchDoc(), batch.getBatchDoc(), anglesPerPage);
            imageSupervisor = imageProcessingPipeline.addToPipeline(batch);
        }
        return imageSupervisor;
    }

    /**
     * Rendered images are needed for local snuggification or local font-style detection.
     * When Azure-side font style detection is enabled AND font-style detection is the
     * requested feature, no local rendering is required.
     */
    private boolean useRenderedImages() {
        if (settings.isAzureFontStyleDetection() && features.contains(AzureOcrFeature.FONT_STYLE_DETECTION)) {
            return false;
        }
        return settings.isSnuggify() || features.contains(AzureOcrFeature.FONT_STYLE_DETECTION);
    }

    /**
     * Builds word positions for one result page, choosing between the plain path and the
     * image-based paths (bold detection / snug bounding boxes) depending on configuration.
     * Falls back to the plain path when no rendered image is available for the page.
     *
     * @throws InterruptedException if waiting for the rendered page image is interrupted
     */
    private List<TextPositionInImage> buildTextPositionsInImage(PageBatch pageOffset,
                                                                DocumentPage resultPage,
                                                                AffineTransform resultToPageTransform,
                                                                Lookups lookups,
                                                                PageInformation pageInformation,
                                                                ImageProcessingSupervisor imageSupervisor) throws InterruptedException {
        if (!useRenderedImages()) {
            return buildText(resultPage, resultToPageTransform, lookups, pageInformation);
        }
        ImageFile imageFile = imageSupervisor.awaitProcessedPage(getPageNumber(pageOffset, resultPage));
        if (imageFile == null) {
            return buildText(resultPage, resultToPageTransform, lookups, pageInformation);
        }
        synchronized (ImageProcessingSupervisor.class) {
            // Leptonica is not thread safe, but is being called in ImageProcessingService as well
            if (features.contains(AzureOcrFeature.FONT_STYLE_DETECTION)) {
                return buildTextWithBoldDetection(resultPage, resultToPageTransform, pageInformation, imageFile);
            }
            return buildTextWithSnugBBoxes(resultPage, imageFile, resultToPageTransform, lookups, pageInformation);
        }
    }

    /**
     * Builds word positions using the rendered page image for pixel-based bold detection
     * (stroke-width classification via {@link FontStyleDetector}). Words start as REGULAR
     * and are reclassified in bulk by {@code classifyWords()}. The page Pix is always
     * disposed, even on failure.
     */
    @SneakyThrows
    private List<TextPositionInImage> buildTextWithBoldDetection(DocumentPage resultPage,
                                                                 AffineTransform resultToPageTransform,
                                                                 PageInformation pageInformation,
                                                                 ImageFile imageFile) {
        Pix pageImage = imageFile.readPix();
        List<TextPositionInImage> words = new ArrayList<>();
        try (FontStyleDetector fontStyleDetector = new FontStyleDetector()) {
            AffineTransform resultToImageTransform = buildResultToImageTransform(resultPage, pageImage);
            for (DocumentWord word : resultPage.getWords()) {
                TextPositionInImage textPosition;
                if (canBeSnuggified(resultPage, resultToImageTransform)) {
                    textPosition = buildTextPositionInImageWithSnugBBox(word,
                                                                       resultToPageTransform,
                                                                       new FontInformation(FontStyle.REGULAR, Type0FontMetricsProvider.REGULAR_INSTANCE),
                                                                       pageImage,
                                                                       resultToImageTransform);
                } else {
                    textPosition = new TextPositionInImage(QuadPoint.fromPolygons(word.getPolygon()),
                                                           word.getContent(),
                                                           resultToPageTransform,
                                                           new FontInformation(FontStyle.REGULAR, Type0FontMetricsProvider.REGULAR_INSTANCE).font(),
                                                           new FontInformation(FontStyle.REGULAR, Type0FontMetricsProvider.REGULAR_INSTANCE).fontStyle(),
                                                           false);
                }
                if (intersectsIgnoreZone(pageInformation.wordBBoxes(), textPosition)) {
                    textPosition.setOverlapsIgnoreZone(true);
                }
                QuadPoint originTransformed = QuadPoint.fromPolygons(word.getPolygon()).getTransformed(resultToImageTransform);
                Pix wordImage = extractWordImage(originTransformed, pageImage);
                if (wordImage == null) {
                    // Word bbox is (partially) outside the rendered image; skip bold detection for it.
                    log.debug("Unable to extract word image! wordImage: {}, pageImage {}", originTransformed.getBounds2D(), new Rectangle2D.Float(0, 0, pageImage.w, pageImage.h));
                    continue;
                }
                if (StrokeWidthCalculator.wordImageHasMinimumPixelDensity(wordImage)) {
                    fontStyleDetector.add(textPosition, wordImage, textPosition.getFontSizeByHeight());
                }
                words.add(textPosition);
            }
            fontStyleDetector.classifyWords();
        } finally {
            LeptUtils.disposePix(pageImage);
        }
        return words;
    }

    /**
     * Builds the transform from Azure result coordinates to rendered-image pixel coordinates:
     * rotation correction for the page skew, then uniform scaling from result width to image size.
     * For quadrant rotations of 90/270 the image height corresponds to the result width.
     */
    @SneakyThrows
    public static AffineTransform buildResultToImageTransform(DocumentPage resultPage, Pix pageImage) {
        int quadrant = RotationCorrectionUtility.getQuadrantRotation(-resultPage.getAngle());
        AffineTransform rotationCorrection = RotationCorrectionUtility.buildTransform(-resultPage.getAngle(), pageImage.w, pageImage.h);
        AffineTransform imageTransform = new AffineTransform();
        double scalingFactor = switch (quadrant) {
            case 1, 3 -> pageImage.h / resultPage.getWidth();
            default -> pageImage.w / resultPage.getWidth();
        };
        imageTransform.concatenate(rotationCorrection);
        imageTransform.scale(scalingFactor, scalingFactor);
        return imageTransform;
    }

    /**
     * Clips the word's bounding box out of the page image.
     *
     * @return the clipped word image, or {@code null} when the word box is not fully inside the page image
     */
    public static Pix extractWordImage(QuadPoint wordPosition, Pix pageImage) {
        Rectangle2D wordBBox = wordPosition.getBounds2D();
        Rectangle2D pageBBox = new Rectangle2D.Double(0, 0, pageImage.w, pageImage.h);
        if (!pageBBox.contains(wordBBox)) {
            return null;
        }
        Box box = new Box((int) wordBBox.getX(), (int) wordBBox.getY(), (int) wordBBox.getWidth(), (int) wordBBox.getHeight(), 1);
        Pix wordImage = Leptonica1.pixClipRectangle(pageImage, box, null);
        box.clear();
        return wordImage;
    }

    /**
     * Builds word positions, shrinking each bounding box to the visible glyphs ("snuggify")
     * when the page qualifies (see {@link #canBeSnuggified}). Font style comes from the
     * Azure style spans via {@code lookups}. Disposes the page Pix when done.
     */
    public List<TextPositionInImage> buildTextWithSnugBBoxes(DocumentPage resultPage,
                                                             ImageFile imageFile,
                                                             AffineTransform pageCtm,
                                                             Lookups lookups,
                                                             PageInformation pageInformation) {
        Pix pageImage = imageFile.readPix();
        AffineTransform resultToImageTransform = buildResultToImageTransform(resultPage, pageImage);
        boolean snuggify = canBeSnuggified(resultPage, resultToImageTransform);
        List<TextPositionInImage> list = new ArrayList<>();
        for (DocumentWord word : resultPage.getWords()) {
            FontInformation fontInformation = FontInformation.determineStyle(word, lookups);
            TextPositionInImage textPositionInImage;
            if (snuggify) {
                textPositionInImage = buildTextPositionInImageWithSnugBBox(word, pageCtm, fontInformation, pageImage, resultToImageTransform);
            } else {
                textPositionInImage = new TextPositionInImage(QuadPoint.fromPolygons(word.getPolygon()),
                                                              word.getContent(),
                                                              pageCtm,
                                                              fontInformation.font(),
                                                              fontInformation.fontStyle(),
                                                              false);
            }
            markTextOverlappingIgnoreZone(textPositionInImage, pageInformation.wordBBoxes());
            list.add(textPositionInImage);
        }
        LeptUtils.disposePix(pageImage);
        return list;
    }

    // Snuggification must be enabled AND the page's residual skew must be small enough.
    private boolean canBeSnuggified(DocumentPage resultPage, AffineTransform resultToImageTransform) {
        return settings.isSnuggify() && BBoxSnuggificationService.canBeSnuggified(resultPage, resultToImageTransform);
    }

    /**
     * Plain path: builds word positions straight from the Azure polygons without any
     * image-based refinement, still applying style lookups and ignore-zone marking.
     */
    public List<TextPositionInImage> buildText(DocumentPage resultPage, AffineTransform pageCtm, Lookups lookups, PageInformation pageInformation) {
        return resultPage.getWords()
                         .stream()
                         .map(word -> new TextPositionInImage(QuadPoint.fromPolygons(word.getPolygon()),
                                                              word.getContent(),
                                                              pageCtm,
                                                              FontInformation.determineStyle(word, lookups).font(),
                                                              FontInformation.determineStyle(word, lookups).fontStyle(),
                                                              false))
                         .map(textPositionInImage -> markTextOverlappingIgnoreZone(textPositionInImage, pageInformation.wordBBoxes()))
                         .collect(Collectors.toList());
    }

    // Translates the 1-based page number within the Azure result to the document page number.
    private static int getPageNumber(PageBatch pageBatch, DocumentPage resultPage) {
        return pageBatch.getPageNumber(resultPage.getPageNumber());
    }

    /**
     * Builds span lookups for italic and handwritten styles from the result's style list.
     * Returns empty lookups when no styles are present.
     */
    private static Lookups getLookups(AnalyzeResult analyzeResult) {
        if (analyzeResult.getStyles() == null || analyzeResult.getStyles().isEmpty()) {
            return Lookups.empty();
        }
        // Azure stopped supporting bold text detection in 1.0.0 release
        SpanLookup<DocumentSpan> boldLookup = new SpanLookup<>(Stream.empty(), Function.identity());
        SpanLookup<DocumentSpan> italicLookup = new SpanLookup<>(analyzeResult.getStyles()
                                                                              .stream()
                                                                              .filter(style -> Objects.equals(style.getFontStyle(),
                                                                                                              DocumentFontStyle.ITALIC))
                                                                              .map(DocumentStyle::getSpans)
                                                                              .flatMap(Collection::stream), Functions.identity());
        SpanLookup<DocumentSpan> handWrittenLookup = new SpanLookup<>(analyzeResult.getStyles()
                                                                                   .stream()
                                                                                   .filter(documentStyle -> documentStyle.isHandwritten() != null && documentStyle.isHandwritten())
                                                                                   .map(DocumentStyle::getSpans)
                                                                                   .flatMap(Collection::stream), Functions.identity());
        return new Lookups(boldLookup, italicLookup, handWrittenLookup);
    }

    /**
     * Builds a word position with a snug (pixel-tight) bounding box where possible;
     * falls back to the original Azure polygon when snuggification yields nothing.
     * The last constructor flag records whether the box was actually snuggified.
     */
    @SneakyThrows
    private TextPositionInImage buildTextPositionInImageWithSnugBBox(DocumentWord dw,
                                                                     AffineTransform imageCTM,
                                                                     FontInformation fontInformation,
                                                                     Pix pageImage,
                                                                     AffineTransform resultToImageTransform) {
        QuadPoint origin = QuadPoint.fromPolygons(dw.getPolygon());
        Optional<QuadPoint> snugBBox = BBoxSnuggificationService.snuggify(pageImage, dw, resultToImageTransform);
        return new TextPositionInImage(snugBBox.orElse(origin), dw.getContent(), imageCTM, fontInformation.font(), fontInformation.fontStyle(), snugBBox.isPresent());
    }

    /**
     * Font style plus the matching metrics provider for a word, derived from the
     * handwritten/bold/italic span lookups (handwritten takes precedence).
     */
    private record FontInformation(FontStyle fontStyle, FontMetricsProvider font) {

        public static FontInformation determineStyle(DocumentWord dw, Lookups lookups) {
            boolean bold = lookups.bold().containedInAnySpan(dw.getSpan());
            boolean italic = lookups.italic().containedInAnySpan(dw.getSpan());
            boolean handwritten = lookups.handwritten().containedInAnySpan(dw.getSpan());
            FontStyle fontStyle;
            FontMetricsProvider font;
            if (handwritten) {
                fontStyle = FontStyle.HANDWRITTEN;
                font = Type0FontMetricsProvider.REGULAR_INSTANCE;
            } else if (italic && bold) {
                fontStyle = FontStyle.BOLD_ITALIC;
                font = Type0FontMetricsProvider.BOLD_ITALIC_INSTANCE;
            } else if (bold) {
                fontStyle = FontStyle.BOLD;
                font = Type0FontMetricsProvider.BOLD_INSTANCE;
            } else if (italic) {
                fontStyle = FontStyle.ITALIC;
                font = Type0FontMetricsProvider.ITALIC_INSTANCE;
            } else {
                fontStyle = FontStyle.REGULAR;
                font = Type0FontMetricsProvider.REGULAR_INSTANCE;
            }
            return new FontInformation(fontStyle, font);
        }
    }

    /**
     * Collects the outlines of all table cells on the given page as line segments in
     * page coordinates (used to write tables as invisible lines into the PDF).
     */
    private static List<Line2D> getTableLines(AnalyzeResult analyzeResult, PageInformation pageInformation, AffineTransform imageCTM) {
        if (analyzeResult.getTables() == null || analyzeResult.getTables().isEmpty()) {
            return Collections.emptyList();
        }
        return analyzeResult.getTables()
                            .stream()
                            .map(DocumentTable::getCells)
                            .flatMap(Collection::stream)
                            .map(DocumentTableCell::getBoundingRegions)
                            .flatMap(Collection::stream)
                            .filter(table -> table.getPageNumber() == pageInformation.number())
                            .map(BoundingRegion::getPolygon)
                            .map(QuadPoint::fromPolygons)
                            .map(qp -> qp.getTransformed(imageCTM))
                            .flatMap(QuadPoint::asLines)
                            .toList();
    }

    // Marks the word if it substantially overlaps an ignore zone; returns it for stream chaining.
    private static TextPositionInImage markTextOverlappingIgnoreZone(TextPositionInImage textPositionInImage, List<Rectangle2D> ignoreZones) {
        if (intersectsIgnoreZone(ignoreZones, textPositionInImage)) {
            textPositionInImage.setOverlapsIgnoreZone(true);
        }
        return textPositionInImage;
    }

    /**
     * True when the word overlaps an ignore zone by more than 50% of either the word's
     * area or the zone's area (covers both "word inside zone" and "zone inside word").
     */
    private static boolean intersectsIgnoreZone(List<Rectangle2D> ignoreZones, TextPositionInImage textPositionInImage) {
        for (Rectangle2D ignoreZone : ignoreZones) {
            Rectangle2D textBBox = textPositionInImage.getTransformedTextBBox().getBounds2D();
            if (textBBox.intersects(ignoreZone)) {
                double intersectedArea = calculateIntersectedArea(textBBox, ignoreZone);
                double textArea = textBBox.getWidth() * textBBox.getHeight();
                if (intersectedArea / textArea > 0.5) {
                    return true;
                }
                double ignoreZoneArea = ignoreZone.getWidth() * ignoreZone.getHeight();
                if (intersectedArea / ignoreZoneArea > 0.5) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Area of the intersection of two rectangles; 0 when they do not overlap.
     * (getY() is equivalent to getMinY() for Rectangle2D.)
     */
    public static double calculateIntersectedArea(Rectangle2D r1, Rectangle2D r2) {
        double xOverlap = Math.max(0, Math.min(r1.getMaxX(), r2.getMaxX()) - Math.max(r1.getMinX(), r2.getMinX()));
        double yOverlap = Math.max(0, Math.min(r1.getMaxY(), r2.getMaxY()) - Math.max(r1.getY(), r2.getY()));
        return xOverlap * yOverlap;
    }

    /**
     * Builds the transform from result coordinates (top-left origin, result-image scale)
     * to PDF page coordinates (bottom-left origin, page scale, page rotation applied).
     */
    public static AffineTransform buildResultToPageTransform(PageInformation pageInformation, double imageWidth) {
        double scalingFactor = calculateScalingFactor(imageWidth, pageInformation);
        AffineTransform imageToCropBoxScaling = new AffineTransform(scalingFactor, 0, 0, scalingFactor, 0, 0);
        // Flip the y-axis: OCR results are top-left based, PDF user space is bottom-left based.
        AffineTransform mirrorMatrix = new AffineTransform(1, 0, 0, -1, 0, pageInformation.height());
        // NOTE(review): the 270° case translates by (width - height, height), unlike the
        // symmetric 90° case — looks deliberate but verify against 270°-rotated pages.
        AffineTransform rotationMatrix = switch (pageInformation.rotationDegrees()) {
            case 90 -> new AffineTransform(0, 1, -1, 0, pageInformation.height(), 0);
            case 180 -> new AffineTransform(-1, 0, 0, -1, pageInformation.width(), pageInformation.height());
            case 270 -> new AffineTransform(0, -1, 1, 0, pageInformation.width() - pageInformation.height(), pageInformation.height());
            default -> new AffineTransform();
        };
        // matrix multiplication is performed from right to left, so the order is reversed.
        // scaling -> mirror -> rotation
        AffineTransform resultMatrix = new AffineTransform();
        resultMatrix.concatenate(rotationMatrix);
        resultMatrix.concatenate(mirrorMatrix);
        resultMatrix.concatenate(imageToCropBoxScaling);
        return resultMatrix;
    }

    // Scale factor from result-image width to the page's unrotated width.
    private static double calculateScalingFactor(double width, PageInformation pageInformation) {
        // PDFBox always returns page height and width based on rotation
        double pageWidth;
        if (pageInformation.rotationDegrees() == 90 || pageInformation.rotationDegrees() == 270) {
            pageWidth = pageInformation.height();
        } else {
            pageWidth = pageInformation.width();
        }
        return pageWidth / width;
    }

    // Looks up page metadata by page number; may return null for unknown pages — TODO confirm callers guard this.
    @SneakyThrows
    private PageInformation getPageInformation(Integer pageNumber) {
        return pageInformation.get(pageNumber);
    }

    /**
     * Bundles the style span lookups (bold / italic / handwritten) derived from the Azure result.
     */
    public record Lookups(SpanLookup<DocumentSpan> bold, SpanLookup<DocumentSpan> italic, SpanLookup<DocumentSpan> handwritten) {

        public static Lookups empty() {
            return new Lookups(new SpanLookup<>(Stream.empty(), Function.identity()),
                               new SpanLookup<>(Stream.empty(), Function.identity()),
                               new SpanLookup<>(Stream.empty(), Function.identity()));
        }
    }
}

View File

@ -1,215 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.service.imageprocessing;
import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D;
import java.nio.IntBuffer;
import java.util.Optional;
import com.azure.ai.documentintelligence.models.DocumentPage;
import com.azure.ai.documentintelligence.models.DocumentWord;
import com.knecon.fforesight.service.ocr.processor.service.OcrResultPostProcessingPipeline;
import com.knecon.fforesight.service.ocr.processor.visualizations.utils.RotationCorrectionUtility;
import com.knecon.fforesight.service.ocr.v1.api.model.QuadPoint;
import com.sun.jna.Pointer;
import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Numa;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.util.LeptUtils;
/**
 * Shrinks ("snuggifies") the bounding box of an OCR word to tightly fit the visible glyph
 * pixels. Only horizontal or vertical words are processed; askew text is left as is.
 * All pixel work is done with Leptonica, which is not thread safe — callers synchronize
 * (see OcrResultPostProcessingPipeline).
 */
@Slf4j
@UtilityClass
public class BBoxSnuggificationService {

    public static final int PIXEL_COUNT_THRESHOLD = 2; // minimum active pixel count per row for shrinking to stop
    private static final double AVERAGE_ANGLE_THRESHOLD = 0.2; // Skips snuggification, if the average remaining word rotation of a word, written from left-to-right is bigger than this
    public static final int INDIVIDUAL_ANGLE_THRESHOLD = 5; // skips snuggification for word, if the remaining rotation is larger than this angle
    public static final int MAX_SHRINK_PIXELS = 40; // Number of pixels that are allowed to be removed from the top or bottom of an image
    private static final int MINIMUM_WORD_PIXELS = 5; // Number of pixels that are required for snuggification

    // Which axes to shrink along, chosen from the word's reading direction and residual skew.
    private enum Operation {
        HORIZONTAL,
        VERTICAL,
        BOTH,
        NONE
    }

    /**
     * Attempts to shrink the word's bounding box to the glyph pixels.
     *
     * @param pageImage                rendered page image (may be null → no snuggification)
     * @param origin                   the OCR word whose polygon is to be tightened
     * @param resultToImageTransform   transform from result coordinates to image pixels
     * @return the tightened box in result coordinates, or empty when snuggification is
     *         not applicable or did not shrink anything
     */
    @SneakyThrows
    public Optional<QuadPoint> snuggify(Pix pageImage, DocumentWord origin, AffineTransform resultToImageTransform) {
        if (pageImage == null) {
            return Optional.empty();
        }
        if (origin.getContent().equals("-") || origin.getContent().equals(",")) {
            // very slim characters should not be snuggified, or the fontsize may be off significantly
            return Optional.empty();
        }
        QuadPoint originTransformed = QuadPoint.fromPolygons(origin.getPolygon()).getTransformed(resultToImageTransform);
        double remainingAngle = Math.abs(RotationCorrectionUtility.getRemainingAngle(originTransformed.getAngle()));
        QuadPoint.Direction direction = originTransformed.getDirection();
        Operation operation = determineOperation(origin, direction, remainingAngle, originTransformed);
        if (operation == Operation.NONE) {
            return Optional.empty();
        }
        Pix wordImage = OcrResultPostProcessingPipeline.extractWordImage(originTransformed, pageImage);
        if (wordImage == null) {
            log.debug("Unable to extract word image! wordImage: {}, pageImage {}", originTransformed.getBounds2D(), new Rectangle2D.Float(0, 0, pageImage.w, pageImage.h));
            return Optional.empty();
        }
        if (!StrokeWidthCalculator.wordImageHasMinimumPixelDensity(wordImage)) {
            return Optional.empty();
        }
        Optional<Rectangle2D> snugBox = switch (operation) {
            case HORIZONTAL -> snuggifyY(wordImage, originTransformed.getBounds2D());
            case VERTICAL -> snuggifyX(wordImage, originTransformed.getBounds2D());
            case BOTH -> snuggifyBoth(wordImage, originTransformed);
            default -> Optional.empty();
        };
        LeptUtils.disposePix(wordImage);
        // Map the tightened box (image pixels) back into result coordinates.
        AffineTransform imageToResultTransform = resultToImageTransform.createInverse();
        return snugBox.map(snugBBox -> QuadPoint.fromRectangle2D(snugBBox, direction))
                      .map(bbox -> bbox.getTransformed(imageToResultTransform));
    }

    // Shrinks along both axes, intersecting the results when both succeed.
    private Optional<Rectangle2D> snuggifyBoth(Pix wordImage, QuadPoint originTransformed) {
        Optional<Rectangle2D> snugY = snuggifyY(wordImage, originTransformed.getBounds2D());
        Optional<Rectangle2D> snugX = snuggifyX(wordImage, originTransformed.getBounds2D());
        if (snugY.isPresent() && snugX.isPresent()) {
            return Optional.of(snugY.get().createIntersection(snugX.get()).getBounds2D());
        } else if (snugY.isPresent()) {
            return snugY;
        } else {
            return snugX;
        }
    }

    /**
     * Chooses the shrink operation: horizontal words shrink their height, vertical words
     * their width (both only below the per-word angle threshold); short or nearly
     * unrotated words may shrink along both axes.
     */
    private Operation determineOperation(DocumentWord origin, QuadPoint.Direction direction, double remainingAngle, QuadPoint originTransformed) {
        Operation operation = Operation.NONE;
        if (((direction.equals(QuadPoint.Direction.RIGHT) || direction.equals(QuadPoint.Direction.LEFT)) && remainingAngle < INDIVIDUAL_ANGLE_THRESHOLD)) {
            operation = Operation.HORIZONTAL;
        } else if ((direction.equals(QuadPoint.Direction.UP) || direction.equals(QuadPoint.Direction.DOWN)) && remainingAngle < INDIVIDUAL_ANGLE_THRESHOLD) {
            operation = Operation.VERTICAL;
        } else if ((origin.getContent().length() < 4 || Math.abs(originTransformed.getAngle()) < AVERAGE_ANGLE_THRESHOLD * 3)) {
            return Operation.BOTH;
        }
        return operation;
    }

    /**
     * Shrinks the box horizontally: moves the left edge right and the right edge left
     * until a column with more than {@link #PIXEL_COUNT_THRESHOLD} active pixels is hit
     * (at most {@link #MAX_SHRINK_PIXELS} per side).
     *
     * Bug fix: {@code end} is now initialized to the sentinel {@code wordImage.w} and the
     * right-hand scan starts at {@code wordImage.w - 1}. Previously {@code end} started at
     * {@code wordImage.w - PIXEL_COUNT_THRESHOLD}, so the "nothing shrunk" check
     * ({@code end == wordImage.w}) could never fire and unchanged words were reported as
     * snuggified with a spuriously narrowed box.
     */
    private Optional<Rectangle2D> snuggifyX(Pix wordImage, Rectangle2D origin) {
        // NOTE(review): colCounts is not explicitly freed here (as in the original) — verify Numa lifecycle.
        Numa colCounts = Leptonica1.pixCountPixelsByColumn(wordImage);
        int start = 0;
        int end = wordImage.w;
        for (int i = 0; i < Math.min(wordImage.w, MAX_SHRINK_PIXELS); i++) {
            if (pixCountPerColumn(i, colCounts) > PIXEL_COUNT_THRESHOLD) {
                start = i;
                break;
            }
        }
        for (int i = wordImage.w - 1; i > Math.max(0, wordImage.w - MAX_SHRINK_PIXELS); i--) {
            if (pixCountPerColumn(i, colCounts) > PIXEL_COUNT_THRESHOLD) {
                end = i;
                break;
            }
        }
        if (start == 0 && end == wordImage.w) {
            // Nothing to shrink on either side.
            return Optional.empty();
        }
        if (Math.abs(start - end) < MINIMUM_WORD_PIXELS) {
            // Remaining word would be too slim to trust.
            return Optional.empty();
        }
        return Optional.of(new Rectangle2D.Double(origin.getX() + start, origin.getY(), origin.getWidth() - start - (wordImage.w - end), origin.getHeight()));
    }

    /**
     * Shrinks the box vertically: moves the top edge down and the bottom edge up until a
     * row with more than {@link #PIXEL_COUNT_THRESHOLD} active pixels is hit (at most
     * {@link #MAX_SHRINK_PIXELS} per side).
     *
     * Bug fix: {@code end} is now initialized to the sentinel {@code wordImage.h} (scan
     * still starts at {@code wordImage.h - 1}). Previously {@code end} started at
     * {@code wordImage.h - 1}, so the "nothing shrunk" check ({@code end == wordImage.h})
     * could never fire and unchanged words lost one pixel of height and were reported as
     * snuggified.
     */
    private Optional<Rectangle2D> snuggifyY(Pix wordImage, Rectangle2D origin) {
        int start = 0;
        int end = wordImage.h;
        for (int i = 0; i < Math.min(wordImage.h, MAX_SHRINK_PIXELS); i++) {
            if (pixCountPerRow(i, wordImage) > PIXEL_COUNT_THRESHOLD) {
                start = i;
                break;
            }
        }
        for (int i = wordImage.h - 1; i > Math.max(0, wordImage.h - MAX_SHRINK_PIXELS); i--) {
            if (pixCountPerRow(i, wordImage) > PIXEL_COUNT_THRESHOLD) {
                end = i;
                break;
            }
        }
        if (start == 0 && end == wordImage.h) {
            // Nothing to shrink on either side.
            return Optional.empty();
        }
        if (Math.abs(start - end) < MINIMUM_WORD_PIXELS) {
            // Remaining word would be too slim to trust.
            return Optional.empty();
        }
        return Optional.of(new Rectangle2D.Double(origin.getX(), origin.getY() + start, origin.getWidth(), origin.getHeight() - start - (wordImage.h - end)));
    }

    /**
     * Active pixel count of one image row via Leptonica; -1 on Leptonica failure
     * (pixCountPixelsInRow returns 0 on success).
     */
    private int pixCountPerRow(int row, Pix pix) {
        IntBuffer result = IntBuffer.allocate(1);
        int success = Leptonica1.pixCountPixelsInRow(pix, row, result, null);
        if (success == 0) {
            return result.get();
        } else {
            return -1;
        }
    }

    /**
     * Active pixel count of one image column, read from the precomputed Numa.
     *
     * @throws IndexOutOfBoundsException if the column exceeds the Numa's entry count
     */
    private int pixCountPerColumn(int column, Numa colCounts) {
        if (column > colCounts.n) {
            throw new IndexOutOfBoundsException("column " + column + " is out of bounds for column count " + colCounts.n);
        }
        Pointer pointer = colCounts.array.getPointer();
        // Read the float value at position i. Each float takes 4 bytes.
        return (int) pointer.getFloat((long) column * Float.BYTES);
    }

    /**
     * Page-level gate: snuggification is only attempted when the average residual rotation
     * of left-to-right words (length >= 4) is below {@link #AVERAGE_ANGLE_THRESHOLD} degrees.
     * Pages without such words yield Double.MAX_VALUE and are rejected.
     */
    public boolean canBeSnuggified(DocumentPage resultPage, AffineTransform imageTransform) {
        double averageAngle = resultPage.getWords()
                                        .stream()
                                        .filter(word -> word.getContent().length() >= 4)
                                        .map(DocumentWord::getPolygon)
                                        .map(QuadPoint::fromPolygons)
                                        .map(qp -> qp.getTransformed(imageTransform))
                                        .filter(qp -> qp.getDirection().equals(QuadPoint.Direction.RIGHT))
                                        .mapToDouble(QuadPoint::getAngle)
                                        .map(Math::toDegrees)
                                        .map(RotationCorrectionUtility::getRemainingAngle).average()
                                        .orElse(Double.MAX_VALUE);
        return Math.abs(averageAngle) < AVERAGE_ANGLE_THRESHOLD;
    }
}

View File

@ -84,7 +84,6 @@ public class FontStyleDetector implements Closeable {
wordImage.textPosition().setFontMetricsProvider(Type0FontMetricsProvider.BOLD_INSTANCE);
wordImage.textPosition().setFontStyle(FontStyle.BOLD);
} else {
wordImage.textPosition().setFontMetricsProvider(Type0FontMetricsProvider.REGULAR_INSTANCE);
wordImage.textPosition().setFontStyle(FontStyle.REGULAR);
}
}

View File

@ -1,20 +1,16 @@
package com.knecon.fforesight.service.ocr.processor.service.imageprocessing;
import static com.knecon.fforesight.service.ocr.processor.utils.ListSplittingUtils.formatIntervals;
import java.io.BufferedReader;
import java.io.File;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.List;
import java.util.Map;
import java.util.function.Consumer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.slf4j.MDC;
import com.knecon.fforesight.service.ocr.processor.model.ImageFile;
import com.knecon.fforesight.service.ocr.processor.model.PageBatch;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
@ -31,7 +27,7 @@ public class GhostScriptOutputHandler extends Thread {
// If the stdError or stdOut buffer of a thread is not being emptied it might lock the process in case of errors, so we need to empty both streams to prevent a deadlock.
// Since both need to read simultaneously we need to implement the readers as separate threads.
final int batchIdx;
final InputStream is;
final String processName;
final Type type;
@ -40,32 +36,24 @@ public class GhostScriptOutputHandler extends Thread {
final Consumer<ImageFile> outputHandler;
final Consumer<String> errorHandler;
final Map<String, String> parentMdcContext;
int currentPageNumber;
public static GhostScriptOutputHandler stdError(int batchIdx, InputStream is, Consumer<String> errorHandler) {
public static GhostScriptOutputHandler stdError(InputStream is, Consumer<String> errorHandler) {
return new GhostScriptOutputHandler(batchIdx, is, "GS", Type.ERROR, null, null, errorHandler, MDC.getCopyOfContextMap());
return new GhostScriptOutputHandler(is, "GS", Type.ERROR, null, null, errorHandler);
}
public static GhostScriptOutputHandler stdOut(int batchIdx,
InputStream is,
Map<Integer, ImageFile> pagesToProcess,
Consumer<ImageFile> imageFileOutput,
Consumer<String> errorHandler) {
public static GhostScriptOutputHandler stdOut(InputStream is, Map<Integer, ImageFile> pagesToProcess, Consumer<ImageFile> imageFileOutput, Consumer<String> errorHandler) {
return new GhostScriptOutputHandler(batchIdx, is, "GS", Type.STD_OUT, pagesToProcess, imageFileOutput, errorHandler, MDC.getCopyOfContextMap());
return new GhostScriptOutputHandler(is, "GS", Type.STD_OUT, pagesToProcess, imageFileOutput, errorHandler);
}
@SneakyThrows
public void run() {
MDC.setContextMap(parentMdcContext);
try (InputStreamReader isr = new InputStreamReader(is); BufferedReader br = new BufferedReader(isr)) {
String line;
@ -75,14 +63,13 @@ public class GhostScriptOutputHandler extends Thread {
if (line == null) {
break;
}
switch (type) {
case STD_OUT -> {
log.debug("Batch {}: {}_{}>{}", batchIdx, processName, type.name(), line);
addProcessedImageToQueue(line);
}
case ERROR -> log.error("Batch {}: {}_{}>{}", batchIdx, processName, type.name(), line);
}
if (type.equals(Type.ERROR)) {
log.error("{}_{}>{}", processName, type.name(), line);
} else {
log.debug("{}_{}>{}", processName, type.name(), line);
addProcessedImageToQueue(line);
}
}
}
is.close();
@ -90,9 +77,7 @@ public class GhostScriptOutputHandler extends Thread {
queueFinishedPage(currentPageNumber);
if (!pagesToProcess.isEmpty()) {
errorHandler.accept(String.format("Ghostscript finished for batch %d, but pages %s remain unprocessed.", batchIdx, formatPagesToProcess()));
} else {
log.info("Batch {}: rendered successfully!", batchIdx);
errorHandler.accept(String.format("Ghostscript finished for batch, but pages %s remain unprocessed.", formatPagesToProcess()));
}
}
@ -101,16 +86,10 @@ public class GhostScriptOutputHandler extends Thread {
private String formatPagesToProcess() {
List<String> intervals = formatIntervals(pagesToProcess.keySet()
.stream()
.sorted()
.toList());
if (intervals.size() > 4) {
intervals = intervals.subList(0, 4);
intervals.add("...");
}
return String.join(", ", intervals);
var pages = new PageBatch();
pagesToProcess.keySet()
.forEach(pages::add);
return pages.toString();
}
@ -127,6 +106,7 @@ public class GhostScriptOutputHandler extends Thread {
currentPageNumber = pageNumber;
return;
}
queueFinishedPage(currentPageNumber);
currentPageNumber = pageNumber;
}
@ -137,10 +117,10 @@ public class GhostScriptOutputHandler extends Thread {
var imageFile = this.pagesToProcess.remove(pageNumber);
if (imageFile == null) {
errorHandler.accept(String.format("%d: Page number %d does not exist in this thread. It only has pagenumbers %s", batchIdx, pageNumber, pagesToProcess.keySet()));
errorHandler.accept(String.format("Page number %d does not exist in this thread. It only has pagenumbers %s", pageNumber, pagesToProcess.keySet()));
} else {
if (!new File(imageFile.absoluteFilePath()).exists()) {
errorHandler.accept(String.format("%d: Rendered page with number %d does not exist!", batchIdx, pageNumber));
errorHandler.accept(String.format("Rendered page with number %d does not exist!", pageNumber));
}
}
outputHandler.accept(imageFile);

View File

@ -2,136 +2,155 @@ package com.knecon.fforesight.service.ocr.processor.service.imageprocessing;
import java.io.InputStream;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;
import java.util.function.Consumer;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.ocr.processor.OcrServiceSettings;
import com.knecon.fforesight.service.ocr.processor.model.ImageFile;
import com.knecon.fforesight.service.ocr.processor.model.PageBatch;
import com.knecon.fforesight.service.ocr.processor.utils.ListSplittingUtils;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
@SuppressWarnings("PMD") // can't figure out how to safely close the stdOut and stdError streams in line 72/74
@SuppressWarnings("PMD") // can't figure out how to safely close the stdOut and stdError streams in line 142/144
public class GhostScriptService {
private OcrServiceSettings ocrServiceSettings;
public static String FORMAT = ".tiff";
public static final int BATCH_SIZE = 256;
static String FORMAT = ".tiff";
static String DEVICE = "tiffgray";
static int DPI = 300;
private Semaphore concurrencySemaphore = new Semaphore(3);
static int PROCESS_COUNT = 1;
public GhostScriptService(OcrServiceSettings ocrServiceSettings) {
@SneakyThrows
public void renderPagesBatched(List<Integer> pagesToProcess,
String documentAbsolutePath,
Path tmpImageDir,
ImageProcessingSupervisor supervisor,
Consumer<ImageFile> successHandler,
Consumer<String> errorHandler) {
this.ocrServiceSettings = ocrServiceSettings;
assertGhostscriptIsInstalled();
}
List<List<ProcessInfo>> processInfoBatches = buildSubListForEachProcess(pagesToProcess,
PROCESS_COUNT,
BATCH_SIZE
* PROCESS_COUNT); // GS has a limit on how many pageIndices per call are possible, so we limit it to 256 pages per process
for (int batchIdx = 0; batchIdx < processInfoBatches.size(); batchIdx++) {
supervisor.requireNoErrors();
private void assertGhostscriptIsInstalled() {
List<ProcessInfo> processInfos = processInfoBatches.get(batchIdx);
try {
Process p = Runtime.getRuntime().exec("gs -v");
InputStream stdOut = p.getInputStream();
InputStream errOut = p.getErrorStream();
assert p.waitFor(1, TimeUnit.SECONDS);
log.info("Ghostscript is installed.");
String out = new String(stdOut.readAllBytes());
String error = new String(errOut.readAllBytes());
for (String line : out.split("\n")) {
log.info(line);
log.info("Batch {}: Running {} gs processes with ({}) pages each",
batchIdx,
processInfos.size(),
processInfos.stream()
.map(info -> info.pageNumbers().size())
.map(String::valueOf)
.collect(Collectors.joining(", ")));
int finalBatchIdx = batchIdx;
List<Process> processes = processInfos.stream()
.parallel()
.map(info -> buildCmdArgs(info.processIdx(), finalBatchIdx, info.pageNumbers(), tmpImageDir, documentAbsolutePath))
.peek(s -> log.debug(String.join(" ", s.cmdArgs())))
.map(processInfo -> executeProcess(processInfo, successHandler, errorHandler))
.toList();
List<Integer> processExitCodes = new LinkedList<>();
for (Process process : processes) {
processExitCodes.add(process.waitFor());
}
if (!error.isBlank()) {
log.error(error);
}
} catch (Exception e) {
log.error("Ghostscript is not installed!");
log.error(e.getMessage(), e);
throw new RuntimeException(e);
log.info("Batch {}: Ghostscript processes finished with exit codes {}", batchIdx, processExitCodes);
}
}
@SneakyThrows
public void startBatchRender(PageBatch batch, ImageProcessingSupervisor supervisor, Consumer<ImageFile> successHandler, Consumer<String> errorHandler) {
private List<List<ProcessInfo>> buildSubListForEachProcess(List<Integer> stitchedPageNumbers, int processCount, int batchSize) {
supervisor.requireNoErrors();
// GhostScript command line can only handle so many page numbers at once, so we split it into batches
int batchCount = (int) Math.ceil((double) stitchedPageNumbers.size() / batchSize);
List<ImageFile> renderedImageFiles = batch.getRenderedImageFiles();
if (ocrServiceSettings.isUseCaches() && renderedImageFiles.stream()
.allMatch(ImageFile::exists)) {
log.info("Batch {}: Using cached GhostScript rendering with page(s) {}", batch.getIndex(), batch);
renderedImageFiles.forEach(successHandler);
return;
log.info("Splitting {} page renderings across {} process(es) in {} batch(es) with size {}", stitchedPageNumbers.size(), processCount, batchCount, batchSize);
List<List<ProcessInfo>> processInfoBatches = new ArrayList<>(batchCount);
List<List<List<Integer>>> batchedBalancedSublist = ListSplittingUtils.buildBatchedBalancedSublist(stitchedPageNumbers.stream()
.sorted()
.toList(), processCount, batchCount);
for (var batch : batchedBalancedSublist) {
List<ProcessInfo> processInfos = new ArrayList<>(processCount);
for (int threadIdx = 0; threadIdx < batch.size(); threadIdx++) {
List<Integer> balancedPageNumbersSubList = batch.get(threadIdx);
processInfos.add(new ProcessInfo(threadIdx, balancedPageNumbersSubList));
}
processInfoBatches.add(processInfos);
}
concurrencySemaphore.acquire();
log.info("Batch {}: starting GhostScript rendering with page(s) {}", batch.getIndex(), batch);
executeProcess(batch, buildCmdArgs(batch, batch.getBatchDoc()), successHandler, errorHandler);
return processInfoBatches;
}
@SneakyThrows
private ProcessCmdsAndRenderedImageFiles buildCmdArgs(PageBatch batch, Path document) {
private ProcessCmdsAndRenderedImageFiles buildCmdArgs(Integer processIdx,
Integer batchIdx,
List<Integer> stitchedImagePageIndices,
Path outputDir,
String documentAbsolutePath) {
String imagePathFormat = outputDir.resolve("output_" + processIdx + "_" + batchIdx + ".%04d" + FORMAT).toFile().toString();
Map<Integer, ImageFile> fullPageImages = new HashMap<>();
List<ImageFile> renderedImageFiles = batch.getRenderedImageFiles();
for (int i = 1; i <= renderedImageFiles.size(); i++) {
ImageFile renderedImageFile = renderedImageFiles.get(i - 1);
fullPageImages.put(i, renderedImageFile);
for (int i = 0; i < stitchedImagePageIndices.size(); i++) {
Integer pageNumber = stitchedImagePageIndices.get(i);
fullPageImages.put(pageNumber, new ImageFile(pageNumber, String.format(imagePathFormat, i + 1)));
}
String[] cmdArgs = buildCmdArgs(document, batch.getRenderedImageNameFormat());
String[] cmdArgs = buildCmdArgs(stitchedImagePageIndices, documentAbsolutePath, imagePathFormat);
return new ProcessCmdsAndRenderedImageFiles(cmdArgs, fullPageImages);
}
private String[] buildCmdArgs(Path document, String imagePathFormat) {
private String[] buildCmdArgs(List<Integer> pageNumbers, String documentAbsolutePath, String imagePathFormat) {
return new String[]{"gs", "-dNOPAUSE", "-sDEVICE=" + DEVICE, "-r" + DPI, "-sOutputFile=" + imagePathFormat, document.toFile().toString(), "-c", "quit"};
StringBuilder sPageList = new StringBuilder();
int i = 1;
for (Integer integer : pageNumbers) {
sPageList.append(integer);
if (i < pageNumbers.size()) {
sPageList.append(",");
}
i++;
}
return new String[]{"gs", "-dNOPAUSE", "-sDEVICE=" + DEVICE, "-r" + DPI, "-sPageList=" + sPageList, "-sOutputFile=" + imagePathFormat, documentAbsolutePath, "-c", "quit"};
}
@SneakyThrows
private void executeProcess(PageBatch batch, ProcessCmdsAndRenderedImageFiles processInfo, Consumer<ImageFile> successHandler, Consumer<String> errorHandler) {
private Process executeProcess(ProcessCmdsAndRenderedImageFiles processInfo, Consumer<ImageFile> successHandler, Consumer<String> errorHandler) {
Process p = Runtime.getRuntime().exec(processInfo.cmdArgs());
InputStream stdOut = p.getInputStream();
GhostScriptOutputHandler stdOutLogger = GhostScriptOutputHandler.stdOut(batch.getIndex(), stdOut, processInfo.renderedPageImageFiles(), successHandler, errorHandler);
GhostScriptOutputHandler stdOutLogger = GhostScriptOutputHandler.stdOut(stdOut, processInfo.renderedPageImageFiles(), successHandler, errorHandler);
InputStream stdError = p.getErrorStream();
GhostScriptOutputHandler stdErrorLogger = GhostScriptOutputHandler.stdError(batch.getIndex(), stdError, errorHandler);
GhostScriptOutputHandler stdErrorLogger = GhostScriptOutputHandler.stdError(stdError, errorHandler);
stdOutLogger.start();
stdErrorLogger.start();
handleFinished(p, errorHandler, batch, successHandler);
}
private void handleFinished(Process p, Consumer<String> errorHandler, PageBatch batch, Consumer<ImageFile> successHandler) {
Thread finishedThread = new Thread(() -> {
try {
p.waitFor(2, TimeUnit.MINUTES);
} catch (InterruptedException e) {
errorHandler.accept("Batch %d: Ghostscript rendering has been terminated after 2 minutes \n %s".formatted(batch.getIndex(), e.getMessage()));
} finally {
concurrencySemaphore.release();
}
});
finishedThread.start();
return p;
}
@ -139,4 +158,8 @@ public class GhostScriptService {
}
private record ProcessInfo(Integer processIdx, List<Integer> pageNumbers) {
}
}

View File

@ -1,12 +1,15 @@
package com.knecon.fforesight.service.ocr.processor.service.imageprocessing;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.Set;
import java.util.function.Consumer;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.ocr.processor.model.ImageFile;
import com.knecon.fforesight.service.ocr.processor.model.PageBatch;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
@ -23,16 +26,24 @@ public class ImageProcessingPipeline {
@SneakyThrows
public ImageProcessingSupervisor addToPipeline(PageBatch batch) {
public ImageProcessingSupervisor run(Set<Integer> pageNumberSet, Path imageDir, File document) {
List<Integer> pageNumbers = batch.getAllPageNumbers();
Path processedImageDir = imageDir.resolve("processed");
Path renderedImageDir = imageDir.resolve("rendered");
Files.createDirectories(renderedImageDir);
Files.createDirectories(processedImageDir);
List<Integer> pageNumbers = pageNumberSet.stream()
.sorted()
.toList();
ImageProcessingSupervisor supervisor = new ImageProcessingSupervisor(pageNumbers);
Consumer<ImageFile> renderingSuccessConsumer = imageFile -> imageProcessingService.addToProcessingQueue(imageFile, batch.getProcessedImageDir(), supervisor);
Consumer<ImageFile> renderingSuccessConsumer = imageFile -> imageProcessingService.addToProcessingQueue(imageFile, processedImageDir, supervisor);
Consumer<String> renderingErrorConsumer = supervisor::markError;
ghostScriptService.startBatchRender(batch, supervisor, renderingSuccessConsumer, renderingErrorConsumer);
ghostScriptService.renderPagesBatched(pageNumbers, document.toString(), renderedImageDir, supervisor, renderingSuccessConsumer, renderingErrorConsumer);
return supervisor;
}

View File

@ -1,13 +1,11 @@
package com.knecon.fforesight.service.ocr.processor.service.imageprocessing;
import java.io.File;
import java.nio.file.Path;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.ocr.processor.OcrServiceSettings;
import com.knecon.fforesight.service.ocr.processor.model.ImageFile;
import lombok.AccessLevel;
@ -25,10 +23,9 @@ import net.sourceforge.lept4j.util.LeptUtils;
public class ImageProcessingService {
BlockingQueue<ProcessParams> queue = new LinkedBlockingQueue<>();
private final OcrServiceSettings ocrServiceSettings;
public ImageProcessingService(OcrServiceSettings ocrServiceSettings) {
public ImageProcessingService() {
Thread queueConsumerThread = new Thread(() -> {
while (true) {
@ -41,13 +38,12 @@ public class ImageProcessingService {
try {
process(processParams.unprocessedImage(), processParams.outputDir, processParams.supervisor());
} catch (Exception e) {
processParams.supervisor.markPageFinished(processParams.unprocessedImage());
log.error(e.getMessage(), e);
}
}
});
queueConsumerThread.start();
this.ocrServiceSettings = ocrServiceSettings;
}
@ -58,43 +54,31 @@ public class ImageProcessingService {
}
@SneakyThrows
private void process(ImageFile unprocessedImage, Path outputDir, ImageProcessingSupervisor supervisor) {
String absoluteFilePath = outputDir.resolve(Path.of(unprocessedImage.absoluteFilePath()).getFileName()).toFile().toString();
ImageFile processedImage = new ImageFile(unprocessedImage.pageNumber(), absoluteFilePath);
supervisor.requireNoErrors();
if (ocrServiceSettings.isUseCaches() && processedImage.exists()) {
supervisor.markPageFinished(processedImage);
return;
}
synchronized (ImageProcessingSupervisor.class) {
// Leptonica is not thread safe, but is being called in WritableOcrResultFactory as well
Pix processedPix;
Pix pix = unprocessedImage.readPix();
try {
if (!unprocessedImage.exists()) {
log.error("ERROR, rendered image {} does not exist", unprocessedImage.absoluteFilePath());
throw new AssertionError();
}
synchronized (ImageProcessingSupervisor.class) {
// Leptonica is not thread safe, but is being called in WritableOcrResultFactory as well
Pix processedPix;
Pix pix = unprocessedImage.readPix();
String absoluteFilePath = outputDir.resolve(Path.of(unprocessedImage.absoluteFilePath()).getFileName()).toFile().toString();
assert pix != null;
processedPix = processPix(pix);
Leptonica1.pixWrite(absoluteFilePath, processedPix, ILeptonica.IFF_TIFF_PACKBITS);
processedPix = processPix(pix);
Leptonica1.pixWrite(processedImage.absoluteFilePath(), processedPix, ILeptonica.IFF_TIFF_PACKBITS);
LeptUtils.disposePix(pix);
LeptUtils.disposePix(processedPix);
LeptUtils.disposePix(pix);
LeptUtils.disposePix(processedPix);
}
} catch (Exception e) {
supervisor.markError("Page %d could not be processed due to: %s".formatted(unprocessedImage.pageNumber(), e.getMessage()));
} finally {
supervisor.markPageFinished(processedImage);
log.debug("Finished page: {}", processedImage.pageNumber());
ImageFile imageFile = new ImageFile(unprocessedImage.pageNumber(), absoluteFilePath);
supervisor.markPageFinished(imageFile);
}
}
@SneakyThrows
private Pix processPix(Pix pix) {
Pix binarized;

View File

@ -53,7 +53,7 @@ public class ImageProcessingSupervisor {
public ImageFile awaitProcessedPage(Integer pageNumber) throws InterruptedException {
if (hasErrors()) {
if (hasErros()) {
return null;
}
getPageLatch(pageNumber).await();
@ -61,15 +61,14 @@ public class ImageProcessingSupervisor {
}
private boolean hasErrors() {
private boolean hasErros() {
return !errors.isEmpty();
return errors.isEmpty();
}
public void markError(String errorMessage) {
log.error(errorMessage);
this.errors.add(errorMessage);
}
@ -87,7 +86,7 @@ public class ImageProcessingSupervisor {
if (this.errors.isEmpty()) {
return;
}
throw new IllegalStateException("Error(s) occurred during image processing: " + String.join("\n", errors.subList(0, Math.min(errors.size(), 3))));
throw new IllegalStateException("Error(s) occurred during image processing: " + String.join("\n", errors));
}
}

View File

@ -3,9 +3,13 @@ package com.knecon.fforesight.service.ocr.processor.service.imageprocessing;
import static net.sourceforge.lept4j.ILeptonica.L_THIN_FG;
import java.io.Closeable;
import java.io.IOException;
import java.nio.IntBuffer;
import org.springframework.stereotype.Service;
import lombok.AccessLevel;
import lombok.NoArgsConstructor;
import lombok.experimental.FieldDefaults;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
@ -15,7 +19,6 @@ import net.sourceforge.lept4j.util.LeptUtils;
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class StrokeWidthCalculator implements Closeable {
public static final double MINIMUM_PIXEL_DENSITY = 0.05;
Sela thinningSel = Leptonica1.selaMakeThinSets(1, 0);
@ -43,14 +46,6 @@ public class StrokeWidthCalculator implements Closeable {
}
public static boolean wordImageHasMinimumPixelDensity(Pix wordImage) {
IntBuffer pixelCount = IntBuffer.allocate(1);
Leptonica1.pixCountPixels(wordImage, pixelCount, null);
return (double) pixelCount.get(0) / (wordImage.w * wordImage.h) >= MINIMUM_PIXEL_DENSITY;
}
public boolean hasLargerStrokeWidth(Pix pix, double strokeWidth, double threshold) {
int roundedStrokeWidth = (int) Math.round(strokeWidth);

View File

@ -1,26 +1,25 @@
package com.knecon.fforesight.service.ocr.processor.utils;
import java.util.Locale;
import org.apache.commons.lang3.StringUtils;
import lombok.experimental.UtilityClass;
@UtilityClass
public final class OsUtils {
private static final String SERVICE_NAME = "azure-ocr-service";
private static boolean isWindows() {
String osName = System.getProperty("os.name");
if (osName == null) {
return false;
}
return osName.toLowerCase(Locale.ENGLISH).contains("windows");
return StringUtils.containsIgnoreCase(System.getProperty("os.name"), "Windows");
}
public static String getTemporaryDirectory() {
String tmpdir = System.getProperty("java.io.tmpdir");
if (isWindows() && !tmpdir.isBlank()) {
if (isWindows() && StringUtils.isNotBlank(tmpdir)) {
return tmpdir;
}
return "/tmp";

View File

@ -1,40 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.utils;
import java.util.regex.Pattern;
import lombok.experimental.UtilityClass;
@UtilityClass
public class StringCleaningUtility {
public static final Pattern hyphenLineBreaks = Pattern.compile("[-~‐‒⁻−﹣゠⁓‑\\u00AD][\\r\\n]+");
public static final Pattern linebreaks = Pattern.compile("[\\r\\n]+");
public static final Pattern doubleWhitespaces = Pattern.compile("\\s{2,}");
public static String cleanString(String value) {
String noHyphenLinebreaks = removeHyphenLinebreaks(value);
String noLinebreaks = removeLinebreaks(noHyphenLinebreaks);
return removeMultipleWhitespaces(noLinebreaks);
}
private String removeHyphenLinebreaks(String value) {
return hyphenLineBreaks.matcher(value).replaceAll("");
}
private String removeMultipleWhitespaces(String value) {
return doubleWhitespaces.matcher(value).replaceAll(" ");
}
private String removeLinebreaks(String value) {
return linebreaks.matcher(value).replaceAll(" ");
}
}

View File

@ -1,14 +1,14 @@
package com.knecon.fforesight.service.ocr.processor.visualizations;
import com.azure.ai.documentintelligence.models.AnalyzeResult;
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
import com.knecon.fforesight.service.ocr.v1.api.model.AzureAnalyzeResult;
import lombok.experimental.UtilityClass;
@UtilityClass
public class AnalyzeResultMapper {
public IdpResult map(AnalyzeResult analyzeResult) {
public AzureAnalyzeResult map(AnalyzeResult analyzeResult) {
return null;
}

View File

@ -1,23 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.visualizations;
import java.util.function.Function;
import java.util.stream.Stream;
import com.azure.ai.documentintelligence.models.DocumentSpan;
import com.knecon.fforesight.service.ocr.processor.model.SpanLookup;
/**
 * Groups the span lookups that describe font styling (bold / italic / handwritten)
 * so they can be passed around as a single value.
 */
public class FontStyler {

    /**
     * Triple of style lookups.
     *
     * @param bold        spans whose text is bold
     * @param italic      spans whose text is italic
     * @param handwritten spans whose text is handwritten
     */
    public record Lookups(SpanLookup<DocumentSpan> bold, SpanLookup<DocumentSpan> italic, SpanLookup<DocumentSpan> handwritten) {

        /**
         * Creates a {@code Lookups} whose three lookups contain no spans at all.
         */
        public static Lookups empty() {
            return new Lookups(emptySpanLookup(), emptySpanLookup(), emptySpanLookup());
        }

        // One fresh, empty lookup per field (three separate instances, matching the
        // previous behaviour).
        private static SpanLookup<DocumentSpan> emptySpanLookup() {
            return new SpanLookup<>(Stream.empty(), Function.identity());
        }
    }
}

View File

@ -20,7 +20,6 @@ import lombok.experimental.FieldDefaults;
public final class WritableOcrResult {
int pageNumber;
double angle;
@Builder.Default
List<TextPositionInImage> textPositionInImage = Collections.emptyList();
@Builder.Default

View File

@ -0,0 +1,367 @@
package com.knecon.fforesight.service.ocr.processor.visualizations;
import java.awt.geom.AffineTransform;
import java.awt.geom.Line2D;
import java.awt.geom.Rectangle2D;
import java.nio.IntBuffer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.azure.ai.documentintelligence.models.AnalyzeResult;
import com.azure.ai.documentintelligence.models.BoundingRegion;
import com.azure.ai.documentintelligence.models.DocumentPage;
import com.azure.ai.documentintelligence.models.DocumentSpan;
import com.azure.ai.documentintelligence.models.DocumentStyle;
import com.azure.ai.documentintelligence.models.DocumentTable;
import com.azure.ai.documentintelligence.models.DocumentTableCell;
import com.azure.ai.documentintelligence.models.DocumentWord;
import com.azure.ai.documentintelligence.models.FontWeight;
import com.google.common.base.Functions;
import com.knecon.fforesight.service.ocr.processor.model.ImageFile;
import com.knecon.fforesight.service.ocr.processor.model.PageInformation;
import com.knecon.fforesight.service.ocr.processor.model.PageBatch;
import com.knecon.fforesight.service.ocr.processor.model.SpanLookup;
import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage;
import com.knecon.fforesight.service.ocr.processor.OcrServiceSettings;
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.FontStyleDetector;
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.ImageProcessingSupervisor;
import com.knecon.fforesight.service.ocr.processor.visualizations.fonts.FontMetricsProvider;
import com.knecon.fforesight.service.ocr.processor.visualizations.fonts.FontStyle;
import com.knecon.fforesight.service.ocr.processor.visualizations.fonts.Type0FontMetricsProvider;
import com.knecon.fforesight.service.ocr.v1.api.model.QuadPoint;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import net.sourceforge.lept4j.Box;
import net.sourceforge.lept4j.Leptonica1;
import net.sourceforge.lept4j.Pix;
import net.sourceforge.lept4j.util.LeptUtils;
/**
 * Builds {@code WritableOcrResult}s from an Azure Document Intelligence
 * {@code AnalyzeResult}, one per analyzed page. Azure reports word/table positions
 * in image coordinates; each page gets a CTM (current transformation matrix) that
 * maps those coordinates into PDF page space (see {@link #getPageCTM}). Optionally
 * performs local bold/italic detection on the rendered page image when enabled in
 * the settings.
 */
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class WritableOcrResultFactory {

    // Embeddable fonts providing metrics for the written text layer, one per style.
    FontMetricsProvider regularFont = Type0FontMetricsProvider.REGULAR_INSTANCE;
    FontMetricsProvider boldFont = Type0FontMetricsProvider.BOLD_INSTANCE;
    FontMetricsProvider italicFont = Type0FontMetricsProvider.ITALIC_INSTANCE;
    FontMetricsProvider boldItalicFont = Type0FontMetricsProvider.BOLD_ITALIC_INSTANCE;

    // Page number -> CTM for that page; synchronized map, so presumably read/written
    // from more than one thread — TODO confirm against callers.
    @Getter
    Map<Integer, AffineTransform> pageCtms;
    Map<Integer, PageInformation> pageInformation;
    OcrServiceSettings settings;
    ImageProcessingSupervisor imageSupervisor;

    /**
     * @param pageInformation page number -> page geometry (size, rotation, ignore zones)
     * @param settings        feature toggles (table detection, font-style detection, ...)
     * @param imageSupervisor supplies processed page images for font-style detection
     */
    @SneakyThrows
    public WritableOcrResultFactory(Map<Integer, PageInformation> pageInformation, OcrServiceSettings settings, ImageProcessingSupervisor imageSupervisor) {
        this.pageInformation = pageInformation;
        pageCtms = Collections.synchronizedMap(new HashMap<>());
        this.settings = settings;
        this.imageSupervisor = imageSupervisor;
    }

    /**
     * Builds one {@code WritableOcrResult} per page of {@code analyzeResult}.
     * For each page: computes and stores the page CTM, maps the recognized words
     * into page space (with optional font-style handling) and, when table detection
     * is enabled, adds the table cell border lines.
     *
     * @param analyzeResult Azure analysis result for one batch of pages
     * @param pageOffset    batch used to translate Azure's 1-based page numbers into
     *                      absolute document page numbers
     * @throws InterruptedException if waiting for a processed page image is interrupted
     */
    public List<WritableOcrResult> buildOcrResultToWrite(AnalyzeResult analyzeResult, PageBatch pageOffset) throws InterruptedException {
        List<WritableOcrResult> writableOcrResultList = new ArrayList<>();
        Lookups lookups = getLookups(analyzeResult);
        for (DocumentPage resultPage : analyzeResult.getPages()) {
            PageInformation pageInformation = getPageInformation(getPageNumber(pageOffset, resultPage));
            AffineTransform pageCtm = getPageCTM(pageInformation, resultPage.getWidth());
            pageCtms.put(getPageNumber(pageOffset, resultPage), pageCtm);
            List<TextPositionInImage> words = buildTextPositionsInImage(pageOffset, resultPage, pageCtm, lookups, pageInformation);
            var builder = WritableOcrResult.builder().pageNumber(pageInformation.number()).textPositionInImage(words);
            if (settings.isTableDetection()) {
                builder.tableLines(getTableLines(analyzeResult, pageInformation, pageCtm));
            }
            writableOcrResultList.add(builder.build());
        }
        return writableOcrResultList;
    }

    /**
     * Builds the word positions for one page. Without font-style detection the
     * Azure-reported styles are used directly; otherwise the processed page image is
     * awaited and a local bold/italic classification is run on the word images.
     * Falls back to the Azure styles when no processed image is available (e.g. a
     * prior image-processing error).
     */
    private List<TextPositionInImage> buildTextPositionsInImage(PageBatch pageOffset,
                                                                DocumentPage resultPage,
                                                                AffineTransform pageCtm,
                                                                Lookups lookups,
                                                                PageInformation pageInformation) throws InterruptedException {
        if (!settings.isFontStyleDetection()) {
            return buildText(resultPage, pageCtm, lookups, pageInformation);
        }
        ImageFile imageFile = imageSupervisor.awaitProcessedPage(getPageNumber(pageOffset, resultPage));
        if (imageFile == null) {
            // no processed image (supervisor reported errors) -> use Azure styles only
            return buildText(resultPage, pageCtm, lookups, pageInformation);
        }
        synchronized (ImageProcessingSupervisor.class) {
            return buildTextWithBoldDetection(resultPage, pageCtm, pageInformation, imageFile);
        }
    }

    /**
     * Classifies each word's font style by clipping its region out of the rendered
     * page image and feeding it to {@code FontStyleDetector}. Words with at most 3
     * foreground pixels are skipped for classification (too little signal) but still
     * included in the result. Caller must hold the Leptonica lock (see synchronized
     * block in {@link #buildTextPositionsInImage}).
     */
    private static List<TextPositionInImage> buildTextWithBoldDetection(DocumentPage resultPage, AffineTransform pageCtm, PageInformation pageInformation, ImageFile imageFile) {
        // Leptonica is not thread safe, but is being called in ImageProcessingService as well
        Pix pageImage = imageFile.readPix();
        List<TextPositionInImage> words = new ArrayList<>();
        try (FontStyleDetector fontStyleDetector = new FontStyleDetector()) {
            // maps Azure page coordinates to pixel coordinates of the rendered image
            AffineTransform imageTransform = new AffineTransform();
            double scalingFactor = pageImage.w / resultPage.getWidth();
            imageTransform.scale(scalingFactor, scalingFactor);
            for (DocumentWord word : resultPage.getWords()) {
                TextPositionInImage textPosition = new TextPositionInImage(word, pageCtm, Type0FontMetricsProvider.REGULAR_INSTANCE, FontStyle.REGULAR);
                if (intersectsIgnoreZone(pageInformation.wordBBoxes(), textPosition)) {
                    textPosition.setOverlapsIgnoreZone(true);
                }
                Pix wordImage = extractWordImage(word, imageTransform, pageImage);
                IntBuffer pixelCount = IntBuffer.allocate(1);
                Leptonica1.pixCountPixels(wordImage, pixelCount, null);
                if (pixelCount.get(0) > 3) {
                    fontStyleDetector.add(textPosition, wordImage, textPosition.getFontSizeByHeight());
                }
                words.add(textPosition);
            }
            // style classification mutates the queued TextPositionInImage instances
            fontStyleDetector.classifyWords();
        } finally {
            LeptUtils.disposePix(pageImage);
        }
        return words;
    }

    /**
     * Clips the word's bounding box (transformed into image pixel space) out of the
     * rendered page image. Caller is responsible for disposing the returned Pix.
     */
    private static Pix extractWordImage(DocumentWord word, AffineTransform imageTransform, Pix pageImage) {
        Rectangle2D wordBBox = QuadPoint.fromPolygons(word.getPolygon()).getTransformed(imageTransform).getBounds2D();
        Box box = new Box((int) wordBBox.getX(), (int) wordBBox.getY(), (int) wordBBox.getWidth(), (int) wordBBox.getHeight(), 1);
        Pix wordImage = Leptonica1.pixClipRectangle(pageImage, box, null);
        box.clear();
        return wordImage;
    }

    /**
     * Builds word positions using only the Azure-reported style spans (no local
     * image-based detection), flagging words that overlap an ignore zone.
     */
    private List<TextPositionInImage> buildText(DocumentPage resultPage, AffineTransform pageCtm, Lookups lookups, PageInformation pageInformation) {
        return resultPage.getWords()
            .stream()
            .map(word -> buildTextPositionInImage(word, pageCtm, lookups))
            .map(textPositionInImage -> markTextOverlappingIgnoreZone(textPositionInImage, pageInformation.wordBBoxes()))
            .collect(Collectors.toList());
    }

    // Translates the result page's batch-relative page number into the absolute
    // document page number.
    private static int getPageNumber(PageBatch pageOffset, DocumentPage resultPage) {
        return pageOffset.getPageNumber(resultPage.getPageNumber());
    }

    /**
     * Extracts bold / italic / handwritten span lookups from the Azure styles.
     * Returns empty lookups when the result carries no style information.
     */
    private static Lookups getLookups(AnalyzeResult analyzeResult) {
        if (analyzeResult.getStyles() == null || analyzeResult.getStyles().isEmpty()) {
            return Lookups.empty();
        }
        SpanLookup<DocumentSpan> boldLookup = new SpanLookup<>(analyzeResult.getStyles()
            .stream()
            .filter(style -> Objects.equals(style.getFontWeight(), FontWeight.BOLD))
            .map(DocumentStyle::getSpans)
            .flatMap(Collection::stream), Function.identity());
        SpanLookup<DocumentSpan> italicLookup = new SpanLookup<>(analyzeResult.getStyles()
            .stream()
            .filter(style -> Objects.equals(style.getFontStyle(),
                com.azure.ai.documentintelligence.models.FontStyle.ITALIC))
            .map(DocumentStyle::getSpans)
            .flatMap(Collection::stream), Functions.identity());
        SpanLookup<DocumentSpan> handWrittenLookup = new SpanLookup<>(analyzeResult.getStyles()
            .stream()
            .filter(documentStyle -> documentStyle.isHandwritten() != null && documentStyle.isHandwritten())
            .map(DocumentStyle::getSpans)
            .flatMap(Collection::stream), Functions.identity());
        return new Lookups(boldLookup, italicLookup, handWrittenLookup);
    }

    /**
     * Builds a single word position, choosing font and style from the span lookups.
     * Handwritten takes precedence over bold/italic.
     */
    private TextPositionInImage buildTextPositionInImage(DocumentWord dw, AffineTransform imageCTM, Lookups lookups) {
        boolean bold = lookups.bold().containedInAnySpan(dw.getSpan());
        boolean italic = lookups.italic().containedInAnySpan(dw.getSpan());
        boolean handwritten = lookups.handwritten().containedInAnySpan(dw.getSpan());
        FontStyle fontStyle;
        FontMetricsProvider font;
        if (handwritten) {
            fontStyle = FontStyle.HANDWRITTEN;
            font = regularFont;
        } else if (italic && bold) {
            fontStyle = FontStyle.BOLD_ITALIC;
            font = boldItalicFont;
        } else if (bold) {
            fontStyle = FontStyle.BOLD;
            font = boldFont;
        } else if (italic) {
            fontStyle = FontStyle.ITALIC;
            font = italicFont;
        } else {
            fontStyle = FontStyle.REGULAR;
            font = regularFont;
        }
        return new TextPositionInImage(dw, imageCTM, font, fontStyle);
    }

    /**
     * Collects the border lines of all table cells on the given page, transformed
     * into page space. Empty when the result carries no tables.
     */
    private static List<Line2D> getTableLines(AnalyzeResult analyzeResult, PageInformation pageInformation, AffineTransform imageCTM) {
        if (analyzeResult.getTables() == null || analyzeResult.getTables().isEmpty()) {
            return Collections.emptyList();
        }
        return analyzeResult.getTables()
            .stream()
            .map(DocumentTable::getCells)
            .flatMap(Collection::stream)
            .map(DocumentTableCell::getBoundingRegions)
            .flatMap(Collection::stream)
            .filter(table -> table.getPageNumber() == pageInformation.number())
            .map(BoundingRegion::getPolygon)
            .map(QuadPoint::fromPolygons)
            .map(qp -> qp.getTransformed(imageCTM))
            .flatMap(QuadPoint::asLines)
            .toList();
    }

    // Sets the ignore-zone flag on the word position when it overlaps one of the zones.
    private static TextPositionInImage markTextOverlappingIgnoreZone(TextPositionInImage textPositionInImage, List<Rectangle2D> ignoreZones) {
        if (intersectsIgnoreZone(ignoreZones, textPositionInImage)) {
            textPositionInImage.setOverlapsIgnoreZone(true);
        }
        return textPositionInImage;
    }

    /**
     * True when the word's transformed bounding box overlaps any ignore zone by more
     * than half of either the word's area or the zone's area.
     */
    private static boolean intersectsIgnoreZone(List<Rectangle2D> ignoreZones, TextPositionInImage textPositionInImage) {
        for (Rectangle2D ignoreZone : ignoreZones) {
            Rectangle2D textBBox = textPositionInImage.getTransformedTextBBox().getBounds2D();
            if (textBBox.intersects(ignoreZone)) {
                double intersectedArea = calculateIntersectedArea(textBBox, ignoreZone);
                double textArea = textBBox.getWidth() * textBBox.getHeight();
                if (intersectedArea / textArea > 0.5) {
                    return true;
                }
                double ignoreZoneArea = ignoreZone.getWidth() * ignoreZone.getHeight();
                if (intersectedArea / ignoreZoneArea > 0.5) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Area of the intersection of two rectangles; 0 when they do not overlap.
     * (Note: {@code getY()} equals {@code getMinY()} for {@code Rectangle2D}, so the
     * mixed accessor usage below is equivalent.)
     */
    public static double calculateIntersectedArea(Rectangle2D r1, Rectangle2D r2) {
        double xOverlap = Math.max(0, Math.min(r1.getMaxX(), r2.getMaxX()) - Math.max(r1.getMinX(), r2.getMinX()));
        double yOverlap = Math.max(0, Math.min(r1.getMaxY(), r2.getMaxY()) - Math.max(r1.getY(), r2.getY()));
        return xOverlap * yOverlap;
    }

    /**
     * Computes the CTM that maps Azure image coordinates (top-left origin, y down)
     * into PDF page space: scale to page size, mirror the y axis, then undo the
     * page rotation.
     *
     * @param imageWidth width of the analyzed image, used to derive the scale factor
     */
    public static AffineTransform getPageCTM(PageInformation pageInformation, double imageWidth) {
        double scalingFactor = calculateScalingFactor(imageWidth, pageInformation);
        AffineTransform imageToCropBoxScaling = new AffineTransform(scalingFactor, 0, 0, scalingFactor, 0, 0);
        AffineTransform mirrorMatrix = new AffineTransform(1, 0, 0, -1, 0, pageInformation.height());
        AffineTransform rotationMatrix = switch (pageInformation.rotationDegrees()) {
            case 90 -> new AffineTransform(0, 1, -1, 0, pageInformation.height(), 0);
            case 180 -> new AffineTransform(-1, 0, 0, -1, pageInformation.width(), pageInformation.height());
            case 270 -> new AffineTransform(0, -1, 1, 0, pageInformation.width() - pageInformation.height(), pageInformation.height());
            default -> new AffineTransform();
        };
        // matrix multiplication is performed from right to left, so the order is reversed.
        // scaling -> mirror -> rotation
        AffineTransform resultMatrix = new AffineTransform();
        resultMatrix.concatenate(rotationMatrix);
        resultMatrix.concatenate(mirrorMatrix);
        resultMatrix.concatenate(imageToCropBoxScaling);
        return resultMatrix;
    }

    // Image-to-page scale factor; for rotated pages the effective page width is the
    // stored height.
    private static double calculateScalingFactor(double width, PageInformation pageInformation) {
        // PDFBox always returns page height and width based on rotation
        double pageWidth;
        if (pageInformation.rotationDegrees() == 90 || pageInformation.rotationDegrees() == 270) {
            pageWidth = pageInformation.height();
        } else {
            pageWidth = pageInformation.width();
        }
        return pageWidth / width;
    }

    // Looks up the geometry for the given absolute page number.
    @SneakyThrows
    private PageInformation getPageInformation(Integer pageNumber) {
        return pageInformation.get(pageNumber);
    }

    // Bold / italic / handwritten span lookups derived from the Azure styles.
    private record Lookups(SpanLookup<DocumentSpan> bold, SpanLookup<DocumentSpan> italic, SpanLookup<DocumentSpan> handwritten) {

        // Lookups containing no spans at all (used when the result has no styles).
        public static Lookups empty() {
            return new Lookups(new SpanLookup<>(Stream.empty(), Function.identity()),
                new SpanLookup<>(Stream.empty(), Function.identity()),
                new SpanLookup<>(Stream.empty(), Function.identity()));
        }
    }
}

View File

@ -8,7 +8,7 @@ import lombok.SneakyThrows;
public interface FontMetricsProvider extends EmbeddableFont {
default FontMetrics calculateMetricsForAzureBBox(String text, double textWidth, double textHeight) {
default FontMetrics calculateMetrics(String text, double textWidth, double textHeight) {
HeightAndDescent heightAndDescent = calculateHeightAndDescent(text);
float fontSize = calculateFontSize(text, textWidth);
@ -18,16 +18,6 @@ public interface FontMetricsProvider extends EmbeddableFont {
}
default FontMetrics calculateMetricsForTightBBox(String text, double textWidth, double textHeight) {
HeightAndDescent heightAndDescent = calculateHeightAndDescent(text);
float fontSize = calculateFontSize(text, textWidth);
float heightScaling = (float) ((textHeight / (heightAndDescent.height() - heightAndDescent.descent())) * 1000) / fontSize;
return new FontMetrics((heightAndDescent.descent() / 1000) * fontSize, fontSize, heightScaling);
}
@SneakyThrows
default float calculateFontSize(String text, double textWidth) {

View File

@ -12,6 +12,8 @@ import com.azure.ai.documentintelligence.models.DocumentBarcode;
import com.azure.ai.documentintelligence.models.DocumentFigure;
import com.azure.ai.documentintelligence.models.DocumentKeyValuePair;
import com.azure.ai.documentintelligence.models.DocumentLine;
import com.azure.ai.documentintelligence.models.DocumentList;
import com.azure.ai.documentintelligence.models.DocumentListItem;
import com.azure.ai.documentintelligence.models.DocumentParagraph;
import com.azure.ai.documentintelligence.models.DocumentSection;
import com.azure.ai.documentintelligence.models.DocumentTable;
@ -21,8 +23,8 @@ import com.azure.ai.documentintelligence.models.DocumentWord;
import com.azure.ai.documentintelligence.models.ParagraphRole;
import com.knecon.fforesight.service.ocr.processor.model.PageBatch;
import com.knecon.fforesight.service.ocr.processor.model.SpanLookup;
import com.knecon.fforesight.service.ocr.processor.visualizations.utils.LineUtils;
import com.knecon.fforesight.service.ocr.processor.visualizations.utils.Rectangle2DBBoxCollector;
import com.knecon.fforesight.service.ocr.processor.visualizations.utils.LineUtils;
import com.knecon.fforesight.service.ocr.v1.api.model.QuadPoint;
import com.knecon.fforesight.service.viewerdoc.layers.IdpLayerConfig;
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
@ -40,7 +42,7 @@ import lombok.experimental.FieldDefaults;
public class IdpLayer extends IdpLayerConfig {
public static final int LINE_WIDTH = 1;
private Map<Integer, AffineTransform> resultToPageTransform;
private Map<Integer, AffineTransform> pageCtms;
public void addSection(int pageNumber, DocumentSection section, SpanLookup<DocumentWord> wordsOnPage) {
@ -63,7 +65,15 @@ public class IdpLayer extends IdpLayerConfig {
var sectionsOnPage = getOrCreateVisualizationsOnPage(pageNumber, vis);
sectionsOnPage.getColoredRectangles().add(new ColoredRectangle(bbox.getTransformed(resultToPageTransform.get(pageNumber)).getBounds2D(), color, LINE_WIDTH));
sectionsOnPage.getColoredRectangles().add(new ColoredRectangle(bbox.getTransformed(pageCtms.get(pageNumber)).getBounds2D(), color, LINE_WIDTH));
}
public void addList(DocumentList list, PageBatch pageOffset) {
for (DocumentListItem item : list.getItems()) {
addBoundingRegion(item.getBoundingRegions(), lists, PARAGRAPH_COLOR, pageOffset);
}
}
@ -75,27 +85,19 @@ public class IdpLayer extends IdpLayerConfig {
public void addKeyValue(DocumentKeyValuePair keyValue, PageBatch pageOffset) {
if (keyValue.getKey() == null || keyValue.getKey().getContent().isEmpty()) {
return;
}
addBoundingRegion(keyValue.getKey().getBoundingRegions(), keyValuePairs, KEY_COLOR, pageOffset);
if (keyValue.getValue() != null && !keyValue.getValue().getContent().isEmpty()) {
if (keyValue.getValue() != null) {
addBoundingRegion(keyValue.getValue().getBoundingRegions(), keyValuePairs, VALUE_COLOR, pageOffset);
if (keyValue.getKey().getBoundingRegions()
.get(0).getPageNumber() != keyValue.getValue().getBoundingRegions()
.get(0).getPageNumber()) {
if (keyValue.getKey().getBoundingRegions().get(0).getPageNumber() != keyValue.getValue().getBoundingRegions().get(0).getPageNumber()) {
return;
}
int pageNumberWithOffset = pageOffset.getPageNumber(keyValue.getKey().getBoundingRegions()
.get(0).getPageNumber());
QuadPoint key = QuadPoint.fromPolygons(keyValue.getKey().getBoundingRegions()
.get(0).getPolygon());
QuadPoint value = QuadPoint.fromPolygons(keyValue.getValue().getBoundingRegions()
.get(0).getPolygon());
int pageNumberWithOffset = pageOffset.getPageNumber(keyValue.getKey().getBoundingRegions().get(0).getPageNumber());
QuadPoint key = QuadPoint.fromPolygons(keyValue.getKey().getBoundingRegions().get(0).getPolygon());
QuadPoint value = QuadPoint.fromPolygons(keyValue.getValue().getBoundingRegions().get(0).getPolygon());
var line = LineUtils.findClosestMidpointLine(key, value);
line = LineUtils.transform(line, resultToPageTransform.get(pageNumberWithOffset));
line = LineUtils.transform(line, pageCtms.get(pageNumberWithOffset));
var arrowHead = LineUtils.createArrowHead(line, Math.min(LineUtils.length(line), 5));
var linesOnPage = getOrCreateVisualizationsOnPage(pageNumberWithOffset, keyValuePairs).getColoredLines();
linesOnPage.add(new ColoredLine(line, KEY_VALUE_BBOX_COLOR, LINE_WIDTH));
@ -140,7 +142,7 @@ public class IdpLayer extends IdpLayerConfig {
private void addPolygon(int pageNumber, List<Double> polygon, Visualizations visualizations, Color color) {
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, visualizations);
visualizationsOnPage.getColoredLines().addAll(LineUtils.quadPointAsLines(QuadPoint.fromPolygons(polygon).getTransformed(resultToPageTransform.get(pageNumber)), color));
visualizationsOnPage.getColoredLines().addAll(LineUtils.quadPointAsLines(QuadPoint.fromPolygons(polygon).getTransformed(pageCtms.get(pageNumber)), color));
}
@ -179,8 +181,7 @@ public class IdpLayer extends IdpLayerConfig {
var vis = getOrCreateVisualizationsOnPage(pageOffset.getPageNumber(boundingRegion.getPageNumber()), tables);
QuadPoint qp = QuadPoint.fromPolygons(boundingRegion.getPolygon())
.getTransformed(resultToPageTransform.get(pageOffset.getPageNumber(boundingRegion.getPageNumber())));
QuadPoint qp = QuadPoint.fromPolygons(boundingRegion.getPolygon()).getTransformed(pageCtms.get(pageOffset.getPageNumber(boundingRegion.getPageNumber())));
vis.getFilledRectangles().add(new FilledRectangle(qp.getBounds2D(), TITLE_COLOR, 0.2f));

View File

@ -20,9 +20,9 @@ public class IdpLayerFactory {
private final IdpLayer idpLayer;
IdpLayerFactory(Map<Integer, AffineTransform> resultToPageTransform) {
IdpLayerFactory(Map<Integer, AffineTransform> pageCtms) {
this.idpLayer = new IdpLayer(resultToPageTransform);
this.idpLayer = new IdpLayer(pageCtms);
}
@ -65,6 +65,10 @@ public class IdpLayerFactory {
analyzeResult.getTables()
.forEach(documentTable -> idpLayer.addTable(documentTable, pageOffset));
}
if (analyzeResult.getLists() != null) {
analyzeResult.getLists()
.forEach(list -> idpLayer.addList(list, pageOffset));
}
if (analyzeResult.getKeyValuePairs() != null) {
analyzeResult.getKeyValuePairs()
.forEach(keyValue -> idpLayer.addKeyValue(keyValue, pageOffset));

View File

@ -1,241 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.visualizations.layers;
import static com.knecon.fforesight.service.ocr.processor.utils.StringCleaningUtility.cleanString;
import java.awt.geom.AffineTransform;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import com.azure.ai.documentintelligence.models.AnalyzeResult;
import com.azure.ai.documentintelligence.models.BoundingRegion;
import com.azure.ai.documentintelligence.models.DocumentCaption;
import com.azure.ai.documentintelligence.models.DocumentFigure;
import com.azure.ai.documentintelligence.models.DocumentFootnote;
import com.azure.ai.documentintelligence.models.DocumentKeyValuePair;
import com.azure.ai.documentintelligence.models.DocumentTable;
import com.azure.ai.documentintelligence.models.DocumentTableCell;
import com.knecon.fforesight.service.ocr.processor.model.DocumentSpanLookup;
import com.knecon.fforesight.service.ocr.processor.model.PageBatch;
import com.knecon.fforesight.service.ocr.processor.model.PageInformation;
import com.knecon.fforesight.service.ocr.processor.visualizations.utils.Rectangle2DBBoxCollector;
import com.knecon.fforesight.service.ocr.processor.visualizations.utils.RotationCorrectionUtility;
import com.knecon.fforesight.service.ocr.v1.api.model.AzureOcrFeature;
import com.knecon.fforesight.service.ocr.v1.api.model.Figure;
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
import com.knecon.fforesight.service.ocr.v1.api.model.KeyValuePair;
import com.knecon.fforesight.service.ocr.v1.api.model.QuadPoint;
import com.knecon.fforesight.service.ocr.v1.api.model.Region;
import com.knecon.fforesight.service.ocr.v1.api.model.Table;
import com.knecon.fforesight.service.ocr.v1.api.model.TableCell;
import com.knecon.fforesight.service.ocr.v1.api.model.TableCellType;
import com.knecon.fforesight.service.ocr.v1.api.model.TextRegion;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
/**
 * Collects the IDP artefacts (tables, key/value pairs, figures) out of Azure
 * Document Intelligence {@link AnalyzeResult}s into a single {@link IdpResult},
 * converting all bounding polygons from analyze-result coordinates into page
 * coordinates (optionally undoing rotation correction).
 *
 * Thread-safety: the backing IdpResult is created via initSynchronized(), so
 * results from concurrently processed batches can be added in parallel.
 */
@Getter
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class IdpResultFactory {

    IdpResult idpResult;
    // Per-page transform from analyze-result coordinates to page coordinates.
    Map<Integer, AffineTransform> resultToPageTransforms;
    Map<Integer, PageInformation> pageInformation;
    // Per-page detected rotation angles; populated externally (shared map).
    Map<Integer, Double> angles;
    // Whether the ROTATION_CORRECTION feature is active for this run.
    boolean rotationCorrection;

    public IdpResultFactory(Map<Integer, AffineTransform> resultToPageTransforms,
                            Map<Integer, PageInformation> pageInformation,
                            Map<Integer, Double> angles,
                            Set<AzureOcrFeature> features) {
        this.angles = angles;
        this.rotationCorrection = features.contains(AzureOcrFeature.ROTATION_CORRECTION);
        this.resultToPageTransforms = resultToPageTransforms;
        this.pageInformation = pageInformation;
        this.idpResult = IdpResult.initSynchronized();
    }

    /**
     * Returns the transform from analyze-result coordinates to page coordinates
     * for one page. When rotation correction is enabled, the inverse of the
     * detected page angle is applied first so the regions line up with the
     * de-rotated page content.
     */
    public AffineTransform getResultToPageTransform(Integer pageNumber) {
        AffineTransform transform;
        if (rotationCorrection) {
            PageInformation page = pageInformation.get(pageNumber);
            transform = RotationCorrectionUtility.buildTransform(-angles.get(pageNumber), page.cropBox().getWidth(), page.cropBox().getHeight(), false);
        } else {
            transform = new AffineTransform();
        }
        transform.concatenate(resultToPageTransforms.get(pageNumber));
        return transform;
    }

    /**
     * Adds all tables, key/value pairs and figures of one analyze result to the
     * accumulated IdpResult. Null collections (feature not returned by Azure)
     * are skipped.
     */
    public void addAnalyzeResult(AnalyzeResult analyzeResult, PageBatch batch) {
        DocumentSpanLookup words = new DocumentSpanLookup(analyzeResult);
        if (analyzeResult.getTables() != null) {
            analyzeResult.getTables()
                    .forEach(documentTable -> addTable(documentTable, words, batch));
        }
        if (analyzeResult.getKeyValuePairs() != null) {
            analyzeResult.getKeyValuePairs()
                    .forEach(documentKeyValuePair -> addKeyValuePair(documentKeyValuePair, batch));
        }
        if (analyzeResult.getFigures() != null) {
            analyzeResult.getFigures()
                    .forEach(documentFigure -> addFigure(documentFigure, batch, words));
        }
    }

    // Converts one DocumentFigure (caption, bounding box, footnotes) into a Figure.
    private void addFigure(DocumentFigure documentFigure, PageBatch batch, DocumentSpanLookup words) {
        List<TextRegion> footNotes = new LinkedList<>();
        if (documentFigure.getFootnotes() != null) {
            documentFigure.getFootnotes()
                    .stream()
                    .map(footNote -> toTextRegion(footNote, batch))
                    .filter(Objects::nonNull)
                    .forEach(footNotes::add);
        }
        // Page number of the figure is taken from its first bounding region.
        int batchPageNumber = documentFigure.getBoundingRegions()
                .get(0).getPageNumber();
        Region bbox = toRegionFromRegions(batch.getPageNumber(batchPageNumber), documentFigure.getBoundingRegions());
        TextRegion caption = toTextRegion(documentFigure.getCaption(), batch);
        idpResult.figures().add(new Figure(caption, bbox, footNotes));
    }

    // Converts one DocumentKeyValuePair; key and value may each be absent/empty,
    // in which case the corresponding side of the pair is null.
    private void addKeyValuePair(DocumentKeyValuePair documentKeyValuePair, PageBatch batch) {
        TextRegion key = null;
        if (documentKeyValuePair.getKey() != null && !documentKeyValuePair.getKey().getContent().isEmpty()) {
            Region region = toRegionFromRegions(batch, documentKeyValuePair.getKey().getBoundingRegions());
            key = new TextRegion(region, cleanString(documentKeyValuePair.getKey().getContent()));
        }
        TextRegion value = null;
        if (documentKeyValuePair.getValue() != null && !documentKeyValuePair.getValue().getContent().isEmpty()) {
            Region region = toRegionFromRegions(batch, documentKeyValuePair.getValue().getBoundingRegions());
            value = new TextRegion(region, cleanString(documentKeyValuePair.getValue().getContent()));
        }
        idpResult.keyValuePairs().add(new KeyValuePair(key, value));
    }

    // Converts one DocumentTable with its cells, caption, footnotes and bounding boxes.
    private void addTable(DocumentTable documentTable, DocumentSpanLookup words, PageBatch batch) {
        TextRegion caption = toTextRegion(documentTable.getCaption(), batch);
        List<TableCell> tableCells = documentTable.getCells()
                .stream()
                .map(documentTableCell -> toTableCell(documentTableCell, words, batch))
                .toList();
        List<TextRegion> footNotes = new LinkedList<>();
        if (documentTable.getFootnotes() != null) {
            documentTable.getFootnotes()
                    .stream()
                    .map(footNote -> toTextRegion(footNote, batch))
                    .filter(Objects::nonNull)
                    .forEach(footNotes::add);
        }
        List<Region> bbox = documentTable.getBoundingRegions()
                .stream()
                .map(b -> toRegion(b, batch))
                .toList();
        Table table = new Table(caption, documentTable.getColumnCount(), documentTable.getRowCount(), tableCells, footNotes, bbox);
        idpResult.tables().add(table);
    }

    // Footnote -> TextRegion; returns null when there is no footnote or no bounding region.
    private TextRegion toTextRegion(DocumentFootnote footNote, PageBatch batch) {
        if (footNote == null || footNote.getBoundingRegions().isEmpty()) {
            return null;
        }
        Region region = toRegionFromRegions(batch, footNote.getBoundingRegions());
        return new TextRegion(region, cleanString(footNote.getContent()));
    }

    // Caption -> TextRegion; returns null when there is no caption or no bounding region.
    private TextRegion toTextRegion(DocumentCaption caption, PageBatch batch) {
        if (caption == null || caption.getBoundingRegions().isEmpty()) {
            return null;
        }
        Region region = toRegionFromRegions(batch, caption.getBoundingRegions());
        return new TextRegion(region, cleanString(caption.getContent()));
    }

    // One Azure table cell -> project TableCell with mapped cell kind.
    private TableCell toTableCell(DocumentTableCell documentTableCell, DocumentSpanLookup words, PageBatch batch) {
        int batchPageNumber = documentTableCell.getBoundingRegions()
                .get(0).getPageNumber();
        Region region = toRegionFromRegions(batch.getPageNumber(batchPageNumber), documentTableCell.getBoundingRegions());
        TableCellType kind = mapTableCellType(documentTableCell);
        return new TableCell(new TextRegion(region, cleanString(documentTableCell.getContent())), documentTableCell.getRowIndex(), documentTableCell.getColumnIndex(), kind);
    }

    // Maps the Azure cell kind string to the project enum; unknown/null kinds become CONTENT.
    private static TableCellType mapTableCellType(DocumentTableCell documentTableCell) {
        if (documentTableCell.getKind() == null) {
            return TableCellType.CONTENT;
        }
        return switch (documentTableCell.getKind().toString()) {
            case "columnHeader" -> TableCellType.COLUMN_HEADER;
            case "rowHeader" -> TableCellType.ROW_HEADER;
            case "description" -> TableCellType.DESCRIPTION;
            case "stubHead" -> TableCellType.STUB_HEAD;
            default -> TableCellType.CONTENT;
        };
    }

    // Single bounding region -> Region in page coordinates.
    private Region toRegion(BoundingRegion boundingRegion, PageBatch batch) {
        int pageNumber = batch.getPageNumber(boundingRegion.getPageNumber());
        QuadPoint qp = QuadPoint.fromPolygons(boundingRegion.getPolygon()).getTransformed(getResultToPageTransform(pageNumber));
        return new Region(pageNumber, qp.toData());
    }

    // Multiple bounding regions on the same page -> one Region spanning their union bbox.
    private Region toRegionFromRegions(int pageNumber, List<BoundingRegion> regions) {
        if (regions.size() == 1) {
            return new Region(pageNumber, QuadPoint.fromPolygons(regions.get(0).getPolygon()).getTransformed(getResultToPageTransform(pageNumber)).toData());
        }
        QuadPoint bbox = QuadPoint.fromRectangle2D(regions.stream()
                .map(BoundingRegion::getPolygon)
                .map(QuadPoint::fromPolygons)
                .map(qp -> qp.getTransformed(getResultToPageTransform(pageNumber)).getBounds2D())
                .collect(new Rectangle2DBBoxCollector()));
        return new Region(pageNumber, bbox.toData());
    }

    // Variant that derives the page number from the first region; asserts all
    // regions lie on the same (batch-relative) page.
    private Region toRegionFromRegions(PageBatch batch, List<BoundingRegion> regions) {
        assert !regions.isEmpty();
        int batchPageNumber = regions.get(0).getPageNumber();
        if (!regions.stream()
                .map(BoundingRegion::getPageNumber)
                .allMatch(number -> number == batchPageNumber)) {
            throw new AssertionError();
        }
        int pageNumber = batch.getPageNumber(batchPageNumber);
        return toRegionFromRegions(pageNumber, regions);
    }
}

View File

@ -1,77 +1,57 @@
package com.knecon.fforesight.service.ocr.processor.visualizations.layers;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.azure.ai.documentintelligence.models.AnalyzeResult;
import com.knecon.fforesight.service.ocr.processor.OcrServiceSettings;
import com.knecon.fforesight.service.ocr.processor.model.PageBatch;
import com.knecon.fforesight.service.ocr.processor.model.PageInformation;
import com.knecon.fforesight.service.ocr.processor.service.OcrExecutionSupervisor;
import com.knecon.fforesight.service.ocr.processor.service.OcrResultPostProcessingPipeline;
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.ImageProcessingPipeline;
import com.knecon.fforesight.service.ocr.processor.OcrServiceSettings;
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.ImageProcessingSupervisor;
import com.knecon.fforesight.service.ocr.processor.visualizations.WritableOcrResult;
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
import com.knecon.fforesight.service.ocr.v1.api.model.AzureOcrFeature;
import com.knecon.fforesight.service.ocr.processor.visualizations.WritableOcrResultFactory;
import com.knecon.fforesight.service.viewerdoc.layers.LayerGroup;
import lombok.AccessLevel;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class LayerFactory {
OcrExecutionSupervisor supervisor;
OcrResultPostProcessingPipeline ocrResultPostProcessingPipeline;
WritableOcrResultFactory writableOcrResultFactory;
IdpLayerFactory idpLayerFactory;
OcrDebugLayerFactory ocrDebugLayerFactory;
OcrTextLayerFactory ocrTextLayerFactory;
IdpResultFactory idpResultFactory;
OcrServiceSettings settings;
Set<AzureOcrFeature> features;
Map<Integer, Double> angles;
public LayerFactory(OcrServiceSettings settings,
Set<AzureOcrFeature> features,
OcrExecutionSupervisor supervisor,
Map<Integer, PageInformation> pageInformation,
ImageProcessingPipeline imageProcessingPipeline) {
public LayerFactory(OcrServiceSettings settings, OcrExecutionSupervisor supervisor, ImageProcessingSupervisor imageSupervisor, Map<Integer, PageInformation> pageInformation) {
this.ocrResultPostProcessingPipeline = new OcrResultPostProcessingPipeline(pageInformation, imageProcessingPipeline, settings, features);
this.idpLayerFactory = new IdpLayerFactory(ocrResultPostProcessingPipeline.getResultToPageTransforms());
this.writableOcrResultFactory = new WritableOcrResultFactory(pageInformation, settings, imageSupervisor);
this.idpLayerFactory = new IdpLayerFactory(writableOcrResultFactory.getPageCtms());
this.ocrDebugLayerFactory = new OcrDebugLayerFactory();
this.ocrTextLayerFactory = new OcrTextLayerFactory();
this.settings = settings;
this.features = features;
this.supervisor = supervisor;
this.angles = Collections.synchronizedMap(new HashMap<>());
this.idpResultFactory = new IdpResultFactory(ocrResultPostProcessingPipeline.getResultToPageTransforms(), pageInformation, angles, features);
}
public void processAnalyzeResult(PageBatch batch, AnalyzeResult analyzeResult) throws InterruptedException {
List<WritableOcrResult> results = ocrResultPostProcessingPipeline.processAnalyzeResult(analyzeResult, batch);
results.forEach(result -> angles.put(result.getPageNumber(), result.getAngle()));
public void addAnalyzeResult(PageBatch pageRange, AnalyzeResult analyzeResult) throws InterruptedException {
List<WritableOcrResult> results = writableOcrResultFactory.buildOcrResultToWrite(analyzeResult, pageRange);
ocrTextLayerFactory.addWritableOcrResult(results);
if (settings.isDebug()) {
ocrDebugLayerFactory.addAnalysisResult(results);
}
if (features.contains(AzureOcrFeature.IDP)) {
idpLayerFactory.addAnalyzeResult(analyzeResult, batch);
idpResultFactory.addAnalyzeResult(analyzeResult, batch);
if (settings.isIdpEnabled()) {
idpLayerFactory.addAnalyzeResult(analyzeResult, pageRange);
}
this.supervisor.finishMappingResult(batch);
this.supervisor.finishMappingResult(pageRange);
}
@ -84,11 +64,10 @@ public class LayerFactory {
if (settings.isDebug()) {
debugLayers.add(ocrDebugLayerFactory.getOcrDebugLayer());
}
if (features.contains(AzureOcrFeature.IDP)) {
if (settings.isIdpEnabled()) {
debugLayers.add(idpLayerFactory.getIdpLayer());
}
IdpResult idpResult = features.contains(AzureOcrFeature.IDP) ? idpResultFactory.getIdpResult() : null;
return new OcrResult(List.of(ocrTextLayer), debugLayers, angles, idpResult);
return new OcrResult(List.of(ocrTextLayer), debugLayers);
}
}

View File

@ -31,7 +31,7 @@ public class OcrDebugLayer extends OcrDebugLayerConfig {
word.getFontMetricsProvider(),
Optional.of(word.getTextMatrix()),
Optional.of(RenderingMode.FILL)));
bboxOnPage.getColoredLines().addAll(LineUtils.quadPointAsLines(word.getTransformedTextBBox(), word.isSnugBBox()));
bboxOnPage.getColoredLines().addAll(LineUtils.quadPointAsLines(word.getTransformedTextBBox()));
}
@ -57,11 +57,4 @@ public class OcrDebugLayer extends OcrDebugLayerConfig {
};
}
@Override
public boolean isVisibleByDefault() {
return true;
}
}

View File

@ -1,11 +1,9 @@
package com.knecon.fforesight.service.ocr.processor.visualizations.layers;
import java.util.List;
import java.util.Map;
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
import com.knecon.fforesight.service.viewerdoc.layers.LayerGroup;
public record OcrResult(List<LayerGroup> regularLayers, List<LayerGroup> debugLayers, Map<Integer, Double> anglesPerPage, IdpResult idpResult) {
public record OcrResult(List<LayerGroup> regularLayers, List<LayerGroup> debugLayers) {
}

View File

@ -14,19 +14,12 @@ import lombok.experimental.UtilityClass;
@UtilityClass
public class LineUtils {
public List<ColoredLine> quadPointAsLines(QuadPoint rect, boolean tight) {
public List<ColoredLine> quadPointAsLines(QuadPoint rect) {
if (tight) {
return List.of(new ColoredLine(new Line2D.Double(rect.a(), rect.b()), Color.GREEN, 1),
new ColoredLine(new Line2D.Double(rect.b(), rect.c()), Color.GREEN, 1),
new ColoredLine(new Line2D.Double(rect.c(), rect.d()), Color.GREEN, 1),
new ColoredLine(new Line2D.Double(rect.d(), rect.a()), Color.GREEN, 1));
}
return List.of(new ColoredLine(new Line2D.Double(rect.a(), rect.b()), Color.BLUE, 1),
return List.of(new ColoredLine(new Line2D.Double(rect.a(), rect.b()), Color.ORANGE, 1),
new ColoredLine(new Line2D.Double(rect.b(), rect.c()), Color.BLUE, 1),
new ColoredLine(new Line2D.Double(rect.c(), rect.d()), Color.BLUE, 1),
new ColoredLine(new Line2D.Double(rect.d(), rect.a()), Color.BLUE, 1));
new ColoredLine(new Line2D.Double(rect.c(), rect.d()), Color.GREEN, 1),
new ColoredLine(new Line2D.Double(rect.d(), rect.a()), Color.MAGENTA, 1));
}

View File

@ -1,217 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.visualizations.utils;
import java.awt.geom.AffineTransform;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import com.knecon.fforesight.service.viewerdoc.LayerIdentifier;
import com.pdftron.common.PDFNetException;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import com.pdftron.sdf.Obj;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;
/**
 * Rotation correction for skewed pages: coarse rotation (multiples of 90°) is
 * applied via the PDF /Rotate entry, the remaining fine angle is applied by
 * prepending a transformation matrix to the page content stream, wrapped in a
 * marked-content section so the correction can be identified later.
 */
@UtilityClass
public class RotationCorrectionUtility {

    // Marked-content identifier wrapped around the injected rotation commands.
    public static final LayerIdentifier KNECON_ROTATION_CORRECTION = new LayerIdentifier(null, "ROTATION_CORRECTION");

    /**
     * Rotates the given pages of a PDF file and writes the result to outputFile.
     * Works on a temp copy so inputFile and outputFile may be the same path.
     */
    @SneakyThrows
    public void rotatePages(Path inputFile, Path outputFile, Map<Integer, Double> anglesPerPage) {
        Path tmp = Files.createTempFile("tempDocument", ".pdf");
        Files.copy(inputFile, tmp, StandardCopyOption.REPLACE_EXISTING);
        try (var in = new FileInputStream(tmp.toFile()); var out = new FileOutputStream(outputFile.toFile())) {
            rotatePages(in, out, anglesPerPage);
        }
        Files.deleteIfExists(tmp);
    }

    /**
     * Stream variant: applies the per-page angles and saves the document linearized.
     */
    @SneakyThrows
    public void rotatePages(InputStream in, OutputStream out, Map<Integer, Double> anglesPerPage) {
        try (PDFDoc doc = new PDFDoc(in)) {
            anglesPerPage.forEach((pageNumber, angle) -> rotatePage(pageNumber, doc, angle));
            doc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
        }
    }

    /**
     * Rotates a single page: the quadrant part of the angle goes into the page's
     * /Rotate entry, the remainder is injected as content-stream commands.
     * The page's Contents entry is replaced by an array:
     * [rotation prelude, original content, closing "Q"].
     */
    @SneakyThrows
    public void rotatePage(int pageNumber, PDFDoc doc, double angle) {
        int quadrants = getQuadrantRotation(angle);
        Page page = doc.getPage(pageNumber);
        // PDFTron Page rotation values count quadrants, hence mod 4.
        page.setRotation((quadrants + page.getRotation()) % 4);
        double remainingAngle = getRemainingAngle(angle, quadrants);
        Obj contents = page.getContents();
        String content = buildRotationContent(remainingAngle, page);
        Obj rotationStream = doc.createIndirectStream(content.getBytes());
        Obj newContentsArray = doc.createIndirectArray();
        newContentsArray.pushBack(rotationStream);
        addPreviousContents(contents, newContentsArray);
        String closingContent = buildClosingContent();
        Obj closingStream = doc.createIndirectStream(closingContent.getBytes());
        newContentsArray.pushBack(closingStream);
        page.getSDFObj().erase("Contents");
        page.getSDFObj().put("Contents", newContentsArray);
    }

    // Content-stream epilogue: restore graphics state pushed by the prelude's "q".
    private String buildClosingContent() {
        List<String> closingCommands = new LinkedList<>();
        closingCommands.add("Q");
        return String.join("\n", closingCommands);
    }

    // Content-stream prelude: save state, open marked content, then rotate/scale
    // about the crop-box center (translate -> rotate -> scale -> translate back).
    private String buildRotationContent(double angle, Page page) throws PDFNetException {
        List<String> commands = new LinkedList<>();
        double scale = getScalingFactor(angle, page);
        double x = page.getCropBox().getWidth() / 2;
        double y = page.getCropBox().getHeight() / 2;
        commands.add("q");
        commands.add("/%s <<>> BDC".formatted(KNECON_ROTATION_CORRECTION.markedContentName()));
        commands.add(buildMatrixCommands(AffineTransform.getTranslateInstance(x, y)));
        commands.add(buildMatrixCommands(AffineTransform.getRotateInstance(Math.toRadians(angle))));
        commands.add(buildMatrixCommands(AffineTransform.getScaleInstance(scale, scale)));
        commands.add(buildMatrixCommands(AffineTransform.getTranslateInstance(-x, -y)));
        commands.add("EMC");
        return String.join("\n", commands);
    }

    // Appends the page's original Contents (single stream or array of streams)
    // to the new contents array.
    private void addPreviousContents(Obj contents, Obj newContentsArray) throws PDFNetException {
        switch (contents.getType()) {
            case Obj.e_array -> {
                for (int i = 0; i < contents.size(); i++) {
                    newContentsArray.pushBack(contents.getAt(i));
                }
            }
            case Obj.e_stream -> newContentsArray.pushBack(contents);
            default -> throw new IllegalStateException("Unexpected value: " + contents.getType());
        }
    }

    public static double getScalingFactor(double angle, Page page) throws PDFNetException {
        double width = page.getPageWidth();
        double height = page.getPageHeight();
        return getScalingFactor(angle, width, height);
    }

    /**
     * Scale factor so a w×h page rotated by {@code angle} still fits inside the
     * original w×h bounds. Small angles (&lt;20°) are not scaled at all.
     */
    public static double getScalingFactor(double angle, double w, double h) {
        if (Math.abs(angle) < 20) {
            return 1;
        }
        double sin = Math.abs(Math.sin(Math.toRadians(angle)));
        double cos = Math.abs(Math.cos(Math.toRadians(angle)));
        double newWidth = w * cos + h * sin;
        double newHeight = h * cos + w * sin;
        return Math.min(w / newWidth, h / newHeight);
    }

    public static AffineTransform buildTransform(double angle, double originalWidth, double originalHeight) {
        return buildTransform(angle, originalWidth, originalHeight, true);
    }

    /**
     * Builds the coordinate transform equivalent to the rotation correction:
     * optional quadrant rotation (with width/height swap for 90°/270°) followed
     * by the remaining fine rotation + fit-scaling about the page center.
     *
     * @param quadrantRotation when false only the fine-angle part is applied
     *                         (used when /Rotate already handles the quadrant)
     */
    public static AffineTransform buildTransform(double angle, double originalWidth, double originalHeight, boolean quadrantRotation) {
        int quadrants = getQuadrantRotation(angle);
        double h = originalHeight;
        double w = originalWidth;
        AffineTransform quadrantRotationTransform = new AffineTransform();
        if (quadrantRotation) {
            if (quadrants == 1 || quadrants == 3) {
                w = originalHeight;
                h = originalWidth;
            }
            quadrantRotationTransform = switch (quadrants) {
                case 1 -> new AffineTransform(0, 1, -1, 0, h, 0);
                case 2 -> new AffineTransform(-1, 0, 0, -1, w, h);
                case 3 -> new AffineTransform(0, -1, 1, 0, w - h, h);
                default -> new AffineTransform();
            };
        }
        double remainder = getRemainingAngle(angle, quadrants);
        double scale = getScalingFactor(remainder, w, h);
        AffineTransform transform = new AffineTransform();
        transform.translate(w / 2, h / 2);
        transform.rotate(Math.toRadians(remainder));
        transform.scale(scale, scale);
        transform.translate(-w / 2, -h / 2);
        transform.concatenate(quadrantRotationTransform);
        return transform;
    }

    /**
     * Number of 90° quadrants (0-3) closest to the given angle; the angle is
     * first normalized into [0, 360).
     */
    public static int getQuadrantRotation(double angle) {
        double remainder = angle % 360;
        if (remainder < 0) {
            remainder += 360;
        }
        if (remainder > 315 || remainder <= 45) {
            return 0;
        } else if (remainder > 45 && remainder <= 135) {
            return 1;
        } else if (remainder > 135 && remainder <= 225) {
            return 2;
        } else {
            return 3;
        }
    }

    /**
     * Fine angle left after removing the quadrant rotation.
     * NOTE(review): for a non-normalized angle (e.g. 350 with quadrants=0) this
     * returns 350 rather than -10; presumably callers pass angles where the
     * quadrant was derived from the same value — confirm intended range.
     */
    public static double getRemainingAngle(double angle, int quadrants) {
        double referenceAngle = 90 * quadrants;
        return (angle - referenceAngle) % 360;
    }

    public static double getRemainingAngle(double angle) {
        return getRemainingAngle(angle, getQuadrantRotation(angle));
    }

    // Formats an AffineTransform as a PDF "cm" (concat matrix) operator line.
    private String buildMatrixCommands(AffineTransform at) {
        return "%f %f %f %f %f %f cm".formatted(at.getScaleX(), at.getShearX(), at.getShearY(), at.getScaleY(), at.getTranslateX(), at.getTranslateY());
    }
}

View File

@ -4,8 +4,8 @@ import java.io.File;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.LinkedList;
import java.util.List;
import java.util.HashSet;
import java.util.Set;
import org.apache.pdfbox.Loader;
import org.junit.jupiter.api.BeforeEach;
@ -13,16 +13,11 @@ import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.ocr.processor.OcrServiceSettings;
import com.knecon.fforesight.service.ocr.processor.initializer.NativeLibrariesInitializer;
import com.knecon.fforesight.service.ocr.processor.model.PageBatch;
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.GhostScriptService;
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.ImageProcessingPipeline;
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.ImageProcessingService;
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.ImageProcessingSupervisor;
import com.knecon.fforesight.service.ocr.processor.utils.OsUtils;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.PDFNet;
import com.sun.jna.NativeLibrary;
import lombok.SneakyThrows;
@ -36,11 +31,13 @@ class ImageProcessingPipelineTest {
@BeforeEach
public void setup() {
new NativeLibrariesInitializer("demo:1650351709282:7bd235e003000000004ec28a6743e1163a085e2115de2536ab6e2cfe5a", "/home/kschuettler/software/leptonica/vcpkg/installed/x64-linux-dynamic/lib/").init();
System.setProperty("jna.library.path", System.getenv("VCPKG_DYNAMIC_LIB"));
try (NativeLibrary leptonicaLib = NativeLibrary.getInstance("leptonica")) {
assert leptonicaLib != null;
}
OcrServiceSettings settings = new OcrServiceSettings();
ImageProcessingService imageProcessingService = new ImageProcessingService(settings);
GhostScriptService ghostScriptService = new GhostScriptService(settings);
ImageProcessingService imageProcessingService = new ImageProcessingService();
GhostScriptService ghostScriptService = new GhostScriptService();
imageProcessingPipeline = new ImageProcessingPipeline(ghostScriptService, imageProcessingService);
}
@ -49,7 +46,7 @@ class ImageProcessingPipelineTest {
@SneakyThrows
public void testImageProcessingPipeline() {
String fileName = "/home/kschuettler/Dokumente/TestFiles/OCR/VV-331340/VV-331340_OCRED_first15.pdf";
String fileName = "/home/kschuettler/Dokumente/TestFiles/OCR/VV-331340.pdf";
File file;
if (fileName.startsWith("files")) {
@ -66,26 +63,21 @@ class ImageProcessingPipelineTest {
Files.copy(file.toPath(), documentFile, StandardCopyOption.REPLACE_EXISTING);
try (var doc = new PDFDoc(fileName)) {
List<Integer> pageNumbers = new LinkedList<>();
for (int i = 1; i <= doc.getPageCount(); i++) {
if (i % 2 == 0) {
continue;
}
pageNumbers.add(i);
}
PageBatch batch = BatchFactory.create(0, doc, pageNumbers, tmpDir);
ImageProcessingSupervisor supervisor = imageProcessingPipeline.addToPipeline(batch);
batch.forEach(pageNumber -> {
try {
assert supervisor.awaitProcessedPage(pageNumber) != null;
} catch (Exception e) {
e.printStackTrace();
}
});
int numberOfpages;
try (var doc = Loader.loadPDF(file)) {
numberOfpages = doc.getNumberOfPages();
}
Set<Integer> pageNumbers = new HashSet<>();
for (int i = 1; i <= numberOfpages; i++) {
if (i % 2 == 0) {
continue;
}
pageNumbers.add(i);
}
ImageProcessingSupervisor supervisor = imageProcessingPipeline.run(pageNumbers, tmpDir.resolve("images"), documentFile.toFile());
supervisor.awaitAll();
}
}

View File

@ -1,70 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.service;
import static com.knecon.fforesight.service.ocr.processor.visualizations.utils.RotationCorrectionUtility.KNECON_ROTATION_CORRECTION;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.ocr.processor.visualizations.utils.RotationCorrectionUtility;
import com.knecon.fforesight.service.viewerdoc.service.PageContentCleaner;
import com.pdftron.pdf.ElementReader;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.PDFNet;
import com.pdftron.pdf.Page;
import com.pdftron.pdf.PageIterator;
import com.pdftron.sdf.SDFDoc;
import lombok.SneakyThrows;
@Disabled // leptonica is not available in build server
public class PageRotationTest {

    @BeforeAll
    public static void setUp() {
        PDFNet.initialize("demo:1650351709282:7bd235e003000000004ec28a6743e1163a085e2115de2536ab6e2cfe5a");
    }

    /**
     * Manual debugging test: stamps a linearly increasing rotation (-90° .. +90°)
     * onto the first 100 pages of a local sample PDF and writes the rotated copy to /tmp.
     */
    @Test
    public void putRotation() {
        Map<Integer, Double> rotationByPage = new HashMap<>();
        double stepDegrees = (double) 180 / 100;
        for (int pageNumber = 1; pageNumber <= 100; pageNumber++) {
            rotationByPage.put(pageNumber, -90 + (pageNumber * stepDegrees));
        }

        Path sourcePdf = Path.of("/home/kschuettler/Dokumente/TestFiles/OCR/VV-331340-first100.pdf");
        Path rotatedPdf = Path.of("/tmp").resolve(sourcePdf.getFileName() + "_rotated.pdf");
        RotationCorrectionUtility.rotatePages(sourcePdf, rotatedPdf, rotationByPage);
    }

    /**
     * Strips the KNECON_ROTATION_CORRECTION marked content from the previously rotated file
     * (presumably added by the rotation utility in {@code putRotation} — confirm) and saves
     * the cleaned document next to the input as "..._derotated.pdf".
     */
    @Test
    @SneakyThrows
    public void removeRotation() {
        Path rotatedPdf = Path.of("/tmp/VV-331340-first100.pdf_rotated.pdf");

        // Resource order matters: the document must outlive reader/writer/iterator.
        try (var doc = new PDFDoc(rotatedPdf.toFile().toString());
             var reader = new ElementReader();
             var writer = new ElementWriter();
             PageIterator pages = doc.getPageIterator()) {

            PageContentCleaner contentCleaner = PageContentCleaner.builder()
                    .reader(reader)
                    .writer(writer)
                    .markedContentToRemove(Set.of(KNECON_ROTATION_CORRECTION.markedContentName()))
                    .build();

            while (pages.hasNext()) {
                contentCleaner.removeMarkedContent(pages.next());
            }

            String outputPath = rotatedPdf.resolveSibling(rotatedPdf.getFileName() + "_derotated.pdf")
                    .toFile()
                    .toString();
            doc.save(outputPath, SDFDoc.SaveMode.LINEARIZED, null);
        }
    }
}

View File

@ -1,232 +0,0 @@
package com.knecon.fforesight.service.ocr.processor.service;
import java.awt.Color;
import java.awt.geom.AffineTransform;
import java.awt.geom.Line2D;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.nio.file.Path;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import com.azure.ai.documentintelligence.models.AnalyzeResult;
import com.azure.json.JsonOptions;
import com.azure.json.JsonReader;
import com.azure.json.implementation.DefaultJsonReader;
import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService;
import com.knecon.fforesight.service.ocr.processor.OcrServiceSettings;
import com.knecon.fforesight.service.ocr.processor.initializer.NativeLibrariesInitializer;
import com.knecon.fforesight.service.ocr.processor.model.ImageFile;
import com.knecon.fforesight.service.ocr.processor.model.PageInformation;
import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage;
import com.knecon.fforesight.service.ocr.processor.visualizations.WritableOcrResult;
import com.knecon.fforesight.service.ocr.processor.visualizations.layers.OcrDebugLayerFactory;
import com.knecon.fforesight.service.ocr.processor.visualizations.utils.RotationCorrectionUtility;
import com.knecon.fforesight.service.ocr.v1.api.model.QuadPoint;
import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService;
import com.pdftron.pdf.ColorPt;
import com.pdftron.pdf.ColorSpace;
import com.pdftron.pdf.Element;
import com.pdftron.pdf.ElementBuilder;
import com.pdftron.pdf.ElementWriter;
import com.pdftron.pdf.PDFDoc;
import com.pdftron.pdf.Page;
import lombok.SneakyThrows;
@Disabled // leptonica is not available in build server
public class SnugBoxesTest {

    public static final int PAGE_NUMBER = 41;
    public static final Path ORIGIN_FILE = Path.of("/home/kschuettler/Dokumente/TestFiles/OCR/VV-331340-first100.pdf");
    public static final Path TEST_FOLDER = Path.of("/tmp/OCR_TEST/").resolve(ORIGIN_FILE.getFileName());
    public static final Path BATCH_FOLDER = TEST_FOLDER.resolve("batch_0");
    public static final Path DESTINATION_FILE = BATCH_FOLDER.resolve("SnugBoxesTest.pdf");

    PDFTronViewerDocumentService viewerDocumentService = new PDFTronViewerDocumentService(null);

    @BeforeAll
    public static void setUp() {
        // NOTE(review): demo license key and a developer-local library path are hard-coded here;
        // consider reading both from the environment.
        new NativeLibrariesInitializer("demo:1650351709282:7bd235e003000000004ec28a6743e1163a085e2115de2536ab6e2cfe5a", "/home/kschuettler/software/leptonica/vcpkg/installed/x64-linux-dynamic/lib/").init();
    }

    /**
     * Manual debugging test: replays the snug-bounding-box post-processing for a single page
     * of a previously analyzed document, attaches a visible OCR debug layer, and writes the
     * result to {@link #DESTINATION_FILE}.
     * Requires the sample PDF and a persisted {@code analyzeResult.json} in {@link #BATCH_FOLDER}.
     */
    @Test
    @SneakyThrows
    public void snugBoxes() {
        File originFile = ORIGIN_FILE.toFile();
        assert originFile.exists();
        ImageFile imageFile = new ImageFile(PAGE_NUMBER, originFile.toString());

        // Load the persisted Azure analyze result instead of calling the service again.
        AnalyzeResult result;
        try (var in = new FileInputStream(BATCH_FOLDER.resolve("analyzeResult.json").toFile());
             JsonReader reader = DefaultJsonReader.fromStream(in, new JsonOptions())) {
            result = AnalyzeResult.fromJson(reader);
        }
        var resultPage = result.getPages()
                .get(PAGE_NUMBER - 1);

        OcrResultPostProcessingPipeline ocrResultPostProcessingPipeline = new OcrResultPostProcessingPipeline(null, null, new OcrServiceSettings(), Set.of());
        OcrDebugLayerFactory debugLayerFactory = new OcrDebugLayerFactory();

        // Copy the origin file with invisible elements removed; the debug output is written onto the copy.
        InvisibleElementRemovalService invisibleElementRemovalService = new InvisibleElementRemovalService();
        try (var in = new FileInputStream(originFile); var out = new FileOutputStream(DESTINATION_FILE.toFile())) {
            invisibleElementRemovalService.removeInvisibleElements(in, out, false);
        }

        PageInformation pageInformation = getPageInformation(PAGE_NUMBER, DESTINATION_FILE.toFile().toString());
        OcrResultPostProcessingPipeline.Lookups emptyLookups = OcrResultPostProcessingPipeline.Lookups.empty();
        AffineTransform pageCtm = getPageCtm(PAGE_NUMBER, originFile.toString(), resultPage.getWidth());

        List<TextPositionInImage> words = ocrResultPostProcessingPipeline.buildTextWithSnugBBoxes(resultPage, imageFile, pageCtm, emptyLookups, pageInformation);
        var results = new WritableOcrResult(PAGE_NUMBER, -resultPage.getAngle(), words, Collections.emptyList());
        debugLayerFactory.addAnalysisResult(List.of(results));

        viewerDocumentService.addLayerGroups(DESTINATION_FILE.toFile(), DESTINATION_FILE.toFile(), List.of(debugLayerFactory.getOcrDebugLayer()));
        RotationCorrectionUtility.rotatePages(DESTINATION_FILE, DESTINATION_FILE, Map.of(PAGE_NUMBER, -resultPage.getAngle()));
    }

    /**
     * Debug helper (currently unused by the active test path): strokes the outline of each
     * quad point onto the given page as an overlay.
     */
    @SneakyThrows
    private void drawRects(PDFDoc doc, List<QuadPoint> quadPoints, int pageNumber) {
        try (ElementWriter writer = new ElementWriter(); ElementBuilder builder = new ElementBuilder()) {
            Page page = doc.getPage(pageNumber);
            writer.begin(page, ElementWriter.e_overlay);
            for (QuadPoint quadPoint : quadPoints) {
                quadPoint.asLines()
                        .forEach(line -> drawLine(line, builder, writer));
            }
            writer.end();
        }
    }

    /** Writes a single blue, 1pt-stroked (not filled) line element. */
    @SneakyThrows
    private static void drawLine(Line2D l, ElementBuilder builder, ElementWriter writer) {
        float[] rgbComponents = Color.BLUE.getRGBColorComponents(null);
        builder.pathBegin();
        builder.moveTo(l.getX1(), l.getY1());
        builder.lineTo(l.getX2(), l.getY2());
        Element line = builder.pathEnd();
        line.setPathStroke(true);
        line.setPathFill(false);
        line.getGState().setLineWidth(1);
        line.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
        try (ColorPt color = new ColorPt(rgbComponents[0], rgbComponents[1], rgbComponents[2])) {
            line.getGState().setStrokeColor(color);
        }
        writer.writeElement(line);
    }

    /** Builds the transform from OCR result coordinates to page coordinates. */
    @SneakyThrows
    private static AffineTransform getPageCtm(int pageNumber, String file, double imageWidth) {
        return OcrResultPostProcessingPipeline.buildResultToPageTransform(getPageInformation(pageNumber, file), imageWidth);
    }

    /** Reads the {@code PageInformation} of the given page directly from the PDF on disk. */
    @SneakyThrows
    private static PageInformation getPageInformation(int pageNumber, String file) {
        try (var in = new FileInputStream(file); var doc = new PDFDoc(in)) {
            return PageInformation.fromPage(pageNumber, doc.getPage(pageNumber));
        }
    }
}

View File

@ -20,7 +20,7 @@ class Type0FontMetricsProviderTest {
try (PDDocument document = Loader.loadPDF(new File(Type0FontMetricsProviderTest.class.getClassLoader().getResource("InvisibleText.pdf").getPath()))) {
Type0FontMetricsProvider metricsFactory = Type0FontMetricsProvider.regular(document);
FontMetrics fontMetrics = metricsFactory.calculateMetricsForAzureBBox("deine mutter", 100, 50);
FontMetrics fontMetrics = metricsFactory.calculateMetrics("deine mutter", 100, 50);
}
}

View File

@ -8,9 +8,6 @@ plugins {
id("org.sonarqube") version "4.3.0.3225"
id("io.freefair.lombok") version "8.4"
}
pmd {
isConsoleOutput = true
}
configurations {
all {
@ -27,21 +24,15 @@ dependencies {
implementation(project(":azure-ocr-service-api"))
implementation("com.knecon.fforesight:tracing-commons:0.5.0")
implementation("io.github.openfeign:feign-core:12.4")
implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.1.1")
implementation("org.springframework.boot:spring-boot-starter-amqp:${springBootStarterVersion}")
implementation("com.iqser.red.service:persistence-service-internal-api-v1:2.224.0")
implementation("com.knecon.fforesight:tenant-commons:0.31.0")
implementation("com.iqser.red.commons:storage-commons:2.50.0")
implementation("net.logstash.logback:logstash-logback-encoder:7.4")
implementation("ch.qos.logback:logback-classic")
testImplementation("org.springframework.boot:spring-boot-starter-test:${springBootStarterVersion}")
testImplementation("com.iqser.red.commons:test-commons:2.1.0")
testImplementation("org.springframework.amqp:spring-rabbit-test:3.0.2")
testImplementation("com.iqser.red.commons:pdftron-logic-commons:2.32.0")
}
tasks.named<BootBuildImage>("bootBuildImage") {

View File

@ -9,9 +9,11 @@ import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Import;
import org.springframework.scheduling.annotation.EnableAsync;
import com.iqser.red.storage.commons.StorageAutoConfiguration;
import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService;
import com.iqser.red.pdftronlogic.commons.WatermarkRemovalService;
import com.knecon.fforesight.service.ocr.processor.OcrServiceProcessorConfiguration;
import com.knecon.fforesight.service.ocr.v1.server.configuration.MessagingConfiguration;
import com.iqser.red.storage.commons.StorageAutoConfiguration;
import com.knecon.fforesight.tenantcommons.MultiTenancyAutoConfiguration;
import io.micrometer.core.aop.TimedAspect;
@ -41,4 +43,17 @@ public class Application {
}
// Registers the pdftron-logic-commons InvisibleElementRemovalService as a Spring bean
// so it can be injected into the OCR processing components.
@Bean
public InvisibleElementRemovalService invisibleElementRemovalService() {
    return new InvisibleElementRemovalService();
}
// Registers the pdftron-logic-commons WatermarkRemovalService as a Spring bean
// (watermark removal is requested per document — see the removeWatermarks flag on the OCR request).
@Bean
public WatermarkRemovalService watermarkRemovalService() {
    return new WatermarkRemovalService();
}
}

View File

@ -1,9 +1,5 @@
package com.knecon.fforesight.service.ocr.v1.server.configuration;
import org.springframework.amqp.core.DirectExchange;
import org.springframework.amqp.core.Queue;
import org.springframework.amqp.core.QueueBuilder;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import lombok.RequiredArgsConstructor;
@ -12,26 +8,10 @@ import lombok.RequiredArgsConstructor;
@RequiredArgsConstructor
public class MessagingConfiguration {
public static final String OCR_REQUEST_QUEUE_PREFIX = "ocr_request";
public static final String OCR_REQUEST_EXCHANGE = "ocr_request_exchange";
public static final String OCR_DLQ = "ocr_error";
public static final String OCR_RESPONSE_EXCHANGE = "ocr_response_exchange";
public static final String OCR_STATUS_UPDATE_RESPONSE_EXCHANGE = "ocr_status_update_response_exchange";
public static final String OCR_STATUS_UPDATE_DLQ = "ocr_status_update_error";
public static final String OCR_REQUEST_QUEUE = "ocr_request_queue";
public static final String OCR_RESPONSE_QUEUE = "ocr_response_queue";
public static final String OCR_STATUS_UPDATE_RESPONSE_QUEUE = "ocr_status_update_response_queue";
public static final String X_ERROR_INFO_HEADER = "x-error-message";
public static final String X_ERROR_INFO_TIMESTAMP_HEADER = "x-error-message-timestamp";
@Bean
public DirectExchange ocrRequestExchange() {
return new DirectExchange(OCR_REQUEST_EXCHANGE);
}
@Bean
public Queue ocrDLQ() {
return QueueBuilder.durable(OCR_DLQ).build();
}
}

View File

@ -1,32 +0,0 @@
package com.knecon.fforesight.service.ocr.v1.server.configuration;
import static com.knecon.fforesight.service.ocr.v1.server.configuration.MessagingConfiguration.OCR_DLQ;
import static com.knecon.fforesight.service.ocr.v1.server.configuration.MessagingConfiguration.OCR_REQUEST_EXCHANGE;
import static com.knecon.fforesight.service.ocr.v1.server.configuration.MessagingConfiguration.OCR_REQUEST_QUEUE_PREFIX;
import java.util.Map;
import java.util.Set;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import com.knecon.fforesight.service.ocr.v1.server.queue.OcrMessageReceiver;
import com.knecon.fforesight.tenantcommons.model.TenantQueueConfiguration;
import com.knecon.fforesight.tenantcommons.model.TenantQueueProvider;
@Configuration
public class TenantQueueProviderConfig {

    /**
     * Declares the per-tenant queue setup for OCR requests: listener id, request exchange,
     * queue prefix, dead-letter queue, and an x-max-priority argument of 2 on the queue.
     */
    @Bean
    protected TenantQueueProvider getTenantQueueConfigs() {
        TenantQueueConfiguration ocrRequestQueueConfig = TenantQueueConfiguration.builder()
                .listenerId(OcrMessageReceiver.OCR_REQUEST_LISTENER_ID)
                .exchangeName(OCR_REQUEST_EXCHANGE)
                .queuePrefix(OCR_REQUEST_QUEUE_PREFIX)
                .dlqName(OCR_DLQ)
                .arguments(Map.of("x-max-priority", 2))
                .build();
        return new TenantQueueProvider(Set.of(ocrRequestQueueConfig));
    }
}

View File

@ -1,13 +1,10 @@
package com.knecon.fforesight.service.ocr.v1.server.queue;
import java.util.Set;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.ocr.processor.service.IOcrMessageSender;
import com.knecon.fforesight.service.ocr.v1.api.model.AzureOcrFeature;
import com.knecon.fforesight.service.ocr.v1.api.model.DocumentRequest;
import com.knecon.fforesight.service.ocr.v1.server.configuration.MessagingConfiguration;
import com.knecon.fforesight.tenantcommons.TenantContext;
@ -25,24 +22,24 @@ public class NoStatusUpdateOcrMessageSender implements IOcrMessageSender {
RabbitTemplate rabbitTemplate;
public void sendOcrFinished(String fileId, int totalImages, Set<AzureOcrFeature> features) {
public void sendOcrFinished(String fileId, int totalImages) {
}
public void sendOCRStarted(String fileId, Set<AzureOcrFeature> features) {
public void sendOCRStarted(String fileId) {
}
public void sendUpdate(String fileId, int finishedImages, int totalImages, Set<AzureOcrFeature> features) {
public void sendUpdate(String fileId, int finishedImages, int totalImages) {
}
public void sendOcrResponse(DocumentRequest request) {
public void sendOcrResponse(String dossierId, String fileId) {
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_RESPONSE_EXCHANGE, TenantContext.getTenantId(), request);
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_RESPONSE_QUEUE, TenantContext.getTenantId(), new DocumentRequest(dossierId, fileId));
}
}

View File

@ -2,13 +2,11 @@ package com.knecon.fforesight.service.ocr.v1.server.queue;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.OffsetDateTime;
import java.time.temporal.ChronoUnit;
import java.util.Objects;
import java.util.stream.Collectors;
import org.slf4j.MDC;
import org.springframework.amqp.AmqpRejectAndDontRequeueException;
import org.springframework.amqp.core.Message;
import org.springframework.amqp.rabbit.annotation.RabbitHandler;
@ -17,8 +15,7 @@ import org.springframework.stereotype.Service;
import org.springframework.util.FileSystemUtils;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.knecon.fforesight.service.ocr.processor.utils.OsUtils;
import com.knecon.fforesight.service.ocr.v1.server.FileStorageService;
import com.knecon.fforesight.service.ocr.processor.service.FileStorageService;
import com.knecon.fforesight.service.ocr.processor.service.IOcrMessageSender;
import com.knecon.fforesight.service.ocr.processor.service.OCRService;
import com.knecon.fforesight.service.ocr.v1.api.model.DocumentRequest;
@ -35,11 +32,6 @@ import lombok.extern.slf4j.Slf4j;
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class OcrMessageReceiver {
public static final String OCR_REQUEST_LISTENER_ID = "ocr-request-listener";
public static final String IDP_RESULT_FILE_NAME = "idpResult.json";
public static final String VIEWER_DOCUMENT_FILE_NAME = "viewerDocument.pdf";
public static final String DOCUMENT_FILE_NAME = "document.pdf";
FileStorageService fileStorageService;
ObjectMapper objectMapper;
OCRService ocrService;
@ -47,7 +39,7 @@ public class OcrMessageReceiver {
@RabbitHandler
@RabbitListener(id = OCR_REQUEST_LISTENER_ID, concurrency = "1")
@RabbitListener(queues = MessagingConfiguration.OCR_REQUEST_QUEUE, concurrency = "1")
public void receiveOcr(Message in) throws IOException {
if (in.getMessageProperties().isRedelivered()) {
@ -57,35 +49,32 @@ public class OcrMessageReceiver {
DocumentRequest request = objectMapper.readValue(in.getBody(), DocumentRequest.class);
String dossierId = request.getDossierId();
String fileId = request.getFileId();
Path runDir = Path.of(OsUtils.getTemporaryDirectory()).resolve(request.getDossierId()).resolve(request.getFileId());
Path tmpDir = Files.createTempDirectory(null);
try {
MDC.put("fileId", fileId);
log.info("--------------------------------- Starting OCR ---------------------------------");
log.info("Features: {}", request.getFeatures().stream().map(Objects::toString).collect(Collectors.joining(", ")));
ocrMessageSender.sendOCRStarted(fileId, request.getFeatures());
log.info("--------------------------------------------------------------------------");
log.info("Start ocr for file with dossierId {} and fileId {}", dossierId, fileId);
File documentFile = runDir.resolve(DOCUMENT_FILE_NAME).toFile();
File viewerDocumentFile = runDir.resolve(VIEWER_DOCUMENT_FILE_NAME).toFile();
File idpResultFile = runDir.resolve(IDP_RESULT_FILE_NAME).toFile();
ocrMessageSender.sendOCRStarted(fileId);
File documentFile = tmpDir.resolve("document.pdf").toFile();
File viewerDocumentFile = tmpDir.resolve("viewerDocument.pdf").toFile();
File analyzeResultFile = tmpDir.resolve("azureAnalysisResult.json").toFile();
fileStorageService.downloadFiles(request, documentFile);
ocrService.runOcrOnDocument(dossierId, fileId, request.getFeatures(), runDir, documentFile, viewerDocumentFile, idpResultFile);
ocrService.runOcrOnDocument(dossierId, fileId, request.isRemoveWatermarks(), tmpDir, documentFile, viewerDocumentFile, analyzeResultFile);
fileStorageService.storeFiles(request, documentFile, viewerDocumentFile, idpResultFile);
fileStorageService.storeFiles(request, documentFile, viewerDocumentFile, analyzeResultFile);
ocrMessageSender.sendOcrResponse(request);
ocrMessageSender.sendOcrResponse(dossierId, fileId);
} catch (Exception e) {
log.warn("An exception occurred in ocr file stage: {}", e.getMessage());
in.getMessageProperties().getHeaders().put(MessagingConfiguration.X_ERROR_INFO_HEADER, e.getMessage());
in.getMessageProperties().getHeaders().put(MessagingConfiguration.X_ERROR_INFO_TIMESTAMP_HEADER, OffsetDateTime.now().truncatedTo(ChronoUnit.MILLIS));
throw new RuntimeException(e);
} finally {
log.info("Done");
log.info("--------------------------------- Done ---------------------------------");
MDC.remove("fileId");
FileSystemUtils.deleteRecursively(runDir);
FileSystemUtils.deleteRecursively(tmpDir);
}
}

View File

@ -1,13 +1,10 @@
package com.knecon.fforesight.service.ocr.v1.server.queue;
import java.util.Set;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.ocr.processor.service.IOcrMessageSender;
import com.knecon.fforesight.service.ocr.v1.api.model.AzureOcrFeature;
import com.knecon.fforesight.service.ocr.v1.api.model.DocumentRequest;
import com.knecon.fforesight.service.ocr.v1.api.model.OCRStatusUpdateResponse;
import com.knecon.fforesight.service.ocr.v1.server.configuration.MessagingConfiguration;
@ -28,46 +25,35 @@ public class OcrMessageSender implements IOcrMessageSender {
RabbitTemplate rabbitTemplate;
public void sendOcrFinished(String fileId, int totalImages, Set<AzureOcrFeature> features) {
public void sendOcrFinished(String fileId, int totalImages) {
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_EXCHANGE,
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
TenantContext.getTenantId(),
OCRStatusUpdateResponse.builder()
.fileId(fileId)
.numberOfPagesToOCR(totalImages)
.numberOfOCRedPages(totalImages)
.ocrFinished(true)
.features(features)
.build());
OCRStatusUpdateResponse.builder().fileId(fileId).numberOfPagesToOCR(totalImages).numberOfOCRedPages(totalImages).ocrFinished(true).build());
}
public void sendOCRStarted(String fileId, Set<AzureOcrFeature> features) {
public void sendOCRStarted(String fileId) {
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_EXCHANGE,
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
TenantContext.getTenantId(),
OCRStatusUpdateResponse.builder().fileId(fileId).features(features).ocrStarted(true).build());
OCRStatusUpdateResponse.builder().fileId(fileId).ocrStarted(true).build());
}
public void sendUpdate(String fileId, int finishedImages, int totalImages, Set<AzureOcrFeature> features) {
public void sendUpdate(String fileId, int finishedImages, int totalImages) {
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_EXCHANGE,
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
TenantContext.getTenantId(),
OCRStatusUpdateResponse.builder()
.fileId(fileId)
.features(features)
.numberOfPagesToOCR(totalImages)
.numberOfOCRedPages(finishedImages)
.build());
OCRStatusUpdateResponse.builder().fileId(fileId).numberOfPagesToOCR(totalImages).numberOfOCRedPages(finishedImages).build());
}
public void sendOcrResponse(DocumentRequest request) {
public void sendOcrResponse(String dossierId, String fileId) {
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_RESPONSE_EXCHANGE, TenantContext.getTenantId(), request);
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_RESPONSE_QUEUE, TenantContext.getTenantId(), new DocumentRequest(dossierId, fileId));
}
}

View File

@ -1,5 +1,5 @@
# you can list packages
ghostscript=9.55.0~dfsg1-0ubuntu5.10
ghostscript=9.55.0~dfsg1-0ubuntu5.9
pkg-config
zip
unzip

View File

@ -63,5 +63,3 @@ azure:
ocrService:
sendStatusUpdates: true
native-libs.path: ${VCPKG_DYNAMIC_LIB}

View File

@ -7,21 +7,11 @@
<include resource="org/springframework/boot/logging/logback/console-appender.xml"/>
<appender name="JSON" class="ch.qos.logback.core.ConsoleAppender">
<encoder class="net.logstash.logback.encoder.LogstashEncoder">
<pattern>%d{yyyy-MM-dd HH:mm:ss}%replace( [file:%X{fileId}]){' \[file:\]', ''} [%thread] %-5level%logger{36} - %msg%n</pattern>
</encoder>
</appender>
<appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender">
<encoder>
<pattern>%d{yyyy-MM-dd HH:mm:ss}%replace( [file:%X{fileId}]){' \[file:\]', ''} [%thread] %-5level%logger{36} - %msg%n</pattern>
</encoder>
<encoder class="net.logstash.logback.encoder.LogstashEncoder"/>
</appender>
<root level="INFO">
<appender-ref ref="${logType}"/>
</root>
<logger name="com.iqser.red.pdftronlogic.commons" level="ERROR"/>
</configuration>

View File

@ -1,50 +0,0 @@
package com.knecon.fforesight.service.ocr.v1.api.model;
import static org.junit.jupiter.api.Assertions.*;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import org.junit.jupiter.api.Test;
class QuadPointTest {

    /** A unit square quad reports horizontal orientation and contains its corners and interior. */
    @Test
    public void testContains() {
        var corner1 = new Point2D.Double(0, 0);
        var corner2 = new Point2D.Double(0, 1);
        var corner3 = new Point2D.Double(1, 1);
        var corner4 = new Point2D.Double(1, 0);
        var quad = new QuadPoint(corner1, corner2, corner3, corner4);

        assertTrue(quad.isHorizontal());
        assertFalse(quad.isVertical());

        // every corner lies within the quad
        assertTrue(quad.contains(corner1));
        assertTrue(quad.contains(corner2));
        assertTrue(quad.contains(corner3));
        assertTrue(quad.contains(corner4));

        // an interior point and a fully enclosed rectangle are contained as well
        assertTrue(quad.contains(new Point2D.Double(0.5, 0.5)));
        assertTrue(quad.contains(new Rectangle2D.Double(0.5, 0.5, 0.1, 0.1)));
    }

    /** A slanted quad reports direction RIGHT and the expected center point. */
    @Test
    public void testCenter() {
        var quad = new QuadPoint(
                new Point2D.Double(0, 0),
                new Point2D.Double(1, 1),
                new Point2D.Double(2, 1),
                new Point2D.Double(1, 0));

        assertTrue(quad.isHorizontal());
        assertFalse(quad.isVertical());
        assertEquals(QuadPoint.Direction.RIGHT, quad.getDirection());
        assertEquals(new Point2D.Double(1, 0.5), quad.getCenter());
    }
}

View File

@ -1,9 +1,6 @@
package com.knecon.fforesight.service.ocr.v1.server;
import static org.assertj.core.api.Assertions.assertThat;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.AfterEach;
@ -11,10 +8,7 @@ import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.MockitoAnnotations;
import org.mockito.junit.jupiter.MockitoExtension;
import org.springframework.amqp.rabbit.core.RabbitAdmin;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.amqp.rabbit.listener.MessageListenerContainer;
import org.springframework.amqp.rabbit.listener.RabbitListenerEndpointRegistry;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration;
@ -58,9 +52,6 @@ public class AbstractTest {
@MockBean
protected RabbitTemplate rabbitTemplate;
@MockBean
private RabbitAdmin rabbitAdmin;
private static String pdftronLicense;
@ -109,16 +100,6 @@ public class AbstractTest {
@ComponentScan(excludeFilters = {@ComponentScan.Filter(type = FilterType.ASSIGNABLE_TYPE, value = StorageAutoConfiguration.class)})
public static class TestConfiguration {
@Bean
public RabbitListenerEndpointRegistry rabbitListenerEndpointRegistry() {
var mock = mock(RabbitListenerEndpointRegistry.class);
when(mock.getListenerContainer(any())).thenReturn(mock(MessageListenerContainer.class));
return mock;
}
@Bean
@Primary
public StorageService inMemoryStorage() {

View File

@ -1,9 +1,6 @@
package com.knecon.fforesight.service.ocr.v1.server;
import static com.iqser.red.pdftronlogic.commons.PdfTextExtraction.extractAllTextFromDocument;
import static com.knecon.fforesight.service.ocr.v1.server.queue.OcrMessageReceiver.DOCUMENT_FILE_NAME;
import static com.knecon.fforesight.service.ocr.v1.server.queue.OcrMessageReceiver.IDP_RESULT_FILE_NAME;
import static com.knecon.fforesight.service.ocr.v1.server.queue.OcrMessageReceiver.VIEWER_DOCUMENT_FILE_NAME;
import java.io.File;
import java.io.FileInputStream;
@ -12,28 +9,23 @@ import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.Comparator;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.slf4j.MDC;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.ocr.processor.service.OCRService;
import com.knecon.fforesight.service.ocr.processor.utils.OsUtils;
import com.knecon.fforesight.service.ocr.v1.api.model.AzureOcrFeature;
import lombok.SneakyThrows;
// in order to run, the azure.key must be set first in the application.yml and you must set the env variable VCPKG_DYNAMIC_LIB to your tesseract and leptonica installation folder
@Disabled
@Disabled // in order to run, the azure.key must be set first in the application.yml
@SpringBootTest()
public class OcrServiceIntegrationTest extends AbstractTest {
public static final Set<AzureOcrFeature> FEATURES = Set.of(AzureOcrFeature.ROTATION_CORRECTION, AzureOcrFeature.FONT_STYLE_DETECTION, AzureOcrFeature.IDP);
@Autowired
private OCRService ocrService;
@ -42,7 +34,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
@SneakyThrows
public void testOcrWith2000PageFile() {
testOCR("/home/kschuettler/Dokumente/TestFiles/OCR/brokenText.pdf");
testOCR("/home/kschuettler/Dokumente/TestFiles/OCR/VV-331340-first100.pdf");
}
@ -58,7 +50,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
@SneakyThrows
public void testOcrWithFile() {
testOCR("/home/kschuettler/Dokumente/TestFiles/OCR/TestSet/VV-331340-first100.pdf");
testOCR("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/1.A16148F - Toxicidade oral aguda.pdf");
}
@ -66,7 +58,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
@SneakyThrows
public void testOcrWithFolder() {
String dir = "/home/kschuettler/Dokumente/TestFiles/OCR/TestSet";
String dir = "/home/kschuettler/Dokumente/TestFiles/BASF/Documine_Test_docs/2013-1110704.pdf";
List<File> foundFiles = Files.walk(Path.of(dir))
.sorted(Comparator.comparingLong(this::getFileSize))
.map(Path::toFile)
@ -105,21 +97,18 @@ public class OcrServiceIntegrationTest extends AbstractTest {
@SneakyThrows
private String testOCR(File file) {
MDC.put("fileId", "test");
Path tmpDir = Path.of(OsUtils.getTemporaryDirectory()).resolve("OCR_TEST").resolve(file.toPath().getFileName());
assert tmpDir.toFile().exists() || tmpDir.toFile().mkdirs();
var documentFile = tmpDir.resolve(Path.of(DOCUMENT_FILE_NAME));
var viewerDocumentFile = tmpDir.resolve(Path.of(VIEWER_DOCUMENT_FILE_NAME));
var analyzeResultFile = tmpDir.resolve(Path.of(IDP_RESULT_FILE_NAME));
var documentFile = tmpDir.resolve(Path.of("document.pdf"));
var viewerDocumentFile = tmpDir.resolve(Path.of("viewerDocument.pdf"));
var analyzeResultFile = tmpDir.resolve(Path.of("azureAnalysisResult.json"));
Files.copy(file.toPath(), documentFile, StandardCopyOption.REPLACE_EXISTING);
Files.copy(file.toPath(), viewerDocumentFile, StandardCopyOption.REPLACE_EXISTING);
ocrService.runOcrOnDocument(TEST_DOSSIER_ID, "file", FEATURES, tmpDir, documentFile.toFile(), viewerDocumentFile.toFile(), analyzeResultFile.toFile());
MDC.remove("fileId");
ocrService.runOcrOnDocument(TEST_DOSSIER_ID, "file", false, tmpDir, documentFile.toFile(), viewerDocumentFile.toFile(), analyzeResultFile.toFile());
System.out.println("File:" + documentFile);
System.out.println("\n\n");
try (var fileStream = new FileInputStream(documentFile.toFile())) {

View File

@ -2,16 +2,12 @@ persistence-service.url: "http://persistence-service-v1:8080"
pdftron.license: demo:1650351709282:7bd235e003000000004ec28a6743e1163a085e2115de2536ab6e2cfe5a
azure:
endpoint: https://ff-ocr-dev.cognitiveservices.azure.com/
key: 444fe2f83e9c48da8e588c7bd5295309 # find key in Bitwarden under: Azure IDP Test Key
native-libs:
endpoint: https://ff-ocr-test.cognitiveservices.azure.com/
key: # find key in Bitwarden under: Azure IDP Test Key
logging.type: ${LOGGING_TYPE:CONSOLE}
ocrService:
sendStatusUpdates: false
debug: true
ocrService.sendStatusUpdates: false
management:
endpoint:
@ -21,5 +17,4 @@ management:
endpoints.web.exposure.include: prometheus, health, metrics
metrics.export.prometheus.enabled: true
POD_NAME: azure-ocr-service
native-libs.path: /home/kschuettler/software/leptonica/vcpkg/installed/x64-linux-dynamic/lib/
POD_NAME: azure-ocr-service

View File

@ -15,7 +15,6 @@
<exclude name="NullAssignment"/>
<exclude name="AssignmentInOperand"/>
<exclude name="BeanMembersShouldSerialize"/>
<exclude name="AvoidFieldNameMatchingMethodName"/>
</rule>
</ruleset>

View File

@ -17,7 +17,6 @@
<exclude name="AssignmentInOperand"/>
<exclude name="TestClassWithoutTestCases"/>
<exclude name="BeanMembersShouldSerialize"/>
<exclude name="AvoidFieldNameMatchingMethodName"/>
</rule>
</ruleset>