Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
189bd8e979 |
@ -22,5 +22,4 @@ deploy:
|
||||
rules:
|
||||
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
|
||||
- if: $CI_COMMIT_BRANCH =~ /^release/
|
||||
- if: $CI_COMMIT_BRANCH =~ /^feature/
|
||||
- if: $CI_COMMIT_TAG
|
||||
|
||||
@ -70,7 +70,7 @@ int concurrency = 8;
|
||||
int batchSize = 128;
|
||||
boolean debug; // writes the ocr layer visibly to the viewer doc pdf
|
||||
boolean idpEnabled; // Enables table detection, paragraph classification, section detection, key-value detection.
|
||||
boolean drawTablesAsLines; // writes the tables to the PDF as invisible lines.
|
||||
boolean tableDetection; // writes the tables to the PDF as invisible lines.
|
||||
boolean processAllPages; // if this parameter is set, ocr will be performed on any page, regardless if it has images or not
|
||||
boolean fontStyleDetection; // Enables bold detection using ghostscript and leptonica
|
||||
String contentFormat; // Either markdown or text. But, for whatever reason, with markdown enabled, key-values are not written by azure....
|
||||
|
||||
@ -0,0 +1,25 @@
|
||||
package com.knecon.fforesight.service.ocr.v1.api.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Getter
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class AzureAnalyzeResult {
|
||||
|
||||
@Builder.Default
|
||||
List<KeyValuePair> keyValuePairs = new ArrayList<>();
|
||||
@Builder.Default
|
||||
List<TextRegion> handWrittenText = new ArrayList<>();
|
||||
@Builder.Default
|
||||
List<Figure> figures = new ArrayList<>();
|
||||
|
||||
}
|
||||
@ -1,11 +0,0 @@
|
||||
package com.knecon.fforesight.service.ocr.v1.api.model;
|
||||
|
||||
public enum AzureOcrFeature {
|
||||
|
||||
ROTATION_CORRECTION,
|
||||
IDP,
|
||||
FONT_STYLE_DETECTION,
|
||||
ALL_PAGES,
|
||||
REMOVE_WATERMARKS
|
||||
|
||||
}
|
||||
@ -1,8 +1,6 @@
|
||||
package com.knecon.fforesight.service.ocr.v1.api.model;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
@ -20,13 +18,12 @@ public class DocumentRequest {
|
||||
|
||||
String dossierId;
|
||||
String fileId;
|
||||
boolean removeWatermark;
|
||||
|
||||
String originDocumentId;
|
||||
String viewerDocId;
|
||||
String idpResultId;
|
||||
|
||||
Set<AzureOcrFeature> features;
|
||||
boolean removeWatermarks;
|
||||
|
||||
|
||||
public DocumentRequest(String dossierId, String fileId) {
|
||||
@ -36,23 +33,18 @@ public class DocumentRequest {
|
||||
originDocumentId = null;
|
||||
viewerDocId = null;
|
||||
idpResultId = null;
|
||||
features = Collections.emptySet();
|
||||
removeWatermarks = false;
|
||||
}
|
||||
|
||||
|
||||
// needed for backwards compatibility
|
||||
public DocumentRequest(String dossierId, String fileId, boolean removeWatermark) {
|
||||
public DocumentRequest(String dossierId, String fileId, boolean removeWatermarks) {
|
||||
|
||||
this.dossierId = dossierId;
|
||||
this.fileId = fileId;
|
||||
this.removeWatermarks = removeWatermarks;
|
||||
originDocumentId = null;
|
||||
viewerDocId = null;
|
||||
idpResultId = null;
|
||||
if (removeWatermark) {
|
||||
features = Set.of(AzureOcrFeature.REMOVE_WATERMARKS);
|
||||
} else {
|
||||
features = Collections.emptySet();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -73,10 +65,4 @@ public class DocumentRequest {
|
||||
return Optional.ofNullable(originDocumentId);
|
||||
}
|
||||
|
||||
|
||||
public Set<AzureOcrFeature> getFeatures() {
|
||||
|
||||
return features == null ? Collections.emptySet() : features;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,11 +1,10 @@
|
||||
package com.knecon.fforesight.service.ocr.v1.api.model;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
|
||||
import lombok.Builder;
|
||||
|
||||
@Builder
|
||||
public record Figure(TextRegion caption, Region image, List<TextRegion> footnotes) {
|
||||
public record Figure(Optional<TextRegion> caption, Region image) {
|
||||
|
||||
}
|
||||
|
||||
@ -1,23 +0,0 @@
|
||||
package com.knecon.fforesight.service.ocr.v1.api.model;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
public record IdpResult(List<KeyValuePair> keyValuePairs, List<TextRegion> handWrittenText, List<Figure> figures, List<Table> tables) {
|
||||
|
||||
public static IdpResult initSynchronized() {
|
||||
|
||||
return new IdpResult(Collections.synchronizedList(new LinkedList<>()),
|
||||
Collections.synchronizedList(new LinkedList<>()),
|
||||
Collections.synchronizedList(new LinkedList<>()),
|
||||
Collections.synchronizedList(new LinkedList<>()));
|
||||
}
|
||||
|
||||
|
||||
public static IdpResult empty() {
|
||||
|
||||
return new IdpResult(Collections.emptyList(), Collections.emptyList(), Collections.emptyList(), Collections.emptyList());
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,8 +1,5 @@
|
||||
package com.knecon.fforesight.service.ocr.v1.api.model;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.Set;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
@ -15,16 +12,9 @@ import lombok.NoArgsConstructor;
|
||||
public class OCRStatusUpdateResponse {
|
||||
|
||||
private String fileId;
|
||||
private Set<AzureOcrFeature> features;
|
||||
private int numberOfPagesToOCR;
|
||||
private int numberOfOCRedPages;
|
||||
private boolean ocrFinished;
|
||||
private boolean ocrStarted;
|
||||
|
||||
|
||||
public Set<AzureOcrFeature> getFeatures() {
|
||||
|
||||
return features == null ? Collections.emptySet() : features;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -5,107 +5,29 @@ import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import lombok.Getter;
|
||||
public record QuadPoint(Point2D a, Point2D b, Point2D c, Point2D d) {
|
||||
|
||||
public final class QuadPoint {
|
||||
|
||||
public enum Direction {
|
||||
RIGHT,
|
||||
/*
|
||||
B _____ C
|
||||
| |
|
||||
A|_____|D
|
||||
*/
|
||||
DOWN,
|
||||
/*
|
||||
* A _____ B
|
||||
* | |
|
||||
* D|_____|C
|
||||
*/
|
||||
LEFT,
|
||||
/*
|
||||
* D _____ A
|
||||
* | |
|
||||
* C|_____|B
|
||||
* */
|
||||
UP,
|
||||
/*
|
||||
* C _____ D
|
||||
* | |
|
||||
* B|_____|A
|
||||
*/
|
||||
NONE
|
||||
/*
|
||||
* ? _____ ?
|
||||
* | |
|
||||
* ?|_____|?
|
||||
*/
|
||||
}
|
||||
|
||||
private static final double THRESHOLD_ANGLE = Math.toRadians(5); // QuadPoint is considered straight, when its angles are below this threshold.
|
||||
|
||||
private final Point2D a;
|
||||
private final Point2D b;
|
||||
private final Point2D c;
|
||||
private final Point2D d;
|
||||
@Getter
|
||||
private final Direction direction;
|
||||
|
||||
|
||||
// This constructor assumes, the points form a convex polygon, I will omit the assertion for performance reasons.
|
||||
public QuadPoint(Point2D a, Point2D b, Point2D c, Point2D d) {
|
||||
|
||||
this.a = a;
|
||||
this.b = b;
|
||||
this.c = c;
|
||||
this.d = d;
|
||||
this.direction = calculateDirection();
|
||||
}
|
||||
|
||||
|
||||
private Direction calculateDirection() {
|
||||
|
||||
if (isHorizontal()) {
|
||||
return a.getX() < d.getX() ? Direction.RIGHT : Direction.LEFT;
|
||||
}
|
||||
if (isVertical()) {
|
||||
return a.getY() < d.getY() ? Direction.UP : Direction.DOWN;
|
||||
}
|
||||
return Direction.NONE;
|
||||
}
|
||||
/*
|
||||
B _____ C
|
||||
| |
|
||||
A|_____|D
|
||||
*/
|
||||
|
||||
|
||||
public static QuadPoint fromRectangle2D(Rectangle2D rectangle2D) {
|
||||
|
||||
return fromRectangle2D(rectangle2D, Direction.NONE);
|
||||
}
|
||||
|
||||
|
||||
public static QuadPoint fromRectangle2D(Rectangle2D rectangle2D, Direction direction) {
|
||||
|
||||
var lowerLeft = new Point2D.Double(rectangle2D.getX(), rectangle2D.getY());
|
||||
var upperLeft = new Point2D.Double(rectangle2D.getX(), rectangle2D.getMaxY());
|
||||
var upperRight = new Point2D.Double(rectangle2D.getMaxX(), rectangle2D.getMaxY());
|
||||
var lowerRight = new Point2D.Double(rectangle2D.getMaxX(), rectangle2D.getY());
|
||||
|
||||
return switch (direction) {
|
||||
case DOWN -> new QuadPoint(upperLeft, upperRight, lowerRight, lowerLeft);
|
||||
case LEFT -> new QuadPoint(upperRight, lowerRight, lowerLeft, upperLeft);
|
||||
case UP -> new QuadPoint(lowerRight, lowerLeft, upperLeft, upperRight);
|
||||
default -> new QuadPoint(lowerLeft, upperLeft, upperRight, lowerRight);
|
||||
};
|
||||
|
||||
return new QuadPoint(new Point2D.Double(rectangle2D.getX(), rectangle2D.getY()),
|
||||
new Point2D.Double(rectangle2D.getX(), rectangle2D.getMaxY()),
|
||||
new Point2D.Double(rectangle2D.getMaxX(), rectangle2D.getMaxY()),
|
||||
new Point2D.Double(rectangle2D.getMaxX(), rectangle2D.getY()));
|
||||
}
|
||||
|
||||
|
||||
public static QuadPoint fromPolygons(List<Double> polygon) {
|
||||
|
||||
if (polygon.size() != 8) {
|
||||
throw new AssertionError();
|
||||
}
|
||||
assert polygon.size() == 8;
|
||||
return new QuadPoint(new Point2D.Double(polygon.get(0), polygon.get(1)),
|
||||
new Point2D.Double(polygon.get(6), polygon.get(7)),
|
||||
new Point2D.Double(polygon.get(4), polygon.get(5)),
|
||||
@ -134,23 +56,6 @@ public final class QuadPoint {
|
||||
}
|
||||
|
||||
|
||||
public boolean isHorizontal() {
|
||||
|
||||
double angle = calculateAngle(a, d);
|
||||
double angle2 = calculateAngle(b, c);
|
||||
return Math.abs(angle) <= THRESHOLD_ANGLE || Math.abs(angle2) <= THRESHOLD_ANGLE;
|
||||
}
|
||||
|
||||
|
||||
public boolean isVertical() {
|
||||
|
||||
double rightAngle = Math.PI / 2;
|
||||
double angle = calculateAngle(a, d);
|
||||
double angle2 = calculateAngle(b, c);
|
||||
return Math.abs(rightAngle - Math.abs(angle)) <= THRESHOLD_ANGLE || Math.abs(rightAngle - Math.abs(angle2)) <= THRESHOLD_ANGLE;
|
||||
}
|
||||
|
||||
|
||||
public Stream<Line2D> asLines() {
|
||||
|
||||
return Stream.of(new Line2D.Double(a(), b()), new Line2D.Double(b(), c()), new Line2D.Double(c(), d()), new Line2D.Double(d(), a()));
|
||||
@ -158,7 +63,7 @@ public final class QuadPoint {
|
||||
}
|
||||
|
||||
|
||||
public QuadPointData toData() {
|
||||
public QuadPointData data() {
|
||||
|
||||
return new QuadPointData(new float[]{(float) a.getX(), (float) a.getY(), (float) b.getX(), (float) b.getY(), (float) c.getX(), (float) c.getY(), (float) d.getX(), (float) d.getY()});
|
||||
}
|
||||
@ -170,142 +75,6 @@ public final class QuadPoint {
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(double x, double y) {
|
||||
// split into two triangles, test if either contains the point, assumes the QuadPoint is convex and created correctly. More specifically, the points must be in the correct order.
|
||||
return triangleContains(a, b, c, x, y) || triangleContains(a, c, d, x, y);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
checks if a triangle contains a point by converting the point to barycentric coordinates using cramer's rule and then checking if the linear combination is within the bounds of the triangle.
|
||||
https://en.wikipedia.org/wiki/Barycentric_coordinate_system#Barycentric_coordinates_on_triangles
|
||||
*/
|
||||
private boolean triangleContains(Point2D a, Point2D b, Point2D c, double x, double y) {
|
||||
|
||||
// area of the triangle
|
||||
double denominator = ((b.getY() - c.getY()) * (a.getX() - c.getX()) + (c.getX() - b.getX()) * (a.getY() - c.getY()));
|
||||
double invertedDenominator = 1.0 / denominator;
|
||||
double alpha = ((b.getY() - c.getY()) * (x - c.getX()) + (c.getX() - b.getX()) * (y - c.getY())) * invertedDenominator;
|
||||
double beta = ((c.getY() - a.getY()) * (x - c.getX()) + (a.getX() - c.getX()) * (y - c.getY())) * invertedDenominator;
|
||||
|
||||
return alpha >= 0 && beta >= 0 && alpha + beta <= 1;
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(Point2D p) {
|
||||
|
||||
return contains(p.getX(), p.getY());
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(Rectangle2D r) {
|
||||
|
||||
double x = r.getX();
|
||||
double y = r.getY();
|
||||
double maxY = r.getMaxY();
|
||||
double maxX = r.getMaxX();
|
||||
|
||||
Point2D p1 = new Point2D.Double(x, y);
|
||||
Point2D p2 = new Point2D.Double(x, maxY);
|
||||
Point2D p3 = new Point2D.Double(maxX, maxY);
|
||||
Point2D p4 = new Point2D.Double(maxX, y);
|
||||
|
||||
return contains(p1) && contains(p2) && contains(p3) && contains(p4);
|
||||
}
|
||||
|
||||
|
||||
public double getCenterX() {
|
||||
|
||||
return (a.getX() + b.getX() + c.getX() + d.getX()) / 4;
|
||||
}
|
||||
|
||||
|
||||
public double getCenterY() {
|
||||
|
||||
return (a.getY() + b.getY() + c.getY() + d.getY()) / 4;
|
||||
}
|
||||
|
||||
|
||||
public Point2D getCenter() {
|
||||
|
||||
return new Point2D.Double(getCenterX(), getCenterY());
|
||||
}
|
||||
|
||||
|
||||
public boolean intersects(Line2D line) {
|
||||
|
||||
return contains(line.getP1()) || contains(line.getP2()) || asLines().anyMatch(qLine -> qLine.intersectsLine(line));
|
||||
}
|
||||
|
||||
|
||||
public Line2D getRightLine() {
|
||||
|
||||
return new Line2D.Double(getTopRight(), getLowerRight());
|
||||
}
|
||||
|
||||
|
||||
public Line2D getLeftLine() {
|
||||
|
||||
return new Line2D.Double(getTopLeft(), getLowerLeft());
|
||||
}
|
||||
|
||||
|
||||
public Line2D getBottomLine() {
|
||||
|
||||
return new Line2D.Double(getLowerLeft(), getLowerRight());
|
||||
}
|
||||
|
||||
|
||||
public Line2D getTopLine() {
|
||||
|
||||
return new Line2D.Double(getTopLeft(), getTopRight());
|
||||
}
|
||||
|
||||
|
||||
public Point2D getTopLeft() {
|
||||
|
||||
return switch (direction) {
|
||||
case DOWN -> a;
|
||||
case LEFT -> d;
|
||||
case UP -> c;
|
||||
default -> b;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
public Point2D getTopRight() {
|
||||
|
||||
return switch (direction) {
|
||||
case DOWN -> b;
|
||||
case LEFT -> a;
|
||||
case UP -> d;
|
||||
default -> c;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
public Point2D getLowerRight() {
|
||||
|
||||
return switch (direction) {
|
||||
case DOWN -> c;
|
||||
case LEFT -> b;
|
||||
case UP -> a;
|
||||
default -> d;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
public Point2D getLowerLeft() {
|
||||
|
||||
return switch (direction) {
|
||||
case DOWN -> d;
|
||||
case LEFT -> c;
|
||||
case UP -> b;
|
||||
default -> a;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Determines if the given QuadPoint aligns with this QuadPoint within a given threshold.
|
||||
* It does os by trying every possible combination of aligning sides. It starts with the most likely combination of ab and cd.
|
||||
@ -365,56 +134,17 @@ public final class QuadPoint {
|
||||
}
|
||||
|
||||
|
||||
public double getRectangularSize() {
|
||||
public double size() {
|
||||
|
||||
return a().distance(b()) * a().distance(d());
|
||||
}
|
||||
|
||||
|
||||
public double getAngle() {
|
||||
|
||||
return calculateAngle(a, d);
|
||||
}
|
||||
|
||||
|
||||
private static double calculateAngle(Point2D a, Point2D d) {
|
||||
public double angle() {
|
||||
|
||||
double deltaY = d.getY() - a.getY();
|
||||
double deltaX = d.getX() - a.getX();
|
||||
return Math.atan2(deltaY, deltaX);
|
||||
}
|
||||
|
||||
|
||||
public Point2D a() {return a;}
|
||||
|
||||
|
||||
public Point2D b() {return b;}
|
||||
|
||||
|
||||
public Point2D c() {return c;}
|
||||
|
||||
|
||||
public Point2D d() {return d;}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
|
||||
if (obj == this) {
|
||||
return true;
|
||||
}
|
||||
if (obj == null || obj.getClass() != this.getClass()) {
|
||||
return false;
|
||||
}
|
||||
var that = (QuadPoint) obj;
|
||||
return Objects.equals(this.a, that.a) && Objects.equals(this.b, that.b) && Objects.equals(this.c, that.c) && Objects.equals(this.d, that.d);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
|
||||
return Objects.hash(a, b, c, d);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -5,9 +5,4 @@ import lombok.Builder;
|
||||
@Builder
|
||||
public record QuadPointData(float[] values) {
|
||||
|
||||
public QuadPoint get() {
|
||||
|
||||
return QuadPoint.fromData(this);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,7 +0,0 @@
|
||||
package com.knecon.fforesight.service.ocr.v1.api.model;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public record Table(TextRegion caption, int numberOfCols, int numberOfRows, List<TableCell> cells, List<TextRegion> footnotes, List<Region> bboxes) {
|
||||
|
||||
}
|
||||
@ -1,5 +0,0 @@
|
||||
package com.knecon.fforesight.service.ocr.v1.api.model;
|
||||
|
||||
public record TableCell(TextRegion textRegion, int row, int col, TableCellType kind) {
|
||||
|
||||
}
|
||||
@ -1,5 +0,0 @@
|
||||
package com.knecon.fforesight.service.ocr.v1.api.model;
|
||||
|
||||
public enum TableCellType {
|
||||
ROW_HEADER, COLUMN_HEADER, CONTENT, STUB_HEAD, DESCRIPTION
|
||||
}
|
||||
@ -10,18 +10,19 @@ configurations {
|
||||
}
|
||||
|
||||
dependencies {
|
||||
implementation(project(":azure-ocr-service-api"))
|
||||
implementation("net.sourceforge.tess4j:tess4j:5.8.0")
|
||||
implementation("com.iqser.red.commons:metric-commons:2.1.0")
|
||||
implementation("com.pdftron:PDFNet:11.0.0")
|
||||
implementation("org.apache.pdfbox:pdfbox:3.0.0")
|
||||
implementation("org.apache.commons:commons-math3:3.6.1")
|
||||
implementation("com.amazonaws:aws-java-sdk-kms:1.12.440")
|
||||
implementation("com.google.guava:guava:31.1-jre")
|
||||
implementation("com.knecon.fforesight:viewer-doc-processor:0.193.0")
|
||||
implementation("com.azure:azure-ai-documentintelligence:1.0.0")
|
||||
|
||||
implementation("com.iqser.red.commons:pdftron-logic-commons:2.32.0")
|
||||
|
||||
api(project(":azure-ocr-service-api"))
|
||||
api("com.iqser.red.service:persistence-service-internal-api-v1:2.224.0")
|
||||
api("net.sourceforge.tess4j:tess4j:5.8.0")
|
||||
api("com.iqser.red.commons:metric-commons:2.1.0")
|
||||
api("com.iqser.red.commons:storage-commons:2.49.0")
|
||||
api("com.knecon.fforesight:tenant-commons:0.21.0")
|
||||
api("com.pdftron:PDFNet:10.7.0")
|
||||
api("org.apache.pdfbox:pdfbox:3.0.0")
|
||||
api("org.apache.commons:commons-math3:3.6.1")
|
||||
api("com.amazonaws:aws-java-sdk-kms:1.12.440")
|
||||
api("com.google.guava:guava:31.1-jre")
|
||||
api("com.iqser.red.commons:pdftron-logic-commons:2.27.0")
|
||||
api("com.knecon.fforesight:viewer-doc-processor:0.148.0")
|
||||
api("com.azure:azure-ai-documentintelligence:1.0.0-beta.3")
|
||||
testImplementation("org.junit.jupiter:junit-jupiter:5.8.1")
|
||||
}
|
||||
|
||||
@ -6,8 +6,6 @@ import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.ComponentScan;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
|
||||
import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService;
|
||||
import com.iqser.red.pdftronlogic.commons.WatermarkRemovalService;
|
||||
import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService;
|
||||
|
||||
import io.micrometer.observation.ObservationRegistry;
|
||||
@ -24,18 +22,4 @@ public class OcrServiceProcessorConfiguration {
|
||||
return new PDFTronViewerDocumentService(registry);
|
||||
}
|
||||
|
||||
|
||||
@Bean
|
||||
public InvisibleElementRemovalService invisibleElementRemovalService() {
|
||||
|
||||
return new InvisibleElementRemovalService();
|
||||
}
|
||||
|
||||
|
||||
@Bean
|
||||
public WatermarkRemovalService watermarkRemovalService() {
|
||||
|
||||
return new WatermarkRemovalService();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -11,16 +11,16 @@ import lombok.experimental.FieldDefaults;
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class OcrServiceSettings {
|
||||
|
||||
// Limits the number of concurrent calls to azure
|
||||
int concurrency = 2;
|
||||
// Limits the number of concurrent calls to the azure API. In my very rudimentary testing, azure starts throwing "too many requests" errors at around 80/s. Higher numbers greatly improve the speed.
|
||||
int concurrency = 8;
|
||||
// Limits the number of pages per call.
|
||||
int batchSize = 32;
|
||||
int batchSize = 128;
|
||||
|
||||
boolean debug; // writes the ocr layer visibly to the viewer doc pdf
|
||||
boolean drawTablesAsLines; // writes the tables to the PDF as invisible lines.
|
||||
boolean snuggify = true; // attempts to shrink the word boxes returned by azure to fit the actual word pixels snug
|
||||
boolean useCaches; // skips azure api, pdf rendering and image processing, when the files are already present
|
||||
boolean azureFontStyleDetection; // omits all image processing and uses azures FONT_STYLE feature (costs 0.6ct per page)
|
||||
boolean idpEnabled; // Enables table detection, paragraph classification, section detection, key-value detection.
|
||||
boolean tableDetection; // writes the tables to the PDF as invisible lines.
|
||||
boolean processAllPages; // if this parameter is set, ocr will be performed on any page, regardless if it has images or not
|
||||
boolean fontStyleDetection; // Enables bold detection using ghostscript and leptonica
|
||||
String contentFormat; // Either markdown or text. But, for whatever reason, with markdown enabled, key-values are not written by azure....
|
||||
|
||||
}
|
||||
|
||||
@ -7,7 +7,6 @@ import com.pdftron.pdf.PDFNet;
|
||||
import com.sun.jna.NativeLibrary;
|
||||
|
||||
import jakarta.annotation.PostConstruct;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
@ -15,14 +14,11 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@Slf4j
|
||||
@Component
|
||||
@RequiredArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class NativeLibrariesInitializer {
|
||||
|
||||
@Value("${pdftron.license:}")
|
||||
private String pdftronLicense;
|
||||
|
||||
@Value("${native-libs.path:}")
|
||||
private String nativeLibsPath;
|
||||
|
||||
@SneakyThrows
|
||||
@PostConstruct
|
||||
@ -34,8 +30,8 @@ public class NativeLibrariesInitializer {
|
||||
PDFNet.setTempPath("/tmp/pdftron");
|
||||
PDFNet.initialize(pdftronLicense);
|
||||
|
||||
log.info("Setting jna.library.path: {}", nativeLibsPath);
|
||||
System.setProperty("jna.library.path", nativeLibsPath);
|
||||
log.info("Setting jna.library.path: {}", System.getenv("VCPKG_DYNAMIC_LIB"));
|
||||
System.setProperty("jna.library.path", System.getenv("VCPKG_DYNAMIC_LIB"));
|
||||
|
||||
log.info("Asserting Native Libraries loaded");
|
||||
|
||||
|
||||
@ -1,102 +0,0 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.azure.ai.documentintelligence.models.AnalyzeResult;
|
||||
import com.azure.ai.documentintelligence.models.DocumentPage;
|
||||
import com.azure.ai.documentintelligence.models.DocumentSpan;
|
||||
import com.azure.ai.documentintelligence.models.DocumentWord;
|
||||
|
||||
public class DocumentSpanLookup {
|
||||
|
||||
List<PageSpanLookup> documentWordLookup;
|
||||
|
||||
|
||||
public DocumentSpanLookup(AnalyzeResult analyzeResult) {
|
||||
|
||||
documentWordLookup = new ArrayList<>(analyzeResult.getPages().size());
|
||||
int offset = 0;
|
||||
for (DocumentPage page : analyzeResult.getPages()) {
|
||||
|
||||
if (page.getWords() == null || page.getWords().isEmpty()) {
|
||||
documentWordLookup.add(new PageSpanLookup(offset, offset, null));
|
||||
}
|
||||
int start = page.getWords()
|
||||
.get(0).getSpan().getOffset();
|
||||
DocumentSpan span = page.getWords()
|
||||
.get(page.getWords().size() - 1).getSpan();
|
||||
int end = span.getOffset() + span.getLength();
|
||||
SpanLookup<DocumentWord> pageWords = new SpanLookup<>(page.getWords()
|
||||
.stream(), DocumentWord::getSpan);
|
||||
documentWordLookup.add(new PageSpanLookup(start, end, pageWords));
|
||||
offset = end + 1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public List<WordOnPage> findWordsOnPages(DocumentSpan documentSpan) {
|
||||
|
||||
if (documentSpan == null) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
int firstSmallerIdx = findIdxOfFirstSmallerObject(documentSpan);
|
||||
PageSpanLookup firstPage = documentWordLookup.get(firstSmallerIdx);
|
||||
List<WordOnPage> wordsOnPages = new ArrayList<>();
|
||||
for (int pageNumber = firstSmallerIdx; pageNumber < documentWordLookup.size(); pageNumber++) {
|
||||
PageSpanLookup page = documentWordLookup.get(pageNumber);
|
||||
if (page.end >= documentSpan.getOffset()) {
|
||||
break;
|
||||
}
|
||||
firstPage.wordSpanLookup.findElementsContainedInSpan(documentSpan)
|
||||
.stream()
|
||||
.map(documentWord -> new WordOnPage(documentWord, firstSmallerIdx))
|
||||
.forEach(wordsOnPages::add);
|
||||
}
|
||||
return wordsOnPages;
|
||||
}
|
||||
|
||||
|
||||
private int findIdxOfFirstSmallerObject(DocumentSpan documentSpan) {
|
||||
|
||||
int idx = Collections.binarySearch(documentWordLookup, new PageSpanLookup(documentSpan.getOffset(), -1, null), Comparator.comparing(PageSpanLookup::start));
|
||||
|
||||
if (idx >= 0) {
|
||||
return idx;
|
||||
} else {
|
||||
int insertionPoint = -(idx + 1);
|
||||
|
||||
if (insertionPoint == 0) {
|
||||
return -1;
|
||||
}
|
||||
var lastSmaller = documentWordLookup.get(insertionPoint - 1);
|
||||
for (int resultIdx = insertionPoint - 2; resultIdx >= 0; resultIdx--) {
|
||||
if (documentWordLookup.get(resultIdx).compareTo(lastSmaller) == 0) {
|
||||
return resultIdx + 1;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public record WordOnPage(DocumentWord documentWord, int pageNumber) {
|
||||
|
||||
}
|
||||
|
||||
private record PageSpanLookup(int start, int end, SpanLookup<DocumentWord> wordSpanLookup) implements Comparable<PageSpanLookup> {
|
||||
|
||||
@Override
|
||||
public int compareTo(PageSpanLookup o) {
|
||||
|
||||
return Integer.compare(start, o.start);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,7 +1,5 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
|
||||
@ -12,10 +10,4 @@ public record ImageFile(int pageNumber, String absoluteFilePath) {
|
||||
return Leptonica1.pixRead(absoluteFilePath);
|
||||
}
|
||||
|
||||
|
||||
public boolean exists() {
|
||||
|
||||
return new File(absoluteFilePath).exists();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -2,129 +2,29 @@ package com.knecon.fforesight.service.ocr.processor.model;
|
||||
|
||||
import static com.knecon.fforesight.service.ocr.processor.utils.ListSplittingUtils.formatIntervals;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
import com.azure.ai.documentintelligence.models.AnalyzeResult;
|
||||
import com.azure.core.util.BinaryData;
|
||||
import com.azure.json.JsonOptions;
|
||||
import com.azure.json.JsonReader;
|
||||
import com.azure.json.implementation.DefaultJsonReader;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.GhostScriptService;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.NonNull;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public final class PageBatch implements Comparable<PageBatch> {
|
||||
|
||||
@Getter
|
||||
int index;
|
||||
@NonNull
|
||||
List<Integer> batchPageToOriginPageLookup;
|
||||
@NonNull
|
||||
@Getter
|
||||
Path batchDoc;
|
||||
@NonNull
|
||||
@Getter
|
||||
Path batchDir;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public AnalyzeResult getAzureResultCache() {
|
||||
|
||||
try (var in = new FileInputStream(getAzureResultCacheFile()); JsonReader reader = DefaultJsonReader.fromStream(in, new JsonOptions());) {
|
||||
return AnalyzeResult.fromJson(reader);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public File getAzureResultCacheFile() {
|
||||
|
||||
return batchDir.resolve("analyzeResult.json").toFile();
|
||||
}
|
||||
|
||||
|
||||
public List<ImageFile> getRenderedImageFiles() {
|
||||
|
||||
List<ImageFile> renderedImageFiles = new ArrayList<>();
|
||||
for (int i = 0; i < batchPageToOriginPageLookup.size(); i++) {
|
||||
renderedImageFiles.add(getRenderedImageFile(batchPageToOriginPageLookup.get(i), i + 1));
|
||||
}
|
||||
return renderedImageFiles;
|
||||
}
|
||||
|
||||
|
||||
public ImageFile getRenderedImageFile(int pageNumber, int numberInBatch) {
|
||||
|
||||
return new ImageFile(pageNumber, getRenderedImageNameFormat().formatted(numberInBatch));
|
||||
}
|
||||
|
||||
|
||||
public ImageFile getProcessedImageFile(int pageNumber, int numberInBatch) {
|
||||
|
||||
return new ImageFile(pageNumber, getProcessedImageNameFormat().formatted(numberInBatch));
|
||||
}
|
||||
|
||||
|
||||
public List<ImageFile> getProcessedImageFiles() {
|
||||
|
||||
List<ImageFile> processedImageFiles = new ArrayList<>();
|
||||
for (int i = 0; i < batchPageToOriginPageLookup.size(); i++) {
|
||||
processedImageFiles.add(getProcessedImageFile(batchPageToOriginPageLookup.get(i), i + 1));
|
||||
}
|
||||
return processedImageFiles;
|
||||
}
|
||||
|
||||
|
||||
public String getRenderedImageNameFormat() {
|
||||
|
||||
return getRenderedImageDir().resolve(getImageFormat()).toFile().toString();
|
||||
}
|
||||
|
||||
|
||||
public String getProcessedImageNameFormat() {
|
||||
|
||||
return getProcessedImageDir().resolve(getImageFormat()).toFile().toString();
|
||||
}
|
||||
|
||||
|
||||
private String getImageFormat() {
|
||||
|
||||
return "output_" + index + ".%04d" + GhostScriptService.FORMAT;
|
||||
}
|
||||
|
||||
|
||||
public Path getRenderedImageDir() {
|
||||
|
||||
return batchDir.resolve("rendered");
|
||||
}
|
||||
|
||||
|
||||
public Path getProcessedImageDir() {
|
||||
|
||||
return batchDir.resolve("processed");
|
||||
}
|
||||
List<Integer> lookup = new ArrayList<>();
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
if (size() == 1) {
|
||||
return String.format("%d", batchPageToOriginPageLookup.get(0));
|
||||
return String.format("%d", lookup.get(0));
|
||||
}
|
||||
|
||||
List<String> intervals = formatIntervals(batchPageToOriginPageLookup);
|
||||
List<String> intervals = formatIntervals(lookup);
|
||||
if (intervals.size() > 4) {
|
||||
intervals = intervals.subList(0, 4);
|
||||
intervals.add("...");
|
||||
@ -134,54 +34,54 @@ public final class PageBatch implements Comparable<PageBatch> {
|
||||
}
|
||||
|
||||
|
||||
public void add(Integer pageNumber) {
|
||||
|
||||
lookup.add(pageNumber);
|
||||
}
|
||||
|
||||
|
||||
public void forEach(Consumer<? super Integer> consumer) {
|
||||
|
||||
batchPageToOriginPageLookup.forEach(consumer);
|
||||
lookup.forEach(consumer);
|
||||
}
|
||||
|
||||
|
||||
public List<Integer> getAllPageNumbers() {
|
||||
|
||||
return batchPageToOriginPageLookup;
|
||||
return lookup;
|
||||
}
|
||||
|
||||
|
||||
public int size() {
|
||||
|
||||
return batchPageToOriginPageLookup.size();
|
||||
return lookup.size();
|
||||
}
|
||||
|
||||
|
||||
public boolean isEmpty() {
|
||||
|
||||
return batchPageToOriginPageLookup.isEmpty();
|
||||
return lookup.isEmpty();
|
||||
}
|
||||
|
||||
|
||||
public int getPageNumber(int pageNumber) {
|
||||
|
||||
return batchPageToOriginPageLookup.get(pageNumber - 1);
|
||||
return lookup.get(pageNumber - 1);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int compareTo(PageBatch o) {
|
||||
|
||||
if (batchPageToOriginPageLookup.isEmpty() && o.batchPageToOriginPageLookup.isEmpty()) {
|
||||
if (lookup.isEmpty() && o.lookup.isEmpty()) {
|
||||
return 0;
|
||||
} else if (batchPageToOriginPageLookup.isEmpty()) {
|
||||
} else if (lookup.isEmpty()) {
|
||||
return 1;
|
||||
} else if (o.batchPageToOriginPageLookup.isEmpty()) {
|
||||
} else if (o.lookup.isEmpty()) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
return Integer.compare(batchPageToOriginPageLookup.get(0), o.batchPageToOriginPageLookup.get(0));
|
||||
}
|
||||
|
||||
|
||||
public BinaryData render() {
|
||||
|
||||
return BinaryData.fromFile(batchDoc);
|
||||
return Integer.compare(lookup.get(0), o.lookup.get(0));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -13,19 +13,17 @@ import com.pdftron.pdf.Rect;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
public record PageInformation(Rectangle2D mediabox, Rectangle2D cropBox, int number, int rotationDegrees, List<Rectangle2D> wordBBoxes) {
|
||||
public record PageInformation(Rectangle2D mediabox, int number, int rotationDegrees, List<Rectangle2D> wordBBoxes) {
|
||||
|
||||
@SneakyThrows
|
||||
public static Map<Integer, PageInformation> fromPDFDoc(PDFDoc pdfDoc) {
|
||||
|
||||
ConcurrentHashMap<Integer, PageInformation> pageInformationMap = new ConcurrentHashMap<>();
|
||||
int pageNumber = 1;
|
||||
try (PageIterator iterator = pdfDoc.getPageIterator()) {
|
||||
while (iterator.hasNext()) {
|
||||
Page page = iterator.next();
|
||||
pageInformationMap.put(pageNumber, PageInformation.fromPage(pageNumber, page));
|
||||
pageNumber++;
|
||||
}
|
||||
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); pageNumber++) {
|
||||
|
||||
Page page = iterator.next();
|
||||
pageInformationMap.put(pageNumber, PageInformation.fromPage(pageNumber, page));
|
||||
}
|
||||
return pageInformationMap;
|
||||
}
|
||||
@ -34,9 +32,8 @@ public record PageInformation(Rectangle2D mediabox, Rectangle2D cropBox, int num
|
||||
@SneakyThrows
|
||||
public static PageInformation fromPage(int pageNum, Page page) {
|
||||
|
||||
try (Rect mediaBox = page.getCropBox(); Rect cropBox = page.getCropBox()) {
|
||||
try (Rect mediaBox = page.getCropBox()) {
|
||||
return new PageInformation(new Rectangle2D.Double(mediaBox.getX1(), mediaBox.getY1(), mediaBox.getWidth(), mediaBox.getHeight()),
|
||||
new Rectangle2D.Double(cropBox.getX1(), cropBox.getY1(), cropBox.getWidth(), cropBox.getHeight()),
|
||||
pageNum,
|
||||
page.getRotation() * 90,
|
||||
DocumentTextExtractor.getTextBBoxes(page));
|
||||
|
||||
@ -162,7 +162,7 @@ public class Statistics {
|
||||
|
||||
return batchStats.values()
|
||||
.stream()
|
||||
.mapToLong(BatchStats::getMappingResultDuration)
|
||||
.mapToLong(BatchStats::getWritingTextDuration)
|
||||
.toArray();
|
||||
}
|
||||
|
||||
|
||||
@ -3,6 +3,7 @@ package com.knecon.fforesight.service.ocr.processor.model;
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
|
||||
import com.azure.ai.documentintelligence.models.DocumentWord;
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.fonts.FontMetrics;
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.fonts.FontMetricsProvider;
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.fonts.FontStyle;
|
||||
@ -19,8 +20,7 @@ public class TextPositionInImage {
|
||||
|
||||
final QuadPoint position;
|
||||
final String text;
|
||||
final AffineTransform resultToPageTransform;
|
||||
final boolean snugBBox;
|
||||
final AffineTransform imageCTM;
|
||||
|
||||
@Setter
|
||||
boolean overlapsIgnoreZone;
|
||||
@ -30,34 +30,33 @@ public class TextPositionInImage {
|
||||
FontStyle fontStyle;
|
||||
|
||||
|
||||
public TextPositionInImage(QuadPoint position, String text, AffineTransform resultToPageTransform, FontMetricsProvider fontMetricsProvider, FontStyle fontStyle, boolean snugBBox) {
|
||||
public TextPositionInImage(DocumentWord word, AffineTransform imageCTM, FontMetricsProvider fontMetricsProvider, FontStyle fontStyle) {
|
||||
|
||||
this.position = position;
|
||||
this.text = text;
|
||||
this.resultToPageTransform = resultToPageTransform;
|
||||
this.position = QuadPoint.fromPolygons(word.getPolygon());
|
||||
this.text = word.getContent();
|
||||
this.imageCTM = imageCTM;
|
||||
this.fontMetricsProvider = fontMetricsProvider;
|
||||
this.fontStyle = fontStyle;
|
||||
this.snugBBox = snugBBox;
|
||||
}
|
||||
|
||||
|
||||
public QuadPoint getTransformedTextBBox() {
|
||||
|
||||
return position.getTransformed(resultToPageTransform);
|
||||
return position.getTransformed(imageCTM);
|
||||
}
|
||||
|
||||
|
||||
public AffineTransform getTextMatrix() {
|
||||
|
||||
FontMetrics metrics = getMetrics();
|
||||
FontMetrics metrics = fontMetricsProvider.calculateMetrics(text, getTransformedWidth(), getTransformedHeight());
|
||||
|
||||
// Matrix multiplication is from right to left:
|
||||
// convert to image coords -> subtract descent -> scale height -> reverse imageCTM scaling -> translate to coordinates in image -> convert to pdf coords
|
||||
// width must not be set, since it is scaled with the fontsize attribute
|
||||
double rotation = position.getAngle();
|
||||
double rotation = position.angle();
|
||||
Point2D anchor = new Point2D.Double(position.b().getX(), position.b().getY());
|
||||
AffineTransform ctm = new AffineTransform();
|
||||
ctm.concatenate(resultToPageTransform);
|
||||
ctm.concatenate(imageCTM);
|
||||
ctm.translate(anchor.getX(), anchor.getY());
|
||||
ctm.scale(getWidth() / getTransformedWidth(),
|
||||
getHeight() / getTransformedHeight()); // scale with transformation coefficient, such that fontsize may be set with transformed width.
|
||||
@ -70,15 +69,6 @@ public class TextPositionInImage {
|
||||
}
|
||||
|
||||
|
||||
private FontMetrics getMetrics() {
|
||||
|
||||
if (snugBBox) {
|
||||
return fontMetricsProvider.calculateMetricsForTightBBox(text, getTransformedWidth(), getTransformedHeight());
|
||||
}
|
||||
return fontMetricsProvider.calculateMetricsForAzureBBox(text, getTransformedWidth(), getTransformedHeight());
|
||||
}
|
||||
|
||||
|
||||
public double getFontSize() {
|
||||
// The fontsize as estimated by the word width
|
||||
return fontMetricsProvider.calculateFontSize(text, getTransformedWidth());
|
||||
@ -105,7 +95,7 @@ public class TextPositionInImage {
|
||||
|
||||
public double getFontSizeByHeight() {
|
||||
// The fontsize as estimated by the word height, only used for font style detection
|
||||
var metrics = getMetrics();
|
||||
var metrics = fontMetricsProvider.calculateMetrics(text, getTransformedWidth(), getTransformedHeight());
|
||||
return fontMetricsProvider.calculateFontSize(text, getTransformedWidth()) * metrics.getHeightScaling();
|
||||
}
|
||||
|
||||
@ -118,25 +108,25 @@ public class TextPositionInImage {
|
||||
|
||||
public Point2D transformedA() {
|
||||
|
||||
return resultToPageTransform.transform(position.a(), null);
|
||||
return imageCTM.transform(position.a(), null);
|
||||
}
|
||||
|
||||
|
||||
public Point2D transformedB() {
|
||||
|
||||
return resultToPageTransform.transform(position.b(), null);
|
||||
return imageCTM.transform(position.b(), null);
|
||||
}
|
||||
|
||||
|
||||
public Point2D transformedC() {
|
||||
|
||||
return resultToPageTransform.transform(position.c(), null);
|
||||
return imageCTM.transform(position.c(), null);
|
||||
}
|
||||
|
||||
|
||||
public Point2D transformedD() {
|
||||
|
||||
return resultToPageTransform.transform(position.d(), null);
|
||||
return imageCTM.transform(position.d(), null);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,25 +1,23 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
import java.util.function.Supplier;
|
||||
|
||||
import org.slf4j.MDC;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.azure.ai.documentintelligence.models.AnalyzeResult;
|
||||
import com.azure.core.util.BinaryData;
|
||||
import com.azure.core.util.polling.LongRunningOperationStatus;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.knecon.fforesight.service.ocr.processor.OcrServiceSettings;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.PageBatch;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.PageInformation;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.ImageProcessingPipeline;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.ImageProcessingSupervisor;
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.layers.LayerFactory;
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.layers.OcrResult;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.AzureOcrFeature;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.sdf.SDFDoc;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
@ -36,13 +34,16 @@ public class AsyncOcrService {
|
||||
|
||||
AzureOcrResource azureOcrResource;
|
||||
OcrServiceSettings settings;
|
||||
ImageProcessingPipeline imageProcessingPipeline;
|
||||
ObjectMapper mapper;
|
||||
|
||||
|
||||
public OcrResult awaitOcr(PDFDoc pdfDoc, OcrExecutionSupervisor supervisor, Set<AzureOcrFeature> features, List<PageBatch> batches) throws InterruptedException {
|
||||
public OcrResult awaitOcr(PDFDoc pdfDoc,
|
||||
OcrExecutionSupervisor supervisor,
|
||||
Set<Integer> pagesWithImages,
|
||||
ImageProcessingSupervisor imageSupervisor) throws InterruptedException, PDFNetException {
|
||||
|
||||
LayerFactory layerFactory = new LayerFactory(settings, features, supervisor, PageInformation.fromPDFDoc(pdfDoc), imageProcessingPipeline);
|
||||
LayerFactory layerFactory = new LayerFactory(settings, supervisor, imageSupervisor, PageInformation.fromPDFDoc(pdfDoc));
|
||||
|
||||
List<PageBatch> batches = splitIntoBatches(pdfDoc, supervisor, pagesWithImages);
|
||||
|
||||
for (PageBatch batch : batches) {
|
||||
|
||||
@ -55,10 +56,12 @@ public class AsyncOcrService {
|
||||
supervisor.requireNoErrors();
|
||||
|
||||
batchContext.batchStats().start();
|
||||
BinaryData data = batch.render();
|
||||
|
||||
BinaryData data = renderBatch(pdfDoc, batch);
|
||||
|
||||
batchContext.batchStats().batchRenderFinished();
|
||||
|
||||
beginAnalysis(data, batchContext, features);
|
||||
beginAnalysis(data, batchContext);
|
||||
}
|
||||
|
||||
supervisor.awaitAllPagesProcessed();
|
||||
@ -67,21 +70,44 @@ public class AsyncOcrService {
|
||||
}
|
||||
|
||||
|
||||
private void beginAnalysis(BinaryData data, BatchContext batchContext, Set<AzureOcrFeature> features) throws InterruptedException {
|
||||
private static BinaryData renderBatch(PDFDoc pdfDoc, PageBatch batch) throws PDFNetException {
|
||||
|
||||
if (settings.isUseCaches() && batchContext.batch().getAzureResultCacheFile().exists()) {
|
||||
handleCached(batchContext);
|
||||
BinaryData docData;
|
||||
try (var smallerDoc = extractBatchDocument(pdfDoc, batch)) {
|
||||
docData = BinaryData.fromBytes(smallerDoc.save(SDFDoc.SaveMode.LINEARIZED, null));
|
||||
}
|
||||
return docData;
|
||||
}
|
||||
|
||||
|
||||
private List<PageBatch> splitIntoBatches(PDFDoc pdfDoc, OcrExecutionSupervisor supervisor, Set<Integer> pagesWithImages) throws PDFNetException {
|
||||
|
||||
List<PageBatch> batches = new ArrayList<>();
|
||||
PageBatch currentBatch = new PageBatch();
|
||||
batches.add(currentBatch);
|
||||
for (int pageNumber = 1; pageNumber <= pdfDoc.getPageCount(); pageNumber++) {
|
||||
if (!settings.isProcessAllPages() && !pagesWithImages.contains(pageNumber)) {
|
||||
supervisor.logPageSkipped(pageNumber);
|
||||
continue;
|
||||
}
|
||||
currentBatch.add(pageNumber);
|
||||
if (currentBatch.size() == settings.getBatchSize()) {
|
||||
currentBatch = new PageBatch();
|
||||
batches.add(currentBatch);
|
||||
}
|
||||
}
|
||||
return batches;
|
||||
}
|
||||
|
||||
|
||||
private void beginAnalysis(BinaryData data, BatchContext batchContext) throws InterruptedException {
|
||||
|
||||
batchContext.supervisor.enterConcurrency(batchContext.batch);
|
||||
|
||||
batchContext.supervisor.logUploadStart(batchContext.batch, data.getLength());
|
||||
|
||||
var mdcContext = MDC.getCopyOfContextMap();
|
||||
|
||||
azureOcrResource.callAzureAsync(data, features)
|
||||
azureOcrResource.callAzureAsync(data)
|
||||
.flatMap(response -> {
|
||||
MDC.setContextMap(mdcContext);
|
||||
if (response.getStatus().equals(LongRunningOperationStatus.IN_PROGRESS)) {
|
||||
batchContext.supervisor.logInProgress(batchContext.batch);
|
||||
}
|
||||
@ -91,62 +117,54 @@ public class AsyncOcrService {
|
||||
if (LongRunningOperationStatus.SUCCESSFULLY_COMPLETED == response.getStatus()) {
|
||||
return response.getFinalResult();
|
||||
}
|
||||
String message = "Polling completed unsuccessfully with status: " + response.getStatus();
|
||||
log.error(message);
|
||||
return Mono.error(new IllegalStateException(message));
|
||||
return Mono.error(new IllegalStateException("Polling completed unsuccessfully with status: " + response.getStatus()));
|
||||
}).subscribe(finalResult -> handleSuccessful(finalResult, batchContext),//
|
||||
ex -> handleError(ex, batchContext),//
|
||||
() -> handleCompleted(batchContext));
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private static void handleCached(BatchContext batchContext) {
|
||||
|
||||
var mdcContext = MDC.getCopyOfContextMap();
|
||||
Thread thread = new Thread(() -> {
|
||||
MDC.setContextMap(mdcContext);
|
||||
log.info("Batch {}: Using cached ocr result", batchContext.batch.getIndex());
|
||||
batchContext.batchStats().finishUpload();
|
||||
batchContext.batchStats().finishApiWait();
|
||||
batchContext.supervisor.logPageSuccess(batchContext.batch());
|
||||
try {
|
||||
batchContext.layerFactory.processAnalyzeResult(batchContext.batch(), batchContext.batch().getAzureResultCache());
|
||||
} catch (InterruptedException e) {
|
||||
batchContext.supervisor.logPageError(batchContext.batch, e);
|
||||
}
|
||||
|
||||
});
|
||||
thread.start();
|
||||
}
|
||||
|
||||
|
||||
private static void handleCompleted(BatchContext batchContext) {
|
||||
|
||||
log.info("Batch {}: Completed with pages {}", batchContext.batch.getIndex(), batchContext.batch);
|
||||
batchContext.supervisor.leaveConcurrency(batchContext.batch);
|
||||
}
|
||||
|
||||
|
||||
private void handleError(Throwable ex, BatchContext batchContext) {
|
||||
|
||||
batchContext.supervisor.leaveConcurrency(batchContext.batch);
|
||||
batchContext.supervisor.logPageError(batchContext.batch, ex);
|
||||
}
|
||||
|
||||
|
||||
private void handleSuccessful(AnalyzeResult finalResult, BatchContext batchContext) {
|
||||
|
||||
batchContext.supervisor.leaveConcurrency(batchContext.batch);
|
||||
try {
|
||||
mapper.writeValue(batchContext.batch().getAzureResultCacheFile(), finalResult);
|
||||
batchContext.supervisor.logPageSuccess(batchContext.batch());
|
||||
batchContext.layerFactory.processAnalyzeResult(batchContext.batch(), finalResult);
|
||||
batchContext.layerFactory.addAnalyzeResult(batchContext.batch, finalResult);
|
||||
batchContext.supervisor.logPageSuccess(batchContext.batch);
|
||||
} catch (Exception e) {
|
||||
handleError(e, batchContext);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static PDFDoc extractBatchDocument(PDFDoc pdfDoc, PageBatch pageBatch) throws PDFNetException {
|
||||
|
||||
if (pageBatch.size() < 0) {
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
PDFDoc singlePagePdfDoc = new PDFDoc();
|
||||
pageBatch.forEach(pageNumber -> addPageToNewDoc(pageNumber, pdfDoc, singlePagePdfDoc));
|
||||
return singlePagePdfDoc;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private static void addPageToNewDoc(Integer pageNumber, PDFDoc pdfDoc, PDFDoc singlePagePdfDoc) {
|
||||
|
||||
singlePagePdfDoc.pagePushBack(pdfDoc.getPage(pageNumber));
|
||||
}
|
||||
|
||||
|
||||
private record BatchContext(LayerFactory layerFactory, OcrExecutionSupervisor supervisor, PageBatch batch) {
|
||||
|
||||
BatchStats batchStats() {
|
||||
|
||||
@ -2,25 +2,23 @@ package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.azure.ai.documentintelligence.DocumentIntelligenceAsyncClient;
|
||||
import com.azure.ai.documentintelligence.DocumentIntelligenceClientBuilder;
|
||||
import com.azure.ai.documentintelligence.models.AnalyzeDocumentOptions;
|
||||
import com.azure.ai.documentintelligence.models.AnalyzeOperationDetails;
|
||||
import com.azure.ai.documentintelligence.models.AnalyzeDocumentRequest;
|
||||
import com.azure.ai.documentintelligence.models.AnalyzeResult;
|
||||
import com.azure.ai.documentintelligence.models.AnalyzeResultOperation;
|
||||
import com.azure.ai.documentintelligence.models.ContentFormat;
|
||||
import com.azure.ai.documentintelligence.models.DocumentAnalysisFeature;
|
||||
import com.azure.ai.documentintelligence.models.DocumentContentFormat;
|
||||
import com.azure.ai.documentintelligence.models.StringIndexType;
|
||||
import com.azure.core.credential.AzureKeyCredential;
|
||||
import com.azure.core.util.BinaryData;
|
||||
import com.azure.core.util.polling.PollerFlux;
|
||||
import com.google.common.base.Objects;
|
||||
import com.knecon.fforesight.service.ocr.processor.OcrServiceSettings;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.AzureOcrFeature;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.SneakyThrows;
|
||||
@ -44,48 +42,43 @@ public class AzureOcrResource {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public PollerFlux<AnalyzeOperationDetails, AnalyzeResult> callAzureAsync(BinaryData data, Set<AzureOcrFeature> features) {
|
||||
public PollerFlux<AnalyzeResultOperation, AnalyzeResult> callAzureAsync(BinaryData data) {
|
||||
|
||||
AnalyzeDocumentOptions analyzeDocumentOptions = new AnalyzeDocumentOptions(data.toBytes());
|
||||
analyzeDocumentOptions.setStringIndexType(StringIndexType.UTF16_CODE_UNIT);
|
||||
analyzeDocumentOptions.setDocumentAnalysisFeatures(buildFeatures(features));
|
||||
analyzeDocumentOptions.setOutputContentFormat(buildContentFormat());
|
||||
return asyncClient.beginAnalyzeDocument(getModelId(features), analyzeDocumentOptions);
|
||||
AnalyzeDocumentRequest analyzeRequest = new AnalyzeDocumentRequest().setBase64Source(data.toBytes());
|
||||
|
||||
return asyncClient.beginAnalyzeDocument(getModelId(), null, null, StringIndexType.UTF16CODE_UNIT, buildFeatures(), null, buildContentFormat(), analyzeRequest);
|
||||
|
||||
}
|
||||
|
||||
|
||||
private DocumentContentFormat buildContentFormat() {
|
||||
private ContentFormat buildContentFormat() {
|
||||
|
||||
if (Objects.equal(settings.getContentFormat(), "markdown")) {
|
||||
return DocumentContentFormat.MARKDOWN;
|
||||
return ContentFormat.MARKDOWN;
|
||||
}
|
||||
return DocumentContentFormat.TEXT;
|
||||
return ContentFormat.TEXT;
|
||||
}
|
||||
|
||||
|
||||
private String getModelId(Set<AzureOcrFeature> features) {
|
||||
private String getModelId() {
|
||||
|
||||
if (features.contains(AzureOcrFeature.IDP)) {
|
||||
if (settings.isIdpEnabled()) {
|
||||
return "prebuilt-layout";
|
||||
}
|
||||
return "prebuilt-read";
|
||||
}
|
||||
|
||||
|
||||
private List<DocumentAnalysisFeature> buildFeatures(Set<AzureOcrFeature> features) {
|
||||
private List<DocumentAnalysisFeature> buildFeatures() {
|
||||
|
||||
var azureFeatures = new ArrayList<DocumentAnalysisFeature>();
|
||||
var features = new ArrayList<DocumentAnalysisFeature>();
|
||||
|
||||
if (features.contains(AzureOcrFeature.IDP)) {
|
||||
azureFeatures.add(DocumentAnalysisFeature.KEY_VALUE_PAIRS);
|
||||
if (settings.isIdpEnabled()) {
|
||||
features.add(DocumentAnalysisFeature.KEY_VALUE_PAIRS);
|
||||
}
|
||||
if (settings.isAzureFontStyleDetection() && features.contains(AzureOcrFeature.FONT_STYLE_DETECTION)) {
|
||||
azureFeatures.add(DocumentAnalysisFeature.STYLE_FONT);
|
||||
}
|
||||
azureFeatures.add(DocumentAnalysisFeature.BARCODES);
|
||||
features.add(DocumentAnalysisFeature.BARCODES);
|
||||
|
||||
return azureFeatures;
|
||||
return features;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,144 +0,0 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import static com.knecon.fforesight.service.ocr.processor.utils.ListSplittingUtils.formatIntervals;
|
||||
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.OcrServiceSettings;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.PageBatch;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.AzureOcrFeature;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.Optimizer;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.sdf.SDFDoc;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class BatchFactory {
|
||||
|
||||
OcrServiceSettings settings;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public List<PageBatch> splitIntoBatches(PDFDoc pdfDoc, OcrExecutionSupervisor supervisor, Set<AzureOcrFeature> features, Path runDir) {
|
||||
|
||||
Set<Integer> pagesToProcess = findPagesToProcess(pdfDoc, features);
|
||||
supervisor.logImageExtractionFinished(pdfDoc.getPageCount(), pagesToProcess.size());
|
||||
|
||||
List<PageBatch> batches = buildBatches(pdfDoc, supervisor, features, runDir, pagesToProcess);
|
||||
if (batches.size() > 1) {
|
||||
log.info("Split {} pages to process into {} batches", pagesToProcess.size(), batches.size());
|
||||
}
|
||||
return batches;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public Set<Integer> findPagesToProcess(PDFDoc pdfDoc, Set<AzureOcrFeature> features) {
|
||||
|
||||
if (features.contains(AzureOcrFeature.ALL_PAGES)) {
|
||||
Set<Integer> pages = new HashSet<>();
|
||||
for (int i = 1; i <= pdfDoc.getPageCount(); i++) {
|
||||
pages.add(i);
|
||||
}
|
||||
return Collections.unmodifiableSet(pages);
|
||||
}
|
||||
|
||||
return ImageDetectionService.findPagesWithImages(pdfDoc);
|
||||
}
|
||||
|
||||
|
||||
public List<PageBatch> buildBatches(PDFDoc pdfDoc,
|
||||
OcrExecutionSupervisor supervisor,
|
||||
Set<AzureOcrFeature> features,
|
||||
Path runDir,
|
||||
Set<Integer> pagesWithImages) throws PDFNetException {
|
||||
|
||||
List<PageBatch> batches = new ArrayList<>();
|
||||
List<Integer> numbersForCurrentBatch = new ArrayList<>();
|
||||
for (int pageNumber = 1; pageNumber <= pdfDoc.getPageCount(); pageNumber++) {
|
||||
if (!features.contains(AzureOcrFeature.ALL_PAGES) && !pagesWithImages.contains(pageNumber)) {
|
||||
supervisor.logPageSkipped(pageNumber);
|
||||
continue;
|
||||
}
|
||||
numbersForCurrentBatch.add(pageNumber);
|
||||
if (numbersForCurrentBatch.size() == settings.getBatchSize()) {
|
||||
batches.add(create(batches.size(), pdfDoc, numbersForCurrentBatch, runDir));
|
||||
numbersForCurrentBatch = new ArrayList<>();
|
||||
}
|
||||
}
|
||||
if (!numbersForCurrentBatch.isEmpty()) {
|
||||
batches.add(create(batches.size(), pdfDoc, numbersForCurrentBatch, runDir));
|
||||
}
|
||||
return batches;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static PageBatch create(int number, PDFDoc pdfDoc, List<Integer> pageNumbers, Path runDir) {
|
||||
|
||||
if (pageNumbers.isEmpty()) {
|
||||
throw new IllegalArgumentException("pageNumbers must not be empty");
|
||||
}
|
||||
Path batchDir = formatBatchDir(number, pageNumbers, runDir);
|
||||
Files.createDirectories(batchDir);
|
||||
|
||||
Path batchDocPath = batchDir.resolve("batch.pdf");
|
||||
try (var batchDoc = extractBatchDocument(pdfDoc, pageNumbers)) {
|
||||
Optimizer.optimize(batchDoc);
|
||||
batchDoc.save(batchDocPath.toFile().toString(), SDFDoc.SaveMode.LINEARIZED, null);
|
||||
}
|
||||
PageBatch batch = new PageBatch(number, pageNumbers, batchDocPath, batchDir);
|
||||
Files.createDirectories(batch.getRenderedImageDir());
|
||||
Files.createDirectories(batch.getProcessedImageDir());
|
||||
return batch;
|
||||
}
|
||||
|
||||
|
||||
private static Path formatBatchDir(int number, List<Integer> pageNumbers, Path runDir) {
|
||||
|
||||
List<String> intervals = formatIntervals(pageNumbers);
|
||||
if (intervals.size() > 4) {
|
||||
intervals = intervals.subList(0, 4);
|
||||
intervals.add("...");
|
||||
}
|
||||
|
||||
String batchName = String.join(", ", intervals);
|
||||
return runDir.resolve("batch_%04d_%s".formatted(number, batchName));
|
||||
}
|
||||
|
||||
|
||||
private static PDFDoc extractBatchDocument(PDFDoc pdfDoc, List<Integer> pageBatch) throws PDFNetException {
|
||||
|
||||
if (pageBatch.isEmpty()) {
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
PDFDoc batchDoc = new PDFDoc();
|
||||
pageBatch.forEach(pageNumber -> addPageToNewDoc(pageNumber, pdfDoc, batchDoc));
|
||||
return batchDoc;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private static void addPageToNewDoc(Integer pageNumber, PDFDoc pdfDoc, PDFDoc batchDoc) {
|
||||
|
||||
batchDoc.pagePushBack(pdfDoc.getPage(pageNumber));
|
||||
}
|
||||
|
||||
}
|
||||
@ -10,7 +10,7 @@ public class BatchStats {
|
||||
|
||||
private long apiWaitTimestamp = -1;
|
||||
private long imageUploadTimestamp = -1;
|
||||
private long mappingResultTimestamp = -1;
|
||||
private long writingTextTimestamp = -1;
|
||||
private long batchRenderTimestamp = -1;
|
||||
|
||||
|
||||
@ -38,9 +38,9 @@ public class BatchStats {
|
||||
}
|
||||
|
||||
|
||||
public void finishMappingResult() {
|
||||
public void finishWritingText() {
|
||||
|
||||
mappingResultTimestamp = System.currentTimeMillis();
|
||||
writingTextTimestamp = System.currentTimeMillis();
|
||||
}
|
||||
|
||||
|
||||
@ -50,33 +50,15 @@ public class BatchStats {
|
||||
}
|
||||
|
||||
|
||||
public boolean isApiWaitFinished() {
|
||||
|
||||
return apiWaitTimestamp > 0;
|
||||
}
|
||||
|
||||
|
||||
public boolean isMappingResultFinished() {
|
||||
|
||||
return mappingResultTimestamp > 0;
|
||||
}
|
||||
|
||||
|
||||
public boolean isBatchRenderFinished() {
|
||||
|
||||
return batchRenderTimestamp > 0;
|
||||
}
|
||||
|
||||
|
||||
public long getApiWaitDuration() {return this.apiWaitTimestamp - imageUploadTimestamp;}
|
||||
|
||||
|
||||
public long getImageUploadDuration() {return this.imageUploadTimestamp - batchRenderTimestamp;}
|
||||
|
||||
|
||||
public long getMappingResultDuration() {return this.mappingResultTimestamp - apiWaitTimestamp;}
|
||||
public long getWritingTextDuration() {return this.writingTextTimestamp - apiWaitTimestamp;}
|
||||
|
||||
|
||||
public long getBatchRenderDuration() {return startTimestamp - this.batchRenderTimestamp;}
|
||||
public long getBatchRenderDuration() {return this.batchRenderTimestamp - startTimestamp;}
|
||||
|
||||
}
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.ocr.v1.server;
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
@ -33,23 +33,20 @@ public class FileStorageService {
|
||||
public void storeFiles(DocumentRequest request, File documentFile, File viewerDocumentFile, File analyzeResultFile) {
|
||||
|
||||
try (var in = new FileInputStream(viewerDocumentFile)) {
|
||||
if (request.optionalViewerDocumentId()
|
||||
.isPresent()) {
|
||||
if (request.optionalViewerDocumentId().isPresent()) {
|
||||
storageService.storeObject(TenantContext.getTenantId(), request.getViewerDocId(), in);
|
||||
} else {
|
||||
storageService.storeObject(TenantContext.getTenantId(), getStorageId(request.getDossierId(), request.getFileId(), FileType.VIEWER_DOCUMENT), in);
|
||||
}
|
||||
}
|
||||
try (var in = new FileInputStream(documentFile)) {
|
||||
if (request.optionalOriginDocumentId()
|
||||
.isPresent()) {
|
||||
if (request.optionalOriginDocumentId().isPresent()) {
|
||||
storageService.storeObject(TenantContext.getTenantId(), request.getOriginDocumentId(), in);
|
||||
} else {
|
||||
storageService.storeObject(TenantContext.getTenantId(), getStorageId(request.getDossierId(), request.getFileId(), FileType.ORIGIN), in);
|
||||
}
|
||||
}
|
||||
if (request.optionalIdpResultId()
|
||||
.isPresent() && analyzeResultFile.exists()) {
|
||||
if (request.optionalIdpResultId().isPresent()) {
|
||||
try (var in = new FileInputStream(analyzeResultFile)) {
|
||||
storageService.storeObject(TenantContext.getTenantId(), request.getIdpResultId(), in);
|
||||
}
|
||||
@ -62,8 +59,7 @@ public class FileStorageService {
|
||||
|
||||
Files.createDirectories(documentFile.getParentFile().toPath());
|
||||
|
||||
String originDocumentId = request.optionalOriginDocumentId()
|
||||
.orElse(getStorageId(request.getDossierId(), request.getFileId(), FileType.ORIGIN));
|
||||
String originDocumentId = request.optionalOriginDocumentId().orElse(getStorageId(request.getDossierId(), request.getFileId(), FileType.ORIGIN));
|
||||
|
||||
storageService.downloadTo(TenantContext.getTenantId(), originDocumentId, documentFile);
|
||||
|
||||
@ -1,24 +1,16 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import java.util.Set;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.AzureOcrFeature;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.DocumentRequest;
|
||||
|
||||
@Service
|
||||
public interface IOcrMessageSender {
|
||||
|
||||
void sendUpdate(String fileId, int finishedImages, int totalImages, Set<AzureOcrFeature> features);
|
||||
void sendUpdate(String fileId, int finishedImages, int totalImages);
|
||||
|
||||
void sendOCRStarted(String fileId);
|
||||
|
||||
void sendOCRStarted(String fileId, Set<AzureOcrFeature> features);
|
||||
void sendOcrFinished(String fileId, int totalImages);
|
||||
|
||||
|
||||
void sendOcrFinished(String fileId, int totalImages, Set<AzureOcrFeature> features);
|
||||
|
||||
|
||||
void sendOcrResponse(DocumentRequest request);
|
||||
void sendOcrResponse(String dossierId, String fileId);
|
||||
|
||||
}
|
||||
|
||||
@ -7,24 +7,40 @@ import java.util.Set;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.OcrServiceSettings;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.AzureOcrFeature;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.ElementReader;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
@Service
|
||||
public class ImageDetectionService {
|
||||
|
||||
// any image with smaller height and width than this gets thrown out, see everyPointInDashedLineIsImage.pdf
|
||||
private static final int PIXEL_THRESHOLD = 0;
|
||||
private final OcrServiceSettings ocrServiceSettings;
|
||||
|
||||
|
||||
public ImageDetectionService(OcrServiceSettings ocrServiceSettings) {this.ocrServiceSettings = ocrServiceSettings;}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public Set<Integer> findPagesWithImages(PDFDoc pdfDoc) {
|
||||
public Set<Integer> findPagesToProcess(PDFDoc pdfDoc) {
|
||||
|
||||
if (ocrServiceSettings.isProcessAllPages()) {
|
||||
Set<Integer> pages = new HashSet<>();
|
||||
for (int i = 1; i <= pdfDoc.getPageCount(); i++) {
|
||||
pages.add(i);
|
||||
}
|
||||
return Collections.unmodifiableSet(pages);
|
||||
}
|
||||
|
||||
return findPagesWithImages(pdfDoc);
|
||||
}
|
||||
|
||||
|
||||
private Set<Integer> findPagesWithImages(PDFDoc pdfDoc) throws PDFNetException {
|
||||
|
||||
Set<Integer> pagesWithImages = new HashSet<>();
|
||||
try (ElementReader reader = new ElementReader()) {
|
||||
@ -56,11 +72,8 @@ public class ImageDetectionService {
|
||||
}
|
||||
case Element.e_form -> {
|
||||
reader.formBegin();
|
||||
var found = findImagePositionsOnPage(reader);
|
||||
findImagePositionsOnPage(reader);
|
||||
reader.end();
|
||||
if (found) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -5,25 +5,21 @@ import static com.knecon.fforesight.service.ocr.processor.model.Statistics.human
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService;
|
||||
import com.iqser.red.pdftronlogic.commons.OCGWatermarkRemovalService;
|
||||
import com.iqser.red.pdftronlogic.commons.WatermarkRemovalService;
|
||||
import com.knecon.fforesight.service.ocr.processor.OcrServiceSettings;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.PageBatch;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.Statistics;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.ImageProcessingPipeline;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.ImageProcessingSupervisor;
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.layers.OcrResult;
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.utils.RotationCorrectionUtility;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.AzureOcrFeature;
|
||||
import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
|
||||
@ -44,10 +40,10 @@ public class OCRService {
|
||||
WatermarkRemovalService watermarkRemovalService;
|
||||
InvisibleElementRemovalService invisibleElementRemovalService;
|
||||
PDFTronViewerDocumentService viewerDocumentService;
|
||||
BatchFactory batchFactory;
|
||||
ImageDetectionService imageDetectionService;
|
||||
AsyncOcrService asyncOcrService;
|
||||
OcrServiceSettings settings;
|
||||
ObjectMapper mapper;
|
||||
ImageProcessingPipeline imageProcessingPipeline;
|
||||
|
||||
|
||||
/**
|
||||
@ -60,23 +56,24 @@ public class OCRService {
|
||||
* @param tmpDir working directory for all files
|
||||
* @param documentFile the file to perform ocr on, results are written invisibly
|
||||
* @param viewerDocumentFile debugging file, results are written visibly in an optional content group
|
||||
* @param idpResultFile result file with additional information
|
||||
* @param analyzeResultFile result file with additional information
|
||||
*/
|
||||
@Observed(name = "OCRService", contextualName = "run-ocr-on-document")
|
||||
public void runOcrOnDocument(String dossierId, String fileId, Set<AzureOcrFeature> features, Path tmpDir, File documentFile, File viewerDocumentFile, File idpResultFile) {
|
||||
public void runOcrOnDocument(String dossierId, String fileId, boolean removeWatermark, Path tmpDir, File documentFile, File viewerDocumentFile, File analyzeResultFile) {
|
||||
|
||||
if (features.contains(AzureOcrFeature.REMOVE_WATERMARKS)) {
|
||||
if (removeWatermark) {
|
||||
removeWatermark(documentFile);
|
||||
}
|
||||
|
||||
removeInvisibleElements(documentFile);
|
||||
|
||||
log.info("Starting OCR for file {}", fileId);
|
||||
long ocrStart = System.currentTimeMillis();
|
||||
|
||||
Statistics stats = runOcr(tmpDir, documentFile, viewerDocumentFile, fileId, dossierId, idpResultFile, features).getStatistics();
|
||||
Statistics stats = runOcr(tmpDir, documentFile, viewerDocumentFile, fileId, dossierId, analyzeResultFile).getStatistics();
|
||||
|
||||
long ocrEnd = System.currentTimeMillis();
|
||||
log.info("OCR successful, took {}", humanizeDuration(ocrEnd - ocrStart));
|
||||
log.info("ocr successful for file with dossierId {} and fileId {}, took {}", dossierId, fileId, humanizeDuration(ocrEnd - ocrStart));
|
||||
|
||||
if (settings.isDebug()) {
|
||||
logRuntimeBreakdown(ocrEnd, ocrStart, stats);
|
||||
@ -120,37 +117,34 @@ public class OCRService {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public OcrExecutionSupervisor runOcr(Path runDir,
|
||||
File documentFile,
|
||||
File viewerDocumentFile,
|
||||
String fileId,
|
||||
String dossierId,
|
||||
File idpResultFile,
|
||||
Set<AzureOcrFeature> features) {
|
||||
public OcrExecutionSupervisor runOcr(Path tmpDir, File documentFile, File viewerDocumentFile, String fileId, String dossierId, File analyzeResultFile) {
|
||||
|
||||
Path tmpImageDir = tmpDir.resolve("images");
|
||||
Path azureOutputDir = tmpDir.resolve("azure_output");
|
||||
|
||||
Files.createDirectories(azureOutputDir);
|
||||
Files.createDirectories(tmpImageDir);
|
||||
|
||||
try (var in = new FileInputStream(documentFile); PDFDoc pdfDoc = new PDFDoc(in)) {
|
||||
|
||||
OCGWatermarkRemovalService.removeWatermarks(pdfDoc);
|
||||
|
||||
OcrExecutionSupervisor supervisor = new OcrExecutionSupervisor(pdfDoc.getPageCount(), ocrMessageSender, fileId, settings, features);
|
||||
OcrExecutionSupervisor supervisor = new OcrExecutionSupervisor(pdfDoc.getPageCount(), ocrMessageSender, fileId, settings);
|
||||
supervisor.getStatistics().setStart();
|
||||
|
||||
List<PageBatch> batches = batchFactory.splitIntoBatches(pdfDoc, supervisor, features, runDir);
|
||||
Set<Integer> pagesWithImages = imageDetectionService.findPagesToProcess(pdfDoc);
|
||||
ImageProcessingSupervisor imageSupervisor = null;
|
||||
if (settings.isFontStyleDetection()) {
|
||||
imageSupervisor = imageProcessingPipeline.run(pagesWithImages, tmpImageDir, documentFile);
|
||||
}
|
||||
|
||||
OcrResult ocrResult = asyncOcrService.awaitOcr(pdfDoc, supervisor, features, batches);
|
||||
supervisor.logImageExtractionFinished(pdfDoc.getPageCount(), pagesWithImages.size());
|
||||
|
||||
OcrResult ocrResult = asyncOcrService.awaitOcr(pdfDoc, supervisor, pagesWithImages, imageSupervisor);
|
||||
|
||||
viewerDocumentService.addLayerGroups(documentFile, documentFile, ocrResult.regularLayers());
|
||||
viewerDocumentService.addLayerGroups(documentFile, viewerDocumentFile, ocrResult.debugLayers());
|
||||
|
||||
if (features.contains(AzureOcrFeature.ROTATION_CORRECTION)) {
|
||||
RotationCorrectionUtility.rotatePages(documentFile.toPath(), documentFile.toPath(), ocrResult.anglesPerPage());
|
||||
RotationCorrectionUtility.rotatePages(viewerDocumentFile.toPath(), viewerDocumentFile.toPath(), ocrResult.anglesPerPage());
|
||||
}
|
||||
|
||||
if (features.contains(AzureOcrFeature.IDP)) {
|
||||
saveIdpResultFile(idpResultFile, ocrResult);
|
||||
}
|
||||
|
||||
supervisor.getStatistics().drawingPdfFinished();
|
||||
|
||||
supervisor.sendFinished();
|
||||
@ -160,12 +154,4 @@ public class OCRService {
|
||||
|
||||
}
|
||||
|
||||
|
||||
private void saveIdpResultFile(File idpResultFile, OcrResult ocrResult) throws IOException {
|
||||
|
||||
try (var out = new FileOutputStream(idpResultFile)) {
|
||||
mapper.writeValue(out, ocrResult.idpResult());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -5,7 +5,6 @@ import static com.knecon.fforesight.service.ocr.processor.model.Statistics.human
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.ArrayBlockingQueue;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
@ -14,7 +13,6 @@ import java.util.concurrent.CountDownLatch;
|
||||
import com.knecon.fforesight.service.ocr.processor.OcrServiceSettings;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.PageBatch;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.Statistics;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.AzureOcrFeature;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
@ -40,15 +38,12 @@ public class OcrExecutionSupervisor {
|
||||
|
||||
String fileId;
|
||||
|
||||
Set<AzureOcrFeature> features;
|
||||
|
||||
|
||||
public OcrExecutionSupervisor(int totalPageCount, IOcrMessageSender ocrMessageSender, String fileId, OcrServiceSettings settings, Set<AzureOcrFeature> features) {
|
||||
public OcrExecutionSupervisor(int totalPageCount, IOcrMessageSender ocrMessageSender, String fileId, OcrServiceSettings settings) {
|
||||
|
||||
this.totalPageCount = totalPageCount;
|
||||
this.ocrMessageSender = ocrMessageSender;
|
||||
this.fileId = fileId;
|
||||
this.features = features;
|
||||
this.errorPages = Collections.synchronizedSet(new HashSet<>());
|
||||
this.countDownPagesToProcess = new CountDownLatch(totalPageCount);
|
||||
this.statistics = new Statistics();
|
||||
@ -70,16 +65,16 @@ public class OcrExecutionSupervisor {
|
||||
}
|
||||
|
||||
|
||||
public void logImageExtractionFinished(int numberOfPages, int numberOfPagesToProcess) {
|
||||
public void logImageExtractionFinished(int numberOfPages, int numberOfImages) {
|
||||
|
||||
statistics.imageExtractionFinished();
|
||||
log.info("Images found on {}/{} pages in {}", numberOfPagesToProcess, numberOfPages, humanizeDuration(statistics.getImageExtractionDuration()));
|
||||
log.info("Images found on {}/{} pages in {}", numberOfImages, numberOfPages, humanizeDuration(statistics.getImageExtractionDuration()));
|
||||
}
|
||||
|
||||
|
||||
public void logUploadStart(PageBatch pageRange, long bytes) {
|
||||
|
||||
log.info("Batch {}: Start uploading pages {} with {}", pageRange.getIndex(), pageRange, humanizeBytes(bytes));
|
||||
log.info("Start uploading pages {} with {}", pageRange, humanizeBytes(bytes));
|
||||
statistics.getBatchStats(pageRange).start();
|
||||
statistics.increaseTotalBytes(pageRange, bytes);
|
||||
}
|
||||
@ -88,28 +83,27 @@ public class OcrExecutionSupervisor {
|
||||
public void logInProgress(PageBatch pageRange) {
|
||||
|
||||
if (!statistics.getBatchStats(pageRange).isUploadFinished()) {
|
||||
log.info("Batch {}: Pages {} is in progress", pageRange.getIndex(), pageRange);
|
||||
log.info("Pages {} is in progress", pageRange);
|
||||
statistics.getBatchStats(pageRange).finishUpload();
|
||||
ocrMessageSender.sendUpdate(fileId, processedPages(), getTotalPageCount(), features);
|
||||
ocrMessageSender.sendUpdate(fileId, processedPages(), getTotalPageCount());
|
||||
} else {
|
||||
log.debug("Batch {}: Pages {} still in progress", pageRange.getIndex(), pageRange);
|
||||
log.debug("Pages {} still in progress", pageRange);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void finishMappingResult(PageBatch batch) {
|
||||
public void finishMappingResult(PageBatch pageRange) {
|
||||
|
||||
batch.forEach(pageIndex -> countDownPagesToProcess.countDown());
|
||||
statistics.getBatchStats(batch).finishMappingResult();
|
||||
ocrMessageSender.sendUpdate(fileId, this.processedPages(), getTotalPageCount(), features);
|
||||
log.info("Batch {}: Finished mapping result with pages {}", batch.getIndex(), batch);
|
||||
pageRange.forEach(pageIndex -> countDownPagesToProcess.countDown());
|
||||
statistics.getBatchStats(pageRange).finishWritingText();
|
||||
ocrMessageSender.sendUpdate(fileId, this.processedPages(), getTotalPageCount());
|
||||
}
|
||||
|
||||
|
||||
public void logPageSkipped(Integer pageIndex) {
|
||||
|
||||
this.countDownPagesToProcess.countDown();
|
||||
ocrMessageSender.sendUpdate(fileId, this.processedPages(), getTotalPageCount(), features);
|
||||
ocrMessageSender.sendUpdate(fileId, this.processedPages(), getTotalPageCount());
|
||||
log.debug("{}/{}: No images to ocr on page {}", processedPages(), getTotalPageCount(), pageIndex);
|
||||
|
||||
}
|
||||
@ -119,43 +113,21 @@ public class OcrExecutionSupervisor {
|
||||
|
||||
this.errorPages.add(batch);
|
||||
batch.forEach(pageIndex -> this.countDownPagesToProcess.countDown());
|
||||
ocrMessageSender.sendUpdate(fileId, this.processedPages(), getTotalPageCount(), features);
|
||||
log.error("{}/{}: Error occurred in batch {} with pages {}", processedPages(), getTotalPageCount(), batch.getIndex(), batch, e);
|
||||
ocrMessageSender.sendUpdate(fileId, this.processedPages(), getTotalPageCount());
|
||||
log.error("{}/{}: Error occurred on pages {}", processedPages(), getTotalPageCount(), batch, e);
|
||||
}
|
||||
|
||||
|
||||
public void logPageSuccess(PageBatch batch) {
|
||||
|
||||
statistics.getBatchStats(batch).finishApiWait();
|
||||
log.info("{}/{}: Finished OCR in batch {} with pages {}", processedPages(), getTotalPageCount(), batch.getIndex(), batch);
|
||||
log.info("{}/{}: Finished OCR on pages {}", processedPages(), getTotalPageCount(), batch);
|
||||
}
|
||||
|
||||
|
||||
private int processedPages() {
|
||||
|
||||
if (countDownPagesToProcess.getCount() == 0) {
|
||||
return totalPageCount;
|
||||
}
|
||||
int processedPages = 0;
|
||||
for (Map.Entry<PageBatch, BatchStats> entry : statistics.getBatchStats().entrySet()) {
|
||||
PageBatch pageBatch = entry.getKey();
|
||||
BatchStats batchStats = entry.getValue();
|
||||
float percentage = 0;
|
||||
if (batchStats.isBatchRenderFinished()) {
|
||||
percentage += 0.1f;
|
||||
}
|
||||
if (batchStats.isUploadFinished()) {
|
||||
percentage += 0.3f;
|
||||
}
|
||||
if (batchStats.isApiWaitFinished()) {
|
||||
percentage += 0.3f;
|
||||
}
|
||||
if (batchStats.isMappingResultFinished()) {
|
||||
percentage += 0.3f;
|
||||
}
|
||||
processedPages += (int) (pageBatch.size() * percentage);
|
||||
}
|
||||
return processedPages;
|
||||
return (int) (totalPageCount - countDownPagesToProcess.getCount());
|
||||
}
|
||||
|
||||
|
||||
@ -172,7 +144,7 @@ public class OcrExecutionSupervisor {
|
||||
requireNoErrors();
|
||||
|
||||
log.info("{}/{}: Finished OCR on all pages", getTotalPageCount(), getTotalPageCount());
|
||||
ocrMessageSender.sendOcrFinished(fileId, getTotalPageCount(), features);
|
||||
ocrMessageSender.sendOcrFinished(fileId, getTotalPageCount());
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -1,510 +0,0 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.azure.ai.documentintelligence.models.AnalyzeResult;
|
||||
import com.azure.ai.documentintelligence.models.BoundingRegion;
|
||||
import com.azure.ai.documentintelligence.models.DocumentFontStyle;
|
||||
import com.azure.ai.documentintelligence.models.DocumentPage;
|
||||
import com.azure.ai.documentintelligence.models.DocumentSpan;
|
||||
import com.azure.ai.documentintelligence.models.DocumentStyle;
|
||||
import com.azure.ai.documentintelligence.models.DocumentTable;
|
||||
import com.azure.ai.documentintelligence.models.DocumentTableCell;
|
||||
import com.azure.ai.documentintelligence.models.DocumentWord;
|
||||
import com.google.common.base.Functions;
|
||||
import com.knecon.fforesight.service.ocr.processor.OcrServiceSettings;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ImageFile;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.PageBatch;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.PageInformation;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.SpanLookup;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.BBoxSnuggificationService;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.FontStyleDetector;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.ImageProcessingPipeline;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.ImageProcessingSupervisor;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.StrokeWidthCalculator;
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.WritableOcrResult;
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.fonts.FontMetricsProvider;
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.fonts.FontStyle;
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.fonts.Type0FontMetricsProvider;
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.utils.RotationCorrectionUtility;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.AzureOcrFeature;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.QuadPoint;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import net.sourceforge.lept4j.Box;
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.lept4j.util.LeptUtils;
|
||||
|
||||
@Slf4j
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class OcrResultPostProcessingPipeline {
|
||||
|
||||
@Getter
|
||||
Map<Integer, AffineTransform> resultToPageTransforms;
|
||||
Map<Integer, PageInformation> pageInformation;
|
||||
ImageProcessingPipeline imageProcessingPipeline;
|
||||
OcrServiceSettings settings;
|
||||
Set<AzureOcrFeature> features;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public OcrResultPostProcessingPipeline(Map<Integer, PageInformation> pageInformation,
|
||||
ImageProcessingPipeline imageProcessingPipeline,
|
||||
OcrServiceSettings settings,
|
||||
Set<AzureOcrFeature> features) {
|
||||
|
||||
this.imageProcessingPipeline = imageProcessingPipeline;
|
||||
this.pageInformation = pageInformation;
|
||||
resultToPageTransforms = Collections.synchronizedMap(new HashMap<>());
|
||||
this.settings = settings;
|
||||
this.features = features;
|
||||
}
|
||||
|
||||
|
||||
public List<WritableOcrResult> processAnalyzeResult(AnalyzeResult analyzeResult, PageBatch batch) throws InterruptedException {
|
||||
|
||||
ImageProcessingSupervisor imageSupervisor = renderImagesIfNecessary(analyzeResult, batch);
|
||||
|
||||
List<WritableOcrResult> writableOcrResultList = new ArrayList<>();
|
||||
|
||||
Lookups lookups = getLookups(analyzeResult);
|
||||
|
||||
for (DocumentPage resultPage : analyzeResult.getPages()) {
|
||||
|
||||
PageInformation pageInformation = getPageInformation(getPageNumber(batch, resultPage));
|
||||
AffineTransform resultToPageTransform = buildResultToPageTransform(pageInformation, resultPage.getWidth());
|
||||
resultToPageTransforms.put(getPageNumber(batch, resultPage), resultToPageTransform);
|
||||
|
||||
List<TextPositionInImage> words = buildTextPositionsInImage(batch, resultPage, resultToPageTransform, lookups, pageInformation, imageSupervisor);
|
||||
|
||||
var builder = WritableOcrResult.builder().pageNumber(pageInformation.number()).textPositionInImage(words).angle(-resultPage.getAngle());
|
||||
|
||||
if (settings.isDrawTablesAsLines()) {
|
||||
builder.tableLines(getTableLines(analyzeResult, pageInformation, resultToPageTransform));
|
||||
}
|
||||
|
||||
writableOcrResultList.add(builder.build());
|
||||
|
||||
}
|
||||
log.debug("Batch {}: finished post-processing.", batch.getIndex());
|
||||
return writableOcrResultList;
|
||||
}
|
||||
|
||||
|
||||
private ImageProcessingSupervisor renderImagesIfNecessary(AnalyzeResult analyzeResult, PageBatch batch) {
|
||||
|
||||
ImageProcessingSupervisor imageSupervisor = null;
|
||||
if (useRenderedImages()) {
|
||||
|
||||
Map<Integer, Double> anglesPerPage = analyzeResult.getPages()
|
||||
.stream()
|
||||
.collect(Collectors.toMap(DocumentPage::getPageNumber, documentPage -> -documentPage.getAngle()));
|
||||
RotationCorrectionUtility.rotatePages(batch.getBatchDoc(), batch.getBatchDoc(), anglesPerPage);
|
||||
imageSupervisor = imageProcessingPipeline.addToPipeline(batch);
|
||||
}
|
||||
return imageSupervisor;
|
||||
}
|
||||
|
||||
|
||||
private boolean useRenderedImages() {
|
||||
|
||||
if (settings.isAzureFontStyleDetection() && features.contains(AzureOcrFeature.FONT_STYLE_DETECTION)) {
|
||||
return false;
|
||||
}
|
||||
return settings.isSnuggify() || features.contains(AzureOcrFeature.FONT_STYLE_DETECTION);
|
||||
}
|
||||
|
||||
|
||||
private List<TextPositionInImage> buildTextPositionsInImage(PageBatch pageOffset,
|
||||
DocumentPage resultPage,
|
||||
AffineTransform resultToPageTransform,
|
||||
Lookups lookups,
|
||||
PageInformation pageInformation,
|
||||
ImageProcessingSupervisor imageSupervisor) throws InterruptedException {
|
||||
|
||||
if (!useRenderedImages()) {
|
||||
return buildText(resultPage, resultToPageTransform, lookups, pageInformation);
|
||||
}
|
||||
|
||||
ImageFile imageFile = imageSupervisor.awaitProcessedPage(getPageNumber(pageOffset, resultPage));
|
||||
|
||||
if (imageFile == null) {
|
||||
return buildText(resultPage, resultToPageTransform, lookups, pageInformation);
|
||||
}
|
||||
|
||||
synchronized (ImageProcessingSupervisor.class) {
|
||||
// Leptonica is not thread safe, but is being called in ImageProcessingService as well
|
||||
|
||||
if (features.contains(AzureOcrFeature.FONT_STYLE_DETECTION)) {
|
||||
return buildTextWithBoldDetection(resultPage, resultToPageTransform, pageInformation, imageFile);
|
||||
}
|
||||
|
||||
return buildTextWithSnugBBoxes(resultPage, imageFile, resultToPageTransform, lookups, pageInformation);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private List<TextPositionInImage> buildTextWithBoldDetection(DocumentPage resultPage,
|
||||
AffineTransform resultToPageTransform,
|
||||
PageInformation pageInformation,
|
||||
ImageFile imageFile) {
|
||||
|
||||
Pix pageImage = imageFile.readPix();
|
||||
List<TextPositionInImage> words = new ArrayList<>();
|
||||
|
||||
try (FontStyleDetector fontStyleDetector = new FontStyleDetector()) {
|
||||
|
||||
AffineTransform resultToImageTransform = buildResultToImageTransform(resultPage, pageImage);
|
||||
|
||||
for (DocumentWord word : resultPage.getWords()) {
|
||||
TextPositionInImage textPosition;
|
||||
if (canBeSnuggified(resultPage, resultToImageTransform)) {
|
||||
textPosition = buildTextPositionInImageWithSnugBBox(word,
|
||||
resultToPageTransform,
|
||||
new FontInformation(FontStyle.REGULAR, Type0FontMetricsProvider.REGULAR_INSTANCE),
|
||||
pageImage,
|
||||
resultToImageTransform);
|
||||
} else {
|
||||
textPosition = new TextPositionInImage(QuadPoint.fromPolygons(word.getPolygon()),
|
||||
word.getContent(),
|
||||
resultToPageTransform,
|
||||
new FontInformation(FontStyle.REGULAR, Type0FontMetricsProvider.REGULAR_INSTANCE).font(),
|
||||
new FontInformation(FontStyle.REGULAR, Type0FontMetricsProvider.REGULAR_INSTANCE).fontStyle(),
|
||||
false);
|
||||
}
|
||||
|
||||
if (intersectsIgnoreZone(pageInformation.wordBBoxes(), textPosition)) {
|
||||
textPosition.setOverlapsIgnoreZone(true);
|
||||
}
|
||||
|
||||
QuadPoint originTransformed = QuadPoint.fromPolygons(word.getPolygon()).getTransformed(resultToImageTransform);
|
||||
Pix wordImage = extractWordImage(originTransformed, pageImage);
|
||||
|
||||
if (wordImage == null) {
|
||||
log.debug("Unable to extract word image! wordImage: {}, pageImage {}", originTransformed.getBounds2D(), new Rectangle2D.Float(0, 0, pageImage.w, pageImage.h));
|
||||
continue;
|
||||
}
|
||||
|
||||
if (StrokeWidthCalculator.wordImageHasMinimumPixelDensity(wordImage)) {
|
||||
fontStyleDetector.add(textPosition, wordImage, textPosition.getFontSizeByHeight());
|
||||
}
|
||||
|
||||
words.add(textPosition);
|
||||
}
|
||||
|
||||
fontStyleDetector.classifyWords();
|
||||
|
||||
} finally {
|
||||
LeptUtils.disposePix(pageImage);
|
||||
}
|
||||
|
||||
return words;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static AffineTransform buildResultToImageTransform(DocumentPage resultPage, Pix pageImage) {
|
||||
|
||||
int quadrant = RotationCorrectionUtility.getQuadrantRotation(-resultPage.getAngle());
|
||||
AffineTransform rotationCorrection = RotationCorrectionUtility.buildTransform(-resultPage.getAngle(), pageImage.w, pageImage.h);
|
||||
AffineTransform imageTransform = new AffineTransform();
|
||||
double scalingFactor = switch (quadrant) {
|
||||
case 1, 3 -> pageImage.h / resultPage.getWidth();
|
||||
default -> pageImage.w / resultPage.getWidth();
|
||||
};
|
||||
imageTransform.concatenate(rotationCorrection);
|
||||
imageTransform.scale(scalingFactor, scalingFactor);
|
||||
return imageTransform;
|
||||
}
|
||||
|
||||
|
||||
public static Pix extractWordImage(QuadPoint wordPosition, Pix pageImage) {
|
||||
|
||||
Rectangle2D wordBBox = wordPosition.getBounds2D();
|
||||
Rectangle2D pageBBox = new Rectangle2D.Double(0, 0, pageImage.w, pageImage.h);
|
||||
|
||||
if (!pageBBox.contains(wordBBox)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
Box box = new Box((int) wordBBox.getX(), (int) wordBBox.getY(), (int) wordBBox.getWidth(), (int) wordBBox.getHeight(), 1);
|
||||
Pix wordImage = Leptonica1.pixClipRectangle(pageImage, box, null);
|
||||
box.clear();
|
||||
return wordImage;
|
||||
}
|
||||
|
||||
|
||||
public List<TextPositionInImage> buildTextWithSnugBBoxes(DocumentPage resultPage,
|
||||
ImageFile imageFile,
|
||||
AffineTransform pageCtm,
|
||||
Lookups lookups,
|
||||
PageInformation pageInformation) {
|
||||
|
||||
Pix pageImage = imageFile.readPix();
|
||||
AffineTransform resultToImageTransform = buildResultToImageTransform(resultPage, pageImage);
|
||||
|
||||
boolean snuggify = canBeSnuggified(resultPage, resultToImageTransform);
|
||||
|
||||
List<TextPositionInImage> list = new ArrayList<>();
|
||||
for (DocumentWord word : resultPage.getWords()) {
|
||||
|
||||
FontInformation fontInformation = FontInformation.determineStyle(word, lookups);
|
||||
|
||||
TextPositionInImage textPositionInImage;
|
||||
if (snuggify) {
|
||||
textPositionInImage = buildTextPositionInImageWithSnugBBox(word, pageCtm, fontInformation, pageImage, resultToImageTransform);
|
||||
} else {
|
||||
textPositionInImage = new TextPositionInImage(QuadPoint.fromPolygons(word.getPolygon()),
|
||||
word.getContent(),
|
||||
pageCtm,
|
||||
fontInformation.font(),
|
||||
fontInformation.fontStyle(),
|
||||
false);
|
||||
}
|
||||
markTextOverlappingIgnoreZone(textPositionInImage, pageInformation.wordBBoxes());
|
||||
list.add(textPositionInImage);
|
||||
}
|
||||
LeptUtils.disposePix(pageImage);
|
||||
return list;
|
||||
}
|
||||
|
||||
|
||||
private boolean canBeSnuggified(DocumentPage resultPage, AffineTransform resultToImageTransform) {
|
||||
|
||||
return settings.isSnuggify() && BBoxSnuggificationService.canBeSnuggified(resultPage, resultToImageTransform);
|
||||
}
|
||||
|
||||
|
||||
public List<TextPositionInImage> buildText(DocumentPage resultPage, AffineTransform pageCtm, Lookups lookups, PageInformation pageInformation) {
|
||||
|
||||
return resultPage.getWords()
|
||||
.stream()
|
||||
.map(word -> new TextPositionInImage(QuadPoint.fromPolygons(word.getPolygon()),
|
||||
word.getContent(),
|
||||
pageCtm,
|
||||
FontInformation.determineStyle(word, lookups).font(),
|
||||
FontInformation.determineStyle(word, lookups).fontStyle(),
|
||||
false))
|
||||
.map(textPositionInImage -> markTextOverlappingIgnoreZone(textPositionInImage, pageInformation.wordBBoxes()))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
private static int getPageNumber(PageBatch pageBatch, DocumentPage resultPage) {
|
||||
|
||||
return pageBatch.getPageNumber(resultPage.getPageNumber());
|
||||
}
|
||||
|
||||
|
||||
private static Lookups getLookups(AnalyzeResult analyzeResult) {
|
||||
|
||||
if (analyzeResult.getStyles() == null || analyzeResult.getStyles().isEmpty()) {
|
||||
return Lookups.empty();
|
||||
}
|
||||
|
||||
// Azure stopped supporting bold text detection in 1.0.0 release
|
||||
SpanLookup<DocumentSpan> boldLookup = new SpanLookup<>(Stream.empty(), Function.identity());
|
||||
|
||||
SpanLookup<DocumentSpan> italicLookup = new SpanLookup<>(analyzeResult.getStyles()
|
||||
.stream()
|
||||
.filter(style -> Objects.equals(style.getFontStyle(),
|
||||
DocumentFontStyle.ITALIC))
|
||||
.map(DocumentStyle::getSpans)
|
||||
.flatMap(Collection::stream), Functions.identity());
|
||||
|
||||
SpanLookup<DocumentSpan> handWrittenLookup = new SpanLookup<>(analyzeResult.getStyles()
|
||||
.stream()
|
||||
.filter(documentStyle -> documentStyle.isHandwritten() != null && documentStyle.isHandwritten())
|
||||
.map(DocumentStyle::getSpans)
|
||||
.flatMap(Collection::stream), Functions.identity());
|
||||
|
||||
return new Lookups(boldLookup, italicLookup, handWrittenLookup);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private TextPositionInImage buildTextPositionInImageWithSnugBBox(DocumentWord dw,
|
||||
AffineTransform imageCTM,
|
||||
FontInformation fontInformation,
|
||||
Pix pageImage,
|
||||
AffineTransform resultToImageTransform) {
|
||||
|
||||
QuadPoint origin = QuadPoint.fromPolygons(dw.getPolygon());
|
||||
|
||||
Optional<QuadPoint> snugBBox = BBoxSnuggificationService.snuggify(pageImage, dw, resultToImageTransform);
|
||||
|
||||
return new TextPositionInImage(snugBBox.orElse(origin), dw.getContent(), imageCTM, fontInformation.font(), fontInformation.fontStyle(), snugBBox.isPresent());
|
||||
}
|
||||
|
||||
|
||||
private record FontInformation(FontStyle fontStyle, FontMetricsProvider font) {
|
||||
|
||||
public static FontInformation determineStyle(DocumentWord dw, Lookups lookups) {
|
||||
|
||||
boolean bold = lookups.bold().containedInAnySpan(dw.getSpan());
|
||||
boolean italic = lookups.italic().containedInAnySpan(dw.getSpan());
|
||||
boolean handwritten = lookups.handwritten().containedInAnySpan(dw.getSpan());
|
||||
|
||||
FontStyle fontStyle;
|
||||
FontMetricsProvider font;
|
||||
if (handwritten) {
|
||||
fontStyle = FontStyle.HANDWRITTEN;
|
||||
font = Type0FontMetricsProvider.REGULAR_INSTANCE;
|
||||
} else if (italic && bold) {
|
||||
fontStyle = FontStyle.BOLD_ITALIC;
|
||||
font = Type0FontMetricsProvider.BOLD_ITALIC_INSTANCE;
|
||||
} else if (bold) {
|
||||
fontStyle = FontStyle.BOLD;
|
||||
font = Type0FontMetricsProvider.BOLD_INSTANCE;
|
||||
} else if (italic) {
|
||||
fontStyle = FontStyle.ITALIC;
|
||||
font = Type0FontMetricsProvider.ITALIC_INSTANCE;
|
||||
} else {
|
||||
fontStyle = FontStyle.REGULAR;
|
||||
font = Type0FontMetricsProvider.REGULAR_INSTANCE;
|
||||
}
|
||||
return new FontInformation(fontStyle, font);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static List<Line2D> getTableLines(AnalyzeResult analyzeResult, PageInformation pageInformation, AffineTransform imageCTM) {
|
||||
|
||||
if (analyzeResult.getTables() == null || analyzeResult.getTables().isEmpty()) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
return analyzeResult.getTables()
|
||||
.stream()
|
||||
.map(DocumentTable::getCells)
|
||||
.flatMap(Collection::stream)
|
||||
.map(DocumentTableCell::getBoundingRegions)
|
||||
.flatMap(Collection::stream)
|
||||
.filter(table -> table.getPageNumber() == pageInformation.number())
|
||||
.map(BoundingRegion::getPolygon)
|
||||
.map(QuadPoint::fromPolygons)
|
||||
.map(qp -> qp.getTransformed(imageCTM))
|
||||
.flatMap(QuadPoint::asLines)
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
private static TextPositionInImage markTextOverlappingIgnoreZone(TextPositionInImage textPositionInImage, List<Rectangle2D> ignoreZones) {
|
||||
|
||||
if (intersectsIgnoreZone(ignoreZones, textPositionInImage)) {
|
||||
textPositionInImage.setOverlapsIgnoreZone(true);
|
||||
}
|
||||
|
||||
return textPositionInImage;
|
||||
}
|
||||
|
||||
|
||||
private static boolean intersectsIgnoreZone(List<Rectangle2D> ignoreZones, TextPositionInImage textPositionInImage) {
|
||||
|
||||
for (Rectangle2D ignoreZone : ignoreZones) {
|
||||
Rectangle2D textBBox = textPositionInImage.getTransformedTextBBox().getBounds2D();
|
||||
if (textBBox.intersects(ignoreZone)) {
|
||||
double intersectedArea = calculateIntersectedArea(textBBox, ignoreZone);
|
||||
double textArea = textBBox.getWidth() * textBBox.getHeight();
|
||||
if (intersectedArea / textArea > 0.5) {
|
||||
return true;
|
||||
}
|
||||
double ignoreZoneArea = ignoreZone.getWidth() * ignoreZone.getHeight();
|
||||
if (intersectedArea / ignoreZoneArea > 0.5) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
public static double calculateIntersectedArea(Rectangle2D r1, Rectangle2D r2) {
|
||||
|
||||
double xOverlap = Math.max(0, Math.min(r1.getMaxX(), r2.getMaxX()) - Math.max(r1.getMinX(), r2.getMinX()));
|
||||
double yOverlap = Math.max(0, Math.min(r1.getMaxY(), r2.getMaxY()) - Math.max(r1.getY(), r2.getY()));
|
||||
|
||||
return xOverlap * yOverlap;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Builds the affine transform that maps OCR-result (page-image pixel) coordinates
 * into PDF page coordinates: scale image pixels to page units, mirror the y-axis
 * (image origin is top-left, PDF origin bottom-left), then undo the page rotation.
 *
 * @param pageInformation page size and rotation reported by the PDF
 * @param imageWidth      width of the rendered page image in pixels
 * @return combined transform, applied as rotation ∘ mirror ∘ scaling
 */
public static AffineTransform buildResultToPageTransform(PageInformation pageInformation, double imageWidth) {

    double scalingFactor = calculateScalingFactor(imageWidth, pageInformation);
    AffineTransform imageToCropBoxScaling = new AffineTransform(scalingFactor, 0, 0, scalingFactor, 0, 0);

    // Flip y: image coordinates grow downwards, PDF coordinates grow upwards.
    AffineTransform mirrorMatrix = new AffineTransform(1, 0, 0, -1, 0, pageInformation.height());

    AffineTransform rotationMatrix = switch (pageInformation.rotationDegrees()) {
        case 90 -> new AffineTransform(0, 1, -1, 0, pageInformation.height(), 0);
        case 180 -> new AffineTransform(-1, 0, 0, -1, pageInformation.width(), pageInformation.height());
        // NOTE(review): the 270° translation (width - height, height) is not symmetric to
        // the 90° case — presumably compensating for PDFBox's rotation-relative page sizes
        // (see calculateScalingFactor); verify against a 270°-rotated document.
        case 270 -> new AffineTransform(0, -1, 1, 0, pageInformation.width() - pageInformation.height(), pageInformation.height());
        default -> new AffineTransform();
    };

    // matrix multiplication is performed from right to left, so the order is reversed.
    // scaling -> mirror -> rotation
    AffineTransform resultMatrix = new AffineTransform();

    resultMatrix.concatenate(rotationMatrix);
    resultMatrix.concatenate(mirrorMatrix);
    resultMatrix.concatenate(imageToCropBoxScaling);
    return resultMatrix;
}
|
||||
|
||||
|
||||
private static double calculateScalingFactor(double width, PageInformation pageInformation) {
|
||||
|
||||
// PDFBox always returns page height and width based on rotation
|
||||
double pageWidth;
|
||||
if (pageInformation.rotationDegrees() == 90 || pageInformation.rotationDegrees() == 270) {
|
||||
pageWidth = pageInformation.height();
|
||||
} else {
|
||||
pageWidth = pageInformation.width();
|
||||
}
|
||||
|
||||
return pageWidth / width;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Looks up the cached {@link PageInformation} for the given page number from the
 * instance's {@code pageInformation} map/cache.
 * NOTE(review): assumes an entry exists for every requested page — behaviour for a
 * missing page depends on the container's {@code get} contract; confirm.
 */
@SneakyThrows
private PageInformation getPageInformation(Integer pageNumber) {

    return pageInformation.get(pageNumber);
}
|
||||
|
||||
|
||||
public record Lookups(SpanLookup<DocumentSpan> bold, SpanLookup<DocumentSpan> italic, SpanLookup<DocumentSpan> handwritten) {
|
||||
|
||||
public static Lookups empty() {
|
||||
|
||||
return new Lookups(new SpanLookup<>(Stream.empty(), Function.identity()),
|
||||
new SpanLookup<>(Stream.empty(), Function.identity()),
|
||||
new SpanLookup<>(Stream.empty(), Function.identity()));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,215 +0,0 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service.imageprocessing;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.nio.IntBuffer;
|
||||
import java.util.Optional;
|
||||
|
||||
import com.azure.ai.documentintelligence.models.DocumentPage;
|
||||
import com.azure.ai.documentintelligence.models.DocumentWord;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.OcrResultPostProcessingPipeline;
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.utils.RotationCorrectionUtility;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.QuadPoint;
|
||||
import com.sun.jna.Pointer;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.UtilityClass;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Numa;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.lept4j.util.LeptUtils;
|
||||
|
||||
/**
 * This class attempts to shrink the BBox of a word to match the exact height of the word. This is only attempted for horizontal or vertical words. Any askew text is left as is.
 */
@Slf4j
@UtilityClass
public class BBoxSnuggificationService {

    public static final int PIXEL_COUNT_THRESHOLD = 2; // minimum active pixel count per row for shrinking to stop
    private static final double AVERAGE_ANGLE_THRESHOLD = 0.2; // Skips snuggification, if the average remaining word rotation of a word, written from left-to-right is bigger than this
    public static final int INDIVIDUAL_ANGLE_THRESHOLD = 5; // skips snuggification for word, if the remaining rotation is larger than this angle
    public static final int MAX_SHRINK_PIXELS = 40; // Number of pixels that are allowed to be removed from the top or bottom of an image
    private static final int MINIMUM_WORD_PIXELS = 5; // Number of pixels that are required for snuggification

    // Which axes of the word bbox may be shrunk for a given word orientation.
    private enum Operation {
        HORIZONTAL,
        VERTICAL,
        BOTH,
        NONE
    }


    /**
     * Attempts to tighten the bounding box of {@code origin} to the actual glyph pixels
     * on the rendered page image.
     *
     * @param pageImage              rendered page (Leptonica Pix); null short-circuits to empty
     * @param origin                 the OCR'd word whose polygon is to be tightened
     * @param resultToImageTransform maps result coordinates into page-image pixels
     * @return the tightened quad mapped back into result coordinates, or empty when
     *         snuggification is not applicable or fails
     */
    @SneakyThrows
    public Optional<QuadPoint> snuggify(Pix pageImage, DocumentWord origin, AffineTransform resultToImageTransform) {

        if (pageImage == null) {
            return Optional.empty();
        }

        if (origin.getContent().equals("-") || origin.getContent().equals(",")) {
            // very slim characters should not be snuggified, or the fontsize may be off significantly
            return Optional.empty();
        }

        // Work in image-pixel space; orientation and residual skew decide what to shrink.
        QuadPoint originTransformed = QuadPoint.fromPolygons(origin.getPolygon()).getTransformed(resultToImageTransform);
        double remainingAngle = Math.abs(RotationCorrectionUtility.getRemainingAngle(originTransformed.getAngle()));
        QuadPoint.Direction direction = originTransformed.getDirection();

        Operation operation = determineOperation(origin, direction, remainingAngle, originTransformed);

        if (operation == Operation.NONE) {
            return Optional.empty();
        }

        Pix wordImage = OcrResultPostProcessingPipeline.extractWordImage(originTransformed, pageImage);

        if (wordImage == null) {
            log.debug("Unable to extract word image! wordImage: {}, pageImage {}", originTransformed.getBounds2D(), new Rectangle2D.Float(0, 0, pageImage.w, pageImage.h));
            return Optional.empty();
        }

        if (!StrokeWidthCalculator.wordImageHasMinimumPixelDensity(wordImage)) {
            return Optional.empty();
        }

        // HORIZONTAL text shrinks in y, VERTICAL text in x, BOTH shrinks in both axes.
        Optional<Rectangle2D> snugBox = switch (operation) {
            case HORIZONTAL -> snuggifyY(wordImage, originTransformed.getBounds2D());
            case VERTICAL -> snuggifyX(wordImage, originTransformed.getBounds2D());
            case BOTH -> snuggifyBoth(wordImage, originTransformed);
            default -> Optional.empty();
        };

        // Native Pix must be released explicitly.
        LeptUtils.disposePix(wordImage);

        AffineTransform imageToResultTransform = resultToImageTransform.createInverse();
        return snugBox.map(snugBBox -> QuadPoint.fromRectangle2D(snugBBox, direction))
                      .map(bbox -> bbox.getTransformed(imageToResultTransform));

    }


    // Shrinks both axes independently and intersects the results; falls back to
    // whichever single-axis shrink succeeded.
    private Optional<Rectangle2D> snuggifyBoth(Pix wordImage, QuadPoint originTransformed) {

        Optional<Rectangle2D> snugY = snuggifyY(wordImage, originTransformed.getBounds2D());
        Optional<Rectangle2D> snugX = snuggifyX(wordImage, originTransformed.getBounds2D());
        if (snugY.isPresent() && snugX.isPresent()) {
            return Optional.of(snugY.get().createIntersection(snugX.get()).getBounds2D());
        } else if (snugY.isPresent()) {
            return snugY;
        } else {
            return snugX;
        }
    }


    // Decides which axes may be shrunk: axis-aligned words with small residual skew get
    // a single-axis shrink; short or near-level words get BOTH; anything else NONE.
    private Operation determineOperation(DocumentWord origin, QuadPoint.Direction direction, double remainingAngle, QuadPoint originTransformed) {

        Operation operation = Operation.NONE;
        if (((direction.equals(QuadPoint.Direction.RIGHT) || direction.equals(QuadPoint.Direction.LEFT)) && remainingAngle < INDIVIDUAL_ANGLE_THRESHOLD)) {
            operation = Operation.HORIZONTAL;
        } else if ((direction.equals(QuadPoint.Direction.UP) || direction.equals(QuadPoint.Direction.DOWN)) && remainingAngle < INDIVIDUAL_ANGLE_THRESHOLD) {
            operation = Operation.VERTICAL;
        } else if ((origin.getContent().length() < 4 || Math.abs(originTransformed.getAngle()) < AVERAGE_ANGLE_THRESHOLD * 3)) {
            return Operation.BOTH;
        }
        return operation;
    }


    // Shrinks the box horizontally: moves the left edge right and the right edge left
    // until a column with more than PIXEL_COUNT_THRESHOLD active pixels is found, at
    // most MAX_SHRINK_PIXELS per side.
    private Optional<Rectangle2D> snuggifyX(Pix wordImage, Rectangle2D origin) {

        Numa colCounts = Leptonica1.pixCountPixelsByColumn(wordImage);
        int start = 0;
        // NOTE(review): end starts at w - PIXEL_COUNT_THRESHOLD here but at h - 1 in
        // snuggifyY, and the "nothing shrunk" guard below compares end == wordImage.w,
        // which can never hold with this initialisation — verify whether that guard is
        // meant to fire (cf. the end == wordImage.h guard in snuggifyY, also unreachable).
        int end = wordImage.w - PIXEL_COUNT_THRESHOLD;
        for (int i = start; i < Math.min(wordImage.w, MAX_SHRINK_PIXELS); i++) {
            if (pixCountPerColumn(i, colCounts) > PIXEL_COUNT_THRESHOLD) {
                start = i;
                break;
            }
        }
        for (int i = end; i > Math.max(0, wordImage.w - MAX_SHRINK_PIXELS); i--) {
            if (pixCountPerColumn(i, colCounts) > PIXEL_COUNT_THRESHOLD) {
                end = i;
                break;
            }
        }
        if (start == 0 && end == wordImage.w) {
            return Optional.empty();
        }
        if (Math.abs(start - end) < MINIMUM_WORD_PIXELS) {
            return Optional.empty();
        }
        return Optional.of(new Rectangle2D.Double(origin.getX() + start, origin.getY(), origin.getWidth() - start - (wordImage.w - end), origin.getHeight()));
    }


    // Shrinks the box vertically: same scan as snuggifyX but over pixel counts per row.
    private Optional<Rectangle2D> snuggifyY(Pix wordImage, Rectangle2D origin) {

        int start = 0;
        int end = wordImage.h - 1;
        for (int i = start; i < Math.min(wordImage.h, MAX_SHRINK_PIXELS); i++) {
            if (pixCountPerRow(i, wordImage) > PIXEL_COUNT_THRESHOLD) {
                start = i;
                break;
            }
        }
        for (int i = end; i > Math.max(0, wordImage.h - MAX_SHRINK_PIXELS); i--) {
            if (pixCountPerRow(i, wordImage) > PIXEL_COUNT_THRESHOLD) {
                end = i;
                break;
            }
        }
        if (start == 0 && end == wordImage.h) {
            return Optional.empty();
        }
        if (Math.abs(start - end) < MINIMUM_WORD_PIXELS) {
            return Optional.empty();
        }
        return Optional.of(new Rectangle2D.Double(origin.getX(), origin.getY() + start, origin.getWidth(), origin.getHeight() - start - (wordImage.h - end)));
    }


    // Counts active pixels in one row via Leptonica; returns -1 when the native call
    // reports failure (non-zero status).
    private int pixCountPerRow(int row, Pix pix) {

        IntBuffer result = IntBuffer.allocate(1);
        int success = Leptonica1.pixCountPixelsInRow(pix, row, result, null);
        if (success == 0) {
            return result.get();
        } else {
            return -1;
        }
    }


    // Reads the per-column pixel count for one column out of the native Numa array.
    private int pixCountPerColumn(int column, Numa colCounts) {

        // NOTE(review): bound check uses '>' — column == colCounts.n would pass the check
        // and read one element past the end of the native array; presumably should be
        // '>=' — confirm against the Numa layout.
        if (column > colCounts.n) {
            throw new IndexOutOfBoundsException("column " + column + " is out of bounds for column count " + colCounts.n);
        }
        Pointer pointer = colCounts.array.getPointer();

        // Read the float value at position i. Each float takes 4 bytes.
        return (int) pointer.getFloat((long) column * Float.BYTES);
    }


    /**
     * Decides whether a result page is flat enough to snuggify at all: averages the
     * residual rotation of left-to-right words with at least four characters and
     * requires it to be below {@link #AVERAGE_ANGLE_THRESHOLD}. Pages with no such
     * words default to Double.MAX_VALUE and are rejected.
     */
    public boolean canBeSnuggified(DocumentPage resultPage, AffineTransform imageTransform) {

        double averageAngle = resultPage.getWords()
                                        .stream()
                                        .filter(word -> word.getContent().length() >= 4)
                                        .map(DocumentWord::getPolygon)
                                        .map(QuadPoint::fromPolygons)
                                        .map(qp -> qp.getTransformed(imageTransform))
                                        .filter(qp -> qp.getDirection().equals(QuadPoint.Direction.RIGHT))
                                        .mapToDouble(QuadPoint::getAngle)
                                        .map(Math::toDegrees)
                                        .map(RotationCorrectionUtility::getRemainingAngle).average()
                                        .orElse(Double.MAX_VALUE);

        return Math.abs(averageAngle) < AVERAGE_ANGLE_THRESHOLD;
    }

}
|
||||
@ -84,7 +84,6 @@ public class FontStyleDetector implements Closeable {
|
||||
wordImage.textPosition().setFontMetricsProvider(Type0FontMetricsProvider.BOLD_INSTANCE);
|
||||
wordImage.textPosition().setFontStyle(FontStyle.BOLD);
|
||||
} else {
|
||||
wordImage.textPosition().setFontMetricsProvider(Type0FontMetricsProvider.REGULAR_INSTANCE);
|
||||
wordImage.textPosition().setFontStyle(FontStyle.REGULAR);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,20 +1,16 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service.imageprocessing;
|
||||
|
||||
import static com.knecon.fforesight.service.ocr.processor.utils.ListSplittingUtils.formatIntervals;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.function.Consumer;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.slf4j.MDC;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ImageFile;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.PageBatch;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
@ -31,7 +27,7 @@ public class GhostScriptOutputHandler extends Thread {
|
||||
|
||||
// If the stdError or stdOut buffer of a thread is not being emptied it might lock the process in case of errors, so we need to empty both streams to prevent a deadlock.
|
||||
// Since both need to read simultaneously we need to implement the readers as separate threads.
|
||||
final int batchIdx;
|
||||
|
||||
final InputStream is;
|
||||
final String processName;
|
||||
final Type type;
|
||||
@ -40,32 +36,24 @@ public class GhostScriptOutputHandler extends Thread {
|
||||
final Consumer<ImageFile> outputHandler;
|
||||
final Consumer<String> errorHandler;
|
||||
|
||||
final Map<String, String> parentMdcContext;
|
||||
|
||||
int currentPageNumber;
|
||||
|
||||
|
||||
public static GhostScriptOutputHandler stdError(int batchIdx, InputStream is, Consumer<String> errorHandler) {
|
||||
public static GhostScriptOutputHandler stdError(InputStream is, Consumer<String> errorHandler) {
|
||||
|
||||
return new GhostScriptOutputHandler(batchIdx, is, "GS", Type.ERROR, null, null, errorHandler, MDC.getCopyOfContextMap());
|
||||
return new GhostScriptOutputHandler(is, "GS", Type.ERROR, null, null, errorHandler);
|
||||
}
|
||||
|
||||
|
||||
public static GhostScriptOutputHandler stdOut(int batchIdx,
|
||||
InputStream is,
|
||||
Map<Integer, ImageFile> pagesToProcess,
|
||||
Consumer<ImageFile> imageFileOutput,
|
||||
Consumer<String> errorHandler) {
|
||||
public static GhostScriptOutputHandler stdOut(InputStream is, Map<Integer, ImageFile> pagesToProcess, Consumer<ImageFile> imageFileOutput, Consumer<String> errorHandler) {
|
||||
|
||||
return new GhostScriptOutputHandler(batchIdx, is, "GS", Type.STD_OUT, pagesToProcess, imageFileOutput, errorHandler, MDC.getCopyOfContextMap());
|
||||
return new GhostScriptOutputHandler(is, "GS", Type.STD_OUT, pagesToProcess, imageFileOutput, errorHandler);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void run() {
|
||||
|
||||
MDC.setContextMap(parentMdcContext);
|
||||
|
||||
try (InputStreamReader isr = new InputStreamReader(is); BufferedReader br = new BufferedReader(isr)) {
|
||||
|
||||
String line;
|
||||
@ -75,14 +63,13 @@ public class GhostScriptOutputHandler extends Thread {
|
||||
if (line == null) {
|
||||
break;
|
||||
}
|
||||
switch (type) {
|
||||
case STD_OUT -> {
|
||||
log.debug("Batch {}: {}_{}>{}", batchIdx, processName, type.name(), line);
|
||||
addProcessedImageToQueue(line);
|
||||
}
|
||||
case ERROR -> log.error("Batch {}: {}_{}>{}", batchIdx, processName, type.name(), line);
|
||||
}
|
||||
|
||||
if (type.equals(Type.ERROR)) {
|
||||
log.error("{}_{}>{}", processName, type.name(), line);
|
||||
} else {
|
||||
log.debug("{}_{}>{}", processName, type.name(), line);
|
||||
addProcessedImageToQueue(line);
|
||||
}
|
||||
}
|
||||
}
|
||||
is.close();
|
||||
@ -90,9 +77,7 @@ public class GhostScriptOutputHandler extends Thread {
|
||||
queueFinishedPage(currentPageNumber);
|
||||
|
||||
if (!pagesToProcess.isEmpty()) {
|
||||
errorHandler.accept(String.format("Ghostscript finished for batch %d, but pages %s remain unprocessed.", batchIdx, formatPagesToProcess()));
|
||||
} else {
|
||||
log.info("Batch {}: rendered successfully!", batchIdx);
|
||||
errorHandler.accept(String.format("Ghostscript finished for batch, but pages %s remain unprocessed.", formatPagesToProcess()));
|
||||
}
|
||||
}
|
||||
|
||||
@ -101,16 +86,10 @@ public class GhostScriptOutputHandler extends Thread {
|
||||
|
||||
private String formatPagesToProcess() {
|
||||
|
||||
List<String> intervals = formatIntervals(pagesToProcess.keySet()
|
||||
.stream()
|
||||
.sorted()
|
||||
.toList());
|
||||
if (intervals.size() > 4) {
|
||||
intervals = intervals.subList(0, 4);
|
||||
intervals.add("...");
|
||||
}
|
||||
|
||||
return String.join(", ", intervals);
|
||||
var pages = new PageBatch();
|
||||
pagesToProcess.keySet()
|
||||
.forEach(pages::add);
|
||||
return pages.toString();
|
||||
}
|
||||
|
||||
|
||||
@ -127,6 +106,7 @@ public class GhostScriptOutputHandler extends Thread {
|
||||
currentPageNumber = pageNumber;
|
||||
return;
|
||||
}
|
||||
|
||||
queueFinishedPage(currentPageNumber);
|
||||
currentPageNumber = pageNumber;
|
||||
}
|
||||
@ -137,10 +117,10 @@ public class GhostScriptOutputHandler extends Thread {
|
||||
|
||||
var imageFile = this.pagesToProcess.remove(pageNumber);
|
||||
if (imageFile == null) {
|
||||
errorHandler.accept(String.format("%d: Page number %d does not exist in this thread. It only has pagenumbers %s", batchIdx, pageNumber, pagesToProcess.keySet()));
|
||||
errorHandler.accept(String.format("Page number %d does not exist in this thread. It only has pagenumbers %s", pageNumber, pagesToProcess.keySet()));
|
||||
} else {
|
||||
if (!new File(imageFile.absoluteFilePath()).exists()) {
|
||||
errorHandler.accept(String.format("%d: Rendered page with number %d does not exist!", batchIdx, pageNumber));
|
||||
errorHandler.accept(String.format("Rendered page with number %d does not exist!", pageNumber));
|
||||
}
|
||||
}
|
||||
outputHandler.accept(imageFile);
|
||||
|
||||
@ -2,136 +2,155 @@ package com.knecon.fforesight.service.ocr.processor.service.imageprocessing;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.Semaphore;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.function.Consumer;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.OcrServiceSettings;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ImageFile;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.PageBatch;
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.ListSplittingUtils;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
@SuppressWarnings("PMD") // can't figure out how to safely close the stdOut and stdError streams in line 72/74
|
||||
@SuppressWarnings("PMD") // can't figure out how to safely close the stdOut and stdError streams in line 142/144
|
||||
public class GhostScriptService {
|
||||
|
||||
private OcrServiceSettings ocrServiceSettings;
|
||||
public static String FORMAT = ".tiff";
|
||||
public static final int BATCH_SIZE = 256;
|
||||
static String FORMAT = ".tiff";
|
||||
static String DEVICE = "tiffgray";
|
||||
static int DPI = 300;
|
||||
private Semaphore concurrencySemaphore = new Semaphore(3);
|
||||
static int PROCESS_COUNT = 1;
|
||||
|
||||
|
||||
public GhostScriptService(OcrServiceSettings ocrServiceSettings) {
|
||||
@SneakyThrows
|
||||
public void renderPagesBatched(List<Integer> pagesToProcess,
|
||||
String documentAbsolutePath,
|
||||
Path tmpImageDir,
|
||||
ImageProcessingSupervisor supervisor,
|
||||
Consumer<ImageFile> successHandler,
|
||||
Consumer<String> errorHandler) {
|
||||
|
||||
this.ocrServiceSettings = ocrServiceSettings;
|
||||
assertGhostscriptIsInstalled();
|
||||
}
|
||||
List<List<ProcessInfo>> processInfoBatches = buildSubListForEachProcess(pagesToProcess,
|
||||
PROCESS_COUNT,
|
||||
BATCH_SIZE
|
||||
* PROCESS_COUNT); // GS has a limit on how many pageIndices per call are possible, so we limit it to 256 pages per process
|
||||
for (int batchIdx = 0; batchIdx < processInfoBatches.size(); batchIdx++) {
|
||||
|
||||
supervisor.requireNoErrors();
|
||||
|
||||
private void assertGhostscriptIsInstalled() {
|
||||
List<ProcessInfo> processInfos = processInfoBatches.get(batchIdx);
|
||||
|
||||
try {
|
||||
Process p = Runtime.getRuntime().exec("gs -v");
|
||||
InputStream stdOut = p.getInputStream();
|
||||
InputStream errOut = p.getErrorStream();
|
||||
assert p.waitFor(1, TimeUnit.SECONDS);
|
||||
log.info("Ghostscript is installed.");
|
||||
String out = new String(stdOut.readAllBytes());
|
||||
String error = new String(errOut.readAllBytes());
|
||||
for (String line : out.split("\n")) {
|
||||
log.info(line);
|
||||
log.info("Batch {}: Running {} gs processes with ({}) pages each",
|
||||
batchIdx,
|
||||
processInfos.size(),
|
||||
processInfos.stream()
|
||||
.map(info -> info.pageNumbers().size())
|
||||
.map(String::valueOf)
|
||||
.collect(Collectors.joining(", ")));
|
||||
|
||||
int finalBatchIdx = batchIdx;
|
||||
List<Process> processes = processInfos.stream()
|
||||
.parallel()
|
||||
.map(info -> buildCmdArgs(info.processIdx(), finalBatchIdx, info.pageNumbers(), tmpImageDir, documentAbsolutePath))
|
||||
.peek(s -> log.debug(String.join(" ", s.cmdArgs())))
|
||||
.map(processInfo -> executeProcess(processInfo, successHandler, errorHandler))
|
||||
.toList();
|
||||
|
||||
List<Integer> processExitCodes = new LinkedList<>();
|
||||
for (Process process : processes) {
|
||||
processExitCodes.add(process.waitFor());
|
||||
}
|
||||
if (!error.isBlank()) {
|
||||
log.error(error);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.error("Ghostscript is not installed!");
|
||||
log.error(e.getMessage(), e);
|
||||
throw new RuntimeException(e);
|
||||
log.info("Batch {}: Ghostscript processes finished with exit codes {}", batchIdx, processExitCodes);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void startBatchRender(PageBatch batch, ImageProcessingSupervisor supervisor, Consumer<ImageFile> successHandler, Consumer<String> errorHandler) {
|
||||
private List<List<ProcessInfo>> buildSubListForEachProcess(List<Integer> stitchedPageNumbers, int processCount, int batchSize) {
|
||||
|
||||
supervisor.requireNoErrors();
|
||||
// GhostScript command line can only handle so many page numbers at once, so we split it into batches
|
||||
int batchCount = (int) Math.ceil((double) stitchedPageNumbers.size() / batchSize);
|
||||
|
||||
List<ImageFile> renderedImageFiles = batch.getRenderedImageFiles();
|
||||
if (ocrServiceSettings.isUseCaches() && renderedImageFiles.stream()
|
||||
.allMatch(ImageFile::exists)) {
|
||||
log.info("Batch {}: Using cached GhostScript rendering with page(s) {}", batch.getIndex(), batch);
|
||||
renderedImageFiles.forEach(successHandler);
|
||||
return;
|
||||
log.info("Splitting {} page renderings across {} process(es) in {} batch(es) with size {}", stitchedPageNumbers.size(), processCount, batchCount, batchSize);
|
||||
|
||||
List<List<ProcessInfo>> processInfoBatches = new ArrayList<>(batchCount);
|
||||
List<List<List<Integer>>> batchedBalancedSublist = ListSplittingUtils.buildBatchedBalancedSublist(stitchedPageNumbers.stream()
|
||||
.sorted()
|
||||
.toList(), processCount, batchCount);
|
||||
|
||||
for (var batch : batchedBalancedSublist) {
|
||||
List<ProcessInfo> processInfos = new ArrayList<>(processCount);
|
||||
for (int threadIdx = 0; threadIdx < batch.size(); threadIdx++) {
|
||||
List<Integer> balancedPageNumbersSubList = batch.get(threadIdx);
|
||||
processInfos.add(new ProcessInfo(threadIdx, balancedPageNumbersSubList));
|
||||
}
|
||||
processInfoBatches.add(processInfos);
|
||||
}
|
||||
|
||||
concurrencySemaphore.acquire();
|
||||
log.info("Batch {}: starting GhostScript rendering with page(s) {}", batch.getIndex(), batch);
|
||||
executeProcess(batch, buildCmdArgs(batch, batch.getBatchDoc()), successHandler, errorHandler);
|
||||
return processInfoBatches;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private ProcessCmdsAndRenderedImageFiles buildCmdArgs(PageBatch batch, Path document) {
|
||||
private ProcessCmdsAndRenderedImageFiles buildCmdArgs(Integer processIdx,
|
||||
Integer batchIdx,
|
||||
List<Integer> stitchedImagePageIndices,
|
||||
Path outputDir,
|
||||
String documentAbsolutePath) {
|
||||
|
||||
String imagePathFormat = outputDir.resolve("output_" + processIdx + "_" + batchIdx + ".%04d" + FORMAT).toFile().toString();
|
||||
|
||||
Map<Integer, ImageFile> fullPageImages = new HashMap<>();
|
||||
List<ImageFile> renderedImageFiles = batch.getRenderedImageFiles();
|
||||
for (int i = 1; i <= renderedImageFiles.size(); i++) {
|
||||
ImageFile renderedImageFile = renderedImageFiles.get(i - 1);
|
||||
fullPageImages.put(i, renderedImageFile);
|
||||
for (int i = 0; i < stitchedImagePageIndices.size(); i++) {
|
||||
Integer pageNumber = stitchedImagePageIndices.get(i);
|
||||
fullPageImages.put(pageNumber, new ImageFile(pageNumber, String.format(imagePathFormat, i + 1)));
|
||||
}
|
||||
|
||||
String[] cmdArgs = buildCmdArgs(document, batch.getRenderedImageNameFormat());
|
||||
String[] cmdArgs = buildCmdArgs(stitchedImagePageIndices, documentAbsolutePath, imagePathFormat);
|
||||
|
||||
return new ProcessCmdsAndRenderedImageFiles(cmdArgs, fullPageImages);
|
||||
}
|
||||
|
||||
|
||||
private String[] buildCmdArgs(Path document, String imagePathFormat) {
|
||||
private String[] buildCmdArgs(List<Integer> pageNumbers, String documentAbsolutePath, String imagePathFormat) {
|
||||
|
||||
return new String[]{"gs", "-dNOPAUSE", "-sDEVICE=" + DEVICE, "-r" + DPI, "-sOutputFile=" + imagePathFormat, document.toFile().toString(), "-c", "quit"};
|
||||
StringBuilder sPageList = new StringBuilder();
|
||||
int i = 1;
|
||||
for (Integer integer : pageNumbers) {
|
||||
sPageList.append(integer);
|
||||
if (i < pageNumbers.size()) {
|
||||
sPageList.append(",");
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
return new String[]{"gs", "-dNOPAUSE", "-sDEVICE=" + DEVICE, "-r" + DPI, "-sPageList=" + sPageList, "-sOutputFile=" + imagePathFormat, documentAbsolutePath, "-c", "quit"};
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void executeProcess(PageBatch batch, ProcessCmdsAndRenderedImageFiles processInfo, Consumer<ImageFile> successHandler, Consumer<String> errorHandler) {
|
||||
private Process executeProcess(ProcessCmdsAndRenderedImageFiles processInfo, Consumer<ImageFile> successHandler, Consumer<String> errorHandler) {
|
||||
|
||||
Process p = Runtime.getRuntime().exec(processInfo.cmdArgs());
|
||||
InputStream stdOut = p.getInputStream();
|
||||
GhostScriptOutputHandler stdOutLogger = GhostScriptOutputHandler.stdOut(batch.getIndex(), stdOut, processInfo.renderedPageImageFiles(), successHandler, errorHandler);
|
||||
GhostScriptOutputHandler stdOutLogger = GhostScriptOutputHandler.stdOut(stdOut, processInfo.renderedPageImageFiles(), successHandler, errorHandler);
|
||||
InputStream stdError = p.getErrorStream();
|
||||
GhostScriptOutputHandler stdErrorLogger = GhostScriptOutputHandler.stdError(batch.getIndex(), stdError, errorHandler);
|
||||
GhostScriptOutputHandler stdErrorLogger = GhostScriptOutputHandler.stdError(stdError, errorHandler);
|
||||
|
||||
stdOutLogger.start();
|
||||
stdErrorLogger.start();
|
||||
handleFinished(p, errorHandler, batch, successHandler);
|
||||
}
|
||||
|
||||
|
||||
private void handleFinished(Process p, Consumer<String> errorHandler, PageBatch batch, Consumer<ImageFile> successHandler) {
|
||||
|
||||
Thread finishedThread = new Thread(() -> {
|
||||
try {
|
||||
p.waitFor(2, TimeUnit.MINUTES);
|
||||
} catch (InterruptedException e) {
|
||||
errorHandler.accept("Batch %d: Ghostscript rendering has been terminated after 2 minutes \n %s".formatted(batch.getIndex(), e.getMessage()));
|
||||
} finally {
|
||||
concurrencySemaphore.release();
|
||||
}
|
||||
});
|
||||
finishedThread.start();
|
||||
return p;
|
||||
}
|
||||
|
||||
|
||||
@ -139,4 +158,8 @@ public class GhostScriptService {
|
||||
|
||||
}
|
||||
|
||||
private record ProcessInfo(Integer processIdx, List<Integer> pageNumbers) {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,12 +1,15 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service.imageprocessing;
|
||||
|
||||
import java.io.File;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ImageFile;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.PageBatch;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
@ -23,16 +26,24 @@ public class ImageProcessingPipeline {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public ImageProcessingSupervisor addToPipeline(PageBatch batch) {
|
||||
public ImageProcessingSupervisor run(Set<Integer> pageNumberSet, Path imageDir, File document) {
|
||||
|
||||
List<Integer> pageNumbers = batch.getAllPageNumbers();
|
||||
Path processedImageDir = imageDir.resolve("processed");
|
||||
Path renderedImageDir = imageDir.resolve("rendered");
|
||||
|
||||
Files.createDirectories(renderedImageDir);
|
||||
Files.createDirectories(processedImageDir);
|
||||
|
||||
List<Integer> pageNumbers = pageNumberSet.stream()
|
||||
.sorted()
|
||||
.toList();
|
||||
|
||||
ImageProcessingSupervisor supervisor = new ImageProcessingSupervisor(pageNumbers);
|
||||
|
||||
Consumer<ImageFile> renderingSuccessConsumer = imageFile -> imageProcessingService.addToProcessingQueue(imageFile, batch.getProcessedImageDir(), supervisor);
|
||||
Consumer<ImageFile> renderingSuccessConsumer = imageFile -> imageProcessingService.addToProcessingQueue(imageFile, processedImageDir, supervisor);
|
||||
Consumer<String> renderingErrorConsumer = supervisor::markError;
|
||||
|
||||
ghostScriptService.startBatchRender(batch, supervisor, renderingSuccessConsumer, renderingErrorConsumer);
|
||||
ghostScriptService.renderPagesBatched(pageNumbers, document.toString(), renderedImageDir, supervisor, renderingSuccessConsumer, renderingErrorConsumer);
|
||||
|
||||
return supervisor;
|
||||
}
|
||||
|
||||
@ -1,13 +1,11 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service.imageprocessing;
|
||||
|
||||
import java.io.File;
|
||||
import java.nio.file.Path;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.OcrServiceSettings;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ImageFile;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
@ -25,10 +23,9 @@ import net.sourceforge.lept4j.util.LeptUtils;
|
||||
public class ImageProcessingService {
|
||||
|
||||
BlockingQueue<ProcessParams> queue = new LinkedBlockingQueue<>();
|
||||
private final OcrServiceSettings ocrServiceSettings;
|
||||
|
||||
|
||||
public ImageProcessingService(OcrServiceSettings ocrServiceSettings) {
|
||||
public ImageProcessingService() {
|
||||
|
||||
Thread queueConsumerThread = new Thread(() -> {
|
||||
while (true) {
|
||||
@ -41,13 +38,12 @@ public class ImageProcessingService {
|
||||
try {
|
||||
process(processParams.unprocessedImage(), processParams.outputDir, processParams.supervisor());
|
||||
} catch (Exception e) {
|
||||
processParams.supervisor.markPageFinished(processParams.unprocessedImage());
|
||||
log.error(e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
});
|
||||
queueConsumerThread.start();
|
||||
this.ocrServiceSettings = ocrServiceSettings;
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -58,43 +54,31 @@ public class ImageProcessingService {
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void process(ImageFile unprocessedImage, Path outputDir, ImageProcessingSupervisor supervisor) {
|
||||
|
||||
String absoluteFilePath = outputDir.resolve(Path.of(unprocessedImage.absoluteFilePath()).getFileName()).toFile().toString();
|
||||
ImageFile processedImage = new ImageFile(unprocessedImage.pageNumber(), absoluteFilePath);
|
||||
supervisor.requireNoErrors();
|
||||
|
||||
if (ocrServiceSettings.isUseCaches() && processedImage.exists()) {
|
||||
supervisor.markPageFinished(processedImage);
|
||||
return;
|
||||
}
|
||||
synchronized (ImageProcessingSupervisor.class) {
|
||||
// Leptonica is not thread safe, but is being called in WritableOcrResultFactory as well
|
||||
Pix processedPix;
|
||||
Pix pix = unprocessedImage.readPix();
|
||||
|
||||
try {
|
||||
if (!unprocessedImage.exists()) {
|
||||
log.error("ERROR, rendered image {} does not exist", unprocessedImage.absoluteFilePath());
|
||||
throw new AssertionError();
|
||||
}
|
||||
synchronized (ImageProcessingSupervisor.class) {
|
||||
// Leptonica is not thread safe, but is being called in WritableOcrResultFactory as well
|
||||
Pix processedPix;
|
||||
Pix pix = unprocessedImage.readPix();
|
||||
String absoluteFilePath = outputDir.resolve(Path.of(unprocessedImage.absoluteFilePath()).getFileName()).toFile().toString();
|
||||
|
||||
assert pix != null;
|
||||
processedPix = processPix(pix);
|
||||
Leptonica1.pixWrite(absoluteFilePath, processedPix, ILeptonica.IFF_TIFF_PACKBITS);
|
||||
|
||||
processedPix = processPix(pix);
|
||||
Leptonica1.pixWrite(processedImage.absoluteFilePath(), processedPix, ILeptonica.IFF_TIFF_PACKBITS);
|
||||
LeptUtils.disposePix(pix);
|
||||
LeptUtils.disposePix(processedPix);
|
||||
|
||||
LeptUtils.disposePix(pix);
|
||||
LeptUtils.disposePix(processedPix);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
supervisor.markError("Page %d could not be processed due to: %s".formatted(unprocessedImage.pageNumber(), e.getMessage()));
|
||||
} finally {
|
||||
supervisor.markPageFinished(processedImage);
|
||||
log.debug("Finished page: {}", processedImage.pageNumber());
|
||||
ImageFile imageFile = new ImageFile(unprocessedImage.pageNumber(), absoluteFilePath);
|
||||
supervisor.markPageFinished(imageFile);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private Pix processPix(Pix pix) {
|
||||
|
||||
Pix binarized;
|
||||
|
||||
@ -53,7 +53,7 @@ public class ImageProcessingSupervisor {
|
||||
|
||||
public ImageFile awaitProcessedPage(Integer pageNumber) throws InterruptedException {
|
||||
|
||||
if (hasErrors()) {
|
||||
if (hasErros()) {
|
||||
return null;
|
||||
}
|
||||
getPageLatch(pageNumber).await();
|
||||
@ -61,15 +61,14 @@ public class ImageProcessingSupervisor {
|
||||
}
|
||||
|
||||
|
||||
private boolean hasErrors() {
|
||||
private boolean hasErros() {
|
||||
|
||||
return !errors.isEmpty();
|
||||
return errors.isEmpty();
|
||||
}
|
||||
|
||||
|
||||
public void markError(String errorMessage) {
|
||||
|
||||
log.error(errorMessage);
|
||||
this.errors.add(errorMessage);
|
||||
}
|
||||
|
||||
@ -87,7 +86,7 @@ public class ImageProcessingSupervisor {
|
||||
if (this.errors.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
throw new IllegalStateException("Error(s) occurred during image processing: " + String.join("\n", errors.subList(0, Math.min(errors.size(), 3))));
|
||||
throw new IllegalStateException("Error(s) occurred during image processing: " + String.join("\n", errors));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -3,9 +3,13 @@ package com.knecon.fforesight.service.ocr.processor.service.imageprocessing;
|
||||
import static net.sourceforge.lept4j.ILeptonica.L_THIN_FG;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.nio.IntBuffer;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
@ -15,7 +19,6 @@ import net.sourceforge.lept4j.util.LeptUtils;
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class StrokeWidthCalculator implements Closeable {
|
||||
|
||||
public static final double MINIMUM_PIXEL_DENSITY = 0.05;
|
||||
Sela thinningSel = Leptonica1.selaMakeThinSets(1, 0);
|
||||
|
||||
|
||||
@ -43,14 +46,6 @@ public class StrokeWidthCalculator implements Closeable {
|
||||
}
|
||||
|
||||
|
||||
public static boolean wordImageHasMinimumPixelDensity(Pix wordImage) {
|
||||
|
||||
IntBuffer pixelCount = IntBuffer.allocate(1);
|
||||
Leptonica1.pixCountPixels(wordImage, pixelCount, null);
|
||||
return (double) pixelCount.get(0) / (wordImage.w * wordImage.h) >= MINIMUM_PIXEL_DENSITY;
|
||||
}
|
||||
|
||||
|
||||
public boolean hasLargerStrokeWidth(Pix pix, double strokeWidth, double threshold) {
|
||||
|
||||
int roundedStrokeWidth = (int) Math.round(strokeWidth);
|
||||
|
||||
@ -1,26 +1,25 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.utils;
|
||||
|
||||
import java.util.Locale;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public final class OsUtils {
|
||||
|
||||
private static final String SERVICE_NAME = "azure-ocr-service";
|
||||
|
||||
|
||||
private static boolean isWindows() {
|
||||
|
||||
String osName = System.getProperty("os.name");
|
||||
if (osName == null) {
|
||||
return false;
|
||||
}
|
||||
return osName.toLowerCase(Locale.ENGLISH).contains("windows");
|
||||
return StringUtils.containsIgnoreCase(System.getProperty("os.name"), "Windows");
|
||||
}
|
||||
|
||||
|
||||
public static String getTemporaryDirectory() {
|
||||
|
||||
String tmpdir = System.getProperty("java.io.tmpdir");
|
||||
if (isWindows() && !tmpdir.isBlank()) {
|
||||
if (isWindows() && StringUtils.isNotBlank(tmpdir)) {
|
||||
return tmpdir;
|
||||
}
|
||||
return "/tmp";
|
||||
|
||||
@ -1,40 +0,0 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.utils;
|
||||
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class StringCleaningUtility {
|
||||
|
||||
public static final Pattern hyphenLineBreaks = Pattern.compile("[-~‐‒⁻−﹣゠⁓‑\\u00AD][\\r\\n]+");
|
||||
public static final Pattern linebreaks = Pattern.compile("[\\r\\n]+");
|
||||
public static final Pattern doubleWhitespaces = Pattern.compile("\\s{2,}");
|
||||
|
||||
|
||||
public static String cleanString(String value) {
|
||||
|
||||
String noHyphenLinebreaks = removeHyphenLinebreaks(value);
|
||||
String noLinebreaks = removeLinebreaks(noHyphenLinebreaks);
|
||||
return removeMultipleWhitespaces(noLinebreaks);
|
||||
}
|
||||
|
||||
|
||||
private String removeHyphenLinebreaks(String value) {
|
||||
|
||||
return hyphenLineBreaks.matcher(value).replaceAll("");
|
||||
}
|
||||
|
||||
|
||||
private String removeMultipleWhitespaces(String value) {
|
||||
|
||||
return doubleWhitespaces.matcher(value).replaceAll(" ");
|
||||
}
|
||||
|
||||
|
||||
private String removeLinebreaks(String value) {
|
||||
|
||||
return linebreaks.matcher(value).replaceAll(" ");
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,14 +1,14 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.visualizations;
|
||||
|
||||
import com.azure.ai.documentintelligence.models.AnalyzeResult;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.AzureAnalyzeResult;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class AnalyzeResultMapper {
|
||||
|
||||
public IdpResult map(AnalyzeResult analyzeResult) {
|
||||
public AzureAnalyzeResult map(AnalyzeResult analyzeResult) {
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
@ -1,23 +0,0 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.visualizations;
|
||||
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.azure.ai.documentintelligence.models.DocumentSpan;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.SpanLookup;
|
||||
|
||||
public class FontStyler {
|
||||
|
||||
|
||||
public record Lookups(SpanLookup<DocumentSpan> bold, SpanLookup<DocumentSpan> italic, SpanLookup<DocumentSpan> handwritten) {
|
||||
|
||||
public static Lookups empty() {
|
||||
|
||||
return new Lookups(new SpanLookup<>(Stream.empty(), Function.identity()),
|
||||
new SpanLookup<>(Stream.empty(), Function.identity()),
|
||||
new SpanLookup<>(Stream.empty(), Function.identity()));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -20,7 +20,6 @@ import lombok.experimental.FieldDefaults;
|
||||
public final class WritableOcrResult {
|
||||
|
||||
int pageNumber;
|
||||
double angle;
|
||||
@Builder.Default
|
||||
List<TextPositionInImage> textPositionInImage = Collections.emptyList();
|
||||
@Builder.Default
|
||||
|
||||
@ -0,0 +1,367 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.visualizations;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.nio.IntBuffer;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.azure.ai.documentintelligence.models.AnalyzeResult;
|
||||
import com.azure.ai.documentintelligence.models.BoundingRegion;
|
||||
import com.azure.ai.documentintelligence.models.DocumentPage;
|
||||
import com.azure.ai.documentintelligence.models.DocumentSpan;
|
||||
import com.azure.ai.documentintelligence.models.DocumentStyle;
|
||||
import com.azure.ai.documentintelligence.models.DocumentTable;
|
||||
import com.azure.ai.documentintelligence.models.DocumentTableCell;
|
||||
import com.azure.ai.documentintelligence.models.DocumentWord;
|
||||
import com.azure.ai.documentintelligence.models.FontWeight;
|
||||
import com.google.common.base.Functions;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ImageFile;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.PageInformation;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.PageBatch;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.SpanLookup;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.OcrServiceSettings;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.FontStyleDetector;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.ImageProcessingSupervisor;
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.fonts.FontMetricsProvider;
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.fonts.FontStyle;
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.fonts.Type0FontMetricsProvider;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.QuadPoint;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import net.sourceforge.lept4j.Box;
|
||||
import net.sourceforge.lept4j.Leptonica1;
|
||||
import net.sourceforge.lept4j.Pix;
|
||||
import net.sourceforge.lept4j.util.LeptUtils;
|
||||
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class WritableOcrResultFactory {
|
||||
|
||||
FontMetricsProvider regularFont = Type0FontMetricsProvider.REGULAR_INSTANCE;
|
||||
FontMetricsProvider boldFont = Type0FontMetricsProvider.BOLD_INSTANCE;
|
||||
FontMetricsProvider italicFont = Type0FontMetricsProvider.ITALIC_INSTANCE;
|
||||
FontMetricsProvider boldItalicFont = Type0FontMetricsProvider.BOLD_ITALIC_INSTANCE;
|
||||
|
||||
@Getter
|
||||
Map<Integer, AffineTransform> pageCtms;
|
||||
Map<Integer, PageInformation> pageInformation;
|
||||
OcrServiceSettings settings;
|
||||
ImageProcessingSupervisor imageSupervisor;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public WritableOcrResultFactory(Map<Integer, PageInformation> pageInformation, OcrServiceSettings settings, ImageProcessingSupervisor imageSupervisor) {
|
||||
|
||||
this.pageInformation = pageInformation;
|
||||
pageCtms = Collections.synchronizedMap(new HashMap<>());
|
||||
this.settings = settings;
|
||||
this.imageSupervisor = imageSupervisor;
|
||||
}
|
||||
|
||||
|
||||
public List<WritableOcrResult> buildOcrResultToWrite(AnalyzeResult analyzeResult, PageBatch pageOffset) throws InterruptedException {
|
||||
|
||||
List<WritableOcrResult> writableOcrResultList = new ArrayList<>();
|
||||
|
||||
Lookups lookups = getLookups(analyzeResult);
|
||||
|
||||
for (DocumentPage resultPage : analyzeResult.getPages()) {
|
||||
|
||||
PageInformation pageInformation = getPageInformation(getPageNumber(pageOffset, resultPage));
|
||||
AffineTransform pageCtm = getPageCTM(pageInformation, resultPage.getWidth());
|
||||
pageCtms.put(getPageNumber(pageOffset, resultPage), pageCtm);
|
||||
|
||||
List<TextPositionInImage> words = buildTextPositionsInImage(pageOffset, resultPage, pageCtm, lookups, pageInformation);
|
||||
|
||||
var builder = WritableOcrResult.builder().pageNumber(pageInformation.number()).textPositionInImage(words);
|
||||
|
||||
if (settings.isTableDetection()) {
|
||||
builder.tableLines(getTableLines(analyzeResult, pageInformation, pageCtm));
|
||||
}
|
||||
|
||||
writableOcrResultList.add(builder.build());
|
||||
|
||||
}
|
||||
return writableOcrResultList;
|
||||
}
|
||||
|
||||
|
||||
private List<TextPositionInImage> buildTextPositionsInImage(PageBatch pageOffset,
|
||||
DocumentPage resultPage,
|
||||
AffineTransform pageCtm,
|
||||
Lookups lookups,
|
||||
PageInformation pageInformation) throws InterruptedException {
|
||||
|
||||
if (!settings.isFontStyleDetection()) {
|
||||
return buildText(resultPage, pageCtm, lookups, pageInformation);
|
||||
}
|
||||
|
||||
ImageFile imageFile = imageSupervisor.awaitProcessedPage(getPageNumber(pageOffset, resultPage));
|
||||
|
||||
if (imageFile == null) {
|
||||
return buildText(resultPage, pageCtm, lookups, pageInformation);
|
||||
}
|
||||
|
||||
synchronized (ImageProcessingSupervisor.class) {
|
||||
return buildTextWithBoldDetection(resultPage, pageCtm, pageInformation, imageFile);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static List<TextPositionInImage> buildTextWithBoldDetection(DocumentPage resultPage, AffineTransform pageCtm, PageInformation pageInformation, ImageFile imageFile) {
|
||||
// Leptonica is not thread safe, but is being called in ImageProcessingService as well
|
||||
Pix pageImage = imageFile.readPix();
|
||||
List<TextPositionInImage> words = new ArrayList<>();
|
||||
|
||||
try (FontStyleDetector fontStyleDetector = new FontStyleDetector()) {
|
||||
|
||||
AffineTransform imageTransform = new AffineTransform();
|
||||
double scalingFactor = pageImage.w / resultPage.getWidth();
|
||||
imageTransform.scale(scalingFactor, scalingFactor);
|
||||
|
||||
for (DocumentWord word : resultPage.getWords()) {
|
||||
|
||||
TextPositionInImage textPosition = new TextPositionInImage(word, pageCtm, Type0FontMetricsProvider.REGULAR_INSTANCE, FontStyle.REGULAR);
|
||||
|
||||
if (intersectsIgnoreZone(pageInformation.wordBBoxes(), textPosition)) {
|
||||
textPosition.setOverlapsIgnoreZone(true);
|
||||
}
|
||||
|
||||
Pix wordImage = extractWordImage(word, imageTransform, pageImage);
|
||||
|
||||
IntBuffer pixelCount = IntBuffer.allocate(1);
|
||||
Leptonica1.pixCountPixels(wordImage, pixelCount, null);
|
||||
|
||||
if (pixelCount.get(0) > 3) {
|
||||
fontStyleDetector.add(textPosition, wordImage, textPosition.getFontSizeByHeight());
|
||||
}
|
||||
|
||||
words.add(textPosition);
|
||||
}
|
||||
|
||||
fontStyleDetector.classifyWords();
|
||||
|
||||
} finally {
|
||||
LeptUtils.disposePix(pageImage);
|
||||
}
|
||||
|
||||
return words;
|
||||
}
|
||||
|
||||
|
||||
private static Pix extractWordImage(DocumentWord word, AffineTransform imageTransform, Pix pageImage) {
|
||||
|
||||
Rectangle2D wordBBox = QuadPoint.fromPolygons(word.getPolygon()).getTransformed(imageTransform).getBounds2D();
|
||||
Box box = new Box((int) wordBBox.getX(), (int) wordBBox.getY(), (int) wordBBox.getWidth(), (int) wordBBox.getHeight(), 1);
|
||||
Pix wordImage = Leptonica1.pixClipRectangle(pageImage, box, null);
|
||||
box.clear();
|
||||
return wordImage;
|
||||
}
|
||||
|
||||
|
||||
private List<TextPositionInImage> buildText(DocumentPage resultPage, AffineTransform pageCtm, Lookups lookups, PageInformation pageInformation) {
|
||||
|
||||
return resultPage.getWords()
|
||||
.stream()
|
||||
.map(word -> buildTextPositionInImage(word, pageCtm, lookups))
|
||||
.map(textPositionInImage -> markTextOverlappingIgnoreZone(textPositionInImage, pageInformation.wordBBoxes()))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
private static int getPageNumber(PageBatch pageOffset, DocumentPage resultPage) {
|
||||
|
||||
return pageOffset.getPageNumber(resultPage.getPageNumber());
|
||||
}
|
||||
|
||||
|
||||
private static Lookups getLookups(AnalyzeResult analyzeResult) {
|
||||
|
||||
if (analyzeResult.getStyles() == null || analyzeResult.getStyles().isEmpty()) {
|
||||
return Lookups.empty();
|
||||
}
|
||||
|
||||
SpanLookup<DocumentSpan> boldLookup = new SpanLookup<>(analyzeResult.getStyles()
|
||||
.stream()
|
||||
.filter(style -> Objects.equals(style.getFontWeight(), FontWeight.BOLD))
|
||||
.map(DocumentStyle::getSpans)
|
||||
.flatMap(Collection::stream), Function.identity());
|
||||
|
||||
SpanLookup<DocumentSpan> italicLookup = new SpanLookup<>(analyzeResult.getStyles()
|
||||
.stream()
|
||||
.filter(style -> Objects.equals(style.getFontStyle(),
|
||||
com.azure.ai.documentintelligence.models.FontStyle.ITALIC))
|
||||
.map(DocumentStyle::getSpans)
|
||||
.flatMap(Collection::stream), Functions.identity());
|
||||
|
||||
SpanLookup<DocumentSpan> handWrittenLookup = new SpanLookup<>(analyzeResult.getStyles()
|
||||
.stream()
|
||||
.filter(documentStyle -> documentStyle.isHandwritten() != null && documentStyle.isHandwritten())
|
||||
.map(DocumentStyle::getSpans)
|
||||
.flatMap(Collection::stream), Functions.identity());
|
||||
|
||||
return new Lookups(boldLookup, italicLookup, handWrittenLookup);
|
||||
|
||||
}
|
||||
|
||||
|
||||
private TextPositionInImage buildTextPositionInImage(DocumentWord dw, AffineTransform imageCTM, Lookups lookups) {
|
||||
|
||||
boolean bold = lookups.bold().containedInAnySpan(dw.getSpan());
|
||||
boolean italic = lookups.italic().containedInAnySpan(dw.getSpan());
|
||||
boolean handwritten = lookups.handwritten().containedInAnySpan(dw.getSpan());
|
||||
|
||||
FontStyle fontStyle;
|
||||
FontMetricsProvider font;
|
||||
if (handwritten) {
|
||||
fontStyle = FontStyle.HANDWRITTEN;
|
||||
font = regularFont;
|
||||
} else if (italic && bold) {
|
||||
fontStyle = FontStyle.BOLD_ITALIC;
|
||||
font = boldItalicFont;
|
||||
} else if (bold) {
|
||||
fontStyle = FontStyle.BOLD;
|
||||
font = boldFont;
|
||||
} else if (italic) {
|
||||
fontStyle = FontStyle.ITALIC;
|
||||
font = italicFont;
|
||||
} else {
|
||||
fontStyle = FontStyle.REGULAR;
|
||||
font = regularFont;
|
||||
}
|
||||
|
||||
return new TextPositionInImage(dw, imageCTM, font, fontStyle);
|
||||
}
|
||||
|
||||
|
||||
private static List<Line2D> getTableLines(AnalyzeResult analyzeResult, PageInformation pageInformation, AffineTransform imageCTM) {
|
||||
|
||||
if (analyzeResult.getTables() == null || analyzeResult.getTables().isEmpty()) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
return analyzeResult.getTables()
|
||||
.stream()
|
||||
.map(DocumentTable::getCells)
|
||||
.flatMap(Collection::stream)
|
||||
.map(DocumentTableCell::getBoundingRegions)
|
||||
.flatMap(Collection::stream)
|
||||
.filter(table -> table.getPageNumber() == pageInformation.number())
|
||||
.map(BoundingRegion::getPolygon)
|
||||
.map(QuadPoint::fromPolygons)
|
||||
.map(qp -> qp.getTransformed(imageCTM))
|
||||
.flatMap(QuadPoint::asLines)
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
private static TextPositionInImage markTextOverlappingIgnoreZone(TextPositionInImage textPositionInImage, List<Rectangle2D> ignoreZones) {
|
||||
|
||||
if (intersectsIgnoreZone(ignoreZones, textPositionInImage)) {
|
||||
textPositionInImage.setOverlapsIgnoreZone(true);
|
||||
}
|
||||
|
||||
return textPositionInImage;
|
||||
}
|
||||
|
||||
|
||||
private static boolean intersectsIgnoreZone(List<Rectangle2D> ignoreZones, TextPositionInImage textPositionInImage) {
|
||||
|
||||
for (Rectangle2D ignoreZone : ignoreZones) {
|
||||
Rectangle2D textBBox = textPositionInImage.getTransformedTextBBox().getBounds2D();
|
||||
if (textBBox.intersects(ignoreZone)) {
|
||||
double intersectedArea = calculateIntersectedArea(textBBox, ignoreZone);
|
||||
double textArea = textBBox.getWidth() * textBBox.getHeight();
|
||||
if (intersectedArea / textArea > 0.5) {
|
||||
return true;
|
||||
}
|
||||
double ignoreZoneArea = ignoreZone.getWidth() * ignoreZone.getHeight();
|
||||
if (intersectedArea / ignoreZoneArea > 0.5) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
public static double calculateIntersectedArea(Rectangle2D r1, Rectangle2D r2) {
|
||||
|
||||
double xOverlap = Math.max(0, Math.min(r1.getMaxX(), r2.getMaxX()) - Math.max(r1.getMinX(), r2.getMinX()));
|
||||
double yOverlap = Math.max(0, Math.min(r1.getMaxY(), r2.getMaxY()) - Math.max(r1.getY(), r2.getY()));
|
||||
|
||||
return xOverlap * yOverlap;
|
||||
}
|
||||
|
||||
|
||||
public static AffineTransform getPageCTM(PageInformation pageInformation, double imageWidth) {
|
||||
|
||||
double scalingFactor = calculateScalingFactor(imageWidth, pageInformation);
|
||||
AffineTransform imageToCropBoxScaling = new AffineTransform(scalingFactor, 0, 0, scalingFactor, 0, 0);
|
||||
|
||||
AffineTransform mirrorMatrix = new AffineTransform(1, 0, 0, -1, 0, pageInformation.height());
|
||||
|
||||
AffineTransform rotationMatrix = switch (pageInformation.rotationDegrees()) {
|
||||
case 90 -> new AffineTransform(0, 1, -1, 0, pageInformation.height(), 0);
|
||||
case 180 -> new AffineTransform(-1, 0, 0, -1, pageInformation.width(), pageInformation.height());
|
||||
case 270 -> new AffineTransform(0, -1, 1, 0, pageInformation.width() - pageInformation.height(), pageInformation.height());
|
||||
default -> new AffineTransform();
|
||||
};
|
||||
|
||||
// matrix multiplication is performed from right to left, so the order is reversed.
|
||||
// scaling -> mirror -> rotation
|
||||
AffineTransform resultMatrix = new AffineTransform();
|
||||
|
||||
resultMatrix.concatenate(rotationMatrix);
|
||||
resultMatrix.concatenate(mirrorMatrix);
|
||||
resultMatrix.concatenate(imageToCropBoxScaling);
|
||||
return resultMatrix;
|
||||
}
|
||||
|
||||
|
||||
private static double calculateScalingFactor(double width, PageInformation pageInformation) {
|
||||
|
||||
// PDFBox always returns page height and width based on rotation
|
||||
double pageWidth;
|
||||
if (pageInformation.rotationDegrees() == 90 || pageInformation.rotationDegrees() == 270) {
|
||||
pageWidth = pageInformation.height();
|
||||
} else {
|
||||
pageWidth = pageInformation.width();
|
||||
}
|
||||
|
||||
return pageWidth / width;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private PageInformation getPageInformation(Integer pageNumber) {
|
||||
|
||||
return pageInformation.get(pageNumber);
|
||||
}
|
||||
|
||||
|
||||
private record Lookups(SpanLookup<DocumentSpan> bold, SpanLookup<DocumentSpan> italic, SpanLookup<DocumentSpan> handwritten) {
|
||||
|
||||
public static Lookups empty() {
|
||||
|
||||
return new Lookups(new SpanLookup<>(Stream.empty(), Function.identity()),
|
||||
new SpanLookup<>(Stream.empty(), Function.identity()),
|
||||
new SpanLookup<>(Stream.empty(), Function.identity()));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -8,7 +8,7 @@ import lombok.SneakyThrows;
|
||||
|
||||
public interface FontMetricsProvider extends EmbeddableFont {
|
||||
|
||||
default FontMetrics calculateMetricsForAzureBBox(String text, double textWidth, double textHeight) {
|
||||
default FontMetrics calculateMetrics(String text, double textWidth, double textHeight) {
|
||||
|
||||
HeightAndDescent heightAndDescent = calculateHeightAndDescent(text);
|
||||
float fontSize = calculateFontSize(text, textWidth);
|
||||
@ -18,16 +18,6 @@ public interface FontMetricsProvider extends EmbeddableFont {
|
||||
}
|
||||
|
||||
|
||||
default FontMetrics calculateMetricsForTightBBox(String text, double textWidth, double textHeight) {
|
||||
|
||||
HeightAndDescent heightAndDescent = calculateHeightAndDescent(text);
|
||||
float fontSize = calculateFontSize(text, textWidth);
|
||||
float heightScaling = (float) ((textHeight / (heightAndDescent.height() - heightAndDescent.descent())) * 1000) / fontSize;
|
||||
|
||||
return new FontMetrics((heightAndDescent.descent() / 1000) * fontSize, fontSize, heightScaling);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
default float calculateFontSize(String text, double textWidth) {
|
||||
|
||||
|
||||
@ -12,6 +12,8 @@ import com.azure.ai.documentintelligence.models.DocumentBarcode;
|
||||
import com.azure.ai.documentintelligence.models.DocumentFigure;
|
||||
import com.azure.ai.documentintelligence.models.DocumentKeyValuePair;
|
||||
import com.azure.ai.documentintelligence.models.DocumentLine;
|
||||
import com.azure.ai.documentintelligence.models.DocumentList;
|
||||
import com.azure.ai.documentintelligence.models.DocumentListItem;
|
||||
import com.azure.ai.documentintelligence.models.DocumentParagraph;
|
||||
import com.azure.ai.documentintelligence.models.DocumentSection;
|
||||
import com.azure.ai.documentintelligence.models.DocumentTable;
|
||||
@ -21,8 +23,8 @@ import com.azure.ai.documentintelligence.models.DocumentWord;
|
||||
import com.azure.ai.documentintelligence.models.ParagraphRole;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.PageBatch;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.SpanLookup;
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.utils.LineUtils;
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.utils.Rectangle2DBBoxCollector;
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.utils.LineUtils;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.QuadPoint;
|
||||
import com.knecon.fforesight.service.viewerdoc.layers.IdpLayerConfig;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
|
||||
@ -40,7 +42,7 @@ import lombok.experimental.FieldDefaults;
|
||||
public class IdpLayer extends IdpLayerConfig {
|
||||
|
||||
public static final int LINE_WIDTH = 1;
|
||||
private Map<Integer, AffineTransform> resultToPageTransform;
|
||||
private Map<Integer, AffineTransform> pageCtms;
|
||||
|
||||
|
||||
public void addSection(int pageNumber, DocumentSection section, SpanLookup<DocumentWord> wordsOnPage) {
|
||||
@ -63,7 +65,15 @@ public class IdpLayer extends IdpLayerConfig {
|
||||
|
||||
var sectionsOnPage = getOrCreateVisualizationsOnPage(pageNumber, vis);
|
||||
|
||||
sectionsOnPage.getColoredRectangles().add(new ColoredRectangle(bbox.getTransformed(resultToPageTransform.get(pageNumber)).getBounds2D(), color, LINE_WIDTH));
|
||||
sectionsOnPage.getColoredRectangles().add(new ColoredRectangle(bbox.getTransformed(pageCtms.get(pageNumber)).getBounds2D(), color, LINE_WIDTH));
|
||||
}
|
||||
|
||||
|
||||
public void addList(DocumentList list, PageBatch pageOffset) {
|
||||
|
||||
for (DocumentListItem item : list.getItems()) {
|
||||
addBoundingRegion(item.getBoundingRegions(), lists, PARAGRAPH_COLOR, pageOffset);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -75,27 +85,19 @@ public class IdpLayer extends IdpLayerConfig {
|
||||
|
||||
public void addKeyValue(DocumentKeyValuePair keyValue, PageBatch pageOffset) {
|
||||
|
||||
if (keyValue.getKey() == null || keyValue.getKey().getContent().isEmpty()) {
|
||||
return;
|
||||
}
|
||||
addBoundingRegion(keyValue.getKey().getBoundingRegions(), keyValuePairs, KEY_COLOR, pageOffset);
|
||||
if (keyValue.getValue() != null && !keyValue.getValue().getContent().isEmpty()) {
|
||||
if (keyValue.getValue() != null) {
|
||||
addBoundingRegion(keyValue.getValue().getBoundingRegions(), keyValuePairs, VALUE_COLOR, pageOffset);
|
||||
|
||||
if (keyValue.getKey().getBoundingRegions()
|
||||
.get(0).getPageNumber() != keyValue.getValue().getBoundingRegions()
|
||||
.get(0).getPageNumber()) {
|
||||
if (keyValue.getKey().getBoundingRegions().get(0).getPageNumber() != keyValue.getValue().getBoundingRegions().get(0).getPageNumber()) {
|
||||
return;
|
||||
}
|
||||
int pageNumberWithOffset = pageOffset.getPageNumber(keyValue.getKey().getBoundingRegions()
|
||||
.get(0).getPageNumber());
|
||||
QuadPoint key = QuadPoint.fromPolygons(keyValue.getKey().getBoundingRegions()
|
||||
.get(0).getPolygon());
|
||||
QuadPoint value = QuadPoint.fromPolygons(keyValue.getValue().getBoundingRegions()
|
||||
.get(0).getPolygon());
|
||||
int pageNumberWithOffset = pageOffset.getPageNumber(keyValue.getKey().getBoundingRegions().get(0).getPageNumber());
|
||||
QuadPoint key = QuadPoint.fromPolygons(keyValue.getKey().getBoundingRegions().get(0).getPolygon());
|
||||
QuadPoint value = QuadPoint.fromPolygons(keyValue.getValue().getBoundingRegions().get(0).getPolygon());
|
||||
|
||||
var line = LineUtils.findClosestMidpointLine(key, value);
|
||||
line = LineUtils.transform(line, resultToPageTransform.get(pageNumberWithOffset));
|
||||
line = LineUtils.transform(line, pageCtms.get(pageNumberWithOffset));
|
||||
var arrowHead = LineUtils.createArrowHead(line, Math.min(LineUtils.length(line), 5));
|
||||
var linesOnPage = getOrCreateVisualizationsOnPage(pageNumberWithOffset, keyValuePairs).getColoredLines();
|
||||
linesOnPage.add(new ColoredLine(line, KEY_VALUE_BBOX_COLOR, LINE_WIDTH));
|
||||
@ -140,7 +142,7 @@ public class IdpLayer extends IdpLayerConfig {
|
||||
private void addPolygon(int pageNumber, List<Double> polygon, Visualizations visualizations, Color color) {
|
||||
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, visualizations);
|
||||
visualizationsOnPage.getColoredLines().addAll(LineUtils.quadPointAsLines(QuadPoint.fromPolygons(polygon).getTransformed(resultToPageTransform.get(pageNumber)), color));
|
||||
visualizationsOnPage.getColoredLines().addAll(LineUtils.quadPointAsLines(QuadPoint.fromPolygons(polygon).getTransformed(pageCtms.get(pageNumber)), color));
|
||||
}
|
||||
|
||||
|
||||
@ -179,8 +181,7 @@ public class IdpLayer extends IdpLayerConfig {
|
||||
|
||||
var vis = getOrCreateVisualizationsOnPage(pageOffset.getPageNumber(boundingRegion.getPageNumber()), tables);
|
||||
|
||||
QuadPoint qp = QuadPoint.fromPolygons(boundingRegion.getPolygon())
|
||||
.getTransformed(resultToPageTransform.get(pageOffset.getPageNumber(boundingRegion.getPageNumber())));
|
||||
QuadPoint qp = QuadPoint.fromPolygons(boundingRegion.getPolygon()).getTransformed(pageCtms.get(pageOffset.getPageNumber(boundingRegion.getPageNumber())));
|
||||
|
||||
vis.getFilledRectangles().add(new FilledRectangle(qp.getBounds2D(), TITLE_COLOR, 0.2f));
|
||||
|
||||
|
||||
@ -20,9 +20,9 @@ public class IdpLayerFactory {
|
||||
private final IdpLayer idpLayer;
|
||||
|
||||
|
||||
IdpLayerFactory(Map<Integer, AffineTransform> resultToPageTransform) {
|
||||
IdpLayerFactory(Map<Integer, AffineTransform> pageCtms) {
|
||||
|
||||
this.idpLayer = new IdpLayer(resultToPageTransform);
|
||||
this.idpLayer = new IdpLayer(pageCtms);
|
||||
}
|
||||
|
||||
|
||||
@ -65,6 +65,10 @@ public class IdpLayerFactory {
|
||||
analyzeResult.getTables()
|
||||
.forEach(documentTable -> idpLayer.addTable(documentTable, pageOffset));
|
||||
}
|
||||
if (analyzeResult.getLists() != null) {
|
||||
analyzeResult.getLists()
|
||||
.forEach(list -> idpLayer.addList(list, pageOffset));
|
||||
}
|
||||
if (analyzeResult.getKeyValuePairs() != null) {
|
||||
analyzeResult.getKeyValuePairs()
|
||||
.forEach(keyValue -> idpLayer.addKeyValue(keyValue, pageOffset));
|
||||
|
||||
@ -1,241 +0,0 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.visualizations.layers;
|
||||
|
||||
import static com.knecon.fforesight.service.ocr.processor.utils.StringCleaningUtility.cleanString;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
|
||||
import com.azure.ai.documentintelligence.models.AnalyzeResult;
|
||||
import com.azure.ai.documentintelligence.models.BoundingRegion;
|
||||
import com.azure.ai.documentintelligence.models.DocumentCaption;
|
||||
import com.azure.ai.documentintelligence.models.DocumentFigure;
|
||||
import com.azure.ai.documentintelligence.models.DocumentFootnote;
|
||||
import com.azure.ai.documentintelligence.models.DocumentKeyValuePair;
|
||||
import com.azure.ai.documentintelligence.models.DocumentTable;
|
||||
import com.azure.ai.documentintelligence.models.DocumentTableCell;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.DocumentSpanLookup;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.PageBatch;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.PageInformation;
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.utils.Rectangle2DBBoxCollector;
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.utils.RotationCorrectionUtility;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.AzureOcrFeature;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.Figure;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.KeyValuePair;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.QuadPoint;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.Region;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.Table;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.TableCell;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.TableCellType;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.TextRegion;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Getter
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class IdpResultFactory {
|
||||
|
||||
IdpResult idpResult;
|
||||
Map<Integer, AffineTransform> resultToPageTransforms;
|
||||
Map<Integer, PageInformation> pageInformation;
|
||||
Map<Integer, Double> angles;
|
||||
boolean rotationCorrection;
|
||||
|
||||
|
||||
public IdpResultFactory(Map<Integer, AffineTransform> resultToPageTransforms,
|
||||
Map<Integer, PageInformation> pageInformation,
|
||||
Map<Integer, Double> angles,
|
||||
Set<AzureOcrFeature> features) {
|
||||
|
||||
this.angles = angles;
|
||||
|
||||
this.rotationCorrection = features.contains(AzureOcrFeature.ROTATION_CORRECTION);
|
||||
this.resultToPageTransforms = resultToPageTransforms;
|
||||
this.pageInformation = pageInformation;
|
||||
this.idpResult = IdpResult.initSynchronized();
|
||||
}
|
||||
|
||||
|
||||
public AffineTransform getResultToPageTransform(Integer pageNumber) {
|
||||
|
||||
AffineTransform transform;
|
||||
if (rotationCorrection) {
|
||||
PageInformation page = pageInformation.get(pageNumber);
|
||||
transform = RotationCorrectionUtility.buildTransform(-angles.get(pageNumber), page.cropBox().getWidth(), page.cropBox().getHeight(), false);
|
||||
} else {
|
||||
transform = new AffineTransform();
|
||||
}
|
||||
transform.concatenate(resultToPageTransforms.get(pageNumber));
|
||||
return transform;
|
||||
}
|
||||
|
||||
|
||||
public void addAnalyzeResult(AnalyzeResult analyzeResult, PageBatch batch) {
|
||||
|
||||
DocumentSpanLookup words = new DocumentSpanLookup(analyzeResult);
|
||||
if (analyzeResult.getTables() != null) {
|
||||
analyzeResult.getTables()
|
||||
.forEach(documentTable -> addTable(documentTable, words, batch));
|
||||
}
|
||||
if (analyzeResult.getKeyValuePairs() != null) {
|
||||
analyzeResult.getKeyValuePairs()
|
||||
.forEach(documentKeyValuePair -> addKeyValuePair(documentKeyValuePair, batch));
|
||||
}
|
||||
if (analyzeResult.getFigures() != null) {
|
||||
analyzeResult.getFigures()
|
||||
.forEach(documentFigure -> addFigure(documentFigure, batch, words));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void addFigure(DocumentFigure documentFigure, PageBatch batch, DocumentSpanLookup words) {
|
||||
|
||||
List<TextRegion> footNotes = new LinkedList<>();
|
||||
if (documentFigure.getFootnotes() != null) {
|
||||
documentFigure.getFootnotes()
|
||||
.stream()
|
||||
.map(footNote -> toTextRegion(footNote, batch))
|
||||
.filter(Objects::nonNull)
|
||||
.forEach(footNotes::add);
|
||||
}
|
||||
int batchPageNumber = documentFigure.getBoundingRegions()
|
||||
.get(0).getPageNumber();
|
||||
Region bbox = toRegionFromRegions(batch.getPageNumber(batchPageNumber), documentFigure.getBoundingRegions());
|
||||
TextRegion caption = toTextRegion(documentFigure.getCaption(), batch);
|
||||
idpResult.figures().add(new Figure(caption, bbox, footNotes));
|
||||
|
||||
}
|
||||
|
||||
|
||||
private void addKeyValuePair(DocumentKeyValuePair documentKeyValuePair, PageBatch batch) {
|
||||
|
||||
TextRegion key = null;
|
||||
if (documentKeyValuePair.getKey() != null && !documentKeyValuePair.getKey().getContent().isEmpty()) {
|
||||
Region region = toRegionFromRegions(batch, documentKeyValuePair.getKey().getBoundingRegions());
|
||||
key = new TextRegion(region, cleanString(documentKeyValuePair.getKey().getContent()));
|
||||
}
|
||||
TextRegion value = null;
|
||||
if (documentKeyValuePair.getValue() != null && !documentKeyValuePair.getValue().getContent().isEmpty()) {
|
||||
Region region = toRegionFromRegions(batch, documentKeyValuePair.getValue().getBoundingRegions());
|
||||
value = new TextRegion(region, cleanString(documentKeyValuePair.getValue().getContent()));
|
||||
}
|
||||
|
||||
idpResult.keyValuePairs().add(new KeyValuePair(key, value));
|
||||
}
|
||||
|
||||
|
||||
private void addTable(DocumentTable documentTable, DocumentSpanLookup words, PageBatch batch) {
|
||||
|
||||
TextRegion caption = toTextRegion(documentTable.getCaption(), batch);
|
||||
List<TableCell> tableCells = documentTable.getCells()
|
||||
.stream()
|
||||
.map(documentTableCell -> toTableCell(documentTableCell, words, batch))
|
||||
.toList();
|
||||
List<TextRegion> footNotes = new LinkedList<>();
|
||||
|
||||
if (documentTable.getFootnotes() != null) {
|
||||
documentTable.getFootnotes()
|
||||
.stream()
|
||||
.map(footNote -> toTextRegion(footNote, batch))
|
||||
.filter(Objects::nonNull)
|
||||
.forEach(footNotes::add);
|
||||
}
|
||||
List<Region> bbox = documentTable.getBoundingRegions()
|
||||
.stream()
|
||||
.map(b -> toRegion(b, batch))
|
||||
.toList();
|
||||
Table table = new Table(caption, documentTable.getColumnCount(), documentTable.getRowCount(), tableCells, footNotes, bbox);
|
||||
idpResult.tables().add(table);
|
||||
}
|
||||
|
||||
|
||||
private TextRegion toTextRegion(DocumentFootnote footNote, PageBatch batch) {
|
||||
|
||||
if (footNote == null || footNote.getBoundingRegions().isEmpty()) {
|
||||
return null;
|
||||
}
|
||||
|
||||
Region region = toRegionFromRegions(batch, footNote.getBoundingRegions());
|
||||
return new TextRegion(region, cleanString(footNote.getContent()));
|
||||
}
|
||||
|
||||
|
||||
private TextRegion toTextRegion(DocumentCaption caption, PageBatch batch) {
|
||||
|
||||
if (caption == null || caption.getBoundingRegions().isEmpty()) {
|
||||
return null;
|
||||
}
|
||||
|
||||
Region region = toRegionFromRegions(batch, caption.getBoundingRegions());
|
||||
return new TextRegion(region, cleanString(caption.getContent()));
|
||||
}
|
||||
|
||||
|
||||
private TableCell toTableCell(DocumentTableCell documentTableCell, DocumentSpanLookup words, PageBatch batch) {
|
||||
|
||||
int batchPageNumber = documentTableCell.getBoundingRegions()
|
||||
.get(0).getPageNumber();
|
||||
Region region = toRegionFromRegions(batch.getPageNumber(batchPageNumber), documentTableCell.getBoundingRegions());
|
||||
TableCellType kind = mapTableCellType(documentTableCell);
|
||||
return new TableCell(new TextRegion(region, cleanString(documentTableCell.getContent())), documentTableCell.getRowIndex(), documentTableCell.getColumnIndex(), kind);
|
||||
}
|
||||
|
||||
|
||||
private static TableCellType mapTableCellType(DocumentTableCell documentTableCell) {
|
||||
|
||||
if (documentTableCell.getKind() == null) {
|
||||
return TableCellType.CONTENT;
|
||||
}
|
||||
return switch (documentTableCell.getKind().toString()) {
|
||||
case "columnHeader" -> TableCellType.COLUMN_HEADER;
|
||||
case "rowHeader" -> TableCellType.ROW_HEADER;
|
||||
case "description" -> TableCellType.DESCRIPTION;
|
||||
case "stubHead" -> TableCellType.STUB_HEAD;
|
||||
default -> TableCellType.CONTENT;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
private Region toRegion(BoundingRegion boundingRegion, PageBatch batch) {
|
||||
|
||||
int pageNumber = batch.getPageNumber(boundingRegion.getPageNumber());
|
||||
QuadPoint qp = QuadPoint.fromPolygons(boundingRegion.getPolygon()).getTransformed(getResultToPageTransform(pageNumber));
|
||||
return new Region(pageNumber, qp.toData());
|
||||
}
|
||||
|
||||
|
||||
private Region toRegionFromRegions(int pageNumber, List<BoundingRegion> regions) {
|
||||
|
||||
if (regions.size() == 1) {
|
||||
return new Region(pageNumber, QuadPoint.fromPolygons(regions.get(0).getPolygon()).getTransformed(getResultToPageTransform(pageNumber)).toData());
|
||||
}
|
||||
QuadPoint bbox = QuadPoint.fromRectangle2D(regions.stream()
|
||||
.map(BoundingRegion::getPolygon)
|
||||
.map(QuadPoint::fromPolygons)
|
||||
.map(qp -> qp.getTransformed(getResultToPageTransform(pageNumber)).getBounds2D())
|
||||
.collect(new Rectangle2DBBoxCollector()));
|
||||
|
||||
return new Region(pageNumber, bbox.toData());
|
||||
}
|
||||
|
||||
|
||||
private Region toRegionFromRegions(PageBatch batch, List<BoundingRegion> regions) {
|
||||
|
||||
assert !regions.isEmpty();
|
||||
int batchPageNumber = regions.get(0).getPageNumber();
|
||||
if (!regions.stream()
|
||||
.map(BoundingRegion::getPageNumber)
|
||||
.allMatch(number -> number == batchPageNumber)) {
|
||||
throw new AssertionError();
|
||||
}
|
||||
int pageNumber = batch.getPageNumber(batchPageNumber);
|
||||
return toRegionFromRegions(pageNumber, regions);
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,77 +1,57 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.visualizations.layers;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.azure.ai.documentintelligence.models.AnalyzeResult;
|
||||
import com.knecon.fforesight.service.ocr.processor.OcrServiceSettings;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.PageBatch;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.PageInformation;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.OcrExecutionSupervisor;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.OcrResultPostProcessingPipeline;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.ImageProcessingPipeline;
|
||||
import com.knecon.fforesight.service.ocr.processor.OcrServiceSettings;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.ImageProcessingSupervisor;
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.WritableOcrResult;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.AzureOcrFeature;
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.WritableOcrResultFactory;
|
||||
import com.knecon.fforesight.service.viewerdoc.layers.LayerGroup;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class LayerFactory {
|
||||
|
||||
OcrExecutionSupervisor supervisor;
|
||||
OcrResultPostProcessingPipeline ocrResultPostProcessingPipeline;
|
||||
WritableOcrResultFactory writableOcrResultFactory;
|
||||
IdpLayerFactory idpLayerFactory;
|
||||
OcrDebugLayerFactory ocrDebugLayerFactory;
|
||||
OcrTextLayerFactory ocrTextLayerFactory;
|
||||
IdpResultFactory idpResultFactory;
|
||||
OcrServiceSettings settings;
|
||||
Set<AzureOcrFeature> features;
|
||||
Map<Integer, Double> angles;
|
||||
|
||||
|
||||
public LayerFactory(OcrServiceSettings settings,
|
||||
Set<AzureOcrFeature> features,
|
||||
OcrExecutionSupervisor supervisor,
|
||||
Map<Integer, PageInformation> pageInformation,
|
||||
ImageProcessingPipeline imageProcessingPipeline) {
|
||||
public LayerFactory(OcrServiceSettings settings, OcrExecutionSupervisor supervisor, ImageProcessingSupervisor imageSupervisor, Map<Integer, PageInformation> pageInformation) {
|
||||
|
||||
this.ocrResultPostProcessingPipeline = new OcrResultPostProcessingPipeline(pageInformation, imageProcessingPipeline, settings, features);
|
||||
this.idpLayerFactory = new IdpLayerFactory(ocrResultPostProcessingPipeline.getResultToPageTransforms());
|
||||
this.writableOcrResultFactory = new WritableOcrResultFactory(pageInformation, settings, imageSupervisor);
|
||||
this.idpLayerFactory = new IdpLayerFactory(writableOcrResultFactory.getPageCtms());
|
||||
this.ocrDebugLayerFactory = new OcrDebugLayerFactory();
|
||||
this.ocrTextLayerFactory = new OcrTextLayerFactory();
|
||||
this.settings = settings;
|
||||
this.features = features;
|
||||
this.supervisor = supervisor;
|
||||
this.angles = Collections.synchronizedMap(new HashMap<>());
|
||||
this.idpResultFactory = new IdpResultFactory(ocrResultPostProcessingPipeline.getResultToPageTransforms(), pageInformation, angles, features);
|
||||
}
|
||||
|
||||
|
||||
public void processAnalyzeResult(PageBatch batch, AnalyzeResult analyzeResult) throws InterruptedException {
|
||||
|
||||
List<WritableOcrResult> results = ocrResultPostProcessingPipeline.processAnalyzeResult(analyzeResult, batch);
|
||||
results.forEach(result -> angles.put(result.getPageNumber(), result.getAngle()));
|
||||
public void addAnalyzeResult(PageBatch pageRange, AnalyzeResult analyzeResult) throws InterruptedException {
|
||||
|
||||
List<WritableOcrResult> results = writableOcrResultFactory.buildOcrResultToWrite(analyzeResult, pageRange);
|
||||
ocrTextLayerFactory.addWritableOcrResult(results);
|
||||
|
||||
if (settings.isDebug()) {
|
||||
ocrDebugLayerFactory.addAnalysisResult(results);
|
||||
}
|
||||
if (features.contains(AzureOcrFeature.IDP)) {
|
||||
idpLayerFactory.addAnalyzeResult(analyzeResult, batch);
|
||||
idpResultFactory.addAnalyzeResult(analyzeResult, batch);
|
||||
if (settings.isIdpEnabled()) {
|
||||
idpLayerFactory.addAnalyzeResult(analyzeResult, pageRange);
|
||||
}
|
||||
|
||||
this.supervisor.finishMappingResult(batch);
|
||||
this.supervisor.finishMappingResult(pageRange);
|
||||
}
|
||||
|
||||
|
||||
@ -84,11 +64,10 @@ public class LayerFactory {
|
||||
if (settings.isDebug()) {
|
||||
debugLayers.add(ocrDebugLayerFactory.getOcrDebugLayer());
|
||||
}
|
||||
if (features.contains(AzureOcrFeature.IDP)) {
|
||||
if (settings.isIdpEnabled()) {
|
||||
debugLayers.add(idpLayerFactory.getIdpLayer());
|
||||
}
|
||||
IdpResult idpResult = features.contains(AzureOcrFeature.IDP) ? idpResultFactory.getIdpResult() : null;
|
||||
return new OcrResult(List.of(ocrTextLayer), debugLayers, angles, idpResult);
|
||||
return new OcrResult(List.of(ocrTextLayer), debugLayers);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -31,7 +31,7 @@ public class OcrDebugLayer extends OcrDebugLayerConfig {
|
||||
word.getFontMetricsProvider(),
|
||||
Optional.of(word.getTextMatrix()),
|
||||
Optional.of(RenderingMode.FILL)));
|
||||
bboxOnPage.getColoredLines().addAll(LineUtils.quadPointAsLines(word.getTransformedTextBBox(), word.isSnugBBox()));
|
||||
bboxOnPage.getColoredLines().addAll(LineUtils.quadPointAsLines(word.getTransformedTextBBox()));
|
||||
}
|
||||
|
||||
|
||||
@ -57,11 +57,4 @@ public class OcrDebugLayer extends OcrDebugLayerConfig {
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean isVisibleByDefault() {
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,11 +1,9 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.visualizations.layers;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
|
||||
import com.knecon.fforesight.service.viewerdoc.layers.LayerGroup;
|
||||
|
||||
public record OcrResult(List<LayerGroup> regularLayers, List<LayerGroup> debugLayers, Map<Integer, Double> anglesPerPage, IdpResult idpResult) {
|
||||
public record OcrResult(List<LayerGroup> regularLayers, List<LayerGroup> debugLayers) {
|
||||
|
||||
}
|
||||
@ -14,19 +14,12 @@ import lombok.experimental.UtilityClass;
|
||||
@UtilityClass
|
||||
public class LineUtils {
|
||||
|
||||
public List<ColoredLine> quadPointAsLines(QuadPoint rect, boolean tight) {
|
||||
public List<ColoredLine> quadPointAsLines(QuadPoint rect) {
|
||||
|
||||
if (tight) {
|
||||
return List.of(new ColoredLine(new Line2D.Double(rect.a(), rect.b()), Color.GREEN, 1),
|
||||
new ColoredLine(new Line2D.Double(rect.b(), rect.c()), Color.GREEN, 1),
|
||||
new ColoredLine(new Line2D.Double(rect.c(), rect.d()), Color.GREEN, 1),
|
||||
new ColoredLine(new Line2D.Double(rect.d(), rect.a()), Color.GREEN, 1));
|
||||
}
|
||||
|
||||
return List.of(new ColoredLine(new Line2D.Double(rect.a(), rect.b()), Color.BLUE, 1),
|
||||
return List.of(new ColoredLine(new Line2D.Double(rect.a(), rect.b()), Color.ORANGE, 1),
|
||||
new ColoredLine(new Line2D.Double(rect.b(), rect.c()), Color.BLUE, 1),
|
||||
new ColoredLine(new Line2D.Double(rect.c(), rect.d()), Color.BLUE, 1),
|
||||
new ColoredLine(new Line2D.Double(rect.d(), rect.a()), Color.BLUE, 1));
|
||||
new ColoredLine(new Line2D.Double(rect.c(), rect.d()), Color.GREEN, 1),
|
||||
new ColoredLine(new Line2D.Double(rect.d(), rect.a()), Color.MAGENTA, 1));
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -1,217 +0,0 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.visualizations.utils;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.knecon.fforesight.service.viewerdoc.LayerIdentifier;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.sdf.Obj;
|
||||
import com.pdftron.sdf.SDFDoc;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class RotationCorrectionUtility {
|
||||
|
||||
public static final LayerIdentifier KNECON_ROTATION_CORRECTION = new LayerIdentifier(null, "ROTATION_CORRECTION");
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void rotatePages(Path inputFile, Path outputFile, Map<Integer, Double> anglesPerPage) {
|
||||
|
||||
Path tmp = Files.createTempFile("tempDocument", ".pdf");
|
||||
Files.copy(inputFile, tmp, StandardCopyOption.REPLACE_EXISTING);
|
||||
try (var in = new FileInputStream(tmp.toFile()); var out = new FileOutputStream(outputFile.toFile())) {
|
||||
rotatePages(in, out, anglesPerPage);
|
||||
}
|
||||
Files.deleteIfExists(tmp);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void rotatePages(InputStream in, OutputStream out, Map<Integer, Double> anglesPerPage) {
|
||||
|
||||
try (PDFDoc doc = new PDFDoc(in)) {
|
||||
anglesPerPage.forEach((pageNumber, angle) -> rotatePage(pageNumber, doc, angle));
|
||||
doc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void rotatePage(int pageNumber, PDFDoc doc, double angle) {
|
||||
|
||||
int quadrants = getQuadrantRotation(angle);
|
||||
Page page = doc.getPage(pageNumber);
|
||||
page.setRotation((quadrants + page.getRotation()) % 4);
|
||||
double remainingAngle = getRemainingAngle(angle, quadrants);
|
||||
|
||||
Obj contents = page.getContents();
|
||||
String content = buildRotationContent(remainingAngle, page);
|
||||
Obj rotationStream = doc.createIndirectStream(content.getBytes());
|
||||
Obj newContentsArray = doc.createIndirectArray();
|
||||
newContentsArray.pushBack(rotationStream);
|
||||
addPreviousContents(contents, newContentsArray);
|
||||
String closingContent = buildClosingContent();
|
||||
Obj closingStream = doc.createIndirectStream(closingContent.getBytes());
|
||||
newContentsArray.pushBack(closingStream);
|
||||
page.getSDFObj().erase("Contents");
|
||||
page.getSDFObj().put("Contents", newContentsArray);
|
||||
}
|
||||
|
||||
|
||||
private String buildClosingContent() {
|
||||
|
||||
List<String> closingCommands = new LinkedList<>();
|
||||
closingCommands.add("Q");
|
||||
return String.join("\n", closingCommands);
|
||||
}
|
||||
|
||||
|
||||
private String buildRotationContent(double angle, Page page) throws PDFNetException {
|
||||
|
||||
List<String> commands = new LinkedList<>();
|
||||
|
||||
double scale = getScalingFactor(angle, page);
|
||||
double x = page.getCropBox().getWidth() / 2;
|
||||
double y = page.getCropBox().getHeight() / 2;
|
||||
commands.add("q");
|
||||
commands.add("/%s <<>> BDC".formatted(KNECON_ROTATION_CORRECTION.markedContentName()));
|
||||
commands.add(buildMatrixCommands(AffineTransform.getTranslateInstance(x, y)));
|
||||
commands.add(buildMatrixCommands(AffineTransform.getRotateInstance(Math.toRadians(angle))));
|
||||
commands.add(buildMatrixCommands(AffineTransform.getScaleInstance(scale, scale)));
|
||||
commands.add(buildMatrixCommands(AffineTransform.getTranslateInstance(-x, -y)));
|
||||
commands.add("EMC");
|
||||
return String.join("\n", commands);
|
||||
}
|
||||
|
||||
|
||||
private void addPreviousContents(Obj contents, Obj newContentsArray) throws PDFNetException {
|
||||
|
||||
switch (contents.getType()) {
|
||||
case Obj.e_array -> {
|
||||
for (int i = 0; i < contents.size(); i++) {
|
||||
newContentsArray.pushBack(contents.getAt(i));
|
||||
}
|
||||
}
|
||||
case Obj.e_stream -> newContentsArray.pushBack(contents);
|
||||
default -> throw new IllegalStateException("Unexpected value: " + contents.getType());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static double getScalingFactor(double angle, Page page) throws PDFNetException {
|
||||
|
||||
double width = page.getPageWidth();
|
||||
double height = page.getPageHeight();
|
||||
return getScalingFactor(angle, width, height);
|
||||
}
|
||||
|
||||
|
||||
public static double getScalingFactor(double angle, double w, double h) {
|
||||
|
||||
if (Math.abs(angle) < 20) {
|
||||
return 1;
|
||||
}
|
||||
double sin = Math.abs(Math.sin(Math.toRadians(angle)));
|
||||
double cos = Math.abs(Math.cos(Math.toRadians(angle)));
|
||||
double newWidth = w * cos + h * sin;
|
||||
double newHeight = h * cos + w * sin;
|
||||
return Math.min(w / newWidth, h / newHeight);
|
||||
}
|
||||
|
||||
|
||||
public static AffineTransform buildTransform(double angle, double originalWidth, double originalHeight) {
|
||||
|
||||
return buildTransform(angle, originalWidth, originalHeight, true);
|
||||
}
|
||||
|
||||
|
||||
public static AffineTransform buildTransform(double angle, double originalWidth, double originalHeight, boolean quadrantRotation) {
|
||||
|
||||
int quadrants = getQuadrantRotation(angle);
|
||||
|
||||
double h = originalHeight;
|
||||
double w = originalWidth;
|
||||
|
||||
AffineTransform quadrantRotationTransform = new AffineTransform();
|
||||
if (quadrantRotation) {
|
||||
|
||||
if (quadrants == 1 || quadrants == 3) {
|
||||
w = originalHeight;
|
||||
h = originalWidth;
|
||||
}
|
||||
|
||||
quadrantRotationTransform = switch (quadrants) {
|
||||
case 1 -> new AffineTransform(0, 1, -1, 0, h, 0);
|
||||
case 2 -> new AffineTransform(-1, 0, 0, -1, w, h);
|
||||
case 3 -> new AffineTransform(0, -1, 1, 0, w - h, h);
|
||||
default -> new AffineTransform();
|
||||
};
|
||||
}
|
||||
|
||||
double remainder = getRemainingAngle(angle, quadrants);
|
||||
double scale = getScalingFactor(remainder, w, h);
|
||||
|
||||
AffineTransform transform = new AffineTransform();
|
||||
transform.translate(w / 2, h / 2);
|
||||
transform.rotate(Math.toRadians(remainder));
|
||||
transform.scale(scale, scale);
|
||||
transform.translate(-w / 2, -h / 2);
|
||||
transform.concatenate(quadrantRotationTransform);
|
||||
|
||||
return transform;
|
||||
}
|
||||
|
||||
|
||||
public static int getQuadrantRotation(double angle) {
|
||||
|
||||
double remainder = angle % 360;
|
||||
|
||||
if (remainder < 0) {
|
||||
remainder += 360;
|
||||
}
|
||||
|
||||
if (remainder > 315 || remainder <= 45) {
|
||||
return 0;
|
||||
} else if (remainder > 45 && remainder <= 135) {
|
||||
return 1;
|
||||
} else if (remainder > 135 && remainder <= 225) {
|
||||
return 2;
|
||||
} else {
|
||||
return 3;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static double getRemainingAngle(double angle, int quadrants) {
|
||||
|
||||
double referenceAngle = 90 * quadrants;
|
||||
return (angle - referenceAngle) % 360;
|
||||
}
|
||||
|
||||
|
||||
public static double getRemainingAngle(double angle) {
|
||||
|
||||
return getRemainingAngle(angle, getQuadrantRotation(angle));
|
||||
}
|
||||
|
||||
|
||||
private String buildMatrixCommands(AffineTransform at) {
|
||||
|
||||
return "%f %f %f %f %f %f cm".formatted(at.getScaleX(), at.getShearX(), at.getShearY(), at.getScaleY(), at.getTranslateX(), at.getTranslateY());
|
||||
}
|
||||
|
||||
}
|
||||
@ -4,8 +4,8 @@ import java.io.File;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
@ -13,16 +13,11 @@ import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.OcrServiceSettings;
|
||||
import com.knecon.fforesight.service.ocr.processor.initializer.NativeLibrariesInitializer;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.PageBatch;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.GhostScriptService;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.ImageProcessingPipeline;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.ImageProcessingService;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.imageprocessing.ImageProcessingSupervisor;
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.OsUtils;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.PDFNet;
|
||||
import com.sun.jna.NativeLibrary;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
@ -36,11 +31,13 @@ class ImageProcessingPipelineTest {
|
||||
@BeforeEach
|
||||
public void setup() {
|
||||
|
||||
new NativeLibrariesInitializer("demo:1650351709282:7bd235e003000000004ec28a6743e1163a085e2115de2536ab6e2cfe5a", "/home/kschuettler/software/leptonica/vcpkg/installed/x64-linux-dynamic/lib/").init();
|
||||
System.setProperty("jna.library.path", System.getenv("VCPKG_DYNAMIC_LIB"));
|
||||
try (NativeLibrary leptonicaLib = NativeLibrary.getInstance("leptonica")) {
|
||||
assert leptonicaLib != null;
|
||||
}
|
||||
|
||||
OcrServiceSettings settings = new OcrServiceSettings();
|
||||
ImageProcessingService imageProcessingService = new ImageProcessingService(settings);
|
||||
GhostScriptService ghostScriptService = new GhostScriptService(settings);
|
||||
ImageProcessingService imageProcessingService = new ImageProcessingService();
|
||||
GhostScriptService ghostScriptService = new GhostScriptService();
|
||||
imageProcessingPipeline = new ImageProcessingPipeline(ghostScriptService, imageProcessingService);
|
||||
}
|
||||
|
||||
@ -49,7 +46,7 @@ class ImageProcessingPipelineTest {
|
||||
@SneakyThrows
|
||||
public void testImageProcessingPipeline() {
|
||||
|
||||
String fileName = "/home/kschuettler/Dokumente/TestFiles/OCR/VV-331340/VV-331340_OCRED_first15.pdf";
|
||||
String fileName = "/home/kschuettler/Dokumente/TestFiles/OCR/VV-331340.pdf";
|
||||
|
||||
File file;
|
||||
if (fileName.startsWith("files")) {
|
||||
@ -66,26 +63,21 @@ class ImageProcessingPipelineTest {
|
||||
|
||||
Files.copy(file.toPath(), documentFile, StandardCopyOption.REPLACE_EXISTING);
|
||||
|
||||
try (var doc = new PDFDoc(fileName)) {
|
||||
List<Integer> pageNumbers = new LinkedList<>();
|
||||
for (int i = 1; i <= doc.getPageCount(); i++) {
|
||||
if (i % 2 == 0) {
|
||||
continue;
|
||||
}
|
||||
pageNumbers.add(i);
|
||||
}
|
||||
PageBatch batch = BatchFactory.create(0, doc, pageNumbers, tmpDir);
|
||||
|
||||
ImageProcessingSupervisor supervisor = imageProcessingPipeline.addToPipeline(batch);
|
||||
|
||||
batch.forEach(pageNumber -> {
|
||||
try {
|
||||
assert supervisor.awaitProcessedPage(pageNumber) != null;
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
});
|
||||
int numberOfpages;
|
||||
try (var doc = Loader.loadPDF(file)) {
|
||||
numberOfpages = doc.getNumberOfPages();
|
||||
}
|
||||
Set<Integer> pageNumbers = new HashSet<>();
|
||||
for (int i = 1; i <= numberOfpages; i++) {
|
||||
if (i % 2 == 0) {
|
||||
continue;
|
||||
}
|
||||
pageNumbers.add(i);
|
||||
}
|
||||
|
||||
ImageProcessingSupervisor supervisor = imageProcessingPipeline.run(pageNumbers, tmpDir.resolve("images"), documentFile.toFile());
|
||||
|
||||
supervisor.awaitAll();
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,70 +0,0 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import static com.knecon.fforesight.service.ocr.processor.visualizations.utils.RotationCorrectionUtility.KNECON_ROTATION_CORRECTION;
|
||||
|
||||
import java.nio.file.Path;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.utils.RotationCorrectionUtility;
|
||||
import com.knecon.fforesight.service.viewerdoc.service.PageContentCleaner;
|
||||
import com.pdftron.pdf.ElementReader;
|
||||
import com.pdftron.pdf.ElementWriter;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.PDFNet;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.pdf.PageIterator;
|
||||
import com.pdftron.sdf.SDFDoc;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@Disabled // leptonica is not available in build server
|
||||
public class PageRotationTest {
|
||||
|
||||
@BeforeAll
|
||||
public static void setUp() {
|
||||
|
||||
PDFNet.initialize("demo:1650351709282:7bd235e003000000004ec28a6743e1163a085e2115de2536ab6e2cfe5a");
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void putRotation() {
|
||||
|
||||
Map<Integer, Double> angles = new HashMap<>();
|
||||
for (int i = 1; i <= 100; i++) {
|
||||
double a = -90 + (i * ((double) 180 / 100));
|
||||
angles.put(i, a);
|
||||
}
|
||||
Path inputFile = Path.of("/home/kschuettler/Dokumente/TestFiles/OCR/VV-331340-first100.pdf");
|
||||
RotationCorrectionUtility.rotatePages(inputFile, Path.of("/tmp").resolve(inputFile.getFileName() + "_rotated.pdf"), angles);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void removeRotation() {
|
||||
|
||||
Path inputFile = Path.of("/tmp/VV-331340-first100.pdf_rotated.pdf");
|
||||
try (var doc = new PDFDoc(inputFile.toFile()
|
||||
.toString()); var reader = new ElementReader(); var writer = new ElementWriter(); PageIterator pageIterator = doc.getPageIterator()) {
|
||||
PageContentCleaner cleaner = PageContentCleaner.builder()
|
||||
.reader(reader)
|
||||
.writer(writer)
|
||||
.markedContentToRemove(Set.of(KNECON_ROTATION_CORRECTION.markedContentName()))
|
||||
.build();
|
||||
|
||||
while (pageIterator.hasNext()) {
|
||||
Page page = pageIterator.next();
|
||||
cleaner.removeMarkedContent(page);
|
||||
}
|
||||
doc.save(inputFile.resolveSibling(inputFile.getFileName() + "_derotated.pdf").toFile().toString(), SDFDoc.SaveMode.LINEARIZED, null);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,232 +0,0 @@
|
||||
package com.knecon.fforesight.service.ocr.processor.service;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Line2D;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.azure.ai.documentintelligence.models.AnalyzeResult;
|
||||
import com.azure.json.JsonOptions;
|
||||
import com.azure.json.JsonReader;
|
||||
import com.azure.json.implementation.DefaultJsonReader;
|
||||
import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService;
|
||||
import com.knecon.fforesight.service.ocr.processor.OcrServiceSettings;
|
||||
import com.knecon.fforesight.service.ocr.processor.initializer.NativeLibrariesInitializer;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.ImageFile;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.PageInformation;
|
||||
import com.knecon.fforesight.service.ocr.processor.model.TextPositionInImage;
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.WritableOcrResult;
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.layers.OcrDebugLayerFactory;
|
||||
import com.knecon.fforesight.service.ocr.processor.visualizations.utils.RotationCorrectionUtility;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.QuadPoint;
|
||||
import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService;
|
||||
import com.pdftron.pdf.ColorPt;
|
||||
import com.pdftron.pdf.ColorSpace;
|
||||
import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.ElementBuilder;
|
||||
import com.pdftron.pdf.ElementWriter;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.Page;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@Disabled // leptonica is not available in build server
|
||||
public class SnugBoxesTest {
|
||||
|
||||
public static final int PAGE_NUMBER = 41;
|
||||
public static final Path ORIGIN_FILE = Path.of("/home/kschuettler/Dokumente/TestFiles/OCR/VV-331340-first100.pdf");
|
||||
public static final Path TEST_FOLDER = Path.of("/tmp/OCR_TEST/").resolve(ORIGIN_FILE.getFileName());
|
||||
public static final Path BATCH_FOLDER = TEST_FOLDER.resolve("batch_0");
|
||||
public static final Path DESTINATION_FILE = BATCH_FOLDER.resolve("SnugBoxesTest.pdf");
|
||||
|
||||
PDFTronViewerDocumentService viewerDocumentService = new PDFTronViewerDocumentService(null);
|
||||
|
||||
|
||||
@BeforeAll
|
||||
public static void setUp() {
|
||||
|
||||
new NativeLibrariesInitializer("demo:1650351709282:7bd235e003000000004ec28a6743e1163a085e2115de2536ab6e2cfe5a", "/home/kschuettler/software/leptonica/vcpkg/installed/x64-linux-dynamic/lib/").init();
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void snugBoxes() {
|
||||
|
||||
String filePath = ORIGIN_FILE.toFile().toString();
|
||||
File file = new File(filePath);
|
||||
assert file.exists();
|
||||
ImageFile imageFile = new ImageFile(PAGE_NUMBER, file.toString());
|
||||
AnalyzeResult result = null;
|
||||
try (var in = new FileInputStream(BATCH_FOLDER.resolve("analyzeResult.json").toFile()); JsonReader reader = DefaultJsonReader.fromStream(in, new JsonOptions());) {
|
||||
result = AnalyzeResult.fromJson(reader);
|
||||
}
|
||||
|
||||
var resultPage = result.getPages()
|
||||
.get(PAGE_NUMBER - 1);
|
||||
OcrResultPostProcessingPipeline ocrResultPostProcessingPipeline = new OcrResultPostProcessingPipeline(null, null, new OcrServiceSettings(), Set.of());
|
||||
OcrDebugLayerFactory debugLayerFactory = new OcrDebugLayerFactory();
|
||||
InvisibleElementRemovalService invisibleElementRemovalService = new InvisibleElementRemovalService();
|
||||
try (var in = new FileInputStream(ORIGIN_FILE.toFile()); var out = new FileOutputStream(DESTINATION_FILE.toFile())) {
|
||||
invisibleElementRemovalService.removeInvisibleElements(in, out, false);
|
||||
}
|
||||
PageInformation pageInformation = getPageInformation(PAGE_NUMBER, DESTINATION_FILE.toFile().toString());
|
||||
OcrResultPostProcessingPipeline.Lookups empty = OcrResultPostProcessingPipeline.Lookups.empty();
|
||||
|
||||
AffineTransform pageCtm = getPageCtm(PAGE_NUMBER, filePath, resultPage.getWidth());
|
||||
// pageCtm.preConcatenate(rotationCorrection);
|
||||
// pageCtm.preConcatenate(quadrantTransform);
|
||||
// Pix pageImage = imageFile.readPix();
|
||||
// AffineTransform imageTransform = WritableOcrResultFactory.buildImageTransform(resultPage, pageImage);
|
||||
// List<Rectangle2D> rects = new LinkedList<>();
|
||||
// for (DocumentWord word : resultPage.getWords()) {
|
||||
// QuadPoint quadPoint = QuadPoint.fromPolygons(word.getPolygon());
|
||||
// Rectangle2D rect = quadPoint.getTransformed(imageTransform).getBounds2D();
|
||||
// if (rect.getX() > 0 && rect.getY() > 0 && rect.getMaxX() < pageImage.w && rect.getMaxY() < pageImage.h) {
|
||||
// rects.add(rect);
|
||||
// }
|
||||
// }
|
||||
// Boxa boxa = createBoxaFromRectangles(rects);
|
||||
// Pix drawedPix = Leptonica1.pixDrawBoxa(pageImage, boxa, 5, 1);
|
||||
// Leptonica1.pixWrite("/tmp/OCR_TEST/VV-331340-first100.pdf/image_pipeline/page_" + PAGE_NUMBER + ".tiff", drawedPix, 5);
|
||||
|
||||
//
|
||||
|
||||
List<TextPositionInImage> words = ocrResultPostProcessingPipeline.buildTextWithSnugBBoxes(resultPage, imageFile, pageCtm, empty, pageInformation);
|
||||
var results = new WritableOcrResult(PAGE_NUMBER, -resultPage.getAngle(), words, Collections.emptyList());
|
||||
debugLayerFactory.addAnalysisResult(List.of(results));
|
||||
|
||||
// try (var doc = new PDFDoc(tmpFile.toString()); var out = new FileOutputStream(DESTINATION_FILE.toFile())) {
|
||||
// PageRotationHelper.rotatePage(PAGE_NUMBER, doc, -resultPage.getAngle());
|
||||
// var rects = resultPage.getWords()
|
||||
// .stream()
|
||||
// .map(DocumentWord::getPolygon)
|
||||
// .map(QuadPoint::fromPolygons)
|
||||
// .map(qp -> qp.getTransformed(pageCtm))
|
||||
// .toList();
|
||||
// drawRects(doc, rects, PAGE_NUMBER);
|
||||
// doc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
|
||||
// }
|
||||
// Files.deleteIfExists(tmpFile);
|
||||
|
||||
viewerDocumentService.addLayerGroups(DESTINATION_FILE.toFile(), DESTINATION_FILE.toFile(), List.of(debugLayerFactory.getOcrDebugLayer()));
|
||||
RotationCorrectionUtility.rotatePages(DESTINATION_FILE, DESTINATION_FILE, Map.of(PAGE_NUMBER, -resultPage.getAngle()));
|
||||
}
|
||||
|
||||
//
|
||||
// private static List<Rectangle2D> readRectsFromBoxa(Boxa boxa) {
|
||||
//
|
||||
// Pointer[] pointers = boxa.box.getPointer().getPointerArray(0, boxa.n);
|
||||
// List<Rectangle2D> boxes = new ArrayList<>(boxa.n);
|
||||
// for (int i = 0; i < boxa.n; i++) {
|
||||
// Box box = new Box(pointers[i]);
|
||||
// boxes.add(new Rectangle2D.Double(box.x, box.y, box.w, box.h));
|
||||
// LeptUtils.dispose(box);
|
||||
// }
|
||||
// return boxes;
|
||||
// }
|
||||
//
|
||||
//
|
||||
// @SuppressWarnings("PMD") // Memory will be de-allocated with boxa
|
||||
// public static Boxa createBoxaFromRectangles(List<Rectangle2D> rectangles) {
|
||||
//
|
||||
// if (rectangles.isEmpty()) {
|
||||
// return new Boxa();
|
||||
// }
|
||||
//
|
||||
// int n = rectangles.size(); // Number of rectangles
|
||||
// int nalloc = n; // Allocating memory for exactly 'n' boxes
|
||||
// int refcount = 1; // Default refcount
|
||||
//
|
||||
// Pointer boxPointerArray = new Memory((long) Native.POINTER_SIZE * n); // Memory for n pointers
|
||||
//
|
||||
// for (int i = 0; i < n; i++) {
|
||||
//
|
||||
// Rectangle2D rect = rectangles.get(i);
|
||||
// var mem = new Memory(20L);
|
||||
// mem.setInt(0, (int) rect.getX());
|
||||
// mem.setInt(4, (int) rect.getY());
|
||||
// mem.setInt(8, (int) rect.getWidth());
|
||||
// mem.setInt(12, (int) rect.getHeight());
|
||||
// mem.setInt(16, refcount);
|
||||
//
|
||||
// // Write the pointer of each Box into the native memory
|
||||
// boxPointerArray.setPointer((long) Native.POINTER_SIZE * i, mem);
|
||||
// }
|
||||
//
|
||||
// // Create a PointerByReference pointing to the native memory of the array
|
||||
// PointerByReference boxPointerRef = new PointerByReference();
|
||||
// boxPointerRef.setPointer(boxPointerArray);
|
||||
//
|
||||
// // Create the Boxa instance
|
||||
//
|
||||
// return new Boxa(n, nalloc, refcount, boxPointerRef);
|
||||
// }
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void drawRects(PDFDoc doc, List<QuadPoint> quadPoints, int pageNumber) {
|
||||
|
||||
try (ElementWriter writer = new ElementWriter(); ElementBuilder builder = new ElementBuilder()) {
|
||||
Page page = doc.getPage(pageNumber);
|
||||
writer.begin(page, ElementWriter.e_overlay);
|
||||
for (QuadPoint quadPoint : quadPoints) {
|
||||
quadPoint.asLines()
|
||||
.forEach(line -> {
|
||||
drawLine(line, builder, writer);
|
||||
});
|
||||
}
|
||||
writer.end();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private static void drawLine(Line2D l, ElementBuilder builder, ElementWriter writer) {
|
||||
|
||||
float[] rgbComponents = Color.BLUE.getRGBColorComponents(null);
|
||||
|
||||
builder.pathBegin();
|
||||
builder.moveTo(l.getX1(), l.getY1());
|
||||
builder.lineTo(l.getX2(), l.getY2());
|
||||
Element line = builder.pathEnd();
|
||||
|
||||
line.setPathStroke(true);
|
||||
line.setPathFill(false);
|
||||
line.getGState().setLineWidth(1);
|
||||
line.getGState().setStrokeColorSpace(ColorSpace.createDeviceRGB());
|
||||
|
||||
try (ColorPt color = new ColorPt(rgbComponents[0], rgbComponents[1], rgbComponents[2])) {
|
||||
line.getGState().setStrokeColor(color);
|
||||
}
|
||||
writer.writeElement(line);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private static AffineTransform getPageCtm(int pageNumber, String file, double imageWidh) {
|
||||
|
||||
return OcrResultPostProcessingPipeline.buildResultToPageTransform(getPageInformation(pageNumber, file), imageWidh);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private static PageInformation getPageInformation(int pageNumber, String file) {
|
||||
|
||||
try (var in = new FileInputStream(file); var doc = new PDFDoc(in)) {
|
||||
return PageInformation.fromPage(pageNumber, doc.getPage(pageNumber));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -20,7 +20,7 @@ class Type0FontMetricsProviderTest {
|
||||
|
||||
try (PDDocument document = Loader.loadPDF(new File(Type0FontMetricsProviderTest.class.getClassLoader().getResource("InvisibleText.pdf").getPath()))) {
|
||||
Type0FontMetricsProvider metricsFactory = Type0FontMetricsProvider.regular(document);
|
||||
FontMetrics fontMetrics = metricsFactory.calculateMetricsForAzureBBox("deine mutter", 100, 50);
|
||||
FontMetrics fontMetrics = metricsFactory.calculateMetrics("deine mutter", 100, 50);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -8,9 +8,6 @@ plugins {
|
||||
id("org.sonarqube") version "4.3.0.3225"
|
||||
id("io.freefair.lombok") version "8.4"
|
||||
}
|
||||
pmd {
|
||||
isConsoleOutput = true
|
||||
}
|
||||
|
||||
configurations {
|
||||
all {
|
||||
@ -27,21 +24,15 @@ dependencies {
|
||||
implementation(project(":azure-ocr-service-api"))
|
||||
|
||||
implementation("com.knecon.fforesight:tracing-commons:0.5.0")
|
||||
implementation("io.github.openfeign:feign-core:12.4")
|
||||
implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.1.1")
|
||||
implementation("org.springframework.boot:spring-boot-starter-amqp:${springBootStarterVersion}")
|
||||
|
||||
implementation("com.iqser.red.service:persistence-service-internal-api-v1:2.224.0")
|
||||
implementation("com.knecon.fforesight:tenant-commons:0.31.0")
|
||||
implementation("com.iqser.red.commons:storage-commons:2.50.0")
|
||||
|
||||
implementation("net.logstash.logback:logstash-logback-encoder:7.4")
|
||||
implementation("ch.qos.logback:logback-classic")
|
||||
|
||||
testImplementation("org.springframework.boot:spring-boot-starter-test:${springBootStarterVersion}")
|
||||
testImplementation("com.iqser.red.commons:test-commons:2.1.0")
|
||||
testImplementation("org.springframework.amqp:spring-rabbit-test:3.0.2")
|
||||
testImplementation("com.iqser.red.commons:pdftron-logic-commons:2.32.0")
|
||||
}
|
||||
|
||||
tasks.named<BootBuildImage>("bootBuildImage") {
|
||||
|
||||
@ -9,9 +9,11 @@ import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.Import;
|
||||
import org.springframework.scheduling.annotation.EnableAsync;
|
||||
|
||||
import com.iqser.red.storage.commons.StorageAutoConfiguration;
|
||||
import com.iqser.red.pdftronlogic.commons.InvisibleElementRemovalService;
|
||||
import com.iqser.red.pdftronlogic.commons.WatermarkRemovalService;
|
||||
import com.knecon.fforesight.service.ocr.processor.OcrServiceProcessorConfiguration;
|
||||
import com.knecon.fforesight.service.ocr.v1.server.configuration.MessagingConfiguration;
|
||||
import com.iqser.red.storage.commons.StorageAutoConfiguration;
|
||||
import com.knecon.fforesight.tenantcommons.MultiTenancyAutoConfiguration;
|
||||
|
||||
import io.micrometer.core.aop.TimedAspect;
|
||||
@ -41,4 +43,17 @@ public class Application {
|
||||
}
|
||||
|
||||
|
||||
@Bean
|
||||
public InvisibleElementRemovalService invisibleElementRemovalService() {
|
||||
|
||||
return new InvisibleElementRemovalService();
|
||||
}
|
||||
|
||||
|
||||
@Bean
|
||||
public WatermarkRemovalService watermarkRemovalService() {
|
||||
|
||||
return new WatermarkRemovalService();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,9 +1,5 @@
|
||||
package com.knecon.fforesight.service.ocr.v1.server.configuration;
|
||||
|
||||
import org.springframework.amqp.core.DirectExchange;
|
||||
import org.springframework.amqp.core.Queue;
|
||||
import org.springframework.amqp.core.QueueBuilder;
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
@ -12,26 +8,10 @@ import lombok.RequiredArgsConstructor;
|
||||
@RequiredArgsConstructor
|
||||
public class MessagingConfiguration {
|
||||
|
||||
public static final String OCR_REQUEST_QUEUE_PREFIX = "ocr_request";
|
||||
public static final String OCR_REQUEST_EXCHANGE = "ocr_request_exchange";
|
||||
public static final String OCR_DLQ = "ocr_error";
|
||||
public static final String OCR_RESPONSE_EXCHANGE = "ocr_response_exchange";
|
||||
public static final String OCR_STATUS_UPDATE_RESPONSE_EXCHANGE = "ocr_status_update_response_exchange";
|
||||
public static final String OCR_STATUS_UPDATE_DLQ = "ocr_status_update_error";
|
||||
|
||||
public static final String OCR_REQUEST_QUEUE = "ocr_request_queue";
|
||||
public static final String OCR_RESPONSE_QUEUE = "ocr_response_queue";
|
||||
public static final String OCR_STATUS_UPDATE_RESPONSE_QUEUE = "ocr_status_update_response_queue";
|
||||
public static final String X_ERROR_INFO_HEADER = "x-error-message";
|
||||
public static final String X_ERROR_INFO_TIMESTAMP_HEADER = "x-error-message-timestamp";
|
||||
|
||||
@Bean
|
||||
public DirectExchange ocrRequestExchange() {
|
||||
|
||||
return new DirectExchange(OCR_REQUEST_EXCHANGE);
|
||||
}
|
||||
|
||||
|
||||
@Bean
|
||||
public Queue ocrDLQ() {
|
||||
|
||||
return QueueBuilder.durable(OCR_DLQ).build();
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,32 +0,0 @@
|
||||
package com.knecon.fforesight.service.ocr.v1.server.configuration;
|
||||
|
||||
import static com.knecon.fforesight.service.ocr.v1.server.configuration.MessagingConfiguration.OCR_DLQ;
|
||||
import static com.knecon.fforesight.service.ocr.v1.server.configuration.MessagingConfiguration.OCR_REQUEST_EXCHANGE;
|
||||
import static com.knecon.fforesight.service.ocr.v1.server.configuration.MessagingConfiguration.OCR_REQUEST_QUEUE_PREFIX;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.v1.server.queue.OcrMessageReceiver;
|
||||
import com.knecon.fforesight.tenantcommons.model.TenantQueueConfiguration;
|
||||
import com.knecon.fforesight.tenantcommons.model.TenantQueueProvider;
|
||||
|
||||
@Configuration
|
||||
public class TenantQueueProviderConfig {
|
||||
|
||||
@Bean
|
||||
protected TenantQueueProvider getTenantQueueConfigs() {
|
||||
|
||||
return new TenantQueueProvider(Set.of(TenantQueueConfiguration.builder()
|
||||
.listenerId(OcrMessageReceiver.OCR_REQUEST_LISTENER_ID)
|
||||
.exchangeName(OCR_REQUEST_EXCHANGE)
|
||||
.queuePrefix(OCR_REQUEST_QUEUE_PREFIX)
|
||||
.dlqName(OCR_DLQ)
|
||||
.arguments(Map.of("x-max-priority", 2))
|
||||
.build()));
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,13 +1,10 @@
|
||||
package com.knecon.fforesight.service.ocr.v1.server.queue;
|
||||
|
||||
import java.util.Set;
|
||||
|
||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.service.IOcrMessageSender;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.AzureOcrFeature;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.DocumentRequest;
|
||||
import com.knecon.fforesight.service.ocr.v1.server.configuration.MessagingConfiguration;
|
||||
import com.knecon.fforesight.tenantcommons.TenantContext;
|
||||
@ -25,24 +22,24 @@ public class NoStatusUpdateOcrMessageSender implements IOcrMessageSender {
|
||||
RabbitTemplate rabbitTemplate;
|
||||
|
||||
|
||||
public void sendOcrFinished(String fileId, int totalImages, Set<AzureOcrFeature> features) {
|
||||
public void sendOcrFinished(String fileId, int totalImages) {
|
||||
|
||||
}
|
||||
|
||||
|
||||
public void sendOCRStarted(String fileId, Set<AzureOcrFeature> features) {
|
||||
public void sendOCRStarted(String fileId) {
|
||||
|
||||
}
|
||||
|
||||
|
||||
public void sendUpdate(String fileId, int finishedImages, int totalImages, Set<AzureOcrFeature> features) {
|
||||
public void sendUpdate(String fileId, int finishedImages, int totalImages) {
|
||||
|
||||
}
|
||||
|
||||
|
||||
public void sendOcrResponse(DocumentRequest request) {
|
||||
public void sendOcrResponse(String dossierId, String fileId) {
|
||||
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_RESPONSE_EXCHANGE, TenantContext.getTenantId(), request);
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_RESPONSE_QUEUE, TenantContext.getTenantId(), new DocumentRequest(dossierId, fileId));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -2,13 +2,11 @@ package com.knecon.fforesight.service.ocr.v1.server.queue;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.time.OffsetDateTime;
|
||||
import java.time.temporal.ChronoUnit;
|
||||
import java.util.Objects;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.slf4j.MDC;
|
||||
import org.springframework.amqp.AmqpRejectAndDontRequeueException;
|
||||
import org.springframework.amqp.core.Message;
|
||||
import org.springframework.amqp.rabbit.annotation.RabbitHandler;
|
||||
@ -17,8 +15,7 @@ import org.springframework.stereotype.Service;
|
||||
import org.springframework.util.FileSystemUtils;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.OsUtils;
|
||||
import com.knecon.fforesight.service.ocr.v1.server.FileStorageService;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.FileStorageService;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.IOcrMessageSender;
|
||||
import com.knecon.fforesight.service.ocr.processor.service.OCRService;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.DocumentRequest;
|
||||
@ -35,11 +32,6 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class OcrMessageReceiver {
|
||||
|
||||
public static final String OCR_REQUEST_LISTENER_ID = "ocr-request-listener";
|
||||
public static final String IDP_RESULT_FILE_NAME = "idpResult.json";
|
||||
public static final String VIEWER_DOCUMENT_FILE_NAME = "viewerDocument.pdf";
|
||||
public static final String DOCUMENT_FILE_NAME = "document.pdf";
|
||||
|
||||
FileStorageService fileStorageService;
|
||||
ObjectMapper objectMapper;
|
||||
OCRService ocrService;
|
||||
@ -47,7 +39,7 @@ public class OcrMessageReceiver {
|
||||
|
||||
|
||||
@RabbitHandler
|
||||
@RabbitListener(id = OCR_REQUEST_LISTENER_ID, concurrency = "1")
|
||||
@RabbitListener(queues = MessagingConfiguration.OCR_REQUEST_QUEUE, concurrency = "1")
|
||||
public void receiveOcr(Message in) throws IOException {
|
||||
|
||||
if (in.getMessageProperties().isRedelivered()) {
|
||||
@ -57,35 +49,32 @@ public class OcrMessageReceiver {
|
||||
DocumentRequest request = objectMapper.readValue(in.getBody(), DocumentRequest.class);
|
||||
String dossierId = request.getDossierId();
|
||||
String fileId = request.getFileId();
|
||||
Path runDir = Path.of(OsUtils.getTemporaryDirectory()).resolve(request.getDossierId()).resolve(request.getFileId());
|
||||
Path tmpDir = Files.createTempDirectory(null);
|
||||
|
||||
try {
|
||||
MDC.put("fileId", fileId);
|
||||
log.info("--------------------------------- Starting OCR ---------------------------------");
|
||||
log.info("Features: {}", request.getFeatures().stream().map(Objects::toString).collect(Collectors.joining(", ")));
|
||||
ocrMessageSender.sendOCRStarted(fileId, request.getFeatures());
|
||||
log.info("--------------------------------------------------------------------------");
|
||||
log.info("Start ocr for file with dossierId {} and fileId {}", dossierId, fileId);
|
||||
|
||||
File documentFile = runDir.resolve(DOCUMENT_FILE_NAME).toFile();
|
||||
File viewerDocumentFile = runDir.resolve(VIEWER_DOCUMENT_FILE_NAME).toFile();
|
||||
File idpResultFile = runDir.resolve(IDP_RESULT_FILE_NAME).toFile();
|
||||
ocrMessageSender.sendOCRStarted(fileId);
|
||||
|
||||
File documentFile = tmpDir.resolve("document.pdf").toFile();
|
||||
File viewerDocumentFile = tmpDir.resolve("viewerDocument.pdf").toFile();
|
||||
File analyzeResultFile = tmpDir.resolve("azureAnalysisResult.json").toFile();
|
||||
|
||||
fileStorageService.downloadFiles(request, documentFile);
|
||||
|
||||
ocrService.runOcrOnDocument(dossierId, fileId, request.getFeatures(), runDir, documentFile, viewerDocumentFile, idpResultFile);
|
||||
ocrService.runOcrOnDocument(dossierId, fileId, request.isRemoveWatermarks(), tmpDir, documentFile, viewerDocumentFile, analyzeResultFile);
|
||||
|
||||
fileStorageService.storeFiles(request, documentFile, viewerDocumentFile, idpResultFile);
|
||||
fileStorageService.storeFiles(request, documentFile, viewerDocumentFile, analyzeResultFile);
|
||||
|
||||
ocrMessageSender.sendOcrResponse(request);
|
||||
ocrMessageSender.sendOcrResponse(dossierId, fileId);
|
||||
} catch (Exception e) {
|
||||
log.warn("An exception occurred in ocr file stage: {}", e.getMessage());
|
||||
in.getMessageProperties().getHeaders().put(MessagingConfiguration.X_ERROR_INFO_HEADER, e.getMessage());
|
||||
in.getMessageProperties().getHeaders().put(MessagingConfiguration.X_ERROR_INFO_TIMESTAMP_HEADER, OffsetDateTime.now().truncatedTo(ChronoUnit.MILLIS));
|
||||
throw new RuntimeException(e);
|
||||
} finally {
|
||||
log.info("Done");
|
||||
log.info("--------------------------------- Done ---------------------------------");
|
||||
MDC.remove("fileId");
|
||||
FileSystemUtils.deleteRecursively(runDir);
|
||||
FileSystemUtils.deleteRecursively(tmpDir);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -1,13 +1,10 @@
|
||||
package com.knecon.fforesight.service.ocr.v1.server.queue;
|
||||
|
||||
import java.util.Set;
|
||||
|
||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.service.IOcrMessageSender;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.AzureOcrFeature;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.DocumentRequest;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.OCRStatusUpdateResponse;
|
||||
import com.knecon.fforesight.service.ocr.v1.server.configuration.MessagingConfiguration;
|
||||
@ -28,46 +25,35 @@ public class OcrMessageSender implements IOcrMessageSender {
|
||||
RabbitTemplate rabbitTemplate;
|
||||
|
||||
|
||||
public void sendOcrFinished(String fileId, int totalImages, Set<AzureOcrFeature> features) {
|
||||
public void sendOcrFinished(String fileId, int totalImages) {
|
||||
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_EXCHANGE,
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
|
||||
TenantContext.getTenantId(),
|
||||
OCRStatusUpdateResponse.builder()
|
||||
.fileId(fileId)
|
||||
.numberOfPagesToOCR(totalImages)
|
||||
.numberOfOCRedPages(totalImages)
|
||||
.ocrFinished(true)
|
||||
.features(features)
|
||||
.build());
|
||||
OCRStatusUpdateResponse.builder().fileId(fileId).numberOfPagesToOCR(totalImages).numberOfOCRedPages(totalImages).ocrFinished(true).build());
|
||||
}
|
||||
|
||||
|
||||
public void sendOCRStarted(String fileId, Set<AzureOcrFeature> features) {
|
||||
public void sendOCRStarted(String fileId) {
|
||||
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_EXCHANGE,
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
|
||||
TenantContext.getTenantId(),
|
||||
OCRStatusUpdateResponse.builder().fileId(fileId).features(features).ocrStarted(true).build());
|
||||
OCRStatusUpdateResponse.builder().fileId(fileId).ocrStarted(true).build());
|
||||
|
||||
}
|
||||
|
||||
|
||||
public void sendUpdate(String fileId, int finishedImages, int totalImages, Set<AzureOcrFeature> features) {
|
||||
public void sendUpdate(String fileId, int finishedImages, int totalImages) {
|
||||
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_EXCHANGE,
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_STATUS_UPDATE_RESPONSE_QUEUE,
|
||||
TenantContext.getTenantId(),
|
||||
OCRStatusUpdateResponse.builder()
|
||||
.fileId(fileId)
|
||||
.features(features)
|
||||
.numberOfPagesToOCR(totalImages)
|
||||
.numberOfOCRedPages(finishedImages)
|
||||
.build());
|
||||
OCRStatusUpdateResponse.builder().fileId(fileId).numberOfPagesToOCR(totalImages).numberOfOCRedPages(finishedImages).build());
|
||||
|
||||
}
|
||||
|
||||
|
||||
public void sendOcrResponse(DocumentRequest request) {
|
||||
public void sendOcrResponse(String dossierId, String fileId) {
|
||||
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_RESPONSE_EXCHANGE, TenantContext.getTenantId(), request);
|
||||
rabbitTemplate.convertAndSend(MessagingConfiguration.OCR_RESPONSE_QUEUE, TenantContext.getTenantId(), new DocumentRequest(dossierId, fileId));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
# you can list packages
|
||||
ghostscript=9.55.0~dfsg1-0ubuntu5.10
|
||||
ghostscript=9.55.0~dfsg1-0ubuntu5.9
|
||||
pkg-config
|
||||
zip
|
||||
unzip
|
||||
|
||||
@ -63,5 +63,3 @@ azure:
|
||||
|
||||
ocrService:
|
||||
sendStatusUpdates: true
|
||||
|
||||
native-libs.path: ${VCPKG_DYNAMIC_LIB}
|
||||
@ -7,21 +7,11 @@
|
||||
<include resource="org/springframework/boot/logging/logback/console-appender.xml"/>
|
||||
|
||||
<appender name="JSON" class="ch.qos.logback.core.ConsoleAppender">
|
||||
<encoder class="net.logstash.logback.encoder.LogstashEncoder">
|
||||
<pattern>%d{yyyy-MM-dd HH:mm:ss}%replace( [file:%X{fileId}]){' \[file:\]', ''} [%thread] %-5level%logger{36} - %msg%n</pattern>
|
||||
</encoder>
|
||||
</appender>
|
||||
|
||||
<appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender">
|
||||
<encoder>
|
||||
<pattern>%d{yyyy-MM-dd HH:mm:ss}%replace( [file:%X{fileId}]){' \[file:\]', ''} [%thread] %-5level%logger{36} - %msg%n</pattern>
|
||||
</encoder>
|
||||
<encoder class="net.logstash.logback.encoder.LogstashEncoder"/>
|
||||
</appender>
|
||||
|
||||
<root level="INFO">
|
||||
<appender-ref ref="${logType}"/>
|
||||
</root>
|
||||
|
||||
<logger name="com.iqser.red.pdftronlogic.commons" level="ERROR"/>
|
||||
|
||||
</configuration>
|
||||
@ -1,50 +0,0 @@
|
||||
package com.knecon.fforesight.service.ocr.v1.api.model;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
class QuadPointTest {
|
||||
|
||||
@Test
|
||||
public void testContains() {
|
||||
|
||||
var a = new Point2D.Double(0, 0);
|
||||
var b = new Point2D.Double(0, 1);
|
||||
var c = new Point2D.Double(1, 1);
|
||||
var d = new Point2D.Double(1, 0);
|
||||
var q = new QuadPoint(a, b, c, d);
|
||||
assertTrue(q.isHorizontal());
|
||||
assertFalse(q.isVertical());
|
||||
|
||||
assertTrue(q.contains(a));
|
||||
assertTrue(q.contains(b));
|
||||
assertTrue(q.contains(c));
|
||||
assertTrue(q.contains(d));
|
||||
|
||||
var p = new Point2D.Double(0.5, 0.5);
|
||||
assertTrue(q.contains(p));
|
||||
|
||||
var r = new Rectangle2D.Double(0.5, 0.5, 0.1, 0.1);
|
||||
assertTrue(q.contains(r));
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testCenter() {
|
||||
|
||||
var a = new Point2D.Double(0, 0);
|
||||
var b = new Point2D.Double(1, 1);
|
||||
var c = new Point2D.Double(2, 1);
|
||||
var d = new Point2D.Double(1, 0);
|
||||
var q = new QuadPoint(a, b, c, d);
|
||||
assertTrue(q.isHorizontal());
|
||||
assertFalse(q.isVertical());
|
||||
assertEquals(QuadPoint.Direction.RIGHT, q.getDirection());
|
||||
assertEquals(new Point2D.Double(1, 0.5), q.getCenter());
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,9 +1,6 @@
|
||||
package com.knecon.fforesight.service.ocr.v1.server;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
import static org.mockito.ArgumentMatchers.any;
|
||||
import static org.mockito.Mockito.mock;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
import org.junit.jupiter.api.AfterAll;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
@ -11,10 +8,7 @@ import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.extension.ExtendWith;
|
||||
import org.mockito.MockitoAnnotations;
|
||||
import org.mockito.junit.jupiter.MockitoExtension;
|
||||
import org.springframework.amqp.rabbit.core.RabbitAdmin;
|
||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||
import org.springframework.amqp.rabbit.listener.MessageListenerContainer;
|
||||
import org.springframework.amqp.rabbit.listener.RabbitListenerEndpointRegistry;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
|
||||
import org.springframework.boot.autoconfigure.amqp.RabbitAutoConfiguration;
|
||||
@ -58,9 +52,6 @@ public class AbstractTest {
|
||||
@MockBean
|
||||
protected RabbitTemplate rabbitTemplate;
|
||||
|
||||
@MockBean
|
||||
private RabbitAdmin rabbitAdmin;
|
||||
|
||||
private static String pdftronLicense;
|
||||
|
||||
|
||||
@ -109,16 +100,6 @@ public class AbstractTest {
|
||||
@ComponentScan(excludeFilters = {@ComponentScan.Filter(type = FilterType.ASSIGNABLE_TYPE, value = StorageAutoConfiguration.class)})
|
||||
public static class TestConfiguration {
|
||||
|
||||
@Bean
|
||||
public RabbitListenerEndpointRegistry rabbitListenerEndpointRegistry() {
|
||||
|
||||
var mock = mock(RabbitListenerEndpointRegistry.class);
|
||||
when(mock.getListenerContainer(any())).thenReturn(mock(MessageListenerContainer.class));
|
||||
|
||||
return mock;
|
||||
}
|
||||
|
||||
|
||||
@Bean
|
||||
@Primary
|
||||
public StorageService inMemoryStorage() {
|
||||
|
||||
@ -1,9 +1,6 @@
|
||||
package com.knecon.fforesight.service.ocr.v1.server;
|
||||
|
||||
import static com.iqser.red.pdftronlogic.commons.PdfTextExtraction.extractAllTextFromDocument;
|
||||
import static com.knecon.fforesight.service.ocr.v1.server.queue.OcrMessageReceiver.DOCUMENT_FILE_NAME;
|
||||
import static com.knecon.fforesight.service.ocr.v1.server.queue.OcrMessageReceiver.IDP_RESULT_FILE_NAME;
|
||||
import static com.knecon.fforesight.service.ocr.v1.server.queue.OcrMessageReceiver.VIEWER_DOCUMENT_FILE_NAME;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
@ -12,28 +9,23 @@ import java.nio.file.Path;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.slf4j.MDC;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.boot.test.context.SpringBootTest;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.processor.service.OCRService;
|
||||
import com.knecon.fforesight.service.ocr.processor.utils.OsUtils;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.AzureOcrFeature;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
// in order to run, the azure.key must be set first in the application.yml and you must set the env variable VCPKG_DYNAMIC_LIB to your tesseract and leptonica installation folder
|
||||
@Disabled
|
||||
@Disabled // in order to run, the azure.key must be set first in the application.yml
|
||||
@SpringBootTest()
|
||||
public class OcrServiceIntegrationTest extends AbstractTest {
|
||||
|
||||
public static final Set<AzureOcrFeature> FEATURES = Set.of(AzureOcrFeature.ROTATION_CORRECTION, AzureOcrFeature.FONT_STYLE_DETECTION, AzureOcrFeature.IDP);
|
||||
@Autowired
|
||||
private OCRService ocrService;
|
||||
|
||||
@ -42,7 +34,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
||||
@SneakyThrows
|
||||
public void testOcrWith2000PageFile() {
|
||||
|
||||
testOCR("/home/kschuettler/Dokumente/TestFiles/OCR/brokenText.pdf");
|
||||
testOCR("/home/kschuettler/Dokumente/TestFiles/OCR/VV-331340-first100.pdf");
|
||||
}
|
||||
|
||||
|
||||
@ -58,7 +50,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
||||
@SneakyThrows
|
||||
public void testOcrWithFile() {
|
||||
|
||||
testOCR("/home/kschuettler/Dokumente/TestFiles/OCR/TestSet/VV-331340-first100.pdf");
|
||||
testOCR("/home/kschuettler/Dokumente/TestFiles/syn-dm-testfiles/1.A16148F - Toxicidade oral aguda.pdf");
|
||||
}
|
||||
|
||||
|
||||
@ -66,7 +58,7 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
||||
@SneakyThrows
|
||||
public void testOcrWithFolder() {
|
||||
|
||||
String dir = "/home/kschuettler/Dokumente/TestFiles/OCR/TestSet";
|
||||
String dir = "/home/kschuettler/Dokumente/TestFiles/BASF/Documine_Test_docs/2013-1110704.pdf";
|
||||
List<File> foundFiles = Files.walk(Path.of(dir))
|
||||
.sorted(Comparator.comparingLong(this::getFileSize))
|
||||
.map(Path::toFile)
|
||||
@ -105,21 +97,18 @@ public class OcrServiceIntegrationTest extends AbstractTest {
|
||||
@SneakyThrows
|
||||
private String testOCR(File file) {
|
||||
|
||||
MDC.put("fileId", "test");
|
||||
|
||||
Path tmpDir = Path.of(OsUtils.getTemporaryDirectory()).resolve("OCR_TEST").resolve(file.toPath().getFileName());
|
||||
|
||||
assert tmpDir.toFile().exists() || tmpDir.toFile().mkdirs();
|
||||
|
||||
var documentFile = tmpDir.resolve(Path.of(DOCUMENT_FILE_NAME));
|
||||
var viewerDocumentFile = tmpDir.resolve(Path.of(VIEWER_DOCUMENT_FILE_NAME));
|
||||
var analyzeResultFile = tmpDir.resolve(Path.of(IDP_RESULT_FILE_NAME));
|
||||
var documentFile = tmpDir.resolve(Path.of("document.pdf"));
|
||||
var viewerDocumentFile = tmpDir.resolve(Path.of("viewerDocument.pdf"));
|
||||
var analyzeResultFile = tmpDir.resolve(Path.of("azureAnalysisResult.json"));
|
||||
|
||||
Files.copy(file.toPath(), documentFile, StandardCopyOption.REPLACE_EXISTING);
|
||||
Files.copy(file.toPath(), viewerDocumentFile, StandardCopyOption.REPLACE_EXISTING);
|
||||
|
||||
ocrService.runOcrOnDocument(TEST_DOSSIER_ID, "file", FEATURES, tmpDir, documentFile.toFile(), viewerDocumentFile.toFile(), analyzeResultFile.toFile());
|
||||
MDC.remove("fileId");
|
||||
ocrService.runOcrOnDocument(TEST_DOSSIER_ID, "file", false, tmpDir, documentFile.toFile(), viewerDocumentFile.toFile(), analyzeResultFile.toFile());
|
||||
System.out.println("File:" + documentFile);
|
||||
System.out.println("\n\n");
|
||||
try (var fileStream = new FileInputStream(documentFile.toFile())) {
|
||||
|
||||
@ -2,16 +2,12 @@ persistence-service.url: "http://persistence-service-v1:8080"
|
||||
|
||||
pdftron.license: demo:1650351709282:7bd235e003000000004ec28a6743e1163a085e2115de2536ab6e2cfe5a
|
||||
azure:
|
||||
endpoint: https://ff-ocr-dev.cognitiveservices.azure.com/
|
||||
key: 444fe2f83e9c48da8e588c7bd5295309 # find key in Bitwarden under: Azure IDP Test Key
|
||||
native-libs:
|
||||
|
||||
endpoint: https://ff-ocr-test.cognitiveservices.azure.com/
|
||||
key: # find key in Bitwarden under: Azure IDP Test Key
|
||||
|
||||
logging.type: ${LOGGING_TYPE:CONSOLE}
|
||||
|
||||
ocrService:
|
||||
sendStatusUpdates: false
|
||||
debug: true
|
||||
ocrService.sendStatusUpdates: false
|
||||
|
||||
management:
|
||||
endpoint:
|
||||
@ -21,5 +17,4 @@ management:
|
||||
endpoints.web.exposure.include: prometheus, health, metrics
|
||||
metrics.export.prometheus.enabled: true
|
||||
|
||||
POD_NAME: azure-ocr-service
|
||||
native-libs.path: /home/kschuettler/software/leptonica/vcpkg/installed/x64-linux-dynamic/lib/
|
||||
POD_NAME: azure-ocr-service
|
||||
@ -15,7 +15,6 @@
|
||||
<exclude name="NullAssignment"/>
|
||||
<exclude name="AssignmentInOperand"/>
|
||||
<exclude name="BeanMembersShouldSerialize"/>
|
||||
<exclude name="AvoidFieldNameMatchingMethodName"/>
|
||||
</rule>
|
||||
|
||||
</ruleset>
|
||||
|
||||
@ -17,7 +17,6 @@
|
||||
<exclude name="AssignmentInOperand"/>
|
||||
<exclude name="TestClassWithoutTestCases"/>
|
||||
<exclude name="BeanMembersShouldSerialize"/>
|
||||
<exclude name="AvoidFieldNameMatchingMethodName"/>
|
||||
</rule>
|
||||
|
||||
</ruleset>
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user