Compare commits

...

8 Commits

Author SHA1 Message Date
Dominique Eifländer
ca69d3f5dc Merge branch 'RED-9760-4.1' into 'release/0.142.x'
RED-9760: Fixed missing newLines

See merge request fforesight/layout-parser!185
2024-07-30 16:06:30 +02:00
Dominique Eifländer
366c12bab0 RED-9760: Fixed missing newLines 2024-07-30 15:46:40 +02:00
Andrei Isvoran
4f3f72bcbc Merge branch 'RED-9607' into 'release/0.142.x'
RED-9607 - Correctly determine text position sequence based on file rotation

See merge request fforesight/layout-parser!183
2024-07-25 14:11:03 +02:00
Andrei Isvoran
6133b142cf RED-9607 - Correctly determine text position sequence based on file rotation 2024-07-24 16:32:54 +03:00
Kilian Schüttler
c3d24393ea Merge branch 'RED-8800-bp' into 'release/0.142.x'
Red 8800 bp

See merge request fforesight/layout-parser!180
2024-07-15 17:58:59 +02:00
Kilian Schüttler
d4500b879b Red 8800 bp 2024-07-15 17:58:59 +02:00
Andrei Isvoran
44277572ab Merge branch 'RED-9496-shutdown' into 'release/0.142.x'
RED-9496 - Implement graceful shutdown

See merge request fforesight/layout-parser!175
2024-07-04 14:25:58 +02:00
Andrei Isvoran
cbf809316b RED-9496 - Implement graceful shutdown 2024-07-04 14:19:19 +03:00
9 changed files with 120 additions and 27 deletions

View File

@ -1,5 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text; package com.knecon.fforesight.service.layoutparser.processor.model.text;
import static com.knecon.fforesight.service.layoutparser.processor.utils.ParsingConstants.NEW_LINE_TEXT_HEIGHT_PERCENTAGE;
import java.awt.geom.Rectangle2D; import java.awt.geom.Rectangle2D;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
@ -208,7 +210,7 @@ public class TextPageBlock extends AbstractPageBlock {
TextPositionSequence previous = null; TextPositionSequence previous = null;
for (TextPositionSequence word : sequences) { for (TextPositionSequence word : sequences) {
if (previous != null) { if (previous != null) {
if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) { if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight() * NEW_LINE_TEXT_HEIGHT_PERCENTAGE) {
sb.append('\n'); sb.append('\n');
} else { } else {
sb.append(' '); sb.append(' ');
@ -228,7 +230,7 @@ public class TextPageBlock extends AbstractPageBlock {
TextPositionSequence previous = null; TextPositionSequence previous = null;
for (TextPositionSequence word : sequences) { for (TextPositionSequence word : sequences) {
if (previous != null) { if (previous != null) {
if (word.getMaxYDirAdj() - previous.getMaxYDirAdj() > word.getTextHeight()) { if (word.getMaxYDirAdj() - previous.getMaxYDirAdj() > word.getTextHeight() * NEW_LINE_TEXT_HEIGHT_PERCENTAGE) {
numberOfLines++; numberOfLines++;
} }
} }

View File

@ -1,5 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.services.factory; package com.knecon.fforesight.service.layoutparser.processor.services.factory;
import static com.knecon.fforesight.service.layoutparser.processor.utils.ParsingConstants.NEW_LINE_TEXT_HEIGHT_PERCENTAGE;
import java.awt.geom.AffineTransform; import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D; import java.awt.geom.Rectangle2D;
import java.util.Collection; import java.util.Collection;
@ -67,7 +69,6 @@ public class SearchTextWithTextPositionFactory {
++context.stringIdx; ++context.stringIdx;
} }
List<Rectangle2D> positions = sequences.stream() List<Rectangle2D> positions = sequences.stream()
.map(TextPositionSequence::getTextPositions) .map(TextPositionSequence::getTextPositions)
.flatMap(Collection::stream) .flatMap(Collection::stream)
@ -161,7 +162,7 @@ public class SearchTextWithTextPositionFactory {
} }
double deltaY = Math.abs(currentPosition.getYDirAdj() - previousPosition.getYDirAdj()); double deltaY = Math.abs(currentPosition.getYDirAdj() - previousPosition.getYDirAdj());
return deltaY >= currentPosition.getHeightDir(); return deltaY >= currentPosition.getHeightDir() * NEW_LINE_TEXT_HEIGHT_PERCENTAGE;
} }
@ -191,9 +192,9 @@ public class SearchTextWithTextPositionFactory {
float textHeight = sequence.getTextHeight() + HEIGHT_PADDING; float textHeight = sequence.getTextHeight() + HEIGHT_PADDING;
Rectangle2D rectangle2D = new Rectangle2D.Double(textPosition.getXDirAdj(), Rectangle2D rectangle2D = new Rectangle2D.Double(textPosition.getXDirAdj(),
textPosition.getYDirAdj() - textHeight, textPosition.getYDirAdj() - textHeight,
textPosition.getWidthDirAdj(), textPosition.getWidthDirAdj(),
textHeight + HEIGHT_PADDING); textHeight + HEIGHT_PADDING);
AffineTransform transform = new AffineTransform(); AffineTransform transform = new AffineTransform();

View File

@ -82,6 +82,7 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
private int pageRotation; private int pageRotation;
private PDRectangle pageSize; private PDRectangle pageSize;
private Matrix translateMatrix;
private final GlyphList glyphList; private final GlyphList glyphList;
private final Map<COSDictionary, Float> fontHeightMap = new WeakHashMap<COSDictionary, Float>(); private final Map<COSDictionary, Float> fontHeightMap = new WeakHashMap<COSDictionary, Float>();
@ -133,6 +134,12 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
this.pageRotation = page.getRotation(); this.pageRotation = page.getRotation();
this.pageSize = page.getCropBox(); this.pageSize = page.getCropBox();
if (pageSize.getLowerLeftX() == 0 && pageSize.getLowerLeftY() == 0) {
translateMatrix = null;
} else {
// translation matrix for cropbox
translateMatrix = Matrix.getTranslateInstance(-pageSize.getLowerLeftX(), -pageSize.getLowerLeftY());
}
super.processPage(page); super.processPage(page);
} }
@ -257,13 +264,22 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
return; return;
} }
} }
// adjust for cropbox if needed
Matrix translatedTextRenderingMatrix;
if (translateMatrix == null) {
translatedTextRenderingMatrix = textRenderingMatrix;
} else {
translatedTextRenderingMatrix = Matrix.concatenate(translateMatrix, textRenderingMatrix);
nextX -= pageSize.getLowerLeftX();
nextY -= pageSize.getLowerLeftY();
}
// This is a hack for unicode letter with 2 chars e.g. RA see unicodeProblem.pdf // This is a hack for unicode letter with 2 chars e.g. RA see unicodeProblem.pdf
if (unicodeMapping.length() == 2) { if (unicodeMapping.length() == 2) {
processTextPosition(new TextPosition(pageRotation, processTextPosition(new TextPosition(pageRotation,
pageSize.getWidth(), pageSize.getWidth(),
pageSize.getHeight(), pageSize.getHeight(),
textRenderingMatrix, translatedTextRenderingMatrix,
nextX, nextX,
nextY, nextY,
Math.abs(dyDisplay), Math.abs(dyDisplay),
@ -277,7 +293,7 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
processTextPosition(new TextPosition(pageRotation, processTextPosition(new TextPosition(pageRotation,
pageSize.getWidth(), pageSize.getWidth(),
pageSize.getHeight(), pageSize.getHeight(),
textRenderingMatrix, translatedTextRenderingMatrix,
nextX, nextX,
nextY, nextY,
Math.abs(dyDisplay), Math.abs(dyDisplay),
@ -293,7 +309,7 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
processTextPosition(new TextPosition(pageRotation, processTextPosition(new TextPosition(pageRotation,
pageSize.getWidth(), pageSize.getWidth(),
pageSize.getHeight(), pageSize.getHeight(),
textRenderingMatrix, translatedTextRenderingMatrix,
nextX, nextX,
nextY, nextY,
Math.abs(dyDisplay), Math.abs(dyDisplay),

View File

@ -0,0 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
/**
 * Holder for numeric constants shared by the layout parser's text processing.
 * The class only carries static constants, so it is final and cannot be instantiated.
 */
public final class ParsingConstants {

    /**
     * Factor (0.6, i.e. 60 %) applied to a text height before it is compared with the vertical
     * distance between two consecutive text elements when detecting a new line.
     */
    public static final float NEW_LINE_TEXT_HEIGHT_PERCENTAGE = 0.6f;


    private ParsingConstants() {
        // constants holder, never instantiated
    }

}

View File

@ -51,13 +51,47 @@ public class TextPositionSequenceComparator implements Comparator<TextPositionSe
double yDifference = Math.abs(pos1YBottom - pos2YBottom); double yDifference = Math.abs(pos1YBottom - pos2YBottom);
// we will do a simple tolerance comparison // Adjust for text rotation
if (yDifference < .1 || pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom || pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom) { switch (pos1.getRotation()) {
return Double.compare(x1, x2); case 0:
} else if (pos1YBottom < pos2YBottom) { // 0 degrees (horizontal, top to bottom and left to right): Sort primarily by y-coordinates from top to bottom (pos1YBottom < pos2YBottom).
return -1; if (yDifference < .1 || (pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom) || (pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom)) {
} else { return Double.compare(x1, x2);
return 1; } else if (pos1YBottom < pos2YBottom) {
return -1;
} else {
return 1;
}
case 90:
// 90 degrees (vertical, right to left): Sort by x-coordinates first (x1 > x2), then by y-coordinates from top to bottom (pos1YBottom < pos2YBottom).
if (x1 > x2) {
return -1;
} else if (x1 < x2) {
return 1;
} else {
return Double.compare(pos1YBottom, pos2YBottom);
}
case 180:
// 180 degrees (horizontal, bottom to top and right to left): Sort primarily by y-coordinates from bottom to top (pos1YBottom > pos2YBottom).
if (yDifference < .1 || (pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom) || (pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom)) {
return Double.compare(x2, x1);
} else if (pos1YBottom > pos2YBottom) {
return -1;
} else {
return 1;
}
case 270:
// 270 degrees (vertical, left to right): Sort by x-coordinates in reverse (x2 > x1), then by y-coordinates from bottom to top (pos2YBottom > pos1YBottom).
if (x2 > x1) {
return -1;
} else if (x2 < x1) {
return 1;
} else {
return Double.compare(pos2YBottom, pos1YBottom);
}
default:
throw new RuntimeException("Rotation not supported. Only 0/90/180/270 degree rotation is supported.");
} }
} }

View File

@ -32,6 +32,7 @@ dependencies {
implementation("com.iqser.red.commons:storage-commons:2.45.0") implementation("com.iqser.red.commons:storage-commons:2.45.0")
implementation("com.knecon.fforesight:tenant-commons:0.21.0") implementation("com.knecon.fforesight:tenant-commons:0.21.0")
implementation("com.knecon.fforesight:tracing-commons:0.5.0") implementation("com.knecon.fforesight:tracing-commons:0.5.0")
implementation("com.knecon.fforesight:lifecycle-commons:0.6.0")
implementation("org.springframework.boot:spring-boot-starter-actuator:${springBootStarterVersion}") implementation("org.springframework.boot:spring-boot-starter-actuator:${springBootStarterVersion}")
implementation("org.springframework.boot:spring-boot-starter-amqp:${springBootStarterVersion}") implementation("org.springframework.boot:spring-boot-starter-amqp:${springBootStarterVersion}")
implementation("com.amazonaws:aws-java-sdk-s3:1.12.536") implementation("com.amazonaws:aws-java-sdk-s3:1.12.536")

View File

@ -6,19 +6,22 @@ import org.springframework.boot.autoconfigure.ImportAutoConfiguration;
import org.springframework.boot.autoconfigure.SpringBootApplication; import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.boot.autoconfigure.security.servlet.SecurityAutoConfiguration; import org.springframework.boot.autoconfigure.security.servlet.SecurityAutoConfiguration;
import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.EnableAspectJAutoProxy;
import org.springframework.context.annotation.Import; import org.springframework.context.annotation.Import;
import com.amazonaws.services.s3.model.metrics.MetricsConfiguration; import com.amazonaws.services.s3.model.metrics.MetricsConfiguration;
import com.iqser.red.storage.commons.StorageAutoConfiguration; import com.iqser.red.storage.commons.StorageAutoConfiguration;
import com.knecon.fforesight.lifecyclecommons.LifecycleAutoconfiguration;
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingServiceProcessorConfiguration; import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingServiceProcessorConfiguration;
import com.knecon.fforesight.tenantcommons.MultiTenancyAutoConfiguration; import com.knecon.fforesight.tenantcommons.MultiTenancyAutoConfiguration;
import io.micrometer.observation.ObservationRegistry; import io.micrometer.observation.ObservationRegistry;
import io.micrometer.observation.aop.ObservedAspect; import io.micrometer.observation.aop.ObservedAspect;
@ImportAutoConfiguration({MultiTenancyAutoConfiguration.class}) @ImportAutoConfiguration({MultiTenancyAutoConfiguration.class, LifecycleAutoconfiguration.class})
@Import({MetricsConfiguration.class, StorageAutoConfiguration.class, LayoutParsingServiceProcessorConfiguration.class}) @Import({MetricsConfiguration.class, StorageAutoConfiguration.class, LayoutParsingServiceProcessorConfiguration.class})
@SpringBootApplication(exclude = {SecurityAutoConfiguration.class, ManagementWebSecurityAutoConfiguration.class}) @SpringBootApplication(exclude = {SecurityAutoConfiguration.class, ManagementWebSecurityAutoConfiguration.class})
@EnableAspectJAutoProxy
public class Application { public class Application {
public static void main(String[] args) { public static void main(String[] args) {

View File

@ -15,6 +15,9 @@ project.version: 1.0-SNAPSHOT
server: server:
port: 8080 port: 8080
lifecycle:
base-package: com.knecon.fforesight.service.layoutparser
spring: spring:
application: application:
name: layoutparser-service name: layoutparser-service

View File

@ -1,6 +1,7 @@
package com.knecon.fforesight.service.viewerdoc.service; package com.knecon.fforesight.service.viewerdoc.service;
import java.awt.geom.AffineTransform; import java.awt.geom.AffineTransform;
import java.awt.geom.Line2D;
import java.awt.geom.Rectangle2D; import java.awt.geom.Rectangle2D;
import java.io.File; import java.io.File;
import java.io.FileOutputStream; import java.io.FileOutputStream;
@ -84,6 +85,7 @@ public class ViewerDocumentService {
pdPage.setContents(ContentStreamUtility.removeLayerFromContentStreams(allLayers, classifiers)); pdPage.setContents(ContentStreamUtility.removeLayerFromContentStreams(allLayers, classifiers));
AffineTransform textDeRotationMatrix = getTextDeRotationTransform(pdPage); AffineTransform textDeRotationMatrix = getTextDeRotationTransform(pdPage);
AffineTransform pageTransformationMatrix = getPageTransformationMatrix(pdPage);
if (!ContentStreamClassifier.areAllContentStreamsEscaped(classifiers)) { if (!ContentStreamClassifier.areAllContentStreamsEscaped(classifiers)) {
// We need to save the graphics state before, such that our appended content cannot be affected by previous content streams with side effects, // We need to save the graphics state before, such that our appended content cannot be affected by previous content streams with side effects,
@ -106,7 +108,11 @@ public class ViewerDocumentService {
contentStream.saveGraphicsState(); contentStream.saveGraphicsState();
drawVisualizationsToContentStream(pdDocument, visualization.getVisualizationsOnPages().get(pageNumber), contentStream, textDeRotationMatrix); drawVisualizationsToContentStream(pdDocument,
visualization.getVisualizationsOnPages().get(pageNumber),
contentStream,
textDeRotationMatrix,
pageTransformationMatrix);
contentStream.restoreGraphicsState(); contentStream.restoreGraphicsState();
@ -133,6 +139,12 @@ public class ViewerDocumentService {
} }
/**
 * Creates a translation-only transform whose offset is the lower-left corner of the page's crop box.
 *
 * @param pdPage page whose crop box supplies the offset
 * @return transform with identity scale/shear and the crop box's lower-left x/y as translation
 */
private AffineTransform getPageTransformationMatrix(PDPage pdPage) {

    var offsetX = pdPage.getCropBox().getLowerLeftX();
    var offsetY = pdPage.getCropBox().getLowerLeftY();
    return AffineTransform.getTranslateInstance(offsetX, offsetY);
}
private static Map<ContentStreams.Identifier, PDOptionalContentGroup> addLayersToDocument(List<Visualizations> visualizations, PDDocument pdDocument) { private static Map<ContentStreams.Identifier, PDOptionalContentGroup> addLayersToDocument(List<Visualizations> visualizations, PDDocument pdDocument) {
Map<ContentStreams.Identifier, PDOptionalContentGroup> optionalContentGroupMap = new HashMap<>(); Map<ContentStreams.Identifier, PDOptionalContentGroup> optionalContentGroupMap = new HashMap<>();
@ -147,7 +159,8 @@ public class ViewerDocumentService {
private static void drawVisualizationsToContentStream(PDDocument pdDocument, private static void drawVisualizationsToContentStream(PDDocument pdDocument,
VisualizationsOnPage visualizationsOnPage, VisualizationsOnPage visualizationsOnPage,
PDPageContentStream contentStream, PDPageContentStream contentStream,
AffineTransform textDeRotationMatrix) throws IOException { AffineTransform textDeRotationMatrix,
AffineTransform pageTransformationMatrix) throws IOException {
if (visualizationsOnPage.isMakePathsInvisible()) { if (visualizationsOnPage.isMakePathsInvisible()) {
contentStream.addRect(0, 0, 1, 1); contentStream.addRect(0, 0, 1, 1);
@ -155,17 +168,18 @@ public class ViewerDocumentService {
} }
for (ColoredLine coloredLine : visualizationsOnPage.getColoredLines()) { for (ColoredLine coloredLine : visualizationsOnPage.getColoredLines()) {
Line2D line = transformLine(coloredLine.line(), pageTransformationMatrix);
contentStream.setLineWidth(coloredLine.lineWidth()); contentStream.setLineWidth(coloredLine.lineWidth());
contentStream.setStrokingColor(coloredLine.color()); contentStream.setStrokingColor(coloredLine.color());
contentStream.moveTo((float) coloredLine.line().getX1(), (float) coloredLine.line().getY1()); contentStream.moveTo((float) line.getX1(), (float) line.getY1());
contentStream.lineTo((float) coloredLine.line().getX2(), (float) coloredLine.line().getY2()); contentStream.lineTo((float) line.getX2(), (float) line.getY2());
contentStream.stroke(); contentStream.stroke();
} }
for (ColoredRectangle coloredRectangle : visualizationsOnPage.getColoredRectangles()) { for (ColoredRectangle coloredRectangle : visualizationsOnPage.getColoredRectangles()) {
contentStream.setLineWidth(coloredRectangle.lineWidth()); contentStream.setLineWidth(coloredRectangle.lineWidth());
contentStream.setStrokingColor(coloredRectangle.color()); contentStream.setStrokingColor(coloredRectangle.color());
Rectangle2D r = coloredRectangle.rectangle2D(); Rectangle2D r = transformRect(coloredRectangle.rectangle2D(), pageTransformationMatrix);
contentStream.addRect((float) r.getX(), (float) r.getY(), (float) r.getWidth(), (float) r.getHeight()); contentStream.addRect((float) r.getX(), (float) r.getY(), (float) r.getWidth(), (float) r.getHeight());
contentStream.stroke(); contentStream.stroke();
} }
@ -175,7 +189,7 @@ public class ViewerDocumentService {
PDExtendedGraphicsState graphicsState = new PDExtendedGraphicsState(); PDExtendedGraphicsState graphicsState = new PDExtendedGraphicsState();
graphicsState.setNonStrokingAlphaConstant(filledRectangle.alpha()); graphicsState.setNonStrokingAlphaConstant(filledRectangle.alpha());
contentStream.setGraphicsStateParameters(graphicsState); contentStream.setGraphicsStateParameters(graphicsState);
Rectangle2D r = filledRectangle.rectangle2D(); Rectangle2D r = transformRect(filledRectangle.rectangle2D(), pageTransformationMatrix);
contentStream.addRect((float) r.getX(), (float) r.getY(), (float) r.getWidth(), (float) r.getHeight()); contentStream.addRect((float) r.getX(), (float) r.getY(), (float) r.getWidth(), (float) r.getHeight());
contentStream.fill(); contentStream.fill();
} }
@ -190,7 +204,7 @@ public class ViewerDocumentService {
} else { } else {
contentStream.setRenderingMode(RenderingMode.FILL); contentStream.setRenderingMode(RenderingMode.FILL);
} }
Matrix textMatrix = getTextMatrix(placedText, textDeRotationMatrix); Matrix textMatrix = getTextMatrix(placedText, textDeRotationMatrix, pageTransformationMatrix);
contentStream.setTextMatrix(textMatrix); contentStream.setTextMatrix(textMatrix);
contentStream.showText(placedText.text()); contentStream.showText(placedText.text());
contentStream.endText(); contentStream.endText();
@ -223,7 +237,7 @@ public class ViewerDocumentService {
} }
private static Matrix getTextMatrix(PlacedText placedText, AffineTransform textDeRotationMatrix) { private static Matrix getTextMatrix(PlacedText placedText, AffineTransform textDeRotationMatrix, AffineTransform pageTransformationMatrix) {
Matrix textMatrix; Matrix textMatrix;
if (placedText.textMatrix().isEmpty()) { if (placedText.textMatrix().isEmpty()) {
@ -236,7 +250,7 @@ public class ViewerDocumentService {
} else { } else {
textMatrix = placedText.textMatrix().get(); textMatrix = placedText.textMatrix().get();
} }
return textMatrix; return new Matrix(pageTransformationMatrix).multiply(textMatrix);
} }
@ -313,4 +327,16 @@ public class ViewerDocumentService {
}); });
} }
/**
 * Applies the given transform to both end points of a line.
 *
 * @param line               segment to transform; it is not modified
 * @param pageTransformation transform applied to each end point
 * @return a new segment connecting the two transformed end points
 */
private static Line2D transformLine(Line2D line, AffineTransform pageTransformation) {

    var start = pageTransformation.transform(line.getP1(), null);
    var end = pageTransformation.transform(line.getP2(), null);
    return new Line2D.Double(start, end);
}
/**
 * Applies the given transform to a rectangle and returns the bounding box of the result.
 *
 * @param r                  rectangle to transform
 * @param pageTransformation transform applied to the rectangle's outline
 * @return bounds of the transformed outline
 */
private static Rectangle2D transformRect(Rectangle2D r, AffineTransform pageTransformation) {

    var transformedOutline = pageTransformation.createTransformedShape(r);
    return transformedOutline.getBounds2D();
}
} }