RED-8825: general layoutparsing improvements

* fixing a bunch of coordinates
This commit is contained in:
Kilian Schuettler 2024-05-03 00:04:17 +02:00
parent a3decd292d
commit 60acbac53f
14 changed files with 70 additions and 36 deletions

View File

@ -263,6 +263,7 @@ public class LayoutParsingPipeline {
boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270);
PDRectangle cropbox = pdPage.getCropBox();
classificationDocument.getVisualizations().addRulingVisualization(stripper.getRulings(), pageNumber);
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), PageInformation.fromPDPage(pageNumber, pdPage));

View File

@ -27,8 +27,8 @@ public class Character {
public Character(RedTextPosition chunk) {
this.x = chunk.getDirectionAdjustedPosition().getCenterX();
this.y = chunk.getDirectionAdjustedPosition().getCenterY();
this.x = chunk.getBBoxDirAdj().getCenterX();
this.y = chunk.getBBoxDirAdj().getCenterY();
this.textPosition = chunk;
}

View File

@ -21,7 +21,8 @@ import lombok.SneakyThrows;
public class RedTextPosition extends BoundingBox {
public final static int HEIGHT_PADDING = 2;
private Rectangle2D.Float directionAdjustedPosition; // adjusted to text rotation
private Rectangle2D.Float bBoxDirAdj; // adjusted to text rotation
@JsonIgnore
private int rotation;
@ -72,7 +73,7 @@ public class RedTextPosition extends BoundingBox {
textPosition.getYDirAdj() - textHeight,
textPosition.getWidthDirAdj(),
textHeight + HEIGHT_PADDING);
pos.setDirectionAdjustedPosition(dirAdjPosition);
pos.setBBoxDirAdj(dirAdjPosition);
AffineTransform affineTransform = getRotationMatrix(TextDirection.fromDegrees(textPosition.getDir()), textPosition.getPageWidth(), textPosition.getPageHeight());
Rectangle2D initialUserSpacePositionRect = affineTransform.createTransformedShape(dirAdjPosition).getBounds2D();
@ -83,6 +84,7 @@ public class RedTextPosition extends BoundingBox {
}
private static AffineTransform getRotationMatrix(TextDirection textDirection, float pageWidth, float pageHeight) {
AffineTransform transform = new AffineTransform();
@ -105,28 +107,28 @@ public class RedTextPosition extends BoundingBox {
@JsonIgnore
public float getXDirAdj() {
return this.directionAdjustedPosition.x;
return this.bBoxDirAdj.x;
}
@JsonIgnore
public float getYDirAdj() {
return this.directionAdjustedPosition.y;
return this.bBoxDirAdj.y;
}
@JsonIgnore
public float getWidthDirAdj() {
return this.directionAdjustedPosition.width;
return this.bBoxDirAdj.width;
}
@JsonIgnore
public float getHeightDir() {
return this.directionAdjustedPosition.height;
return this.bBoxDirAdj.height;
}
}

View File

@ -1,5 +1,6 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
@ -8,6 +9,7 @@ import java.util.stream.Collectors;
import org.apache.pdfbox.text.TextPosition;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.AllArgsConstructor;
import lombok.Builder;
@ -32,6 +34,7 @@ public class TextPositionSequence extends BoundingBox implements CharSequence {
@EqualsAndHashCode.Include
private List<RedTextPosition> textPositions = new ArrayList<>();
private Rectangle2D bBoxDirAdj;
@EqualsAndHashCode.Include
private TextDirection dir;
private int rotation;
@ -53,6 +56,15 @@ public class TextPositionSequence extends BoundingBox implements CharSequence {
this.pageHeight = textPositions.get(0).getPageHeight();
this.pageWidth = textPositions.get(0).getPageWidth();
this.isParagraphStart = isParagraphStart;
calculateBBox();
}
/**
 * Recomputes the bounding boxes of this sequence from its text positions.
 *
 * <p>First the direction-adjusted boxes of all text positions are folded into a single
 * rectangle via {@code RectangleTransformations.collectBBox()} and stored in
 * {@code bBoxDirAdj}; afterwards {@code setToBBoxOfComponents} is invoked with the text
 * positions (presumably refreshing the boxes kept by {@code BoundingBox} - confirm there).
 */
private void calculateBBox() {
    Rectangle2D dirAdjBounds = textPositions.stream()
            .map(RedTextPosition::getBBoxDirAdj)
            .collect(RectangleTransformations.collectBBox());
    this.bBoxDirAdj = dirAdjBounds;

    setToBBoxOfComponents(getTextPositions());
}
@ -65,7 +77,7 @@ public class TextPositionSequence extends BoundingBox implements CharSequence {
this.rotation = textPositions.get(0).getRotation();
this.pageHeight = textPositions.get(0).getPageHeight();
this.pageWidth = textPositions.get(0).getPageWidth();
setToBBoxOfComponents(getTextPositions());
calculateBBox();
}
@ -133,7 +145,7 @@ public class TextPositionSequence extends BoundingBox implements CharSequence {
this.rotation = textPositionSequence.getRotation();
this.pageHeight = textPositionSequence.getPageHeight();
this.pageWidth = textPositionSequence.getPageWidth();
setToBBoxOfComponents(getTextPositions());
calculateBBox();
;
}
@ -145,7 +157,7 @@ public class TextPositionSequence extends BoundingBox implements CharSequence {
this.rotation = textPositions.get(0).getRotation();
this.pageHeight = textPositions.get(0).getPageHeight();
this.pageWidth = textPositions.get(0).getPageWidth();
setToBBoxOfComponents(getTextPositions());
calculateBBox();
}

View File

@ -56,7 +56,7 @@ public class ImageServiceResponseAdapter {
classificationPage.getImages().forEach(image -> {
if (image.getImageType().equals(ImageType.OTHER)) {
for (AbstractPageBlock textblock : classificationPage.getTextBlocks()) {
if (image.getPosition().contains(textblock.getBBox())) {
if (image.getPosition().contains(textblock.getBBoxInitialUserSpace())) {
image.setImageType(ImageType.OCR);
return;
}

View File

@ -124,9 +124,9 @@ public class RulingCleaningService {
}
if (ruling.isHorizontal()) {
return new Rectangle2D.Double(x - THRESHOLD_Y_HORIZONTAL, y - THRESHOLD_X_HORIZONTAL, w + 2 * THRESHOLD_X_HORIZONTAL, h + 2 * THRESHOLD_Y_HORIZONTAL);
return new Rectangle2D.Double(x - THRESHOLD_X_HORIZONTAL, y - THRESHOLD_Y_HORIZONTAL, w + 2 * THRESHOLD_X_HORIZONTAL, h + 2 * THRESHOLD_Y_HORIZONTAL);
} else {
return new Rectangle2D.Double(x - THRESHOLD_Y_VERTICAL, y - THRESHOLD_X_VERTICAL, w + 2 * THRESHOLD_X_VERTICAL, h + 2 * THRESHOLD_Y_VERTICAL);
return new Rectangle2D.Double(x - THRESHOLD_X_VERTICAL, y - THRESHOLD_Y_VERTICAL, w + 2 * THRESHOLD_X_VERTICAL, h + 2 * THRESHOLD_Y_VERTICAL);
}
}

View File

@ -142,14 +142,14 @@ public class TableExtractionService {
public static List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines, PageInformation pageInformation) {
AffineTransform affineTransform = CoordinateTransforms.calculateInitialUserSpaceCoordsToImageCoords(pageInformation, 1);
/*
switch (pageInformation.rotationDegrees()) {
case 90 -> affineTransform.translate(RedTextPosition.HEIGHT_PADDING, 0); //although this is wrong, our text coordinates are wrong as well
case 180 -> affineTransform.translate(0, RedTextPosition.HEIGHT_PADDING);
case 270 -> affineTransform.translate(-RedTextPosition.HEIGHT_PADDING, 0);
default -> affineTransform.translate(0, -RedTextPosition.HEIGHT_PADDING);
}
*/
return RectangularIntersectionFinder.find(horizontalRulingLines, verticalRulingLines)
.stream()
.map(rect -> new Cell(rect, affineTransform))

View File

@ -31,14 +31,14 @@ public class TextRulingsClassifier {
private static void handleVerticalText(CleanRulings cleanRulings, TextPositionSequence word) {
float lowerY = (float) (word.getBBox().getMinY() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
float upperY = (float) (word.getBBox().getMaxY() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
float lowerY = (float) (word.getBBoxInitialUserSpace().getMinY() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
float upperY = (float) (word.getBBoxInitialUserSpace().getMaxY() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
float strikethroughCenterX = (float) word.getBBox().getCenterX();
float strikethroughBoxHeight = (float) ((word.getHeight() * STRIKETHROUGH_ZONE) / 2);
float strikethroughCenterX = (float) word.getBBoxInitialUserSpace().getCenterX();
float strikethroughBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * STRIKETHROUGH_ZONE) / 2);
float underlineCenterX = (float) (word.getDir().equals(TextDirection.QUARTER_CIRCLE) ? word.getBBox().getMaxX() : word.getBBox().getMinX());
float underlineBoxHeight = (float) ((word.getHeight() * UNDERLINE_ZONE) / 2);
float underlineCenterX = (float) (word.getDir().equals(TextDirection.QUARTER_CIRCLE) ? word.getBBoxInitialUserSpace().getMaxX() : word.getBBoxInitialUserSpace().getMinX());
float underlineBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * UNDERLINE_ZONE) / 2);
float leftX = Math.min(underlineCenterX - underlineBoxHeight, strikethroughCenterX - strikethroughBoxHeight);
float rightX = Math.max(underlineCenterX + underlineBoxHeight, strikethroughCenterX + strikethroughBoxHeight);
@ -65,14 +65,14 @@ public class TextRulingsClassifier {
private static void handleHorizontalText(CleanRulings cleanRulings, TextPositionSequence word) {
float leftX = (float) (word.getBBox().getMinX() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
float rightX = (float) (word.getBBox().getMaxX() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
float leftX = (float) (word.getBBoxInitialUserSpace().getMinX() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
float rightX = (float) (word.getBBoxInitialUserSpace().getMaxX() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
float strikethroughCenterY = (float) word.getBBox().getCenterY();
float strikethroughBoxHeight = (float) ((word.getHeight() * STRIKETHROUGH_ZONE) / 2);
float strikethroughCenterY = (float) word.getBBoxInitialUserSpace().getCenterY();
float strikethroughBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * STRIKETHROUGH_ZONE) / 2);
float underlineCenterY = (float) (word.getDir().equals(TextDirection.ZERO) ? word.getBBox().getMinY() : word.getBBox().getMaxY());
float underlineBoxHeight = (float) ((word.getHeight() * UNDERLINE_ZONE) / 2);
float underlineCenterY = (float) (word.getDir().equals(TextDirection.ZERO) ? word.getBBoxInitialUserSpace().getMinY() : word.getBBoxInitialUserSpace().getMaxY());
float underlineBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * UNDERLINE_ZONE) / 2);
float lowerY = Math.min(underlineCenterY - underlineBoxHeight, strikethroughCenterY - strikethroughBoxHeight);
float upperY = Math.max(underlineCenterY + underlineBoxHeight, strikethroughCenterY + strikethroughBoxHeight);

View File

@ -39,7 +39,7 @@ public class SearchTextWithTextPositionFactory {
RedTextPosition currentTextPosition = sequences.get(0).getTextPositions()
.get(0);
RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").directionAdjustedPosition(currentTextPosition.getDirectionAdjustedPosition()).build();
RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").bBoxDirAdj(currentTextPosition.getBBoxDirAdj()).build();
for (TextPositionSequence word : sequences) {
for (int i = 0; i < word.getTextPositions().size(); ++i) {
@ -61,7 +61,7 @@ public class SearchTextWithTextPositionFactory {
++context.positionIdx;
}
previousTextPosition = RedTextPosition.builder().unicode(" ").directionAdjustedPosition(previousTextPosition.getDirectionAdjustedPosition()).build();
previousTextPosition = RedTextPosition.builder().unicode(" ").bBoxDirAdj(previousTextPosition.getBBoxDirAdj()).build();
context.stringBuilder.append(" ");
context.stringIdxToPositionIdx.add(context.positionIdx);
++context.stringIdx;

View File

@ -11,6 +11,7 @@ import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.apache.pdfbox.text.TextPosition;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
@ -47,7 +48,7 @@ public class MarkedContentUtils {
return markedContentByYPosition.values()
.stream()
.map(textPositions -> new TextPositionSequence(textPositions, 0, true).getBBox())
.map(textPositions -> new TextPositionSequence(textPositions, 0, true).getBBoxInitialUserSpace())
.map(t -> new Rectangle2D.Double(t.getX(), t.getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight())))
.collect(Collectors.toList());
}
@ -89,7 +90,7 @@ public class MarkedContentUtils {
.map(content -> (TextPosition) content)
.filter(content -> !content.getUnicode().equals(" "))
.map(textPositions -> new TextPositionSequence(List.of(textPositions), 0, true))
.map(TextPositionSequence::getBBox)
.map(BoundingBox::getBBoxInitialUserSpace)
.collect(Collectors.toList());
}

View File

@ -76,6 +76,7 @@ public class LayoutparsingVisualizations {
final Visualizations lines = Visualizations.builder().layer(ContentStreams.LINES).build();
final Visualizations zones = Visualizations.builder().layer(ContentStreams.ZONES).build();
final Visualizations mainBody = Visualizations.builder().layer(ContentStreams.MAIN_BODY).build();
final Visualizations clean_rulings = Visualizations.builder().layer(ContentStreams.CLEAN_RULINGS).build();
final Visualizations rulings = Visualizations.builder().layer(ContentStreams.RULINGS).build();
final Visualizations cells = Visualizations.builder().layer(ContentStreams.CELLS).build();
final Visualizations markedContent = Visualizations.builder().layer(ContentStreams.MARKED_CONTENT).build();
@ -94,6 +95,7 @@ public class LayoutparsingVisualizations {
lines, //
zones, //
rulings, //
clean_rulings, //
cells, //
mainBody, //
markedContent //
@ -120,11 +122,24 @@ public class LayoutparsingVisualizations {
if (!active) {
return;
}
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.rulings);
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.clean_rulings);
visualizationsOnPage.getColoredLines()
.addAll(cleanRulings.buildAll()
.stream()
.map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 1))
.map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 0.5f))
.toList());
}
/**
 * Adds one colored line per given ruling to the rulings layer of the given page.
 * Nothing is recorded while visualizations are not active.
 *
 * @param rulings    rulings to turn into colored lines
 * @param pageNumber page whose visualizations receive the lines
 */
public void addRulingVisualization(List<Ruling> rulings, int pageNumber) {
    if (active) {
        // The parameter shadows the field, so the rulings layer must be addressed via this.rulings.
        VisualizationsOnPage pageVisualizations = getOrCreateVisualizationsOnPage(pageNumber, this.rulings);
        pageVisualizations.getColoredLines()
                .addAll(rulings.stream()
                        .map(line -> new ColoredLine(line, decideOnRulingColor(line), 0.5f))
                        .toList());
    }
}

View File

@ -34,7 +34,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
@Test
public void testLayoutParserEndToEnd() {
String filePath = "files/syngenta/CustomerFiles/54 Fludioxonil - EU AIR3 - Document E1 - Listing of Community and Member States MRLs.pdf";
String filePath = "files/Minimal Examples/RotateTextWithRulingsTestFile.pdf";
runForFile(filePath);
}

View File

@ -29,7 +29,7 @@ class PageContentExtractorTest {
textPositionPerPage.stream()
.map(t -> t.getSortedTextPositionSequences()
.stream()
.map(TextPositionSequence::getBBox)
.map(TextPositionSequence::getBBoxInitialUserSpace)
.map(List::of)
.toList())
.toList(), tmpFileName);

View File

@ -26,6 +26,8 @@ public class ContentStreams {
// The label previously read "escape start" (copy-paste slip); it now matches the ESCAPE_END PDF name.
// NOTE(review): confirm nothing looks this identifier up by its old label.
public static Identifier ESCAPE_END = new Identifier("escape end", COSName.getPDFName("ESCAPE_END"), false);
public static Identifier CLEAN_RULINGS = new Identifier("Cleaned Rulings", COSName.getPDFName("KNECON_CLEAN_RULINGS"), true);
public static Identifier RULINGS = new Identifier("Rulings", COSName.getPDFName("KNECON_RULINGS"), true);
public static Identifier WORDS = new Identifier("Words", COSName.getPDFName("KNECON_WORDS"), true);
@ -53,6 +55,7 @@ public class ContentStreams {
ESCAPE_START,
ESCAPE_END,
RULINGS,
CLEAN_RULINGS,
WORDS,
ZONES,
LINES,