RED-8825: general layoutparsing improvements
* fixing a bunch of coordinates
This commit is contained in:
parent
a3decd292d
commit
60acbac53f
@ -263,6 +263,7 @@ public class LayoutParsingPipeline {
|
||||
boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270);
|
||||
|
||||
PDRectangle cropbox = pdPage.getCropBox();
|
||||
classificationDocument.getVisualizations().addRulingVisualization(stripper.getRulings(), pageNumber);
|
||||
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
|
||||
|
||||
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), PageInformation.fromPDPage(pageNumber, pdPage));
|
||||
|
||||
@ -27,8 +27,8 @@ public class Character {
|
||||
|
||||
/**
 * Builds a Character from a single text position: x and y are set to the center of the
 * chunk's rectangle, and the chunk itself is kept as {@code textPosition}.
 *
 * NOTE(review): x and y are assigned twice below, first from getDirectionAdjustedPosition()
 * and then from getBBoxDirAdj(). This looks like a diff rendering of an accessor rename
 * (old pair followed by new pair); as written, the getBBoxDirAdj() values are the ones
 * that remain. Confirm against the actual file.
 *
 * @param chunk the text position this character wraps
 */
public Character(RedTextPosition chunk) {
|
||||
|
||||
this.x = chunk.getDirectionAdjustedPosition().getCenterX();
|
||||
this.y = chunk.getDirectionAdjustedPosition().getCenterY();
|
||||
this.x = chunk.getBBoxDirAdj().getCenterX();
|
||||
this.y = chunk.getBBoxDirAdj().getCenterY();
|
||||
this.textPosition = chunk;
|
||||
}
|
||||
|
||||
|
||||
@ -21,7 +21,8 @@ import lombok.SneakyThrows;
|
||||
public class RedTextPosition extends BoundingBox {
|
||||
|
||||
public final static int HEIGHT_PADDING = 2;
|
||||
private Rectangle2D.Float directionAdjustedPosition; // adjusted to text rotation
|
||||
|
||||
private Rectangle2D.Float bBoxDirAdj; // adjusted to text rotation
|
||||
|
||||
@JsonIgnore
|
||||
private int rotation;
|
||||
@ -72,7 +73,7 @@ public class RedTextPosition extends BoundingBox {
|
||||
textPosition.getYDirAdj() - textHeight,
|
||||
textPosition.getWidthDirAdj(),
|
||||
textHeight + HEIGHT_PADDING);
|
||||
pos.setDirectionAdjustedPosition(dirAdjPosition);
|
||||
pos.setBBoxDirAdj(dirAdjPosition);
|
||||
|
||||
AffineTransform affineTransform = getRotationMatrix(TextDirection.fromDegrees(textPosition.getDir()), textPosition.getPageWidth(), textPosition.getPageHeight());
|
||||
Rectangle2D initialUserSpacePositionRect = affineTransform.createTransformedShape(dirAdjPosition).getBounds2D();
|
||||
@ -83,6 +84,7 @@ public class RedTextPosition extends BoundingBox {
|
||||
}
|
||||
|
||||
|
||||
|
||||
private static AffineTransform getRotationMatrix(TextDirection textDirection, float pageWidth, float pageHeight) {
|
||||
|
||||
AffineTransform transform = new AffineTransform();
|
||||
@ -105,28 +107,28 @@ public class RedTextPosition extends BoundingBox {
|
||||
/**
 * Returns the x coordinate of the rectangle that is adjusted to the text rotation.
 *
 * NOTE(review): two return statements appear below (directionAdjustedPosition, then
 * bBoxDirAdj). This reads as a diff rendering of a field rename, the first line being the
 * pre-rename version; only one of them can exist in the real file. Confirm there.
 */
@JsonIgnore
|
||||
public float getXDirAdj() {
|
||||
|
||||
return this.directionAdjustedPosition.x;
|
||||
return this.bBoxDirAdj.x;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Returns the y coordinate of the rectangle that is adjusted to the text rotation.
 *
 * NOTE(review): two return statements appear below (directionAdjustedPosition, then
 * bBoxDirAdj). This reads as a diff rendering of a field rename, the first line being the
 * pre-rename version; only one of them can exist in the real file. Confirm there.
 */
@JsonIgnore
|
||||
public float getYDirAdj() {
|
||||
|
||||
return this.directionAdjustedPosition.y;
|
||||
return this.bBoxDirAdj.y;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Returns the width of the rectangle that is adjusted to the text rotation.
 *
 * NOTE(review): two return statements appear below (directionAdjustedPosition, then
 * bBoxDirAdj). This reads as a diff rendering of a field rename, the first line being the
 * pre-rename version; only one of them can exist in the real file. Confirm there.
 */
@JsonIgnore
|
||||
public float getWidthDirAdj() {
|
||||
|
||||
return this.directionAdjustedPosition.width;
|
||||
return this.bBoxDirAdj.width;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Returns the height of the rectangle that is adjusted to the text rotation.
 *
 * NOTE(review): two return statements appear below (directionAdjustedPosition, then
 * bBoxDirAdj). This reads as a diff rendering of a field rename, the first line being the
 * pre-rename version; only one of them can exist in the real file. Confirm there.
 */
@JsonIgnore
|
||||
public float getHeightDir() {
|
||||
|
||||
return this.directionAdjustedPosition.height;
|
||||
return this.bBoxDirAdj.height;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
@ -8,6 +9,7 @@ import java.util.stream.Collectors;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
@ -32,6 +34,7 @@ public class TextPositionSequence extends BoundingBox implements CharSequence {
|
||||
@EqualsAndHashCode.Include
|
||||
private List<RedTextPosition> textPositions = new ArrayList<>();
|
||||
|
||||
private Rectangle2D bBoxDirAdj;
|
||||
@EqualsAndHashCode.Include
|
||||
private TextDirection dir;
|
||||
private int rotation;
|
||||
@ -53,6 +56,15 @@ public class TextPositionSequence extends BoundingBox implements CharSequence {
|
||||
this.pageHeight = textPositions.get(0).getPageHeight();
|
||||
this.pageWidth = textPositions.get(0).getPageWidth();
|
||||
this.isParagraphStart = isParagraphStart;
|
||||
calculateBBox();
|
||||
}
|
||||
|
||||
|
||||
/**
 * Recomputes the bounding boxes of this sequence from its current text positions.
 *
 * First, {@code bBoxDirAdj} is rebuilt by collecting the getBBoxDirAdj() rectangle of every
 * text position with RectangleTransformations.collectBBox() (presumably the union of the
 * rectangles; verify in RectangleTransformations). Then setToBBoxOfComponents(...) is
 * called with the same text positions.
 */
private void calculateBBox() {
|
||||
|
||||
this.bBoxDirAdj = textPositions.stream()
|
||||
.map(RedTextPosition::getBBoxDirAdj)
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
setToBBoxOfComponents(getTextPositions());
|
||||
}
|
||||
|
||||
@ -65,7 +77,7 @@ public class TextPositionSequence extends BoundingBox implements CharSequence {
|
||||
this.rotation = textPositions.get(0).getRotation();
|
||||
this.pageHeight = textPositions.get(0).getPageHeight();
|
||||
this.pageWidth = textPositions.get(0).getPageWidth();
|
||||
setToBBoxOfComponents(getTextPositions());
|
||||
calculateBBox();
|
||||
}
|
||||
|
||||
|
||||
@ -133,7 +145,7 @@ public class TextPositionSequence extends BoundingBox implements CharSequence {
|
||||
this.rotation = textPositionSequence.getRotation();
|
||||
this.pageHeight = textPositionSequence.getPageHeight();
|
||||
this.pageWidth = textPositionSequence.getPageWidth();
|
||||
setToBBoxOfComponents(getTextPositions());
|
||||
calculateBBox();
|
||||
;
|
||||
}
|
||||
|
||||
@ -145,7 +157,7 @@ public class TextPositionSequence extends BoundingBox implements CharSequence {
|
||||
this.rotation = textPositions.get(0).getRotation();
|
||||
this.pageHeight = textPositions.get(0).getPageHeight();
|
||||
this.pageWidth = textPositions.get(0).getPageWidth();
|
||||
setToBBoxOfComponents(getTextPositions());
|
||||
calculateBBox();
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -56,7 +56,7 @@ public class ImageServiceResponseAdapter {
|
||||
classificationPage.getImages().forEach(image -> {
|
||||
if (image.getImageType().equals(ImageType.OTHER)) {
|
||||
for (AbstractPageBlock textblock : classificationPage.getTextBlocks()) {
|
||||
if (image.getPosition().contains(textblock.getBBox())) {
|
||||
if (image.getPosition().contains(textblock.getBBoxInitialUserSpace())) {
|
||||
image.setImageType(ImageType.OCR);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -124,9 +124,9 @@ public class RulingCleaningService {
|
||||
}
|
||||
|
||||
if (ruling.isHorizontal()) {
|
||||
return new Rectangle2D.Double(x - THRESHOLD_Y_HORIZONTAL, y - THRESHOLD_X_HORIZONTAL, w + 2 * THRESHOLD_X_HORIZONTAL, h + 2 * THRESHOLD_Y_HORIZONTAL);
|
||||
return new Rectangle2D.Double(x - THRESHOLD_X_HORIZONTAL, y - THRESHOLD_Y_HORIZONTAL, w + 2 * THRESHOLD_X_HORIZONTAL, h + 2 * THRESHOLD_Y_HORIZONTAL);
|
||||
} else {
|
||||
return new Rectangle2D.Double(x - THRESHOLD_Y_VERTICAL, y - THRESHOLD_X_VERTICAL, w + 2 * THRESHOLD_X_VERTICAL, h + 2 * THRESHOLD_Y_VERTICAL);
|
||||
return new Rectangle2D.Double(x - THRESHOLD_X_VERTICAL, y - THRESHOLD_Y_VERTICAL, w + 2 * THRESHOLD_X_VERTICAL, h + 2 * THRESHOLD_Y_VERTICAL);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -142,14 +142,14 @@ public class TableExtractionService {
|
||||
public static List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines, PageInformation pageInformation) {
|
||||
|
||||
AffineTransform affineTransform = CoordinateTransforms.calculateInitialUserSpaceCoordsToImageCoords(pageInformation, 1);
|
||||
|
||||
/*
|
||||
switch (pageInformation.rotationDegrees()) {
|
||||
case 90 -> affineTransform.translate(RedTextPosition.HEIGHT_PADDING, 0); //although this is wrong, our text coordinates are wrong as well
|
||||
case 180 -> affineTransform.translate(0, RedTextPosition.HEIGHT_PADDING);
|
||||
case 270 -> affineTransform.translate(-RedTextPosition.HEIGHT_PADDING, 0);
|
||||
default -> affineTransform.translate(0, -RedTextPosition.HEIGHT_PADDING);
|
||||
}
|
||||
|
||||
*/
|
||||
return RectangularIntersectionFinder.find(horizontalRulingLines, verticalRulingLines)
|
||||
.stream()
|
||||
.map(rect -> new Cell(rect, affineTransform))
|
||||
|
||||
@ -31,14 +31,14 @@ public class TextRulingsClassifier {
|
||||
|
||||
private static void handleVerticalText(CleanRulings cleanRulings, TextPositionSequence word) {
|
||||
|
||||
float lowerY = (float) (word.getBBox().getMinY() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||
float upperY = (float) (word.getBBox().getMaxY() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||
float lowerY = (float) (word.getBBoxInitialUserSpace().getMinY() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||
float upperY = (float) (word.getBBoxInitialUserSpace().getMaxY() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||
|
||||
float strikethroughCenterX = (float) word.getBBox().getCenterX();
|
||||
float strikethroughBoxHeight = (float) ((word.getHeight() * STRIKETHROUGH_ZONE) / 2);
|
||||
float strikethroughCenterX = (float) word.getBBoxInitialUserSpace().getCenterX();
|
||||
float strikethroughBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * STRIKETHROUGH_ZONE) / 2);
|
||||
|
||||
float underlineCenterX = (float) (word.getDir().equals(TextDirection.QUARTER_CIRCLE) ? word.getBBox().getMaxX() : word.getBBox().getMinX());
|
||||
float underlineBoxHeight = (float) ((word.getHeight() * UNDERLINE_ZONE) / 2);
|
||||
float underlineCenterX = (float) (word.getDir().equals(TextDirection.QUARTER_CIRCLE) ? word.getBBoxInitialUserSpace().getMaxX() : word.getBBoxInitialUserSpace().getMinX());
|
||||
float underlineBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * UNDERLINE_ZONE) / 2);
|
||||
|
||||
float leftX = Math.min(underlineCenterX - underlineBoxHeight, strikethroughCenterX - strikethroughBoxHeight);
|
||||
float rightX = Math.max(underlineCenterX + underlineBoxHeight, strikethroughCenterX + strikethroughBoxHeight);
|
||||
@ -65,14 +65,14 @@ public class TextRulingsClassifier {
|
||||
|
||||
private static void handleHorizontalText(CleanRulings cleanRulings, TextPositionSequence word) {
|
||||
|
||||
float leftX = (float) (word.getBBox().getMinX() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||
float rightX = (float) (word.getBBox().getMaxX() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||
float leftX = (float) (word.getBBoxInitialUserSpace().getMinX() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||
float rightX = (float) (word.getBBoxInitialUserSpace().getMaxX() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||
|
||||
float strikethroughCenterY = (float) word.getBBox().getCenterY();
|
||||
float strikethroughBoxHeight = (float) ((word.getHeight() * STRIKETHROUGH_ZONE) / 2);
|
||||
float strikethroughCenterY = (float) word.getBBoxInitialUserSpace().getCenterY();
|
||||
float strikethroughBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * STRIKETHROUGH_ZONE) / 2);
|
||||
|
||||
float underlineCenterY = (float) (word.getDir().equals(TextDirection.ZERO) ? word.getBBox().getMinY() : word.getBBox().getMaxY());
|
||||
float underlineBoxHeight = (float) ((word.getHeight() * UNDERLINE_ZONE) / 2);
|
||||
float underlineCenterY = (float) (word.getDir().equals(TextDirection.ZERO) ? word.getBBoxInitialUserSpace().getMinY() : word.getBBoxInitialUserSpace().getMaxY());
|
||||
float underlineBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * UNDERLINE_ZONE) / 2);
|
||||
|
||||
float lowerY = Math.min(underlineCenterY - underlineBoxHeight, strikethroughCenterY - strikethroughBoxHeight);
|
||||
float upperY = Math.max(underlineCenterY + underlineBoxHeight, strikethroughCenterY + strikethroughBoxHeight);
|
||||
|
||||
@ -39,7 +39,7 @@ public class SearchTextWithTextPositionFactory {
|
||||
|
||||
RedTextPosition currentTextPosition = sequences.get(0).getTextPositions()
|
||||
.get(0);
|
||||
RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").directionAdjustedPosition(currentTextPosition.getDirectionAdjustedPosition()).build();
|
||||
RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").bBoxDirAdj(currentTextPosition.getBBoxDirAdj()).build();
|
||||
|
||||
for (TextPositionSequence word : sequences) {
|
||||
for (int i = 0; i < word.getTextPositions().size(); ++i) {
|
||||
@ -61,7 +61,7 @@ public class SearchTextWithTextPositionFactory {
|
||||
++context.positionIdx;
|
||||
}
|
||||
|
||||
previousTextPosition = RedTextPosition.builder().unicode(" ").directionAdjustedPosition(previousTextPosition.getDirectionAdjustedPosition()).build();
|
||||
previousTextPosition = RedTextPosition.builder().unicode(" ").bBoxDirAdj(previousTextPosition.getBBoxDirAdj()).build();
|
||||
context.stringBuilder.append(" ");
|
||||
context.stringIdxToPositionIdx.add(context.positionIdx);
|
||||
++context.stringIdx;
|
||||
|
||||
@ -11,6 +11,7 @@ import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
|
||||
@ -47,7 +48,7 @@ public class MarkedContentUtils {
|
||||
|
||||
return markedContentByYPosition.values()
|
||||
.stream()
|
||||
.map(textPositions -> new TextPositionSequence(textPositions, 0, true).getBBox())
|
||||
.map(textPositions -> new TextPositionSequence(textPositions, 0, true).getBBoxInitialUserSpace())
|
||||
.map(t -> new Rectangle2D.Double(t.getX(), t.getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight())))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
@ -89,7 +90,7 @@ public class MarkedContentUtils {
|
||||
.map(content -> (TextPosition) content)
|
||||
.filter(content -> !content.getUnicode().equals(" "))
|
||||
.map(textPositions -> new TextPositionSequence(List.of(textPositions), 0, true))
|
||||
.map(TextPositionSequence::getBBox)
|
||||
.map(BoundingBox::getBBoxInitialUserSpace)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
@ -76,6 +76,7 @@ public class LayoutparsingVisualizations {
|
||||
final Visualizations lines = Visualizations.builder().layer(ContentStreams.LINES).build();
|
||||
final Visualizations zones = Visualizations.builder().layer(ContentStreams.ZONES).build();
|
||||
final Visualizations mainBody = Visualizations.builder().layer(ContentStreams.MAIN_BODY).build();
|
||||
final Visualizations clean_rulings = Visualizations.builder().layer(ContentStreams.CLEAN_RULINGS).build();
|
||||
final Visualizations rulings = Visualizations.builder().layer(ContentStreams.RULINGS).build();
|
||||
final Visualizations cells = Visualizations.builder().layer(ContentStreams.CELLS).build();
|
||||
final Visualizations markedContent = Visualizations.builder().layer(ContentStreams.MARKED_CONTENT).build();
|
||||
@ -94,6 +95,7 @@ public class LayoutparsingVisualizations {
|
||||
lines, //
|
||||
zones, //
|
||||
rulings, //
|
||||
clean_rulings, //
|
||||
cells, //
|
||||
mainBody, //
|
||||
markedContent //
|
||||
@ -120,11 +122,24 @@ public class LayoutparsingVisualizations {
|
||||
if (!active) {
|
||||
return;
|
||||
}
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.rulings);
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.clean_rulings);
|
||||
visualizationsOnPage.getColoredLines()
|
||||
.addAll(cleanRulings.buildAll()
|
||||
.stream()
|
||||
.map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 1))
|
||||
.map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 0.5f))
|
||||
.toList());
|
||||
}
|
||||
|
||||
/**
 * Adds one ColoredLine per given ruling to the visualizations of the given page.
 *
 * Does nothing when {@code active} is false. Each ruling gets the color chosen by
 * decideOnRulingColor(ruling) and a third constructor argument of 0.5f (presumably the
 * line width; confirm in ColoredLine).
 *
 * @param rulings    the rulings to draw; note that this parameter shadows the field of the
 *                   same name, which is why the field is accessed as {@code this.rulings}
 * @param pageNumber the page whose visualizations receive the lines
 */
public void addRulingVisualization(List<Ruling> rulings, int pageNumber) {
|
||||
|
||||
if (!active) {
|
||||
return;
|
||||
}
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.rulings);
|
||||
visualizationsOnPage.getColoredLines()
|
||||
.addAll(rulings
|
||||
.stream()
|
||||
.map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 0.5f))
|
||||
.toList());
|
||||
}
|
||||
|
||||
|
||||
@ -34,7 +34,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||
/**
 * Runs the layout parser end to end on a single file via runForFile(filePath).
 *
 * NOTE(review): filePath is declared twice below (the syngenta customer file, then the
 * minimal rotated-text example). This reads as a diff rendering in which the first
 * declaration is the replaced line; only one declaration can exist in the real file.
 */
@Test
|
||||
public void testLayoutParserEndToEnd() {
|
||||
|
||||
String filePath = "files/syngenta/CustomerFiles/54 Fludioxonil - EU AIR3 - Document E1 - Listing of Community and Member States MRLs.pdf";
|
||||
String filePath = "files/Minimal Examples/RotateTextWithRulingsTestFile.pdf";
|
||||
|
||||
runForFile(filePath);
|
||||
|
||||
}
|
||||
|
||||
@ -29,7 +29,7 @@ class PageContentExtractorTest {
|
||||
textPositionPerPage.stream()
|
||||
.map(t -> t.getSortedTextPositionSequences()
|
||||
.stream()
|
||||
.map(TextPositionSequence::getBBox)
|
||||
.map(TextPositionSequence::getBBoxInitialUserSpace)
|
||||
.map(List::of)
|
||||
.toList())
|
||||
.toList(), tmpFileName);
|
||||
|
||||
@ -26,6 +26,8 @@ public class ContentStreams {
|
||||
|
||||
public static Identifier ESCAPE_END = new Identifier("escape start", COSName.getPDFName("ESCAPE_END"), false);
|
||||
|
||||
public static Identifier CLEAN_RULINGS = new Identifier("Cleaned Rulings", COSName.getPDFName("KNECON_CLEAN_RULINGS"), true);
|
||||
|
||||
public static Identifier RULINGS = new Identifier("Rulings", COSName.getPDFName("KNECON_RULINGS"), true);
|
||||
|
||||
public static Identifier WORDS = new Identifier("Words", COSName.getPDFName("KNECON_WORDS"), true);
|
||||
@ -53,6 +55,7 @@ public class ContentStreams {
|
||||
ESCAPE_START,
|
||||
ESCAPE_END,
|
||||
RULINGS,
|
||||
CLEAN_RULINGS,
|
||||
WORDS,
|
||||
ZONES,
|
||||
LINES,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user