RED-8825: general layoutparsing improvements

* refactor all coordinates
This commit is contained in:
Kilian Schuettler 2024-05-02 21:01:25 +02:00
parent d61cac8b4f
commit b6f0a21886
42 changed files with 570 additions and 1196 deletions

View File

@ -60,6 +60,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.mapper.Taas
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import io.micrometer.observation.Observation;
import io.micrometer.observation.ObservationRegistry;
@ -264,7 +265,7 @@ public class LayoutParsingPipeline {
PDRectangle cropbox = pdPage.getCropBox();
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals());
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), PageInformation.fromPDPage(pageNumber, pdPage));
classificationDocument.getVisualizations().addCellVisualizations(emptyTableCells, pageNumber);
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings);
@ -293,7 +294,7 @@ public class LayoutParsingPipeline {
classificationPage.setPageWidth(cropbox.getWidth());
classificationPage.setPageHeight(cropbox.getHeight());
classificationDocument.getVisualizations().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber, pdPage);
classificationDocument.getVisualizations().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber);
// MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox.
classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents()));

View File

@ -1,13 +1,27 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
import java.awt.geom.Rectangle2D;
import java.util.Comparator;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.Data;
@Data
public abstract class BoundingBox {
private Rectangle2D bBox;
// Java coordinate system: (0, 0) is always upper left, x is increasing left to right and y is increasing from top to bottom.
// should be used when determining reading order or other tasks which require coordinates in a harmonized system.
protected Rectangle2D bBox; // I would not trust this coordinate when comparing rulings and text, due to the text positions being slightly off.
// PDF coordinate system: depends on page rotation, (0, 0) is lower left corner, x is increasing left to right and y from bottom to top.
// This rotates completely in 90 degree steps with page rotation.
// Needs to be used when writing to a PDF.
// Also, these are definitely correct and should be used whenever possible.
protected Rectangle2D bBoxInitialUserSpace;
protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
public double getX() {
@ -22,6 +36,42 @@ public abstract class BoundingBox {
}
/**
 * Returns the smallest x value of {@code bBox}, the box kept in the Java coordinate system.
 */
public double getMinX() {
return bBox.getMinX();
}
/**
 * Returns the smallest y value of {@code bBox}, the box kept in the Java coordinate system.
 */
public double getMinY() {
return bBox.getMinY();
}
/**
 * Returns the smallest x value of {@code bBoxInitialUserSpace}, the box kept in the PDF coordinate system.
 */
public double getPdfMinX() {
return bBoxInitialUserSpace.getMinX();
}
/**
 * Returns the largest x value of {@code bBoxInitialUserSpace}, the box kept in the PDF coordinate system.
 */
public double getPdfMaxX() {
return bBoxInitialUserSpace.getMaxX();
}
/**
 * Returns the smallest y value of {@code bBoxInitialUserSpace}, the box kept in the PDF coordinate system.
 */
public double getPdfMinY() {
return bBoxInitialUserSpace.getMinY();
}
/**
 * Returns the largest y value of {@code bBoxInitialUserSpace}, the box kept in the PDF coordinate system.
 */
public double getPdfMaxY() {
return bBoxInitialUserSpace.getMaxY();
}
public double getWidth() {
return bBox.getWidth();
@ -34,21 +84,102 @@ public abstract class BoundingBox {
}
/**
 * Returns the largest x value of {@code bBox}, the box kept in the Java coordinate system.
 */
public double getMaxX() {
return bBox.getMaxX();
}
/**
 * Returns the largest y value of {@code bBox}, the box kept in the Java coordinate system.
 */
public double getMaxY() {
return bBox.getMaxY();
}
/**
 * Returns the area of {@code bBox}, computed as its height multiplied by its width.
 */
public double getArea() {
return (bBox.getHeight() * bBox.getWidth());
}
public boolean contains(Rectangle2D contained, double tolerance) {
public boolean contains(BoundingBox contained) {
return bBox.getX() <= contained.getX() + tolerance && bBox.getY() <= contained.getY() + tolerance && bBox.getX() + bBox.getWidth() >= contained.getX() + contained.getWidth() - tolerance && bBox.getY() + bBox.getHeight() >= contained.getY() + contained.getHeight() - tolerance;
return contains(contained, 0);
}
/**
 * Tests whether {@code contained} lies inside this box, comparing the PDF-space edges of both boxes.
 *
 * @param contained the box expected to lie inside this one
 * @param tolerance slack by which each edge of {@code contained} may stick out and still count as inside
 * @return {@code true} if all four edges of {@code contained} are within this box, allowing for the tolerance
 */
public boolean contains(BoundingBox contained, double tolerance) {

    boolean leftEdgeInside = getPdfMinX() <= contained.getPdfMinX() + tolerance;
    boolean lowerEdgeInside = getPdfMinY() <= contained.getPdfMinY() + tolerance;
    boolean rightEdgeInside = getPdfMaxX() >= contained.getPdfMaxX() - tolerance;
    boolean upperEdgeInside = getPdfMaxY() >= contained.getPdfMaxY() - tolerance;
    return leftEdgeInside && lowerEdgeInside && rightEdgeInside && upperEdgeInside;
}
/**
 * Tests whether this box and {@code other} overlap on both axes.
 *
 * @param other the box to test against
 * @return {@code true} only if {@link #intersectsX(BoundingBox)} and {@link #intersectsY(BoundingBox)} both hold
 */
public boolean intersects(BoundingBox other) {

    if (!this.intersectsX(other)) {
        return false;
    }
    return this.intersectsY(other);
}
/**
 * Tests whether this box and {@code other} overlap on both axes once this box is widened by the given thresholds.
 *
 * @param other      the box to test against
 * @param yThreshold slack passed to the vertical check
 * @param xThreshold slack passed to the horizontal check
 * @return {@code true} only if the horizontal and the vertical threshold checks both hold
 */
public boolean intersects(BoundingBox other, float yThreshold, float xThreshold) {

    if (!this.intersectsX(other, xThreshold)) {
        return false;
    }
    return this.intersectsY(other, yThreshold);
}
public boolean intersectsY(BoundingBox other) {
return this.getBBox().getMinY() <= other.getBBox().getMaxY() && this.getBBox().getMaxY() >= other.getBBox().getMinY();
return this.getPdfMinY() <= other.getPdfMaxY() && this.getPdfMaxY() >= other.getPdfMinY();
}
/**
 * Tests whether the vertical PDF-space extents of this box and {@code other} overlap
 * after this box has been extended by {@code threshold} at both ends.
 *
 * @param other     the box to test against
 * @param threshold amount by which the lower and upper edge of this box are pushed outwards
 * @return {@code true} if the extended vertical range of this box touches or overlaps that of {@code other}
 */
public boolean intersectsY(BoundingBox other, float threshold) {

    double extendedLowerEdge = this.getPdfMinY() - threshold;
    double extendedUpperEdge = this.getPdfMaxY() + threshold;
    return extendedLowerEdge <= other.getPdfMaxY() && extendedUpperEdge >= other.getPdfMinY();
}
/**
 * Tests whether the horizontal extents of this box and {@code other} overlap, measured in the
 * PDF coordinate system.
 *
 * @param other the box to test against
 * @return {@code true} if the horizontal PDF-space ranges of both boxes touch or overlap
 */
public boolean intersectsX(BoundingBox other) {

    // Both operands of each comparison must come from the same coordinate system. The Java-space
    // getMaxX() only coincides with the PDF-space value on unrotated pages, so use getPdfMaxX().
    return this.getPdfMinX() <= other.getPdfMaxX() && this.getPdfMaxX() >= other.getPdfMinX();
}
/**
 * Tests whether the horizontal PDF-space extents of this box and {@code other} overlap
 * after this box has been extended by {@code threshold} at both ends.
 *
 * @param other     the box to test against
 * @param threshold amount by which the left and right edge of this box are pushed outwards
 * @return {@code true} if the extended horizontal range of this box touches or overlaps that of {@code other}
 */
public boolean intersectsX(BoundingBox other, float threshold) {

    // Both operands of each comparison must come from the same coordinate system. The Java-space
    // getMaxX() only coincides with the PDF-space value on unrotated pages, so use getPdfMaxX().
    return this.getPdfMinX() - threshold <= other.getPdfMaxX() && this.getPdfMaxX() + threshold >= other.getPdfMinX();
}
/**
 * Recomputes both boxes of this object from the given components: {@code bBox} from the
 * components' {@code bBox} values and {@code bBoxInitialUserSpace} from their
 * {@code bBoxInitialUserSpace} values, each reduced with {@code RectangleTransformations.collectBBox()}.
 * NOTE(review): the result for an empty list depends on what collectBBox() yields for an empty stream - confirm.
 *
 * @param components the objects whose boxes are combined
 */
public void setToBBoxOfComponents(List<? extends BoundingBox> components) {
// Java-space box first, ...
this.bBox = components.stream()
.map(BoundingBox::getBBox)
.collect(RectangleTransformations.collectBBox());
// ... then the PDF-space box, built the same way from the PDF-space boxes of the components.
this.bBoxInitialUserSpace = components.stream()
.map(BoundingBox::getBBoxInitialUserSpace)
.collect(RectangleTransformations.collectBBox());
}
/**
 * Computes the length of the vertical range shared by this box and {@code other}, in PDF coordinates.
 *
 * @param other the box to compare with
 * @return the length of the shared vertical range, or {@code 0} if the ranges do not overlap
 */
public double verticalOverlap(BoundingBox other) {

    double sharedUpperEdge = Math.min(this.getPdfMaxY(), other.getPdfMaxY());
    double sharedLowerEdge = Math.max(this.getPdfMinY(), other.getPdfMinY());
    return Math.max(0, sharedUpperEdge - sharedLowerEdge);
}
/**
 * Orders two boxes by their PDF-space minimum x when their vertical overlap exceeds
 * {@code VERTICAL_COMPARISON_THRESHOLD} times their mean height, and by their PDF-space
 * maximum y otherwise. Equal boxes compare as {@code 0}.
 * NOTE(review): the overlap is measured in PDF space while the heights come from getHeight() - confirm
 * both describe the same extent on rotated pages.
 */
public static final Comparator<BoundingBox> ILL_DEFINED_ORDER = (first, second) -> {
    if (first.equals(second)) {
        return 0;
    }
    boolean overlapExceedsThreshold = first.verticalOverlap(second) > VERTICAL_COMPARISON_THRESHOLD * ((first.getHeight() + second.getHeight()) / 2);
    return overlapExceedsThreshold
            ? Double.compare(first.getPdfMinX(), second.getPdfMinX())
            : Double.compare(first.getPdfMaxY(), second.getPdfMaxY());
};
}

View File

@ -5,9 +5,7 @@ import java.util.Arrays;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.Data;
import lombok.EqualsAndHashCode;
@ -145,10 +143,9 @@ public class Line extends BoundingBox {
private void buildBBox() {
this.setBBox(characters.stream()
.map(Character::getTextPosition)
.map(RedTextPosition::getInitialUserSpacePosition)
.collect(RectangleTransformations.collectBBox()));
this.setToBBoxOfComponents(characters.stream()
.map(Character::getTextPosition)
.toList());
}

View File

@ -18,15 +18,7 @@ public class Zone extends BoundingBox {
lines.sort(Comparator.comparingDouble(Line::getY0));
this.lines = lines;
buildBBox();
}
public void buildBBox() {
this.setBBox(getLines().stream()
.map(BoundingBox::getBBox)
.collect(RectangleTransformations.collectBBox()));
setToBBoxOfComponents(lines);
}

View File

@ -38,7 +38,7 @@ public class LineBuilderService {
if (character.getTextPosition().getDir() != neighbor.getCharacter().getTextPosition().getDir() //
|| !angleFilter.matches(neighbor) //
|| Math.pow(normalizedHorizontalDistance, 2) + Math.pow(normalizedVerticalDistance, 2) > 1 //
|| rulings.lineBetween(character, neighbor.getCharacter())) {
|| rulings.lineBetween(character.getTextPosition(), neighbor.getCharacter().getTextPosition())) {
return;
}

View File

@ -39,7 +39,10 @@ public class ReadingOrderService {
}
}
if (histogram.values().stream().mapToInt(Integer::intValue).average().orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) {
if (histogram.values()
.stream()
.mapToInt(Integer::intValue).average()
.orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) {
return resolveSingleColumnReadingOrder(zones);
} else {
@ -52,7 +55,7 @@ public class ReadingOrderService {
private static List<Zone> resolveSingleColumnReadingOrder(List<Zone> zones) {
zones.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
return zones;
}
@ -90,14 +93,14 @@ public class ReadingOrderService {
}
leftOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
rightOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
middle.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
/*
List<Zone> leftNotIntersecting = new ArrayList<>();
for (Zone leftZone : leftOf) {
boolean intersects = false;
@ -139,7 +142,7 @@ public class ReadingOrderService {
middle.addAll(leftNotIntersecting);
middle.addAll(rightNotIntersecting);
*/
List<Zone> sortedZones = new ArrayList<>();
sortedZones.addAll(leftOf);
sortedZones.addAll(rightOf);

View File

@ -65,7 +65,7 @@ public class ZoneBuilderService {
return;
}
if (rulings.lineBetween(outerLine.getBBox(), innerLine.getBBox())) {
if (rulings.lineBetween(outerLine, innerLine)) {
return;
}

View File

@ -4,6 +4,7 @@ import java.awt.geom.Rectangle2D;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import lombok.AllArgsConstructor;
@ -15,18 +16,8 @@ import lombok.NoArgsConstructor;
@AllArgsConstructor
@NoArgsConstructor
@EqualsAndHashCode(callSuper = true)
public abstract class AbstractPageBlock extends Rectangle {
public abstract class AbstractPageBlock extends BoundingBox {
protected Rectangle2D bBox; // in initial user space
@JsonIgnore
protected float minX;
@JsonIgnore
protected float maxX;
@JsonIgnore
protected float minY;
@JsonIgnore
protected float maxY;
@JsonIgnore
protected PageBlockType classification;
@JsonIgnore
@ -45,63 +36,6 @@ public abstract class AbstractPageBlock extends Rectangle {
}
/**
 * Compares the min/max fields of this block with the PDF-space edges of {@code other}.
 * NOTE(review): the x comparisons test that other lies within this block, while the y comparisons
 * run the opposite way (minY >= other min, maxY <= other max) - confirm this matches how minY/maxY are filled.
 */
public boolean containsBlock(TextPageBlock other) {
return this.minX <= other.getPdfMinX() && this.maxX >= other.getPdfMaxX() && this.minY >= other.getPdfMinY() && this.maxY <= other.getPdfMaxY();
}
/**
 * Compares the min/max fields of this block with those of {@code other}.
 * NOTE(review): the y comparisons run opposite to the x comparisons (minY >= other.minY,
 * maxY <= other.maxY) - confirm this matches how minY/maxY are filled.
 */
public boolean contains(AbstractPageBlock other) {
return this.minX <= other.minX && this.maxX >= other.maxX && this.minY >= other.minY && this.maxY <= other.maxY;
}
/**
 * Returns true when {@code other} is on the same page and the rectangle spanned by its top-left
 * point, width and height lies within the min/max fields of this block on both axes.
 */
public boolean contains(Rectangle other) {
return page == other.getPage() && this.minX <= other.getTopLeft().getX() && this.maxX >= other.getTopLeft().getX() + other.getWidth() && this.minY <= other.getTopLeft()
.getY() && this.maxY >= other.getTopLeft().getY() + other.getHeight();
}
/**
 * Returns the difference between {@code maxY} and {@code minY}; excluded from JSON output.
 */
@JsonIgnore
public float getHeight() {
return maxY - minY;
}
/**
 * Returns the difference between {@code maxX} and {@code minX}; excluded from JSON output.
 */
@JsonIgnore
public float getWidth() {
return maxX - minX;
}
/**
 * Returns true when the [minY, maxY] range of this block touches or overlaps that of {@code apb}.
 */
public boolean intersectsY(AbstractPageBlock apb) {
return this.minY <= apb.getMaxY() && this.maxY >= apb.getMinY();
}
/**
 * Returns true when this block, widened by {@code xThreshold} horizontally and {@code yThreshold}
 * vertically, touches or overlaps {@code apb} on both axes.
 */
public boolean almostIntersects(AbstractPageBlock apb, float yThreshold, float xThreshold) {
return this.almostIntersectsX(apb, xThreshold) && this.almostIntersectsY(apb, yThreshold);
}
/**
 * Vertical range check with this block's minY/maxY pushed outwards by {@code threshold}.
 */
private boolean almostIntersectsY(AbstractPageBlock apb, float threshold) {
return this.minY - threshold <= apb.getMaxY() && this.maxY + threshold >= apb.getMinY();
}
/**
 * Horizontal range check with this block's minX/maxX pushed outwards by {@code threshold}.
 */
private boolean almostIntersectsX(AbstractPageBlock apb, float threshold) {
return this.minX - threshold <= apb.getMaxX() && this.maxX + threshold >= apb.getMinX();
}
public abstract boolean isEmpty();
}

View File

@ -1,11 +1,13 @@
package com.knecon.fforesight.service.layoutparser.processor.model.table;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
@ -18,7 +20,7 @@ import lombok.NoArgsConstructor;
@Data
@EqualsAndHashCode(callSuper = true)
@NoArgsConstructor
public class Cell extends Rectangle {
public class Cell extends BoundingBox {
private List<TextPageBlock> textBlocks = new ArrayList<>();
@ -33,13 +35,24 @@ public class Cell extends Rectangle {
public Cell(Point2D topLeft, Point2D bottomRight) {
super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), (float) (bottomRight.getY() - topLeft.getY()));
this.bBoxInitialUserSpace = new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), (bottomRight.getX() - topLeft.getX()), (bottomRight.getY() - topLeft.getY()));
this.bBox = bBoxInitialUserSpace;
}
public Cell(Rectangle2D r) {
public Cell(Rectangle2D bBoxInitialUserSpace, AffineTransform initialUserSpaceToJava) {
super((float) r.getY(), (float) r.getX(), (float) r.getWidth(), (float) r.getHeight());
this.bBoxInitialUserSpace = bBoxInitialUserSpace;
this.bBox = initialUserSpaceToJava.createTransformedShape(bBoxInitialUserSpace).getBounds2D();
}
/**
 * Creates a new cell that carries the same two bounding boxes as {@code cell}.
 * Only {@code bBoxInitialUserSpace} and {@code bBox} are taken over; every other field keeps
 * the value given by the no-argument constructor.
 * NOTE(review): the rectangles are assigned by reference, so source and copy share the same
 * Rectangle2D instances - confirm they are never mutated in place.
 *
 * @param cell the cell whose boxes are reused
 * @return a new cell with the boxes of {@code cell}
 */
public static Cell copy(Cell cell) {
Cell copy = new Cell();
copy.bBoxInitialUserSpace = cell.bBoxInitialUserSpace;
copy.bBox = cell.bBox;
return copy;
}

View File

@ -7,10 +7,11 @@ import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import lombok.Getter;
@ -67,17 +68,14 @@ public class CleanRulings {
}
public boolean lineBetween(Character a, Character b) {
public boolean lineBetween(BoundingBox a, BoundingBox b) {
return lineBetween(a.getTextPosition().getInitialUserSpacePosition(), b.getTextPosition().getInitialUserSpacePosition());
return lineBetween(a.getBBoxInitialUserSpace(), b.getBBoxInitialUserSpace());
}
public boolean lineBetween(Rectangle2D a, Rectangle2D b) {
if (a.intersects(b)) {
return false;
}
return lineBetween(new Point2D.Double(a.getCenterX(), a.getCenterY()), new Point2D.Double(b.getCenterX(), b.getCenterY()));
}
@ -119,7 +117,7 @@ public class CleanRulings {
return Collections.emptyList();
}
List<Ruling> result = new ArrayList<>();
List<Ruling> result = new LinkedList<>();
for (int i = firstGreaterThanIdx; i < horizontals.size(); i++) {
Ruling horizontal = horizontals.get(i);
if (horizontal.y1 > endY) {
@ -170,7 +168,7 @@ public class CleanRulings {
return Collections.emptyList();
}
List<Ruling> result = new ArrayList<>();
List<Ruling> result = new LinkedList<>();
for (int i = firstGreaterThanIdx; i < verticals.size(); i++) {
Ruling horizontal = verticals.get(i);
if (horizontal.x1 > endX) {

View File

@ -1,218 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.model.table;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.Comparator;
import java.util.List;
/**
 * Float-precision rectangle with top/left/bottom/right accessors and overlap helpers.
 * <p>
 * The accessors map directly onto {@link Rectangle2D}: top is the minimum y, bottom the maximum y,
 * left the minimum x and right the maximum x.
 * <p>
 * Inside this class the simple names {@code Float} and {@code Double} resolve to the nested
 * {@code Rectangle2D} types, which is why the boxed number classes are written fully qualified.
 */
@SuppressWarnings("all")
public class Rectangle extends Rectangle2D.Float {

    protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;

    /**
     * Ill-defined comparator, from when Rectangle was Comparable.
     * <p>
     * see https://github.com/tabulapdf/tabula-java/issues/116
     *
     * @deprecated with no replacement
     */
    @Deprecated
    public static final Comparator<Rectangle> ILL_DEFINED_ORDER = new Comparator<Rectangle>() {
        @Override
        public int compare(Rectangle o1, Rectangle o2) {

            if (o1.equals(o2)) {
                return 0;
            }
            if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD) {
                return o1.isLtrDominant() == -1 && o2.isLtrDominant() == -1 ? -java.lang.Double.compare(o1.getX(), o2.getX()) : java.lang.Double.compare(o1.getX(), o2.getX());
            } else {
                return java.lang.Float.compare(o1.getBottom(), o2.getBottom());
            }
        }
    };


    /**
     * Creates an empty rectangle at the origin.
     */
    public Rectangle() {

        super();
    }


    /**
     * Creates a rectangle from its top-left corner and its size.
     *
     * @param top    y coordinate of the upper edge
     * @param left   x coordinate of the left edge
     * @param width  horizontal extent
     * @param height vertical extent
     */
    public Rectangle(float top, float left, float width, float height) {

        super();
        this.setRect(left, top, width, height);
    }


    /**
     * Computes the minimum bounding box that contains all the given rectangles.
     *
     * @param rectangles the rectangles to enclose
     * @return minimum bounding box that contains all the rectangles, or an empty rectangle at the
     * origin if {@code rectangles} is empty
     */
    public static Rectangle boundingBoxOf(List<? extends Rectangle> rectangles) {

        if (rectangles.isEmpty()) {
            // Without any input the min/max seeds below would leak into the result.
            return new Rectangle();
        }

        float minx = java.lang.Float.MAX_VALUE;
        float miny = java.lang.Float.MAX_VALUE;
        // Float.MIN_VALUE is the smallest POSITIVE float, so it cannot be used as the seed for a
        // maximum: rectangles lying entirely at negative coordinates would never replace it.
        float maxx = -java.lang.Float.MAX_VALUE;
        float maxy = -java.lang.Float.MAX_VALUE;

        for (Rectangle r : rectangles) {
            minx = (float) Math.min(r.getMinX(), minx);
            miny = (float) Math.min(r.getMinY(), miny);
            maxx = (float) Math.max(r.getMaxX(), maxx);
            maxy = (float) Math.max(r.getMaxY(), maxy);
        }
        return new Rectangle(miny, minx, maxx - minx, maxy - miny);
    }


    /**
     * Compares this rectangle with {@code other} using {@link #ILL_DEFINED_ORDER}.
     */
    public int compareTo(Rectangle other) {

        return ILL_DEFINED_ORDER.compare(this, other);
    }


    /**
     * Hook used by {@link #ILL_DEFINED_ORDER} to reverse the horizontal ordering when both
     * rectangles return {@code -1}; this base implementation always returns {@code 0}.
     */
    public int isLtrDominant() {

        return 0;
    }


    /**
     * @return width multiplied by height
     */
    public float getArea() {

        return this.width * this.height;
    }


    /**
     * @return length of the vertical range shared with {@code other}, or {@code 0} if there is none
     */
    public float verticalOverlap(Rectangle other) {

        return Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
    }


    /**
     * @return {@code true} if the shared vertical range has a positive length
     */
    public boolean verticallyOverlaps(Rectangle other) {

        return verticalOverlap(other) > 0;
    }


    /**
     * @return length of the horizontal range shared with {@code other}, or {@code 0} if there is none
     */
    public float horizontalOverlap(Rectangle other) {

        return Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
    }


    /**
     * @return {@code true} if the shared horizontal range has a positive length
     */
    public boolean horizontallyOverlaps(Rectangle other) {

        return horizontalOverlap(other) > 0;
    }


    /**
     * Relates the vertical overlap of the two rectangles to the height of the smaller one.
     *
     * @return the overlap divided by the smaller of both heights, or {@code 0} if none of the
     * four overlap configurations below applies
     */
    public float verticalOverlapRatio(Rectangle other) {

        float rv = 0, delta = Math.min(this.getBottom() - this.getTop(), other.getBottom() - other.getTop());

        if (other.getTop() <= this.getTop() && this.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) {
            // other starts above this and ends inside this
            rv = (other.getBottom() - this.getTop()) / delta;
        } else if (this.getTop() <= other.getTop() && other.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) {
            // this starts above other and ends inside other
            rv = (this.getBottom() - other.getTop()) / delta;
        } else if (this.getTop() <= other.getTop() && other.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) {
            // other lies vertically inside this
            rv = (other.getBottom() - other.getTop()) / delta;
        } else if (other.getTop() <= this.getTop() && this.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) {
            // this lies vertically inside other
            rv = (this.getBottom() - this.getTop()) / delta;
        }

        return rv;
    }


    /**
     * @return area of the intersection divided by the area of the union of both rectangles
     */
    public float overlapRatio(Rectangle other) {

        double intersectionWidth = Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
        double intersectionHeight = Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
        double intersectionArea = Math.max(0, intersectionWidth * intersectionHeight);
        double unionArea = this.getArea() + other.getArea() - intersectionArea;

        return (float) (intersectionArea / unionArea);
    }


    /**
     * Grows this rectangle in place to the union of itself and {@code other}.
     *
     * @return this rectangle
     */
    public Rectangle merge(Rectangle other) {

        this.setRect(this.createUnion(other));
        return this;
    }


    public float getTop() {

        return (float) this.getMinY();
    }


    /**
     * Moves the upper edge to {@code top} and adjusts the height so the lower edge stays in place.
     */
    public void setTop(float top) {

        float deltaHeight = top - this.y;
        this.setRect(this.x, top, this.width, this.height - deltaHeight);
    }


    public float getRight() {

        return (float) this.getMaxX();
    }


    /**
     * Moves the right edge to {@code right} by changing the width; the left edge stays in place.
     */
    public void setRight(float right) {

        this.setRect(this.x, this.y, right - this.x, this.height);
    }


    public float getLeft() {

        return (float) this.getMinX();
    }


    /**
     * Moves the left edge to {@code left} and adjusts the width so the right edge stays in place.
     */
    public void setLeft(float left) {

        float deltaWidth = left - this.x;
        this.setRect(left, this.y, this.width - deltaWidth, this.height);
    }


    public float getBottom() {

        return (float) this.getMaxY();
    }


    /**
     * Moves the lower edge to {@code bottom} by changing the height; the upper edge stays in place.
     */
    public void setBottom(float bottom) {

        this.setRect(this.x, this.y, this.width, bottom - this.y);
    }


    /**
     * @return the four corners in the order top-left, top-right, bottom-right, bottom-left
     */
    public Point2D[] getPoints() {

        return new Point2D[]{new Point2D.Float(this.getLeft(), this.getTop()), new Point2D.Float(this.getRight(), this.getTop()), new Point2D.Float(this.getRight(),
                                                                                                                                                    this.getBottom()), new Point2D.Float(this.getLeft(), this.getBottom())};
    }


    /**
     * Extends the inherited string form with the bottom and right coordinates.
     */
    @Override
    public String toString() {

        StringBuilder sb = new StringBuilder();
        String s = super.toString();
        // drop the closing bracket of the inherited representation before appending
        sb.append(s.substring(0, s.length() - 1));
        sb.append(String.format(",bottom=%f,right=%f]", this.getBottom(), this.getRight()));
        return sb.toString();
    }

}

View File

@ -1,7 +1,6 @@
package com.knecon.fforesight.service.layoutparser.processor.model.table;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
@ -37,14 +36,11 @@ public class TablePageBlock extends AbstractPageBlock {
private List<Cell> cells;
public TablePageBlock(List<Cell> cells, Rectangle area, int rotation) {
public TablePageBlock(List<Cell> cells, int rotation) {
setToBBoxOfComponents(cells);
this.cells = cells;
addCells(cells);
minX = area.getLeft();
minY = area.getBottom();
maxX = area.getRight();
maxY = area.getTop();
classification = PageBlockType.TABLE;
this.rotation = rotation;
}
@ -130,7 +126,8 @@ public class TablePageBlock extends AbstractPageBlock {
List<Cell> cellsToTheTop = new ArrayList<>();
for (int i = 0; i < rowIndex; i++) {
try {
cellsToTheTop.add(rows.get(i).get(colIndex));
cellsToTheTop.add(rows.get(i)
.get(colIndex));
} catch (IndexOutOfBoundsException e) {
log.debug("No cell {} in row {}, ignoring.", colIndex, rowIndex);
}
@ -145,7 +142,8 @@ public class TablePageBlock extends AbstractPageBlock {
if (lastHeaderCell != null) {
cell.getHeaderCells().add(lastHeaderCell);
}
if (!cell.getTextBlocks().isEmpty() && cell.getTextBlocks().get(0).getMostPopularWordStyle().equals("bold")) {
if (!cell.getTextBlocks().isEmpty() && cell.getTextBlocks()
.get(0).getMostPopularWordStyle().equals("bold")) {
cell.setHeaderCell(true);
}
}
@ -209,7 +207,8 @@ public class TablePageBlock extends AbstractPageBlock {
for (int i = 0; i < rowsOfCellsMatrix.size(); i++) {
for (int j = 0; j < rowsOfCellsMatrix.get(i).size(); j++) {
addCellToRowAndCol(rowsOfCellsMatrix.get(i).get(j), i, j);
addCellToRowAndCol(rowsOfCellsMatrix.get(i)
.get(j), i, j);
}
}
@ -228,15 +227,15 @@ public class TablePageBlock extends AbstractPageBlock {
return new ArrayList<>();
}
Set<Float> uniqueX = new HashSet<>();
Set<Float> uniqueY = new HashSet<>();
Set<Double> uniqueX = new HashSet<>();
Set<Double> uniqueY = new HashSet<>();
cells.stream()
.filter(c -> !c.getTextBlocks().isEmpty() || c.getHeight() > 3 && c.getWidth() > 3)
.forEach(c -> {
uniqueX.add(c.getLeft());
uniqueX.add(c.getRight());
uniqueY.add(c.getBottom());
uniqueY.add(c.getTop());
uniqueX.add(c.getPdfMinX());
uniqueX.add(c.getPdfMaxX());
uniqueY.add(c.getPdfMinY());
uniqueY.add(c.getPdfMaxY());
});
var sortedUniqueX = uniqueX.stream()
@ -248,22 +247,24 @@ public class TablePageBlock extends AbstractPageBlock {
List<List<Cell>> rowsOfCells = new ArrayList<>();
Float prevY = null;
Double prevY = null;
for (Float y : sortedUniqueY) {
for (Double y : sortedUniqueY) {
List<Cell> row = new ArrayList<>();
Float prevX = null;
for (Float x : sortedUniqueX) {
Double prevX = null;
for (Double x : sortedUniqueX) {
if (prevY != null && prevX != null) {
var cellFromGridStructure = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y));
var cellFromGridStructure = new Cell(new Point2D.Double(prevX, prevY), new Point2D.Double(x, y));
if (cellFromGridStructure.hasMinimumSize()) {
cells.stream()
.map(originalCell -> new CellWithIntersection(originalCell, RectangleTransformations.calculateIntersectedArea(cellFromGridStructure, originalCell)))
.map(originalCell -> new CellWithIntersection(originalCell,
RectangleTransformations.calculateIntersectedArea(cellFromGridStructure.getBBoxInitialUserSpace(),
originalCell.getBBoxInitialUserSpace())))
.filter(cellWithIntersection -> cellWithIntersection.intersectedArea > 0)
.filter(cellWithIntersection -> cellWithIntersection.originalCell.getArea() > cellWithIntersection.intersectedArea * CELL_AREA_CONTAINED_THRESHOLD)
.max(Comparator.comparing(CellWithIntersection::intersectedArea))
@ -411,16 +412,6 @@ public class TablePageBlock extends AbstractPageBlock {
}
public Rectangle2D getBBox() {
if (this.bBox == null) {
this.bBox = cells.stream()
.collect(RectangleTransformations.collectBBox());
}
return this.bBox;
}
record CellWithIntersection(Cell originalCell, double intersectedArea) {
}

View File

@ -6,6 +6,7 @@ import java.awt.geom.Rectangle2D;
import org.apache.pdfbox.text.TextPosition;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import lombok.AllArgsConstructor;
import lombok.Builder;
@ -17,11 +18,10 @@ import lombok.SneakyThrows;
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class RedTextPosition {
public class RedTextPosition extends BoundingBox {
private final static int HEIGHT_PADDING = 2;
private Rectangle2D.Float directionAdjustedPosition;
private Rectangle2D initialUserSpacePosition;
public final static int HEIGHT_PADDING = 2;
private Rectangle2D.Float directionAdjustedPosition; // adjusted to text rotation
@JsonIgnore
private int rotation;
@ -63,17 +63,21 @@ public class RedTextPosition {
pos.setFontSizeInPt(textPosition.getFontSizeInPt());
pos.setFontName(textPosition.getFont().getName());
//TODO: There is a mismatch in the java coords of the text and the rulings,
// I guess if we start with the initial user space positions and transform them the same way we do the rulings it would work.
pos.setBBox(new Rectangle2D.Float(textPosition.getX(), textPosition.getY(), textPosition.getWidthDirAdj(), textPosition.getHeight()));
float textHeight = textPosition.getHeight() + HEIGHT_PADDING;
Rectangle2D.Float dirAdjPosition = new Rectangle2D.Float(textPosition.getXDirAdj(),
textPosition.getYDirAdj() - textHeight,
textPosition.getWidthDirAdj(),
textHeight + HEIGHT_PADDING);
textPosition.getYDirAdj() - textHeight,
textPosition.getWidthDirAdj(),
textHeight + HEIGHT_PADDING);
pos.setDirectionAdjustedPosition(dirAdjPosition);
AffineTransform affineTransform = getRotationMatrix(TextDirection.fromDegrees(textPosition.getDir()), textPosition.getPageWidth(), textPosition.getPageHeight());
Rectangle2D initialUserSpacePositionRect = affineTransform.createTransformedShape(dirAdjPosition).getBounds2D();
pos.setInitialUserSpacePosition(initialUserSpacePositionRect);
pos.setBBoxInitialUserSpace(initialUserSpacePositionRect); // These are definitely correct
return pos;
}
@ -97,6 +101,7 @@ public class RedTextPosition {
return transform;
}
@JsonIgnore
public float getXDirAdj() {
@ -107,7 +112,7 @@ public class RedTextPosition {
@JsonIgnore
public float getYDirAdj() {
return this.directionAdjustedPosition.y;
return this.directionAdjustedPosition.y;
}

View File

@ -1,18 +1,13 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import static java.util.stream.Collectors.toSet;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
import lombok.AllArgsConstructor;
@ -31,34 +26,31 @@ public class TextPageBlock extends AbstractPageBlock {
@Builder.Default
private List<TextPositionSequence> sequences = new ArrayList<>();
@JsonIgnore
private int rotation;
@JsonIgnore
private String mostPopularWordFont;
@JsonIgnore
private String mostPopularWordStyle;
@JsonIgnore
private float mostPopularWordFontSize;
@JsonIgnore
private float mostPopularWordHeight;
@JsonIgnore
private float mostPopularWordSpaceWidth;
@JsonIgnore
private float highestFontSize;
@JsonIgnore
private PageBlockType classification;
@JsonIgnore
private boolean toDuplicate;
/**
 * Creates a text block from the given sequences, then derives the frequency-based statistics
 * and the bounding boxes from them.
 * NOTE(review): calculateFrequencyCounters() iterates over the sequences before calculateBBox()
 * performs its null check - confirm that a null list is never passed here.
 *
 * @param sequences the sequences that make up this block; the list is stored as given, not copied
 */
public TextPageBlock(List<TextPositionSequence> sequences) {
this.sequences = sequences;
calculateFrequencyCounters();
calculateBBox();
}
@JsonIgnore
public TextDirection getDir() {
@ -66,34 +58,40 @@ public class TextPageBlock extends AbstractPageBlock {
}
@JsonIgnore
private float getPageHeight() {
private void calculateBBox() {
return sequences.get(0).getPageHeight();
}
@JsonIgnore
private float getPageWidth() {
return sequences.get(0).getPageWidth();
if (sequences == null) {
this.bBox = new Rectangle2D.Double();
this.bBoxInitialUserSpace = new Rectangle2D.Double();
return;
}
setToBBoxOfComponents(sequences);
}
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
if (textBlocksToMerge.isEmpty()) {
throw new IllegalArgumentException("Need to provide at least one TextPageBlock.");
}
if (textBlocksToMerge.stream()
.map(AbstractPageBlock::getPage)
.distinct()
.count() != 1) {
throw new IllegalArgumentException("Cannot merge textBlocks on different pages.");
}
List<TextPositionSequence> sequences = textBlocksToMerge.stream()
.map(TextPageBlock::getSequences)
.flatMap(java.util.Collection::stream)
.toList();
sequences = new ArrayList<>(sequences);
return fromTextPositionSequences(sequences);
return new TextPageBlock(sequences);
}
public static TextPageBlock fromTextPositionSequences(List<TextPositionSequence> wordBlockList) {
TextPageBlock textBlock = null;
private void calculateFrequencyCounters() {
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
@ -101,7 +99,7 @@ public class TextPageBlock extends AbstractPageBlock {
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
for (TextPositionSequence wordBlock : wordBlockList) {
for (TextPositionSequence wordBlock : sequences) {
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
@ -109,172 +107,23 @@ public class TextPageBlock extends AbstractPageBlock {
fontFrequencyCounter.add(wordBlock.getFont());
styleFrequencyCounter.add(wordBlock.getFontStyle());
if (textBlock == null) {
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
wordBlock.getMaxXDirAdj(),
wordBlock.getMinYDirAdj(),
wordBlock.getMaxYDirAdj(),
wordBlockList,
wordBlock.getRotation());
} else {
TextPageBlock spatialEntity = textBlock.union(wordBlock);
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
}
}
if (textBlock != null) {
textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
}
if (textBlock != null
&& textBlock.getSequences() != null
&& textBlock.getSequences()
.stream()
.map(t -> DoubleComparisons.round(t.getMinYDirAdj(), 3))
.collect(toSet()).size() == 1) {
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
}
return textBlock;
}
/**
 * Returns the bounding box of this block, computing it on first access.
 * When no box is cached yet, the bounding boxes of all sequences are collected
 * with RectangleTransformations.collectBBox() and the result is stored in bBox.
 *
 * @return the cached (or freshly computed) bounding box
 */
public Rectangle2D getBBox() {

    if (this.bBox != null) {
        return this.bBox;
    }
    this.bBox = sequences.stream()
            .map(sequence -> sequence.getBoundingBox())
            .collect(RectangleTransformations.collectBBox());
    return this.bBox;
}
/**
 * Returns the minX value of this block in the pdf coordinate system.
 * Note: the origin {0,0} of that system is rotated; the branch taken here is selected
 * by the degrees of getDir():
 * 0 -> LowerLeft
 * 90 -> UpperLeft
 * 180 -> UpperRight
 * 270 -> LowerRight
 *
 * @return the minX value in pdf coordinate system
 */
@JsonIgnore
public float getPdfMinX() {

    var degrees = getDir().getDegrees();
    if (degrees == 90) {
        return minY;
    }
    if (degrees == 180) {
        return getPageWidth() - maxX;
    }
    if (degrees == 270) {
        return getPageWidth() - maxY;
    }
    return minX;
}
/**
 * Returns the maxX value of this block in the pdf coordinate system.
 * Note: the origin {0,0} of that system is rotated; the branch taken here is selected
 * by the degrees of getDir():
 * 0 -> LowerLeft
 * 90 -> UpperLeft
 * 180 -> UpperRight
 * 270 -> LowerRight
 *
 * @return the maxX value in pdf coordinate system
 */
@JsonIgnore
public float getPdfMaxX() {

    var degrees = getDir().getDegrees();
    if (degrees == 90) {
        return maxY;
    }
    if (degrees == 180) {
        return getPageWidth() - minX;
    }
    if (degrees == 270) {
        return getPageWidth() - minY;
    }
    return maxX;
}
/**
 * Returns the minY value of this block in the pdf coordinate system.
 * Note: the origin {0,0} of that system is rotated; the branch taken here is selected
 * by the degrees of getDir():
 * 0 -> LowerLeft
 * 90 -> UpperLeft
 * 180 -> UpperRight
 * 270 -> LowerRight
 *
 * @return the minY value in pdf coordinate system
 */
@JsonIgnore
public float getPdfMinY() {

    var degrees = getDir().getDegrees();
    if (degrees == 90) {
        return minX;
    }
    if (degrees == 180) {
        return maxY;
    }
    if (degrees == 270) {
        return getPageHeight() - maxX;
    }
    return getPageHeight() - maxY;
}
/**
 * Returns the maxY value of this block in the pdf coordinate system.
 * Note: the origin {0,0} of that system is rotated; the branch taken here is selected
 * by the degrees of getDir():
 * 0 -> LowerLeft
 * 90 -> UpperLeft
 * 180 -> UpperRight
 * 270 -> LowerRight
 *
 * @return the maxY value in pdf coordinate system
 */
@JsonIgnore
public float getPdfMaxY() {

    var degrees = getDir().getDegrees();
    if (degrees == 90) {
        return maxX;
    }
    if (degrees == 180) {
        return minY;
    }
    if (degrees == 270) {
        return getPageHeight() - minX;
    }
    return getPageHeight() - minY;
}
public TextPageBlock(float minX, float maxX, float minY, float maxY, List<TextPositionSequence> sequences, int rotation) {
this.minX = minX;
this.maxX = maxX;
this.minY = minY;
this.maxY = maxY;
this.sequences = sequences;
this.rotation = rotation;
setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
setHighestFontSize(fontSizeFrequencyCounter.getHighest());
}
/**
 * Returns a copy of this block that additionally contains the given sequence.
 * The copy is created with copy() and the sequence is added through the copy's
 * add(TextPositionSequence), which appends it and recalculates the copy's
 * frequency counters and bounding box.
 *
 * @param r the sequence to include in the returned block
 * @return the extended copy
 */
public TextPageBlock union(TextPositionSequence r) {

    TextPageBlock union = this.copy();
    // Delegate to add() on the copy only: appending to union.getSequences() as well would
    // insert r twice, and unqualified calculate*() calls would refresh this block instead of the copy.
    union.add(r);
    return union;
}
@ -282,64 +131,32 @@ public class TextPageBlock extends AbstractPageBlock {
/**
 * Returns a copy of this block that additionally contains all sequences of the given block.
 * The copy is created with copy() and the other block is merged through the copy's
 * add(TextPageBlock), which appends its sequences and recalculates the copy's
 * frequency counters and bounding box.
 *
 * @param r the block whose sequences are included in the returned block
 * @return the extended copy
 */
public TextPageBlock union(TextPageBlock r) {

    TextPageBlock union = this.copy();
    // Delegate to add() on the copy only: appending to union.getSequences() as well would
    // insert r's sequences twice, and unqualified calculate*() calls would refresh this block
    // instead of the copy.
    union.add(r);
    return union;
}
/**
 * Merges the given block into this block.
 * Grows the min/max extents so that they include r, appends all of r's sequences
 * and then recalculates the frequency counters and the bounding box.
 *
 * @param r the block whose extents and sequences are merged into this one
 */
public void add(TextPageBlock r) {
if (r.getMinX() < minX) {
minX = r.getMinX();
}
if (r.getMaxX() > maxX) {
maxX = r.getMaxX();
}
if (r.getMinY() < minY) {
minY = r.getMinY();
}
if (r.getMaxY() > maxY) {
maxY = r.getMaxY();
}
sequences.addAll(r.getSequences());
calculateFrequencyCounters();
calculateBBox();
}
/**
 * Adds a single sequence to this block.
 * Grows the min/max extents using the sequence's direction-adjusted coordinates,
 * appends the sequence and then recalculates the frequency counters and the bounding box.
 *
 * @param r the sequence to add
 */
public void add(TextPositionSequence r) {
if (r.getMinXDirAdj() < minX) {
minX = r.getMinXDirAdj();
}
if (r.getMaxXDirAdj() > maxX) {
maxX = r.getMaxXDirAdj();
}
if (r.getMinYDirAdj() < minY) {
minY = r.getMinYDirAdj();
}
if (r.getMaxYDirAdj() > maxY) {
maxY = r.getMaxYDirAdj();
}
sequences.add(r);
calculateFrequencyCounters();
calculateBBox();
}
/**
 * Creates a new block with the same extents and rotation as this one.
 * Note that the sequences list itself is passed on, not copied, so both blocks
 * refer to the same list instance.
 *
 * @return the new block
 */
public TextPageBlock copy() {
return new TextPageBlock(minX, maxX, minY, maxY, sequences, rotation);
}
/**
 * Sets the extents of this block from an origin and a size.
 * Delegates to set(x1, y1, x1 + width, y1 + height).
 *
 * @param x1 x coordinate of the first corner
 * @param y1 y coordinate of the first corner
 * @param width horizontal size added to x1
 * @param height vertical size added to y1
 */
public void resize(float x1, float y1, float width, float height) {
set(x1, y1, x1 + width, y1 + height);
}
/**
 * Sets the extents of this block from two corner points.
 * The corners may be given in any order; min and max are normalized per axis.
 *
 * @param x1 x coordinate of the first corner
 * @param y1 y coordinate of the first corner
 * @param x2 x coordinate of the second corner
 * @param y2 y coordinate of the second corner
 */
public void set(float x1, float y1, float x2, float y2) {
this.minX = Math.min(x1, x2);
this.maxX = Math.max(x1, x2);
this.minY = Math.min(y1, y2);
this.maxY = Math.max(y1, y2);
// NOTE(review): a value-returning statement inside a void method - this line looks like it
// belongs to a copy() variant that copies the sequences list; confirm before relying on it
return new TextPageBlock(new ArrayList<>(sequences));
}

View File

@ -1,8 +1,5 @@
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
@ -10,16 +7,13 @@ import java.util.stream.Collectors;
import org.apache.pdfbox.text.TextPosition;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@ -27,8 +21,8 @@ import lombok.extern.slf4j.Slf4j;
@Builder
@NoArgsConstructor
@AllArgsConstructor
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
public class TextPositionSequence implements CharSequence {
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = false)
public class TextPositionSequence extends BoundingBox implements CharSequence {
public static final int HEIGHT_PADDING = 2;
@ -59,8 +53,7 @@ public class TextPositionSequence implements CharSequence {
this.pageHeight = textPositions.get(0).getPageHeight();
this.pageWidth = textPositions.get(0).getPageWidth();
this.isParagraphStart = isParagraphStart;
setToBBoxOfComponents(getTextPositions());
}
@ -72,6 +65,7 @@ public class TextPositionSequence implements CharSequence {
this.rotation = textPositions.get(0).getRotation();
this.pageHeight = textPositions.get(0).getPageHeight();
this.pageWidth = textPositions.get(0).getPageWidth();
setToBBoxOfComponents(getTextPositions());
}
@ -109,7 +103,7 @@ public class TextPositionSequence implements CharSequence {
textPositionSequence.rotation = rotation;
textPositionSequence.pageHeight = pageHeight;
textPositionSequence.pageWidth = pageWidth;
textPositionSequence.setToBBoxOfComponents(getTextPositions());
return textPositionSequence;
}
@ -139,18 +133,20 @@ public class TextPositionSequence implements CharSequence {
this.rotation = textPositionSequence.getRotation();
this.pageHeight = textPositionSequence.getPageHeight();
this.pageWidth = textPositionSequence.getPageWidth();
setToBBoxOfComponents(getTextPositions());
;
}
/**
 * Appends a text position to this sequence and refreshes the derived state.
 * The position is converted with RedTextPosition.fromTextPosition. Direction, rotation
 * and page dimensions are always taken from the first text position of the sequence,
 * and the bounding box is then reset from all text positions.
 *
 * @param textPosition the text position to append
 */
public void add(TextPosition textPosition) {

    this.textPositions.add(RedTextPosition.fromTextPosition(textPosition));
    // all page-level attributes come from the first element, so look it up once
    RedTextPosition first = textPositions.get(0);
    this.dir = TextDirection.fromDegrees(first.getDir());
    this.rotation = first.getRotation();
    this.pageHeight = first.getPageHeight();
    this.pageWidth = first.getPageWidth();
    setToBBoxOfComponents(getTextPositions());
}
@ -222,18 +218,6 @@ public class TextPositionSequence implements CharSequence {
}
/**
 * Returns the height of this sequence, i.e. the difference between
 * getMaxYDirAdj() and getMinYDirAdj().
 *
 * @return the direction-adjusted height
 */
public float getHeight() {

    var upper = getMaxYDirAdj();
    var lower = getMinYDirAdj();
    return upper - lower;
}
/**
 * Returns the width of this sequence, i.e. the difference between
 * getMaxXDirAdj() and getMinXDirAdj().
 *
 * @return the direction-adjusted width
 */
public float getWidth() {

    var right = getMaxXDirAdj();
    var left = getMinXDirAdj();
    return right - left;
}
public String getFont() {
if (textPositions.get(0).getFontName() == null) {
@ -273,62 +257,5 @@ public class TextPositionSequence implements CharSequence {
return textPositions.get(0).getWidthOfSpace();
}
/**
 * Returns the bounding box of the word in the Pdf Coordinate System, where {0,0} is
 * rotated with the page rotation:
 * 0 -> LowerLeft
 * 90 -> UpperLeft
 * 180 -> UpperRight
 * 270 -> LowerRight
 * <p>
 * The box spans from the first text position (lowered by HEIGHT_PADDING) to the end of the
 * last text position (raised by the text height plus HEIGHT_PADDING). Both corners are in
 * direction-adjusted coordinates and are mapped through a transform that depends on the
 * text direction.
 *
 * @return bounding box of the word in Pdf Coordinate System
 */
@SneakyThrows
public Rectangle getRectangle() {

    log.debug("Page: '{}', Word: '{}', Rotation: '{}', textRotation {}", page, this, rotation, dir);

    float textHeight = getTextHeight();
    RedTextPosition leading = textPositions.get(0);
    RedTextPosition trailing = textPositions.get(textPositions.size() - 1);

    Point2D lowerLeftDirAdj = new Point2D.Double(leading.getXDirAdj(), leading.getYDirAdj() - HEIGHT_PADDING);
    Point2D upperRightDirAdj = new Point2D.Double(trailing.getXDirAdj() + trailing.getWidthDirAdj(), trailing.getYDirAdj() + textHeight + HEIGHT_PADDING);

    // The three direction cases only differ in the rotation pivot and the vertical shift.
    double pivotX;
    double pivotY;
    double shiftY;
    if (dir == TextDirection.ZERO || dir == TextDirection.HALF_CIRCLE) {
        pivotX = pageWidth / 2f;
        pivotY = pageHeight / 2f;
        shiftY = pageHeight + textHeight;
    } else if (dir == TextDirection.QUARTER_CIRCLE) {
        pivotX = pageWidth / 2f;
        pivotY = pageWidth / 2f;
        shiftY = pageWidth + textHeight;
    } else {
        pivotX = pageHeight / 2f;
        pivotY = pageHeight / 2f;
        shiftY = pageWidth + textHeight;
    }

    AffineTransform toPdfSpace = new AffineTransform();
    toPdfSpace.rotate(dir.getRadians(), pivotX, pivotY);
    toPdfSpace.translate(0f, shiftY);
    toPdfSpace.scale(1., -1.);

    Point2D lowerLeft = toPdfSpace.transform(lowerLeftDirAdj, null);
    Point2D upperRight = toPdfSpace.transform(upperRightDirAdj, null);

    float width = (float) (upperRight.getX() - lowerLeft.getX());
    float height = (float) (upperRight.getY() - lowerLeft.getY());
    return new Rectangle( //
            new Point((float) lowerLeft.getX(), (float) lowerLeft.getY()),
            width,
            height,
            page);
}
/**
 * Returns the bounding box of this sequence, collected from the initial user space
 * positions of all its text positions via RectangleTransformations.collectBBox().
 * The box is recomputed on every call.
 *
 * @return the collected bounding box
 */
public Rectangle2D getBoundingBox() {

    return getTextPositions().stream()
            .map(position -> position.getInitialUserSpacePosition())
            .collect(RectangleTransformations.collectBBox());
}
}

View File

@ -160,11 +160,12 @@ public class BodyTextFrameService {
continue;
}
float approxLineCount = PositionUtils.getApproxLineCount(textBlock);
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)
&& approxLineCount < approximateHeaderLineCount
&& textBlock.getMaxY() >= page.getPageHeight() - (page.getPageHeight() / 10)
|| !layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD) && approxLineCount < approximateHeaderLineCount) {
double approxLineCount = PositionUtils.getApproxLineCount(textBlock);
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD) //
&& approxLineCount < approximateHeaderLineCount //
&& textBlock.getMaxY() >= page.getPageHeight() - (page.getPageHeight() / 10)//
|| !layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD) //
&& approxLineCount < approximateHeaderLineCount) {
continue;
}
@ -190,9 +191,9 @@ public class BodyTextFrameService {
}
}
}
return new Rectangle(new Point(expansionsRectangle.minX, expansionsRectangle.minY),
expansionsRectangle.maxX - expansionsRectangle.minX,
expansionsRectangle.maxY - expansionsRectangle.minY,
return new Rectangle(new Point((float) expansionsRectangle.minX, (float) expansionsRectangle.minY),
(float) (expansionsRectangle.maxX - expansionsRectangle.minX),
(float) (expansionsRectangle.maxY - expansionsRectangle.minY),
0);
}
@ -231,10 +232,10 @@ public class BodyTextFrameService {
/**
 * Mutable holder for the extents of the body text frame.
 * The minimum fields start at 10000 and the maximum fields at -100
 * (presumably sentinels that the first real coordinate replaces - confirm against the caller).
 */
private class BodyTextFrameExpansionsRectangle {
float minX = 10000;
float maxX = -100;
float minY = 10000;
float maxY = -100;
double minX = 10000;
double maxX = -100;
double minY = 10000;
double maxY = -100;
}

View File

@ -69,7 +69,7 @@ public class GapDetectionService {
/**
 * Converts a text position sequence into a rectangle and passes it through mirrorY.
 *
 * @param textPosition the sequence to convert
 * @return the result of mirrorY applied to the sequence's rectangle
 */
private static Rectangle2D toRectangle2D(TextPositionSequence textPosition) {
return mirrorY(RectangleTransformations.toRectangle2D(textPosition.getRectangle()));
return mirrorY(textPosition.getBBox());
}

View File

@ -180,7 +180,7 @@ public class LineDetectionService {
/**
 * Computes one bounding box for a list of text position sequences by handing the
 * rectangles of all sequences to RectangleTransformations.
 *
 * @param textPositionSequences the sequences to enclose
 * @return the bounding box returned by RectangleTransformations
 */
private Rectangle2D textPositionBBox(List<TextPositionSequence> textPositionSequences) {
return RectangleTransformations.rectangleBBox(textPositionSequences.stream().map(TextPositionSequence::getRectangle).toList());
return RectangleTransformations.rectangle2DBBox(textPositionSequences.stream().map(TextPositionSequence::getBBox).toList());
}

View File

@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services;
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.X_FIRST_RULING_COMPARATOR;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
@ -12,9 +13,9 @@ import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.utils.UnionFind;
import lombok.RequiredArgsConstructor;
@ -51,37 +52,37 @@ public class RulingCleaningService {
/**
 * Collapses overlapping rulings into single rulings.
 * Vertical and horizontal lines are handled separately but in the same way: every ruling is
 * mapped to its overlap rectangle, duplicates are dropped, overlapping rectangles are grouped,
 * and each group is replaced by one ruling through the center of the group's bounding box
 * (x-centered for vertical lines, y-centered for horizontal lines).
 *
 * @param rulings the vertical and horizontal rulings to clean
 * @return a new Rulings instance holding the cleaned vertical and horizontal rulings
 */
private Rulings cleanRulings(Rulings rulings) {
List<List<Rectangle>> groupedOverlappingVerticalRectangles = groupOverlappingRectangles(rulings.verticalLines.stream()
.map(RulingCleaningService::getOverlapRectangle)
.distinct()
.toList());
List<Ruling> cleanedVerticalRulings = groupedOverlappingVerticalRectangles.stream()
.map(rectList -> getXCenteredRuling(Rectangle.boundingBoxOf(rectList)))
.toList();
List<List<Rectangle>> groupedOverlappingHorizontalRectangles = groupOverlappingRectangles(rulings.horizontalLines.stream()
List<List<Rectangle2D>> groupedOverlappingVerticalRectangles = groupOverlappingRectangles(rulings.verticalLines.stream()
.map(RulingCleaningService::getOverlapRectangle)
.distinct()
.toList());
List<Ruling> cleanedVerticalRulings = groupedOverlappingVerticalRectangles.stream()
.map(rectList -> getXCenteredRuling(RectangleTransformations.rectangle2DBBox(rectList)))
.toList();
List<List<Rectangle2D>> groupedOverlappingHorizontalRectangles = groupOverlappingRectangles(rulings.horizontalLines.stream()
.map(RulingCleaningService::getOverlapRectangle)
.distinct()
.toList());
List<Ruling> cleanedHorizontalRulings = groupedOverlappingHorizontalRectangles.stream()
.map(rectList -> getYCenteredRuling(Rectangle.boundingBoxOf(rectList)))
.map(rectList -> getYCenteredRuling(RectangleTransformations.rectangle2DBBox(rectList)))
.collect(Collectors.toList());
return new Rulings(cleanedVerticalRulings, cleanedHorizontalRulings);
}
private List<List<Rectangle>> groupOverlappingRectangles(List<Rectangle> rectangles) {
private List<List<Rectangle2D>> groupOverlappingRectangles(List<Rectangle2D> rectangles) {
UnionFind<Rectangle> unionFind = new UnionFind<>();
UnionFind<Rectangle2D> unionFind = new UnionFind<>();
for (int i = 0; i < rectangles.size(); i++) {
for (int j = i + 1; j < rectangles.size(); j++) {
Rectangle rectangle1 = rectangles.get(i);
Rectangle rectangle2 = rectangles.get(j);
Rectangle2D rectangle1 = rectangles.get(i);
Rectangle2D rectangle2 = rectangles.get(j);
// we can stop early when we are too far off because of x-y-sorting
if(rectangle1.getRight() < rectangle2.getLeft() && rectangle1.getBottom() < rectangle2.getTop()) {
if (rectangle1.getMaxX() < rectangle2.getMinX() && rectangle1.getMaxY() < rectangle2.getMinY()) {
break;
}
@ -91,16 +92,16 @@ public class RulingCleaningService {
}
}
Map<Rectangle, List<Rectangle>> groups = new HashMap<>();
for (Rectangle rectangle : rectangles) {
Rectangle root = unionFind.find(rectangle);
Map<Rectangle2D, List<Rectangle2D>> groups = new HashMap<>();
for (Rectangle2D rectangle : rectangles) {
Rectangle2D root = unionFind.find(rectangle);
groups.computeIfAbsent(root, k -> new ArrayList<>()).add(rectangle);
}
return new ArrayList<>(groups.values());
}
private static Rectangle getOverlapRectangle(Ruling ruling) {
private static Rectangle2D getOverlapRectangle(Ruling ruling) {
float top;
float left;
@ -123,34 +124,34 @@ public class RulingCleaningService {
}
if (ruling.isHorizontal()) {
return new Rectangle(top - THRESHOLD_Y_HORIZONTAL, left - THRESHOLD_X_HORIZONTAL, w + 2 * THRESHOLD_X_HORIZONTAL, h + 2 * THRESHOLD_Y_HORIZONTAL);
return new Rectangle2D.Double(top - THRESHOLD_Y_HORIZONTAL, left - THRESHOLD_X_HORIZONTAL, w + 2 * THRESHOLD_X_HORIZONTAL, h + 2 * THRESHOLD_Y_HORIZONTAL);
} else {
return new Rectangle(top - THRESHOLD_Y_VERTICAL, left - THRESHOLD_X_VERTICAL, w + 2 * THRESHOLD_X_VERTICAL, h + 2 * THRESHOLD_Y_VERTICAL);
return new Rectangle2D.Double(top - THRESHOLD_Y_VERTICAL, left - THRESHOLD_X_VERTICAL, w + 2 * THRESHOLD_X_VERTICAL, h + 2 * THRESHOLD_Y_VERTICAL);
}
}
/**
 * Builds a ruling that runs through the horizontal center of the given rectangle.
 * Both end points share the rectangle's center x; THRESHOLD_Y_VERTICAL is added to the
 * first y value and subtracted from the second.
 *
 * @param rectangle the rectangle to derive the ruling from
 * @return the ruling between the two computed points
 */
public static Ruling getXCenteredRuling(Rectangle rectangle) {
public static Ruling getXCenteredRuling(Rectangle2D rectangle) {
float x = (float) rectangle.getCenterX();
float y1 = rectangle.getTop();
float y2 = rectangle.getBottom();
double x = rectangle.getCenterX();
double y1 = rectangle.getMinY();
double y2 = rectangle.getMaxY();
Point2D point1 = new Point2D.Float(x, y1 + THRESHOLD_Y_VERTICAL);
Point2D point2 = new Point2D.Float(x, y2 - THRESHOLD_Y_VERTICAL);
Point2D point1 = new Point2D.Double(x, y1 + THRESHOLD_Y_VERTICAL);
Point2D point2 = new Point2D.Double(x, y2 - THRESHOLD_Y_VERTICAL);
return new Ruling(point1, point2);
}
/**
 * Builds a ruling that runs through the vertical center of the given rectangle.
 * Both end points share the rectangle's center y; THRESHOLD_X_HORIZONTAL is added to the
 * first x value and subtracted from the second.
 *
 * @param rectangle the rectangle to derive the ruling from
 * @return the ruling between the two computed points
 */
public static Ruling getYCenteredRuling(Rectangle rectangle) {
public static Ruling getYCenteredRuling(Rectangle2D rectangle) {
float x1 = rectangle.getLeft();
float x2 = rectangle.getRight();
float y = (float) rectangle.getCenterY();
double x1 = rectangle.getX();
double x2 = rectangle.getMaxX();
double y = rectangle.getCenterY();
Point2D point1 = new Point2D.Float(x1 + THRESHOLD_X_HORIZONTAL, y);
Point2D point2 = new Point2D.Float(x2 - THRESHOLD_X_HORIZONTAL, y);
Point2D point1 = new Point2D.Double(x1 + THRESHOLD_X_HORIZONTAL, y);
Point2D point2 = new Point2D.Double(x2 - THRESHOLD_X_HORIZONTAL, y);
return new Ruling(point1, point2);
}

View File

@ -71,7 +71,8 @@ public class SectionsBuilderService {
chunkBlockList.add(chunkBlock);
chunkWords = new ArrayList<>();
if (!chunkBlock.getTables().isEmpty()) {
previousTable = chunkBlock.getTables().get(chunkBlock.getTables().size() - 1);
previousTable = chunkBlock.getTables()
.get(chunkBlock.getTables().size() - 1);
}
}
if (current instanceof TablePageBlock table) {
@ -106,11 +107,12 @@ public class SectionsBuilderService {
List<ClassificationSection> sections = new ArrayList<>();
for (var page : document.getPages()) {
page.getTextBlocks().forEach(block -> {
block.setPage(page.getPageNumber());
var section = buildTextBlock(List.of(block), Strings.EMPTY);
sections.add(section);
});
page.getTextBlocks()
.forEach(block -> {
block.setPage(page.getPageNumber());
var section = buildTextBlock(List.of(block), Strings.EMPTY);
sections.add(section);
});
}
document.setSections(sections);
}
@ -155,10 +157,10 @@ public class SectionsBuilderService {
}
}
for (ClassificationSection section : sectionsOnPage) {
Float xMin = null;
Float yMin = null;
Float xMax = null;
Float yMax = null;
Double xMin = null;
Double yMin = null;
Double xMax = null;
Double yMax = null;
for (AbstractPageBlock abs : section.getPageBlocks()) {
if (abs.getPage() != page.getPageNumber()) {
@ -202,8 +204,14 @@ public class SectionsBuilderService {
log.debug("Image position x: {}, y: {}", image.getPosition().getX(), image.getPosition().getY());
log.debug("Paragraph position xMin: {}, xMax: {}, yMin: {}, yMax: {}", xMin, xMax, yMin, yMax);
if (xMin != null && xMax != null && yMin != null && yMax != null && image.getPosition().getX() >= xMin && image.getPosition()
.getX() <= xMax && image.getPosition().getY() >= yMin && image.getPosition().getY() <= yMax) {
if (xMin != null
&& xMax != null
&& yMin != null
&& yMax != null
&& image.getPosition().getX() >= xMin
&& image.getPosition().getX() <= xMax
&& image.getPosition().getY() >= yMin
&& image.getPosition().getY() <= yMax) {
section.getImages().add(image);
image.setAppendedToSection(true);
break;
@ -226,17 +234,26 @@ public class SectionsBuilderService {
List<Cell> previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable);
List<Cell> tableNonHeaderRow = getRowWithNonHeaderCells(currentTable);
// Allow merging of tables if header row is separated from first logical non-header row
if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows().get(0).size() == tableNonHeaderRow.size()) {
previousTableNonHeaderRow = previousTable.getRows().get(0).stream().map(cell -> {
Cell fakeCell = new Cell(cell.getPoints()[0], cell.getPoints()[2]);
fakeCell.setHeaderCells(Collections.singletonList(cell));
return fakeCell;
}).collect(Collectors.toList());
if (previousTableNonHeaderRow.isEmpty()
&& previousTable.getRowCount() == 1
&& previousTable.getRows()
.get(0).size() == tableNonHeaderRow.size()) {
previousTableNonHeaderRow = previousTable.getRows()
.get(0)
.stream()
.map(cell -> {
Cell fakeCell = Cell.copy(cell);
fakeCell.setHeaderCells(Collections.singletonList(cell));
return fakeCell;
})
.collect(Collectors.toList());
}
if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
List<Cell> row = currentTable.getRows().get(i);
if (row.size() == tableNonHeaderRow.size() && row.stream().allMatch(cell -> cell.getHeaderCells().isEmpty())) {
List<Cell> row = currentTable.getRows()
.get(i);
if (row.size() == tableNonHeaderRow.size() && row.stream()
.allMatch(cell -> cell.getHeaderCells().isEmpty())) {
for (int j = 0; j < row.size(); j++) {
row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells());
}
@ -279,7 +296,11 @@ public class SectionsBuilderService {
/**
 * Reports whether the table carries no header information at all.
 *
 * @param table the table to inspect
 * @return true if no cell in any row has a non-empty header cell list
 */
private boolean hasInvalidHeaderInformation(TablePageBlock table) {
return table.getRows().stream().flatMap(row -> row.stream().filter(cell -> !cell.getHeaderCells().isEmpty())).findAny().isEmpty();
return table.getRows()
.stream()
.flatMap(row -> row.stream()
.filter(cell -> !cell.getHeaderCells().isEmpty()))
.findAny().isEmpty();
}
@ -287,7 +308,8 @@ public class SectionsBuilderService {
private List<Cell> getRowWithNonHeaderCells(TablePageBlock table) {
for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
List<Cell> row = table.getRows().get(i);
List<Cell> row = table.getRows()
.get(i);
if (row.size() == 1) {
continue;
}

View File

@ -3,6 +3,8 @@ package com.knecon.fforesight.service.layoutparser.processor.services;
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.CELL_SIZE_COMPARATOR;
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.RECTANGLE_SIZE_COMPARATOR;
import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
@ -11,22 +13,26 @@ import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangularIntersectionFinder;
import com.knecon.fforesight.service.layoutparser.processor.utils.SpreadsheetFinder;
import lombok.SneakyThrows;
@Service
public class TableExtractionService {
private static final int MAX_TABLE_CONTAINED_CELLS_WITH_TEXT = 1;
private static final double TEXT_BLOCK_CONTAINMENT_TOLERANCE = 0.02;
private static final double TABLE_UNIFORMITY_THRESHOLD = 0.7;
@ -59,20 +65,20 @@ public class TableExtractionService {
}
}
var cells = new ArrayList<>(new HashSet<>(emptyCells));
DoubleComparisons.sort(cells, Rectangle.ILL_DEFINED_ORDER);
List<Cell> cells = new ArrayList<>(new HashSet<>(emptyCells));
DoubleComparisons.sort(cells, BoundingBox.ILL_DEFINED_ORDER);
List<Rectangle> spreadsheetAreas = SpreadsheetFinder.findSpreadsheetsFromCells(cells);
List<Rectangle2D> spreadsheetAreas = SpreadsheetFinder.findSpreadsheetsFromCells(cells);
// sort spreadsheetAreas by size (height * width) ascending so that cells are placed in the smallest tables first
// this way no cell duplication occurs when tables are contained in other tables and only the most inner table contains the cells
spreadsheetAreas.sort(RECTANGLE_SIZE_COMPARATOR);
List<TablePageBlock> tables = new ArrayList<>();
for (Rectangle area : spreadsheetAreas) {
for (Rectangle2D area : spreadsheetAreas) {
List<Cell> containedCells = new ArrayList<>();
for (Cell c : cells) {
if (c.hasMinimumSize() && area.contains(c)) {
if (c.hasMinimumSize() && area.contains(c.getBBoxInitialUserSpace())) {
containedCells.add(c);
}
}
@ -83,7 +89,7 @@ public class TableExtractionService {
// verify if table would contain fewer cells with text than the threshold allows
if (containedCellsWithText.size() >= MAX_TABLE_CONTAINED_CELLS_WITH_TEXT && checkIfTableCellsAreUniform(containedCells)) {
tables.add(new TablePageBlock(containedCells, area, page.getRotation()));
tables.add(new TablePageBlock(containedCells, page.getRotation()));
cells.removeAll(containedCells);
}
}
@ -92,7 +98,7 @@ public class TableExtractionService {
int position = -1;
for (AbstractPageBlock pageBlock : page.getTextBlocks()) {
if (pageBlock instanceof TextPageBlock ? table.containsBlock((TextPageBlock) pageBlock) : table.contains(pageBlock) && position == -1) {
if (pageBlock instanceof TextPageBlock ? table.contains(pageBlock) : table.contains(pageBlock) && position == -1) {
position = page.getTextBlocks().indexOf(pageBlock);
}
}
@ -118,7 +124,7 @@ public class TableExtractionService {
}
Map<Long, List<Long>> cellsGroupedByRoundedWidth = containedCells.stream()
.map(Rectangle::getWidth)
.map(BoundingBox::getWidth)
.map(size -> Math.round(size / 10.0) * 10)
.collect(Collectors.groupingBy(Long::longValue));
@ -128,26 +134,25 @@ public class TableExtractionService {
/**
 * Checks whether the given cell contains the given text block.
 *
 * @param cell the table cell to test
 * @param textBlock the text block whose bounding box is compared against the cell
 * @return false if the cell is empty or the block's bounding box has a non-positive width or
 * height; otherwise whether the block's bounding box lies inside the cell within the tolerance
 */
private boolean doesCellContainTextBlock(Cell cell, TextPageBlock textBlock) {
double x = textBlock.getBBox().getX();
double y = textBlock.getBBox().getY();
double w = textBlock.getBBox().getWidth();
double h = textBlock.getBBox().getHeight();
if (cell.isEmpty() || w <= 0 || h <= 0) {
return false;
}
double x0 = cell.getX();
double y0 = cell.getY();
// the tolerance is relative to the text block's own width and height
double xTol = TEXT_BLOCK_CONTAINMENT_TOLERANCE * w;
double yTol = TEXT_BLOCK_CONTAINMENT_TOLERANCE * h;
// the lower edges may undershoot the cell by one tolerance, the upper edges may overshoot by two
return (x >= x0 - xTol && y >= y0 - yTol && (x + w) <= x0 + cell.getWidth() + 2 * xTol && (y + h) <= y0 + cell.getHeight() + 2 * yTol);
return cell.contains(textBlock, RedTextPosition.HEIGHT_PADDING);
}
/**
 * Builds cells from the rectangles formed by the given ruling lines.
 * The rectangles are found with RectangularIntersectionFinder.find and each one is wrapped in a Cell.
 *
 * @param horizontalRulingLines the horizontal rulings
 * @param verticalRulingLines the vertical rulings
 * @param pageInformation page data used to build the coordinate transform; its rotation selects the padding offset
 * @return the list of cells, one per rectangle found
 */
public static List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
@SneakyThrows
public static List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines, PageInformation pageInformation) {
AffineTransform affineTransform = CoordinateTransforms.calculateInitialUserSpaceCoordsToImageCoords(pageInformation, 1);
// shift the transform by the text height padding; the axis and sign depend on the page rotation
switch (pageInformation.rotationDegrees()) {
case 90 -> affineTransform.translate(RedTextPosition.HEIGHT_PADDING, 0); //although this is wrong, our text coordinates are wrong as well
case 180 -> affineTransform.translate(0, RedTextPosition.HEIGHT_PADDING);
case 270 -> affineTransform.translate(-RedTextPosition.HEIGHT_PADDING, 0);
default -> affineTransform.translate(0, -RedTextPosition.HEIGHT_PADDING);
}
return RectangularIntersectionFinder.find(horizontalRulingLines, verticalRulingLines)
.stream()
.map(Cell::new)
.map(rect -> new Cell(rect, affineTransform))
.collect(Collectors.toList());
}

View File

@ -31,13 +31,13 @@ public class TextRulingsClassifier {
private static void handleVerticalText(CleanRulings cleanRulings, TextPositionSequence word) {
float lowerY = (float) (word.getBoundingBox().getMinY() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
float upperY = (float) (word.getBoundingBox().getMaxY() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
float lowerY = (float) (word.getBBox().getMinY() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
float upperY = (float) (word.getBBox().getMaxY() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
float strikethroughCenterX = (float) word.getBoundingBox().getCenterX();
float strikethroughCenterX = (float) word.getBBox().getCenterX();
float strikethroughBoxHeight = (float) ((word.getHeight() * STRIKETHROUGH_ZONE) / 2);
float underlineCenterX = (float) (word.getDir().equals(TextDirection.QUARTER_CIRCLE) ? word.getBoundingBox().getMaxX() : word.getBoundingBox().getMinX());
float underlineCenterX = (float) (word.getDir().equals(TextDirection.QUARTER_CIRCLE) ? word.getBBox().getMaxX() : word.getBBox().getMinX());
float underlineBoxHeight = (float) ((word.getHeight() * UNDERLINE_ZONE) / 2);
float leftX = Math.min(underlineCenterX - underlineBoxHeight, strikethroughCenterX - strikethroughBoxHeight);
@ -65,13 +65,13 @@ public class TextRulingsClassifier {
private static void handleHorizontalText(CleanRulings cleanRulings, TextPositionSequence word) {
float leftX = (float) (word.getBoundingBox().getMinX() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
float rightX = (float) (word.getBoundingBox().getMaxX() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
float leftX = (float) (word.getBBox().getMinX() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
float rightX = (float) (word.getBBox().getMaxX() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
float strikethroughCenterY = (float) word.getBoundingBox().getCenterY();
float strikethroughCenterY = (float) word.getBBox().getCenterY();
float strikethroughBoxHeight = (float) ((word.getHeight() * STRIKETHROUGH_ZONE) / 2);
float underlineCenterY = (float) (word.getDir().equals(TextDirection.ZERO) ? word.getBoundingBox().getMinY() : word.getBoundingBox().getMaxY());
float underlineCenterY = (float) (word.getDir().equals(TextDirection.ZERO) ? word.getBBox().getMinY() : word.getBBox().getMaxY());
float underlineBoxHeight = (float) ((word.getHeight() * UNDERLINE_ZONE) / 2);
float lowerY = Math.min(underlineCenterY - underlineBoxHeight, strikethroughCenterY - strikethroughBoxHeight);

View File

@ -1,7 +1,5 @@
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
import static java.util.stream.Collectors.toSet;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
@ -15,14 +13,10 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
import lombok.RequiredArgsConstructor;
@ -62,12 +56,14 @@ public class DocstrumBlockificationService {
var classificationPage = new ClassificationPage(pageBlocks);
classificationPage.setCleanRulings(rulings);
mergeIntersectingBlocks(classificationPage, 0, 0);
mergeIntersectingBlocks(classificationPage, usedRulings, 0, 0);
combineBlocks(classificationPage);
if (layoutParsingType == LayoutParsingType.DOCUMINE || layoutParsingType == LayoutParsingType.REDACT_MANAGER) {
combineBlocks(classificationPage);
}
if (layoutParsingType == LayoutParsingType.CLARIFYND) {
mergeIntersectingBlocks(classificationPage, 0, 6.5f);
mergeIntersectingBlocks(classificationPage, usedRulings, 0, 6.5f);
}
return classificationPage;
@ -125,7 +121,7 @@ public class DocstrumBlockificationService {
if (previous != null && !previous.getSequences().isEmpty()) {
if (current.getDir() != previous.getDir() || usedRulings.lineBetween(current.getBBox(), previous.getBBox())) {
if (current.getDir() != previous.getDir() || usedRulings.lineBetween(current, previous)) {
previous = current;
continue;
}
@ -135,7 +131,7 @@ public class DocstrumBlockificationService {
continue;
}
if (previous.almostIntersects(current, 0, 0)) {
if (previous.intersects(current)) {
previous = combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
continue;
}
@ -154,7 +150,7 @@ public class DocstrumBlockificationService {
previous = current;
}
mergeIntersectingBlocks(page, 0, 6.5f);
mergeIntersectingBlocks(page, usedRulings, 0, 6.5f);
}
@ -235,7 +231,7 @@ public class DocstrumBlockificationService {
}
public void mergeIntersectingBlocks(ClassificationPage page, float xThreshold, float yThreshold) {
public void mergeIntersectingBlocks(ClassificationPage page, CleanRulings usedRulings, float xThreshold, float yThreshold) {
var blocks = page.getTextBlocks();
ListIterator<AbstractPageBlock> itty = blocks.listIterator();
@ -264,11 +260,11 @@ public class DocstrumBlockificationService {
TextPageBlock inner = (TextPageBlock) blocks.get(i);
if (page.getCleanRulings().lineBetween(inner.getBBox(), current.getBBox())) {
if (usedRulings.lineBetween(current, blocks.get(i))) {
continue;
}
if (current.getDir() == inner.getDir() && current.almostIntersects(inner, yThreshold, xThreshold)) {
if (current.getDir() == inner.getDir() && current.intersects(inner, yThreshold, xThreshold)) {
boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();
current.getSequences().addAll(inner.getSequences());
@ -289,174 +285,9 @@ public class DocstrumBlockificationService {
}
/**
 * Groups the given words, in list order, into text blocks. A new block is started whenever the
 * text direction differs from the previous word or {@code isSplitByRuling} reports a ruling
 * between the chunk collected so far and the next word.
 *
 * @param textPositions         the words to group; processed in list order
 * @param horizontalRulingLines horizontal rulings checked as separators
 * @param verticalRulingLines   vertical rulings checked as separators
 * @return the text blocks in the order they were built; empty if no words were given
 */
public List<AbstractPageBlock> splitZonesAtRulings(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {

    int indexOnPage = 0;
    List<TextPositionSequence> chunkWords = new ArrayList<>();
    List<AbstractPageBlock> chunkBlockList = new ArrayList<>();

    // Direction-adjusted bounds of the current chunk. They start as an "empty" range so the first
    // word always defines them; the former 1000/0 sentinels clipped the bounds for coordinates
    // above 1000 or below 0.
    float minX = Float.MAX_VALUE;
    float maxX = -Float.MAX_VALUE;
    float minY = Float.MAX_VALUE;
    float maxY = -Float.MAX_VALUE;
    TextPositionSequence prev = null;

    for (TextPositionSequence word : textPositions) {

        // The chunk bounds are only meaningful once the chunk holds at least one word.
        boolean isSplitByRuling = prev != null && isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
        boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());

        if (splitByDir || isSplitByRuling) {
            // Close the current chunk and start a fresh one with empty bounds.
            TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
            indexOnPage++;
            chunkBlockList.add(cb1);
            chunkWords = new ArrayList<>();

            minX = Float.MAX_VALUE;
            maxX = -Float.MAX_VALUE;
            minY = Float.MAX_VALUE;
            maxY = -Float.MAX_VALUE;
        }

        chunkWords.add(word);
        prev = word;

        // Grow the chunk bounds to include the current word.
        if (word.getMinXDirAdj() < minX) {
            minX = word.getMinXDirAdj();
        }
        if (word.getMaxXDirAdj() > maxX) {
            maxX = word.getMaxXDirAdj();
        }
        if (word.getMinYDirAdj() < minY) {
            minY = word.getMinYDirAdj();
        }
        if (word.getMaxYDirAdj() > maxY) {
            maxY = word.getMaxYDirAdj();
        }
    }

    // buildTextBlock yields null for an empty chunk, i.e. when no words were given at all.
    TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
    if (cb1 != null) {
        chunkBlockList.add(cb1);
    }

    return chunkBlockList;
}
/**
 * Checks whether two values are closer to each other than {@code THRESHOLD}.
 *
 * @param f1 first value
 * @param f2 second value
 * @return {@code true} if the absolute difference is strictly below {@code THRESHOLD}
 */
private boolean equalsWithThreshold(float f1, float f2) {

    float distance = Math.abs(f1 - f2);
    return distance < THRESHOLD;
}
/**
 * Creates a text block from the given words. The block is created from the first word's
 * direction-adjusted bounds and rotation, then resized to its union with every following word.
 * The most frequent font, style, font size, text height and space width of the words, as well as
 * the highest font size, are stored on the block. If all sequences of the block share the same
 * minYDirAdj (rounded to 3 decimals), they are sorted by minXDirAdj.
 *
 * @param wordBlockList the words of the block; the list itself is handed to the created block
 * @param indexOnPage   not used by this method
 * @return the created block, or {@code null} if {@code wordBlockList} is empty
 */
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {

    FloatFrequencyCounter heightCounter = new FloatFrequencyCounter();
    FloatFrequencyCounter sizeCounter = new FloatFrequencyCounter();
    FloatFrequencyCounter spaceCounter = new FloatFrequencyCounter();
    StringFrequencyCounter fontCounter = new StringFrequencyCounter();
    StringFrequencyCounter styleCounter = new StringFrequencyCounter();

    TextPageBlock block = null;
    for (TextPositionSequence word : wordBlockList) {

        heightCounter.add(word.getTextHeight());
        sizeCounter.add(word.getFontSize());
        spaceCounter.add(word.getSpaceWidth());
        fontCounter.add(word.getFont());
        styleCounter.add(word.getFontStyle());

        if (block != null) {
            // Every further word enlarges the block to the union of both.
            TextPageBlock merged = block.union(word);
            block.resize(merged.getMinX(), merged.getMinY(), merged.getWidth(), merged.getHeight());
            continue;
        }

        // The first word defines the initial bounds and the rotation.
        block = new TextPageBlock(word.getMinXDirAdj(), word.getMaxXDirAdj(), word.getMinYDirAdj(), word.getMaxYDirAdj(), wordBlockList, word.getRotation());
    }

    if (block == null) {
        return null;
    }

    block.setMostPopularWordFont(fontCounter.getMostPopular());
    block.setMostPopularWordStyle(styleCounter.getMostPopular());
    block.setMostPopularWordFontSize(sizeCounter.getMostPopular());
    block.setMostPopularWordHeight(heightCounter.getMostPopular());
    block.setMostPopularWordSpaceWidth(spaceCounter.getMostPopular());
    block.setHighestFontSize(sizeCounter.getHighest());

    // Exactly one distinct (rounded) minY: order the sequences from left to right.
    List<TextPositionSequence> sequences = block.getSequences();
    if (sequences != null && sequences.stream()
            .map(sequence -> round(sequence.getMinYDirAdj(), 3))
            .distinct()
            .count() == 1) {
        sequences.sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
    }

    return block;
}
/**
 * Checks whether a ruling lies between the current chunk bounds and the given word. Four
 * connecting segments are tested in turn, each ending at the word's minXDirAdj:
 * (maxX, minY) to the word's minY against the vertical rulings, (minX, minY) to the word's maxY
 * against the horizontal rulings, (maxX, minY) to the word's minY against the horizontal rulings
 * and (minX, minY) to the word's maxY against the vertical rulings.
 *
 * @param minX                  minimum x of the current chunk
 * @param minY                  minimum y of the current chunk
 * @param maxX                  maximum x of the current chunk
 * @param maxY                  maximum y of the current chunk; not used by this method
 * @param word                  the word that would be appended to the chunk
 * @param horizontalRulingLines horizontal rulings to test
 * @param verticalRulingLines   vertical rulings to test
 * @return {@code true} as soon as one of the four segments is intersected by a ruling
 */
private boolean isSplitByRuling(float minX,
                                float minY,
                                float maxX,
                                float maxY,
                                TextPositionSequence word,
                                List<Ruling> horizontalRulingLines,
                                List<Ruling> verticalRulingLines) {

    float wordMinX = word.getMinXDirAdj();
    float wordMinY = word.getMinYDirAdj();
    float degrees = word.getDir().getDegrees();
    float pageWidth = word.getPageWidth();
    float pageHeight = word.getPageHeight();

    if (isSplitByRuling(maxX, minY, wordMinX, wordMinY, verticalRulingLines, degrees, pageWidth, pageHeight)) {
        return true;
    }

    float wordMaxY = word.getMaxYDirAdj();
    if (isSplitByRuling(minX, minY, wordMinX, wordMaxY, horizontalRulingLines, degrees, pageWidth, pageHeight)) {
        return true;
    }

    if (isSplitByRuling(maxX, minY, wordMinX, wordMinY, horizontalRulingLines, degrees, pageWidth, pageHeight)) {
        return true;
    }

    return isSplitByRuling(minX, minY, wordMinX, wordMaxY, verticalRulingLines, degrees, pageWidth, pageHeight);
}
/**
 * Tests whether any of the given rulings, converted via
 * {@code RulingTextDirAdjustUtil.convertToDirAdj} for the given direction and page size,
 * intersects the segment from (previousX2, previousY1) to (currentX1, currentY1).
 *
 * @param previousX2  x of the segment start
 * @param previousY1  y of the segment start
 * @param currentX1   x of the segment end
 * @param currentY1   y of the segment end
 * @param rulingLines rulings to test
 * @param dir         text direction passed to the conversion
 * @param pageWidth   page width passed to the conversion
 * @param pageHeight  page height passed to the conversion
 * @return {@code true} if at least one converted ruling intersects the segment
 */
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {

    return rulingLines.stream()
            .map(ruling -> RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight))
            .anyMatch(line -> line.intersectsLine(previousX2, previousY1, currentX1, currentY1));
}
private double round(float value, int decimalPoints) {
var d = Math.pow(10, decimalPoints);
return Math.round(value * d) / d;
return new TextPageBlock(wordBlockList);
}
}

View File

@ -40,7 +40,7 @@ public class DocuMineBlockificationService {
public ClassificationPage blockify(List<TextPositionSequence> textPositions, CleanRulings cleanRulings) {
List<TextPositionSequence> chunkWords = new ArrayList<>();
List<AbstractPageBlock> chunkBlockList1 = new ArrayList<>();
List<AbstractPageBlock> textPageBlocks = new ArrayList<>();
CleanRulings usedRulings = cleanRulings.withoutTextRulings();
@ -59,7 +59,7 @@ public class DocuMineBlockificationService {
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
boolean negativeXGap = prev != null && word.getMinXDirAdj() - minX < -5;
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev.getBoundingBox(), word.getBoundingBox());
boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev, word);
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
boolean splitByOtherFontAndOtherY = prev != null && Math.abs(prev.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight() * 0.2 && (word.getFontStyle()
.contains("bold")
@ -73,12 +73,12 @@ public class DocuMineBlockificationService {
if (prev != null && (lineSeparation || startFromTop || splitByDir || isSplitByRuling || splitByOtherFontAndOtherY || negativeXGap || startsOnSameX)) {
Orientation prevOrientation = null;
if (!chunkBlockList1.isEmpty()) {
prevOrientation = chunkBlockList1.get(chunkBlockList1.size() - 1).getOrientation();
if (!textPageBlocks.isEmpty()) {
prevOrientation = textPageBlocks.get(textPageBlocks.size() - 1).getOrientation();
}
TextPageBlock cb1 = buildTextBlock(chunkWords);
chunkBlockList1.add(cb1);
TextPageBlock cb1 = new TextPageBlock(chunkWords);
textPageBlocks.add(cb1);
chunkWords = new ArrayList<>();
if (splitByX && !isSplitByRuling) {
@ -121,77 +121,12 @@ public class DocuMineBlockificationService {
}
}
TextPageBlock cb1 = buildTextBlock(chunkWords);
if (cb1 != null) {
chunkBlockList1.add(cb1);
}
textPageBlocks.add(new TextPageBlock(chunkWords));
return new ClassificationPage(chunkBlockList1);
return new ClassificationPage(textPageBlocks);
}
/**
 * Checks whether two values are closer to each other than {@code THRESHOLD}.
 *
 * @param f1 first value
 * @param f2 second value
 * @return {@code true} if the absolute difference is strictly below {@code THRESHOLD}
 */
private boolean equalsWithThreshold(float f1, float f2) {

    float distance = Math.abs(f1 - f2);
    return distance < THRESHOLD;
}
/**
 * Creates a text block from the given words. The block is created from the first word's
 * direction-adjusted bounds and rotation, then resized to its union with every following word.
 * The most frequent font, style, font size, text height and space width of the words, as well as
 * the highest font size, are stored on the block. If all sequences of the block share the same
 * minYDirAdj (rounded to 3 decimals), they are sorted by minXDirAdj.
 *
 * @param wordBlockList the words of the block; the list itself is handed to the created block
 * @return the created block, or {@code null} if {@code wordBlockList} is empty
 */
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList) {

    FloatFrequencyCounter heightCounter = new FloatFrequencyCounter();
    FloatFrequencyCounter sizeCounter = new FloatFrequencyCounter();
    FloatFrequencyCounter spaceCounter = new FloatFrequencyCounter();
    StringFrequencyCounter fontCounter = new StringFrequencyCounter();
    StringFrequencyCounter styleCounter = new StringFrequencyCounter();

    TextPageBlock block = null;
    for (TextPositionSequence word : wordBlockList) {

        heightCounter.add(word.getTextHeight());
        sizeCounter.add(word.getFontSize());
        spaceCounter.add(word.getSpaceWidth());
        fontCounter.add(word.getFont());
        styleCounter.add(word.getFontStyle());

        if (block != null) {
            // Every further word enlarges the block to the union of both.
            TextPageBlock merged = block.union(word);
            block.resize(merged.getMinX(), merged.getMinY(), merged.getWidth(), merged.getHeight());
            continue;
        }

        // The first word defines the initial bounds and the rotation.
        block = new TextPageBlock(word.getMinXDirAdj(), word.getMaxXDirAdj(), word.getMinYDirAdj(), word.getMaxYDirAdj(), wordBlockList, word.getRotation());
    }

    if (block == null) {
        return null;
    }

    block.setMostPopularWordFont(fontCounter.getMostPopular());
    block.setMostPopularWordStyle(styleCounter.getMostPopular());
    block.setMostPopularWordFontSize(sizeCounter.getMostPopular());
    block.setMostPopularWordHeight(heightCounter.getMostPopular());
    block.setMostPopularWordSpaceWidth(spaceCounter.getMostPopular());
    block.setHighestFontSize(sizeCounter.getHighest());

    // Exactly one distinct (rounded) minY: order the sequences from left to right.
    List<TextPositionSequence> sequences = block.getSequences();
    if (sequences != null && sequences.stream()
            .map(sequence -> round(sequence.getMinYDirAdj(), 3))
            .distinct()
            .count() == 1) {
        sequences.sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
    }

    return block;
}
/**
 * Rounds a value to the given number of decimal places.
 *
 * @param value         the value to round
 * @param decimalPoints number of decimal places to keep
 * @return {@code value} scaled by 10^decimalPoints, rounded with {@link Math#round(double)} and scaled back
 */
private double round(float value, int decimalPoints) {

    double scale = Math.pow(10, decimalPoints);
    long scaledAndRounded = Math.round(value * scale);
    return scaledAndRounded / scale;
}
}

View File

@ -55,7 +55,7 @@ public class RedactManagerBlockificationService {
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX;
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev.getBoundingBox(), word.getBoundingBox());
boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev, word);
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
@ -65,7 +65,7 @@ public class RedactManagerBlockificationService {
prevOrientation = chunkBlockList.get(chunkBlockList.size() - 1).getOrientation();
}
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
TextPageBlock cb1 = new TextPageBlock(chunkWords);
indexOnPage++;
chunkBlockList.add(cb1);
@ -111,8 +111,8 @@ public class RedactManagerBlockificationService {
}
}
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
if (cb1 != null) {
if (!chunkWords.isEmpty()) {
TextPageBlock cb1 = new TextPageBlock(chunkWords);
chunkBlockList.add(cb1);
}
@ -174,68 +174,9 @@ public class RedactManagerBlockificationService {
}
private boolean equalsWithThreshold(float f1, float f2) {
private boolean equalsWithThreshold(double f1, double f2) {
return Math.abs(f1 - f2) < THRESHOLD;
}
/**
 * Creates a text block from the given words. The block is created from the first word's
 * direction-adjusted bounds and rotation, then resized to its union with every following word.
 * The most frequent font, style, font size, text height and space width of the words, as well as
 * the highest font size, are stored on the block. If all sequences of the block share the same
 * minYDirAdj (rounded to 3 decimals), they are sorted by minXDirAdj.
 *
 * @param wordBlockList the words of the block; the list itself is handed to the created block
 * @param indexOnPage   not used by this method
 * @return the created block, or {@code null} if {@code wordBlockList} is empty
 */
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {

    FloatFrequencyCounter heightCounter = new FloatFrequencyCounter();
    FloatFrequencyCounter sizeCounter = new FloatFrequencyCounter();
    FloatFrequencyCounter spaceCounter = new FloatFrequencyCounter();
    StringFrequencyCounter fontCounter = new StringFrequencyCounter();
    StringFrequencyCounter styleCounter = new StringFrequencyCounter();

    TextPageBlock block = null;
    for (TextPositionSequence word : wordBlockList) {

        heightCounter.add(word.getTextHeight());
        sizeCounter.add(word.getFontSize());
        spaceCounter.add(word.getSpaceWidth());
        fontCounter.add(word.getFont());
        styleCounter.add(word.getFontStyle());

        if (block != null) {
            // Every further word enlarges the block to the union of both.
            TextPageBlock merged = block.union(word);
            block.resize(merged.getMinX(), merged.getMinY(), merged.getWidth(), merged.getHeight());
            continue;
        }

        // The first word defines the initial bounds and the rotation.
        block = new TextPageBlock(word.getMinXDirAdj(), word.getMaxXDirAdj(), word.getMinYDirAdj(), word.getMaxYDirAdj(), wordBlockList, word.getRotation());
    }

    if (block == null) {
        return null;
    }

    block.setMostPopularWordFont(fontCounter.getMostPopular());
    block.setMostPopularWordStyle(styleCounter.getMostPopular());
    block.setMostPopularWordFontSize(sizeCounter.getMostPopular());
    block.setMostPopularWordHeight(heightCounter.getMostPopular());
    block.setMostPopularWordSpaceWidth(spaceCounter.getMostPopular());
    block.setHighestFontSize(sizeCounter.getHighest());

    // Exactly one distinct (rounded) minY: order the sequences from left to right.
    List<TextPositionSequence> sequences = block.getSequences();
    if (sequences != null && sequences.stream()
            .map(sequence -> round(sequence.getMinYDirAdj(), 3))
            .distinct()
            .count() == 1) {
        sequences.sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
    }

    return block;
}
/**
 * Rounds a value to the given number of decimal places.
 *
 * @param value         the value to round
 * @param decimalPoints number of decimal places to keep
 * @return {@code value} scaled by 10^decimalPoints, rounded with {@link Math#round(double)} and scaled back
 */
private double round(float value, int decimalPoints) {

    double scale = Math.pow(10, decimalPoints);
    long scaledAndRounded = Math.round(value * scale);
    return scaledAndRounded / scale;
}
}

View File

@ -71,11 +71,10 @@ public class SearchTextWithTextPositionFactory {
List<Rectangle2D> positions = sequences.stream()
.map(TextPositionSequence::getTextPositions)
.flatMap(Collection::stream)
.map(RedTextPosition::getInitialUserSpacePosition)
.map(RedTextPosition::getBBoxInitialUserSpace)
.toList();
assert context.stringBuilder.length() == context.stringIdxToPositionIdx.size();
assert positions.size() == context.stringIdxToPositionIdx.size();
return SearchTextWithTextPositionDto.builder()
.searchText(context.stringBuilder.toString())

View File

@ -45,7 +45,10 @@ public class TableNodeFactory {
.flatMap(Collection::stream)
.toList();
Table table = Table.builder().documentTree(context.getDocumentTree()).numberOfCols(mergedRows.isEmpty() ? 0 : mergedRows.get(0).size()).numberOfRows(mergedRows.size())
Table table = Table.builder()
.documentTree(context.getDocumentTree())
.numberOfCols(mergedRows.isEmpty() ? 0 : mergedRows.get(0).size())
.numberOfRows(mergedRows.size())
.build();
pages.forEach(page -> addTableToPage(page, parentNode, table));
@ -128,7 +131,12 @@ public class TableNodeFactory {
Page page = context.getPage(cell.getPageNumber());
TableCell tableCell = TableCell.builder().documentTree(context.getDocumentTree()).row(rowIndex).col(colIndex).header(cell.isHeaderCell()).bBox(cell.getBounds2D())
TableCell tableCell = TableCell.builder()
.documentTree(context.getDocumentTree())
.row(rowIndex)
.col(colIndex)
.header(cell.isHeaderCell())
.bBox(cell.getBBoxInitialUserSpace())
.build();
page.getMainBody().add(tableCell);

View File

@ -13,6 +13,9 @@ import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import lombok.SneakyThrows;
@Service
@ -30,7 +33,7 @@ public class FindGraphicsRaster {
var renderer = new PDFRenderer(doc);
var img = renderer.renderImageWithDPI(pageInformation.number() - 1, DPI, ImageType.GRAY);
var imageCtm = getImageCTM(pageInformation, img.getWidth());
var imageCtm = CoordinateTransforms.calculateImageCoordsToInitialUserSpaceCoords(pageInformation, CoordinateTransforms.calculateScalingFactor(pageInformation, img.getWidth()));
return findCCBoundingBoxes(img, remove, THRESHOLD, DPI / 72, imageCtm);
}
@ -131,42 +134,4 @@ public class FindGraphicsRaster {
}
/**
 * Builds the transform for an image of the given width rendered from the page. It is composed of
 * a uniform scaling (factor from {@code calculateScalingFactor}) combined with a translation by
 * the negated page minimum x/y, a mirroring of the y axis over the page height, and a rotation
 * chosen by the page's rotation in degrees (90, 180, 270; identity otherwise).
 *
 * @param pageInformation page bounds and rotation
 * @param imageWidth      width of the rendered image
 * @return the combined transform
 */
public AffineTransform getImageCTM(PageInformation pageInformation, int imageWidth) {

    double scale = calculateScalingFactor(pageInformation, imageWidth);

    AffineTransform scaleToCropBox = new AffineTransform(scale, 0, 0, scale, -pageInformation.minX(), -pageInformation.minY());

    AffineTransform flipVertically = new AffineTransform(1, 0, 0, -1, 0, pageInformation.height());

    AffineTransform rotate = switch (pageInformation.rotationDegrees()) {
        case 90 -> new AffineTransform(0, 1, -1, 0, pageInformation.height(), 0);
        case 180 -> new AffineTransform(-1, 0, 0, -1, pageInformation.width(), pageInformation.height());
        case 270 -> new AffineTransform(0, -1, 1, 0, pageInformation.width() - pageInformation.height(), pageInformation.height()); // results from 90 + 180 rotations
        default -> new AffineTransform();
    };

    // Matrices are multiplied from right to left, so they are concatenated in reverse order of
    // application: scaling is applied first, then the mirroring, then the rotation.
    AffineTransform imageCtm = new AffineTransform(rotate);
    imageCtm.concatenate(flipVertically);
    imageCtm.concatenate(scaleToCropBox);
    return imageCtm;
}
/**
 * Computes the ratio between the page width and the image width.
 *
 * @param pageInformation page bounds and rotation
 * @param imageWidth      width of the rendered image
 * @return page width divided by {@code imageWidth}; for rotations of 90 or 270 degrees the page
 *         height is used as the page width
 */
private double calculateScalingFactor(PageInformation pageInformation, int imageWidth) {

    // PDFBox always returns page height and width based on rotation
    boolean quarterTurn = pageInformation.rotationDegrees() == 90 || pageInformation.rotationDegrees() == 270;
    double pageWidth = quarterTurn ? pageInformation.height() : pageInformation.width();

    return pageWidth / imageWidth;
}
}

View File

@ -12,6 +12,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRul
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.RequiredArgsConstructor;

View File

@ -0,0 +1,56 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.awt.geom.AffineTransform;
import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;
/**
 * Helpers that build the affine transforms between the coordinates of a rendered page image and
 * the page's initial user space coordinates.
 */
@UtilityClass
public class CoordinateTransforms {

    /**
     * Builds the transform from image coordinates to initial user space coordinates. It is
     * composed of a uniform scaling combined with a translation by the negated page minimum x/y,
     * a mirroring of the y axis over the page height, and a rotation chosen by the page's
     * rotation in degrees (90, 180, 270; identity otherwise).
     *
     * @param pageInformation page bounds and rotation
     * @param scalingFactor   uniform scale applied to the image coordinates
     * @return the combined transform
     */
    public AffineTransform calculateImageCoordsToInitialUserSpaceCoords(PageInformation pageInformation, double scalingFactor) {

        AffineTransform scaleToCropBox = new AffineTransform(scalingFactor, 0, 0, scalingFactor, -pageInformation.minX(), -pageInformation.minY());

        AffineTransform flipVertically = new AffineTransform(1, 0, 0, -1, 0, pageInformation.height());

        AffineTransform rotate = switch (pageInformation.rotationDegrees()) {
            case 90 -> new AffineTransform(0, 1, -1, 0, pageInformation.height(), 0);
            case 180 -> new AffineTransform(-1, 0, 0, -1, pageInformation.width(), pageInformation.height());
            case 270 -> new AffineTransform(0, -1, 1, 0, pageInformation.width() - pageInformation.height(), pageInformation.height()); // results from 90 + 180 rotations
            default -> new AffineTransform();
        };

        // Matrices are multiplied from right to left, so they are concatenated in reverse order
        // of application: scaling is applied first, then the mirroring, then the rotation.
        AffineTransform imageToUserSpace = new AffineTransform(rotate);
        imageToUserSpace.concatenate(flipVertically);
        imageToUserSpace.concatenate(scaleToCropBox);
        return imageToUserSpace;
    }


    /**
     * Builds the inverse of {@link #calculateImageCoordsToInitialUserSpaceCoords}, i.e. the
     * transform from initial user space coordinates to image coordinates.
     *
     * @param pageInformation page bounds and rotation
     * @param scalingFactor   uniform scale used for the forward transform
     * @return the inverted transform; the checked exception of {@code createInverse} is rethrown via {@code @SneakyThrows}
     */
    @SneakyThrows
    public AffineTransform calculateInitialUserSpaceCoordsToImageCoords(PageInformation pageInformation, double scalingFactor) {

        AffineTransform imageToUserSpace = calculateImageCoordsToInitialUserSpaceCoords(pageInformation, scalingFactor);
        return imageToUserSpace.createInverse();
    }


    /**
     * Computes the ratio between the page width and the image width.
     *
     * @param pageInformation page bounds and rotation
     * @param imageWidth      width of the image
     * @return page width divided by {@code imageWidth}; for rotations of 90 or 270 degrees the
     *         page height is used as the page width
     */
    public double calculateScalingFactor(PageInformation pageInformation, double imageWidth) {

        // PDFBox always returns page height and width based on rotation
        boolean quarterTurn = pageInformation.rotationDegrees() == 90 || pageInformation.rotationDegrees() == 270;
        double pageWidth = quarterTurn ? pageInformation.height() : pageInformation.width();

        return pageWidth / imageWidth;
    }

}

View File

@ -1,10 +1,10 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.Comparator;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
public class GeometricComparators {
@ -58,7 +58,7 @@ public class GeometricComparators {
return cell1Size.compareTo(cell2Size);
};
public static final Comparator<Rectangle> RECTANGLE_SIZE_COMPARATOR = (rect1, rect2) -> {
public static final Comparator<Rectangle2D> RECTANGLE_SIZE_COMPARATOR = (rect1, rect2) -> {
Double rect1Size = rect1.getHeight() * rect1.getWidth();
Double rect2Size = rect2.getHeight() * rect2.getWidth();

View File

@ -47,8 +47,8 @@ public class MarkedContentUtils {
return markedContentByYPosition.values()
.stream()
.map(textPositions -> new TextPositionSequence(textPositions, 0, true).getRectangle())
.map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight())))
.map(textPositions -> new TextPositionSequence(textPositions, 0, true).getBBox())
.map(t -> new Rectangle2D.Double(t.getX(), t.getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight())))
.collect(Collectors.toList());
}
@ -89,7 +89,7 @@ public class MarkedContentUtils {
.map(content -> (TextPosition) content)
.filter(content -> !content.getUnicode().equals(" "))
.map(textPositions -> new TextPositionSequence(List.of(textPositions), 0, true))
.map(TextPositionSequence::getBoundingBox)
.map(TextPositionSequence::getBBox)
.collect(Collectors.toList());
}

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.services.graphics;
package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.awt.geom.Rectangle2D;

View File

@ -114,7 +114,7 @@ public final class PositionUtils {
}
public Float getApproxLineCount(TextPageBlock textBlock) {
public double getApproxLineCount(TextPageBlock textBlock) {
return textBlock.getHeight() / textBlock.getMostPopularWordHeight();
}

View File

@ -4,6 +4,7 @@ import static com.knecon.fforesight.service.layoutparser.processor.utils.Geometr
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.Y_FIRST_POINT_COMPARATOR;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
@ -11,7 +12,7 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
public class SpreadsheetFinder {
@ -19,15 +20,15 @@ public class SpreadsheetFinder {
private static final float AREA_TOLERANCE = 0.001f;
public static List<Rectangle> findSpreadsheetsFromCells(List<? extends Rectangle> cells) {
public static List<Rectangle2D> findSpreadsheetsFromCells(List<Cell> cells) {
// via: http://stackoverflow.com/questions/13746284/merging-multiple-adjacent-rectangles-into-one-polygon
List<Rectangle> rectangles = new ArrayList<>();
List<Rectangle2D> rectangles = new ArrayList<>();
Set<Point2D> pointSet = new HashSet<>();
Map<Point2D, Point2D> edgesH = new HashMap<>();
Map<Point2D, Point2D> edgesV = new HashMap<>();
for (Rectangle cell : cells) {
for (Point2D pt : cell.getPoints()) {
for (Cell cell : cells) {
for (Point2D pt : getPoints(cell.getBBoxInitialUserSpace())) {
if (pointSet.contains(pt)) { // shared vertex, remove it
pointSet.remove(pt);
} else {
@ -116,13 +117,22 @@ public class SpreadsheetFinder {
// do not add polygons with too many outer points as they are unlikely to be tables
if (poly.size() <= MAX_OUTER_POINT_TOLERANCE) {
rectangles.add(new Rectangle(top - AREA_TOLERANCE, left - AREA_TOLERANCE, right - left + 2 * AREA_TOLERANCE, bottom - top + 2 * AREA_TOLERANCE));
rectangles.add(new Rectangle2D.Double(left - AREA_TOLERANCE, top - AREA_TOLERANCE, right - left + (2 * AREA_TOLERANCE), bottom - top + (2 * AREA_TOLERANCE)));
}
}
return rectangles;
}
/**
 * Lists the four corner points of a rectangle.
 *
 * @param rectangle2D the rectangle whose corners are requested
 * @return an unmodifiable list with the corners in the order (x, y), (maxX, y), (maxX, maxY), (x, maxY)
 */
public static List<Point2D> getPoints(Rectangle2D rectangle2D) {

    double x = rectangle2D.getX();
    double y = rectangle2D.getY();
    double maxX = rectangle2D.getMaxX();
    double maxY = rectangle2D.getMaxY();

    Point2D first = new Point2D.Double(x, y);
    Point2D second = new Point2D.Double(maxX, y);
    Point2D third = new Point2D.Double(maxX, maxY);
    Point2D fourth = new Point2D.Double(x, maxY);

    return List.of(first, second, third, fourth);
}
private enum Direction {
HORIZONTAL,
VERTICAL

View File

@ -9,7 +9,6 @@ import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Stream;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
@ -18,7 +17,6 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
@ -111,10 +109,7 @@ public class LayoutparsingVisualizations {
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.words);
visualizationsOnPage.getColoredRectangles()
.addAll(textPositionSequences.stream()
.map(textPositionSequence -> textPositionSequence.getTextPositions()
.stream()
.map(RedTextPosition::getInitialUserSpacePosition)
.collect(RectangleTransformations.collectBBox()))
.map(BoundingBox::getBBoxInitialUserSpace)
.map(rect -> new ColoredRectangle(rect, WORDS_COLOR, 1))
.toList());
}
@ -147,7 +142,7 @@ public class LayoutparsingVisualizations {
}
public void addCellVisualizations(List<? extends Rectangle2D> cells, int pageNumber) {
public void addCellVisualizations(List<? extends BoundingBox> cells, int pageNumber) {
if (!active) {
return;
@ -155,7 +150,7 @@ public class LayoutparsingVisualizations {
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.cells);
visualizationsOnPage.getColoredRectangles()
.addAll(cells.stream()
.map(ruling -> new ColoredRectangle(ruling, CELLS_COLOR, 1))
.map(cell -> new ColoredRectangle(cell.getBBoxInitialUserSpace(), CELLS_COLOR, 1))
.toList());
}
@ -169,7 +164,7 @@ public class LayoutparsingVisualizations {
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, this.zones);
visualizationsOnPage.getColoredRectangles()
.addAll(zones.stream()
.map(BoundingBox::getBBox)
.map(BoundingBox::getBBoxInitialUserSpace)
.map(zone -> new ColoredRectangle(zone, ZONES_COLOR, 1))
.toList());
@ -194,8 +189,8 @@ public class LayoutparsingVisualizations {
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, this.lines);
visualizationsOnPage.getColoredRectangles()
.addAll(lines.stream()
.map(BoundingBox::getBBox)
.map(line -> new ColoredRectangle(line, LINES_COLOR, 0.3f))
.map(BoundingBox::getBBoxInitialUserSpace)
.map(line -> new ColoredRectangle(line, LINES_COLOR, 0.5f))
.toList());
}
@ -208,7 +203,7 @@ public class LayoutparsingVisualizations {
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, zones);
visualizationsOnPage.getColoredRectangles()
.addAll(textPageBlocks.stream()
.map(rect -> new ColoredRectangle(rect.getBBox(), ZONES_COLOR, 1))
.map(rect -> new ColoredRectangle(rect.getBBoxInitialUserSpace(), ZONES_COLOR, 1))
.toList());
}
@ -226,7 +221,7 @@ public class LayoutparsingVisualizations {
}
public void addMarkedContentVisualizations(List<PDMarkedContent> markedContents, int pageNumber, PDPage pdPage) {
public void addMarkedContentVisualizations(List<PDMarkedContent> markedContents, int pageNumber) {
if (!active) {
return;
@ -235,14 +230,16 @@ public class LayoutparsingVisualizations {
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, markedContent);
List<MarkedContentUtils.MarkedContentPosition> markedContentBBoxMapBySubType = MarkedContentUtils.getMarkedContentPositions(markedContents);
AtomicInteger count = new AtomicInteger();
markedContentBBoxMapBySubType.forEach(markedContentPosition -> {
var bbox = markedContentPosition.textPositions()
.stream()
.collect(RectangleTransformations.collectBBox());
String type = markedContentPosition.formattedType();
float translationAmount = ((FONT.getStringWidth(type) / 1000) * 10 + (2 * 1) + 4);
String type = markedContentPosition.formattedType() + " " + count.getAndIncrement();
float translationAmount = ((FONT.getStringWidth(type) / 100) + 6);
// Pushes the string to the left of the box: string width in font units divided by 1000 and multiplied by the font size (10), i.e. width / 100, plus a small offset (6).
visualizationsOnPage.getPlacedTexts()
.add(PlacedText.textFacingUp(type, new Point2D.Double(bbox.getX() - translationAmount, bbox.getY() + bbox.getHeight()), 10, Color.BLACK, FONT));
@ -270,11 +267,11 @@ public class LayoutparsingVisualizations {
.flatMap(Collection::stream)
.forEach(character -> {
Color color = ROTATING_CHARACTER_COLOR.get(index.getAndIncrement() % ROTATING_CHARACTER_COLOR.size());
Rectangle2D charBBox = character.getTextPosition().getInitialUserSpacePosition();
Rectangle2D charBBox = character.getTextPosition().getBBoxInitialUserSpace();
characterVisualizations.getColoredRectangles().add(new ColoredRectangle(charBBox, color, 1));
character.getNeighbors()
.forEach(neighbor -> {
Rectangle2D neighborBBox = neighbor.getCharacter().getTextPosition().getInitialUserSpacePosition();
Rectangle2D neighborBBox = neighbor.getCharacter().getTextPosition().getBBoxInitialUserSpace();
Line2D line = new Line2D.Double(new Point2D.Double(charBBox.getCenterX(), charBBox.getCenterY()),
new Point2D.Double(neighborBBox.getCenterX(), neighborBBox.getCenterY()));
neighbourVisualizations.getColoredLines().add(new ColoredLine(line, color, 1));
@ -287,7 +284,8 @@ public class LayoutparsingVisualizations {
private VisualizationsOnPage getOrCreateVisualizationsOnPage(int page, Visualizations visualizations) {
if (visualizations.getVisualizationsOnPages().containsKey(page - 1)) {
return visualizations.getVisualizationsOnPages().get(page - 1);
return visualizations.getVisualizationsOnPages()
.get(page - 1);
}
VisualizationsOnPage visualizationsOnPage = VisualizationsOnPage.builder().build();
visualizations.getVisualizationsOnPages().put(page - 1, visualizationsOnPage);

View File

@ -70,7 +70,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
file = new File(filePath);
}
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.DOCUMINE, true);
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER, true);
prepareStorage(layoutParsingRequest, file);
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);

View File

@ -115,10 +115,10 @@ public class PdfSegmentationServiceTest extends AbstractTest {
ClassificationDocument classificationDocument = buildClassificationDocument(pdfFileResource.getFile());
assertThat(classificationDocument.getHeaders()
.get(0).getTextBlocks().size()).isEqualTo(1);
.get(0).getTextBlocks().size()).isEqualTo(3);
assertThat(classificationDocument.getHeaders()
.get(0).getTextBlocks()
.get(0).getSequences().size()).isEqualTo(12);
.get(0).getSequences().size()).isEqualTo(8);
assertThat(classificationDocument.getHeaders()
.get(0).getTextBlocks()
.get(0).toString()).contains(textToSearch);
@ -131,6 +131,17 @@ public class PdfSegmentationServiceTest extends AbstractTest {
}
/**
 * Smoke test for rotated tables and cells: builds the classification document for the
 * rotated-tables sample PDF from the classpath. Nothing is asserted on the result, so the
 * test only fails if loading the resource or building the document throws.
 */
@Test
@SneakyThrows
public void testTableAndCellRotations() {

    var rotatedTablesPdf = new ClassPathResource("files/Minimal Examples/simpleTablesRotated.pdf");
    buildClassificationDocument(rotatedTablesPdf.getFile());
}
@Disabled
@Test
public void testScanRotationBorderIsIgnored() throws IOException {

View File

@ -28,29 +28,30 @@ class InvisibleTableDetectionServiceTest {
String fileName = "files/basf/CustomerFiles/invisible_tables_test-two-pages_ocred.pdf";
var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TABLE.pdf").toString();
List<PageInformation> pageContents = PageContentExtractor.getSortedPageContents(fileName).stream().map(PageInformationService::build).collect(Collectors.toList());
List<PageInformation> pageContents = PageContentExtractor.getSortedPageContents(fileName)
.stream()
.map(PageInformationService::build)
.collect(Collectors.toList());
int pageNumber = 1;
Rectangle2D tableBBox = pageContents.get(0)
.getPageContents()
.getSortedTextPositionSequences()
.subList(45, 152)
Rectangle2D tableBBox = pageContents.get(0).getPageContents().getSortedTextPositionSequences().subList(45, 152)
.stream()
.map(TextPositionSequence::getRectangle)
.map(RectangleTransformations::toRectangle2D)
.map(TextPositionSequence::getBBox)
.map(this::mirrorY)
.collect(RectangleTransformations.collectBBox());
List<TextPositionSequence> textPositionSequences = pageContents.get(0)
.getPageContents()
.getSortedTextPositionSequences()
List<TextPositionSequence> textPositionSequences = pageContents.get(0).getPageContents().getSortedTextPositionSequences()
.stream()
.filter(textPositionSequence -> tableBBox.contains(mirrorY(RectangleTransformations.toRectangle2D(textPositionSequence.getRectangle()))))
.filter(textPositionSequence -> tableBBox.contains(mirrorY(textPositionSequence.getBBox())))
.toList();
var table = InvisibleTableDetectionService.detectTable(textPositionSequences, tableBBox);
PdfDraw.drawRectanglesPerPage(fileName, List.of(table.stream().flatMap(Collection::stream).toList(), Collections.emptyList()), tmpFileName);
PdfDraw.drawRectanglesPerPage(fileName,
List.of(table.stream()
.flatMap(Collection::stream)
.toList(), Collections.emptyList()),
tmpFileName);
}

View File

@ -29,9 +29,7 @@ class PageContentExtractorTest {
textPositionPerPage.stream()
.map(t -> t.getSortedTextPositionSequences()
.stream()
.map(TextPositionSequence::getRectangle)
.map(RectangleTransformations::toRectangle2D)
//.map(textPositionSequence -> (Rectangle2D) new Rectangle2D.Double(textPositionSequence.getMaxXDirAdj(), textPositionSequence.getMaxYDirAdj(), textPositionSequence.getWidth(), textPositionSequence.getHeight()))
.map(TextPositionSequence::getBBox)
.map(List::of)
.toList())
.toList(), tmpFileName);

View File

@ -12,4 +12,4 @@ commit_hash=$(git rev-parse --short=5 HEAD)
buildName="${USER}-${branch}-${commit_hash}"
gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=$buildName --no-build-cache
echo "nexus.knecon.com:5001/ff/${dir}-service-server:$buildName"
echo "nexus.knecon.com:5001/ff/layoutparser-service-server:$buildName"