RED-8825: general layoutparsing improvements
* refactor all coordinates
This commit is contained in:
parent
d61cac8b4f
commit
b6f0a21886
@ -60,6 +60,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.mapper.Taas
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||
|
||||
import io.micrometer.observation.Observation;
|
||||
import io.micrometer.observation.ObservationRegistry;
|
||||
@ -264,7 +265,7 @@ public class LayoutParsingPipeline {
|
||||
PDRectangle cropbox = pdPage.getCropBox();
|
||||
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
|
||||
|
||||
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals());
|
||||
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), PageInformation.fromPDPage(pageNumber, pdPage));
|
||||
classificationDocument.getVisualizations().addCellVisualizations(emptyTableCells, pageNumber);
|
||||
|
||||
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings);
|
||||
@ -293,7 +294,7 @@ public class LayoutParsingPipeline {
|
||||
classificationPage.setPageWidth(cropbox.getWidth());
|
||||
classificationPage.setPageHeight(cropbox.getHeight());
|
||||
|
||||
classificationDocument.getVisualizations().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber, pdPage);
|
||||
classificationDocument.getVisualizations().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber);
|
||||
// MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox.
|
||||
classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents()));
|
||||
|
||||
|
||||
@ -1,13 +1,27 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
public abstract class BoundingBox {
|
||||
|
||||
private Rectangle2D bBox;
|
||||
// Java coordinate system: (0, 0) is always upper left, x is increasing left to right and y is increasing from top to bottom.
|
||||
// should be used when determining reading order or other tasks which require coordinates in a harmonized system.
|
||||
protected Rectangle2D bBox; // I would not trust this coordinate when comparing rulings and text, due to the text positions being slightly off.
|
||||
|
||||
// PDF coordinate system: depends on page rotation, (0, 0) is lower left corner, x is increasing left to right and y from bottom to top.
|
||||
// This rotates completely in 90 degree steps with page rotation.
|
||||
// Needs to be used when writing to a PDF.
|
||||
// Also, these are definitely correct and should be used whenever possible.
|
||||
protected Rectangle2D bBoxInitialUserSpace;
|
||||
|
||||
protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
|
||||
|
||||
|
||||
public double getX() {
|
||||
@ -22,6 +36,42 @@ public abstract class BoundingBox {
|
||||
}
|
||||
|
||||
|
||||
public double getMinX() {
|
||||
|
||||
return bBox.getMinX();
|
||||
}
|
||||
|
||||
|
||||
public double getMinY() {
|
||||
|
||||
return bBox.getMinY();
|
||||
}
|
||||
|
||||
|
||||
public double getPdfMinX() {
|
||||
|
||||
return bBoxInitialUserSpace.getMinX();
|
||||
}
|
||||
|
||||
|
||||
public double getPdfMaxX() {
|
||||
|
||||
return bBoxInitialUserSpace.getMaxX();
|
||||
}
|
||||
|
||||
|
||||
public double getPdfMinY() {
|
||||
|
||||
return bBoxInitialUserSpace.getMinY();
|
||||
}
|
||||
|
||||
|
||||
public double getPdfMaxY() {
|
||||
|
||||
return bBoxInitialUserSpace.getMaxY();
|
||||
}
|
||||
|
||||
|
||||
public double getWidth() {
|
||||
|
||||
return bBox.getWidth();
|
||||
@ -34,21 +84,102 @@ public abstract class BoundingBox {
|
||||
}
|
||||
|
||||
|
||||
public double getMaxX() {
|
||||
|
||||
return bBox.getMaxX();
|
||||
}
|
||||
|
||||
|
||||
public double getMaxY() {
|
||||
|
||||
return bBox.getMaxY();
|
||||
}
|
||||
|
||||
|
||||
public double getArea() {
|
||||
|
||||
return (bBox.getHeight() * bBox.getWidth());
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(Rectangle2D contained, double tolerance) {
|
||||
public boolean contains(BoundingBox contained) {
|
||||
|
||||
return bBox.getX() <= contained.getX() + tolerance && bBox.getY() <= contained.getY() + tolerance && bBox.getX() + bBox.getWidth() >= contained.getX() + contained.getWidth() - tolerance && bBox.getY() + bBox.getHeight() >= contained.getY() + contained.getHeight() - tolerance;
|
||||
return contains(contained, 0);
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(BoundingBox contained, double tolerance) {
|
||||
|
||||
return getPdfMinX() <= contained.getPdfMinX() + tolerance
|
||||
&& getPdfMinY() <= contained.getPdfMinY() + tolerance
|
||||
&& getPdfMaxX() >= contained.getPdfMaxX() - tolerance
|
||||
&& getPdfMaxY() >= contained.getPdfMaxY() - tolerance;
|
||||
}
|
||||
|
||||
|
||||
public boolean intersects(BoundingBox other) {
|
||||
|
||||
return this.intersectsX(other) && this.intersectsY(other);
|
||||
}
|
||||
|
||||
|
||||
public boolean intersects(BoundingBox other, float yThreshold, float xThreshold) {
|
||||
|
||||
return this.intersectsX(other, xThreshold) && this.intersectsY(other, yThreshold);
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsY(BoundingBox other) {
|
||||
|
||||
return this.getBBox().getMinY() <= other.getBBox().getMaxY() && this.getBBox().getMaxY() >= other.getBBox().getMinY();
|
||||
return this.getPdfMinY() <= other.getPdfMaxY() && this.getPdfMaxY() >= other.getPdfMinY();
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsY(BoundingBox other, float threshold) {
|
||||
|
||||
return this.getPdfMinY() - threshold <= other.getPdfMaxY() && this.getPdfMaxY() + threshold >= other.getPdfMinY();
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsX(BoundingBox other) {
|
||||
|
||||
return this.getPdfMinX() <= other.getMaxX() && this.getMaxX() >= other.getPdfMinX();
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsX(BoundingBox other, float threshold) {
|
||||
|
||||
return this.getPdfMinX() - threshold <= other.getMaxX() && this.getMaxX() + threshold >= other.getPdfMinX();
|
||||
}
|
||||
|
||||
|
||||
public void setToBBoxOfComponents(List<? extends BoundingBox> components) {
|
||||
|
||||
this.bBox = components.stream()
|
||||
.map(BoundingBox::getBBox)
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
this.bBoxInitialUserSpace = components.stream()
|
||||
.map(BoundingBox::getBBoxInitialUserSpace)
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
}
|
||||
|
||||
|
||||
public double verticalOverlap(BoundingBox other) {
|
||||
|
||||
return Math.max(0, Math.min(this.getPdfMaxY(), other.getPdfMaxY()) - Math.max(this.getPdfMinY(), other.getPdfMinY()));
|
||||
}
|
||||
|
||||
|
||||
public static final Comparator<BoundingBox> ILL_DEFINED_ORDER = (o1, o2) -> {
|
||||
|
||||
if (o1.equals(o2)) {
|
||||
return 0;
|
||||
}
|
||||
if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD * ((o1.getHeight() + o2.getHeight()) / 2)) {
|
||||
return Double.compare(o1.getPdfMinX(), o2.getPdfMinX());
|
||||
} else {
|
||||
return Double.compare(o1.getPdfMaxY(), o2.getPdfMaxY());
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@ -5,9 +5,7 @@ import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
@ -145,10 +143,9 @@ public class Line extends BoundingBox {
|
||||
|
||||
private void buildBBox() {
|
||||
|
||||
this.setBBox(characters.stream()
|
||||
.map(Character::getTextPosition)
|
||||
.map(RedTextPosition::getInitialUserSpacePosition)
|
||||
.collect(RectangleTransformations.collectBBox()));
|
||||
this.setToBBoxOfComponents(characters.stream()
|
||||
.map(Character::getTextPosition)
|
||||
.toList());
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -18,15 +18,7 @@ public class Zone extends BoundingBox {
|
||||
|
||||
lines.sort(Comparator.comparingDouble(Line::getY0));
|
||||
this.lines = lines;
|
||||
buildBBox();
|
||||
}
|
||||
|
||||
|
||||
public void buildBBox() {
|
||||
|
||||
this.setBBox(getLines().stream()
|
||||
.map(BoundingBox::getBBox)
|
||||
.collect(RectangleTransformations.collectBBox()));
|
||||
setToBBoxOfComponents(lines);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -38,7 +38,7 @@ public class LineBuilderService {
|
||||
if (character.getTextPosition().getDir() != neighbor.getCharacter().getTextPosition().getDir() //
|
||||
|| !angleFilter.matches(neighbor) //
|
||||
|| Math.pow(normalizedHorizontalDistance, 2) + Math.pow(normalizedVerticalDistance, 2) > 1 //
|
||||
|| rulings.lineBetween(character, neighbor.getCharacter())) {
|
||||
|| rulings.lineBetween(character.getTextPosition(), neighbor.getCharacter().getTextPosition())) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
@ -39,7 +39,10 @@ public class ReadingOrderService {
|
||||
}
|
||||
}
|
||||
|
||||
if (histogram.values().stream().mapToInt(Integer::intValue).average().orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) {
|
||||
if (histogram.values()
|
||||
.stream()
|
||||
.mapToInt(Integer::intValue).average()
|
||||
.orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) {
|
||||
return resolveSingleColumnReadingOrder(zones);
|
||||
} else {
|
||||
|
||||
@ -52,7 +55,7 @@ public class ReadingOrderService {
|
||||
private static List<Zone> resolveSingleColumnReadingOrder(List<Zone> zones) {
|
||||
|
||||
zones.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
return zones;
|
||||
}
|
||||
|
||||
@ -90,14 +93,14 @@ public class ReadingOrderService {
|
||||
}
|
||||
|
||||
leftOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
|
||||
rightOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
|
||||
middle.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
/*
|
||||
List<Zone> leftNotIntersecting = new ArrayList<>();
|
||||
for (Zone leftZone : leftOf) {
|
||||
boolean intersects = false;
|
||||
@ -139,7 +142,7 @@ public class ReadingOrderService {
|
||||
|
||||
middle.addAll(leftNotIntersecting);
|
||||
middle.addAll(rightNotIntersecting);
|
||||
|
||||
*/
|
||||
List<Zone> sortedZones = new ArrayList<>();
|
||||
sortedZones.addAll(leftOf);
|
||||
sortedZones.addAll(rightOf);
|
||||
|
||||
@ -65,7 +65,7 @@ public class ZoneBuilderService {
|
||||
return;
|
||||
}
|
||||
|
||||
if (rulings.lineBetween(outerLine.getBBox(), innerLine.getBBox())) {
|
||||
if (rulings.lineBetween(outerLine, innerLine)) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
@ -4,6 +4,7 @@ import java.awt.geom.Rectangle2D;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
@ -15,18 +16,8 @@ import lombok.NoArgsConstructor;
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
public abstract class AbstractPageBlock extends Rectangle {
|
||||
public abstract class AbstractPageBlock extends BoundingBox {
|
||||
|
||||
protected Rectangle2D bBox; // in initial user space
|
||||
|
||||
@JsonIgnore
|
||||
protected float minX;
|
||||
@JsonIgnore
|
||||
protected float maxX;
|
||||
@JsonIgnore
|
||||
protected float minY;
|
||||
@JsonIgnore
|
||||
protected float maxY;
|
||||
@JsonIgnore
|
||||
protected PageBlockType classification;
|
||||
@JsonIgnore
|
||||
@ -45,63 +36,6 @@ public abstract class AbstractPageBlock extends Rectangle {
|
||||
}
|
||||
|
||||
|
||||
public boolean containsBlock(TextPageBlock other) {
|
||||
|
||||
return this.minX <= other.getPdfMinX() && this.maxX >= other.getPdfMaxX() && this.minY >= other.getPdfMinY() && this.maxY <= other.getPdfMaxY();
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(AbstractPageBlock other) {
|
||||
|
||||
return this.minX <= other.minX && this.maxX >= other.maxX && this.minY >= other.minY && this.maxY <= other.maxY;
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(Rectangle other) {
|
||||
|
||||
return page == other.getPage() && this.minX <= other.getTopLeft().getX() && this.maxX >= other.getTopLeft().getX() + other.getWidth() && this.minY <= other.getTopLeft()
|
||||
.getY() && this.maxY >= other.getTopLeft().getY() + other.getHeight();
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getHeight() {
|
||||
|
||||
return maxY - minY;
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getWidth() {
|
||||
|
||||
return maxX - minX;
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsY(AbstractPageBlock apb) {
|
||||
|
||||
return this.minY <= apb.getMaxY() && this.maxY >= apb.getMinY();
|
||||
}
|
||||
|
||||
|
||||
public boolean almostIntersects(AbstractPageBlock apb, float yThreshold, float xThreshold) {
|
||||
|
||||
return this.almostIntersectsX(apb, xThreshold) && this.almostIntersectsY(apb, yThreshold);
|
||||
}
|
||||
|
||||
|
||||
private boolean almostIntersectsY(AbstractPageBlock apb, float threshold) {
|
||||
|
||||
return this.minY - threshold <= apb.getMaxY() && this.maxY + threshold >= apb.getMinY();
|
||||
}
|
||||
|
||||
|
||||
private boolean almostIntersectsX(AbstractPageBlock apb, float threshold) {
|
||||
|
||||
return this.minX - threshold <= apb.getMaxX() && this.maxX + threshold >= apb.getMinX();
|
||||
}
|
||||
|
||||
|
||||
public abstract boolean isEmpty();
|
||||
|
||||
}
|
||||
|
||||
@ -1,11 +1,13 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
||||
@ -18,7 +20,7 @@ import lombok.NoArgsConstructor;
|
||||
@Data
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@NoArgsConstructor
|
||||
public class Cell extends Rectangle {
|
||||
public class Cell extends BoundingBox {
|
||||
|
||||
private List<TextPageBlock> textBlocks = new ArrayList<>();
|
||||
|
||||
@ -33,13 +35,24 @@ public class Cell extends Rectangle {
|
||||
|
||||
public Cell(Point2D topLeft, Point2D bottomRight) {
|
||||
|
||||
super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), (float) (bottomRight.getY() - topLeft.getY()));
|
||||
this.bBoxInitialUserSpace = new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), (bottomRight.getX() - topLeft.getX()), (bottomRight.getY() - topLeft.getY()));
|
||||
this.bBox = bBoxInitialUserSpace;
|
||||
}
|
||||
|
||||
|
||||
public Cell(Rectangle2D r) {
|
||||
public Cell(Rectangle2D bBoxInitialUserSpace, AffineTransform initialUserSpaceToJava) {
|
||||
|
||||
super((float) r.getY(), (float) r.getX(), (float) r.getWidth(), (float) r.getHeight());
|
||||
this.bBoxInitialUserSpace = bBoxInitialUserSpace;
|
||||
this.bBox = initialUserSpaceToJava.createTransformedShape(bBoxInitialUserSpace).getBounds2D();
|
||||
}
|
||||
|
||||
|
||||
public static Cell copy(Cell cell) {
|
||||
|
||||
Cell copy = new Cell();
|
||||
copy.bBoxInitialUserSpace = cell.bBoxInitialUserSpace;
|
||||
copy.bBox = cell.bBox;
|
||||
return copy;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -7,10 +7,11 @@ import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
|
||||
import lombok.Getter;
|
||||
|
||||
@ -67,17 +68,14 @@ public class CleanRulings {
|
||||
}
|
||||
|
||||
|
||||
public boolean lineBetween(Character a, Character b) {
|
||||
public boolean lineBetween(BoundingBox a, BoundingBox b) {
|
||||
|
||||
return lineBetween(a.getTextPosition().getInitialUserSpacePosition(), b.getTextPosition().getInitialUserSpacePosition());
|
||||
return lineBetween(a.getBBoxInitialUserSpace(), b.getBBoxInitialUserSpace());
|
||||
}
|
||||
|
||||
|
||||
public boolean lineBetween(Rectangle2D a, Rectangle2D b) {
|
||||
|
||||
if (a.intersects(b)) {
|
||||
return false;
|
||||
}
|
||||
return lineBetween(new Point2D.Double(a.getCenterX(), a.getCenterY()), new Point2D.Double(b.getCenterX(), b.getCenterY()));
|
||||
}
|
||||
|
||||
@ -119,7 +117,7 @@ public class CleanRulings {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
List<Ruling> result = new ArrayList<>();
|
||||
List<Ruling> result = new LinkedList<>();
|
||||
for (int i = firstGreaterThanIdx; i < horizontals.size(); i++) {
|
||||
Ruling horizontal = horizontals.get(i);
|
||||
if (horizontal.y1 > endY) {
|
||||
@ -170,7 +168,7 @@ public class CleanRulings {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
List<Ruling> result = new ArrayList<>();
|
||||
List<Ruling> result = new LinkedList<>();
|
||||
for (int i = firstGreaterThanIdx; i < verticals.size(); i++) {
|
||||
Ruling horizontal = verticals.get(i);
|
||||
if (horizontal.x1 > endX) {
|
||||
|
||||
@ -1,218 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
/**
 * A float-precision rectangle with top/left/bottom/right accessors and overlap helpers on top of
 * {@link Rectangle2D.Float}.
 * <p>
 * Inside this class the simple names {@code Float} and {@code Double} resolve to the nested classes inherited
 * from {@link Rectangle2D}; the boxed number types are therefore always written as {@code java.lang.Float}
 * and {@code java.lang.Double}.
 */
@SuppressWarnings("all")
public class Rectangle extends Rectangle2D.Float {

    protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;

    /**
     * Ill-defined comparator, from when Rectangle was Comparable.
     * <p>
     * see https://github.com/tabulapdf/tabula-java/issues/116
     *
     * @deprecated with no replacement
     */
    @Deprecated
    public static final Comparator<Rectangle> ILL_DEFINED_ORDER = new Comparator<Rectangle>() {
        @Override
        public int compare(Rectangle o1, Rectangle o2) {

            if (o1.equals(o2)) {
                return 0;
            }
            if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD) {
                // Same line: order by x, reversed only when both rectangles report -1 from isLtrDominant().
                return o1.isLtrDominant() == -1 && o2.isLtrDominant() == -1 ? -java.lang.Double.compare(o1.getX(), o2.getX()) : java.lang.Double.compare(o1.getX(), o2.getX());
            } else {
                return java.lang.Float.compare(o1.getBottom(), o2.getBottom());
            }
        }
    };


    /** Creates a rectangle with all values zero. */
    public Rectangle() {

        super();
    }


    /**
     * Creates a rectangle from its top-left corner and size. Note the argument order: top (y) comes before left (x).
     *
     * @param top    y of the upper edge
     * @param left   x of the left edge
     * @param width  horizontal size
     * @param height vertical size
     */
    public Rectangle(float top, float left, float width, float height) {

        super();
        this.setRect(left, top, width, height);
    }


    /**
     * Computes the minimum bounding box that contains all the given rectangles.
     *
     * @param rectangles the rectangles to enclose
     * @return minimum bounding box that contains all the rectangles; an empty rectangle for an empty list
     */
    public static Rectangle boundingBoxOf(List<? extends Rectangle> rectangles) {

        if (rectangles.isEmpty()) {
            // Nothing to enclose; avoid building a box from the sentinel values below.
            return new Rectangle();
        }

        float minx = java.lang.Float.MAX_VALUE;
        float miny = java.lang.Float.MAX_VALUE;
        // Float.MIN_VALUE is the smallest POSITIVE float, so it must not seed a running maximum:
        // rectangles lying entirely at negative coordinates would never replace it.
        float maxx = -java.lang.Float.MAX_VALUE;
        float maxy = -java.lang.Float.MAX_VALUE;

        for (Rectangle r : rectangles) {
            minx = (float) Math.min(r.getMinX(), minx);
            miny = (float) Math.min(r.getMinY(), miny);
            maxx = (float) Math.max(r.getMaxX(), maxx);
            maxy = (float) Math.max(r.getMaxY(), maxy);
        }
        return new Rectangle(miny, minx, maxx - minx, maxy - miny);
    }


    /**
     * Compares via {@link #ILL_DEFINED_ORDER}.
     *
     * @param other the rectangle to compare with
     * @return the result of {@code ILL_DEFINED_ORDER.compare(this, other)}
     */
    public int compareTo(Rectangle other) {

        return ILL_DEFINED_ORDER.compare(this, other);
    }


    /**
     * Direction hint consulted by {@link #ILL_DEFINED_ORDER}; when it is -1 for both operands the x ordering
     * is reversed. This implementation always returns 0.
     *
     * @return 0
     */
    public int isLtrDominant() {

        return 0;
    }


    /**
     * @return width multiplied by height
     */
    public float getArea() {

        return this.width * this.height;
    }


    /**
     * @param other the rectangle to compare with
     * @return length of the y-range shared with {@code other}, 0 if there is none
     */
    public float verticalOverlap(Rectangle other) {

        return Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
    }


    /**
     * @param other the rectangle to compare with
     * @return {@code true} if {@link #verticalOverlap(Rectangle)} is positive
     */
    public boolean verticallyOverlaps(Rectangle other) {

        return verticalOverlap(other) > 0;
    }


    /**
     * @param other the rectangle to compare with
     * @return length of the x-range shared with {@code other}, 0 if there is none
     */
    public float horizontalOverlap(Rectangle other) {

        return Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
    }


    /**
     * @param other the rectangle to compare with
     * @return {@code true} if {@link #horizontalOverlap(Rectangle)} is positive
     */
    public boolean horizontallyOverlaps(Rectangle other) {

        return horizontalOverlap(other) > 0;
    }


    /**
     * Shared vertical extent divided by the height of the smaller of the two rectangles.
     *
     * @param other the rectangle to compare with
     * @return the ratio, or 0 when none of the four overlap configurations below applies
     */
    public float verticalOverlapRatio(Rectangle other) {

        float rv = 0, delta = Math.min(this.getBottom() - this.getTop(), other.getBottom() - other.getTop());

        if (other.getTop() <= this.getTop() && this.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) {
            // other starts above this and ends inside it
            rv = (other.getBottom() - this.getTop()) / delta;
        } else if (this.getTop() <= other.getTop() && other.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) {
            // this starts above other and ends inside it
            rv = (this.getBottom() - other.getTop()) / delta;
        } else if (this.getTop() <= other.getTop() && other.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) {
            // other lies vertically within this
            rv = (other.getBottom() - other.getTop()) / delta;
        } else if (other.getTop() <= this.getTop() && this.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) {
            // this lies vertically within other
            rv = (this.getBottom() - this.getTop()) / delta;
        }

        return rv;

    }


    /**
     * Intersection area divided by union area of the two rectangles.
     *
     * @param other the rectangle to compare with
     * @return intersection-over-union of this rectangle and {@code other}
     */
    public float overlapRatio(Rectangle other) {

        double intersectionWidth = Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
        double intersectionHeight = Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
        double intersectionArea = Math.max(0, intersectionWidth * intersectionHeight);
        double unionArea = this.getArea() + other.getArea() - intersectionArea;

        return (float) (intersectionArea / unionArea);
    }


    /**
     * Grows this rectangle in place to the union with {@code other}.
     *
     * @param other the rectangle to merge in
     * @return this rectangle
     */
    public Rectangle merge(Rectangle other) {

        this.setRect(this.createUnion(other));
        return this;
    }


    /**
     * @return the minimum y
     */
    public float getTop() {

        return (float) this.getMinY();
    }


    /**
     * Moves the upper edge to {@code top}; the height is reduced by the distance moved.
     *
     * @param top new y of the upper edge
     */
    public void setTop(float top) {

        float deltaHeight = top - this.y;
        this.setRect(this.x, top, this.width, this.height - deltaHeight);
    }


    /**
     * @return the maximum x
     */
    public float getRight() {

        return (float) this.getMaxX();
    }


    /**
     * Sets the width so that the right edge is at {@code right}; x is unchanged.
     *
     * @param right new x of the right edge
     */
    public void setRight(float right) {

        this.setRect(this.x, this.y, right - this.x, this.height);
    }


    /**
     * @return the minimum x
     */
    public float getLeft() {

        return (float) this.getMinX();
    }


    /**
     * Moves the left edge to {@code left}; the width is reduced by the distance moved.
     *
     * @param left new x of the left edge
     */
    public void setLeft(float left) {

        float deltaWidth = left - this.x;
        this.setRect(left, this.y, this.width - deltaWidth, this.height);
    }


    /**
     * @return the maximum y
     */
    public float getBottom() {

        return (float) this.getMaxY();
    }


    /**
     * Sets the height so that the lower edge is at {@code bottom}; y is unchanged.
     *
     * @param bottom new y of the lower edge
     */
    public void setBottom(float bottom) {

        this.setRect(this.x, this.y, this.width, bottom - this.y);
    }


    /**
     * @return the four corners in the order (left, top), (right, top), (right, bottom), (left, bottom)
     */
    public Point2D[] getPoints() {

        return new Point2D[]{new Point2D.Float(this.getLeft(), this.getTop()), new Point2D.Float(this.getRight(), this.getTop()), new Point2D.Float(this.getRight(),
                this.getBottom()), new Point2D.Float(this.getLeft(), this.getBottom())};
    }


    /**
     * @return the superclass representation with {@code bottom} and {@code right} appended before the closing bracket
     */
    @Override
    public String toString() {

        StringBuilder sb = new StringBuilder();
        String s = super.toString();
        // Drop the trailing ']' of the superclass output so the extra fields can be appended.
        sb.append(s.substring(0, s.length() - 1));
        sb.append(String.format(",bottom=%f,right=%f]", this.getBottom(), this.getRight()));
        return sb.toString();
    }

}
|
||||
@ -1,7 +1,6 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
@ -37,14 +36,11 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
private List<Cell> cells;
|
||||
|
||||
|
||||
public TablePageBlock(List<Cell> cells, Rectangle area, int rotation) {
|
||||
public TablePageBlock(List<Cell> cells, int rotation) {
|
||||
|
||||
setToBBoxOfComponents(cells);
|
||||
this.cells = cells;
|
||||
addCells(cells);
|
||||
minX = area.getLeft();
|
||||
minY = area.getBottom();
|
||||
maxX = area.getRight();
|
||||
maxY = area.getTop();
|
||||
classification = PageBlockType.TABLE;
|
||||
this.rotation = rotation;
|
||||
}
|
||||
@ -130,7 +126,8 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
List<Cell> cellsToTheTop = new ArrayList<>();
|
||||
for (int i = 0; i < rowIndex; i++) {
|
||||
try {
|
||||
cellsToTheTop.add(rows.get(i).get(colIndex));
|
||||
cellsToTheTop.add(rows.get(i)
|
||||
.get(colIndex));
|
||||
} catch (IndexOutOfBoundsException e) {
|
||||
log.debug("No cell {} in row {}, ignoring.", colIndex, rowIndex);
|
||||
}
|
||||
@ -145,7 +142,8 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
if (lastHeaderCell != null) {
|
||||
cell.getHeaderCells().add(lastHeaderCell);
|
||||
}
|
||||
if (!cell.getTextBlocks().isEmpty() && cell.getTextBlocks().get(0).getMostPopularWordStyle().equals("bold")) {
|
||||
if (!cell.getTextBlocks().isEmpty() && cell.getTextBlocks()
|
||||
.get(0).getMostPopularWordStyle().equals("bold")) {
|
||||
cell.setHeaderCell(true);
|
||||
}
|
||||
}
|
||||
@ -209,7 +207,8 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
|
||||
for (int i = 0; i < rowsOfCellsMatrix.size(); i++) {
|
||||
for (int j = 0; j < rowsOfCellsMatrix.get(i).size(); j++) {
|
||||
addCellToRowAndCol(rowsOfCellsMatrix.get(i).get(j), i, j);
|
||||
addCellToRowAndCol(rowsOfCellsMatrix.get(i)
|
||||
.get(j), i, j);
|
||||
}
|
||||
}
|
||||
|
||||
@ -228,15 +227,15 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
||||
Set<Float> uniqueX = new HashSet<>();
|
||||
Set<Float> uniqueY = new HashSet<>();
|
||||
Set<Double> uniqueX = new HashSet<>();
|
||||
Set<Double> uniqueY = new HashSet<>();
|
||||
cells.stream()
|
||||
.filter(c -> !c.getTextBlocks().isEmpty() || c.getHeight() > 3 && c.getWidth() > 3)
|
||||
.forEach(c -> {
|
||||
uniqueX.add(c.getLeft());
|
||||
uniqueX.add(c.getRight());
|
||||
uniqueY.add(c.getBottom());
|
||||
uniqueY.add(c.getTop());
|
||||
uniqueX.add(c.getPdfMinX());
|
||||
uniqueX.add(c.getPdfMaxX());
|
||||
uniqueY.add(c.getPdfMinY());
|
||||
uniqueY.add(c.getPdfMaxY());
|
||||
});
|
||||
|
||||
var sortedUniqueX = uniqueX.stream()
|
||||
@ -248,22 +247,24 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
|
||||
List<List<Cell>> rowsOfCells = new ArrayList<>();
|
||||
|
||||
Float prevY = null;
|
||||
Double prevY = null;
|
||||
|
||||
for (Float y : sortedUniqueY) {
|
||||
for (Double y : sortedUniqueY) {
|
||||
|
||||
List<Cell> row = new ArrayList<>();
|
||||
|
||||
Float prevX = null;
|
||||
for (Float x : sortedUniqueX) {
|
||||
Double prevX = null;
|
||||
for (Double x : sortedUniqueX) {
|
||||
|
||||
if (prevY != null && prevX != null) {
|
||||
var cellFromGridStructure = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y));
|
||||
var cellFromGridStructure = new Cell(new Point2D.Double(prevX, prevY), new Point2D.Double(x, y));
|
||||
|
||||
if (cellFromGridStructure.hasMinimumSize()) {
|
||||
|
||||
cells.stream()
|
||||
.map(originalCell -> new CellWithIntersection(originalCell, RectangleTransformations.calculateIntersectedArea(cellFromGridStructure, originalCell)))
|
||||
.map(originalCell -> new CellWithIntersection(originalCell,
|
||||
RectangleTransformations.calculateIntersectedArea(cellFromGridStructure.getBBoxInitialUserSpace(),
|
||||
originalCell.getBBoxInitialUserSpace())))
|
||||
.filter(cellWithIntersection -> cellWithIntersection.intersectedArea > 0)
|
||||
.filter(cellWithIntersection -> cellWithIntersection.originalCell.getArea() > cellWithIntersection.intersectedArea * CELL_AREA_CONTAINED_THRESHOLD)
|
||||
.max(Comparator.comparing(CellWithIntersection::intersectedArea))
|
||||
@ -411,16 +412,6 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
}
|
||||
|
||||
|
||||
public Rectangle2D getBBox() {
|
||||
|
||||
if (this.bBox == null) {
|
||||
this.bBox = cells.stream()
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
}
|
||||
return this.bBox;
|
||||
}
|
||||
|
||||
|
||||
record CellWithIntersection(Cell originalCell, double intersectedArea) {
|
||||
|
||||
}
|
||||
|
||||
@ -6,6 +6,7 @@ import java.awt.geom.Rectangle2D;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
@ -17,11 +18,10 @@ import lombok.SneakyThrows;
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class RedTextPosition {
|
||||
public class RedTextPosition extends BoundingBox {
|
||||
|
||||
private final static int HEIGHT_PADDING = 2;
|
||||
private Rectangle2D.Float directionAdjustedPosition;
|
||||
private Rectangle2D initialUserSpacePosition;
|
||||
public final static int HEIGHT_PADDING = 2;
|
||||
private Rectangle2D.Float directionAdjustedPosition; // adjusted to text rotation
|
||||
|
||||
@JsonIgnore
|
||||
private int rotation;
|
||||
@ -63,17 +63,21 @@ public class RedTextPosition {
|
||||
pos.setFontSizeInPt(textPosition.getFontSizeInPt());
|
||||
pos.setFontName(textPosition.getFont().getName());
|
||||
|
||||
//TODO: There is a mismatch in the java coords of the text and the rulings,
|
||||
// I guess if we start with the initial user space positions and transform them the same way we do the rulings it would work.
|
||||
pos.setBBox(new Rectangle2D.Float(textPosition.getX(), textPosition.getY(), textPosition.getWidthDirAdj(), textPosition.getHeight()));
|
||||
|
||||
float textHeight = textPosition.getHeight() + HEIGHT_PADDING;
|
||||
Rectangle2D.Float dirAdjPosition = new Rectangle2D.Float(textPosition.getXDirAdj(),
|
||||
textPosition.getYDirAdj() - textHeight,
|
||||
textPosition.getWidthDirAdj(),
|
||||
textHeight + HEIGHT_PADDING);
|
||||
textPosition.getYDirAdj() - textHeight,
|
||||
textPosition.getWidthDirAdj(),
|
||||
textHeight + HEIGHT_PADDING);
|
||||
pos.setDirectionAdjustedPosition(dirAdjPosition);
|
||||
|
||||
AffineTransform affineTransform = getRotationMatrix(TextDirection.fromDegrees(textPosition.getDir()), textPosition.getPageWidth(), textPosition.getPageHeight());
|
||||
Rectangle2D initialUserSpacePositionRect = affineTransform.createTransformedShape(dirAdjPosition).getBounds2D();
|
||||
|
||||
pos.setInitialUserSpacePosition(initialUserSpacePositionRect);
|
||||
pos.setBBoxInitialUserSpace(initialUserSpacePositionRect); // These are definitely correct
|
||||
|
||||
return pos;
|
||||
}
|
||||
@ -97,6 +101,7 @@ public class RedTextPosition {
|
||||
return transform;
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public float getXDirAdj() {
|
||||
|
||||
@ -107,7 +112,7 @@ public class RedTextPosition {
|
||||
@JsonIgnore
|
||||
public float getYDirAdj() {
|
||||
|
||||
return this.directionAdjustedPosition.y;
|
||||
return this.directionAdjustedPosition.y;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -1,18 +1,13 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import static java.util.stream.Collectors.toSet;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
@ -31,34 +26,31 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
@Builder.Default
|
||||
private List<TextPositionSequence> sequences = new ArrayList<>();
|
||||
|
||||
@JsonIgnore
|
||||
private int rotation;
|
||||
|
||||
@JsonIgnore
|
||||
private String mostPopularWordFont;
|
||||
|
||||
@JsonIgnore
|
||||
private String mostPopularWordStyle;
|
||||
|
||||
@JsonIgnore
|
||||
private float mostPopularWordFontSize;
|
||||
|
||||
@JsonIgnore
|
||||
private float mostPopularWordHeight;
|
||||
|
||||
@JsonIgnore
|
||||
private float mostPopularWordSpaceWidth;
|
||||
|
||||
@JsonIgnore
|
||||
private float highestFontSize;
|
||||
|
||||
@JsonIgnore
|
||||
private PageBlockType classification;
|
||||
|
||||
@JsonIgnore
|
||||
private boolean toDuplicate;
|
||||
|
||||
|
||||
public TextPageBlock(List<TextPositionSequence> sequences) {
|
||||
|
||||
this.sequences = sequences;
|
||||
calculateFrequencyCounters();
|
||||
calculateBBox();
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public TextDirection getDir() {
|
||||
|
||||
@ -66,34 +58,40 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
private float getPageHeight() {
|
||||
private void calculateBBox() {
|
||||
|
||||
return sequences.get(0).getPageHeight();
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
private float getPageWidth() {
|
||||
|
||||
return sequences.get(0).getPageWidth();
|
||||
if (sequences == null) {
|
||||
this.bBox = new Rectangle2D.Double();
|
||||
this.bBoxInitialUserSpace = new Rectangle2D.Double();
|
||||
return;
|
||||
}
|
||||
setToBBoxOfComponents(sequences);
|
||||
}
|
||||
|
||||
|
||||
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
|
||||
|
||||
if (textBlocksToMerge.isEmpty()) {
|
||||
throw new IllegalArgumentException("Need to provide at least one TextPageBlock.");
|
||||
}
|
||||
if (textBlocksToMerge.stream()
|
||||
.map(AbstractPageBlock::getPage)
|
||||
.distinct()
|
||||
.count() != 1) {
|
||||
throw new IllegalArgumentException("Cannot merge textBlocks on different pages.");
|
||||
}
|
||||
|
||||
List<TextPositionSequence> sequences = textBlocksToMerge.stream()
|
||||
.map(TextPageBlock::getSequences)
|
||||
.flatMap(java.util.Collection::stream)
|
||||
.toList();
|
||||
sequences = new ArrayList<>(sequences);
|
||||
return fromTextPositionSequences(sequences);
|
||||
|
||||
return new TextPageBlock(sequences);
|
||||
}
|
||||
|
||||
|
||||
public static TextPageBlock fromTextPositionSequences(List<TextPositionSequence> wordBlockList) {
|
||||
|
||||
TextPageBlock textBlock = null;
|
||||
private void calculateFrequencyCounters() {
|
||||
|
||||
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
||||
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
|
||||
@ -101,7 +99,7 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
|
||||
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
|
||||
|
||||
for (TextPositionSequence wordBlock : wordBlockList) {
|
||||
for (TextPositionSequence wordBlock : sequences) {
|
||||
|
||||
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
|
||||
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
|
||||
@ -109,172 +107,23 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
fontFrequencyCounter.add(wordBlock.getFont());
|
||||
styleFrequencyCounter.add(wordBlock.getFontStyle());
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
|
||||
wordBlock.getMaxXDirAdj(),
|
||||
wordBlock.getMinYDirAdj(),
|
||||
wordBlock.getMaxYDirAdj(),
|
||||
wordBlockList,
|
||||
wordBlock.getRotation());
|
||||
} else {
|
||||
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
||||
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
||||
}
|
||||
}
|
||||
|
||||
if (textBlock != null) {
|
||||
textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
|
||||
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
||||
}
|
||||
|
||||
if (textBlock != null
|
||||
&& textBlock.getSequences() != null
|
||||
&& textBlock.getSequences()
|
||||
.stream()
|
||||
.map(t -> DoubleComparisons.round(t.getMinYDirAdj(), 3))
|
||||
.collect(toSet()).size() == 1) {
|
||||
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
|
||||
public Rectangle2D getBBox() {
|
||||
|
||||
if (this.bBox == null) {
|
||||
this.bBox = sequences.stream()
|
||||
.map(TextPositionSequence::getBoundingBox)
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
}
|
||||
return this.bBox;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the minX value in pdf coordinate system.
|
||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
||||
* 0 -> LowerLeft
|
||||
* 90 -> UpperLeft
|
||||
* 180 -> UpperRight
|
||||
* 270 -> LowerRight
|
||||
*
|
||||
* @return the minX value in pdf coordinate system
|
||||
*/
|
||||
@JsonIgnore
|
||||
public float getPdfMinX() {
|
||||
|
||||
if (getDir().getDegrees() == 90) {
|
||||
return minY;
|
||||
} else if (getDir().getDegrees() == 180) {
|
||||
return getPageWidth() - maxX;
|
||||
|
||||
} else if (getDir().getDegrees() == 270) {
|
||||
|
||||
return getPageWidth() - maxY;
|
||||
} else {
|
||||
return minX;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the maxX value in pdf coordinate system.
|
||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
||||
* 0 -> LowerLeft
|
||||
* 90 -> UpperLeft
|
||||
* 180 -> UpperRight
|
||||
* 270 -> LowerRight
|
||||
*
|
||||
* @return the maxX value in pdf coordinate system
|
||||
*/
|
||||
@JsonIgnore
|
||||
public float getPdfMaxX() {
|
||||
|
||||
if (getDir().getDegrees() == 90) {
|
||||
return maxY;
|
||||
} else if (getDir().getDegrees() == 180) {
|
||||
return getPageWidth() - minX;
|
||||
} else if (getDir().getDegrees() == 270) {
|
||||
return getPageWidth() - minY;
|
||||
|
||||
} else {
|
||||
return maxX;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the minY value in pdf coordinate system.
|
||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
||||
* 0 -> LowerLeft
|
||||
* 90 -> UpperLeft
|
||||
* 180 -> UpperRight
|
||||
* 270 -> LowerRight
|
||||
*
|
||||
* @return the minY value in pdf coordinate system
|
||||
*/
|
||||
@JsonIgnore
|
||||
public float getPdfMinY() {
|
||||
|
||||
if (getDir().getDegrees() == 90) {
|
||||
return minX;
|
||||
} else if (getDir().getDegrees() == 180) {
|
||||
return maxY;
|
||||
|
||||
} else if (getDir().getDegrees() == 270) {
|
||||
return getPageHeight() - maxX;
|
||||
|
||||
} else {
|
||||
return getPageHeight() - maxY;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the maxY value in pdf coordinate system.
|
||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
||||
* 0 -> LowerLeft
|
||||
* 90 -> UpperLeft
|
||||
* 180 -> UpperRight
|
||||
* 270 -> LowerRight
|
||||
*
|
||||
* @return the maxY value in pdf coordinate system
|
||||
*/
|
||||
@JsonIgnore
|
||||
public float getPdfMaxY() {
|
||||
|
||||
if (getDir().getDegrees() == 90) {
|
||||
return maxX;
|
||||
} else if (getDir().getDegrees() == 180) {
|
||||
|
||||
return minY;
|
||||
} else if (getDir().getDegrees() == 270) {
|
||||
return getPageHeight() - minX;
|
||||
} else {
|
||||
return getPageHeight() - minY;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public TextPageBlock(float minX, float maxX, float minY, float maxY, List<TextPositionSequence> sequences, int rotation) {
|
||||
|
||||
this.minX = minX;
|
||||
this.maxX = maxX;
|
||||
this.minY = minY;
|
||||
this.maxY = maxY;
|
||||
this.sequences = sequences;
|
||||
this.rotation = rotation;
|
||||
setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
|
||||
setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
|
||||
setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
|
||||
setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
|
||||
setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
|
||||
setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
||||
}
|
||||
|
||||
|
||||
public TextPageBlock union(TextPositionSequence r) {
|
||||
|
||||
TextPageBlock union = this.copy();
|
||||
union.add(r);
|
||||
union.getSequences().add(r);
|
||||
calculateFrequencyCounters();
|
||||
calculateBBox();
|
||||
return union;
|
||||
}
|
||||
|
||||
@ -282,64 +131,32 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
public TextPageBlock union(TextPageBlock r) {
|
||||
|
||||
TextPageBlock union = this.copy();
|
||||
union.add(r);
|
||||
union.getSequences().addAll(r.getSequences());
|
||||
calculateFrequencyCounters();
|
||||
calculateBBox();
|
||||
return union;
|
||||
}
|
||||
|
||||
|
||||
public void add(TextPageBlock r) {
|
||||
|
||||
if (r.getMinX() < minX) {
|
||||
minX = r.getMinX();
|
||||
}
|
||||
if (r.getMaxX() > maxX) {
|
||||
maxX = r.getMaxX();
|
||||
}
|
||||
if (r.getMinY() < minY) {
|
||||
minY = r.getMinY();
|
||||
}
|
||||
if (r.getMaxY() > maxY) {
|
||||
maxY = r.getMaxY();
|
||||
}
|
||||
sequences.addAll(r.getSequences());
|
||||
calculateFrequencyCounters();
|
||||
calculateBBox();
|
||||
}
|
||||
|
||||
|
||||
public void add(TextPositionSequence r) {
|
||||
|
||||
if (r.getMinXDirAdj() < minX) {
|
||||
minX = r.getMinXDirAdj();
|
||||
}
|
||||
if (r.getMaxXDirAdj() > maxX) {
|
||||
maxX = r.getMaxXDirAdj();
|
||||
}
|
||||
if (r.getMinYDirAdj() < minY) {
|
||||
minY = r.getMinYDirAdj();
|
||||
}
|
||||
if (r.getMaxYDirAdj() > maxY) {
|
||||
maxY = r.getMaxYDirAdj();
|
||||
}
|
||||
sequences.add(r);
|
||||
calculateFrequencyCounters();
|
||||
calculateBBox();
|
||||
}
|
||||
|
||||
|
||||
public TextPageBlock copy() {
|
||||
|
||||
return new TextPageBlock(minX, maxX, minY, maxY, sequences, rotation);
|
||||
}
|
||||
|
||||
|
||||
public void resize(float x1, float y1, float width, float height) {
|
||||
|
||||
set(x1, y1, x1 + width, y1 + height);
|
||||
}
|
||||
|
||||
|
||||
public void set(float x1, float y1, float x2, float y2) {
|
||||
|
||||
this.minX = Math.min(x1, x2);
|
||||
this.maxX = Math.max(x1, x2);
|
||||
this.minY = Math.min(y1, y2);
|
||||
this.maxY = Math.max(y1, y2);
|
||||
return new TextPageBlock(new ArrayList<>(sequences));
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -1,8 +1,5 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
@ -10,16 +7,13 @@ import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@ -27,8 +21,8 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
|
||||
public class TextPositionSequence implements CharSequence {
|
||||
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = false)
|
||||
public class TextPositionSequence extends BoundingBox implements CharSequence {
|
||||
|
||||
public static final int HEIGHT_PADDING = 2;
|
||||
|
||||
@ -59,8 +53,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
this.pageHeight = textPositions.get(0).getPageHeight();
|
||||
this.pageWidth = textPositions.get(0).getPageWidth();
|
||||
this.isParagraphStart = isParagraphStart;
|
||||
|
||||
|
||||
setToBBoxOfComponents(getTextPositions());
|
||||
}
|
||||
|
||||
|
||||
@ -72,6 +65,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
this.rotation = textPositions.get(0).getRotation();
|
||||
this.pageHeight = textPositions.get(0).getPageHeight();
|
||||
this.pageWidth = textPositions.get(0).getPageWidth();
|
||||
setToBBoxOfComponents(getTextPositions());
|
||||
}
|
||||
|
||||
|
||||
@ -109,7 +103,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
textPositionSequence.rotation = rotation;
|
||||
textPositionSequence.pageHeight = pageHeight;
|
||||
textPositionSequence.pageWidth = pageWidth;
|
||||
|
||||
textPositionSequence.setToBBoxOfComponents(getTextPositions());
|
||||
return textPositionSequence;
|
||||
}
|
||||
|
||||
@ -139,18 +133,20 @@ public class TextPositionSequence implements CharSequence {
|
||||
this.rotation = textPositionSequence.getRotation();
|
||||
this.pageHeight = textPositionSequence.getPageHeight();
|
||||
this.pageWidth = textPositionSequence.getPageWidth();
|
||||
setToBBoxOfComponents(getTextPositions());
|
||||
;
|
||||
}
|
||||
|
||||
|
||||
public void add(TextPosition textPosition) {
|
||||
|
||||
this.textPositions.add(RedTextPosition.fromTextPosition(textPosition));
|
||||
|
||||
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
|
||||
this.rotation = textPositions.get(0).getRotation();
|
||||
this.pageHeight = textPositions.get(0).getPageHeight();
|
||||
this.pageWidth = textPositions.get(0).getPageWidth();
|
||||
|
||||
setToBBoxOfComponents(getTextPositions());
|
||||
;
|
||||
}
|
||||
|
||||
|
||||
@ -222,18 +218,6 @@ public class TextPositionSequence implements CharSequence {
|
||||
}
|
||||
|
||||
|
||||
public float getHeight() {
|
||||
|
||||
return getMaxYDirAdj() - getMinYDirAdj();
|
||||
}
|
||||
|
||||
|
||||
public float getWidth() {
|
||||
|
||||
return getMaxXDirAdj() - getMinXDirAdj();
|
||||
}
|
||||
|
||||
|
||||
public String getFont() {
|
||||
|
||||
if (textPositions.get(0).getFontName() == null) {
|
||||
@ -273,62 +257,5 @@ public class TextPositionSequence implements CharSequence {
|
||||
return textPositions.get(0).getWidthOfSpace();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* This returns the bounding box of the word in Pdf Coordinate System where {0,0} rotated with the page rotation.
|
||||
* 0 -> LowerLeft
|
||||
* 90 -> UpperLeft
|
||||
* 180 -> UpperRight
|
||||
* 270 -> LowerRight
|
||||
*
|
||||
* @return bounding box of the word in Pdf Coordinate System
|
||||
*/
|
||||
|
||||
@SneakyThrows
|
||||
public Rectangle getRectangle() {
|
||||
|
||||
log.debug("Page: '{}', Word: '{}', Rotation: '{}', textRotation {}", page, this, rotation, dir);
|
||||
|
||||
float textHeight = getTextHeight();
|
||||
|
||||
RedTextPosition firstTextPos = textPositions.get(0);
|
||||
RedTextPosition lastTextPos = textPositions.get(textPositions.size() - 1);
|
||||
|
||||
Point2D bottomLeft = new Point2D.Double(firstTextPos.getXDirAdj(), firstTextPos.getYDirAdj() - HEIGHT_PADDING);
|
||||
Point2D topRight = new Point2D.Double(lastTextPos.getXDirAdj() + lastTextPos.getWidthDirAdj(), lastTextPos.getYDirAdj() + textHeight + HEIGHT_PADDING);
|
||||
|
||||
AffineTransform transform = new AffineTransform();
|
||||
if (dir == TextDirection.ZERO || dir == TextDirection.HALF_CIRCLE) {
|
||||
transform.rotate(dir.getRadians(), pageWidth / 2f, pageHeight / 2f);
|
||||
transform.translate(0f, pageHeight + textHeight);
|
||||
transform.scale(1., -1.);
|
||||
} else if (dir == TextDirection.QUARTER_CIRCLE) {
|
||||
transform.rotate(dir.getRadians(), pageWidth / 2f, pageWidth / 2f);
|
||||
transform.translate(0f, pageWidth + textHeight);
|
||||
transform.scale(1., -1.);
|
||||
} else {
|
||||
transform.rotate(dir.getRadians(), pageHeight / 2f, pageHeight / 2f);
|
||||
transform.translate(0f, pageWidth + textHeight);
|
||||
transform.scale(1., -1.);
|
||||
}
|
||||
|
||||
bottomLeft = transform.transform(bottomLeft, null);
|
||||
topRight = transform.transform(topRight, null);
|
||||
|
||||
return new Rectangle( //
|
||||
new Point((float) bottomLeft.getX(), (float) bottomLeft.getY()),
|
||||
(float) (topRight.getX() - bottomLeft.getX()),
|
||||
(float) (topRight.getY() - bottomLeft.getY()),
|
||||
page);
|
||||
}
|
||||
|
||||
|
||||
public Rectangle2D getBoundingBox() {
|
||||
|
||||
return getTextPositions().stream()
|
||||
.map(RedTextPosition::getInitialUserSpacePosition)
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -160,11 +160,12 @@ public class BodyTextFrameService {
|
||||
continue;
|
||||
}
|
||||
|
||||
float approxLineCount = PositionUtils.getApproxLineCount(textBlock);
|
||||
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)
|
||||
&& approxLineCount < approximateHeaderLineCount
|
||||
&& textBlock.getMaxY() >= page.getPageHeight() - (page.getPageHeight() / 10)
|
||||
|| !layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD) && approxLineCount < approximateHeaderLineCount) {
|
||||
double approxLineCount = PositionUtils.getApproxLineCount(textBlock);
|
||||
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD) //
|
||||
&& approxLineCount < approximateHeaderLineCount //
|
||||
&& textBlock.getMaxY() >= page.getPageHeight() - (page.getPageHeight() / 10)//
|
||||
|| !layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD) //
|
||||
&& approxLineCount < approximateHeaderLineCount) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -190,9 +191,9 @@ public class BodyTextFrameService {
|
||||
}
|
||||
}
|
||||
}
|
||||
return new Rectangle(new Point(expansionsRectangle.minX, expansionsRectangle.minY),
|
||||
expansionsRectangle.maxX - expansionsRectangle.minX,
|
||||
expansionsRectangle.maxY - expansionsRectangle.minY,
|
||||
return new Rectangle(new Point((float) expansionsRectangle.minX, (float) expansionsRectangle.minY),
|
||||
(float) (expansionsRectangle.maxX - expansionsRectangle.minX),
|
||||
(float) (expansionsRectangle.maxY - expansionsRectangle.minY),
|
||||
0);
|
||||
}
|
||||
|
||||
@ -231,10 +232,10 @@ public class BodyTextFrameService {
|
||||
|
||||
private class BodyTextFrameExpansionsRectangle {
|
||||
|
||||
float minX = 10000;
|
||||
float maxX = -100;
|
||||
float minY = 10000;
|
||||
float maxY = -100;
|
||||
double minX = 10000;
|
||||
double maxX = -100;
|
||||
double minY = 10000;
|
||||
double maxY = -100;
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -69,7 +69,7 @@ public class GapDetectionService {
|
||||
|
||||
private static Rectangle2D toRectangle2D(TextPositionSequence textPosition) {
|
||||
|
||||
return mirrorY(RectangleTransformations.toRectangle2D(textPosition.getRectangle()));
|
||||
return mirrorY(textPosition.getBBox());
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -180,7 +180,7 @@ public class LineDetectionService {
|
||||
|
||||
private Rectangle2D textPositionBBox(List<TextPositionSequence> textPositionSequences) {
|
||||
|
||||
return RectangleTransformations.rectangleBBox(textPositionSequences.stream().map(TextPositionSequence::getRectangle).toList());
|
||||
return RectangleTransformations.rectangle2DBBox(textPositionSequences.stream().map(TextPositionSequence::getBBox).toList());
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.X_FIRST_RULING_COMPARATOR;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
@ -12,9 +13,9 @@ import java.util.stream.Collectors;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.UnionFind;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
@ -51,37 +52,37 @@ public class RulingCleaningService {
|
||||
|
||||
private Rulings cleanRulings(Rulings rulings) {
|
||||
|
||||
List<List<Rectangle>> groupedOverlappingVerticalRectangles = groupOverlappingRectangles(rulings.verticalLines.stream()
|
||||
.map(RulingCleaningService::getOverlapRectangle)
|
||||
.distinct()
|
||||
.toList());
|
||||
List<Ruling> cleanedVerticalRulings = groupedOverlappingVerticalRectangles.stream()
|
||||
.map(rectList -> getXCenteredRuling(Rectangle.boundingBoxOf(rectList)))
|
||||
.toList();
|
||||
|
||||
List<List<Rectangle>> groupedOverlappingHorizontalRectangles = groupOverlappingRectangles(rulings.horizontalLines.stream()
|
||||
List<List<Rectangle2D>> groupedOverlappingVerticalRectangles = groupOverlappingRectangles(rulings.verticalLines.stream()
|
||||
.map(RulingCleaningService::getOverlapRectangle)
|
||||
.distinct()
|
||||
.toList());
|
||||
List<Ruling> cleanedVerticalRulings = groupedOverlappingVerticalRectangles.stream()
|
||||
.map(rectList -> getXCenteredRuling(RectangleTransformations.rectangle2DBBox(rectList)))
|
||||
.toList();
|
||||
|
||||
List<List<Rectangle2D>> groupedOverlappingHorizontalRectangles = groupOverlappingRectangles(rulings.horizontalLines.stream()
|
||||
.map(RulingCleaningService::getOverlapRectangle)
|
||||
.distinct()
|
||||
.toList());
|
||||
|
||||
List<Ruling> cleanedHorizontalRulings = groupedOverlappingHorizontalRectangles.stream()
|
||||
.map(rectList -> getYCenteredRuling(Rectangle.boundingBoxOf(rectList)))
|
||||
.map(rectList -> getYCenteredRuling(RectangleTransformations.rectangle2DBBox(rectList)))
|
||||
.collect(Collectors.toList());
|
||||
|
||||
return new Rulings(cleanedVerticalRulings, cleanedHorizontalRulings);
|
||||
}
|
||||
|
||||
|
||||
private List<List<Rectangle>> groupOverlappingRectangles(List<Rectangle> rectangles) {
|
||||
private List<List<Rectangle2D>> groupOverlappingRectangles(List<Rectangle2D> rectangles) {
|
||||
|
||||
UnionFind<Rectangle> unionFind = new UnionFind<>();
|
||||
UnionFind<Rectangle2D> unionFind = new UnionFind<>();
|
||||
for (int i = 0; i < rectangles.size(); i++) {
|
||||
for (int j = i + 1; j < rectangles.size(); j++) {
|
||||
Rectangle rectangle1 = rectangles.get(i);
|
||||
Rectangle rectangle2 = rectangles.get(j);
|
||||
Rectangle2D rectangle1 = rectangles.get(i);
|
||||
Rectangle2D rectangle2 = rectangles.get(j);
|
||||
|
||||
// we can stop early when we are too far off because of x-y-sorting
|
||||
if(rectangle1.getRight() < rectangle2.getLeft() && rectangle1.getBottom() < rectangle2.getTop()) {
|
||||
if (rectangle1.getMaxX() < rectangle2.getMinX() && rectangle1.getMaxY() < rectangle2.getMinY()) {
|
||||
break;
|
||||
}
|
||||
|
||||
@ -91,16 +92,16 @@ public class RulingCleaningService {
|
||||
}
|
||||
}
|
||||
|
||||
Map<Rectangle, List<Rectangle>> groups = new HashMap<>();
|
||||
for (Rectangle rectangle : rectangles) {
|
||||
Rectangle root = unionFind.find(rectangle);
|
||||
Map<Rectangle2D, List<Rectangle2D>> groups = new HashMap<>();
|
||||
for (Rectangle2D rectangle : rectangles) {
|
||||
Rectangle2D root = unionFind.find(rectangle);
|
||||
groups.computeIfAbsent(root, k -> new ArrayList<>()).add(rectangle);
|
||||
}
|
||||
return new ArrayList<>(groups.values());
|
||||
}
|
||||
|
||||
|
||||
private static Rectangle getOverlapRectangle(Ruling ruling) {
|
||||
private static Rectangle2D getOverlapRectangle(Ruling ruling) {
|
||||
|
||||
float top;
|
||||
float left;
|
||||
@ -123,34 +124,34 @@ public class RulingCleaningService {
|
||||
}
|
||||
|
||||
if (ruling.isHorizontal()) {
|
||||
return new Rectangle(top - THRESHOLD_Y_HORIZONTAL, left - THRESHOLD_X_HORIZONTAL, w + 2 * THRESHOLD_X_HORIZONTAL, h + 2 * THRESHOLD_Y_HORIZONTAL);
|
||||
return new Rectangle2D.Double(top - THRESHOLD_Y_HORIZONTAL, left - THRESHOLD_X_HORIZONTAL, w + 2 * THRESHOLD_X_HORIZONTAL, h + 2 * THRESHOLD_Y_HORIZONTAL);
|
||||
} else {
|
||||
return new Rectangle(top - THRESHOLD_Y_VERTICAL, left - THRESHOLD_X_VERTICAL, w + 2 * THRESHOLD_X_VERTICAL, h + 2 * THRESHOLD_Y_VERTICAL);
|
||||
return new Rectangle2D.Double(top - THRESHOLD_Y_VERTICAL, left - THRESHOLD_X_VERTICAL, w + 2 * THRESHOLD_X_VERTICAL, h + 2 * THRESHOLD_Y_VERTICAL);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static Ruling getXCenteredRuling(Rectangle rectangle) {
|
||||
public static Ruling getXCenteredRuling(Rectangle2D rectangle) {
|
||||
|
||||
float x = (float) rectangle.getCenterX();
|
||||
float y1 = rectangle.getTop();
|
||||
float y2 = rectangle.getBottom();
|
||||
double x = rectangle.getCenterX();
|
||||
double y1 = rectangle.getMinY();
|
||||
double y2 = rectangle.getMaxY();
|
||||
|
||||
Point2D point1 = new Point2D.Float(x, y1 + THRESHOLD_Y_VERTICAL);
|
||||
Point2D point2 = new Point2D.Float(x, y2 - THRESHOLD_Y_VERTICAL);
|
||||
Point2D point1 = new Point2D.Double(x, y1 + THRESHOLD_Y_VERTICAL);
|
||||
Point2D point2 = new Point2D.Double(x, y2 - THRESHOLD_Y_VERTICAL);
|
||||
|
||||
return new Ruling(point1, point2);
|
||||
}
|
||||
|
||||
|
||||
public static Ruling getYCenteredRuling(Rectangle rectangle) {
|
||||
public static Ruling getYCenteredRuling(Rectangle2D rectangle) {
|
||||
|
||||
float x1 = rectangle.getLeft();
|
||||
float x2 = rectangle.getRight();
|
||||
float y = (float) rectangle.getCenterY();
|
||||
double x1 = rectangle.getX();
|
||||
double x2 = rectangle.getMaxX();
|
||||
double y = rectangle.getCenterY();
|
||||
|
||||
Point2D point1 = new Point2D.Float(x1 + THRESHOLD_X_HORIZONTAL, y);
|
||||
Point2D point2 = new Point2D.Float(x2 - THRESHOLD_X_HORIZONTAL, y);
|
||||
Point2D point1 = new Point2D.Double(x1 + THRESHOLD_X_HORIZONTAL, y);
|
||||
Point2D point2 = new Point2D.Double(x2 - THRESHOLD_X_HORIZONTAL, y);
|
||||
|
||||
return new Ruling(point1, point2);
|
||||
}
|
||||
|
||||
@ -71,7 +71,8 @@ public class SectionsBuilderService {
|
||||
chunkBlockList.add(chunkBlock);
|
||||
chunkWords = new ArrayList<>();
|
||||
if (!chunkBlock.getTables().isEmpty()) {
|
||||
previousTable = chunkBlock.getTables().get(chunkBlock.getTables().size() - 1);
|
||||
previousTable = chunkBlock.getTables()
|
||||
.get(chunkBlock.getTables().size() - 1);
|
||||
}
|
||||
}
|
||||
if (current instanceof TablePageBlock table) {
|
||||
@ -106,11 +107,12 @@ public class SectionsBuilderService {
|
||||
|
||||
List<ClassificationSection> sections = new ArrayList<>();
|
||||
for (var page : document.getPages()) {
|
||||
page.getTextBlocks().forEach(block -> {
|
||||
block.setPage(page.getPageNumber());
|
||||
var section = buildTextBlock(List.of(block), Strings.EMPTY);
|
||||
sections.add(section);
|
||||
});
|
||||
page.getTextBlocks()
|
||||
.forEach(block -> {
|
||||
block.setPage(page.getPageNumber());
|
||||
var section = buildTextBlock(List.of(block), Strings.EMPTY);
|
||||
sections.add(section);
|
||||
});
|
||||
}
|
||||
document.setSections(sections);
|
||||
}
|
||||
@ -155,10 +157,10 @@ public class SectionsBuilderService {
|
||||
}
|
||||
}
|
||||
for (ClassificationSection section : sectionsOnPage) {
|
||||
Float xMin = null;
|
||||
Float yMin = null;
|
||||
Float xMax = null;
|
||||
Float yMax = null;
|
||||
Double xMin = null;
|
||||
Double yMin = null;
|
||||
Double xMax = null;
|
||||
Double yMax = null;
|
||||
|
||||
for (AbstractPageBlock abs : section.getPageBlocks()) {
|
||||
if (abs.getPage() != page.getPageNumber()) {
|
||||
@ -202,8 +204,14 @@ public class SectionsBuilderService {
|
||||
log.debug("Image position x: {}, y: {}", image.getPosition().getX(), image.getPosition().getY());
|
||||
log.debug("Paragraph position xMin: {}, xMax: {}, yMin: {}, yMax: {}", xMin, xMax, yMin, yMax);
|
||||
|
||||
if (xMin != null && xMax != null && yMin != null && yMax != null && image.getPosition().getX() >= xMin && image.getPosition()
|
||||
.getX() <= xMax && image.getPosition().getY() >= yMin && image.getPosition().getY() <= yMax) {
|
||||
if (xMin != null
|
||||
&& xMax != null
|
||||
&& yMin != null
|
||||
&& yMax != null
|
||||
&& image.getPosition().getX() >= xMin
|
||||
&& image.getPosition().getX() <= xMax
|
||||
&& image.getPosition().getY() >= yMin
|
||||
&& image.getPosition().getY() <= yMax) {
|
||||
section.getImages().add(image);
|
||||
image.setAppendedToSection(true);
|
||||
break;
|
||||
@ -226,17 +234,26 @@ public class SectionsBuilderService {
|
||||
List<Cell> previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable);
|
||||
List<Cell> tableNonHeaderRow = getRowWithNonHeaderCells(currentTable);
|
||||
// Allow merging of tables if header row is separated from first logical non-header row
|
||||
if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows().get(0).size() == tableNonHeaderRow.size()) {
|
||||
previousTableNonHeaderRow = previousTable.getRows().get(0).stream().map(cell -> {
|
||||
Cell fakeCell = new Cell(cell.getPoints()[0], cell.getPoints()[2]);
|
||||
fakeCell.setHeaderCells(Collections.singletonList(cell));
|
||||
return fakeCell;
|
||||
}).collect(Collectors.toList());
|
||||
if (previousTableNonHeaderRow.isEmpty()
|
||||
&& previousTable.getRowCount() == 1
|
||||
&& previousTable.getRows()
|
||||
.get(0).size() == tableNonHeaderRow.size()) {
|
||||
previousTableNonHeaderRow = previousTable.getRows()
|
||||
.get(0)
|
||||
.stream()
|
||||
.map(cell -> {
|
||||
Cell fakeCell = Cell.copy(cell);
|
||||
fakeCell.setHeaderCells(Collections.singletonList(cell));
|
||||
return fakeCell;
|
||||
})
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
|
||||
for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
|
||||
List<Cell> row = currentTable.getRows().get(i);
|
||||
if (row.size() == tableNonHeaderRow.size() && row.stream().allMatch(cell -> cell.getHeaderCells().isEmpty())) {
|
||||
List<Cell> row = currentTable.getRows()
|
||||
.get(i);
|
||||
if (row.size() == tableNonHeaderRow.size() && row.stream()
|
||||
.allMatch(cell -> cell.getHeaderCells().isEmpty())) {
|
||||
for (int j = 0; j < row.size(); j++) {
|
||||
row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells());
|
||||
}
|
||||
@ -279,7 +296,11 @@ public class SectionsBuilderService {
|
||||
|
||||
private boolean hasInvalidHeaderInformation(TablePageBlock table) {
|
||||
|
||||
return table.getRows().stream().flatMap(row -> row.stream().filter(cell -> !cell.getHeaderCells().isEmpty())).findAny().isEmpty();
|
||||
return table.getRows()
|
||||
.stream()
|
||||
.flatMap(row -> row.stream()
|
||||
.filter(cell -> !cell.getHeaderCells().isEmpty()))
|
||||
.findAny().isEmpty();
|
||||
|
||||
}
|
||||
|
||||
@ -287,7 +308,8 @@ public class SectionsBuilderService {
|
||||
private List<Cell> getRowWithNonHeaderCells(TablePageBlock table) {
|
||||
|
||||
for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
|
||||
List<Cell> row = table.getRows().get(i);
|
||||
List<Cell> row = table.getRows()
|
||||
.get(i);
|
||||
if (row.size() == 1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -3,6 +3,8 @@ package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.CELL_SIZE_COMPARATOR;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.RECTANGLE_SIZE_COMPARATOR;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
@ -11,22 +13,26 @@ import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangularIntersectionFinder;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.SpreadsheetFinder;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@Service
|
||||
public class TableExtractionService {
|
||||
|
||||
private static final int MAX_TABLE_CONTAINED_CELLS_WITH_TEXT = 1;
|
||||
private static final double TEXT_BLOCK_CONTAINMENT_TOLERANCE = 0.02;
|
||||
private static final double TABLE_UNIFORMITY_THRESHOLD = 0.7;
|
||||
|
||||
|
||||
@ -59,20 +65,20 @@ public class TableExtractionService {
|
||||
}
|
||||
}
|
||||
|
||||
var cells = new ArrayList<>(new HashSet<>(emptyCells));
|
||||
DoubleComparisons.sort(cells, Rectangle.ILL_DEFINED_ORDER);
|
||||
List<Cell> cells = new ArrayList<>(new HashSet<>(emptyCells));
|
||||
DoubleComparisons.sort(cells, BoundingBox.ILL_DEFINED_ORDER);
|
||||
|
||||
List<Rectangle> spreadsheetAreas = SpreadsheetFinder.findSpreadsheetsFromCells(cells);
|
||||
List<Rectangle2D> spreadsheetAreas = SpreadsheetFinder.findSpreadsheetsFromCells(cells);
|
||||
// sort spreadsheetAreas by size (height * width) ascending so that cells are placed in the smallest tables first
|
||||
// this way no cell duplication occurs when tables are contained in other tables and only the most inner table contains the cells
|
||||
spreadsheetAreas.sort(RECTANGLE_SIZE_COMPARATOR);
|
||||
|
||||
List<TablePageBlock> tables = new ArrayList<>();
|
||||
for (Rectangle area : spreadsheetAreas) {
|
||||
for (Rectangle2D area : spreadsheetAreas) {
|
||||
|
||||
List<Cell> containedCells = new ArrayList<>();
|
||||
for (Cell c : cells) {
|
||||
if (c.hasMinimumSize() && area.contains(c)) {
|
||||
if (c.hasMinimumSize() && area.contains(c.getBBoxInitialUserSpace())) {
|
||||
containedCells.add(c);
|
||||
}
|
||||
}
|
||||
@ -83,7 +89,7 @@ public class TableExtractionService {
|
||||
|
||||
// verify if table would contain fewer cells with text than the threshold allows
|
||||
if (containedCellsWithText.size() >= MAX_TABLE_CONTAINED_CELLS_WITH_TEXT && checkIfTableCellsAreUniform(containedCells)) {
|
||||
tables.add(new TablePageBlock(containedCells, area, page.getRotation()));
|
||||
tables.add(new TablePageBlock(containedCells, page.getRotation()));
|
||||
cells.removeAll(containedCells);
|
||||
}
|
||||
}
|
||||
@ -92,7 +98,7 @@ public class TableExtractionService {
|
||||
int position = -1;
|
||||
|
||||
for (AbstractPageBlock pageBlock : page.getTextBlocks()) {
|
||||
if (pageBlock instanceof TextPageBlock ? table.containsBlock((TextPageBlock) pageBlock) : table.contains(pageBlock) && position == -1) {
|
||||
if (pageBlock instanceof TextPageBlock ? table.contains(pageBlock) : table.contains(pageBlock) && position == -1) {
|
||||
position = page.getTextBlocks().indexOf(pageBlock);
|
||||
}
|
||||
}
|
||||
@ -118,7 +124,7 @@ public class TableExtractionService {
|
||||
}
|
||||
|
||||
Map<Long, List<Long>> cellsGroupedByRoundedWidth = containedCells.stream()
|
||||
.map(Rectangle::getWidth)
|
||||
.map(BoundingBox::getWidth)
|
||||
.map(size -> Math.round(size / 10.0) * 10)
|
||||
.collect(Collectors.groupingBy(Long::longValue));
|
||||
|
||||
@ -128,26 +134,25 @@ public class TableExtractionService {
|
||||
|
||||
private boolean doesCellContainTextBlock(Cell cell, TextPageBlock textBlock) {
|
||||
|
||||
double x = textBlock.getBBox().getX();
|
||||
double y = textBlock.getBBox().getY();
|
||||
double w = textBlock.getBBox().getWidth();
|
||||
double h = textBlock.getBBox().getHeight();
|
||||
if (cell.isEmpty() || w <= 0 || h <= 0) {
|
||||
return false;
|
||||
}
|
||||
double x0 = cell.getX();
|
||||
double y0 = cell.getY();
|
||||
double xTol = TEXT_BLOCK_CONTAINMENT_TOLERANCE * w;
|
||||
double yTol = TEXT_BLOCK_CONTAINMENT_TOLERANCE * h;
|
||||
return (x >= x0 - xTol && y >= y0 - yTol && (x + w) <= x0 + cell.getWidth() + 2 * xTol && (y + h) <= y0 + cell.getHeight() + 2 * yTol);
|
||||
return cell.contains(textBlock, RedTextPosition.HEIGHT_PADDING);
|
||||
}
|
||||
|
||||
|
||||
public static List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||
@SneakyThrows
|
||||
public static List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines, PageInformation pageInformation) {
|
||||
|
||||
AffineTransform affineTransform = CoordinateTransforms.calculateInitialUserSpaceCoordsToImageCoords(pageInformation, 1);
|
||||
|
||||
switch (pageInformation.rotationDegrees()) {
|
||||
case 90 -> affineTransform.translate(RedTextPosition.HEIGHT_PADDING, 0); //although this is wrong, our text coordinates are wrong as well
|
||||
case 180 -> affineTransform.translate(0, RedTextPosition.HEIGHT_PADDING);
|
||||
case 270 -> affineTransform.translate(-RedTextPosition.HEIGHT_PADDING, 0);
|
||||
default -> affineTransform.translate(0, -RedTextPosition.HEIGHT_PADDING);
|
||||
}
|
||||
|
||||
return RectangularIntersectionFinder.find(horizontalRulingLines, verticalRulingLines)
|
||||
.stream()
|
||||
.map(Cell::new)
|
||||
.map(rect -> new Cell(rect, affineTransform))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
@ -31,13 +31,13 @@ public class TextRulingsClassifier {
|
||||
|
||||
private static void handleVerticalText(CleanRulings cleanRulings, TextPositionSequence word) {
|
||||
|
||||
float lowerY = (float) (word.getBoundingBox().getMinY() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||
float upperY = (float) (word.getBoundingBox().getMaxY() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||
float lowerY = (float) (word.getBBox().getMinY() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||
float upperY = (float) (word.getBBox().getMaxY() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||
|
||||
float strikethroughCenterX = (float) word.getBoundingBox().getCenterX();
|
||||
float strikethroughCenterX = (float) word.getBBox().getCenterX();
|
||||
float strikethroughBoxHeight = (float) ((word.getHeight() * STRIKETHROUGH_ZONE) / 2);
|
||||
|
||||
float underlineCenterX = (float) (word.getDir().equals(TextDirection.QUARTER_CIRCLE) ? word.getBoundingBox().getMaxX() : word.getBoundingBox().getMinX());
|
||||
float underlineCenterX = (float) (word.getDir().equals(TextDirection.QUARTER_CIRCLE) ? word.getBBox().getMaxX() : word.getBBox().getMinX());
|
||||
float underlineBoxHeight = (float) ((word.getHeight() * UNDERLINE_ZONE) / 2);
|
||||
|
||||
float leftX = Math.min(underlineCenterX - underlineBoxHeight, strikethroughCenterX - strikethroughBoxHeight);
|
||||
@ -65,13 +65,13 @@ public class TextRulingsClassifier {
|
||||
|
||||
private static void handleHorizontalText(CleanRulings cleanRulings, TextPositionSequence word) {
|
||||
|
||||
float leftX = (float) (word.getBoundingBox().getMinX() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||
float rightX = (float) (word.getBoundingBox().getMaxX() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||
float leftX = (float) (word.getBBox().getMinX() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||
float rightX = (float) (word.getBBox().getMaxX() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||
|
||||
float strikethroughCenterY = (float) word.getBoundingBox().getCenterY();
|
||||
float strikethroughCenterY = (float) word.getBBox().getCenterY();
|
||||
float strikethroughBoxHeight = (float) ((word.getHeight() * STRIKETHROUGH_ZONE) / 2);
|
||||
|
||||
float underlineCenterY = (float) (word.getDir().equals(TextDirection.ZERO) ? word.getBoundingBox().getMinY() : word.getBoundingBox().getMaxY());
|
||||
float underlineCenterY = (float) (word.getDir().equals(TextDirection.ZERO) ? word.getBBox().getMinY() : word.getBBox().getMaxY());
|
||||
float underlineBoxHeight = (float) ((word.getHeight() * UNDERLINE_ZONE) / 2);
|
||||
|
||||
float lowerY = Math.min(underlineCenterY - underlineBoxHeight, strikethroughCenterY - strikethroughBoxHeight);
|
||||
|
||||
@ -1,7 +1,5 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
|
||||
|
||||
import static java.util.stream.Collectors.toSet;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
@ -15,14 +13,10 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
@ -62,12 +56,14 @@ public class DocstrumBlockificationService {
|
||||
var classificationPage = new ClassificationPage(pageBlocks);
|
||||
classificationPage.setCleanRulings(rulings);
|
||||
|
||||
mergeIntersectingBlocks(classificationPage, 0, 0);
|
||||
mergeIntersectingBlocks(classificationPage, usedRulings, 0, 0);
|
||||
|
||||
combineBlocks(classificationPage);
|
||||
if (layoutParsingType == LayoutParsingType.DOCUMINE || layoutParsingType == LayoutParsingType.REDACT_MANAGER) {
|
||||
combineBlocks(classificationPage);
|
||||
}
|
||||
|
||||
if (layoutParsingType == LayoutParsingType.CLARIFYND) {
|
||||
mergeIntersectingBlocks(classificationPage, 0, 6.5f);
|
||||
mergeIntersectingBlocks(classificationPage, usedRulings, 0, 6.5f);
|
||||
}
|
||||
|
||||
return classificationPage;
|
||||
@ -125,7 +121,7 @@ public class DocstrumBlockificationService {
|
||||
|
||||
if (previous != null && !previous.getSequences().isEmpty()) {
|
||||
|
||||
if (current.getDir() != previous.getDir() || usedRulings.lineBetween(current.getBBox(), previous.getBBox())) {
|
||||
if (current.getDir() != previous.getDir() || usedRulings.lineBetween(current, previous)) {
|
||||
previous = current;
|
||||
continue;
|
||||
}
|
||||
@ -135,7 +131,7 @@ public class DocstrumBlockificationService {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (previous.almostIntersects(current, 0, 0)) {
|
||||
if (previous.intersects(current)) {
|
||||
previous = combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
|
||||
continue;
|
||||
}
|
||||
@ -154,7 +150,7 @@ public class DocstrumBlockificationService {
|
||||
previous = current;
|
||||
}
|
||||
|
||||
mergeIntersectingBlocks(page, 0, 6.5f);
|
||||
mergeIntersectingBlocks(page, usedRulings, 0, 6.5f);
|
||||
}
|
||||
|
||||
|
||||
@ -235,7 +231,7 @@ public class DocstrumBlockificationService {
|
||||
}
|
||||
|
||||
|
||||
public void mergeIntersectingBlocks(ClassificationPage page, float xThreshold, float yThreshold) {
|
||||
public void mergeIntersectingBlocks(ClassificationPage page, CleanRulings usedRulings, float xThreshold, float yThreshold) {
|
||||
|
||||
var blocks = page.getTextBlocks();
|
||||
ListIterator<AbstractPageBlock> itty = blocks.listIterator();
|
||||
@ -264,11 +260,11 @@ public class DocstrumBlockificationService {
|
||||
|
||||
TextPageBlock inner = (TextPageBlock) blocks.get(i);
|
||||
|
||||
if (page.getCleanRulings().lineBetween(inner.getBBox(), current.getBBox())) {
|
||||
if (usedRulings.lineBetween(current, blocks.get(i))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (current.getDir() == inner.getDir() && current.almostIntersects(inner, yThreshold, xThreshold)) {
|
||||
if (current.getDir() == inner.getDir() && current.intersects(inner, yThreshold, xThreshold)) {
|
||||
|
||||
boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();
|
||||
current.getSequences().addAll(inner.getSequences());
|
||||
@ -289,174 +285,9 @@ public class DocstrumBlockificationService {
|
||||
}
|
||||
|
||||
|
||||
public List<AbstractPageBlock> splitZonesAtRulings(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||
|
||||
int indexOnPage = 0;
|
||||
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
||||
List<AbstractPageBlock> chunkBlockList = new ArrayList<>();
|
||||
|
||||
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
|
||||
TextPositionSequence prev = null;
|
||||
|
||||
for (TextPositionSequence word : textPositions) {
|
||||
|
||||
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
|
||||
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
||||
|
||||
if (prev != null && (splitByDir || isSplitByRuling)) {
|
||||
|
||||
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
|
||||
indexOnPage++;
|
||||
|
||||
chunkBlockList.add(cb1);
|
||||
chunkWords = new ArrayList<>();
|
||||
|
||||
minX = 1000;
|
||||
maxX = 0;
|
||||
minY = 1000;
|
||||
maxY = 0;
|
||||
prev = null;
|
||||
}
|
||||
|
||||
chunkWords.add(word);
|
||||
|
||||
prev = word;
|
||||
if (word.getMinXDirAdj() < minX) {
|
||||
minX = word.getMinXDirAdj();
|
||||
}
|
||||
if (word.getMaxXDirAdj() > maxX) {
|
||||
maxX = word.getMaxXDirAdj();
|
||||
}
|
||||
if (word.getMinYDirAdj() < minY) {
|
||||
minY = word.getMinYDirAdj();
|
||||
}
|
||||
if (word.getMaxYDirAdj() > maxY) {
|
||||
maxY = word.getMaxYDirAdj();
|
||||
}
|
||||
}
|
||||
|
||||
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
|
||||
if (cb1 != null) {
|
||||
chunkBlockList.add(cb1);
|
||||
}
|
||||
|
||||
return chunkBlockList;
|
||||
}
|
||||
|
||||
|
||||
private boolean equalsWithThreshold(float f1, float f2) {
|
||||
|
||||
return Math.abs(f1 - f2) < THRESHOLD;
|
||||
}
|
||||
|
||||
|
||||
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
|
||||
|
||||
TextPageBlock textBlock = null;
|
||||
|
||||
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
||||
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
|
||||
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
|
||||
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
|
||||
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
|
||||
|
||||
for (TextPositionSequence wordBlock : wordBlockList) {
|
||||
|
||||
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
|
||||
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
|
||||
spaceFrequencyCounter.add(wordBlock.getSpaceWidth());
|
||||
fontFrequencyCounter.add(wordBlock.getFont());
|
||||
styleFrequencyCounter.add(wordBlock.getFontStyle());
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
|
||||
wordBlock.getMaxXDirAdj(),
|
||||
wordBlock.getMinYDirAdj(),
|
||||
wordBlock.getMaxYDirAdj(),
|
||||
wordBlockList,
|
||||
wordBlock.getRotation());
|
||||
} else {
|
||||
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
||||
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
||||
}
|
||||
}
|
||||
|
||||
if (textBlock != null) {
|
||||
textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
|
||||
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
||||
}
|
||||
|
||||
if (textBlock != null
|
||||
&& textBlock.getSequences() != null
|
||||
&& textBlock.getSequences()
|
||||
.stream()
|
||||
.map(t -> round(t.getMinYDirAdj(), 3))
|
||||
.collect(toSet()).size() == 1) {
|
||||
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
|
||||
private boolean isSplitByRuling(float minX,
|
||||
float minY,
|
||||
float maxX,
|
||||
float maxY,
|
||||
TextPositionSequence word,
|
||||
List<Ruling> horizontalRulingLines,
|
||||
List<Ruling> verticalRulingLines) {
|
||||
|
||||
return isSplitByRuling(maxX, minY, word.getMinXDirAdj(), word.getMinYDirAdj(), verticalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight())
|
||||
//
|
||||
|| isSplitByRuling(minX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMaxYDirAdj(),
|
||||
horizontalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight())
|
||||
//
|
||||
|| isSplitByRuling(maxX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMinYDirAdj(),
|
||||
horizontalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight())
|
||||
//
|
||||
|| isSplitByRuling(minX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMaxYDirAdj(),
|
||||
verticalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight());
|
||||
}
|
||||
|
||||
|
||||
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {
|
||||
|
||||
for (Ruling ruling : rulingLines) {
|
||||
var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight);
|
||||
if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private double round(float value, int decimalPoints) {
|
||||
|
||||
var d = Math.pow(10, decimalPoints);
|
||||
return Math.round(value * d) / d;
|
||||
return new TextPageBlock(wordBlockList);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -40,7 +40,7 @@ public class DocuMineBlockificationService {
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, CleanRulings cleanRulings) {
|
||||
|
||||
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
||||
List<AbstractPageBlock> chunkBlockList1 = new ArrayList<>();
|
||||
List<AbstractPageBlock> textPageBlocks = new ArrayList<>();
|
||||
|
||||
CleanRulings usedRulings = cleanRulings.withoutTextRulings();
|
||||
|
||||
@ -59,7 +59,7 @@ public class DocuMineBlockificationService {
|
||||
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
|
||||
boolean negativeXGap = prev != null && word.getMinXDirAdj() - minX < -5;
|
||||
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
|
||||
boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev.getBoundingBox(), word.getBoundingBox());
|
||||
boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev, word);
|
||||
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
||||
boolean splitByOtherFontAndOtherY = prev != null && Math.abs(prev.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight() * 0.2 && (word.getFontStyle()
|
||||
.contains("bold")
|
||||
@ -73,12 +73,12 @@ public class DocuMineBlockificationService {
|
||||
if (prev != null && (lineSeparation || startFromTop || splitByDir || isSplitByRuling || splitByOtherFontAndOtherY || negativeXGap || startsOnSameX)) {
|
||||
|
||||
Orientation prevOrientation = null;
|
||||
if (!chunkBlockList1.isEmpty()) {
|
||||
prevOrientation = chunkBlockList1.get(chunkBlockList1.size() - 1).getOrientation();
|
||||
if (!textPageBlocks.isEmpty()) {
|
||||
prevOrientation = textPageBlocks.get(textPageBlocks.size() - 1).getOrientation();
|
||||
}
|
||||
|
||||
TextPageBlock cb1 = buildTextBlock(chunkWords);
|
||||
chunkBlockList1.add(cb1);
|
||||
TextPageBlock cb1 = new TextPageBlock(chunkWords);
|
||||
textPageBlocks.add(cb1);
|
||||
chunkWords = new ArrayList<>();
|
||||
|
||||
if (splitByX && !isSplitByRuling) {
|
||||
@ -121,77 +121,12 @@ public class DocuMineBlockificationService {
|
||||
}
|
||||
}
|
||||
|
||||
TextPageBlock cb1 = buildTextBlock(chunkWords);
|
||||
if (cb1 != null) {
|
||||
chunkBlockList1.add(cb1);
|
||||
}
|
||||
textPageBlocks.add(new TextPageBlock(chunkWords));
|
||||
|
||||
return new ClassificationPage(chunkBlockList1);
|
||||
return new ClassificationPage(textPageBlocks);
|
||||
}
|
||||
|
||||
|
||||
private boolean equalsWithThreshold(float f1, float f2) {
|
||||
|
||||
return Math.abs(f1 - f2) < THRESHOLD;
|
||||
}
|
||||
|
||||
|
||||
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList) {
|
||||
|
||||
TextPageBlock textBlock = null;
|
||||
|
||||
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
||||
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
|
||||
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
|
||||
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
|
||||
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
|
||||
|
||||
for (TextPositionSequence wordBlock : wordBlockList) {
|
||||
|
||||
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
|
||||
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
|
||||
spaceFrequencyCounter.add(wordBlock.getSpaceWidth());
|
||||
fontFrequencyCounter.add(wordBlock.getFont());
|
||||
styleFrequencyCounter.add(wordBlock.getFontStyle());
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
|
||||
wordBlock.getMaxXDirAdj(),
|
||||
wordBlock.getMinYDirAdj(),
|
||||
wordBlock.getMaxYDirAdj(),
|
||||
wordBlockList,
|
||||
wordBlock.getRotation());
|
||||
} else {
|
||||
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
||||
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
||||
}
|
||||
}
|
||||
|
||||
if (textBlock != null) {
|
||||
textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
|
||||
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
||||
}
|
||||
|
||||
if (textBlock != null
|
||||
&& textBlock.getSequences() != null
|
||||
&& textBlock.getSequences()
|
||||
.stream()
|
||||
.map(t -> round(t.getMinYDirAdj(), 3))
|
||||
.collect(toSet()).size() == 1) {
|
||||
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
private double round(float value, int decimalPoints) {
|
||||
|
||||
var d = Math.pow(10, decimalPoints);
|
||||
return Math.round(value * d) / d;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -55,7 +55,7 @@ public class RedactManagerBlockificationService {
|
||||
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
|
||||
boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX;
|
||||
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
|
||||
boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev.getBoundingBox(), word.getBoundingBox());
|
||||
boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev, word);
|
||||
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
||||
|
||||
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
|
||||
@ -65,7 +65,7 @@ public class RedactManagerBlockificationService {
|
||||
prevOrientation = chunkBlockList.get(chunkBlockList.size() - 1).getOrientation();
|
||||
}
|
||||
|
||||
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
|
||||
TextPageBlock cb1 = new TextPageBlock(chunkWords);
|
||||
indexOnPage++;
|
||||
|
||||
chunkBlockList.add(cb1);
|
||||
@ -111,8 +111,8 @@ public class RedactManagerBlockificationService {
|
||||
}
|
||||
}
|
||||
|
||||
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
|
||||
if (cb1 != null) {
|
||||
if (!chunkWords.isEmpty()) {
|
||||
TextPageBlock cb1 = new TextPageBlock(chunkWords);
|
||||
chunkBlockList.add(cb1);
|
||||
}
|
||||
|
||||
@ -174,68 +174,9 @@ public class RedactManagerBlockificationService {
|
||||
}
|
||||
|
||||
|
||||
private boolean equalsWithThreshold(float f1, float f2) {
|
||||
private boolean equalsWithThreshold(double f1, double f2) {
|
||||
|
||||
return Math.abs(f1 - f2) < THRESHOLD;
|
||||
}
|
||||
|
||||
|
||||
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
|
||||
|
||||
TextPageBlock textBlock = null;
|
||||
|
||||
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
||||
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
|
||||
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
|
||||
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
|
||||
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
|
||||
|
||||
for (TextPositionSequence wordBlock : wordBlockList) {
|
||||
|
||||
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
|
||||
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
|
||||
spaceFrequencyCounter.add(wordBlock.getSpaceWidth());
|
||||
fontFrequencyCounter.add(wordBlock.getFont());
|
||||
styleFrequencyCounter.add(wordBlock.getFontStyle());
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
|
||||
wordBlock.getMaxXDirAdj(),
|
||||
wordBlock.getMinYDirAdj(),
|
||||
wordBlock.getMaxYDirAdj(),
|
||||
wordBlockList,
|
||||
wordBlock.getRotation());
|
||||
} else {
|
||||
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
||||
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
||||
}
|
||||
}
|
||||
|
||||
if (textBlock != null) {
|
||||
textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
|
||||
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
||||
}
|
||||
|
||||
if (textBlock != null
|
||||
&& textBlock.getSequences() != null
|
||||
&& textBlock.getSequences()
|
||||
.stream()
|
||||
.map(t -> round(t.getMinYDirAdj(), 3))
|
||||
.collect(toSet()).size() == 1) {
|
||||
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
|
||||
private double round(float value, int decimalPoints) {
|
||||
|
||||
var d = Math.pow(10, decimalPoints);
|
||||
return Math.round(value * d) / d;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -71,11 +71,10 @@ public class SearchTextWithTextPositionFactory {
|
||||
List<Rectangle2D> positions = sequences.stream()
|
||||
.map(TextPositionSequence::getTextPositions)
|
||||
.flatMap(Collection::stream)
|
||||
.map(RedTextPosition::getInitialUserSpacePosition)
|
||||
.map(RedTextPosition::getBBoxInitialUserSpace)
|
||||
.toList();
|
||||
|
||||
assert context.stringBuilder.length() == context.stringIdxToPositionIdx.size();
|
||||
assert positions.size() == context.stringIdxToPositionIdx.size();
|
||||
|
||||
return SearchTextWithTextPositionDto.builder()
|
||||
.searchText(context.stringBuilder.toString())
|
||||
|
||||
@ -45,7 +45,10 @@ public class TableNodeFactory {
|
||||
.flatMap(Collection::stream)
|
||||
.toList();
|
||||
|
||||
Table table = Table.builder().documentTree(context.getDocumentTree()).numberOfCols(mergedRows.isEmpty() ? 0 : mergedRows.get(0).size()).numberOfRows(mergedRows.size())
|
||||
Table table = Table.builder()
|
||||
.documentTree(context.getDocumentTree())
|
||||
.numberOfCols(mergedRows.isEmpty() ? 0 : mergedRows.get(0).size())
|
||||
.numberOfRows(mergedRows.size())
|
||||
.build();
|
||||
|
||||
pages.forEach(page -> addTableToPage(page, parentNode, table));
|
||||
@ -128,7 +131,12 @@ public class TableNodeFactory {
|
||||
|
||||
Page page = context.getPage(cell.getPageNumber());
|
||||
|
||||
TableCell tableCell = TableCell.builder().documentTree(context.getDocumentTree()).row(rowIndex).col(colIndex).header(cell.isHeaderCell()).bBox(cell.getBounds2D())
|
||||
TableCell tableCell = TableCell.builder()
|
||||
.documentTree(context.getDocumentTree())
|
||||
.row(rowIndex)
|
||||
.col(colIndex)
|
||||
.header(cell.isHeaderCell())
|
||||
.bBox(cell.getBBoxInitialUserSpace())
|
||||
.build();
|
||||
page.getMainBody().add(tableCell);
|
||||
|
||||
|
||||
@ -13,6 +13,9 @@ import org.apache.pdfbox.rendering.ImageType;
|
||||
import org.apache.pdfbox.rendering.PDFRenderer;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@Service
|
||||
@ -30,7 +33,7 @@ public class FindGraphicsRaster {
|
||||
|
||||
var renderer = new PDFRenderer(doc);
|
||||
var img = renderer.renderImageWithDPI(pageInformation.number() - 1, DPI, ImageType.GRAY);
|
||||
var imageCtm = getImageCTM(pageInformation, img.getWidth());
|
||||
var imageCtm = CoordinateTransforms.calculateImageCoordsToInitialUserSpaceCoords(pageInformation, CoordinateTransforms.calculateScalingFactor(pageInformation, img.getWidth()));
|
||||
return findCCBoundingBoxes(img, remove, THRESHOLD, DPI / 72, imageCtm);
|
||||
}
|
||||
|
||||
@ -131,42 +134,4 @@ public class FindGraphicsRaster {
|
||||
}
|
||||
|
||||
|
||||
public AffineTransform getImageCTM(PageInformation pageInformation, int imageWidth) {
|
||||
|
||||
double scalingFactor = calculateScalingFactor(pageInformation, imageWidth);
|
||||
AffineTransform imageToCropBoxScaling = new AffineTransform(scalingFactor, 0, 0, scalingFactor, -pageInformation.minX(), -pageInformation.minY());
|
||||
|
||||
AffineTransform mirrorMatrix = new AffineTransform(1, 0, 0, -1, 0, pageInformation.height());
|
||||
|
||||
AffineTransform rotationMatrix = switch (pageInformation.rotationDegrees()) {
|
||||
case 90 -> new AffineTransform(0, 1, -1, 0, pageInformation.height(), 0);
|
||||
case 180 -> new AffineTransform(-1, 0, 0, -1, pageInformation.width(), pageInformation.height());
|
||||
case 270 -> new AffineTransform(0, -1, 1, 0, pageInformation.width() - pageInformation.height(), pageInformation.height()); // results from 90 + 180 rotations
|
||||
default -> new AffineTransform();
|
||||
};
|
||||
|
||||
// matrix multiplication is performed from right to left, so the order is reversed.
|
||||
// scaling -> mirror -> rotation
|
||||
AffineTransform resultMatrix = new AffineTransform();
|
||||
|
||||
resultMatrix.concatenate(rotationMatrix);
|
||||
resultMatrix.concatenate(mirrorMatrix);
|
||||
resultMatrix.concatenate(imageToCropBoxScaling);
|
||||
return resultMatrix;
|
||||
}
|
||||
|
||||
|
||||
private double calculateScalingFactor(PageInformation pageInformation, int imageWidth) {
|
||||
|
||||
// PDFBox always returns page height and width based on rotation
|
||||
double pageWidth;
|
||||
if (pageInformation.rotationDegrees() == 90 || pageInformation.rotationDegrees() == 270) {
|
||||
pageWidth = pageInformation.height();
|
||||
} else {
|
||||
pageWidth = pageInformation.width();
|
||||
}
|
||||
|
||||
return pageWidth / imageWidth;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -12,6 +12,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRul
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@ -0,0 +1,56 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class CoordinateTransforms {
|
||||
|
||||
public AffineTransform calculateImageCoordsToInitialUserSpaceCoords(PageInformation pageInformation, double scalingFactor) {
|
||||
|
||||
AffineTransform imageToCropBoxScaling = new AffineTransform(scalingFactor, 0, 0, scalingFactor, -pageInformation.minX(), -pageInformation.minY());
|
||||
|
||||
AffineTransform mirrorMatrix = new AffineTransform(1, 0, 0, -1, 0, pageInformation.height());
|
||||
|
||||
AffineTransform rotationMatrix = switch (pageInformation.rotationDegrees()) {
|
||||
case 90 -> new AffineTransform(0, 1, -1, 0, pageInformation.height(), 0);
|
||||
case 180 -> new AffineTransform(-1, 0, 0, -1, pageInformation.width(), pageInformation.height());
|
||||
case 270 -> new AffineTransform(0, -1, 1, 0, pageInformation.width() - pageInformation.height(), pageInformation.height()); // results from 90 + 180 rotations
|
||||
default -> new AffineTransform();
|
||||
};
|
||||
|
||||
// matrix multiplication is performed from right to left, so the order is reversed.
|
||||
// scaling -> mirror -> rotation
|
||||
AffineTransform resultMatrix = new AffineTransform();
|
||||
|
||||
resultMatrix.concatenate(rotationMatrix);
|
||||
resultMatrix.concatenate(mirrorMatrix);
|
||||
resultMatrix.concatenate(imageToCropBoxScaling);
|
||||
return resultMatrix;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public AffineTransform calculateInitialUserSpaceCoordsToImageCoords(PageInformation pageInformation, double scalingFactor) {
|
||||
|
||||
return calculateImageCoordsToInitialUserSpaceCoords(pageInformation, scalingFactor).createInverse();
|
||||
}
|
||||
|
||||
|
||||
public double calculateScalingFactor(PageInformation pageInformation, double imageWidth) {
|
||||
|
||||
// PDFBox always returns page height and width based on rotation
|
||||
double pageWidth;
|
||||
if (pageInformation.rotationDegrees() == 90 || pageInformation.rotationDegrees() == 270) {
|
||||
pageWidth = pageInformation.height();
|
||||
} else {
|
||||
pageWidth = pageInformation.width();
|
||||
}
|
||||
|
||||
return pageWidth / imageWidth;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,10 +1,10 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Comparator;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
|
||||
public class GeometricComparators {
|
||||
@ -58,7 +58,7 @@ public class GeometricComparators {
|
||||
return cell1Size.compareTo(cell2Size);
|
||||
};
|
||||
|
||||
public static final Comparator<Rectangle> RECTANGLE_SIZE_COMPARATOR = (rect1, rect2) -> {
|
||||
public static final Comparator<Rectangle2D> RECTANGLE_SIZE_COMPARATOR = (rect1, rect2) -> {
|
||||
|
||||
Double rect1Size = rect1.getHeight() * rect1.getWidth();
|
||||
Double rect2Size = rect2.getHeight() * rect2.getWidth();
|
||||
|
||||
@ -47,8 +47,8 @@ public class MarkedContentUtils {
|
||||
|
||||
return markedContentByYPosition.values()
|
||||
.stream()
|
||||
.map(textPositions -> new TextPositionSequence(textPositions, 0, true).getRectangle())
|
||||
.map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight())))
|
||||
.map(textPositions -> new TextPositionSequence(textPositions, 0, true).getBBox())
|
||||
.map(t -> new Rectangle2D.Double(t.getX(), t.getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight())))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
@ -89,7 +89,7 @@ public class MarkedContentUtils {
|
||||
.map(content -> (TextPosition) content)
|
||||
.filter(content -> !content.getUnicode().equals(" "))
|
||||
.map(textPositions -> new TextPositionSequence(List.of(textPositions), 0, true))
|
||||
.map(TextPositionSequence::getBoundingBox)
|
||||
.map(TextPositionSequence::getBBox)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.graphics;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
@ -114,7 +114,7 @@ public final class PositionUtils {
|
||||
}
|
||||
|
||||
|
||||
public Float getApproxLineCount(TextPageBlock textBlock) {
|
||||
public double getApproxLineCount(TextPageBlock textBlock) {
|
||||
|
||||
return textBlock.getHeight() / textBlock.getMostPopularWordHeight();
|
||||
}
|
||||
|
||||
@ -4,6 +4,7 @@ import static com.knecon.fforesight.service.layoutparser.processor.utils.Geometr
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.Y_FIRST_POINT_COMPARATOR;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
@ -11,7 +12,7 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
|
||||
public class SpreadsheetFinder {
|
||||
|
||||
@ -19,15 +20,15 @@ public class SpreadsheetFinder {
|
||||
private static final float AREA_TOLERANCE = 0.001f;
|
||||
|
||||
|
||||
public static List<Rectangle> findSpreadsheetsFromCells(List<? extends Rectangle> cells) {
|
||||
public static List<Rectangle2D> findSpreadsheetsFromCells(List<Cell> cells) {
|
||||
// via: http://stackoverflow.com/questions/13746284/merging-multiple-adjacent-rectangles-into-one-polygon
|
||||
List<Rectangle> rectangles = new ArrayList<>();
|
||||
List<Rectangle2D> rectangles = new ArrayList<>();
|
||||
Set<Point2D> pointSet = new HashSet<>();
|
||||
Map<Point2D, Point2D> edgesH = new HashMap<>();
|
||||
Map<Point2D, Point2D> edgesV = new HashMap<>();
|
||||
|
||||
for (Rectangle cell : cells) {
|
||||
for (Point2D pt : cell.getPoints()) {
|
||||
for (Cell cell : cells) {
|
||||
for (Point2D pt : getPoints(cell.getBBoxInitialUserSpace())) {
|
||||
if (pointSet.contains(pt)) { // shared vertex, remove it
|
||||
pointSet.remove(pt);
|
||||
} else {
|
||||
@ -116,13 +117,22 @@ public class SpreadsheetFinder {
|
||||
|
||||
// do not add polygons with too many outer points as they are unlikely to be tables
|
||||
if (poly.size() <= MAX_OUTER_POINT_TOLERANCE) {
|
||||
rectangles.add(new Rectangle(top - AREA_TOLERANCE, left - AREA_TOLERANCE, right - left + 2 * AREA_TOLERANCE, bottom - top + 2 * AREA_TOLERANCE));
|
||||
rectangles.add(new Rectangle2D.Double(left - AREA_TOLERANCE, top - AREA_TOLERANCE, right - left + (2 * AREA_TOLERANCE), bottom - top + (2 * AREA_TOLERANCE)));
|
||||
}
|
||||
}
|
||||
return rectangles;
|
||||
}
|
||||
|
||||
|
||||
public static List<Point2D> getPoints(Rectangle2D rectangle2D) {
|
||||
|
||||
return List.of(new Point2D.Double(rectangle2D.getX(), rectangle2D.getY()),
|
||||
new Point2D.Double(rectangle2D.getMaxX(), rectangle2D.getY()),
|
||||
new Point2D.Double(rectangle2D.getMaxX(), rectangle2D.getMaxY()),
|
||||
new Point2D.Double(rectangle2D.getX(), rectangle2D.getMaxY()));
|
||||
}
|
||||
|
||||
|
||||
private enum Direction {
|
||||
HORIZONTAL,
|
||||
VERTICAL
|
||||
|
||||
@ -9,7 +9,6 @@ import java.util.List;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
@ -18,7 +17,6 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
@ -111,10 +109,7 @@ public class LayoutparsingVisualizations {
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.words);
|
||||
visualizationsOnPage.getColoredRectangles()
|
||||
.addAll(textPositionSequences.stream()
|
||||
.map(textPositionSequence -> textPositionSequence.getTextPositions()
|
||||
.stream()
|
||||
.map(RedTextPosition::getInitialUserSpacePosition)
|
||||
.collect(RectangleTransformations.collectBBox()))
|
||||
.map(BoundingBox::getBBoxInitialUserSpace)
|
||||
.map(rect -> new ColoredRectangle(rect, WORDS_COLOR, 1))
|
||||
.toList());
|
||||
}
|
||||
@ -147,7 +142,7 @@ public class LayoutparsingVisualizations {
|
||||
}
|
||||
|
||||
|
||||
public void addCellVisualizations(List<? extends Rectangle2D> cells, int pageNumber) {
|
||||
public void addCellVisualizations(List<? extends BoundingBox> cells, int pageNumber) {
|
||||
|
||||
if (!active) {
|
||||
return;
|
||||
@ -155,7 +150,7 @@ public class LayoutparsingVisualizations {
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.cells);
|
||||
visualizationsOnPage.getColoredRectangles()
|
||||
.addAll(cells.stream()
|
||||
.map(ruling -> new ColoredRectangle(ruling, CELLS_COLOR, 1))
|
||||
.map(cell -> new ColoredRectangle(cell.getBBoxInitialUserSpace(), CELLS_COLOR, 1))
|
||||
.toList());
|
||||
}
|
||||
|
||||
@ -169,7 +164,7 @@ public class LayoutparsingVisualizations {
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, this.zones);
|
||||
visualizationsOnPage.getColoredRectangles()
|
||||
.addAll(zones.stream()
|
||||
.map(BoundingBox::getBBox)
|
||||
.map(BoundingBox::getBBoxInitialUserSpace)
|
||||
.map(zone -> new ColoredRectangle(zone, ZONES_COLOR, 1))
|
||||
.toList());
|
||||
|
||||
@ -194,8 +189,8 @@ public class LayoutparsingVisualizations {
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, this.lines);
|
||||
visualizationsOnPage.getColoredRectangles()
|
||||
.addAll(lines.stream()
|
||||
.map(BoundingBox::getBBox)
|
||||
.map(line -> new ColoredRectangle(line, LINES_COLOR, 0.3f))
|
||||
.map(BoundingBox::getBBoxInitialUserSpace)
|
||||
.map(line -> new ColoredRectangle(line, LINES_COLOR, 0.5f))
|
||||
.toList());
|
||||
}
|
||||
|
||||
@ -208,7 +203,7 @@ public class LayoutparsingVisualizations {
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, zones);
|
||||
visualizationsOnPage.getColoredRectangles()
|
||||
.addAll(textPageBlocks.stream()
|
||||
.map(rect -> new ColoredRectangle(rect.getBBox(), ZONES_COLOR, 1))
|
||||
.map(rect -> new ColoredRectangle(rect.getBBoxInitialUserSpace(), ZONES_COLOR, 1))
|
||||
.toList());
|
||||
}
|
||||
|
||||
@ -226,7 +221,7 @@ public class LayoutparsingVisualizations {
|
||||
}
|
||||
|
||||
|
||||
public void addMarkedContentVisualizations(List<PDMarkedContent> markedContents, int pageNumber, PDPage pdPage) {
|
||||
public void addMarkedContentVisualizations(List<PDMarkedContent> markedContents, int pageNumber) {
|
||||
|
||||
if (!active) {
|
||||
return;
|
||||
@ -235,14 +230,16 @@ public class LayoutparsingVisualizations {
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, markedContent);
|
||||
|
||||
List<MarkedContentUtils.MarkedContentPosition> markedContentBBoxMapBySubType = MarkedContentUtils.getMarkedContentPositions(markedContents);
|
||||
|
||||
AtomicInteger count = new AtomicInteger();
|
||||
markedContentBBoxMapBySubType.forEach(markedContentPosition -> {
|
||||
|
||||
var bbox = markedContentPosition.textPositions()
|
||||
.stream()
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
String type = markedContentPosition.formattedType();
|
||||
float translationAmount = ((FONT.getStringWidth(type) / 1000) * 10 + (2 * 1) + 4);
|
||||
String type = markedContentPosition.formattedType() + " " + count.getAndIncrement();
|
||||
|
||||
float translationAmount = ((FONT.getStringWidth(type) / 100) + 6);
|
||||
// Pushes the string to the left of the box: calculate string width, divide by font units (1000), multiply with font size (10), add small offset (6).
|
||||
|
||||
visualizationsOnPage.getPlacedTexts()
|
||||
.add(PlacedText.textFacingUp(type, new Point2D.Double(bbox.getX() - translationAmount, bbox.getY() + bbox.getHeight()), 10, Color.BLACK, FONT));
|
||||
|
||||
@ -270,11 +267,11 @@ public class LayoutparsingVisualizations {
|
||||
.flatMap(Collection::stream)
|
||||
.forEach(character -> {
|
||||
Color color = ROTATING_CHARACTER_COLOR.get(index.getAndIncrement() % ROTATING_CHARACTER_COLOR.size());
|
||||
Rectangle2D charBBox = character.getTextPosition().getInitialUserSpacePosition();
|
||||
Rectangle2D charBBox = character.getTextPosition().getBBoxInitialUserSpace();
|
||||
characterVisualizations.getColoredRectangles().add(new ColoredRectangle(charBBox, color, 1));
|
||||
character.getNeighbors()
|
||||
.forEach(neighbor -> {
|
||||
Rectangle2D neighborBBox = neighbor.getCharacter().getTextPosition().getInitialUserSpacePosition();
|
||||
Rectangle2D neighborBBox = neighbor.getCharacter().getTextPosition().getBBoxInitialUserSpace();
|
||||
Line2D line = new Line2D.Double(new Point2D.Double(charBBox.getCenterX(), charBBox.getCenterY()),
|
||||
new Point2D.Double(neighborBBox.getCenterX(), neighborBBox.getCenterY()));
|
||||
neighbourVisualizations.getColoredLines().add(new ColoredLine(line, color, 1));
|
||||
@ -287,7 +284,8 @@ public class LayoutparsingVisualizations {
|
||||
private VisualizationsOnPage getOrCreateVisualizationsOnPage(int page, Visualizations visualizations) {
|
||||
|
||||
if (visualizations.getVisualizationsOnPages().containsKey(page - 1)) {
|
||||
return visualizations.getVisualizationsOnPages().get(page - 1);
|
||||
return visualizations.getVisualizationsOnPages()
|
||||
.get(page - 1);
|
||||
}
|
||||
VisualizationsOnPage visualizationsOnPage = VisualizationsOnPage.builder().build();
|
||||
visualizations.getVisualizationsOnPages().put(page - 1, visualizationsOnPage);
|
||||
|
||||
@ -70,7 +70,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||
file = new File(filePath);
|
||||
}
|
||||
|
||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.DOCUMINE, true);
|
||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER, true);
|
||||
prepareStorage(layoutParsingRequest, file);
|
||||
|
||||
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
||||
|
||||
@ -115,10 +115,10 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
ClassificationDocument classificationDocument = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
assertThat(classificationDocument.getHeaders()
|
||||
.get(0).getTextBlocks().size()).isEqualTo(1);
|
||||
.get(0).getTextBlocks().size()).isEqualTo(3);
|
||||
assertThat(classificationDocument.getHeaders()
|
||||
.get(0).getTextBlocks()
|
||||
.get(0).getSequences().size()).isEqualTo(12);
|
||||
.get(0).getSequences().size()).isEqualTo(8);
|
||||
assertThat(classificationDocument.getHeaders()
|
||||
.get(0).getTextBlocks()
|
||||
.get(0).toString()).contains(textToSearch);
|
||||
@ -131,6 +131,17 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testTableAndCellRotations() {
|
||||
String fileName = "files/Minimal Examples/simpleTablesRotated.pdf";
|
||||
ClassPathResource pdfFileResource = new ClassPathResource(fileName);
|
||||
|
||||
ClassificationDocument classificationDocument = buildClassificationDocument(pdfFileResource.getFile());
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Disabled
|
||||
@Test
|
||||
public void testScanRotationBorderIsIgnored() throws IOException {
|
||||
|
||||
@ -28,29 +28,30 @@ class InvisibleTableDetectionServiceTest {
|
||||
|
||||
String fileName = "files/basf/CustomerFiles/invisible_tables_test-two-pages_ocred.pdf";
|
||||
var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TABLE.pdf").toString();
|
||||
List<PageInformation> pageContents = PageContentExtractor.getSortedPageContents(fileName).stream().map(PageInformationService::build).collect(Collectors.toList());
|
||||
List<PageInformation> pageContents = PageContentExtractor.getSortedPageContents(fileName)
|
||||
.stream()
|
||||
.map(PageInformationService::build)
|
||||
.collect(Collectors.toList());
|
||||
|
||||
int pageNumber = 1;
|
||||
Rectangle2D tableBBox = pageContents.get(0)
|
||||
.getPageContents()
|
||||
.getSortedTextPositionSequences()
|
||||
.subList(45, 152)
|
||||
Rectangle2D tableBBox = pageContents.get(0).getPageContents().getSortedTextPositionSequences().subList(45, 152)
|
||||
.stream()
|
||||
.map(TextPositionSequence::getRectangle)
|
||||
.map(RectangleTransformations::toRectangle2D)
|
||||
.map(TextPositionSequence::getBBox)
|
||||
.map(this::mirrorY)
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
|
||||
List<TextPositionSequence> textPositionSequences = pageContents.get(0)
|
||||
.getPageContents()
|
||||
.getSortedTextPositionSequences()
|
||||
List<TextPositionSequence> textPositionSequences = pageContents.get(0).getPageContents().getSortedTextPositionSequences()
|
||||
.stream()
|
||||
.filter(textPositionSequence -> tableBBox.contains(mirrorY(RectangleTransformations.toRectangle2D(textPositionSequence.getRectangle()))))
|
||||
.filter(textPositionSequence -> tableBBox.contains(mirrorY(textPositionSequence.getBBox())))
|
||||
.toList();
|
||||
|
||||
var table = InvisibleTableDetectionService.detectTable(textPositionSequences, tableBBox);
|
||||
|
||||
PdfDraw.drawRectanglesPerPage(fileName, List.of(table.stream().flatMap(Collection::stream).toList(), Collections.emptyList()), tmpFileName);
|
||||
PdfDraw.drawRectanglesPerPage(fileName,
|
||||
List.of(table.stream()
|
||||
.flatMap(Collection::stream)
|
||||
.toList(), Collections.emptyList()),
|
||||
tmpFileName);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -29,9 +29,7 @@ class PageContentExtractorTest {
|
||||
textPositionPerPage.stream()
|
||||
.map(t -> t.getSortedTextPositionSequences()
|
||||
.stream()
|
||||
.map(TextPositionSequence::getRectangle)
|
||||
.map(RectangleTransformations::toRectangle2D)
|
||||
//.map(textPositionSequence -> (Rectangle2D) new Rectangle2D.Double(textPositionSequence.getMaxXDirAdj(), textPositionSequence.getMaxYDirAdj(), textPositionSequence.getWidth(), textPositionSequence.getHeight()))
|
||||
.map(TextPositionSequence::getBBox)
|
||||
.map(List::of)
|
||||
.toList())
|
||||
.toList(), tmpFileName);
|
||||
|
||||
Binary file not shown.
@ -12,4 +12,4 @@ commit_hash=$(git rev-parse --short=5 HEAD)
|
||||
buildName="${USER}-${branch}-${commit_hash}"
|
||||
|
||||
gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=$buildName --no-build-cache
|
||||
echo "nexus.knecon.com:5001/ff/${dir}-service-server:$buildName"
|
||||
echo "nexus.knecon.com:5001/ff/layoutparser-service-server:$buildName"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user