diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmentationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmentationService.java index 4a21ff0..d243b06 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmentationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmentationService.java @@ -1,12 +1,17 @@ package com.knecon.fforesight.service.layoutparser.processor.services.docstrum; +import java.util.ArrayList; +import java.util.Comparator; import java.util.List; +import java.util.Set; import java.util.stream.Collectors; import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.DisjointSets; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Line; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.LineBuilderService; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service.NearestNeighbourService; @@ -20,6 +25,8 @@ import lombok.RequiredArgsConstructor; @RequiredArgsConstructor public class DocstrumSegmentationService { + private static final double MAX_VERTICAL_MERGE_DIST = 0.5; + private final NearestNeighbourService nearestNeighbourService; private final SpacingService spacingService; private final LineBuilderService lineBuilderService; @@ -42,7 +49,105 @@ public class DocstrumSegmentationService { var zones = zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing); - return readingOrderService.resolve(zones); + zones = mergeLines(zones, characterSpacing, Double.NEGATIVE_INFINITY, 0.0, 0.0, lineSpacing * MAX_VERTICAL_MERGE_DIST); + + return readingOrderService.resolve(zones, false); + } + +// private List mergeZones(List zones, double tolerance) { +// +// List bounds = new ArrayList(zones.size()); +// for (List zone : zones) { +// BxBoundsBuilder builder = new BxBoundsBuilder(); +// for (ComponentLine line : zone) { +// for (Component component : line.getComponents()) { +// builder.expand(component.getChunk().getBounds()); +// } +// } +// bounds.add(builder.getBounds()); +// } +// +// List> outputZones = new ArrayList>(); +// mainFor: +// for (int i = 0; i < zones.size(); i++) { +// for (int j = 0; j < zones.size(); j++) { +// if (i == j || bounds.get(j) == null || bounds.get(i) == null) { +// continue; +// } +// if (BxModelUtils.contains(bounds.get(j), bounds.get(i), tolerance)) { +// zones.get(j).addAll(zones.get(i)); +// bounds.set(i, null); +// continue mainFor; +// } +// } +// outputZones.add(zones.get(i)); +// } +// return outputZones; +// } + + + private List mergeLines(List zones, + double wordSpacing, + double minHorizontalDistance, + double maxHorizontalDistance, + double minVerticalDistance, + double maxVerticalDistance) { + + List outputZones = new ArrayList<>(zones.size()); + for (Zone zone : zones) { + outputZones.add(mergeLinesInZone(zone, wordSpacing, minHorizontalDistance, maxHorizontalDistance, minVerticalDistance, maxVerticalDistance)); + } + return outputZones; + } + + + private Zone mergeLinesInZone(Zone zone, + double wordSpacing, + double minHorizontalDistance, + double maxHorizontalDistance, + double minVerticalDistance, + double maxVerticalDistance) { + + DisjointSets sets = new DisjointSets<>(zone.getLines()); + for (int i = 0; i < zone.getLines().size(); i++) { + Line li = zone.getLines().get(i); + for (int j = i + 1; j < zone.getLines().size(); j++) { + Line lj = zone.getLines().get(j); + double hDist = li.horizontalDistance(lj); + double vDist = li.verticalDistance(lj); + if (minHorizontalDistance <= hDist && hDist <= maxHorizontalDistance && minVerticalDistance <= vDist && vDist <= maxVerticalDistance) { + sets.union(li, lj); + } else if (minVerticalDistance <= vDist && vDist <= maxVerticalDistance && Math.abs(hDist - Math.min(li.getLength(), lj.getLength())) < 0.1) { + boolean componentOverlap = false; + int overlappingCount = 0; + for (Character ci : li.getCharacters()) { + for (Character cj : lj.getCharacters()) { + double dist = ci.overlappingDistance(cj); + if (dist > 2) { + componentOverlap = true; + } + if (dist > 0) { + overlappingCount++; + } + } + } + if (!componentOverlap && overlappingCount <= 2) { + sets.union(li, lj); + } + } + } + } + List outputZone = new ArrayList<>(); + for (Set group : sets) { + List components = new ArrayList<>(); + for (Line line : group) { + components.addAll(line.getCharacters()); + } + components.sort(Comparator.comparingDouble(Character::getX)); + + outputZone.add(new Line(components, wordSpacing)); + } + return new Zone(outputZone); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/AngleFilter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/AngleFilter.java index 5ecc891..fb0289e 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/AngleFilter.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/AngleFilter.java @@ -1,29 +1,14 @@ package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model; -/** - * Filter class for neighbor objects that checks if the angle of the - * neighbor is within specified range. - */ public abstract class AngleFilter { - private final double lowerAngle; - private final double upperAngle; + protected double lowerAngle; + protected double upperAngle; - private AngleFilter(double lowerAngle, double upperAngle) { - - this.lowerAngle = lowerAngle; - this.upperAngle = upperAngle; - } + public abstract boolean matches(Neighbor neighbor); - /** - * Constructs new angle filter. - * - * @param lowerAngle minimum angle in range [-3*pi/2, pi/2) - * @param upperAngle maximum angle in range [-pi/2, 3*pi/2) - * @return newly constructed angle filter - */ public static AngleFilter newInstance(double lowerAngle, double upperAngle) { if (lowerAngle < -Math.PI / 2) { @@ -40,33 +25,19 @@ public abstract class AngleFilter { } - public double getLowerAngle() { - - return lowerAngle; - } - - - public double getUpperAngle() { - - return upperAngle; - } - - - public abstract boolean matches(Neighbor neighbor); - - public static final class AndFilter extends AngleFilter { private AndFilter(double lowerAngle, double upperAngle) { - super(lowerAngle, upperAngle); + this.lowerAngle = lowerAngle; + this.upperAngle = upperAngle; } @Override public boolean matches(Neighbor neighbor) { - return getLowerAngle() <= neighbor.getAngle() && neighbor.getAngle() < getUpperAngle(); + return lowerAngle <= neighbor.getAngle() && neighbor.getAngle() < upperAngle; } } @@ -75,14 +46,15 @@ public abstract class AngleFilter { private OrFilter(double lowerAngle, double upperAngle) { - super(lowerAngle, upperAngle); + this.lowerAngle = lowerAngle; + this.upperAngle = upperAngle; } @Override public boolean matches(Neighbor neighbor) { - return getLowerAngle() <= neighbor.getAngle() || neighbor.getAngle() < getUpperAngle(); + return lowerAngle <= neighbor.getAngle() || neighbor.getAngle() < upperAngle; } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Character.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Character.java index 150a926..987665a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Character.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Character.java @@ -1,6 +1,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; @@ -51,6 +52,20 @@ public class Character { } + public double overlappingDistance(Character other) { + + double[] xs = new double[4]; + double s = Math.sin(-0), c = Math.cos(-0); + xs[0] = c * x - s * y; + xs[1] = c * (x + textPosition.getWidthDirAdj()) - s * (y + textPosition.getHeightDir()); + xs[2] = c * other.x - s * other.y; + xs[3] = c * (other.x + other.textPosition.getWidthDirAdj()) - s * (other.y + other.textPosition.getHeightDir()); + boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0]; + Arrays.sort(xs); + return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1); + } + + public void setNeighbors(List neighbors) { this.neighbors = neighbors; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/DisjointSets.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/DisjointSets.java index e4cc563..447500b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/DisjointSets.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/DisjointSets.java @@ -10,29 +10,17 @@ import java.util.Set; public class DisjointSets implements Iterable> { - private final Map> map = new HashMap>(); + private final Map> map = new HashMap<>(); - /** - * Constructs a new set of singletons. - * - * @param c elements of singleton sets - */ - public DisjointSets(Collection c) { + public DisjointSets(Collection collection) { - for (E element : c) { + for (E element : collection) { map.put(element, new Entry(element)); } } - /** - * Checks if elements are in the same subsets. - * - * @param e1 element from a subset - * @param e2 element from a subset - * @return true if elements are in the same subset; false otherwise - */ public boolean areTogether(E e1, E e2) { return map.get(e1).findRepresentative() == map.get(e2).findRepresentative(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Line.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Line.java index 892771d..5683c4a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Line.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Line.java @@ -33,11 +33,11 @@ public class Line extends BoundingBox { if (characters.size() >= 2) { // Simple linear regression double sx = 0.0, sxx = 0.0, sxy = 0.0, sy = 0.0; - for (Character component : characters) { - sx += component.getX(); - sxx += component.getX() * component.getX(); - sxy += component.getX() * component.getY(); - sy += component.getY(); + for (Character character : characters) { + sx += character.getX(); + sxx += character.getX() * character.getX(); + sxy += character.getX() * character.getY(); + sy += character.getY(); } double b = (characters.size() * sxy - sx * sy) / (characters.size() * sxx - sx * sx); double a = (sy - b * sx) / characters.size(); @@ -47,13 +47,13 @@ public class Line extends BoundingBox { this.x1 = characters.get(characters.size() - 1).getX(); this.y1 = a + b * this.x1; } else if (!characters.isEmpty()) { - Character component = characters.get(0); - double dx = component.getTextPosition().getWidthDirAdj() / 3; + Character character = characters.get(0); + double dx = character.getTextPosition().getWidthDirAdj() / 3; double dy = dx * Math.tan(0); - this.x0 = component.getX() - dx; - this.x1 = component.getX() + dx; - this.y0 = component.getY() - dy; - this.y1 = component.getY() + dy; + this.x0 = character.getX() - dx; + this.x1 = character.getX() + dx; + this.y0 = character.getY() - dy; + this.y1 = character.getY() + dy; } else { throw new IllegalArgumentException("Component list must not be empty"); } @@ -155,5 +155,13 @@ public class Line extends BoundingBox { this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY)); } + + public String toString() { + + StringBuilder sb = new StringBuilder(); + words.forEach(word -> sb.append(word.toString()).append(" ")); + return sb.toString().trim(); + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Zone.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Zone.java index 7960f63..1d9a0dd 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Zone.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/Zone.java @@ -39,4 +39,12 @@ public class Zone extends BoundingBox { this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY)); } + + public String toString() { + + StringBuilder sb = new StringBuilder(); + lines.forEach(line -> sb.append(line.toString()).append("\n")); + return sb.toString().trim(); + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/readingorder/BoundingBoxDistanceTuple.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/readingorder/BoundingBoxDistanceTuple.java new file mode 100644 index 0000000..3f1b73b --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/readingorder/BoundingBoxDistanceTuple.java @@ -0,0 +1,30 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder; + +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.BoundingBox; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils.DoubleUtils; + +import lombok.AllArgsConstructor; +import lombok.Data; + +@Data +@AllArgsConstructor +public class BoundingBoxDistanceTuple implements Comparable { + + private boolean c; + private double distance; + private BoundingBox zone1; + private BoundingBox zone2; + + + @Override + public int compareTo(BoundingBoxDistanceTuple compareObject) { + + double eps = 1E-3; + if (c == compareObject.c) { + return DoubleUtils.compareDouble(distance, compareObject.distance, eps); + } else { + return c ? -1 : 1; + } + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/readingorder/BoundingBoxZoneGroup.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/readingorder/BoundingBoxZoneGroup.java index b03ce06..50d6717 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/readingorder/BoundingBoxZoneGroup.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/readingorder/BoundingBoxZoneGroup.java @@ -4,6 +4,9 @@ import java.awt.geom.Rectangle2D; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.BoundingBox; +import lombok.Data; + +@Data public class BoundingBoxZoneGroup extends BoundingBox { private BoundingBox leftChild; @@ -14,51 +17,13 @@ public class BoundingBoxZoneGroup extends BoundingBox { this.leftChild = child1; this.rightChild = child2; - setBounds(Math.min(child1.getX(), child2.getX()), - Math.min(child1.getY(), child2.getY()), - Math.max(child1.getX() + child1.getWidth(), child2.getX() + child2.getWidth()), - Math.max(child1.getY() + child1.getHeight(), child2.getY() + child2.getHeight())); + + double minX = Math.min(leftChild.getX(), rightChild.getX()); + double minY = Math.min(leftChild.getY(), rightChild.getY()); + double maxX = Math.max(leftChild.getX() + leftChild.getWidth(), rightChild.getX() + rightChild.getWidth()); + double maxY = Math.max(leftChild.getY() + leftChild.getHeight(), rightChild.getY() + rightChild.getHeight()); + + this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY)); } - - - public void setbBox(Rectangle2D bBox) { - - super.setBBox(bBox); - } - - - public BoundingBox getLeftChild() { - - return leftChild; - } - - - public BoundingBox getRightChild() { - - return rightChild; - } - - - public BoundingBoxZoneGroup setLeftChild(BoundingBox obj) { - - this.leftChild = obj; - return this; - } - - - public BoundingBoxZoneGroup setRightChild(BoundingBox obj) { - - this.rightChild = obj; - return this; - } - - - public BoundingBoxZoneGroup setBounds(double x0, double y0, double x1, double y1) { - - assert x1 >= x0; - assert y1 >= y0; - this.setBBox(new Rectangle2D.Double(x0, y0, x1 - x0, y1 - y0)); - return this; - } - + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/readingorder/DistElem.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/readingorder/DistElem.java deleted file mode 100644 index 29ec401..0000000 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/readingorder/DistElem.java +++ /dev/null @@ -1,115 +0,0 @@ -package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder; - -import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils.DoubleUtils; - -public class DistElem implements Comparable> { - - @Override - public int hashCode() { - - final int prime = 31; - int result = 1; - result = prime * result + (c ? 1231 : 1237); - long temp; - temp = Double.doubleToLongBits(dist); - result = prime * result + (int) (temp ^ (temp >>> 32)); - result = prime * result + ((obj1 == null) ? 0 : obj1.hashCode()); - result = prime * result + ((obj2 == null) ? 0 : obj2.hashCode()); - return result; - } - - - @Override - public boolean equals(Object obj) { - - if (this == obj) { - return true; - } - if (obj == null) { - return false; - } - if (getClass() != obj.getClass()) { - return false; - } - DistElem other = (DistElem) obj; - if (c != other.c) { - return false; - } - if (Double.doubleToLongBits(dist) != Double.doubleToLongBits(other.dist)) { - return false; - } - if (obj1 == null) { - if (other.obj1 != null) { - return false; - } - } else if (!obj1.equals(other.obj1)) { - return false; - } - if (obj2 == null) { - if (other.obj2 != null) { - return false; - } - } else if (!obj2.equals(other.obj2)) { - return false; - } - return true; - } - - - boolean c; - double dist; - E obj1; - E obj2; - - - public boolean isC() { - - return c; - } - - - public void setC(boolean c) { - - this.c = c; - } - - - public double getDist() { - - return dist; - } - - - public E getObj1() { - - return obj1; - } - - - public E getObj2() { - - return obj2; - } - - - public DistElem(boolean c, double dist, E obj1, E obj2) { - - this.c = c; - this.dist = dist; - this.obj1 = obj1; - this.obj2 = obj2; - } - - - @Override - public int compareTo(DistElem compareObject) { - - double eps = 1E-3; - if (c == compareObject.c) { - return DoubleUtils.compareDouble(dist, compareObject.dist, eps); - } else { - return c ? -1 : 1; - } - } - -} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ReadingOrderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ReadingOrderService.java index 098b0ca..001d22e 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ReadingOrderService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ReadingOrderService.java @@ -5,13 +5,14 @@ import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.List; +import java.util.ListIterator; import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.BoundingBox; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder.BoundingBoxDistanceTuple; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder.BoundingBoxZoneGroup; -import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder.DistElem; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder.DocumentPlane; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder.TreeToListConverter; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils.DoubleUtils; @@ -20,66 +21,96 @@ import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.ut public class ReadingOrderService { static final int GRIDSIZE = 50; - static final double EPS = 0.01; - static final int MAX_ZONES = 1000; - static final Comparator Y_ASCENDING_ORDER = new Comparator() { + static final double THRESHOLD = 1; - @Override - public int compare(BoundingBox o1, BoundingBox o2) { - return DoubleUtils.compareDouble(o1.getY(), o2.getY(), EPS); + public List resolve(List zones, boolean yxOrder) { + + if (zones.isEmpty() || zones.size() == 1) { + return zones; } - }; - static final Comparator X_ASCENDING_ORDER = new Comparator() { - - @Override - public int compare(BoundingBox o1, BoundingBox o2) { - - return DoubleUtils.compareDouble(o1.getX(), o2.getX(), EPS); + if (yxOrder) { + zones.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) + .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); + return zones; } - }; - static final Comparator YX_ASCENDING_ORDER = new Comparator() { + return simpleOrder(zones); + } - @Override - public int compare(BoundingBox o1, BoundingBox o2) { - int yCompare = Y_ASCENDING_ORDER.compare(o1, o2); - return yCompare == 0 ? X_ASCENDING_ORDER.compare(o1, o2) : yCompare; + private List simpleOrder(List zones) { + + double minX = Double.POSITIVE_INFINITY; + double maxX = Double.NEGATIVE_INFINITY; + + for (Zone zone : zones) { + if (zone.getX() < minX) { + minX = zone.getX(); + } + if (zone.getX() + zone.getWidth() > maxX) { + maxX = zone.getX() + zone.getWidth(); + } } - }; + double midLineXCoordinate = (minX + maxX) / 2; - public List resolve(List zones) { - - List orderedZones; - if (zones.size() > MAX_ZONES) { - orderedZones = new ArrayList<>(zones); - Collections.sort(orderedZones, YX_ASCENDING_ORDER); - } else { - orderedZones = reorderZones(zones); + List leftOf = new ArrayList<>(); + List rightOf = new ArrayList<>(); + List middle = new ArrayList<>(); + for (Zone zone : zones) { + if (zone.getX() < midLineXCoordinate && zone.getX() + zone.getWidth() < midLineXCoordinate) { + leftOf.add(zone); + } else if (zone.getX() > midLineXCoordinate && zone.getX() + zone.getWidth() > midLineXCoordinate) { + rightOf.add(zone); + } else { + middle.add(zone); + } } - return orderedZones; + + leftOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) + .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); + + rightOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) + .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); + + middle.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); + + List sortedZones = new ArrayList<>(); + sortedZones.addAll(leftOf); + sortedZones.addAll(rightOf); + + ListIterator itty = middle.listIterator(); + + while (itty.hasNext()) { + Zone current = itty.next(); + + for (int i = 0; i < sortedZones.size(); i++) { + if (current.getY() < sortedZones.get(i).getY()) { + sortedZones.add(i, current); + itty.remove(); + break; + } + } + + } + + sortedZones.addAll(middle); + + return sortedZones; } private List reorderZones(List unorderedZones) { - if (unorderedZones.isEmpty()) { - return new ArrayList<>(); - } else if (unorderedZones.size() == 1) { - List ret = new ArrayList<>(1); - ret.add(unorderedZones.get(0)); - return ret; - } else { - BoundingBoxZoneGroup bxZonesTree = groupZonesHierarchically(unorderedZones); - sortGroupedZones(bxZonesTree); - TreeToListConverter treeConverter = new TreeToListConverter(); - List orderedZones = treeConverter.convertToList(bxZonesTree); - assert unorderedZones.size() == orderedZones.size(); - return orderedZones; - } + BoundingBoxZoneGroup bxZonesTree = groupZonesHierarchically(unorderedZones); + sortGroupedZones(bxZonesTree); + TreeToListConverter treeConverter = new TreeToListConverter(); + List orderedZones = treeConverter.convertToList(bxZonesTree); + assert unorderedZones.size() == orderedZones.size(); + return orderedZones; + } @@ -94,29 +125,29 @@ public class ReadingOrderService { /* * Distance tuples are stored sorted by ascending distance value */ - List> dists = new ArrayList>(zones.size() * zones.size() / 2); + List dists = new ArrayList<>(); for (int idx1 = 0; idx1 < zones.size(); ++idx1) { for (int idx2 = idx1 + 1; idx2 < zones.size(); ++idx2) { Zone zone1 = zones.get(idx1); Zone zone2 = zones.get(idx2); - dists.add(new DistElem(false, distance(zone1, zone2), zone1, zone2)); + dists.add(new BoundingBoxDistanceTuple(false, distance(zone1, zone2), zone1, zone2)); } } Collections.sort(dists); DocumentPlane plane = new DocumentPlane(zones, GRIDSIZE); while (!dists.isEmpty()) { - DistElem distElem = dists.get(0); + BoundingBoxDistanceTuple distElem = dists.get(0); dists.remove(0); - if (!distElem.isC() && plane.anyObjectsBetween(distElem.getObj1(), distElem.getObj2())) { - dists.add(new DistElem(true, distElem.getDist(), distElem.getObj1(), distElem.getObj2())); + if (!distElem.isC() && plane.anyObjectsBetween(distElem.getZone1(), distElem.getZone2())) { + dists.add(new BoundingBoxDistanceTuple(true, distElem.getDistance(), distElem.getZone1(), distElem.getZone2())); continue; } - BoundingBoxZoneGroup newGroup = new BoundingBoxZoneGroup(distElem.getObj1(), distElem.getObj2()); - plane.remove(distElem.getObj1()).remove(distElem.getObj2()); - dists = removeDistElementsContainingObject(dists, distElem.getObj1()); - dists = removeDistElementsContainingObject(dists, distElem.getObj2()); + BoundingBoxZoneGroup newGroup = new BoundingBoxZoneGroup(distElem.getZone1(), distElem.getZone2()); + plane.remove(distElem.getZone1()).remove(distElem.getZone2()); + dists = removeDistElementsContainingObject(dists, distElem.getZone1()); + dists = removeDistElementsContainingObject(dists, distElem.getZone2()); for (BoundingBox other : plane.getObjects()) { - dists.add(new DistElem(false, distance(other, newGroup), newGroup, other)); + dists.add(new BoundingBoxDistanceTuple(false, distance(other, newGroup), newGroup, other)); } Collections.sort(dists); plane.add(newGroup); @@ -130,11 +161,11 @@ public class ReadingOrderService { /** * Removes all distance tuples containing obj */ - private List> removeDistElementsContainingObject(Collection> list, BoundingBox obj) { + private List removeDistElementsContainingObject(Collection list, BoundingBox obj) { - List> ret = new ArrayList>(); - for (DistElem distElem : list) { - if (distElem.getObj1() != obj && distElem.getObj2() != obj) { + List ret = new ArrayList<>(); + for (BoundingBoxDistanceTuple distElem : list) { + if (distElem.getZone1() != obj && distElem.getZone2() != obj) { ret.add(distElem); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ZoneBuilderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ZoneBuilderService.java index ef2344c..4ee8de6 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ZoneBuilderService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ZoneBuilderService.java @@ -41,6 +41,7 @@ public class ZoneBuilderService { lines.forEach(outerLine -> // lines.forEach(innerLine -> { + double scale = Math.min(outerLine.getHeight(), innerLine.getHeight()) / meanHeight; scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(scale, MAX_LINE_SIZE_SCALE)); @@ -49,13 +50,8 @@ public class ZoneBuilderService { double horizontalDistance = outerLine.horizontalDistance(innerLine) / scale; double verticalDistance = outerLine.verticalDistance(innerLine) / scale; - // Line over or above - if (minHorizontalDistance <= horizontalDistance && verticalDistance <= maxVerticalDistance) { - sets.union(outerLine, innerLine); - } - - // Split line that needs later merging - else if (minHorizontalMergeDistance <= horizontalDistance && verticalDistance <= maxVerticalMergeDistance) { + if (minHorizontalDistance <= horizontalDistance && verticalDistance <= maxVerticalDistance // + || minHorizontalMergeDistance <= horizontalDistance && verticalDistance <= maxVerticalMergeDistance) { sets.union(outerLine, innerLine); } } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index 56124e7..aaaecd6 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -25,9 +25,7 @@ public class ViewerDocumentTest extends BuildDocumentTest { @SneakyThrows public void testViewerDocument() { - System.out.println("<<<<<<<<<<" + Math.sin(-0) + "aaa " + Math.cos(0) + Math.tan(0)); - - String fileName = "files/Plenarprotokoll 1 (keine Druchsache!) (1).pdf"; + String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; var documentFile = new ClassPathResource(fileName).getFile(); diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/Plenarprotokoll 20_24_Seite1.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/Plenarprotokoll 20_24_Seite1.pdf new file mode 100644 index 0000000..dbcd246 Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/Plenarprotokoll 20_24_Seite1.pdf differ