More refactoring

This commit is contained in:
Dominique Eifländer 2024-02-20 14:04:28 +01:00
parent 0c8c727303
commit 08e994d904
14 changed files with 39 additions and 706 deletions

View File

@ -49,7 +49,7 @@ public class DocstrumSegmentationService {
var zones = zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing);
zones = mergeLines(zones, characterSpacing, Double.NEGATIVE_INFINITY, 0.0, 0.0, lineSpacing * MAX_VERTICAL_MERGE_DIST);
zones = mergeLinesInZones(zones, characterSpacing, Double.NEGATIVE_INFINITY, 0.0, 0.0, lineSpacing * MAX_VERTICAL_MERGE_DIST);
return readingOrderService.resolve(zones, false);
}
@ -86,12 +86,12 @@ public class DocstrumSegmentationService {
// }
private List<Zone> mergeLines(List<Zone> zones,
double wordSpacing,
double minHorizontalDistance,
double maxHorizontalDistance,
double minVerticalDistance,
double maxVerticalDistance) {
private List<Zone> mergeLinesInZones(List<Zone> zones,
double wordSpacing,
double minHorizontalDistance,
double maxHorizontalDistance,
double minVerticalDistance,
double maxVerticalDistance) {
List<Zone> outputZones = new ArrayList<>(zones.size());
for (Zone zone : zones) {

View File

@ -1,15 +1,12 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
public abstract class AngleFilter {
public class AngleFilter {
protected double lowerAngle;
protected double upperAngle;
public abstract boolean matches(Neighbor neighbor);
public static AngleFilter newInstance(double lowerAngle, double upperAngle) {
public AngleFilter(double lowerAngle, double upperAngle) {
if (lowerAngle < -Math.PI / 2) {
lowerAngle += Math.PI;
@ -17,46 +14,19 @@ public abstract class AngleFilter {
if (upperAngle >= Math.PI / 2) {
upperAngle -= Math.PI;
}
this.lowerAngle = lowerAngle;
this.upperAngle = upperAngle;
}
public boolean matches(Neighbor neighbor) {
if (lowerAngle <= upperAngle) {
return new AndFilter(lowerAngle, upperAngle);
} else {
return new OrFilter(lowerAngle, upperAngle);
}
}
public static final class AndFilter extends AngleFilter {
private AndFilter(double lowerAngle, double upperAngle) {
this.lowerAngle = lowerAngle;
this.upperAngle = upperAngle;
}
@Override
public boolean matches(Neighbor neighbor) {
return lowerAngle <= neighbor.getAngle() && neighbor.getAngle() < upperAngle;
}
}
public static final class OrFilter extends AngleFilter {
private OrFilter(double lowerAngle, double upperAngle) {
this.lowerAngle = lowerAngle;
this.upperAngle = upperAngle;
}
@Override
public boolean matches(Neighbor neighbor) {
} else {
return lowerAngle <= neighbor.getAngle() || neighbor.getAngle() < upperAngle;
}
}
}

View File

@ -27,12 +27,6 @@ public class DisjointSets<E> implements Iterable<Set<E>> {
}
/**
* Merges subsets which elements e1 and e2 belong to.
*
* @param e1 element from a subset
* @param e2 element from a subset
*/
public void union(E e1, E e2) {
Entry<E> r1 = map.get(e1).findRepresentative();
@ -50,7 +44,7 @@ public class DisjointSets<E> implements Iterable<Set<E>> {
@Override
public Iterator<Set<E>> iterator() {
return new Iterator<Set<E>>() {
return new Iterator<>() {
private final Iterator<Entry<E>> iterator = map.values().iterator();
private Entry<E> nextRepresentative;

View File

@ -1,33 +1,19 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model;
import java.util.Iterator;
import java.util.NoSuchElementException;
public class Histogram implements Iterable<Histogram.Bin> {
public class Histogram {
private static final double EPSILON = 1.0e-6;
private final double min;
private final double delta;
private final double resolution;
private double[] frequencies;
/**
* Constructs a new histogram for values in range [minValue, maxValue] with
* given resolution.
*
* @param minValue - minimum allowed value
* @param maxValue - maximum allowed value
* @param resolution - histogram's resolution
*/
public Histogram(double minValue, double maxValue, double resolution) {
this.min = minValue - EPSILON;
this.delta = maxValue - minValue + 2 * EPSILON;
double delta = maxValue - minValue + 2 * EPSILON;
int size = Math.max(1, (int) Math.round((maxValue - minValue) / resolution));
this.resolution = this.delta / size;
this.resolution = delta / size;
this.frequencies = new double[size];
}
@ -47,25 +33,6 @@ public class Histogram implements Iterable<Histogram.Bin> {
}
/**
 * Smooths the histogram with the given kernel, treating the bins as a ring: kernel taps that fall before the
 * first bin or after the last bin wrap around to the opposite end.
 * The kernel is centred on each bin using an offset of {@code (kernel.length - 1) / 2}.
 *
 * @param kernel weights applied to the neighbouring bins
 */
public void circularKernelSmooth(double[] kernel) {
    int binCount = frequencies.length;
    double[] newFrequencies = new double[binCount];
    int shift = (kernel.length - 1) / 2;
    for (int i = 0; i < binCount; i++) {
        for (int d = 0; d < kernel.length; d++) {
            // floorMod wraps offsets of any size, so a kernel wider than the histogram
            // can no longer index outside the frequency array.
            int j = Math.floorMod(i + d - shift, binCount);
            newFrequencies[i] += kernel[d] * frequencies[j];
        }
    }
    frequencies = newFrequencies;
}
public double[] createGaussianKernel(double length, double stdDeviation) {
int r = (int) Math.round(length / resolution) / 2;
@ -87,45 +54,24 @@ public class Histogram implements Iterable<Histogram.Bin> {
}
/**
 * Smooths the histogram with a Gaussian kernel, wrapping around at the ends.
 * The kernel is built by createGaussianKernel and applied through circularKernelSmooth.
 *
 * @param windowLength passed to createGaussianKernel as the kernel length
 * @param stdDeviation passed to createGaussianKernel as the standard deviation
 */
public void circularGaussianSmooth(double windowLength, double stdDeviation) {
circularKernelSmooth(createGaussianKernel(windowLength, stdDeviation));
}
/**
 * Smooths the histogram with a Gaussian kernel.
 * The kernel is built by createGaussianKernel and applied through kernelSmooth.
 *
 * @param windowLength passed to createGaussianKernel as the kernel length
 * @param stdDeviation passed to createGaussianKernel as the standard deviation
 */
public void gaussianSmooth(double windowLength, double stdDeviation) {
kernelSmooth(createGaussianKernel(windowLength, stdDeviation));
}
/**
 * Adds a single occurrence of the given value to the histogram by incrementing the frequency of the bin
 * the value falls into.
 * <p>
 * No range check is performed here; NOTE(review): values outside the range given to the constructor
 * presumably lead to an invalid bin index - confirm that callers only pass in-range values.
 *
 * @param value inserted value
 */
public void add(double value) {
// Bin index = offset from the lower bound, measured in bins.
frequencies[(int) ((value - min) / resolution)] += 1.0;
}
/**
 * Returns the histogram's number of bins, i.e. the length of the frequency array.
 *
 * @return number of bins
 */
public int getSize() {
return frequencies.length;
}
/**
* Finds the histogram's peak value.
*
* @return peak value
*/
public double getPeakValue() {
int peakIndex = 0;
@ -142,58 +88,4 @@ public class Histogram implements Iterable<Histogram.Bin> {
return ((double) peakIndex + peakEndIndex) / 2 * resolution + min;
}
/**
 * Returns an iterator over the bins of this histogram, from the first bin to the last.
 * Removal is not supported.
 *
 * @return iterator producing one {@link Bin} per frequency entry
 */
@Override
public Iterator<Bin> iterator() {
    // Parameterized anonymous class instead of a raw Iterator, so next() is type-checked.
    return new Iterator<Bin>() {

        private int index = 0;

        @Override
        public boolean hasNext() {
            return index < frequencies.length;
        }

        @Override
        public Bin next() {
            if (!hasNext()) {
                throw new NoSuchElementException();
            }
            return new Bin(index++);
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException("Not supported yet.");
        }
    };
}
/**
 * A single bin of the enclosing histogram, identified by its position in the frequency array.
 */
public final class Bin {

    private final int position;

    private Bin(int index) {
        position = index;
    }

    /**
     * Returns the value at the middle of this bin: half a bin past the bin's start, scaled by the
     * histogram resolution and offset by the histogram's lower bound.
     *
     * @return value represented by this bin
     */
    public double getValue() {
        double offsetFromMin = (position + 0.5) * resolution;
        return offsetFromMin + min;
    }
}
}

View File

@ -31,7 +31,7 @@ public class Line extends BoundingBox {
this.characters = characters;
if (characters.size() >= 2) {
// Simple linear regression
// linear regression
double sx = 0.0, sxx = 0.0, sxy = 0.0, sy = 0.0;
for (Character character : characters) {
sx += character.getX();
@ -46,7 +46,7 @@ public class Line extends BoundingBox {
this.y0 = a + b * this.x0;
this.x1 = characters.get(characters.size() - 1).getX();
this.y1 = a + b * this.x1;
} else if (!characters.isEmpty()) {
} else {
Character character = characters.get(0);
double dx = character.getTextPosition().getWidthDirAdj() / 3;
double dy = dx * Math.tan(0);
@ -54,12 +54,10 @@ public class Line extends BoundingBox {
this.x1 = character.getX() + dx;
this.y0 = character.getY() - dy;
this.y1 = character.getY() + dy;
} else {
throw new IllegalArgumentException("Component list must not be empty");
}
height = computeHeight();
computeWords(wordSpacing * WORD_DISTANCE_MULTIPLIER);
buildBox();
buildBBox();
}
@ -136,7 +134,7 @@ public class Line extends BoundingBox {
}
private void buildBox() {
private void buildBBox() {
double minX = Double.POSITIVE_INFINITY;
double minY = Double.POSITIVE_INFINITY;

View File

@ -16,11 +16,11 @@ public class Zone extends BoundingBox {
lines.sort(Comparator.comparingDouble(Line::getY));
this.lines = lines;
buildBox();
buildBBox();
}
public void buildBox() {
public void buildBBox() {
double minX = Double.POSITIVE_INFINITY;
double minY = Double.POSITIVE_INFINITY;

View File

@ -1,30 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils.DoubleUtils;
import lombok.AllArgsConstructor;
import lombok.Data;
/**
 * A pair of bounding boxes together with their distance and an ordering flag.
 * Tuples order by the flag first (flagged tuples come first) and by distance second.
 */
@Data
@AllArgsConstructor
public class BoundingBoxDistanceTuple implements Comparable<BoundingBoxDistanceTuple> {

    /** Ordering flag; a tuple with the flag set sorts before any tuple without it. */
    private boolean c;
    /** Distance between the two boxes. */
    private double distance;
    private BoundingBox zone1;
    private BoundingBox zone2;


    /**
     * Compares by flag, then by distance using a tolerance of 1E-3.
     *
     * @param compareObject tuple to compare against
     * @return negative, zero or positive as this tuple sorts before, equal to or after the other
     */
    @Override
    public int compareTo(BoundingBoxDistanceTuple compareObject) {
        if (c != compareObject.c) {
            return c ? -1 : 1;
        }
        final double tolerance = 1E-3;
        return DoubleUtils.compareDouble(distance, compareObject.distance, tolerance);
    }
}

View File

@ -1,29 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder;
import java.awt.geom.Rectangle2D;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.BoundingBox;
import lombok.Data;
/**
 * Inner node of the zone tree: groups two children and spans the union of their bounding boxes.
 */
@Data
public class BoundingBoxZoneGroup extends BoundingBox {

    private BoundingBox leftChild;
    private BoundingBox rightChild;


    /**
     * Creates a group from two children and sets the group's box to the smallest rectangle covering both.
     *
     * @param child1 becomes the left child
     * @param child2 becomes the right child
     */
    public BoundingBoxZoneGroup(BoundingBox child1, BoundingBox child2) {
        this.leftChild = child1;
        this.rightChild = child2;

        double lowX = Math.min(child1.getX(), child2.getX());
        double lowY = Math.min(child1.getY(), child2.getY());
        double highX = Math.max(child1.getX() + child1.getWidth(), child2.getX() + child2.getWidth());
        double highY = Math.max(child1.getY() + child1.getHeight(), child2.getY() + child2.getHeight());

        double spanX = highX - lowX;
        double spanY = highY - lowY;
        this.setBBox(new Rectangle2D.Double(lowX, lowY, spanX, spanY));
    }
}

View File

@ -1,258 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
/**
 * A set-like data structure for objects placed on a plane. Can find the objects inside a rectangular area.
 * Objects are kept in a plain list and are additionally indexed in a uniform grid: every object is registered in
 * each grid cell its bounding box touches, so a search only has to inspect the cells covered by the search area.
 *
 * @author Pawel Szostek
 */
public class DocumentPlane {

    /**
     * List of objects on the plane, in insertion order.
     */
    private final List<BoundingBox> objs;
    /**
     * Edge length of a grid cell. If gridSize=50, the plane is divided into 50x50 cells.
     */
    private final int gridSize;
    /**
     * Redundant index of the objects on the plane. Keys are the coordinates of a grid cell. A single object can be
     * stored under several keys (depending on its size). Cells are created lazily.
     */
    private final Map<GridXY, List<BoundingBox>> grid;

    /**
     * Immutable coordinates of a grid cell, usable as a hash map key.
     */
    private static final class GridXY {

        private final int x;
        private final int y;


        GridXY(int x, int y) {
            this.x = x;
            this.y = y;
        }


        @Override
        public int hashCode() {
            // x * y mapped every cell on either axis to 0; mix both coordinates instead.
            return 31 * x + y;
        }


        @Override
        public boolean equals(Object obj) {
            if (obj == null || getClass() != obj.getClass()) {
                return false;
            }
            GridXY comparedObj = (GridXY) obj;
            return x == comparedObj.x && y == comparedObj.y;
        }


        @Override
        public String toString() {
            return "(" + x + "," + y + ")";
        }
    }


    /**
     * Returns the live list of objects on the plane (not a copy).
     *
     * @return objects currently on the plane
     */
    public List<BoundingBox> getObjects() {
        return objs;
    }


    /**
     * Creates a plane containing the given zones.
     *
     * @param objectList zones to place on the plane
     * @param gridSize   edge length of a grid cell, must be positive
     * @throws IllegalArgumentException if gridSize is not positive
     */
    public DocumentPlane(List<Zone> objectList, int gridSize) {
        if (gridSize <= 0) {
            throw new IllegalArgumentException("gridSize must be positive but was " + gridSize);
        }
        this.grid = new HashMap<>();
        this.objs = new ArrayList<>();
        this.gridSize = gridSize;
        for (Zone obj : objectList) {
            add(obj);
        }
    }


    /**
     * Looks for objects that overlap the bounding rectangle of obj1 and obj2, excluding obj1 and obj2 themselves.
     *
     * @param obj1 object
     * @param obj2 object
     * @return object list
     */
    public List<BoundingBox> findObjectsBetween(BoundingBox obj1, BoundingBox obj2) {
        double x0 = Math.min(obj1.getX(), obj2.getX());
        double y0 = Math.min(obj1.getY(), obj2.getY());
        double x1 = Math.max(obj1.getX() + obj1.getWidth(), obj2.getX() + obj2.getWidth());
        double y1 = Math.max(obj1.getY() + obj1.getHeight(), obj2.getY() + obj2.getHeight());
        assert x1 >= x0 && y1 >= y0;
        Rectangle2D searchBounds = new Rectangle2D.Double(x0, y0, x1 - x0, y1 - y0);
        List<BoundingBox> objsBetween = find(searchBounds);
        // The search rectangle covers obj1 and obj2 themselves, so drop them from the result.
        objsBetween.remove(obj1);
        objsBetween.remove(obj2);
        return objsBetween;
    }


    /**
     * Checks if there is any object placed between obj1 and obj2
     *
     * @param obj1 object
     * @param obj2 object
     * @return true if anything is placed between, false otherwise
     */
    public boolean anyObjectsBetween(BoundingBox obj1, BoundingBox obj2) {
        return !findObjectsBetween(obj1, obj2).isEmpty();
    }


    /**
     * Adds object to the plane and registers it in every grid cell its bounding box touches.
     *
     * @param obj object
     * @return this document plane, to allow chaining
     */
    public DocumentPlane add(BoundingBox obj) {
        int objsBefore = this.objs.size();
        int lastRow = cellIndex(obj.getY() + obj.getHeight());
        int lastColumn = cellIndex(obj.getX() + obj.getWidth());
        for (int y = cellIndex(obj.getY()); y <= lastRow; ++y) {
            for (int x = cellIndex(obj.getX()); x <= lastColumn; ++x) {
                grid.computeIfAbsent(new GridXY(x, y), key -> new ArrayList<>()).add(obj);
            }
        }
        objs.add(obj);
        // size of the object list should be incremented
        assert objsBefore + 1 == objs.size();
        // object list must contain the same number of objects as the grid index
        assert objs.size() == elementsInGrid();
        return this;
    }


    /**
     * Removes object from the plane and from every grid cell its bounding box touches.
     * Removing an object that was never added leaves the plane unchanged.
     *
     * @param obj object
     * @return this document plane, to allow chaining
     */
    public DocumentPlane remove(BoundingBox obj) {
        int lastRow = cellIndex(obj.getY() + obj.getHeight());
        int lastColumn = cellIndex(obj.getX() + obj.getWidth());
        for (int y = cellIndex(obj.getY()); y <= lastRow; ++y) {
            for (int x = cellIndex(obj.getX()); x <= lastColumn; ++x) {
                List<BoundingBox> cell = grid.get(new GridXY(x, y));
                // A cell may never have been created if the object was not added; skip it instead of failing.
                if (cell != null) {
                    cell.remove(obj);
                }
            }
        }
        objs.remove(obj);
        assert objs.size() == elementsInGrid();
        return this;
    }


    /**
     * Find objects within search bounds
     *
     * @param searchBounds is a search rectangle
     * @return list of objects overlapping the search rectangle
     */
    public List<BoundingBox> find(Rectangle2D searchBounds) {
        List<BoundingBox> done = new ArrayList<>(); // objects already examined, as an object can sit in several cells
        List<BoundingBox> ret = new ArrayList<>();
        double x0 = searchBounds.getX();
        double y0 = searchBounds.getY();
        double y1 = searchBounds.getY() + searchBounds.getHeight();
        double x1 = searchBounds.getX() + searchBounds.getWidth();
        // Inclusive bounds computed with the same cell mapping as add(), so every cell the
        // search rectangle touches is visited, including the one holding its far edge.
        int lastRow = cellIndex(y1);
        int lastColumn = cellIndex(x1);
        for (int y = cellIndex(y0); y <= lastRow; ++y) {
            for (int x = cellIndex(x0); x <= lastColumn; ++x) {
                List<BoundingBox> cell = grid.get(new GridXY(x, y));
                if (cell == null) {
                    continue;
                }
                for (BoundingBox obj : cell) {
                    if (done.contains(obj)) {
                        continue;
                    }
                    done.add(obj);
                    boolean disjoint = obj.getX() + obj.getWidth() <= x0 || x1 <= obj.getX() || obj.getY() + obj.getHeight() <= y0 || y1 <= obj.getY();
                    if (!disjoint) {
                        ret.add(obj);
                    }
                }
            }
        }
        return ret;
    }


    /**
     * Count distinct objects stored in the grid index
     *
     * @return number of elements
     */
    protected int elementsInGrid() {
        List<BoundingBox> distinct = new ArrayList<>();
        for (List<BoundingBox> cell : grid.values()) {
            for (BoundingBox obj : cell) {
                if (!distinct.contains(obj)) {
                    distinct.add(obj);
                }
            }
        }
        return distinct.size();
    }


    /**
     * Maps a coordinate to the index of the grid cell containing it, rounding towards negative infinity so that
     * negative coordinates are handled consistently.
     */
    private int cellIndex(double coordinate) {
        return (int) Math.floor(coordinate / gridSize);
    }
}

View File

@ -1,29 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder;
import java.util.ArrayList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
/**
 * Flattens a binary tree of zone groups into a list of its leaf zones.
 */
public class TreeToListConverter {

    /**
     * Collects the zones of the tree rooted at {@code obj}, visiting the left child before the right child.
     * A child that is not a {@link Zone} is treated as a nested {@link BoundingBoxZoneGroup}.
     *
     * @param obj root of the tree
     * @return zones in left-to-right traversal order
     */
    public List<Zone> convertToList(BoundingBoxZoneGroup obj) {
        List<Zone> collected = new ArrayList<>();
        collectZones(obj, collected);
        return collected;
    }


    /** Appends the zones below {@code group} to {@code sink}, left subtree first. */
    private void collectZones(BoundingBoxZoneGroup group, List<Zone> sink) {
        if (group.getLeftChild() instanceof Zone) {
            sink.add((Zone) group.getLeftChild());
        } else {
            collectZones((BoundingBoxZoneGroup) group.getLeftChild(), sink);
        }

        if (group.getRightChild() instanceof Zone) {
            sink.add((Zone) group.getRightChild());
        } else {
            collectZones((BoundingBoxZoneGroup) group.getRightChild(), sink);
        }
    }
}

View File

@ -25,7 +25,7 @@ public class LineBuilderService {
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_CHARACTER_DISTANCE;
DisjointSets<Character> sets = new DisjointSets<>(characters);
AngleFilter filter = AngleFilter.newInstance(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);
AngleFilter filter = new AngleFilter(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);
characters.forEach(character -> {
character.getNeighbors().forEach(neighbor -> {

View File

@ -1,8 +1,6 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.service;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.ListIterator;
@ -11,17 +9,12 @@ import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder.BoundingBoxDistanceTuple;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder.BoundingBoxZoneGroup;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder.DocumentPlane;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.readingorder.TreeToListConverter;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils.DoubleUtils;
@Service
public class ReadingOrderService {
static final int GRIDSIZE = 50;
static final double THRESHOLD = 1;
private static final double THRESHOLD = 1;
public List<Zone> resolve(List<Zone> zones, boolean yxOrder) {
@ -36,11 +29,14 @@ public class ReadingOrderService {
return zones;
}
return simpleOrder(zones);
return resolveMultiColumnReadingOder(zones);
}
private List<Zone> simpleOrder(List<Zone> zones) {
private List<Zone> resolveMultiColumnReadingOder(List<Zone> zones) {
// Simple reading order resolver for multi-column page layouts, as described here: https://pub.towardsai.net/advanced-rag-02-unveiling-pdf-parsing-b84ae866344e
// TODO: implement a more sophisticated reading order resolver, see https://github.com/BobLd/DocumentLayoutAnalysis/blob/master/README.md#reading-order
double minX = Double.POSITIVE_INFINITY;
double maxX = Double.NEGATIVE_INFINITY;
@ -75,7 +71,8 @@ public class ReadingOrderService {
rightOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
middle.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
middle.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
List<Zone> sortedZones = new ArrayList<>();
sortedZones.addAll(leftOf);
@ -85,7 +82,6 @@ public class ReadingOrderService {
while (itty.hasNext()) {
Zone current = itty.next();
for (int i = 0; i < sortedZones.size(); i++) {
if (current.getY() < sortedZones.get(i).getY()) {
sortedZones.add(i, current);
@ -93,7 +89,6 @@ public class ReadingOrderService {
break;
}
}
}
sortedZones.addAll(middle);
@ -101,174 +96,4 @@ public class ReadingOrderService {
return sortedZones;
}
/**
 * Orders zones by clustering them into a binary tree, sorting the children of every tree node and
 * flattening the tree back into a list.
 *
 * @param unorderedZones zones in arbitrary order
 * @return the same zones in tree traversal order
 */
private List<Zone> reorderZones(List<Zone> unorderedZones) {
    BoundingBoxZoneGroup root = groupZonesHierarchically(unorderedZones);
    sortGroupedZones(root);
    List<Zone> orderedZones = new TreeToListConverter().convertToList(root);
    assert unorderedZones.size() == orderedZones.size();
    return orderedZones;
}
/**
 * Builds a binary tree of zones and groups of zones from a list of unordered zones. This is done in hierarchical
 * clustering by joining two least distant nodes. Distance is calculated in the distance() method.
 * <p>
 * NOTE(review): at least two zones appear to be required. With an empty list the final get(0) has nothing to
 * return, and with a single zone the only remaining object is that Zone, which is not a BoundingBoxZoneGroup.
 *
 * @param zones is a list of unordered zones
 * @return root of the zones clustered in a tree
 */
private BoundingBoxZoneGroup groupZonesHierarchically(List<Zone> zones) {
/*
 * Distance tuples are stored sorted by ascending distance value
 */
List<BoundingBoxDistanceTuple> dists = new ArrayList<>();
// One tuple per unordered pair of zones, all created with the flag unset.
for (int idx1 = 0; idx1 < zones.size(); ++idx1) {
for (int idx2 = idx1 + 1; idx2 < zones.size(); ++idx2) {
Zone zone1 = zones.get(idx1);
Zone zone2 = zones.get(idx2);
dists.add(new BoundingBoxDistanceTuple(false, distance(zone1, zone2), zone1, zone2));
}
}
Collections.sort(dists);
DocumentPlane plane = new DocumentPlane(zones, GRIDSIZE);
while (!dists.isEmpty()) {
// Take the tuple at the head of the list.
BoundingBoxDistanceTuple distElem = dists.get(0);
dists.remove(0);
// An unflagged pair with other objects between its members is not merged now: it is appended again
// with the flag set, so on its next visit the in-between check is skipped.
if (!distElem.isC() && plane.anyObjectsBetween(distElem.getZone1(), distElem.getZone2())) {
dists.add(new BoundingBoxDistanceTuple(true, distElem.getDistance(), distElem.getZone1(), distElem.getZone2()));
continue;
}
// Merge the pair into a new group and take both members off the plane.
BoundingBoxZoneGroup newGroup = new BoundingBoxZoneGroup(distElem.getZone1(), distElem.getZone2());
plane.remove(distElem.getZone1()).remove(distElem.getZone2());
// Tuples that reference a merged member are obsolete.
dists = removeDistElementsContainingObject(dists, distElem.getZone1());
dists = removeDistElementsContainingObject(dists, distElem.getZone2());
// Pair the new group with every object still on the plane, then restore the ordering.
for (BoundingBox other : plane.getObjects()) {
dists.add(new BoundingBoxDistanceTuple(false, distance(other, newGroup), newGroup, other));
}
Collections.sort(dists);
// The group is added only after the loop above, so it is never paired with itself.
plane.add(newGroup);
}
assert plane.getObjects().size() == 1 : "There should be one object left at the plane after grouping";
return (BoundingBoxZoneGroup) plane.getObjects().get(0);
}
/**
 * Returns a new list holding every distance tuple of {@code list} that does not reference {@code obj}.
 * Membership is decided by reference identity, not by {@code equals}. The input collection is not modified.
 *
 * @param list tuples to filter
 * @param obj  bounding box whose tuples are dropped
 * @return the remaining tuples in their original order
 */
private List<BoundingBoxDistanceTuple> removeDistElementsContainingObject(Collection<BoundingBoxDistanceTuple> list, BoundingBox obj) {
    List<BoundingBoxDistanceTuple> remaining = new ArrayList<>(list);
    remaining.removeIf(tuple -> tuple.getZone1() == obj || tuple.getZone2() == obj);
    return remaining;
}
/**
 * Recursively orders the children of a zone group: whenever shouldBeSwapped reports that the pair is in the
 * wrong order, the left and right child trade places. The order matters when the tree is later flattened
 * into a one dimensional list.
 *
 * @param group tree node whose subtree is sorted in place
 */
private void sortGroupedZones(BoundingBoxZoneGroup group) {
    BoundingBox formerLeft = group.getLeftChild();
    BoundingBox formerRight = group.getRightChild();

    if (shouldBeSwapped(formerLeft, formerRight)) {
        group.setLeftChild(formerRight);
        group.setRightChild(formerLeft);
    }

    // Descend into nested groups; leaves need no further sorting.
    for (BoundingBox child : new BoundingBox[]{formerLeft, formerRight}) {
        if (child instanceof BoundingBoxZoneGroup) {
            sortGroupedZones((BoundingBoxZoneGroup) child);
        }
    }
}
/**
 * Decides whether {@code second} has to come before {@code first}.
 * <p>
 * The decision is made from the relative position of the two boxes, checked in this order:
 * <ol>
 * <li>second starts at or beyond first's far x edge: keep the order</li>
 * <li>second ends at or before first's near x edge: swap</li>
 * <li>second starts at or beyond first's far y edge: keep the order</li>
 * <li>second ends at or before first's near y edge: swap</li>
 * <li>the boxes overlap on both axes: swap when the centre of second lies, in sum over both axes, before the
 * centre of first</li>
 * </ol>
 *
 * @param first  box currently placed first
 * @param second box currently placed second
 * @return true if the two boxes should trade places
 */
private boolean shouldBeSwapped(BoundingBox first, BoundingBox second) {
    double firstX = first.getBBox().getX();
    double firstY = first.getBBox().getY();
    double firstWidth = first.getBBox().getWidth();
    double firstHeight = first.getBBox().getHeight();
    double secondX = second.getBBox().getX();
    double secondY = second.getBBox().getY();
    double secondWidth = second.getBBox().getWidth();
    double secondHeight = second.getBBox().getHeight();

    if (firstX + firstWidth <= secondX) {
        return false;
    }
    if (secondX + secondWidth <= firstX) {
        return true;
    }
    if (firstY + firstHeight <= secondY) {
        return false;
    }
    if (secondY + secondHeight <= firstY) {
        return true;
    }

    // Overlap on both axes: compare the centres.
    double xdiff = secondX + secondWidth / 2 - firstX - firstWidth / 2;
    double ydiff = secondY + secondHeight / 2 - firstY - firstHeight / 2;
    return xdiff + ydiff < 0;
}
/**
 * A distance function between two bounding boxes.
 * <p>
 * Consider the bounding rectangle for obj1 and obj2. Its area minus the areas of obj1 and obj2 (shown as 'w'
 * below) is the base distance; this value may be negative.
 * <pre>
 * (x0,y0) +------+..........+
 *         | obj1 |wwwwwwwwww:
 *         +------+www+------+
 *         :wwwwwwwwww| obj2 |
 *         +..........+------+ (x1,y1)
 * </pre>
 * The base distance is then weighted by {@code 0.5 + cosine}, where cosine is the smaller of the absolute cosines
 * of the vectors between the boxes' left edges and between their centres, both taken at centre height.
 * A zero-length vector has no direction; its cosine is taken as 0 so that the result is never NaN.
 *
 * @param obj1 first bounding box
 * @param obj2 second bounding box
 * @return distance value based on objects' coordinates and physical size on a plane
 */
private double distance(BoundingBox obj1, BoundingBox obj2) {
    double x0 = Math.min(obj1.getX(), obj2.getX());
    double y0 = Math.min(obj1.getY(), obj2.getY());
    double x1 = Math.max(obj1.getX() + obj1.getWidth(), obj2.getX() + obj2.getWidth());
    double y1 = Math.max(obj1.getY() + obj1.getHeight(), obj2.getY() + obj2.getHeight());
    double dist = ((x1 - x0) * (y1 - y0) - obj1.getArea() - obj2.getArea());

    double obj1X = obj1.getX();
    double obj1CenterX = obj1.getX() + obj1.getWidth() / 2;
    double obj1CenterY = obj1.getY() + obj1.getHeight() / 2;
    double obj2X = obj2.getX();
    double obj2CenterX = obj2.getX() + obj2.getWidth() / 2;
    double obj2CenterY = obj2.getY() + obj2.getHeight() / 2;

    double centerDeltaY = obj2CenterY - obj1CenterY;
    double obj1obj2VectorCosineAbsLeft = absCosineToXAxis(obj2X - obj1X, centerDeltaY);
    double obj1obj2VectorCosineAbsCenter = absCosineToXAxis(obj2CenterX - obj1CenterX, centerDeltaY);

    double cosine = Math.min(obj1obj2VectorCosineAbsLeft, obj1obj2VectorCosineAbsCenter);

    final double MAGIC_COEFF = 0.5;
    return dist * (MAGIC_COEFF + cosine);
}


/**
 * Absolute cosine of the angle between the vector (dx, dy) and the x axis; 0 for the zero vector,
 * which would otherwise evaluate to 0/0 = NaN.
 */
private static double absCosineToXAxis(double dx, double dy) {
    double length = Math.sqrt(dx * dx + dy * dy);
    if (length == 0.0) {
        return 0.0;
    }
    return Math.abs(dx / length);
}
}

View File

@ -40,7 +40,7 @@ public class SpacingService {
}
}
Histogram histogram = new Histogram(0, maxDistance, SPACING_HISTOGRAM_RESOLUTION);
AngleFilter filter = AngleFilter.newInstance(angle - ANGLE_TOLERANCE, angle + ANGLE_TOLERANCE);
AngleFilter filter = new AngleFilter(angle - ANGLE_TOLERANCE, angle + ANGLE_TOLERANCE);
for (Character component : components) {
for (Neighbor neighbor : component.getNeighbors()) {
if (filter.matches(neighbor)) {

View File

@ -25,7 +25,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
@SneakyThrows
public void testViewerDocument() {
String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
String fileName = "files/Plenarprotokoll 1 (keine Druchsache!) (1).pdf";
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
var documentFile = new ClassPathResource(fileName).getFile();