More refactoring
This commit is contained in:
parent
4871e55f2d
commit
b2fb6829cb
@ -1,24 +1,20 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.DisjointSets;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Histogram;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Line;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Word;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Zone;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.Character;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.Neighbor;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterLine;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterZone;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.utils.BoundingBoxBuilder;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.utils.ZoneUtils;
|
||||
|
||||
@ -31,103 +27,9 @@ public class DocstrumSegmenter {
|
||||
private final NearestNeighbourService nearestNeighbourService;
|
||||
private final SpacingService spacingService;
|
||||
private final LineBuilderService lineBuilderService;
|
||||
private final ZoneBuilderService zoneBuilderService;
|
||||
|
||||
public static final int MAX_ZONES_PER_PAGE = 300;
|
||||
|
||||
private static final double DISTANCE_STEP = 16.0;
|
||||
|
||||
/**
|
||||
* Angle histogram resolution in radians per bin.
|
||||
*/
|
||||
private static final double ANGLE_HIST_RESOLUTION = Math.toRadians(0.5);
|
||||
|
||||
/**
|
||||
* Angle histogram smoothing window length in radians.
|
||||
* Length of angle histogram is equal to pi.
|
||||
*/
|
||||
private static final double ANGLE_HIST_SMOOTHING_LEN = 0.25 * Math.PI;
|
||||
|
||||
/**
|
||||
* Angle histogram gaussian smoothing window standard deviation in radians.
|
||||
*/
|
||||
private static final double ANGLE_HIST_SMOOTHING_STDDEV = 0.0625 * Math.PI;
|
||||
|
||||
/**
|
||||
* Spacing histogram resolution per bin.
|
||||
*/
|
||||
private static final double SPACING_HIST_RESOLUTION = 0.5;
|
||||
|
||||
/**
|
||||
* Spacing histogram smoothing window length.
|
||||
*/
|
||||
private static final double SPACING_HIST_SMOOTHING_LEN = 2.5;
|
||||
|
||||
/**
|
||||
* Spacing histogram gaussian smoothing window standard deviation.
|
||||
*/
|
||||
private static final double SPACING_HIST_SMOOTHING_STDDEV = 0.5;
|
||||
|
||||
/**
|
||||
* Maximum vertical component distance multiplier used during line
|
||||
* determination.
|
||||
* <p>
|
||||
* Maximum vertical distance between components (characters) that belong
|
||||
* to the same line is equal to the product of this value and estimated
|
||||
* between-line spacing.
|
||||
*/
|
||||
private static final double MAX_VERTICAL_COMP_DIST = 0.67;
|
||||
|
||||
/**
|
||||
* Minimum line size scale value.
|
||||
* <p>
|
||||
* During zone determination (merging lines into zones) line height is
|
||||
* taken into account. To achieve this, line size scale is estimated and
|
||||
* limited to range [minLineSizeScale, maxLineSizeScale].
|
||||
*/
|
||||
private static final double MIN_LINE_SIZE_SCALE = 0.9;
|
||||
|
||||
/**
|
||||
* Maximum line size scale value.
|
||||
* <p>
|
||||
* See minLineSizeScale for more information.
|
||||
*/
|
||||
private static final double MAX_LINE_SIZE_SCALE = 2.5;
|
||||
|
||||
/**
|
||||
* Minimum horizontal line distance multiplier.
|
||||
* <p>
|
||||
* Minimum horizontal distance between lines that belong to the same zone
|
||||
* is equal to the product of this value and estimated within-line spacing.
|
||||
*/
|
||||
private static final double MIN_HORIZONTAL_DIST = -0.5;
|
||||
|
||||
/**
|
||||
* Minimum vertical line distance multiplier.
|
||||
* <p>
|
||||
* Minimum vertical distance between lines that belong to the same zone
|
||||
* is equal to the product of this value and estimated between-line spacing.
|
||||
*/
|
||||
private static final double MIN_VERTICAL_DIST = 0.0;
|
||||
|
||||
/**
|
||||
* Maximum vertical line distance multiplier.
|
||||
* <p>
|
||||
* Maximum vertical distance between lines that belong to the same zone
|
||||
* is equal to the product of this value and estimated between-line spacing.
|
||||
*/
|
||||
private static final double MAX_VERTICAL_DIST = 1.2;
|
||||
|
||||
/**
|
||||
* Component distance character spacing multiplier.
|
||||
* <p>
|
||||
* Maximum distance between components that belong to the same line is
|
||||
* equal to (lineSpacing * componentDistanceLineMultiplier +
|
||||
* characterSpacing * componentDistanceCharacterMultiplier), where
|
||||
* lineSpacing and characterSpacing are estimated between-line and
|
||||
* within-line spacing, respectively.
|
||||
*/
|
||||
private static final double COMP_DIST_CHAR = 3.5;
|
||||
|
||||
/**
|
||||
* Word distance multiplier.
|
||||
* <p>
|
||||
@ -136,37 +38,6 @@ public class DocstrumSegmenter {
|
||||
*/
|
||||
private static final double WORD_DIST_MULT = 0.2;
|
||||
|
||||
/**
|
||||
* Minimum horizontal line merge distance multiplier.
|
||||
* <p>
|
||||
* Minimum horizontal distance between lines that should be merged is equal
|
||||
* to the product of this value and estimated within-line spacing.
|
||||
* <p>
|
||||
* Because split lines do not overlap this value should be negative.
|
||||
*/
|
||||
|
||||
private static final double MIN_HORIZONTAL_MERGE_DIST = -3.0;
|
||||
|
||||
/**
|
||||
* Maximum vertical line merge distance multiplier.
|
||||
* <p>
|
||||
* Maximum vertical distance between lines that should be merged is equal
|
||||
* to the product of this value and estimated between-line spacing.
|
||||
*/
|
||||
|
||||
private static final double MAX_VERTICAL_MERGE_DIST = 0.5;
|
||||
|
||||
/**
|
||||
* Angle tolerance for comparisons of angles between components and angles
|
||||
* between lines.
|
||||
*/
|
||||
private static final double ANGLE_TOLERANCE = Math.PI / 6;
|
||||
|
||||
/**
|
||||
* Number of nearest-neighbors found per component.
|
||||
*/
|
||||
private static final int NEIGHBOUR_COUNT = 8;
|
||||
|
||||
|
||||
public List<Zone> segmentPage(List<TextPositionSequence> textPositions) {
|
||||
|
||||
@ -176,271 +47,31 @@ public class DocstrumSegmenter {
|
||||
|
||||
nearestNeighbourService.findNearestNeighbors(components);
|
||||
|
||||
double orientation = 0;
|
||||
|
||||
double characterSpacing = spacingService.computeCharacterSpacing(components);
|
||||
double lineSpacing = spacingService.computeLineSpacing(components);
|
||||
|
||||
List<ComponentLine> lines = lineBuilderService.buildLines(components, characterSpacing, lineSpacing);
|
||||
List<CharacterLine> lines = lineBuilderService.buildLines(components, characterSpacing, lineSpacing);
|
||||
|
||||
List<List<ComponentLine>> zones = determineZones(lines,
|
||||
orientation,
|
||||
characterSpacing * MIN_HORIZONTAL_DIST,
|
||||
Double.POSITIVE_INFINITY,
|
||||
lineSpacing * MIN_VERTICAL_DIST,
|
||||
lineSpacing * MAX_VERTICAL_DIST,
|
||||
characterSpacing * MIN_HORIZONTAL_MERGE_DIST,
|
||||
0.0,
|
||||
0.0,
|
||||
lineSpacing * MAX_VERTICAL_MERGE_DIST);
|
||||
List<CharacterZone> zones = zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing);
|
||||
return convertToBxModel(zones, WORD_DIST_MULT * characterSpacing);
|
||||
}
|
||||
|
||||
|
||||
private void findNeighbors(Character[] components) {
|
||||
|
||||
if (components.length == 0) {
|
||||
return;
|
||||
}
|
||||
if (components.length == 1) {
|
||||
components[0].setNeighbors(new ArrayList<Neighbor>());
|
||||
return;
|
||||
}
|
||||
int pageNeighborCount = NEIGHBOUR_COUNT;
|
||||
if (components.length <= NEIGHBOUR_COUNT) {
|
||||
pageNeighborCount = components.length - 1;
|
||||
}
|
||||
|
||||
List<Neighbor> candidates = new ArrayList<Neighbor>();
|
||||
for (int i = 0; i < components.length; i++) {
|
||||
int start = i, end = i + 1;
|
||||
// Contains components from components array
|
||||
// from ranges [start, i) and [i+1, end)
|
||||
double dist = Double.POSITIVE_INFINITY;
|
||||
for (double searchDist = 0; searchDist < dist; ) {
|
||||
searchDist += DISTANCE_STEP;
|
||||
boolean newCandidatesFound = false;
|
||||
|
||||
while (start > 0 && components[i].getX() - components[start - 1].getX() < searchDist) {
|
||||
start--;
|
||||
candidates.add(new Neighbor(components[start], components[i]));
|
||||
if (candidates.size() > pageNeighborCount) {
|
||||
Collections.sort(candidates, NeighborDistanceComparator.getInstance());
|
||||
candidates.subList(pageNeighborCount, candidates.size()).clear();
|
||||
}
|
||||
newCandidatesFound = true;
|
||||
}
|
||||
while (end < components.length && components[end].getX() - components[i].getX() < searchDist) {
|
||||
candidates.add(new Neighbor(components[end], components[i]));
|
||||
if (candidates.size() > pageNeighborCount) {
|
||||
Collections.sort(candidates, NeighborDistanceComparator.getInstance());
|
||||
candidates.subList(pageNeighborCount, candidates.size()).clear();
|
||||
}
|
||||
end++;
|
||||
newCandidatesFound = true;
|
||||
}
|
||||
|
||||
if (newCandidatesFound && candidates.size() >= pageNeighborCount) {
|
||||
Collections.sort(candidates, NeighborDistanceComparator.getInstance());
|
||||
dist = candidates.get(pageNeighborCount - 1).getDistance();
|
||||
}
|
||||
}
|
||||
candidates.subList(pageNeighborCount, candidates.size()).clear();
|
||||
components[i].setNeighbors(new ArrayList<Neighbor>(candidates));
|
||||
candidates.clear();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Computes initial orientation estimation based on nearest-neighbors' angles.
|
||||
*
|
||||
* @param components
|
||||
* @return initial orientation estimation
|
||||
*/
|
||||
private double computeInitialOrientation(List<Character> components) {
|
||||
|
||||
Histogram histogram = new Histogram(-Math.PI / 2, Math.PI / 2, ANGLE_HIST_RESOLUTION);
|
||||
for (Character component : components) {
|
||||
for (Neighbor neighbor : component.getNeighbors()) {
|
||||
histogram.add(neighbor.getAngle());
|
||||
}
|
||||
}
|
||||
// Rectangular smoothing window has been replaced with gaussian smoothing window
|
||||
histogram.circularGaussianSmooth(ANGLE_HIST_SMOOTHING_LEN, ANGLE_HIST_SMOOTHING_STDDEV);
|
||||
return histogram.getPeakValue();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Computes within-line spacing based on nearest-neighbors distances.
|
||||
*
|
||||
* @param components
|
||||
* @param orientation estimated text orientation
|
||||
* @return estimated within-line spacing
|
||||
*/
|
||||
private double computeCharacterSpacing(List<Character> components, double orientation) {
|
||||
|
||||
return computeSpacing(components, orientation);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Computes between-line spacing based on nearest-neighbors distances.
|
||||
*
|
||||
* @param components
|
||||
* @param orientation estimated text orientation
|
||||
* @return estimated between-line spacing
|
||||
*/
|
||||
private double computeLineSpacing(List<Character> components, double orientation) {
|
||||
|
||||
if (orientation >= 0) {
|
||||
return computeSpacing(components, orientation - Math.PI / 2);
|
||||
} else {
|
||||
return computeSpacing(components, orientation + Math.PI / 2);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private double computeSpacing(List<Character> components, double angle) {
|
||||
|
||||
double maxDistance = Double.NEGATIVE_INFINITY;
|
||||
for (Character component : components) {
|
||||
for (Neighbor neighbor : component.getNeighbors()) {
|
||||
maxDistance = Math.max(maxDistance, neighbor.getDistance());
|
||||
}
|
||||
}
|
||||
Histogram histogram = new Histogram(0, maxDistance, SPACING_HIST_RESOLUTION);
|
||||
AngleFilter filter = AngleFilter.newInstance(angle - ANGLE_TOLERANCE, angle + ANGLE_TOLERANCE);
|
||||
for (Character component : components) {
|
||||
for (Neighbor neighbor : component.getNeighbors()) {
|
||||
if (filter.matches(neighbor)) {
|
||||
histogram.add(neighbor.getDistance());
|
||||
}
|
||||
}
|
||||
}
|
||||
// Rectangular smoothing window has been replaced with gaussian smoothing window
|
||||
histogram.gaussianSmooth(SPACING_HIST_SMOOTHING_LEN, SPACING_HIST_SMOOTHING_STDDEV);
|
||||
return histogram.getPeakValue();
|
||||
}
|
||||
|
||||
|
||||
private List<ComponentLine> determineLines(List<Character> characters, double characterSpacing, double lineSpacing) {
|
||||
|
||||
double maxHorizontalDistance = characterSpacing * COMP_DIST_CHAR;
|
||||
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_COMP_DIST;
|
||||
|
||||
// DisjointSets<Character> sets = new DisjointSets<Character>(characters);
|
||||
// AngleFilter filter = AngleFilter.newInstance(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);
|
||||
// for (Character component : characters) {
|
||||
// for (Neighbor neighbor : component.getNeighbors()) {
|
||||
// double x = neighbor.getHorizontalDistance() / maxHorizontalDistance;
|
||||
// double y = neighbor.getVerticalDistance() / maxVerticalDistance;
|
||||
// if (filter.matches(neighbor) && x * x + y * y <= 1) {
|
||||
// sets.union(component, neighbor.getCharacter());
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// List<ComponentLine> lines = new ArrayList<ComponentLine>();
|
||||
// for (Set<Character> group : sets) {
|
||||
// List<Character> lineComponents = new ArrayList<Character>(group);
|
||||
// lineComponents.sort(Comparator.comparingDouble(Character::getX));
|
||||
// lines.add(new ComponentLine(lineComponents));
|
||||
// }
|
||||
// return lines;
|
||||
|
||||
DisjointSets<Character> sets = new DisjointSets<>(characters);
|
||||
AngleFilter filter = AngleFilter.newInstance(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);
|
||||
|
||||
characters.forEach(character -> {
|
||||
character.getNeighbors().forEach(neighbor -> {
|
||||
double x = neighbor.getHorizontalDistance() / maxHorizontalDistance;
|
||||
double y = neighbor.getVerticalDistance() / maxVerticalDistance;
|
||||
if (filter.matches(neighbor) && Math.pow(x, 2) + Math.pow(y, 2) <= 1) {
|
||||
sets.union(character, neighbor.getCharacter());
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
List<ComponentLine> lines = new ArrayList<>();
|
||||
sets.forEach(group -> {
|
||||
List<Character> lineComponents = new ArrayList<>(group);
|
||||
lineComponents.sort(Comparator.comparingDouble(Character::getX));
|
||||
lines.add(new ComponentLine(lineComponents));
|
||||
});
|
||||
|
||||
return lines;
|
||||
|
||||
}
|
||||
|
||||
|
||||
private List<List<ComponentLine>> determineZones(List<ComponentLine> lines,
|
||||
double orientation,
|
||||
double minHorizontalDistance,
|
||||
double maxHorizontalDistance,
|
||||
double minVerticalDistance,
|
||||
double maxVerticalDistance,
|
||||
double minHorizontalMergeDistance,
|
||||
double maxHorizontalMergeDistance,
|
||||
double minVerticalMergeDistance,
|
||||
double maxVerticalMergeDistance) {
|
||||
|
||||
DisjointSets<ComponentLine> sets = new DisjointSets<ComponentLine>(lines);
|
||||
// Mean height is computed so that all distances can be scaled
|
||||
// relative to the line height
|
||||
double meanHeight = 0.0, weights = 0.0;
|
||||
for (ComponentLine line : lines) {
|
||||
double weight = line.getLength();
|
||||
meanHeight += line.getHeight() * weight;
|
||||
weights += weight;
|
||||
}
|
||||
meanHeight /= weights;
|
||||
|
||||
for (int i = 0; i < lines.size(); i++) {
|
||||
ComponentLine li = lines.get(i);
|
||||
for (int j = i + 1; j < lines.size(); j++) {
|
||||
ComponentLine lj = lines.get(j);
|
||||
double scale = Math.min(li.getHeight(), lj.getHeight()) / meanHeight;
|
||||
scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(scale, MAX_LINE_SIZE_SCALE));
|
||||
// "<=" is used instead of "<" for consistency and to allow setting minVertical(Merge)Distance
|
||||
// to 0.0 with meaning "no minimal distance required"
|
||||
if (!sets.areTogether(li, lj) && li.angularDifference(lj) <= ANGLE_TOLERANCE) {
|
||||
double hDist = li.horizontalDistance(lj, orientation) / scale;
|
||||
double vDist = li.verticalDistance(lj, orientation) / scale;
|
||||
// Line over or above
|
||||
if (minHorizontalDistance <= hDist && hDist <= maxHorizontalDistance && minVerticalDistance <= vDist && vDist <= maxVerticalDistance) {
|
||||
sets.union(li, lj);
|
||||
}
|
||||
// Split line that needs later merging
|
||||
else if (minHorizontalMergeDistance <= hDist && hDist <= maxHorizontalMergeDistance && minVerticalMergeDistance <= vDist && vDist <= maxVerticalMergeDistance) {
|
||||
sets.union(li, lj);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
List<List<ComponentLine>> zones = new ArrayList<List<ComponentLine>>();
|
||||
for (Set<ComponentLine> group : sets) {
|
||||
zones.add(new ArrayList<ComponentLine>(group));
|
||||
}
|
||||
return zones;
|
||||
}
|
||||
|
||||
|
||||
private List<Zone> convertToBxModel(List<List<ComponentLine>> zones, double wordSpacing) {
|
||||
private List<Zone> convertToBxModel(List<CharacterZone> zones, double wordSpacing) {
|
||||
|
||||
List<Zone> zoneList = new ArrayList<>();
|
||||
if (zones.size() > MAX_ZONES_PER_PAGE) {
|
||||
List<ComponentLine> oneZone = new ArrayList<ComponentLine>();
|
||||
for (List<ComponentLine> zone : zones) {
|
||||
oneZone.addAll(zone);
|
||||
CharacterZone oneZone = new CharacterZone();
|
||||
for (CharacterZone zone : zones) {
|
||||
oneZone.getLines().addAll(zone.getLines());
|
||||
}
|
||||
zones = new ArrayList<>();
|
||||
zones.add(oneZone);
|
||||
}
|
||||
|
||||
for (List<ComponentLine> lines : zones) {
|
||||
for (CharacterZone characterZone : zones) {
|
||||
Zone zone = new Zone();
|
||||
for (ComponentLine line : lines) {
|
||||
for (CharacterLine line : characterZone.getLines()) {
|
||||
zone.addLine(line.convertToBxLine(wordSpacing));
|
||||
}
|
||||
List<Line> zLines = Lists.newArrayList(zone.getLines());
|
||||
@ -461,173 +92,4 @@ public class DocstrumSegmenter {
|
||||
return zoneList;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Neighbor distance comparator based on the distance.
|
||||
* <p>
|
||||
* The ordering is not consistent with equals.
|
||||
*/
|
||||
protected static final class NeighborDistanceComparator implements Comparator<Neighbor> {
|
||||
|
||||
private NeighborDistanceComparator() {
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int compare(Neighbor o1, Neighbor o2) {
|
||||
|
||||
return Double.compare(o1.getDistance(), o2.getDistance());
|
||||
}
|
||||
|
||||
|
||||
private static final NeighborDistanceComparator instance = new NeighborDistanceComparator();
|
||||
|
||||
|
||||
public static NeighborDistanceComparator getInstance() {
|
||||
|
||||
return instance;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Internal representation of the text line.
|
||||
*/
|
||||
protected static class ComponentLine {
|
||||
|
||||
private final double x0;
|
||||
private final double y0;
|
||||
|
||||
private final double x1;
|
||||
private final double y1;
|
||||
|
||||
private final double height;
|
||||
|
||||
private final List<Character> components;
|
||||
|
||||
|
||||
public ComponentLine(List<Character> components) {
|
||||
|
||||
this.components = components;
|
||||
|
||||
if (components.size() >= 2) {
|
||||
// Simple linear regression
|
||||
double sx = 0.0, sxx = 0.0, sxy = 0.0, sy = 0.0;
|
||||
for (Character component : components) {
|
||||
sx += component.getX();
|
||||
sxx += component.getX() * component.getX();
|
||||
sxy += component.getX() * component.getY();
|
||||
sy += component.getY();
|
||||
}
|
||||
double b = (components.size() * sxy - sx * sy) / (components.size() * sxx - sx * sx);
|
||||
double a = (sy - b * sx) / components.size();
|
||||
|
||||
this.x0 = components.get(0).getX();
|
||||
this.y0 = a + b * this.x0;
|
||||
this.x1 = components.get(components.size() - 1).getX();
|
||||
this.y1 = a + b * this.x1;
|
||||
} else if (!components.isEmpty()) {
|
||||
Character component = components.get(0);
|
||||
double dx = component.getTextPosition().getWidthDirAdj() / 3;
|
||||
double dy = dx * Math.tan(0);
|
||||
this.x0 = component.getX() - dx;
|
||||
this.x1 = component.getX() + dx;
|
||||
this.y0 = component.getY() - dy;
|
||||
this.y1 = component.getY() + dy;
|
||||
} else {
|
||||
throw new IllegalArgumentException("Component list must not be empty");
|
||||
}
|
||||
height = computeHeight();
|
||||
}
|
||||
|
||||
|
||||
public double getAngle() {
|
||||
|
||||
return Math.atan2(y1 - y0, x1 - x0);
|
||||
}
|
||||
|
||||
|
||||
public double getLength() {
|
||||
|
||||
return Math.sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1));
|
||||
}
|
||||
|
||||
|
||||
private double computeHeight() {
|
||||
|
||||
double sum = 0.0;
|
||||
for (Character component : components) {
|
||||
sum += component.getHeight();
|
||||
}
|
||||
return sum / components.size();
|
||||
}
|
||||
|
||||
|
||||
public double getHeight() {
|
||||
|
||||
return height;
|
||||
}
|
||||
|
||||
|
||||
public double angularDifference(ComponentLine j) {
|
||||
|
||||
double diff = Math.abs(getAngle() - j.getAngle());
|
||||
if (diff <= Math.PI / 2) {
|
||||
return diff;
|
||||
} else {
|
||||
return Math.PI - diff;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public double horizontalDistance(ComponentLine other, double orientation) {
|
||||
|
||||
double[] xs = new double[4];
|
||||
double s = Math.sin(-orientation), c = Math.cos(-orientation);
|
||||
xs[0] = c * x0 - s * y0;
|
||||
xs[1] = c * x1 - s * y1;
|
||||
xs[2] = c * other.x0 - s * other.y0;
|
||||
xs[3] = c * other.x1 - s * other.y1;
|
||||
boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0];
|
||||
Arrays.sort(xs);
|
||||
return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1);
|
||||
}
|
||||
|
||||
|
||||
public double verticalDistance(ComponentLine other, double orientation) {
|
||||
|
||||
double xm = (x0 + x1) / 2, ym = (y0 + y1) / 2;
|
||||
double xn = (other.x0 + other.x1) / 2, yn = (other.y0 + other.y1) / 2;
|
||||
double a = Math.tan(orientation);
|
||||
return Math.abs(a * (xn - xm) + ym - yn) / Math.sqrt(a * a + 1);
|
||||
}
|
||||
|
||||
|
||||
public Line convertToBxLine(double wordSpacing) {
|
||||
|
||||
Line line = new Line();
|
||||
Word word = new Word();
|
||||
Character previousComponent = null;
|
||||
for (Character component : components) {
|
||||
if (previousComponent != null) {
|
||||
double dist = component.getTextPosition().getXDirAdj() - previousComponent.getTextPosition().getXDirAdj() - previousComponent.getTextPosition()
|
||||
.getWidthDirAdj();
|
||||
if (dist > wordSpacing) {
|
||||
BoundingBoxBuilder.setBounds(word);
|
||||
line.addWord(word);
|
||||
word = new Word();
|
||||
}
|
||||
}
|
||||
word.addChunk(component.getTextPosition());
|
||||
previousComponent = component;
|
||||
}
|
||||
BoundingBoxBuilder.setBounds(word);
|
||||
line.addWord(word);
|
||||
BoundingBoxBuilder.setBounds(line);
|
||||
return line;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -8,6 +8,7 @@ import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.DisjointSets;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.Character;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterLine;
|
||||
|
||||
@Service
|
||||
public class LineBuilderService {
|
||||
@ -17,7 +18,7 @@ public class LineBuilderService {
|
||||
private static final double ANGLE_TOLERANCE = Math.PI / 6;
|
||||
|
||||
|
||||
public List<DocstrumSegmenter.ComponentLine> buildLines(List<Character> characters, double characterSpacing, double lineSpacing) {
|
||||
public List<CharacterLine> buildLines(List<Character> characters, double characterSpacing, double lineSpacing) {
|
||||
|
||||
double maxHorizontalDistance = characterSpacing * CHARACTER_SPACING_DISTANCE_MULTIPLIER;
|
||||
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_CHARACTER_DISTANCE;
|
||||
@ -35,11 +36,11 @@ public class LineBuilderService {
|
||||
});
|
||||
});
|
||||
|
||||
List<DocstrumSegmenter.ComponentLine> lines = new ArrayList<>();
|
||||
List<CharacterLine> lines = new ArrayList<>();
|
||||
sets.forEach(group -> {
|
||||
List<Character> lineComponents = new ArrayList<>(group);
|
||||
lineComponents.sort(Comparator.comparingDouble(Character::getX));
|
||||
lines.add(new DocstrumSegmenter.ComponentLine(lineComponents));
|
||||
lines.add(new CharacterLine(lineComponents));
|
||||
});
|
||||
|
||||
return lines;
|
||||
|
||||
@ -3,6 +3,10 @@ package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.m
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Line;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Word;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.utils.BoundingBoxBuilder;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@ -90,11 +94,10 @@ public class CharacterLine {
|
||||
public double horizontalDistance(CharacterLine other) {
|
||||
|
||||
double[] xs = new double[4];
|
||||
double s = 0, c = 1;
|
||||
xs[0] = c * x0 - s * y0;
|
||||
xs[1] = c * x1 - s * y1;
|
||||
xs[2] = c * other.x0 - s * other.y0;
|
||||
xs[3] = c * other.x1 - s * other.y1;
|
||||
xs[0] = x0;
|
||||
xs[1] = x1;
|
||||
xs[2] = other.x0;
|
||||
xs[3] = other.x1;
|
||||
boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0];
|
||||
Arrays.sort(xs);
|
||||
return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1);
|
||||
@ -103,9 +106,34 @@ public class CharacterLine {
|
||||
|
||||
public double verticalDistance(CharacterLine other) {
|
||||
|
||||
double xm = (x0 + x1) / 2, ym = (y0 + y1) / 2;
|
||||
double xn = (other.x0 + other.x1) / 2, yn = (other.y0 + other.y1) / 2;
|
||||
return Math.abs((xn - xm) + ym - yn) / Math.sqrt(1);
|
||||
double ym = (y0 + y1) / 2;
|
||||
double yn = (other.y0 + other.y1) / 2;
|
||||
return Math.abs(ym - yn) / Math.sqrt(1);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public Line convertToBxLine(double wordSpacing) {
|
||||
|
||||
Line line = new Line();
|
||||
Word word = new Word();
|
||||
Character previousComponent = null;
|
||||
for (Character component : characters) {
|
||||
if (previousComponent != null) {
|
||||
double dist = component.getTextPosition().getXDirAdj() - previousComponent.getTextPosition().getXDirAdj() - previousComponent.getTextPosition().getWidthDirAdj();
|
||||
if (dist > wordSpacing) {
|
||||
BoundingBoxBuilder.setBounds(word);
|
||||
line.addWord(word);
|
||||
word = new Word();
|
||||
}
|
||||
}
|
||||
word.addChunk(component.getTextPosition());
|
||||
previousComponent = component;
|
||||
}
|
||||
BoundingBoxBuilder.setBounds(word);
|
||||
line.addWord(word);
|
||||
BoundingBoxBuilder.setBounds(line);
|
||||
return line;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -3,6 +3,8 @@ package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.m
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BBoxObject;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
@ -10,8 +12,13 @@ import lombok.NoArgsConstructor;
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class CharacterZone {
|
||||
public class CharacterZone extends BBoxObject {
|
||||
|
||||
private List<CharacterLine> lines = new ArrayList<>();
|
||||
|
||||
|
||||
public void buildBox() {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -25,7 +25,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
@SneakyThrows
|
||||
public void testViewerDocument() {
|
||||
|
||||
System.out.println("<<<<<<<<<<" + Math.sin(-0) + "aaa " + Math.cos(-0));
|
||||
System.out.println("<<<<<<<<<<" + Math.sin(-0) + "aaa " + Math.cos(0) + Math.tan(0));
|
||||
|
||||
String fileName = "files/Plenarprotokoll 1 (keine Druchsache!) (1).pdf";
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user