diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmenter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmenter.java
index 65848c0..896658f 100644
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmenter.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmenter.java
@@ -1,24 +1,20 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum;
import java.util.ArrayList;
-import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
-import java.util.Set;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.google.common.collect.Lists;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
-import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.DisjointSets;
-import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Histogram;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Line;
-import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Word;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Zone;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.Character;
-import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.Neighbor;
+import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterLine;
+import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterZone;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.utils.BoundingBoxBuilder;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.utils.ZoneUtils;
@@ -31,103 +27,9 @@ public class DocstrumSegmenter {
private final NearestNeighbourService nearestNeighbourService;
private final SpacingService spacingService;
private final LineBuilderService lineBuilderService;
+ private final ZoneBuilderService zoneBuilderService;
public static final int MAX_ZONES_PER_PAGE = 300;
-
- private static final double DISTANCE_STEP = 16.0;
-
- /**
- * Angle histogram resolution in radians per bin.
- */
- private static final double ANGLE_HIST_RESOLUTION = Math.toRadians(0.5);
-
- /**
- * Angle histogram smoothing window length in radians.
- * Length of angle histogram is equal to pi.
- */
- private static final double ANGLE_HIST_SMOOTHING_LEN = 0.25 * Math.PI;
-
- /**
- * Angle histogram gaussian smoothing window standard deviation in radians.
- */
- private static final double ANGLE_HIST_SMOOTHING_STDDEV = 0.0625 * Math.PI;
-
- /**
- * Spacing histogram resolution per bin.
- */
- private static final double SPACING_HIST_RESOLUTION = 0.5;
-
- /**
- * Spacing histogram smoothing window length.
- */
- private static final double SPACING_HIST_SMOOTHING_LEN = 2.5;
-
- /**
- * Spacing histogram gaussian smoothing window standard deviation.
- */
- private static final double SPACING_HIST_SMOOTHING_STDDEV = 0.5;
-
- /**
- * Maximum vertical component distance multiplier used during line
- * determination.
- *
- * Maximum vertical distance between components (characters) that belong
- * to the same line is equal to the product of this value and estimated
- * between-line spacing.
- */
- private static final double MAX_VERTICAL_COMP_DIST = 0.67;
-
- /**
- * Minimum line size scale value.
- *
- * During zone determination (merging lines into zones) line height is
- * taken into account. To achieve this, line size scale is estimated and
- * limited to range [minLineSizeScale, maxLineSizeScale].
- */
- private static final double MIN_LINE_SIZE_SCALE = 0.9;
-
- /**
- * Maximum line size scale value.
- *
- * See minLineSizeScale for more information.
- */
- private static final double MAX_LINE_SIZE_SCALE = 2.5;
-
- /**
- * Minimum horizontal line distance multiplier.
- *
- * Minimum horizontal distance between lines that belong to the same zone
- * is equal to the product of this value and estimated within-line spacing.
- */
- private static final double MIN_HORIZONTAL_DIST = -0.5;
-
- /**
- * Minimum vertical line distance multiplier.
- *
- * Minimum vertical distance between lines that belong to the same zone
- * is equal to the product of this value and estimated between-line spacing.
- */
- private static final double MIN_VERTICAL_DIST = 0.0;
-
- /**
- * Maximum vertical line distance multiplier.
- *
- * Maximum vertical distance between lines that belong to the same zone
- * is equal to the product of this value and estimated between-line spacing.
- */
- private static final double MAX_VERTICAL_DIST = 1.2;
-
- /**
- * Component distance character spacing multiplier.
- *
- * Maximum distance between components that belong to the same line is
- * equal to (lineSpacing * componentDistanceLineMultiplier +
- * characterSpacing * componentDistanceCharacterMultiplier), where
- * lineSpacing and characterSpacing are estimated between-line and
- * within-line spacing, respectively.
- */
- private static final double COMP_DIST_CHAR = 3.5;
-
/**
* Word distance multiplier.
*
@@ -136,37 +38,6 @@ public class DocstrumSegmenter {
*/
private static final double WORD_DIST_MULT = 0.2;
- /**
- * Minimum horizontal line merge distance multiplier.
- *
- * Minimum horizontal distance between lines that should be merged is equal
- * to the product of this value and estimated within-line spacing.
- *
- * Because split lines do not overlap this value should be negative.
- */
-
- private static final double MIN_HORIZONTAL_MERGE_DIST = -3.0;
-
- /**
- * Maximum vertical line merge distance multiplier.
- *
- * Maximum vertical distance between lines that should be merged is equal
- * to the product of this value and estimated between-line spacing.
- */
-
- private static final double MAX_VERTICAL_MERGE_DIST = 0.5;
-
- /**
- * Angle tolerance for comparisons of angles between components and angles
- * between lines.
- */
- private static final double ANGLE_TOLERANCE = Math.PI / 6;
-
- /**
- * Number of nearest-neighbors found per component.
- */
- private static final int NEIGHBOUR_COUNT = 8;
-
public List segmentPage(List textPositions) {
@@ -176,271 +47,31 @@ public class DocstrumSegmenter {
nearestNeighbourService.findNearestNeighbors(components);
- double orientation = 0;
-
double characterSpacing = spacingService.computeCharacterSpacing(components);
double lineSpacing = spacingService.computeLineSpacing(components);
- List lines = lineBuilderService.buildLines(components, characterSpacing, lineSpacing);
+ List lines = lineBuilderService.buildLines(components, characterSpacing, lineSpacing);
- List> zones = determineZones(lines,
- orientation,
- characterSpacing * MIN_HORIZONTAL_DIST,
- Double.POSITIVE_INFINITY,
- lineSpacing * MIN_VERTICAL_DIST,
- lineSpacing * MAX_VERTICAL_DIST,
- characterSpacing * MIN_HORIZONTAL_MERGE_DIST,
- 0.0,
- 0.0,
- lineSpacing * MAX_VERTICAL_MERGE_DIST);
+ List zones = zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing);
return convertToBxModel(zones, WORD_DIST_MULT * characterSpacing);
}
- private void findNeighbors(Character[] components) {
-
- if (components.length == 0) {
- return;
- }
- if (components.length == 1) {
- components[0].setNeighbors(new ArrayList());
- return;
- }
- int pageNeighborCount = NEIGHBOUR_COUNT;
- if (components.length <= NEIGHBOUR_COUNT) {
- pageNeighborCount = components.length - 1;
- }
-
- List candidates = new ArrayList();
- for (int i = 0; i < components.length; i++) {
- int start = i, end = i + 1;
- // Contains components from components array
- // from ranges [start, i) and [i+1, end)
- double dist = Double.POSITIVE_INFINITY;
- for (double searchDist = 0; searchDist < dist; ) {
- searchDist += DISTANCE_STEP;
- boolean newCandidatesFound = false;
-
- while (start > 0 && components[i].getX() - components[start - 1].getX() < searchDist) {
- start--;
- candidates.add(new Neighbor(components[start], components[i]));
- if (candidates.size() > pageNeighborCount) {
- Collections.sort(candidates, NeighborDistanceComparator.getInstance());
- candidates.subList(pageNeighborCount, candidates.size()).clear();
- }
- newCandidatesFound = true;
- }
- while (end < components.length && components[end].getX() - components[i].getX() < searchDist) {
- candidates.add(new Neighbor(components[end], components[i]));
- if (candidates.size() > pageNeighborCount) {
- Collections.sort(candidates, NeighborDistanceComparator.getInstance());
- candidates.subList(pageNeighborCount, candidates.size()).clear();
- }
- end++;
- newCandidatesFound = true;
- }
-
- if (newCandidatesFound && candidates.size() >= pageNeighborCount) {
- Collections.sort(candidates, NeighborDistanceComparator.getInstance());
- dist = candidates.get(pageNeighborCount - 1).getDistance();
- }
- }
- candidates.subList(pageNeighborCount, candidates.size()).clear();
- components[i].setNeighbors(new ArrayList(candidates));
- candidates.clear();
- }
- }
-
-
- /**
- * Computes initial orientation estimation based on nearest-neighbors' angles.
- *
- * @param components
- * @return initial orientation estimation
- */
- private double computeInitialOrientation(List components) {
-
- Histogram histogram = new Histogram(-Math.PI / 2, Math.PI / 2, ANGLE_HIST_RESOLUTION);
- for (Character component : components) {
- for (Neighbor neighbor : component.getNeighbors()) {
- histogram.add(neighbor.getAngle());
- }
- }
- // Rectangular smoothing window has been replaced with gaussian smoothing window
- histogram.circularGaussianSmooth(ANGLE_HIST_SMOOTHING_LEN, ANGLE_HIST_SMOOTHING_STDDEV);
- return histogram.getPeakValue();
- }
-
-
- /**
- * Computes within-line spacing based on nearest-neighbors distances.
- *
- * @param components
- * @param orientation estimated text orientation
- * @return estimated within-line spacing
- */
- private double computeCharacterSpacing(List components, double orientation) {
-
- return computeSpacing(components, orientation);
- }
-
-
- /**
- * Computes between-line spacing based on nearest-neighbors distances.
- *
- * @param components
- * @param orientation estimated text orientation
- * @return estimated between-line spacing
- */
- private double computeLineSpacing(List components, double orientation) {
-
- if (orientation >= 0) {
- return computeSpacing(components, orientation - Math.PI / 2);
- } else {
- return computeSpacing(components, orientation + Math.PI / 2);
- }
- }
-
-
- private double computeSpacing(List components, double angle) {
-
- double maxDistance = Double.NEGATIVE_INFINITY;
- for (Character component : components) {
- for (Neighbor neighbor : component.getNeighbors()) {
- maxDistance = Math.max(maxDistance, neighbor.getDistance());
- }
- }
- Histogram histogram = new Histogram(0, maxDistance, SPACING_HIST_RESOLUTION);
- AngleFilter filter = AngleFilter.newInstance(angle - ANGLE_TOLERANCE, angle + ANGLE_TOLERANCE);
- for (Character component : components) {
- for (Neighbor neighbor : component.getNeighbors()) {
- if (filter.matches(neighbor)) {
- histogram.add(neighbor.getDistance());
- }
- }
- }
- // Rectangular smoothing window has been replaced with gaussian smoothing window
- histogram.gaussianSmooth(SPACING_HIST_SMOOTHING_LEN, SPACING_HIST_SMOOTHING_STDDEV);
- return histogram.getPeakValue();
- }
-
-
- private List determineLines(List characters, double characterSpacing, double lineSpacing) {
-
- double maxHorizontalDistance = characterSpacing * COMP_DIST_CHAR;
- double maxVerticalDistance = lineSpacing * MAX_VERTICAL_COMP_DIST;
-
-// DisjointSets sets = new DisjointSets(characters);
-// AngleFilter filter = AngleFilter.newInstance(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);
-// for (Character component : characters) {
-// for (Neighbor neighbor : component.getNeighbors()) {
-// double x = neighbor.getHorizontalDistance() / maxHorizontalDistance;
-// double y = neighbor.getVerticalDistance() / maxVerticalDistance;
-// if (filter.matches(neighbor) && x * x + y * y <= 1) {
-// sets.union(component, neighbor.getCharacter());
-// }
-// }
-// }
-// List lines = new ArrayList();
-// for (Set group : sets) {
-// List lineComponents = new ArrayList(group);
-// lineComponents.sort(Comparator.comparingDouble(Character::getX));
-// lines.add(new ComponentLine(lineComponents));
-// }
-// return lines;
-
- DisjointSets sets = new DisjointSets<>(characters);
- AngleFilter filter = AngleFilter.newInstance(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);
-
- characters.forEach(character -> {
- character.getNeighbors().forEach(neighbor -> {
- double x = neighbor.getHorizontalDistance() / maxHorizontalDistance;
- double y = neighbor.getVerticalDistance() / maxVerticalDistance;
- if (filter.matches(neighbor) && Math.pow(x, 2) + Math.pow(y, 2) <= 1) {
- sets.union(character, neighbor.getCharacter());
- }
- });
- });
-
- List lines = new ArrayList<>();
- sets.forEach(group -> {
- List lineComponents = new ArrayList<>(group);
- lineComponents.sort(Comparator.comparingDouble(Character::getX));
- lines.add(new ComponentLine(lineComponents));
- });
-
- return lines;
-
- }
-
-
- private List> determineZones(List lines,
- double orientation,
- double minHorizontalDistance,
- double maxHorizontalDistance,
- double minVerticalDistance,
- double maxVerticalDistance,
- double minHorizontalMergeDistance,
- double maxHorizontalMergeDistance,
- double minVerticalMergeDistance,
- double maxVerticalMergeDistance) {
-
- DisjointSets sets = new DisjointSets(lines);
- // Mean height is computed so that all distances can be scaled
- // relative to the line height
- double meanHeight = 0.0, weights = 0.0;
- for (ComponentLine line : lines) {
- double weight = line.getLength();
- meanHeight += line.getHeight() * weight;
- weights += weight;
- }
- meanHeight /= weights;
-
- for (int i = 0; i < lines.size(); i++) {
- ComponentLine li = lines.get(i);
- for (int j = i + 1; j < lines.size(); j++) {
- ComponentLine lj = lines.get(j);
- double scale = Math.min(li.getHeight(), lj.getHeight()) / meanHeight;
- scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(scale, MAX_LINE_SIZE_SCALE));
- // "<=" is used instead of "<" for consistency and to allow setting minVertical(Merge)Distance
- // to 0.0 with meaning "no minimal distance required"
- if (!sets.areTogether(li, lj) && li.angularDifference(lj) <= ANGLE_TOLERANCE) {
- double hDist = li.horizontalDistance(lj, orientation) / scale;
- double vDist = li.verticalDistance(lj, orientation) / scale;
- // Line over or above
- if (minHorizontalDistance <= hDist && hDist <= maxHorizontalDistance && minVerticalDistance <= vDist && vDist <= maxVerticalDistance) {
- sets.union(li, lj);
- }
- // Split line that needs later merging
- else if (minHorizontalMergeDistance <= hDist && hDist <= maxHorizontalMergeDistance && minVerticalMergeDistance <= vDist && vDist <= maxVerticalMergeDistance) {
- sets.union(li, lj);
- }
- }
- }
- }
- List> zones = new ArrayList>();
- for (Set group : sets) {
- zones.add(new ArrayList(group));
- }
- return zones;
- }
-
-
- private List convertToBxModel(List> zones, double wordSpacing) {
+ private List convertToBxModel(List zones, double wordSpacing) {
List zoneList = new ArrayList<>();
if (zones.size() > MAX_ZONES_PER_PAGE) {
- List oneZone = new ArrayList();
- for (List zone : zones) {
- oneZone.addAll(zone);
+ CharacterZone oneZone = new CharacterZone();
+ for (CharacterZone zone : zones) {
+ oneZone.getLines().addAll(zone.getLines());
}
zones = new ArrayList<>();
zones.add(oneZone);
}
- for (List lines : zones) {
+ for (CharacterZone characterZone : zones) {
Zone zone = new Zone();
- for (ComponentLine line : lines) {
+ for (CharacterLine line : characterZone.getLines()) {
zone.addLine(line.convertToBxLine(wordSpacing));
}
List zLines = Lists.newArrayList(zone.getLines());
@@ -461,173 +92,4 @@ public class DocstrumSegmenter {
return zoneList;
}
-
- /**
- * Neighbor distance comparator based on the distance.
- *
- * The ordering is not consistent with equals.
- */
- protected static final class NeighborDistanceComparator implements Comparator {
-
- private NeighborDistanceComparator() {
-
- }
-
-
- @Override
- public int compare(Neighbor o1, Neighbor o2) {
-
- return Double.compare(o1.getDistance(), o2.getDistance());
- }
-
-
- private static final NeighborDistanceComparator instance = new NeighborDistanceComparator();
-
-
- public static NeighborDistanceComparator getInstance() {
-
- return instance;
- }
-
- }
-
- /**
- * Internal representation of the text line.
- */
- protected static class ComponentLine {
-
- private final double x0;
- private final double y0;
-
- private final double x1;
- private final double y1;
-
- private final double height;
-
- private final List components;
-
-
- public ComponentLine(List components) {
-
- this.components = components;
-
- if (components.size() >= 2) {
- // Simple linear regression
- double sx = 0.0, sxx = 0.0, sxy = 0.0, sy = 0.0;
- for (Character component : components) {
- sx += component.getX();
- sxx += component.getX() * component.getX();
- sxy += component.getX() * component.getY();
- sy += component.getY();
- }
- double b = (components.size() * sxy - sx * sy) / (components.size() * sxx - sx * sx);
- double a = (sy - b * sx) / components.size();
-
- this.x0 = components.get(0).getX();
- this.y0 = a + b * this.x0;
- this.x1 = components.get(components.size() - 1).getX();
- this.y1 = a + b * this.x1;
- } else if (!components.isEmpty()) {
- Character component = components.get(0);
- double dx = component.getTextPosition().getWidthDirAdj() / 3;
- double dy = dx * Math.tan(0);
- this.x0 = component.getX() - dx;
- this.x1 = component.getX() + dx;
- this.y0 = component.getY() - dy;
- this.y1 = component.getY() + dy;
- } else {
- throw new IllegalArgumentException("Component list must not be empty");
- }
- height = computeHeight();
- }
-
-
- public double getAngle() {
-
- return Math.atan2(y1 - y0, x1 - x0);
- }
-
-
- public double getLength() {
-
- return Math.sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1));
- }
-
-
- private double computeHeight() {
-
- double sum = 0.0;
- for (Character component : components) {
- sum += component.getHeight();
- }
- return sum / components.size();
- }
-
-
- public double getHeight() {
-
- return height;
- }
-
-
- public double angularDifference(ComponentLine j) {
-
- double diff = Math.abs(getAngle() - j.getAngle());
- if (diff <= Math.PI / 2) {
- return diff;
- } else {
- return Math.PI - diff;
- }
- }
-
-
- public double horizontalDistance(ComponentLine other, double orientation) {
-
- double[] xs = new double[4];
- double s = Math.sin(-orientation), c = Math.cos(-orientation);
- xs[0] = c * x0 - s * y0;
- xs[1] = c * x1 - s * y1;
- xs[2] = c * other.x0 - s * other.y0;
- xs[3] = c * other.x1 - s * other.y1;
- boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0];
- Arrays.sort(xs);
- return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1);
- }
-
-
- public double verticalDistance(ComponentLine other, double orientation) {
-
- double xm = (x0 + x1) / 2, ym = (y0 + y1) / 2;
- double xn = (other.x0 + other.x1) / 2, yn = (other.y0 + other.y1) / 2;
- double a = Math.tan(orientation);
- return Math.abs(a * (xn - xm) + ym - yn) / Math.sqrt(a * a + 1);
- }
-
-
- public Line convertToBxLine(double wordSpacing) {
-
- Line line = new Line();
- Word word = new Word();
- Character previousComponent = null;
- for (Character component : components) {
- if (previousComponent != null) {
- double dist = component.getTextPosition().getXDirAdj() - previousComponent.getTextPosition().getXDirAdj() - previousComponent.getTextPosition()
- .getWidthDirAdj();
- if (dist > wordSpacing) {
- BoundingBoxBuilder.setBounds(word);
- line.addWord(word);
- word = new Word();
- }
- }
- word.addChunk(component.getTextPosition());
- previousComponent = component;
- }
- BoundingBoxBuilder.setBounds(word);
- line.addWord(word);
- BoundingBoxBuilder.setBounds(line);
- return line;
- }
-
- }
-
}
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/LineBuilderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/LineBuilderService.java
index 217329d..59a03fa 100644
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/LineBuilderService.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/LineBuilderService.java
@@ -8,6 +8,7 @@ import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.DisjointSets;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.Character;
+import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterLine;
@Service
public class LineBuilderService {
@@ -17,7 +18,7 @@ public class LineBuilderService {
private static final double ANGLE_TOLERANCE = Math.PI / 6;
- public List buildLines(List characters, double characterSpacing, double lineSpacing) {
+ public List buildLines(List characters, double characterSpacing, double lineSpacing) {
double maxHorizontalDistance = characterSpacing * CHARACTER_SPACING_DISTANCE_MULTIPLIER;
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_CHARACTER_DISTANCE;
@@ -35,11 +36,11 @@ public class LineBuilderService {
});
});
- List lines = new ArrayList<>();
+ List lines = new ArrayList<>();
sets.forEach(group -> {
List lineComponents = new ArrayList<>(group);
lineComponents.sort(Comparator.comparingDouble(Character::getX));
- lines.add(new DocstrumSegmenter.ComponentLine(lineComponents));
+ lines.add(new CharacterLine(lineComponents));
});
return lines;
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/refactor/docstrum/CharacterLine.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/refactor/docstrum/CharacterLine.java
index 3b54287..f6bc990 100644
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/refactor/docstrum/CharacterLine.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/refactor/docstrum/CharacterLine.java
@@ -3,6 +3,10 @@ package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.m
import java.util.Arrays;
import java.util.List;
+import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Line;
+import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Word;
+import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.utils.BoundingBoxBuilder;
+
import lombok.Data;
@Data
@@ -90,11 +94,10 @@ public class CharacterLine {
public double horizontalDistance(CharacterLine other) {
double[] xs = new double[4];
- double s = 0, c = 1;
- xs[0] = c * x0 - s * y0;
- xs[1] = c * x1 - s * y1;
- xs[2] = c * other.x0 - s * other.y0;
- xs[3] = c * other.x1 - s * other.y1;
+ xs[0] = x0;
+ xs[1] = x1;
+ xs[2] = other.x0;
+ xs[3] = other.x1;
boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0];
Arrays.sort(xs);
return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1);
@@ -103,9 +106,34 @@ public class CharacterLine {
+ /**
+  * Returns the vertical distance between the midpoint of this line and the midpoint of {@code other}.
+  *
+  * Only the y-coordinates of the two midpoints are compared; the horizontal offset between the
+  * midpoints does not contribute to the result.
+  *
+  * @param other the line to measure against
+  * @return the absolute difference of the two midpoint y-coordinates (never negative)
+  */
public double verticalDistance(CharacterLine other) {
- double xm = (x0 + x1) / 2, ym = (y0 + y1) / 2;
- double xn = (other.x0 + other.x1) / 2, yn = (other.y0 + other.y1) / 2;
- return Math.abs((xn - xm) + ym - yn) / Math.sqrt(1);
+ double ym = (y0 + y1) / 2;
+ double yn = (other.y0 + other.y1) / 2;
+ // The former division by Math.sqrt(1) was a no-op and has been dropped.
+ return Math.abs(ym - yn);
}
-}
\ No newline at end of file
+
+ /**
+  * Converts this character line into a {@link Line} composed of {@link Word}s.
+  *
+  * The characters are visited in list order and appended to the current word. A new word is
+  * started whenever the gap between a character and its predecessor (current x minus the
+  * predecessor's x minus the predecessor's width, all direction-adjusted) is greater than
+  * {@code wordSpacing}. Bounds are set on every word before it is added to the line, and on
+  * the line itself once all words have been added.
+  *
+  * @param wordSpacing gap above which two consecutive characters are put into different words
+  * @return the line holding the resulting words
+  */
+ public Line convertToBxLine(double wordSpacing) {
+
+     Line bxLine = new Line();
+     Word currentWord = new Word();
+     Character last = null;
+     for (Character current : characters) {
+         // The first character has no predecessor, so it can never open a new word.
+         boolean gapExceedsWordSpacing = last != null
+                 && current.getTextPosition().getXDirAdj() - last.getTextPosition().getXDirAdj() - last.getTextPosition().getWidthDirAdj() > wordSpacing;
+         if (gapExceedsWordSpacing) {
+             // Finish the word collected so far and begin a fresh one.
+             BoundingBoxBuilder.setBounds(currentWord);
+             bxLine.addWord(currentWord);
+             currentWord = new Word();
+         }
+         currentWord.addChunk(current.getTextPosition());
+         last = current;
+     }
+     // The word still being collected when the loop ends has not been added yet.
+     BoundingBoxBuilder.setBounds(currentWord);
+     bxLine.addWord(currentWord);
+     BoundingBoxBuilder.setBounds(bxLine);
+     return bxLine;
+ }
+
+}
+
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/refactor/docstrum/CharacterZone.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/refactor/docstrum/CharacterZone.java
index 0b19599..97255d6 100644
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/refactor/docstrum/CharacterZone.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/refactor/docstrum/CharacterZone.java
@@ -3,6 +3,8 @@ package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.m
import java.util.ArrayList;
import java.util.List;
+import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BBoxObject;
+
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
@@ -10,8 +12,13 @@ import lombok.NoArgsConstructor;
+/**
+ * A zone made up of character lines, held in {@code lines}.
+ *
+ * NOTE(review): Lombok's {@code @Data} on a class with a superclass generates
+ * {@code equals}/{@code hashCode} that do not call {@code super} unless
+ * {@code @EqualsAndHashCode(callSuper = ...)} is specified - confirm this is intended
+ * now that the class extends {@code BBoxObject}.
+ */
@Data
@NoArgsConstructor
@AllArgsConstructor
-public class CharacterZone {
+public class CharacterZone extends BBoxObject {
private List lines = new ArrayList<>();
+
+ /**
+  * Currently has an empty body, so calling it has no effect.
+  *
+  * TODO(review): presumably meant to derive this zone's bounding box from its lines -
+  * confirm and implement, or remove.
+  */
+ public void buildBox() {
+
+ }
+
}
diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java
index e5981d5..56124e7 100644
--- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java
+++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java
@@ -25,7 +25,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
@SneakyThrows
public void testViewerDocument() {
- System.out.println("<<<<<<<<<<" + Math.sin(-0) + "aaa " + Math.cos(-0));
+ System.out.println("<<<<<<<<<<" + Math.sin(-0) + "aaa " + Math.cos(0) + Math.tan(0));
String fileName = "files/Plenarprotokoll 1 (keine Druchsache!) (1).pdf";
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";