diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmenter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmenter.java index 65848c0..896658f 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmenter.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmenter.java @@ -1,24 +1,20 @@ package com.knecon.fforesight.service.layoutparser.processor.services.docstrum; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collections; import java.util.Comparator; import java.util.List; -import java.util.Set; import java.util.stream.Collectors; import org.springframework.stereotype.Service; import com.google.common.collect.Lists; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; -import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.DisjointSets; -import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Histogram; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Line; -import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Word; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Zone; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.Character; -import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.Neighbor; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterLine; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterZone; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.utils.BoundingBoxBuilder; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.utils.ZoneUtils; @@ -31,103 +27,9 @@ public class DocstrumSegmenter { private final NearestNeighbourService nearestNeighbourService; private final SpacingService spacingService; private final LineBuilderService lineBuilderService; + private final ZoneBuilderService zoneBuilderService; public static final int MAX_ZONES_PER_PAGE = 300; - - private static final double DISTANCE_STEP = 16.0; - - /** - * Angle histogram resolution in radians per bin. - */ - private static final double ANGLE_HIST_RESOLUTION = Math.toRadians(0.5); - - /** - * Angle histogram smoothing window length in radians. - * Length of angle histogram is equal to pi. - */ - private static final double ANGLE_HIST_SMOOTHING_LEN = 0.25 * Math.PI; - - /** - * Angle histogram gaussian smoothing window standard deviation in radians. - */ - private static final double ANGLE_HIST_SMOOTHING_STDDEV = 0.0625 * Math.PI; - - /** - * Spacing histogram resolution per bin. - */ - private static final double SPACING_HIST_RESOLUTION = 0.5; - - /** - * Spacing histogram smoothing window length. - */ - private static final double SPACING_HIST_SMOOTHING_LEN = 2.5; - - /** - * Spacing histogram gaussian smoothing window standard deviation. - */ - private static final double SPACING_HIST_SMOOTHING_STDDEV = 0.5; - - /** - * Maximum vertical component distance multiplier used during line - * determination. - *

- * Maximum vertical distance between components (characters) that belong - * to the same line is equal to the product of this value and estimated - * between-line spacing. - */ - private static final double MAX_VERTICAL_COMP_DIST = 0.67; - - /** - * Minimum line size scale value. - *

- * During zone determination (merging lines into zones) line height is - * taken into account. To achieve this, line size scale is estimated and - * limited to range [minLineSizeScale, maxLineSizeScale]. - */ - private static final double MIN_LINE_SIZE_SCALE = 0.9; - - /** - * Maximum line size scale value. - *

- * See minLineSizeScale for more information. - */ - private static final double MAX_LINE_SIZE_SCALE = 2.5; - - /** - * Minimum horizontal line distance multiplier. - *

- * Minimum horizontal distance between lines that belong to the same zone - * is equal to the product of this value and estimated within-line spacing. - */ - private static final double MIN_HORIZONTAL_DIST = -0.5; - - /** - * Minimum vertical line distance multiplier. - *

- * Minimum vertical distance between lines that belong to the same zone - * is equal to the product of this value and estimated between-line spacing. - */ - private static final double MIN_VERTICAL_DIST = 0.0; - - /** - * Maximum vertical line distance multiplier. - *

- * Maximum vertical distance between lines that belong to the same zone - * is equal to the product of this value and estimated between-line spacing. - */ - private static final double MAX_VERTICAL_DIST = 1.2; - - /** - * Component distance character spacing multiplier. - *

- * Maximum distance between components that belong to the same line is - * equal to (lineSpacing * componentDistanceLineMultiplier + - * characterSpacing * componentDistanceCharacterMultiplier), where - * lineSpacing and characterSpacing are estimated between-line and - * within-line spacing, respectively. - */ - private static final double COMP_DIST_CHAR = 3.5; - /** * Word distance multiplier. *

@@ -136,37 +38,6 @@ public class DocstrumSegmenter { */ private static final double WORD_DIST_MULT = 0.2; - /** - * Minimum horizontal line merge distance multiplier. - *

- * Minimum horizontal distance between lines that should be merged is equal - * to the product of this value and estimated within-line spacing. - *

- * Because split lines do not overlap this value should be negative. - */ - - private static final double MIN_HORIZONTAL_MERGE_DIST = -3.0; - - /** - * Maximum vertical line merge distance multiplier. - *

- * Maximum vertical distance between lines that should be merged is equal - * to the product of this value and estimated between-line spacing. - */ - - private static final double MAX_VERTICAL_MERGE_DIST = 0.5; - - /** - * Angle tolerance for comparisons of angles between components and angles - * between lines. - */ - private static final double ANGLE_TOLERANCE = Math.PI / 6; - - /** - * Number of nearest-neighbors found per component. - */ - private static final int NEIGHBOUR_COUNT = 8; - public List segmentPage(List textPositions) { @@ -176,271 +47,31 @@ public class DocstrumSegmenter { nearestNeighbourService.findNearestNeighbors(components); - double orientation = 0; - double characterSpacing = spacingService.computeCharacterSpacing(components); double lineSpacing = spacingService.computeLineSpacing(components); - List lines = lineBuilderService.buildLines(components, characterSpacing, lineSpacing); + List lines = lineBuilderService.buildLines(components, characterSpacing, lineSpacing); - List> zones = determineZones(lines, - orientation, - characterSpacing * MIN_HORIZONTAL_DIST, - Double.POSITIVE_INFINITY, - lineSpacing * MIN_VERTICAL_DIST, - lineSpacing * MAX_VERTICAL_DIST, - characterSpacing * MIN_HORIZONTAL_MERGE_DIST, - 0.0, - 0.0, - lineSpacing * MAX_VERTICAL_MERGE_DIST); + List zones = zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing); return convertToBxModel(zones, WORD_DIST_MULT * characterSpacing); } - private void findNeighbors(Character[] components) { - - if (components.length == 0) { - return; - } - if (components.length == 1) { - components[0].setNeighbors(new ArrayList()); - return; - } - int pageNeighborCount = NEIGHBOUR_COUNT; - if (components.length <= NEIGHBOUR_COUNT) { - pageNeighborCount = components.length - 1; - } - - List candidates = new ArrayList(); - for (int i = 0; i < components.length; i++) { - int start = i, end = i + 1; - // Contains components from components array - // from ranges [start, i) and [i+1, end) - double dist = Double.POSITIVE_INFINITY; - for (double searchDist = 0; searchDist < dist; ) { - searchDist += DISTANCE_STEP; - boolean newCandidatesFound = false; - - while (start > 0 && components[i].getX() - components[start - 1].getX() < searchDist) { - start--; - candidates.add(new Neighbor(components[start], components[i])); - if (candidates.size() > pageNeighborCount) { - Collections.sort(candidates, NeighborDistanceComparator.getInstance()); - candidates.subList(pageNeighborCount, candidates.size()).clear(); - } - newCandidatesFound = true; - } - while (end < components.length && components[end].getX() - components[i].getX() < searchDist) { - candidates.add(new Neighbor(components[end], components[i])); - if (candidates.size() > pageNeighborCount) { - Collections.sort(candidates, NeighborDistanceComparator.getInstance()); - candidates.subList(pageNeighborCount, candidates.size()).clear(); - } - end++; - newCandidatesFound = true; - } - - if (newCandidatesFound && candidates.size() >= pageNeighborCount) { - Collections.sort(candidates, NeighborDistanceComparator.getInstance()); - dist = candidates.get(pageNeighborCount - 1).getDistance(); - } - } - candidates.subList(pageNeighborCount, candidates.size()).clear(); - components[i].setNeighbors(new ArrayList(candidates)); - candidates.clear(); - } - } - - - /** - * Computes initial orientation estimation based on nearest-neighbors' angles. - * - * @param components - * @return initial orientation estimation - */ - private double computeInitialOrientation(List components) { - - Histogram histogram = new Histogram(-Math.PI / 2, Math.PI / 2, ANGLE_HIST_RESOLUTION); - for (Character component : components) { - for (Neighbor neighbor : component.getNeighbors()) { - histogram.add(neighbor.getAngle()); - } - } - // Rectangular smoothing window has been replaced with gaussian smoothing window - histogram.circularGaussianSmooth(ANGLE_HIST_SMOOTHING_LEN, ANGLE_HIST_SMOOTHING_STDDEV); - return histogram.getPeakValue(); - } - - - /** - * Computes within-line spacing based on nearest-neighbors distances. - * - * @param components - * @param orientation estimated text orientation - * @return estimated within-line spacing - */ - private double computeCharacterSpacing(List components, double orientation) { - - return computeSpacing(components, orientation); - } - - - /** - * Computes between-line spacing based on nearest-neighbors distances. - * - * @param components - * @param orientation estimated text orientation - * @return estimated between-line spacing - */ - private double computeLineSpacing(List components, double orientation) { - - if (orientation >= 0) { - return computeSpacing(components, orientation - Math.PI / 2); - } else { - return computeSpacing(components, orientation + Math.PI / 2); - } - } - - - private double computeSpacing(List components, double angle) { - - double maxDistance = Double.NEGATIVE_INFINITY; - for (Character component : components) { - for (Neighbor neighbor : component.getNeighbors()) { - maxDistance = Math.max(maxDistance, neighbor.getDistance()); - } - } - Histogram histogram = new Histogram(0, maxDistance, SPACING_HIST_RESOLUTION); - AngleFilter filter = AngleFilter.newInstance(angle - ANGLE_TOLERANCE, angle + ANGLE_TOLERANCE); - for (Character component : components) { - for (Neighbor neighbor : component.getNeighbors()) { - if (filter.matches(neighbor)) { - histogram.add(neighbor.getDistance()); - } - } - } - // Rectangular smoothing window has been replaced with gaussian smoothing window - histogram.gaussianSmooth(SPACING_HIST_SMOOTHING_LEN, SPACING_HIST_SMOOTHING_STDDEV); - return histogram.getPeakValue(); - } - - - private List determineLines(List characters, double characterSpacing, double lineSpacing) { - - double maxHorizontalDistance = characterSpacing * COMP_DIST_CHAR; - double maxVerticalDistance = lineSpacing * MAX_VERTICAL_COMP_DIST; - -// DisjointSets sets = new DisjointSets(characters); -// AngleFilter filter = AngleFilter.newInstance(-ANGLE_TOLERANCE, ANGLE_TOLERANCE); -// for (Character component : characters) { -// for (Neighbor neighbor : component.getNeighbors()) { -// double x = neighbor.getHorizontalDistance() / maxHorizontalDistance; -// double y = neighbor.getVerticalDistance() / maxVerticalDistance; -// if (filter.matches(neighbor) && x * x + y * y <= 1) { -// sets.union(component, neighbor.getCharacter()); -// } -// } -// } -// List lines = new ArrayList(); -// for (Set group : sets) { -// List lineComponents = new ArrayList(group); -// lineComponents.sort(Comparator.comparingDouble(Character::getX)); -// lines.add(new ComponentLine(lineComponents)); -// } -// return lines; - - DisjointSets sets = new DisjointSets<>(characters); - AngleFilter filter = AngleFilter.newInstance(-ANGLE_TOLERANCE, ANGLE_TOLERANCE); - - characters.forEach(character -> { - character.getNeighbors().forEach(neighbor -> { - double x = neighbor.getHorizontalDistance() / maxHorizontalDistance; - double y = neighbor.getVerticalDistance() / maxVerticalDistance; - if (filter.matches(neighbor) && Math.pow(x, 2) + Math.pow(y, 2) <= 1) { - sets.union(character, neighbor.getCharacter()); - } - }); - }); - - List lines = new ArrayList<>(); - sets.forEach(group -> { - List lineComponents = new ArrayList<>(group); - lineComponents.sort(Comparator.comparingDouble(Character::getX)); - lines.add(new ComponentLine(lineComponents)); - }); - - return lines; - - } - - - private List> determineZones(List lines, - double orientation, - double minHorizontalDistance, - double maxHorizontalDistance, - double minVerticalDistance, - double maxVerticalDistance, - double minHorizontalMergeDistance, - double maxHorizontalMergeDistance, - double minVerticalMergeDistance, - double maxVerticalMergeDistance) { - - DisjointSets sets = new DisjointSets(lines); - // Mean height is computed so that all distances can be scaled - // relative to the line height - double meanHeight = 0.0, weights = 0.0; - for (ComponentLine line : lines) { - double weight = line.getLength(); - meanHeight += line.getHeight() * weight; - weights += weight; - } - meanHeight /= weights; - - for (int i = 0; i < lines.size(); i++) { - ComponentLine li = lines.get(i); - for (int j = i + 1; j < lines.size(); j++) { - ComponentLine lj = lines.get(j); - double scale = Math.min(li.getHeight(), lj.getHeight()) / meanHeight; - scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(scale, MAX_LINE_SIZE_SCALE)); - // "<=" is used instead of "<" for consistency and to allow setting minVertical(Merge)Distance - // to 0.0 with meaning "no minimal distance required" - if (!sets.areTogether(li, lj) && li.angularDifference(lj) <= ANGLE_TOLERANCE) { - double hDist = li.horizontalDistance(lj, orientation) / scale; - double vDist = li.verticalDistance(lj, orientation) / scale; - // Line over or above - if (minHorizontalDistance <= hDist && hDist <= maxHorizontalDistance && minVerticalDistance <= vDist && vDist <= maxVerticalDistance) { - sets.union(li, lj); - } - // Split line that needs later merging - else if (minHorizontalMergeDistance <= hDist && hDist <= maxHorizontalMergeDistance && minVerticalMergeDistance <= vDist && vDist <= maxVerticalMergeDistance) { - sets.union(li, lj); - } - } - } - } - List> zones = new ArrayList>(); - for (Set group : sets) { - zones.add(new ArrayList(group)); - } - return zones; - } - - - private List convertToBxModel(List> zones, double wordSpacing) { + private List convertToBxModel(List zones, double wordSpacing) { List zoneList = new ArrayList<>(); if (zones.size() > MAX_ZONES_PER_PAGE) { - List oneZone = new ArrayList(); - for (List zone : zones) { - oneZone.addAll(zone); + CharacterZone oneZone = new CharacterZone(); + for (CharacterZone zone : zones) { + oneZone.getLines().addAll(zone.getLines()); } zones = new ArrayList<>(); zones.add(oneZone); } - for (List lines : zones) { + for (CharacterZone characterZone : zones) { Zone zone = new Zone(); - for (ComponentLine line : lines) { + for (CharacterLine line : characterZone.getLines()) { zone.addLine(line.convertToBxLine(wordSpacing)); } List zLines = Lists.newArrayList(zone.getLines()); @@ -461,173 +92,4 @@ public class DocstrumSegmenter { return zoneList; } - - /** - * Neighbor distance comparator based on the distance. - *

- * The ordering is not consistent with equals. - */ - protected static final class NeighborDistanceComparator implements Comparator { - - private NeighborDistanceComparator() { - - } - - - @Override - public int compare(Neighbor o1, Neighbor o2) { - - return Double.compare(o1.getDistance(), o2.getDistance()); - } - - - private static final NeighborDistanceComparator instance = new NeighborDistanceComparator(); - - - public static NeighborDistanceComparator getInstance() { - - return instance; - } - - } - - /** - * Internal representation of the text line. - */ - protected static class ComponentLine { - - private final double x0; - private final double y0; - - private final double x1; - private final double y1; - - private final double height; - - private final List components; - - - public ComponentLine(List components) { - - this.components = components; - - if (components.size() >= 2) { - // Simple linear regression - double sx = 0.0, sxx = 0.0, sxy = 0.0, sy = 0.0; - for (Character component : components) { - sx += component.getX(); - sxx += component.getX() * component.getX(); - sxy += component.getX() * component.getY(); - sy += component.getY(); - } - double b = (components.size() * sxy - sx * sy) / (components.size() * sxx - sx * sx); - double a = (sy - b * sx) / components.size(); - - this.x0 = components.get(0).getX(); - this.y0 = a + b * this.x0; - this.x1 = components.get(components.size() - 1).getX(); - this.y1 = a + b * this.x1; - } else if (!components.isEmpty()) { - Character component = components.get(0); - double dx = component.getTextPosition().getWidthDirAdj() / 3; - double dy = dx * Math.tan(0); - this.x0 = component.getX() - dx; - this.x1 = component.getX() + dx; - this.y0 = component.getY() - dy; - this.y1 = component.getY() + dy; - } else { - throw new IllegalArgumentException("Component list must not be empty"); - } - height = computeHeight(); - } - - - public double getAngle() { - - return Math.atan2(y1 - y0, x1 - x0); - } - - - public double getLength() { - - return Math.sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1)); - } - - - private double computeHeight() { - - double sum = 0.0; - for (Character component : components) { - sum += component.getHeight(); - } - return sum / components.size(); - } - - - public double getHeight() { - - return height; - } - - - public double angularDifference(ComponentLine j) { - - double diff = Math.abs(getAngle() - j.getAngle()); - if (diff <= Math.PI / 2) { - return diff; - } else { - return Math.PI - diff; - } - } - - - public double horizontalDistance(ComponentLine other, double orientation) { - - double[] xs = new double[4]; - double s = Math.sin(-orientation), c = Math.cos(-orientation); - xs[0] = c * x0 - s * y0; - xs[1] = c * x1 - s * y1; - xs[2] = c * other.x0 - s * other.y0; - xs[3] = c * other.x1 - s * other.y1; - boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0]; - Arrays.sort(xs); - return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1); - } - - - public double verticalDistance(ComponentLine other, double orientation) { - - double xm = (x0 + x1) / 2, ym = (y0 + y1) / 2; - double xn = (other.x0 + other.x1) / 2, yn = (other.y0 + other.y1) / 2; - double a = Math.tan(orientation); - return Math.abs(a * (xn - xm) + ym - yn) / Math.sqrt(a * a + 1); - } - - - public Line convertToBxLine(double wordSpacing) { - - Line line = new Line(); - Word word = new Word(); - Character previousComponent = null; - for (Character component : components) { - if (previousComponent != null) { - double dist = component.getTextPosition().getXDirAdj() - previousComponent.getTextPosition().getXDirAdj() - previousComponent.getTextPosition() - .getWidthDirAdj(); - if (dist > wordSpacing) { - BoundingBoxBuilder.setBounds(word); - line.addWord(word); - word = new Word(); - } - } - word.addChunk(component.getTextPosition()); - previousComponent = component; - } - BoundingBoxBuilder.setBounds(word); - line.addWord(word); - BoundingBoxBuilder.setBounds(line); - return line; - } - - } - } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/LineBuilderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/LineBuilderService.java index 217329d..59a03fa 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/LineBuilderService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/LineBuilderService.java @@ -8,6 +8,7 @@ import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.DisjointSets; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.Character; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.docstrum.CharacterLine; @Service public class LineBuilderService { @@ -17,7 +18,7 @@ public class LineBuilderService { private static final double ANGLE_TOLERANCE = Math.PI / 6; - public List buildLines(List characters, double characterSpacing, double lineSpacing) { + public List buildLines(List characters, double characterSpacing, double lineSpacing) { double maxHorizontalDistance = characterSpacing * CHARACTER_SPACING_DISTANCE_MULTIPLIER; double maxVerticalDistance = lineSpacing * MAX_VERTICAL_CHARACTER_DISTANCE; @@ -35,11 +36,11 @@ public class LineBuilderService { }); }); - List lines = new ArrayList<>(); + List lines = new ArrayList<>(); sets.forEach(group -> { List lineComponents = new ArrayList<>(group); lineComponents.sort(Comparator.comparingDouble(Character::getX)); - lines.add(new DocstrumSegmenter.ComponentLine(lineComponents)); + lines.add(new CharacterLine(lineComponents)); }); return lines; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/refactor/docstrum/CharacterLine.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/refactor/docstrum/CharacterLine.java index 3b54287..f6bc990 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/refactor/docstrum/CharacterLine.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/refactor/docstrum/CharacterLine.java @@ -3,6 +3,10 @@ package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.m import java.util.Arrays; import java.util.List; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Line; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.Word; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.utils.BoundingBoxBuilder; + import lombok.Data; @Data @@ -90,11 +94,10 @@ public class CharacterLine { public double horizontalDistance(CharacterLine other) { double[] xs = new double[4]; - double s = 0, c = 1; - xs[0] = c * x0 - s * y0; - xs[1] = c * x1 - s * y1; - xs[2] = c * other.x0 - s * other.y0; - xs[3] = c * other.x1 - s * other.y1; + xs[0] = x0; + xs[1] = x1; + xs[2] = other.x0; + xs[3] = other.x1; boolean overlapping = xs[1] >= xs[2] && xs[3] >= xs[0]; Arrays.sort(xs); return Math.abs(xs[2] - xs[1]) * (overlapping ? 1 : -1); @@ -103,9 +106,34 @@ public class CharacterLine { public double verticalDistance(CharacterLine other) { - double xm = (x0 + x1) / 2, ym = (y0 + y1) / 2; - double xn = (other.x0 + other.x1) / 2, yn = (other.y0 + other.y1) / 2; - return Math.abs((xn - xm) + ym - yn) / Math.sqrt(1); + double ym = (y0 + y1) / 2; + double yn = (other.y0 + other.y1) / 2; + return Math.abs(ym - yn) / Math.sqrt(1); } -} \ No newline at end of file + + public Line convertToBxLine(double wordSpacing) { + + Line line = new Line(); + Word word = new Word(); + Character previousComponent = null; + for (Character component : characters) { + if (previousComponent != null) { + double dist = component.getTextPosition().getXDirAdj() - previousComponent.getTextPosition().getXDirAdj() - previousComponent.getTextPosition().getWidthDirAdj(); + if (dist > wordSpacing) { + BoundingBoxBuilder.setBounds(word); + line.addWord(word); + word = new Word(); + } + } + word.addChunk(component.getTextPosition()); + previousComponent = component; + } + BoundingBoxBuilder.setBounds(word); + line.addWord(word); + BoundingBoxBuilder.setBounds(line); + return line; + } + +} + diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/refactor/docstrum/CharacterZone.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/refactor/docstrum/CharacterZone.java index 0b19599..97255d6 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/refactor/docstrum/CharacterZone.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/model/refactor/docstrum/CharacterZone.java @@ -3,6 +3,8 @@ package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.m import java.util.ArrayList; import java.util.List; +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.refactor.BBoxObject; + import lombok.AllArgsConstructor; import lombok.Data; import lombok.NoArgsConstructor; @@ -10,8 +12,13 @@ import lombok.NoArgsConstructor; @Data @NoArgsConstructor @AllArgsConstructor -public class CharacterZone { +public class CharacterZone extends BBoxObject { private List lines = new ArrayList<>(); + + public void buildBox() { + + } + } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index e5981d5..56124e7 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -25,7 +25,7 @@ public class ViewerDocumentTest extends BuildDocumentTest { @SneakyThrows public void testViewerDocument() { - System.out.println("<<<<<<<<<<" + Math.sin(-0) + "aaa " + Math.cos(-0)); + System.out.println("<<<<<<<<<<" + Math.sin(-0) + "aaa " + Math.cos(0) + Math.tan(0)); String fileName = "files/Plenarprotokoll 1 (keine Druchsache!) (1).pdf"; String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";