diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/BoundingBox.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/BoundingBox.java index 9f79eed..ae2fd62 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/BoundingBox.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/BoundingBox.java @@ -135,6 +135,12 @@ public abstract class BoundingBox { } + public boolean intersectsYJava(BoundingBox other) { + + return this.getY() <= other.getMaxY() && this.getMaxY() >= other.getY(); + } + + public boolean intersectsY(BoundingBox other, float threshold) { return this.getPdfMinY() - threshold <= other.getPdfMaxY() && this.getPdfMaxY() + threshold >= other.getPdfMinY(); @@ -143,7 +149,13 @@ public abstract class BoundingBox { public boolean intersectsX(BoundingBox other) { - return this.getPdfMinX() <= other.getMaxX() && this.getMaxX() >= other.getPdfMinX(); + return this.getPdfMinX() <= other.getPdfMaxX() && this.getPdfMaxX() >= other.getPdfMinX(); + } + + + public boolean intersectsXJava(BoundingBox other) { + + return this.getX() <= other.getMaxX() && this.getMaxX() >= other.getMinX(); } @@ -182,4 +194,60 @@ public abstract class BoundingBox { } }; + + public double horizontalDistance(BoundingBox other) { + + Rectangle2D left; + Rectangle2D right; + if (this.leftOf(other)) { + left = this.getBBox(); + right = other.getBBox(); + } else { + left = other.getBBox(); + right = this.getBBox(); + } + + return Math.max(0, right.getMinX() - left.getMaxX()); + } + + + public double verticalDistance(BoundingBox other) { + + Rectangle2D bottom; + Rectangle2D top; + if (this.isAbove(other)) { + top = this.getBBox(); + bottom = other.getBBox(); + } else { + bottom = this.getBBox(); + top = other.getBBox(); + } + + return Math.max(0, bottom.getMinY() - top.getMaxY()); + } + + + public boolean rightOf(BoundingBox other) { + + return this.intersectsYJava(other) && other.getMaxX() <= this.getMinX(); + } + + + public boolean leftOf(BoundingBox other) { + + return this.intersectsYJava(other) && other.getMinX() >= this.getMaxX(); + } + + + public boolean isAbove(BoundingBox other) { + + return this.intersectsXJava(other) && other.getMinY() >= this.getMaxY(); + } + + + public boolean isBelow(BoundingBox other) { + + return this.intersectsXJava(other) && this.getMinY() >= other.getMaxY(); + } + } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/ColumnDetector.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/ColumnDetector.java new file mode 100644 index 0000000..2c5960a --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/ColumnDetector.java @@ -0,0 +1,323 @@ +package com.knecon.fforesight.service.layoutparser.processor.docstrum.model; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Set; + + +/* +WIP, mostly working, needs to be tested a bit more + */ +public class ColumnDetector { + + public static final double MAX_VALUE_THRESHOLD = 0.5; + final static int bins_num = 128; + final static int globalStartIdx = 0; // ignore outer parts completely, we don't expect columns there + final static int globalEndIdx = bins_num; // i chose 7, since thirds seems a likely split for columns, therefore divided by 6 would eliminate those. + public static final double DERIVATIVE_ZERO_THRESHOLD = 1e-10; + public static final double MINIMUM_THRESHOLD_FOR_COLUMNS = 0.05; + double minY; + double maxY; + double midY; + double[] histogram; + double min; + double max; + double resolution; + double sum; + int N; + + + public ColumnDetector(double min, double max, double minY, double maxY) { + + this.min = min; + this.max = max; + this.minY = minY; + this.maxY = maxY; + this.midY = maxY - minY; + this.resolution = (max - min) / bins_num; + this.histogram = new double[bins_num]; + } + + + public void add(BoundingBox zone) { + + N++; + double weight = computeWeight(zone); + int start = (int) ((zone.getMinX() - min) / resolution); + int end = (int) ((zone.getMaxX() - min) / resolution); + for (int i = start; i < end; i++) { + histogram[i] += weight; + sum += histogram[i]; + } + } + + + private double computeWeight(BoundingBox zone) { + + double areaWeight = zone.getBBox().getHeight(); + + double relativeDistance = relativeDistanceToMiddle(zone.getBBox().getCenterY()); + + double distanceWeight; + if (relativeDistance < 0.6) { + distanceWeight = 1; + } else if (relativeDistance < 0.8) { + distanceWeight = 0.8; + } else { + distanceWeight = 0.1; + } + + return areaWeight * distanceWeight; + } + + + private double relativeDistanceToMiddle(double y) { + + double range = (maxY - minY) / 2; + double mid = minY + range; + + return Math.abs(y - mid) / range; + } + + + public double[] computeDerivative() { + + int length = histogram.length; + double[] derivative = new double[length]; + + for (int i = 0; i < length; i++) { + if (i == 0) { + derivative[i] = (histogram[i + 1] - histogram[i]) / resolution; + } else if (i == length - 1) { + derivative[i] = (histogram[i] - histogram[i - 1]) / resolution; + } else { + derivative[i] = (histogram[i + 1] - histogram[i - 1]) / (2 * resolution); + } + } + + return derivative; + } + + + public double calcMean(double[] arr, int start, int end) { + + if (start == end) { + return 0; + } + double sum = 0; + for (int i = start; i < end; i++) { + sum += arr[i]; + } + return sum / (end - start); + } + + + /* + Find columns, by finding all local maxima/minima of the derivative. Filtering them for the ones with the biggest values. + For each found minima, we will step to the right until we hit a 0 in the derivative, this indicates a minimum in the main histogram. If this minimum is below a threshold, it is deemed a column divider. + Same goes for maxima, but stepping to the left now, since minima in the function will always be to the left of a maximum in its derivative. + */ + public List determineColumnsWithDerivative(double[] derivative) { + + assert derivative.length == histogram.length; + + Set columnIndices = new HashSet<>(); + double mean = calcMean(histogram, 0, histogram.length); + double maxDvValue = calcMax(derivative); + double minDvValue = calcMin(derivative); + + if (maxDvValue - minDvValue < mean * MAX_VALUE_THRESHOLD) { + Collections.emptyList(); + } + + Extrema derivativeExtrema = calculateNearGlobalExtrema(derivative, maxDvValue, minDvValue); + + List columnsRightOfMinima = findZerosToTheRightOfMinima(derivative, derivativeExtrema.minima(), mean); + columnIndices.addAll(columnsRightOfMinima); + + List columnsLeftOfMaxima = findZerosToTheLeftOfMaxima(derivative, derivativeExtrema.maxima(), mean); + columnIndices.addAll(columnsLeftOfMaxima); + + return columnIndices.stream() + .sorted(Comparator.naturalOrder()) + .map(this::calculateXCoordinateFromIdx) + .toList(); + } + + + private List findZerosToTheLeftOfMaxima(double[] derivative, List derivativeMaxima, double mean) { + + List columnsLeftOfMaxima = new ArrayList<>(); + + for (int i = 0; i < derivativeMaxima.size(); i++) { + List consecutiveZeroes = new LinkedList<>(); + boolean maximumFound = false; + int maximaIdx = derivativeMaxima.get(i) - 1; // the highest derivative will always be at least one step away from the lowest value. + int endIdx = (int) Math.max(globalStartIdx, + Math.min(maximaIdx - 1, + maximaIdx - 0.1 * bins_num)); // search through 10% of array to the right, but at least one step and at most to the left edge; + + for (int j = maximaIdx; j >= endIdx; j--) { + if (derivative[j] < DERIVATIVE_ZERO_THRESHOLD) { + maximumFound = true; + consecutiveZeroes.add(j); + } else if (maximumFound) { + break; + } + } + if (maximumFound) { + int midIdx = consecutiveZeroes.size() / 2; + int middleMinimumIdx = consecutiveZeroes.get(midIdx); + if (histogram[middleMinimumIdx] < mean * MINIMUM_THRESHOLD_FOR_COLUMNS) { + columnsLeftOfMaxima.add(middleMinimumIdx); + } + } + } + return columnsLeftOfMaxima; + } + + + private List findZerosToTheRightOfMinima(double[] derivative, List derivativeMinima, double mean) { + + List columnIndixes = new LinkedList<>(); + for (int i = 0; i < derivativeMinima.size(); i++) { + List consecutiveZeroes = new LinkedList<>(); + boolean minimumFound = false; + int minimaIdx = derivativeMinima.get(i) + 1; // the highest derivative will always be at least one step earlier than the lowest value. + int endIdx = (int) Math.min(globalEndIdx, + Math.max(minimaIdx + 1, + minimaIdx + 0.1 * bins_num)); // search through 10% of array to the right, but at least one step and at most to the right edge; + + for (int j = minimaIdx; j < endIdx; j++) { + if (derivative[j] < DERIVATIVE_ZERO_THRESHOLD) { + minimumFound = true; + consecutiveZeroes.add(j); + } else if (minimumFound) { + break; + } + } + if (minimumFound) { + int midIdx = consecutiveZeroes.size() / 2; + int middleMinimumIdx = consecutiveZeroes.get(midIdx); + if (histogram[middleMinimumIdx] < mean * MINIMUM_THRESHOLD_FOR_COLUMNS) { + columnIndixes.add(middleMinimumIdx); + } + } + } + return columnIndixes; + } + + + private double calcMax(double[] array) { + + double max = Double.NEGATIVE_INFINITY; + for (int i = 0; i < array.length; i++) { + if (array[i] > max) { + max = array[i]; + } + } + return max; + } + + + private double calcMin(double[] array) { + + double min = Double.POSITIVE_INFINITY; + for (int i = 0; i < array.length; i++) { + if (array[i] < min) { + min = array[i]; + } + } + return min; + } + + + private Extrema calculateNearGlobalExtrema(double[] derivative, double maxDvValue, double minDvValue) { + + List nearGlobalDvMaximaIdx = new LinkedList<>(); + List nearGlobalDvMinimaIdx = new LinkedList<>(); + for (int i = globalStartIdx; i < globalEndIdx; i++) { + if (derivative[i] <= minDvValue * 0.8) { + nearGlobalDvMinimaIdx.add(i); + } + if (derivative[i] >= maxDvValue * 0.8) { + nearGlobalDvMaximaIdx.add(i); + } + } + + nearGlobalDvMinimaIdx = removeConsecutive(nearGlobalDvMinimaIdx); + nearGlobalDvMaximaIdx = removeConsecutive(nearGlobalDvMaximaIdx); + + return new Extrema(nearGlobalDvMaximaIdx, nearGlobalDvMinimaIdx); + } + + + private record Extrema(List maxima, List minima) { + + } + + + private Double calculateXCoordinateFromIdx(int globalMinIdx) { + + return min + ((globalMinIdx + 1) * resolution); + } + + + public static List removeConsecutive(List numbers) { + + List result = new ArrayList<>(); + if (numbers == null || numbers.isEmpty()) { + return result; + } + + result.add(numbers.get(0)); // Add the first number + + for (int i = 1; i < numbers.size(); i++) { + if (numbers.get(i) != numbers.get(i - 1) + 1) { + result.add(numbers.get(i)); // Add non-consecutive numbers + } + } + + return result; + } + + + public void kernelSmooth(double[] kernel) { + + double[] newFrequencies = new double[histogram.length]; + int shift = (kernel.length - 1) / 2; + for (int i = 0; i < kernel.length; i++) { + int jStart = Math.max(0, i - shift); + int jEnd = Math.min(histogram.length, histogram.length + i - shift); + for (int j = jStart; j < jEnd; j++) { + newFrequencies[j - i + shift] += kernel[i] * histogram[j]; + } + } + histogram = newFrequencies; + } + + + public double[] createGaussianKernel(int length, double stdDeviation) { + + int r = length / 2; + + int size = 2 * r + 1; + double[] kernel = new double[size]; + double sum = 0; + double b = 2 * (stdDeviation) * (stdDeviation); + double a = 1 / Math.sqrt(Math.PI * b); + for (int i = 0; i < size; i++) { + kernel[i] = a * Math.exp(-(i - r) * (i - r) / b); + sum += kernel[i]; + } + for (int i = 0; i < size; i++) { + kernel[i] /= sum; + } + return kernel; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java index cca8558..b6e09f5 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java @@ -117,7 +117,7 @@ public class SectionNodeFactory { if (abstractPageBlock instanceof TextPageBlock) { switch (layoutParsingType) { - case REDACT_MANAGER, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> { + case REDACT_MANAGER, DOCUMINE, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> { alreadyMerged.add(abstractPageBlock); remainingBlocks.remove(abstractPageBlock); DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, new ArrayList<>(), layoutParsingType); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PageInformation.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PageInformation.java index 0b53d74..feb340b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PageInformation.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PageInformation.java @@ -22,6 +22,15 @@ public record PageInformation(Rectangle2D mediabox, int number, int rotationDegr } + public double heightRot() { + + if (rotationDegrees == 90 || rotationDegrees == 270) { + return width(); + } + return height(); + } + + public double width() { return mediabox.getWidth(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionSequenceComparator.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionSequenceComparator.java index b002dbc..52974d3 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionSequenceComparator.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionSequenceComparator.java @@ -39,21 +39,21 @@ public class TextPositionSequenceComparator implements Comparator= pos1YTop && pos2YBottom <= pos1YBottom || pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom) { - return Float.compare(x1, x2); + return Double.compare(x1, x2); } else if (pos1YBottom < pos2YBottom) { return -1; } else {