RED-8825: some fixes, and experimental column detector
This commit is contained in:
parent
07733d0855
commit
e935cc7b14
@ -135,6 +135,12 @@ public abstract class BoundingBox {
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsYJava(BoundingBox other) {
|
||||
|
||||
return this.getY() <= other.getMaxY() && this.getMaxY() >= other.getY();
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsY(BoundingBox other, float threshold) {
|
||||
|
||||
return this.getPdfMinY() - threshold <= other.getPdfMaxY() && this.getPdfMaxY() + threshold >= other.getPdfMinY();
|
||||
@ -143,7 +149,13 @@ public abstract class BoundingBox {
|
||||
|
||||
public boolean intersectsX(BoundingBox other) {
|
||||
|
||||
return this.getPdfMinX() <= other.getMaxX() && this.getMaxX() >= other.getPdfMinX();
|
||||
return this.getPdfMinX() <= other.getPdfMaxX() && this.getPdfMaxX() >= other.getPdfMinX();
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsXJava(BoundingBox other) {
|
||||
|
||||
return this.getX() <= other.getMaxX() && this.getMaxX() >= other.getMinX();
|
||||
}
|
||||
|
||||
|
||||
@ -182,4 +194,60 @@ public abstract class BoundingBox {
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
public double horizontalDistance(BoundingBox other) {
|
||||
|
||||
Rectangle2D left;
|
||||
Rectangle2D right;
|
||||
if (this.leftOf(other)) {
|
||||
left = this.getBBox();
|
||||
right = other.getBBox();
|
||||
} else {
|
||||
left = other.getBBox();
|
||||
right = this.getBBox();
|
||||
}
|
||||
|
||||
return Math.max(0, right.getMinX() - left.getMaxX());
|
||||
}
|
||||
|
||||
|
||||
public double verticalDistance(BoundingBox other) {
|
||||
|
||||
Rectangle2D bottom;
|
||||
Rectangle2D top;
|
||||
if (this.isAbove(other)) {
|
||||
top = this.getBBox();
|
||||
bottom = other.getBBox();
|
||||
} else {
|
||||
bottom = this.getBBox();
|
||||
top = other.getBBox();
|
||||
}
|
||||
|
||||
return Math.max(0, bottom.getMinY() - top.getMaxY());
|
||||
}
|
||||
|
||||
|
||||
public boolean rightOf(BoundingBox other) {
|
||||
|
||||
return this.intersectsYJava(other) && other.getMaxX() <= this.getMinX();
|
||||
}
|
||||
|
||||
|
||||
public boolean leftOf(BoundingBox other) {
|
||||
|
||||
return this.intersectsYJava(other) && other.getMinX() >= this.getMaxX();
|
||||
}
|
||||
|
||||
|
||||
public boolean isAbove(BoundingBox other) {
|
||||
|
||||
return this.intersectsXJava(other) && other.getMinY() >= this.getMaxY();
|
||||
}
|
||||
|
||||
|
||||
public boolean isBelow(BoundingBox other) {
|
||||
|
||||
return this.intersectsXJava(other) && this.getMinY() >= other.getMaxY();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,323 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
|
||||
/*
|
||||
WIP, mostly working, needs to be tested a bit more
|
||||
*/
|
||||
public class ColumnDetector {
|
||||
|
||||
public static final double MAX_VALUE_THRESHOLD = 0.5;
|
||||
final static int bins_num = 128;
|
||||
final static int globalStartIdx = 0; // ignore outer parts completely, we don't expect columns there
|
||||
final static int globalEndIdx = bins_num; // i chose 7, since thirds seems a likely split for columns, therefore divided by 6 would eliminate those.
|
||||
public static final double DERIVATIVE_ZERO_THRESHOLD = 1e-10;
|
||||
public static final double MINIMUM_THRESHOLD_FOR_COLUMNS = 0.05;
|
||||
double minY;
|
||||
double maxY;
|
||||
double midY;
|
||||
double[] histogram;
|
||||
double min;
|
||||
double max;
|
||||
double resolution;
|
||||
double sum;
|
||||
int N;
|
||||
|
||||
|
||||
public ColumnDetector(double min, double max, double minY, double maxY) {
|
||||
|
||||
this.min = min;
|
||||
this.max = max;
|
||||
this.minY = minY;
|
||||
this.maxY = maxY;
|
||||
this.midY = maxY - minY;
|
||||
this.resolution = (max - min) / bins_num;
|
||||
this.histogram = new double[bins_num];
|
||||
}
|
||||
|
||||
|
||||
public void add(BoundingBox zone) {
|
||||
|
||||
N++;
|
||||
double weight = computeWeight(zone);
|
||||
int start = (int) ((zone.getMinX() - min) / resolution);
|
||||
int end = (int) ((zone.getMaxX() - min) / resolution);
|
||||
for (int i = start; i < end; i++) {
|
||||
histogram[i] += weight;
|
||||
sum += histogram[i];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private double computeWeight(BoundingBox zone) {
|
||||
|
||||
double areaWeight = zone.getBBox().getHeight();
|
||||
|
||||
double relativeDistance = relativeDistanceToMiddle(zone.getBBox().getCenterY());
|
||||
|
||||
double distanceWeight;
|
||||
if (relativeDistance < 0.6) {
|
||||
distanceWeight = 1;
|
||||
} else if (relativeDistance < 0.8) {
|
||||
distanceWeight = 0.8;
|
||||
} else {
|
||||
distanceWeight = 0.1;
|
||||
}
|
||||
|
||||
return areaWeight * distanceWeight;
|
||||
}
|
||||
|
||||
|
||||
private double relativeDistanceToMiddle(double y) {
|
||||
|
||||
double range = (maxY - minY) / 2;
|
||||
double mid = minY + range;
|
||||
|
||||
return Math.abs(y - mid) / range;
|
||||
}
|
||||
|
||||
|
||||
public double[] computeDerivative() {
|
||||
|
||||
int length = histogram.length;
|
||||
double[] derivative = new double[length];
|
||||
|
||||
for (int i = 0; i < length; i++) {
|
||||
if (i == 0) {
|
||||
derivative[i] = (histogram[i + 1] - histogram[i]) / resolution;
|
||||
} else if (i == length - 1) {
|
||||
derivative[i] = (histogram[i] - histogram[i - 1]) / resolution;
|
||||
} else {
|
||||
derivative[i] = (histogram[i + 1] - histogram[i - 1]) / (2 * resolution);
|
||||
}
|
||||
}
|
||||
|
||||
return derivative;
|
||||
}
|
||||
|
||||
|
||||
public double calcMean(double[] arr, int start, int end) {
|
||||
|
||||
if (start == end) {
|
||||
return 0;
|
||||
}
|
||||
double sum = 0;
|
||||
for (int i = start; i < end; i++) {
|
||||
sum += arr[i];
|
||||
}
|
||||
return sum / (end - start);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
Find columns, by finding all local maxima/minima of the derivative. Filtering them for the ones with the biggest values.
|
||||
For each found minima, we will step to the right until we hit a 0 in the derivative, this indicates a minimum in the main histogram. If this minimum is below a threshold, it is deemed a column divider.
|
||||
Same goes for maxima, but stepping to the left now, since minima in the function will always be to the left of a maximum in its derivative.
|
||||
*/
|
||||
public List<Double> determineColumnsWithDerivative(double[] derivative) {
|
||||
|
||||
assert derivative.length == histogram.length;
|
||||
|
||||
Set<Integer> columnIndices = new HashSet<>();
|
||||
double mean = calcMean(histogram, 0, histogram.length);
|
||||
double maxDvValue = calcMax(derivative);
|
||||
double minDvValue = calcMin(derivative);
|
||||
|
||||
if (maxDvValue - minDvValue < mean * MAX_VALUE_THRESHOLD) {
|
||||
Collections.emptyList();
|
||||
}
|
||||
|
||||
Extrema derivativeExtrema = calculateNearGlobalExtrema(derivative, maxDvValue, minDvValue);
|
||||
|
||||
List<Integer> columnsRightOfMinima = findZerosToTheRightOfMinima(derivative, derivativeExtrema.minima(), mean);
|
||||
columnIndices.addAll(columnsRightOfMinima);
|
||||
|
||||
List<Integer> columnsLeftOfMaxima = findZerosToTheLeftOfMaxima(derivative, derivativeExtrema.maxima(), mean);
|
||||
columnIndices.addAll(columnsLeftOfMaxima);
|
||||
|
||||
return columnIndices.stream()
|
||||
.sorted(Comparator.naturalOrder())
|
||||
.map(this::calculateXCoordinateFromIdx)
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
private List<Integer> findZerosToTheLeftOfMaxima(double[] derivative, List<Integer> derivativeMaxima, double mean) {
|
||||
|
||||
List<Integer> columnsLeftOfMaxima = new ArrayList<>();
|
||||
|
||||
for (int i = 0; i < derivativeMaxima.size(); i++) {
|
||||
List<Integer> consecutiveZeroes = new LinkedList<>();
|
||||
boolean maximumFound = false;
|
||||
int maximaIdx = derivativeMaxima.get(i) - 1; // the highest derivative will always be at least one step away from the lowest value.
|
||||
int endIdx = (int) Math.max(globalStartIdx,
|
||||
Math.min(maximaIdx - 1,
|
||||
maximaIdx - 0.1 * bins_num)); // search through 10% of array to the right, but at least one step and at most to the left edge;
|
||||
|
||||
for (int j = maximaIdx; j >= endIdx; j--) {
|
||||
if (derivative[j] < DERIVATIVE_ZERO_THRESHOLD) {
|
||||
maximumFound = true;
|
||||
consecutiveZeroes.add(j);
|
||||
} else if (maximumFound) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (maximumFound) {
|
||||
int midIdx = consecutiveZeroes.size() / 2;
|
||||
int middleMinimumIdx = consecutiveZeroes.get(midIdx);
|
||||
if (histogram[middleMinimumIdx] < mean * MINIMUM_THRESHOLD_FOR_COLUMNS) {
|
||||
columnsLeftOfMaxima.add(middleMinimumIdx);
|
||||
}
|
||||
}
|
||||
}
|
||||
return columnsLeftOfMaxima;
|
||||
}
|
||||
|
||||
|
||||
private List<Integer> findZerosToTheRightOfMinima(double[] derivative, List<Integer> derivativeMinima, double mean) {
|
||||
|
||||
List<Integer> columnIndixes = new LinkedList<>();
|
||||
for (int i = 0; i < derivativeMinima.size(); i++) {
|
||||
List<Integer> consecutiveZeroes = new LinkedList<>();
|
||||
boolean minimumFound = false;
|
||||
int minimaIdx = derivativeMinima.get(i) + 1; // the highest derivative will always be at least one step earlier than the lowest value.
|
||||
int endIdx = (int) Math.min(globalEndIdx,
|
||||
Math.max(minimaIdx + 1,
|
||||
minimaIdx + 0.1 * bins_num)); // search through 10% of array to the right, but at least one step and at most to the right edge;
|
||||
|
||||
for (int j = minimaIdx; j < endIdx; j++) {
|
||||
if (derivative[j] < DERIVATIVE_ZERO_THRESHOLD) {
|
||||
minimumFound = true;
|
||||
consecutiveZeroes.add(j);
|
||||
} else if (minimumFound) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (minimumFound) {
|
||||
int midIdx = consecutiveZeroes.size() / 2;
|
||||
int middleMinimumIdx = consecutiveZeroes.get(midIdx);
|
||||
if (histogram[middleMinimumIdx] < mean * MINIMUM_THRESHOLD_FOR_COLUMNS) {
|
||||
columnIndixes.add(middleMinimumIdx);
|
||||
}
|
||||
}
|
||||
}
|
||||
return columnIndixes;
|
||||
}
|
||||
|
||||
|
||||
private double calcMax(double[] array) {
|
||||
|
||||
double max = Double.NEGATIVE_INFINITY;
|
||||
for (int i = 0; i < array.length; i++) {
|
||||
if (array[i] > max) {
|
||||
max = array[i];
|
||||
}
|
||||
}
|
||||
return max;
|
||||
}
|
||||
|
||||
|
||||
private double calcMin(double[] array) {
|
||||
|
||||
double min = Double.POSITIVE_INFINITY;
|
||||
for (int i = 0; i < array.length; i++) {
|
||||
if (array[i] < min) {
|
||||
min = array[i];
|
||||
}
|
||||
}
|
||||
return min;
|
||||
}
|
||||
|
||||
|
||||
private Extrema calculateNearGlobalExtrema(double[] derivative, double maxDvValue, double minDvValue) {
|
||||
|
||||
List<Integer> nearGlobalDvMaximaIdx = new LinkedList<>();
|
||||
List<Integer> nearGlobalDvMinimaIdx = new LinkedList<>();
|
||||
for (int i = globalStartIdx; i < globalEndIdx; i++) {
|
||||
if (derivative[i] <= minDvValue * 0.8) {
|
||||
nearGlobalDvMinimaIdx.add(i);
|
||||
}
|
||||
if (derivative[i] >= maxDvValue * 0.8) {
|
||||
nearGlobalDvMaximaIdx.add(i);
|
||||
}
|
||||
}
|
||||
|
||||
nearGlobalDvMinimaIdx = removeConsecutive(nearGlobalDvMinimaIdx);
|
||||
nearGlobalDvMaximaIdx = removeConsecutive(nearGlobalDvMaximaIdx);
|
||||
|
||||
return new Extrema(nearGlobalDvMaximaIdx, nearGlobalDvMinimaIdx);
|
||||
}
|
||||
|
||||
|
||||
private record Extrema(List<Integer> maxima, List<Integer> minima) {
|
||||
|
||||
}
|
||||
|
||||
|
||||
private Double calculateXCoordinateFromIdx(int globalMinIdx) {
|
||||
|
||||
return min + ((globalMinIdx + 1) * resolution);
|
||||
}
|
||||
|
||||
|
||||
public static List<Integer> removeConsecutive(List<Integer> numbers) {
|
||||
|
||||
List<Integer> result = new ArrayList<>();
|
||||
if (numbers == null || numbers.isEmpty()) {
|
||||
return result;
|
||||
}
|
||||
|
||||
result.add(numbers.get(0)); // Add the first number
|
||||
|
||||
for (int i = 1; i < numbers.size(); i++) {
|
||||
if (numbers.get(i) != numbers.get(i - 1) + 1) {
|
||||
result.add(numbers.get(i)); // Add non-consecutive numbers
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
public void kernelSmooth(double[] kernel) {
|
||||
|
||||
double[] newFrequencies = new double[histogram.length];
|
||||
int shift = (kernel.length - 1) / 2;
|
||||
for (int i = 0; i < kernel.length; i++) {
|
||||
int jStart = Math.max(0, i - shift);
|
||||
int jEnd = Math.min(histogram.length, histogram.length + i - shift);
|
||||
for (int j = jStart; j < jEnd; j++) {
|
||||
newFrequencies[j - i + shift] += kernel[i] * histogram[j];
|
||||
}
|
||||
}
|
||||
histogram = newFrequencies;
|
||||
}
|
||||
|
||||
|
||||
public double[] createGaussianKernel(int length, double stdDeviation) {
|
||||
|
||||
int r = length / 2;
|
||||
|
||||
int size = 2 * r + 1;
|
||||
double[] kernel = new double[size];
|
||||
double sum = 0;
|
||||
double b = 2 * (stdDeviation) * (stdDeviation);
|
||||
double a = 1 / Math.sqrt(Math.PI * b);
|
||||
for (int i = 0; i < size; i++) {
|
||||
kernel[i] = a * Math.exp(-(i - r) * (i - r) / b);
|
||||
sum += kernel[i];
|
||||
}
|
||||
for (int i = 0; i < size; i++) {
|
||||
kernel[i] /= sum;
|
||||
}
|
||||
return kernel;
|
||||
}
|
||||
|
||||
}
|
||||
@ -117,7 +117,7 @@ public class SectionNodeFactory {
|
||||
if (abstractPageBlock instanceof TextPageBlock) {
|
||||
|
||||
switch (layoutParsingType) {
|
||||
case REDACT_MANAGER, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> {
|
||||
case REDACT_MANAGER, DOCUMINE, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> {
|
||||
alreadyMerged.add(abstractPageBlock);
|
||||
remainingBlocks.remove(abstractPageBlock);
|
||||
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, new ArrayList<>(), layoutParsingType);
|
||||
|
||||
@ -22,6 +22,15 @@ public record PageInformation(Rectangle2D mediabox, int number, int rotationDegr
|
||||
}
|
||||
|
||||
|
||||
public double heightRot() {
|
||||
|
||||
if (rotationDegrees == 90 || rotationDegrees == 270) {
|
||||
return width();
|
||||
}
|
||||
return height();
|
||||
}
|
||||
|
||||
|
||||
public double width() {
|
||||
|
||||
return mediabox.getWidth();
|
||||
|
||||
@ -39,21 +39,21 @@ public class TextPositionSequenceComparator implements Comparator<TextPositionSe
|
||||
}
|
||||
|
||||
// get the text direction adjusted coordinates
|
||||
float x1 = pos1.getMinXDirAdj();
|
||||
float x2 = pos2.getMinXDirAdj();
|
||||
double x1 = pos1.getBBox().getX();
|
||||
double x2 = pos2.getBBox().getX();
|
||||
|
||||
float pos1YBottom = pos1.getMaxYDirAdj();
|
||||
float pos2YBottom = pos2.getMaxYDirAdj();
|
||||
double pos1YBottom = pos1.getBBox().getMaxY();
|
||||
double pos2YBottom = pos2.getBBox().getMaxY();
|
||||
|
||||
// note that the coordinates have been adjusted so 0,0 is in upper left
|
||||
float pos1YTop = pos1YBottom - pos1.getTextHeightNoPadding();
|
||||
float pos2YTop = pos2YBottom - pos2.getTextHeightNoPadding();
|
||||
double pos1YTop = pos1YBottom - pos1.getBBox().getHeight();
|
||||
double pos2YTop = pos2YBottom - pos2.getBBox().getHeight();
|
||||
|
||||
float yDifference = Math.abs(pos1YBottom - pos2YBottom);
|
||||
double yDifference = Math.abs(pos1YBottom - pos2YBottom);
|
||||
|
||||
// we will do a simple tolerance comparison
|
||||
if (yDifference < .1 || pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom || pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom) {
|
||||
return Float.compare(x1, x2);
|
||||
return Double.compare(x1, x2);
|
||||
} else if (pos1YBottom < pos2YBottom) {
|
||||
return -1;
|
||||
} else {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user