akra-certificates: finetuning for certificates
This commit is contained in:
parent
eb2ea755a5
commit
e8513d05e2
@ -266,7 +266,8 @@ public class LayoutParsingPipeline {
|
|||||||
classificationDocument.getVisualizations().addRulingVisualization(stripper.getRulings(), pageNumber);
|
classificationDocument.getVisualizations().addRulingVisualization(stripper.getRulings(), pageNumber);
|
||||||
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
|
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
|
||||||
|
|
||||||
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), PageInformation.fromPDPage(pageNumber, pdPage));
|
PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, pdPage);
|
||||||
|
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), pageInformation);
|
||||||
classificationDocument.getVisualizations().addCellVisualizations(emptyTableCells, pageNumber);
|
classificationDocument.getVisualizations().addCellVisualizations(emptyTableCells, pageNumber);
|
||||||
|
|
||||||
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings);
|
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings);
|
||||||
@ -283,9 +284,9 @@ public class LayoutParsingPipeline {
|
|||||||
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getVisualizations());
|
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getVisualizations());
|
||||||
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
|
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
|
||||||
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG ->
|
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG ->
|
||||||
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getVisualizations(), layoutParsingType);
|
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getVisualizations(), layoutParsingType, pageInformation);
|
||||||
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
|
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
|
||||||
docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getVisualizations(), layoutParsingType);
|
docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getVisualizations(), layoutParsingType, pageInformation);
|
||||||
};
|
};
|
||||||
|
|
||||||
classificationPage.setCleanRulings(cleanRulings);
|
classificationPage.setCleanRulings(cleanRulings);
|
||||||
|
|||||||
@ -18,6 +18,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRul
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
|
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
@ -33,7 +34,11 @@ public class DocstrumSegmentationService {
|
|||||||
private final ReadingOrderService readingOrderService;
|
private final ReadingOrderService readingOrderService;
|
||||||
|
|
||||||
|
|
||||||
public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean xyOrder, CleanRulings usedRulings, LayoutparsingVisualizations visualizations) {
|
public List<Zone> segmentPage(List<TextPositionSequence> textPositions,
|
||||||
|
boolean xyOrder,
|
||||||
|
CleanRulings usedRulings,
|
||||||
|
LayoutparsingVisualizations visualizations,
|
||||||
|
PageInformation pageInformation) {
|
||||||
|
|
||||||
List<Zone> zones = new ArrayList<>();
|
List<Zone> zones = new ArrayList<>();
|
||||||
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.ZERO));
|
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.ZERO));
|
||||||
@ -41,7 +46,7 @@ public class DocstrumSegmentationService {
|
|||||||
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.HALF_CIRCLE));
|
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.HALF_CIRCLE));
|
||||||
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.THREE_QUARTER_CIRCLE));
|
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.THREE_QUARTER_CIRCLE));
|
||||||
|
|
||||||
return readingOrderService.resolve(zones, xyOrder);
|
return readingOrderService.resolve(zones, xyOrder, visualizations, textPositions.get(0).getPage(), pageInformation);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -63,10 +68,9 @@ public class DocstrumSegmentationService {
|
|||||||
double lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20);
|
double lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20);
|
||||||
|
|
||||||
List<Line> lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing, rulings);
|
List<Line> lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing, rulings);
|
||||||
List<Zone> zones = zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing, rulings);
|
List<Zone> zones = zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing, rulings);
|
||||||
// return zones;
|
// return zones;
|
||||||
return zoneBuilderService.mergeZonesUntilConvergence(zones, characterSpacing, lineSpacing, rulings);
|
return zoneBuilderService.mergeZonesAgain(zones, characterSpacing, lineSpacing, rulings);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -0,0 +1,319 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
public class ColumnDetector {
|
||||||
|
|
||||||
|
public static final double MAX_VALUE_THRESHOLD = 0.5;
|
||||||
|
final static int bins_num = 128;
|
||||||
|
final static int globalStartIdx = 0; // ignore outer parts completely, we don't expect columns there
|
||||||
|
final static int globalEndIdx = bins_num; // i chose 7, since thirds seems a likely split for columns, therefore divided by 6 would eliminate those.
|
||||||
|
public static final double DERIVATIVE_ZERO_THRESHOLD = 1e-10;
|
||||||
|
public static final double MINIMUM_THRESHOLD_FOR_COLUMNS = 0.05;
|
||||||
|
double minY;
|
||||||
|
double maxY;
|
||||||
|
double midY;
|
||||||
|
double[] histogram;
|
||||||
|
double min;
|
||||||
|
double max;
|
||||||
|
double resolution;
|
||||||
|
double sum;
|
||||||
|
int N;
|
||||||
|
|
||||||
|
|
||||||
|
public ColumnDetector(double min, double max, double minY, double maxY) {
|
||||||
|
|
||||||
|
this.min = min;
|
||||||
|
this.max = max;
|
||||||
|
this.minY = minY;
|
||||||
|
this.maxY = maxY;
|
||||||
|
this.midY = maxY - minY;
|
||||||
|
this.resolution = (max - min) / bins_num;
|
||||||
|
this.histogram = new double[bins_num];
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void add(BoundingBox zone) {
|
||||||
|
|
||||||
|
N++;
|
||||||
|
double weight = computeWeight(zone);
|
||||||
|
int start = (int) ((zone.getMinX() - min) / resolution);
|
||||||
|
int end = (int) ((zone.getMaxX() - min) / resolution);
|
||||||
|
for (int i = start; i < end; i++) {
|
||||||
|
histogram[i] += weight;
|
||||||
|
sum += histogram[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private double computeWeight(BoundingBox zone) {
|
||||||
|
|
||||||
|
double areaWeight = zone.getBBox().getHeight();
|
||||||
|
|
||||||
|
double relativeDistance = relativeDistanceToMiddle(zone.getBBox().getCenterY());
|
||||||
|
|
||||||
|
double distanceWeight;
|
||||||
|
if (relativeDistance < 0.6) {
|
||||||
|
distanceWeight = 1;
|
||||||
|
} else if (relativeDistance < 0.8) {
|
||||||
|
distanceWeight = 0.8;
|
||||||
|
} else {
|
||||||
|
distanceWeight = 0.1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return areaWeight * distanceWeight;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private double relativeDistanceToMiddle(double y) {
|
||||||
|
|
||||||
|
double range = (maxY - minY) / 2;
|
||||||
|
double mid = minY + range;
|
||||||
|
|
||||||
|
return Math.abs(y - mid) / range;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double[] computeDerivative() {
|
||||||
|
|
||||||
|
int length = histogram.length;
|
||||||
|
double[] derivative = new double[length];
|
||||||
|
|
||||||
|
for (int i = 0; i < length; i++) {
|
||||||
|
if (i == 0) {
|
||||||
|
derivative[i] = (histogram[i + 1] - histogram[i]) / resolution;
|
||||||
|
} else if (i == length - 1) {
|
||||||
|
derivative[i] = (histogram[i] - histogram[i - 1]) / resolution;
|
||||||
|
} else {
|
||||||
|
derivative[i] = (histogram[i + 1] - histogram[i - 1]) / (2 * resolution);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return derivative;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double calcMean(double[] arr, int start, int end) {
|
||||||
|
|
||||||
|
if (start == end) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
double sum = 0;
|
||||||
|
for (int i = start; i < end; i++) {
|
||||||
|
sum += arr[i];
|
||||||
|
}
|
||||||
|
return sum / (end - start);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
Find columns, by finding all local maxima/minima of the derivative. Filtering them for the ones with the biggest values.
|
||||||
|
For each found minima, we will step to the right until we hit a 0 in the derivative, this indicates a minimum in the main histogram. If this minimum is below a threshold, it is deemed a column divider.
|
||||||
|
Same goes for maxima, but stepping to the left now, since minima in the function will always be to the left of a maximum in its derivative.
|
||||||
|
*/
|
||||||
|
public List<Double> determineColumnsWithDerivative(double[] derivative) {
|
||||||
|
|
||||||
|
assert derivative.length == histogram.length;
|
||||||
|
|
||||||
|
Set<Integer> columnIndices = new HashSet<>();
|
||||||
|
double mean = calcMean(histogram, 0, histogram.length);
|
||||||
|
double maxDvValue = calcMax(derivative);
|
||||||
|
double minDvValue = calcMin(derivative);
|
||||||
|
|
||||||
|
if (maxDvValue - minDvValue < mean * MAX_VALUE_THRESHOLD) {
|
||||||
|
Collections.emptyList();
|
||||||
|
}
|
||||||
|
|
||||||
|
Extrema derivativeExtrema = calculateNearGlobalExtrema(derivative, maxDvValue, minDvValue);
|
||||||
|
|
||||||
|
List<Integer> columnsRightOfMinima = findZerosToTheRightOfMinima(derivative, derivativeExtrema.minima(), mean);
|
||||||
|
columnIndices.addAll(columnsRightOfMinima);
|
||||||
|
|
||||||
|
List<Integer> columnsLeftOfMaxima = findZerosToTheLeftOfMaxima(derivative, derivativeExtrema.maxima(), mean);
|
||||||
|
columnIndices.addAll(columnsLeftOfMaxima);
|
||||||
|
|
||||||
|
return columnIndices.stream()
|
||||||
|
.sorted(Comparator.naturalOrder())
|
||||||
|
.map(this::calculateXCoordinateFromIdx)
|
||||||
|
.toList();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private List<Integer> findZerosToTheLeftOfMaxima(double[] derivative, List<Integer> derivativeMaxima, double mean) {
|
||||||
|
|
||||||
|
List<Integer> columnsLeftOfMaxima = new ArrayList<>();
|
||||||
|
|
||||||
|
for (int i = 0; i < derivativeMaxima.size(); i++) {
|
||||||
|
List<Integer> consecutiveZeroes = new LinkedList<>();
|
||||||
|
boolean maximumFound = false;
|
||||||
|
int maximaIdx = derivativeMaxima.get(i) - 1; // the highest derivative will always be at least one step away from the lowest value.
|
||||||
|
int endIdx = (int) Math.max(globalStartIdx,
|
||||||
|
Math.min(maximaIdx - 1,
|
||||||
|
maximaIdx - 0.1 * bins_num)); // search through 10% of array to the right, but at least one step and at most to the left edge;
|
||||||
|
|
||||||
|
for (int j = maximaIdx; j >= endIdx; j--) {
|
||||||
|
if (derivative[j] < DERIVATIVE_ZERO_THRESHOLD) {
|
||||||
|
maximumFound = true;
|
||||||
|
consecutiveZeroes.add(j);
|
||||||
|
} else if (maximumFound) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (maximumFound) {
|
||||||
|
int midIdx = consecutiveZeroes.size() / 2;
|
||||||
|
int middleMinimumIdx = consecutiveZeroes.get(midIdx);
|
||||||
|
if (histogram[middleMinimumIdx] < mean * MINIMUM_THRESHOLD_FOR_COLUMNS) {
|
||||||
|
columnsLeftOfMaxima.add(middleMinimumIdx);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return columnsLeftOfMaxima;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private List<Integer> findZerosToTheRightOfMinima(double[] derivative, List<Integer> derivativeMinima, double mean) {
|
||||||
|
|
||||||
|
List<Integer> columnIndixes = new LinkedList<>();
|
||||||
|
for (int i = 0; i < derivativeMinima.size(); i++) {
|
||||||
|
List<Integer> consecutiveZeroes = new LinkedList<>();
|
||||||
|
boolean minimumFound = false;
|
||||||
|
int minimaIdx = derivativeMinima.get(i) + 1; // the highest derivative will always be at least one step earlier than the lowest value.
|
||||||
|
int endIdx = (int) Math.min(globalEndIdx,
|
||||||
|
Math.max(minimaIdx + 1,
|
||||||
|
minimaIdx + 0.1 * bins_num)); // search through 10% of array to the right, but at least one step and at most to the right edge;
|
||||||
|
|
||||||
|
for (int j = minimaIdx; j < endIdx; j++) {
|
||||||
|
if (derivative[j] < DERIVATIVE_ZERO_THRESHOLD) {
|
||||||
|
minimumFound = true;
|
||||||
|
consecutiveZeroes.add(j);
|
||||||
|
} else if (minimumFound) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (minimumFound) {
|
||||||
|
int midIdx = consecutiveZeroes.size() / 2;
|
||||||
|
int middleMinimumIdx = consecutiveZeroes.get(midIdx);
|
||||||
|
if (histogram[middleMinimumIdx] < mean * MINIMUM_THRESHOLD_FOR_COLUMNS) {
|
||||||
|
columnIndixes.add(middleMinimumIdx);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return columnIndixes;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private double calcMax(double[] array) {
|
||||||
|
|
||||||
|
double max = Double.NEGATIVE_INFINITY;
|
||||||
|
for (int i = 0; i < array.length; i++) {
|
||||||
|
if (array[i] > max) {
|
||||||
|
max = array[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return max;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private double calcMin(double[] array) {
|
||||||
|
|
||||||
|
double min = Double.POSITIVE_INFINITY;
|
||||||
|
for (int i = 0; i < array.length; i++) {
|
||||||
|
if (array[i] < min) {
|
||||||
|
min = array[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return min;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Extrema calculateNearGlobalExtrema(double[] derivative, double maxDvValue, double minDvValue) {
|
||||||
|
|
||||||
|
List<Integer> nearGlobalDvMaximaIdx = new LinkedList<>();
|
||||||
|
List<Integer> nearGlobalDvMinimaIdx = new LinkedList<>();
|
||||||
|
for (int i = globalStartIdx; i < globalEndIdx; i++) {
|
||||||
|
if (derivative[i] <= minDvValue * 0.8) {
|
||||||
|
nearGlobalDvMinimaIdx.add(i);
|
||||||
|
}
|
||||||
|
if (derivative[i] >= maxDvValue * 0.8) {
|
||||||
|
nearGlobalDvMaximaIdx.add(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
nearGlobalDvMinimaIdx = removeConsecutive(nearGlobalDvMinimaIdx);
|
||||||
|
nearGlobalDvMaximaIdx = removeConsecutive(nearGlobalDvMaximaIdx);
|
||||||
|
|
||||||
|
return new Extrema(nearGlobalDvMaximaIdx, nearGlobalDvMinimaIdx);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private record Extrema(List<Integer> maxima, List<Integer> minima) {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Double calculateXCoordinateFromIdx(int globalMinIdx) {
|
||||||
|
|
||||||
|
return min + ((globalMinIdx + 1) * resolution);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static List<Integer> removeConsecutive(List<Integer> numbers) {
|
||||||
|
|
||||||
|
List<Integer> result = new ArrayList<>();
|
||||||
|
if (numbers == null || numbers.isEmpty()) {
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
result.add(numbers.get(0)); // Add the first number
|
||||||
|
|
||||||
|
for (int i = 1; i < numbers.size(); i++) {
|
||||||
|
if (numbers.get(i) != numbers.get(i - 1) + 1) {
|
||||||
|
result.add(numbers.get(i)); // Add non-consecutive numbers
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void kernelSmooth(double[] kernel) {
|
||||||
|
|
||||||
|
double[] newFrequencies = new double[histogram.length];
|
||||||
|
int shift = (kernel.length - 1) / 2;
|
||||||
|
for (int i = 0; i < kernel.length; i++) {
|
||||||
|
int jStart = Math.max(0, i - shift);
|
||||||
|
int jEnd = Math.min(histogram.length, histogram.length + i - shift);
|
||||||
|
for (int j = jStart; j < jEnd; j++) {
|
||||||
|
newFrequencies[j - i + shift] += kernel[i] * histogram[j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
histogram = newFrequencies;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double[] createGaussianKernel(int length, double stdDeviation) {
|
||||||
|
|
||||||
|
int r = length / 2;
|
||||||
|
|
||||||
|
int size = 2 * r + 1;
|
||||||
|
double[] kernel = new double[size];
|
||||||
|
double sum = 0;
|
||||||
|
double b = 2 * (stdDeviation) * (stdDeviation);
|
||||||
|
double a = 1 / Math.sqrt(Math.PI * b);
|
||||||
|
for (int i = 0; i < size; i++) {
|
||||||
|
kernel[i] = a * Math.exp(-(i - r) * (i - r) / b);
|
||||||
|
sum += kernel[i];
|
||||||
|
}
|
||||||
|
for (int i = 0; i < size; i++) {
|
||||||
|
kernel[i] /= sum;
|
||||||
|
}
|
||||||
|
return kernel;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -1,17 +1,21 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
|
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.awt.geom.Point2D;
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.Collections;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
import java.util.HashMap;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.ListIterator;
|
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.ColumnDetector;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
|
||||||
|
|
||||||
@Service
|
@Service
|
||||||
public class ReadingOrderService {
|
public class ReadingOrderService {
|
||||||
@ -20,7 +24,7 @@ public class ReadingOrderService {
|
|||||||
public static final double MULTI_COLUMN_DETECTION_THRESHOLD = 1.5;
|
public static final double MULTI_COLUMN_DETECTION_THRESHOLD = 1.5;
|
||||||
|
|
||||||
|
|
||||||
public List<Zone> resolve(List<Zone> zones, boolean xyReadingOrder) {
|
public List<Zone> resolve(List<Zone> zones, boolean xyReadingOrder, LayoutparsingVisualizations visualizations, int page, PageInformation pageInformation) {
|
||||||
|
|
||||||
if (zones.isEmpty() || zones.size() == 1) {
|
if (zones.isEmpty() || zones.size() == 1) {
|
||||||
return zones;
|
return zones;
|
||||||
@ -30,28 +34,53 @@ public class ReadingOrderService {
|
|||||||
return resolveSingleColumnReadingOrder(zones);
|
return resolveSingleColumnReadingOrder(zones);
|
||||||
}
|
}
|
||||||
|
|
||||||
Map<Long, Integer> histogram = new HashMap<>();
|
var columnSeparatorLines = calculateColumns(zones);
|
||||||
for (Zone zone : zones) {
|
|
||||||
long minY = Math.round(zone.getBBox().getMinY());
|
|
||||||
long maxY = Math.round(zone.getBBox().getMaxY());
|
|
||||||
for (long i = minY; i <= maxY; i++) {
|
|
||||||
histogram.put(i, histogram.getOrDefault(i, 0) + 1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (histogram.values()
|
if (columnSeparatorLines.isEmpty()) {
|
||||||
.stream()
|
|
||||||
.mapToInt(Integer::intValue).average()
|
|
||||||
.orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) {
|
|
||||||
return resolveSingleColumnReadingOrder(zones);
|
return resolveSingleColumnReadingOrder(zones);
|
||||||
} else {
|
} else {
|
||||||
|
for (Double columnLine : columnSeparatorLines) {
|
||||||
return resolveMultiColumnReadingOder(zones);
|
visualizations.addRulingVisualization(List.of(new Ruling(new Point2D.Double(columnLine, 0), new Point2D.Double(columnLine, 1000))), page);
|
||||||
|
}
|
||||||
|
return resolveMultiColumnReadingOder(zones, columnSeparatorLines, pageInformation);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static List<Double> calculateColumns(List<Zone> zones) {
|
||||||
|
|
||||||
|
if (zones.isEmpty()) {
|
||||||
|
return Collections.emptyList();
|
||||||
|
}
|
||||||
|
double min = zones.stream()
|
||||||
|
.mapToDouble(BoundingBox::getMinX)
|
||||||
|
.min()
|
||||||
|
.orElse(0);
|
||||||
|
double max = zones.stream()
|
||||||
|
.mapToDouble(BoundingBox::getMaxX)
|
||||||
|
.max()
|
||||||
|
.orElse(0);
|
||||||
|
double minY = zones.stream()
|
||||||
|
.mapToDouble(BoundingBox::getMinY)
|
||||||
|
.min()
|
||||||
|
.orElse(0);
|
||||||
|
double maxY = zones.stream()
|
||||||
|
.mapToDouble(BoundingBox::getMaxY)
|
||||||
|
.max()
|
||||||
|
.orElse(0);
|
||||||
|
|
||||||
|
var columnResolver = new ColumnDetector(min, max, minY, maxY);
|
||||||
|
|
||||||
|
zones.forEach(columnResolver::add);
|
||||||
|
columnResolver.kernelSmooth(columnResolver.createGaussianKernel(3, 1));
|
||||||
|
|
||||||
|
double[] derivative = columnResolver.computeDerivative();
|
||||||
|
|
||||||
|
return columnResolver.determineColumnsWithDerivative(derivative);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private static List<Zone> resolveSingleColumnReadingOrder(List<Zone> zones) {
|
private static List<Zone> resolveSingleColumnReadingOrder(List<Zone> zones) {
|
||||||
|
|
||||||
zones.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
zones.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||||
@ -60,109 +89,47 @@ public class ReadingOrderService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private List<Zone> resolveMultiColumnReadingOder(List<Zone> zones) {
|
private List<Zone> resolveMultiColumnReadingOder(List<Zone> zones, List<Double> columnSeparatorLines, PageInformation pageInformation) {
|
||||||
|
|
||||||
// Simple reading order resolver for multi column page layout as described here : https://pub.towardsai.net/advanced-rag-02-unveiling-pdf-parsing-b84ae866344e
|
List<List<Zone>> zonesPerColumn = new LinkedList<>();
|
||||||
// TODO implement a more fancy reading order resolver see https://github.com/BobLd/DocumentLayoutAnalysis/blob/master/README.md#reading-order
|
for (Double ignored : columnSeparatorLines) {
|
||||||
|
zonesPerColumn.add(new LinkedList<>());
|
||||||
double minX = Double.POSITIVE_INFINITY;
|
|
||||||
double maxX = Double.NEGATIVE_INFINITY;
|
|
||||||
|
|
||||||
for (Zone zone : zones) {
|
|
||||||
if (zone.getX() < minX) {
|
|
||||||
minX = zone.getX();
|
|
||||||
}
|
|
||||||
if (zone.getX() + zone.getWidth() > maxX) {
|
|
||||||
maxX = zone.getX() + zone.getWidth();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
zonesPerColumn.add(new LinkedList<>());
|
||||||
|
|
||||||
double midLineXCoordinate = (minX + maxX) / 2;
|
|
||||||
|
|
||||||
List<Zone> leftOf = new ArrayList<>();
|
|
||||||
List<Zone> rightOf = new ArrayList<>();
|
|
||||||
List<Zone> middle = new ArrayList<>();
|
|
||||||
for (Zone zone : zones) {
|
for (Zone zone : zones) {
|
||||||
if (zone.getX() < midLineXCoordinate && zone.getX() + zone.getWidth() < midLineXCoordinate) {
|
boolean zoneAdded = false;
|
||||||
leftOf.add(zone);
|
if (zone.getMinY() < pageInformation.heightRot() * 0.5) {
|
||||||
} else if (zone.getX() > midLineXCoordinate && zone.getX() + zone.getWidth() > midLineXCoordinate) {
|
// above middle sort into column fitting left x value
|
||||||
rightOf.add(zone);
|
for (int col = 0; col < columnSeparatorLines.size(); col++) {
|
||||||
|
if (columnSeparatorLines.get(col) > zone.getMinX()) {
|
||||||
|
zonesPerColumn.get(col).add(zone);
|
||||||
|
zoneAdded = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
middle.add(zone);
|
// below middle sort into column fitting right x value
|
||||||
|
for (int col = 0; col < columnSeparatorLines.size(); col++) {
|
||||||
|
if (columnSeparatorLines.get(col) > zone.getMaxX()) {
|
||||||
|
zonesPerColumn.get(col).add(zone);
|
||||||
|
zoneAdded = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
if (!zoneAdded) {
|
||||||
|
zonesPerColumn.get(zonesPerColumn.size() - 1).add(zone);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
leftOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
zonesPerColumn.forEach(list -> list.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))));
|
||||||
|
|
||||||
rightOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
return zonesPerColumn.stream()
|
||||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
.flatMap(Collection::stream)
|
||||||
|
.toList();
|
||||||
middle.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
|
||||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
|
||||||
/*
|
|
||||||
List<Zone> leftNotIntersecting = new ArrayList<>();
|
|
||||||
for (Zone leftZone : leftOf) {
|
|
||||||
boolean intersects = false;
|
|
||||||
for (Zone rightZone : rightOf) {
|
|
||||||
if (leftZone.intersectsY(rightZone)) {
|
|
||||||
intersects = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
// early stopping
|
|
||||||
if (rightZone.getBBox().getMinY() > leftZone.getBBox().getMaxY()) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!intersects) {
|
|
||||||
leftNotIntersecting.add(leftZone);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
List<Zone> rightNotIntersecting = new ArrayList<>();
|
|
||||||
for (Zone rightZone : rightOf) {
|
|
||||||
boolean intersects = false;
|
|
||||||
for (Zone leftZone : leftOf) {
|
|
||||||
if (rightZone.intersectsY(leftZone)) {
|
|
||||||
intersects = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
// early stopping
|
|
||||||
if (leftZone.getBBox().getMinY() > rightZone.getBBox().getMaxY()) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!intersects) {
|
|
||||||
rightNotIntersecting.add(rightZone);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
leftOf.removeAll(leftNotIntersecting);
|
|
||||||
rightOf.removeAll(rightNotIntersecting);
|
|
||||||
|
|
||||||
middle.addAll(leftNotIntersecting);
|
|
||||||
middle.addAll(rightNotIntersecting);
|
|
||||||
*/
|
|
||||||
List<Zone> sortedZones = new ArrayList<>();
|
|
||||||
sortedZones.addAll(leftOf);
|
|
||||||
sortedZones.addAll(rightOf);
|
|
||||||
|
|
||||||
ListIterator<Zone> itty = middle.listIterator();
|
|
||||||
|
|
||||||
while (itty.hasNext()) {
|
|
||||||
Zone current = itty.next();
|
|
||||||
for (int i = 0; i < sortedZones.size(); i++) {
|
|
||||||
if (current.getY() < sortedZones.get(i).getY()) {
|
|
||||||
sortedZones.add(i, current);
|
|
||||||
itty.remove();
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
sortedZones.addAll(middle);
|
|
||||||
|
|
||||||
return sortedZones;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -82,7 +82,7 @@ public class ZoneBuilderService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public List<Zone> mergeZonesUntilConvergence(List<Zone> zones, double characterSpacing, double lineSpacing, CleanRulings rulings) {
|
public List<Zone> mergeZonesAgain(List<Zone> zones, double characterSpacing, double lineSpacing, CleanRulings rulings) {
|
||||||
|
|
||||||
double minHorizontalDistance = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER;
|
double minHorizontalDistance = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER;
|
||||||
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER;
|
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER;
|
||||||
|
|||||||
@ -46,7 +46,7 @@ public class StringFrequencyCounter {
|
|||||||
double total = countPerValue.values()
|
double total = countPerValue.values()
|
||||||
.stream()
|
.stream()
|
||||||
.mapToDouble(v -> v).sum();
|
.mapToDouble(v -> v).sum();
|
||||||
if ((double) standard / total > 0.85) {
|
if ((double) standard / total > 0.75) {
|
||||||
return mostPopular.getKey();
|
return mostPopular.getKey();
|
||||||
}
|
}
|
||||||
countPerValue.remove(mostPopular.getKey());
|
countPerValue.remove(mostPopular.getKey());
|
||||||
|
|||||||
@ -17,6 +17,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRul
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
|
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
@ -35,11 +36,12 @@ public class DocstrumBlockificationService {
|
|||||||
CleanRulings rulings,
|
CleanRulings rulings,
|
||||||
boolean xyOrder,
|
boolean xyOrder,
|
||||||
LayoutparsingVisualizations visualizations,
|
LayoutparsingVisualizations visualizations,
|
||||||
LayoutParsingType layoutParsingType) {
|
LayoutParsingType layoutParsingType,
|
||||||
|
PageInformation pageInformation) {
|
||||||
|
|
||||||
CleanRulings usedRulings = rulings.withoutTextRulings();
|
CleanRulings usedRulings = rulings.withoutTextRulings();
|
||||||
|
|
||||||
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder, usedRulings, visualizations);
|
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder, usedRulings, visualizations, pageInformation);
|
||||||
|
|
||||||
if (!textPositions.isEmpty()) {
|
if (!textPositions.isEmpty()) {
|
||||||
visualizations.addZoneVisualizations(zones, textPositions.get(0).getPage());
|
visualizations.addZoneVisualizations(zones, textPositions.get(0).getPage());
|
||||||
@ -58,8 +60,10 @@ public class DocstrumBlockificationService {
|
|||||||
|
|
||||||
mergeIntersectingBlocks(classificationPage, usedRulings, 2f, 2f);
|
mergeIntersectingBlocks(classificationPage, usedRulings, 2f, 2f);
|
||||||
|
|
||||||
if (layoutParsingType == LayoutParsingType.REDACT_MANAGER) {
|
combineBlocksBasic(classificationPage);
|
||||||
combineBlocks(classificationPage);
|
|
||||||
|
if (layoutParsingType == LayoutParsingType.REDACT_MANAGER || layoutParsingType == LayoutParsingType.DOCUMINE) {
|
||||||
|
combineBlocksSpecial(classificationPage);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (layoutParsingType == LayoutParsingType.CLARIFYND) {
|
if (layoutParsingType == LayoutParsingType.CLARIFYND) {
|
||||||
@ -105,7 +109,60 @@ public class DocstrumBlockificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void combineBlocks(ClassificationPage page) {
|
/**
 * Walks the text blocks of a page in list order and merges a text block into its
 * predecessor whenever one of the merge rules below matches, then finishes with a
 * {@code mergeIntersectingBlocks} pass over the whole page.
 *
 * <p>Rules are evaluated in order and the first match wins. A table block, a change of
 * text direction, or a ruling between two blocks prevents merging.
 *
 * <p>NOTE(review): the actual merge is delegated to {@code combineBlocksAndResetIterator},
 * which is not visible here; presumably it updates the underlying list through the
 * iterator and returns the merged block - confirm.
 *
 * @param page page providing the text blocks ({@code getTextBlocks()}) and the rulings
 *             ({@code getCleanRulings()}) used to decide whether blocks may be merged
 */
public void combineBlocksBasic(ClassificationPage page) {
|
||||||
|
|
||||||
|
// Placeholder predecessor; the guard further down skips all merge rules while the
// predecessor has no sequences.
TextPageBlock previous = new TextPageBlock();
|
||||||
|
ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator();
|
||||||
|
// Only rulings remaining after withoutTextRulings() are consulted for separation.
CleanRulings usedRulings = page.getCleanRulings().withoutTextRulings();
|
||||||
|
while (itty.hasNext()) {
|
||||||
|
|
||||||
|
AbstractPageBlock block = itty.next();
|
||||||
|
// Tables are never merged; they also reset the predecessor so the next text block
// is not merged across the table.
if (block instanceof TablePageBlock) {
|
||||||
|
previous = new TextPageBlock();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// Every non-table block is treated as a TextPageBlock (unchecked cast).
TextPageBlock current = (TextPageBlock) block;
|
||||||
|
|
||||||
|
if (previous != null && !previous.getSequences().isEmpty()) {
|
||||||
|
|
||||||
|
// Different text direction, or a ruling between the two blocks: keep them apart
// and continue with current as the new predecessor.
if (current.getDir() != previous.getDir() || usedRulings.lineBetween(current, previous)) {
|
||||||
|
previous = current;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Rule 1: blocks overlap on the y axis and their horizontal distance is below 20.
// NOTE(review): unit of the distance is presumably PDF points - confirm.
// The last argument is passed as true here, whereas rules 3 and 4 pass
// previous.isToDuplicate(); presumably it is the toDuplicate flag - confirm.
if (current.intersectsY(previous) && current.horizontalDistance(previous) < 20) {
|
||||||
|
previous = combineBlocksAndResetIterator(previous, current, itty, true);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Rule 2: decided entirely by the helper named below (implementation not visible here).
if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, page)) {
|
||||||
|
previous = combineBlocksAndResetIterator(previous, current, itty, true);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Rule 3: the two blocks intersect; the flag is taken from the predecessor.
if (previous.intersects(current)) {
|
||||||
|
previous = combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// merge headlines
// Rule 4: same direction, highest font sizes differ by less than 1.1 and are both
// above 12, same most popular word style, and the blocks intersect when the most
// popular word height of current is passed as both threshold arguments.
if (current.getDir() == previous.getDir() && (Math.abs(current.getHighestFontSize() - previous.getHighestFontSize()) < 1.1f
|
||||||
|
&& current.getHighestFontSize() > 12
|
||||||
|
||||||
|
&& previous.getHighestFontSize() > 12
|
||||||
|
&& current.getMostPopularWordStyle().equals(previous.getMostPopularWordStyle())
|
||||||
|
&& current.intersects(previous, current.getMostPopularWordHeight(), current.getMostPopularWordHeight()))) {
|
||||||
|
previous = combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// No rule matched, or there was no usable predecessor: current becomes the predecessor.
previous = current;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Final pass over the page with the arguments 0 and 6.5f.
// NOTE(review): presumably the x and y thresholds - confirm against mergeIntersectingBlocks.
mergeIntersectingBlocks(page, usedRulings, 0, 6.5f);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void combineBlocksSpecial(ClassificationPage page) {
|
||||||
|
|
||||||
TextPageBlock previous = new TextPageBlock();
|
TextPageBlock previous = new TextPageBlock();
|
||||||
ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator();
|
ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator();
|
||||||
@ -264,9 +321,7 @@ public class DocstrumBlockificationService {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (current.getDir() == inner.getDir() && (Math.abs(current.getHighestFontSize() - inner.getHighestFontSize()) < 1.1f && current.getHighestFontSize() > 12 && inner.getHighestFontSize() > 12 //
|
if (current.getDir() == inner.getDir() && current.intersects(inner, xThreshold, yThreshold)){
|
||||||
&& current.getMostPopularWordStyle().equals(inner.getMostPopularWordStyle()) //
|
|
||||||
&& current.intersects(inner, current.getMostPopularWordHeight(), current.getMostPopularWordHeight()))) {
|
|
||||||
|
|
||||||
boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();
|
boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();
|
||||||
current.getSequences().addAll(inner.getSequences());
|
current.getSequences().addAll(inner.getSequences());
|
||||||
@ -277,8 +332,7 @@ public class DocstrumBlockificationService {
|
|||||||
itty.set(current);
|
itty.set(current);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
} var blocksIterator = blocks.iterator();
|
||||||
var blocksIterator = blocks.iterator();
|
|
||||||
while (blocksIterator.hasNext()) {
|
while (blocksIterator.hasNext()) {
|
||||||
if (blocksIterator.next() == null) {
|
if (blocksIterator.next() == null) {
|
||||||
blocksIterator.remove();
|
blocksIterator.remove();
|
||||||
|
|||||||
@ -181,10 +181,7 @@ public class DocumentGraphFactory {
|
|||||||
|
|
||||||
Page page = context.getPage(textBlocks.get(0).getPage());
|
Page page = context.getPage(textBlocks.get(0).getPage());
|
||||||
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
|
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
|
||||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks),
|
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), footer, context, page);
|
||||||
footer,
|
|
||||||
context,
|
|
||||||
page);
|
|
||||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
|
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
|
||||||
footer.setTreeId(tocId);
|
footer.setTreeId(tocId);
|
||||||
footer.setLeafTextBlock(textBlock);
|
footer.setLeafTextBlock(textBlock);
|
||||||
@ -273,7 +270,8 @@ public class DocumentGraphFactory {
|
|||||||
return pages.keySet()
|
return pages.keySet()
|
||||||
.stream()
|
.stream()
|
||||||
.filter(page -> page.getNumber() == pageIndex)
|
.filter(page -> page.getNumber() == pageIndex)
|
||||||
.findFirst().orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex)));
|
.findFirst()
|
||||||
|
.orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex)));
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -11,8 +11,8 @@ public record PageInformation(Rectangle2D mediabox, int number, int rotationDegr
|
|||||||
|
|
||||||
PDRectangle mediaBox = page.getMediaBox();
|
PDRectangle mediaBox = page.getMediaBox();
|
||||||
return new PageInformation(new Rectangle2D.Double(mediaBox.getLowerLeftX(), mediaBox.getLowerLeftY(), mediaBox.getWidth(), mediaBox.getHeight()),
|
return new PageInformation(new Rectangle2D.Double(mediaBox.getLowerLeftX(), mediaBox.getLowerLeftY(), mediaBox.getWidth(), mediaBox.getHeight()),
|
||||||
pageNum,
|
pageNum,
|
||||||
page.getRotation());
|
page.getRotation());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -22,6 +22,15 @@ public record PageInformation(Rectangle2D mediabox, int number, int rotationDegr
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Returns the page height with the page rotation taken into account.
 *
 * @return {@link #width()} when {@code rotationDegrees} is exactly 90 or 270,
 *         otherwise {@link #height()}
 */
public double heightRot() {
|
||||||
|
|
||||||
|
// For a quarter-turn rotation the unrotated width is returned as the height.
// Any other value of rotationDegrees (including ones outside 0..270) falls through.
if (rotationDegrees == 90 || rotationDegrees == 270) {
|
||||||
|
return width();
|
||||||
|
}
|
||||||
|
return height();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public double width() {
|
public double width() {
|
||||||
|
|
||||||
return mediabox.getWidth();
|
return mediabox.getWidth();
|
||||||
|
|||||||
@ -39,21 +39,21 @@ public class TextPositionSequenceComparator implements Comparator<TextPositionSe
|
|||||||
}
|
}
|
||||||
|
|
||||||
// get the text direction adjusted coordinates
|
// get the text direction adjusted coordinates
|
||||||
float x1 = pos1.getMinXDirAdj();
|
double x1 = pos1.getBBox().getX();
|
||||||
float x2 = pos2.getMinXDirAdj();
|
double x2 = pos2.getBBox().getX();
|
||||||
|
|
||||||
float pos1YBottom = pos1.getMaxYDirAdj();
|
double pos1YBottom = pos1.getBBox().getMaxY();
|
||||||
float pos2YBottom = pos2.getMaxYDirAdj();
|
double pos2YBottom = pos2.getBBox().getMaxY();
|
||||||
|
|
||||||
// note that the coordinates have been adjusted so 0,0 is in upper left
|
// note that the coordinates have been adjusted so 0,0 is in upper left
|
||||||
float pos1YTop = pos1YBottom - pos1.getTextHeightNoPadding();
|
double pos1YTop = pos1YBottom - pos1.getBBox().getHeight();
|
||||||
float pos2YTop = pos2YBottom - pos2.getTextHeightNoPadding();
|
double pos2YTop = pos2YBottom - pos2.getBBox().getHeight();
|
||||||
|
|
||||||
float yDifference = Math.abs(pos1YBottom - pos2YBottom);
|
double yDifference = Math.abs(pos1YBottom - pos2YBottom);
|
||||||
|
|
||||||
// we will do a simple tolerance comparison
|
// we will do a simple tolerance comparison
|
||||||
if (yDifference < .1 || pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom || pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom) {
|
if (yDifference < .1 || pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom || pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom) {
|
||||||
return Float.compare(x1, x2);
|
return Double.compare(x1, x2);
|
||||||
} else if (pos1YBottom < pos2YBottom) {
|
} else if (pos1YBottom < pos2YBottom) {
|
||||||
return -1;
|
return -1;
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@ -73,11 +73,11 @@ public class LayoutparsingVisualizations {
|
|||||||
boolean active;
|
boolean active;
|
||||||
|
|
||||||
final Visualizations words = Visualizations.builder().layer(ContentStreams.WORDS).build();
|
final Visualizations words = Visualizations.builder().layer(ContentStreams.WORDS).build();
|
||||||
final Visualizations lines = Visualizations.builder().layer(ContentStreams.LINES).build();
|
final Visualizations lines = Visualizations.builder().layer(ContentStreams.LINES).layerVisibilityDefaultValue(false).build();
|
||||||
final Visualizations zones = Visualizations.builder().layer(ContentStreams.ZONES).layerVisibilityDefaultValue(true).build();
|
final Visualizations zones = Visualizations.builder().layer(ContentStreams.ZONES).layerVisibilityDefaultValue(false).build();
|
||||||
final Visualizations mainBody = Visualizations.builder().layer(ContentStreams.MAIN_BODY).build();
|
final Visualizations mainBody = Visualizations.builder().layer(ContentStreams.MAIN_BODY).build();
|
||||||
final Visualizations clean_rulings = Visualizations.builder().layer(ContentStreams.CLEAN_RULINGS).build();
|
final Visualizations clean_rulings = Visualizations.builder().layer(ContentStreams.CLEAN_RULINGS).build();
|
||||||
final Visualizations rulings = Visualizations.builder().layer(ContentStreams.RULINGS).build();
|
final Visualizations rulings = Visualizations.builder().layer(ContentStreams.RULINGS).layerVisibilityDefaultValue(true).build();
|
||||||
final Visualizations cells = Visualizations.builder().layer(ContentStreams.CELLS).build();
|
final Visualizations cells = Visualizations.builder().layer(ContentStreams.CELLS).build();
|
||||||
final Visualizations markedContent = Visualizations.builder().layer(ContentStreams.MARKED_CONTENT).build();
|
final Visualizations markedContent = Visualizations.builder().layer(ContentStreams.MARKED_CONTENT).build();
|
||||||
final Visualizations neighbours = Visualizations.builder().layer(ContentStreams.NEIGHBOURS).build();
|
final Visualizations neighbours = Visualizations.builder().layer(ContentStreams.NEIGHBOURS).build();
|
||||||
@ -139,7 +139,7 @@ public class LayoutparsingVisualizations {
|
|||||||
visualizationsOnPage.getColoredLines()
|
visualizationsOnPage.getColoredLines()
|
||||||
.addAll(rulings
|
.addAll(rulings
|
||||||
.stream()
|
.stream()
|
||||||
.map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 0.5f))
|
.map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 2f))
|
||||||
.toList());
|
.toList());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -30,21 +30,21 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
|||||||
@Autowired
|
@Autowired
|
||||||
private LayoutParsingPipeline layoutParsingPipeline;
|
private LayoutParsingPipeline layoutParsingPipeline;
|
||||||
|
|
||||||
@Disabled
|
|
||||||
@Test
|
@Test
|
||||||
|
// @Disabled
|
||||||
public void testLayoutParserEndToEnd() {
|
public void testLayoutParserEndToEnd() {
|
||||||
|
|
||||||
String filePath = "files/Minimal Examples/RotateTextWithRulingsTestFile.pdf";
|
String filePath = "/home/kschuettler/Dokumente/TestFiles/certificates/origin/ecm.pdf";
|
||||||
|
|
||||||
runForFile(filePath);
|
runForFile(filePath);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@Disabled
|
// @Disabled
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void testLayoutParserEndToEndWithFolder() {
|
public void testLayoutParserEndToEndWithFolder() {
|
||||||
|
|
||||||
String folder = "/home/kschuettler/Dokumente/TestFiles/certificates/certificates-ocred";
|
String folder = "/home/kschuettler/Dokumente/TestFiles/certificates/origin";
|
||||||
List<Path> pdfFiles = Files.walk(Path.of(folder))
|
List<Path> pdfFiles = Files.walk(Path.of(folder))
|
||||||
.filter(path -> path.getFileName().toString().endsWith(".pdf"))
|
.filter(path -> path.getFileName().toString().endsWith(".pdf"))
|
||||||
.sorted(Comparator.comparing(Path::getFileName))
|
.sorted(Comparator.comparing(Path::getFileName))
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user