diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index abab9c3..d0f1ce0 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -266,7 +266,8 @@ public class LayoutParsingPipeline { classificationDocument.getVisualizations().addRulingVisualization(stripper.getRulings(), pageNumber); CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), stripper.getRulings()); - List emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), PageInformation.fromPDPage(pageNumber, pdPage)); + PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, pdPage); + List emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), pageInformation); classificationDocument.getVisualizations().addCellVisualizations(emptyTableCells, pageNumber); TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings); @@ -283,9 +284,9 @@ public class LayoutParsingPipeline { redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getVisualizations()); case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings); case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG -> - docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getVisualizations(), layoutParsingType); + 
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getVisualizations(), layoutParsingType, pageInformation); case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> - docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getVisualizations(), layoutParsingType); + docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getVisualizations(), layoutParsingType, pageInformation); }; classificationPage.setCleanRulings(cleanRulings); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/DocstrumSegmentationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/DocstrumSegmentationService.java index 6f361ff..8af8bec 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/DocstrumSegmentationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/DocstrumSegmentationService.java @@ -18,6 +18,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRul import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation; import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations; import lombok.RequiredArgsConstructor; @@ -33,7 +34,11 @@ public class DocstrumSegmentationService { private final ReadingOrderService readingOrderService; - public List segmentPage(List textPositions, boolean 
xyOrder, CleanRulings usedRulings, LayoutparsingVisualizations visualizations) { + public List segmentPage(List textPositions, + boolean xyOrder, + CleanRulings usedRulings, + LayoutparsingVisualizations visualizations, + PageInformation pageInformation) { List zones = new ArrayList<>(); zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.ZERO)); @@ -41,7 +46,7 @@ public class DocstrumSegmentationService { zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.HALF_CIRCLE)); zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.THREE_QUARTER_CIRCLE)); - return readingOrderService.resolve(zones, xyOrder); + return readingOrderService.resolve(zones, xyOrder, visualizations, textPositions.get(0).getPage(), pageInformation); } @@ -63,10 +68,9 @@ public class DocstrumSegmentationService { double lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20); List lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing, rulings); - List zones = zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing, rulings); + List zones = zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing, rulings); // return zones; - return zoneBuilderService.mergeZonesUntilConvergence(zones, characterSpacing, lineSpacing, rulings); + return zoneBuilderService.mergeZonesAgain(zones, characterSpacing, lineSpacing, rulings); } - } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/ColumnDetector.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/ColumnDetector.java new file mode 100644 index 0000000..c8601dd --- /dev/null +++ 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/model/ColumnDetector.java @@ -0,0 +1,319 @@ +package com.knecon.fforesight.service.layoutparser.processor.docstrum.model; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Set; + +public class ColumnDetector { + + public static final double MAX_VALUE_THRESHOLD = 0.5; + final static int bins_num = 128; + final static int globalStartIdx = 0; // ignore outer parts completely, we don't expect columns there + final static int globalEndIdx = bins_num; // i chose 7, since thirds seems a likely split for columns, therefore divided by 6 would eliminate those. + public static final double DERIVATIVE_ZERO_THRESHOLD = 1e-10; + public static final double MINIMUM_THRESHOLD_FOR_COLUMNS = 0.05; + double minY; + double maxY; + double midY; + double[] histogram; + double min; + double max; + double resolution; + double sum; + int N; + + + public ColumnDetector(double min, double max, double minY, double maxY) { + + this.min = min; + this.max = max; + this.minY = minY; + this.maxY = maxY; + this.midY = maxY - minY; + this.resolution = (max - min) / bins_num; + this.histogram = new double[bins_num]; + } + + + public void add(BoundingBox zone) { + + N++; + double weight = computeWeight(zone); + int start = (int) ((zone.getMinX() - min) / resolution); + int end = (int) ((zone.getMaxX() - min) / resolution); + for (int i = start; i < end; i++) { + histogram[i] += weight; + sum += histogram[i]; + } + } + + + private double computeWeight(BoundingBox zone) { + + double areaWeight = zone.getBBox().getHeight(); + + double relativeDistance = relativeDistanceToMiddle(zone.getBBox().getCenterY()); + + double distanceWeight; + if (relativeDistance < 0.6) { + distanceWeight = 1; + } else if (relativeDistance < 0.8) { + 
distanceWeight = 0.8; + } else { + distanceWeight = 0.1; + } + + return areaWeight * distanceWeight; + } + + + private double relativeDistanceToMiddle(double y) { + + double range = (maxY - minY) / 2; + double mid = minY + range; + + return Math.abs(y - mid) / range; + } + + + public double[] computeDerivative() { + + int length = histogram.length; + double[] derivative = new double[length]; + + for (int i = 0; i < length; i++) { + if (i == 0) { + derivative[i] = (histogram[i + 1] - histogram[i]) / resolution; + } else if (i == length - 1) { + derivative[i] = (histogram[i] - histogram[i - 1]) / resolution; + } else { + derivative[i] = (histogram[i + 1] - histogram[i - 1]) / (2 * resolution); + } + } + + return derivative; + } + + + public double calcMean(double[] arr, int start, int end) { + + if (start == end) { + return 0; + } + double sum = 0; + for (int i = start; i < end; i++) { + sum += arr[i]; + } + return sum / (end - start); + } + + + /* + Find columns, by finding all local maxima/minima of the derivative. Filtering them for the ones with the biggest values. + For each found minima, we will step to the right until we hit a 0 in the derivative, this indicates a minimum in the main histogram. If this minimum is below a threshold, it is deemed a column divider. + Same goes for maxima, but stepping to the left now, since minima in the function will always be to the left of a maximum in its derivative. 
+ */ + public List determineColumnsWithDerivative(double[] derivative) { + + assert derivative.length == histogram.length; + + Set columnIndices = new HashSet<>(); + double mean = calcMean(histogram, 0, histogram.length); + double maxDvValue = calcMax(derivative); + double minDvValue = calcMin(derivative); + + if (maxDvValue - minDvValue < mean * MAX_VALUE_THRESHOLD) { + return Collections.emptyList(); // derivative range is too small relative to the mean histogram value: report no column separators + } + + Extrema derivativeExtrema = calculateNearGlobalExtrema(derivative, maxDvValue, minDvValue); + + List columnsRightOfMinima = findZerosToTheRightOfMinima(derivative, derivativeExtrema.minima(), mean); + columnIndices.addAll(columnsRightOfMinima); + + List columnsLeftOfMaxima = findZerosToTheLeftOfMaxima(derivative, derivativeExtrema.maxima(), mean); + columnIndices.addAll(columnsLeftOfMaxima); + + return columnIndices.stream() + .sorted(Comparator.naturalOrder()) + .map(this::calculateXCoordinateFromIdx) + .toList(); + } + + + private List findZerosToTheLeftOfMaxima(double[] derivative, List derivativeMaxima, double mean) { + + List columnsLeftOfMaxima = new ArrayList<>(); + + for (int i = 0; i < derivativeMaxima.size(); i++) { + List consecutiveZeroes = new LinkedList<>(); + boolean maximumFound = false; + int maximaIdx = derivativeMaxima.get(i) - 1; // the highest derivative will always be at least one step away from the lowest value.
+ int endIdx = (int) Math.max(globalStartIdx, + Math.min(maximaIdx - 1, + maximaIdx - 0.1 * bins_num)); // search through 10% of array to the left, but at least one step and at most to the left edge; + + for (int j = maximaIdx; j >= endIdx; j--) { + if (derivative[j] < DERIVATIVE_ZERO_THRESHOLD) { + maximumFound = true; + consecutiveZeroes.add(j); + } else if (maximumFound) { + break; + } + } + if (maximumFound) { + int midIdx = consecutiveZeroes.size() / 2; + int middleMinimumIdx = consecutiveZeroes.get(midIdx); + if (histogram[middleMinimumIdx] < mean * MINIMUM_THRESHOLD_FOR_COLUMNS) { + columnsLeftOfMaxima.add(middleMinimumIdx); + } + } + } + return columnsLeftOfMaxima; + } + + + private List findZerosToTheRightOfMinima(double[] derivative, List derivativeMinima, double mean) { + + List columnIndixes = new LinkedList<>(); + for (int i = 0; i < derivativeMinima.size(); i++) { + List consecutiveZeroes = new LinkedList<>(); + boolean minimumFound = false; + int minimaIdx = derivativeMinima.get(i) + 1; // the highest derivative will always be at least one step earlier than the lowest value.
+ int endIdx = (int) Math.min(globalEndIdx, + Math.max(minimaIdx + 1, + minimaIdx + 0.1 * bins_num)); // search through 10% of array to the right, but at least one step and at most to the right edge; + + for (int j = minimaIdx; j < endIdx; j++) { + if (derivative[j] < DERIVATIVE_ZERO_THRESHOLD) { + minimumFound = true; + consecutiveZeroes.add(j); + } else if (minimumFound) { + break; + } + } + if (minimumFound) { + int midIdx = consecutiveZeroes.size() / 2; + int middleMinimumIdx = consecutiveZeroes.get(midIdx); + if (histogram[middleMinimumIdx] < mean * MINIMUM_THRESHOLD_FOR_COLUMNS) { + columnIndixes.add(middleMinimumIdx); + } + } + } + return columnIndixes; + } + + + private double calcMax(double[] array) { + + double max = Double.NEGATIVE_INFINITY; + for (int i = 0; i < array.length; i++) { + if (array[i] > max) { + max = array[i]; + } + } + return max; + } + + + private double calcMin(double[] array) { + + double min = Double.POSITIVE_INFINITY; + for (int i = 0; i < array.length; i++) { + if (array[i] < min) { + min = array[i]; + } + } + return min; + } + + + private Extrema calculateNearGlobalExtrema(double[] derivative, double maxDvValue, double minDvValue) { + + List nearGlobalDvMaximaIdx = new LinkedList<>(); + List nearGlobalDvMinimaIdx = new LinkedList<>(); + for (int i = globalStartIdx; i < globalEndIdx; i++) { + if (derivative[i] <= minDvValue * 0.8) { + nearGlobalDvMinimaIdx.add(i); + } + if (derivative[i] >= maxDvValue * 0.8) { + nearGlobalDvMaximaIdx.add(i); + } + } + + nearGlobalDvMinimaIdx = removeConsecutive(nearGlobalDvMinimaIdx); + nearGlobalDvMaximaIdx = removeConsecutive(nearGlobalDvMaximaIdx); + + return new Extrema(nearGlobalDvMaximaIdx, nearGlobalDvMinimaIdx); + } + + + private record Extrema(List maxima, List minima) { + + } + + + private Double calculateXCoordinateFromIdx(int globalMinIdx) { + + return min + ((globalMinIdx + 1) * resolution); + } + + + public static List removeConsecutive(List numbers) { + + List result = new 
ArrayList<>(); + if (numbers == null || numbers.isEmpty()) { + return result; + } + + result.add(numbers.get(0)); // Add the first number + + for (int i = 1; i < numbers.size(); i++) { + if (numbers.get(i) != numbers.get(i - 1) + 1) { + result.add(numbers.get(i)); // Add non-consecutive numbers + } + } + + return result; + } + + + public void kernelSmooth(double[] kernel) { + + double[] newFrequencies = new double[histogram.length]; + int shift = (kernel.length - 1) / 2; + for (int i = 0; i < kernel.length; i++) { + int jStart = Math.max(0, i - shift); + int jEnd = Math.min(histogram.length, histogram.length + i - shift); + for (int j = jStart; j < jEnd; j++) { + newFrequencies[j - i + shift] += kernel[i] * histogram[j]; + } + } + histogram = newFrequencies; + } + + + public double[] createGaussianKernel(int length, double stdDeviation) { + + int r = length / 2; + + int size = 2 * r + 1; + double[] kernel = new double[size]; + double sum = 0; + double b = 2 * (stdDeviation) * (stdDeviation); + double a = 1 / Math.sqrt(Math.PI * b); + for (int i = 0; i < size; i++) { + kernel[i] = a * Math.exp(-(i - r) * (i - r) / b); + sum += kernel[i]; + } + for (int i = 0; i < size; i++) { + kernel[i] /= sum; + } + return kernel; + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ReadingOrderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ReadingOrderService.java index 6d1a741..56fa167 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ReadingOrderService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ReadingOrderService.java @@ -1,17 +1,21 @@ package 
com.knecon.fforesight.service.layoutparser.processor.docstrum.service; -import java.util.ArrayList; +import java.awt.geom.Point2D; +import java.util.Collection; +import java.util.Collections; import java.util.Comparator; -import java.util.HashMap; +import java.util.LinkedList; import java.util.List; -import java.util.ListIterator; -import java.util.Map; import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; +import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.ColumnDetector; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone; import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils; +import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling; +import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation; +import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations; @Service public class ReadingOrderService { @@ -20,7 +24,7 @@ public class ReadingOrderService { public static final double MULTI_COLUMN_DETECTION_THRESHOLD = 1.5; - public List resolve(List zones, boolean xyReadingOrder) { + public List resolve(List zones, boolean xyReadingOrder, LayoutparsingVisualizations visualizations, int page, PageInformation pageInformation) { if (zones.isEmpty() || zones.size() == 1) { return zones; @@ -30,28 +34,53 @@ public class ReadingOrderService { return resolveSingleColumnReadingOrder(zones); } - Map histogram = new HashMap<>(); - for (Zone zone : zones) { - long minY = Math.round(zone.getBBox().getMinY()); - long maxY = Math.round(zone.getBBox().getMaxY()); - for (long i = minY; i <= maxY; i++) { - histogram.put(i, histogram.getOrDefault(i, 0) + 1); - } - } + var columnSeparatorLines = calculateColumns(zones); - if (histogram.values() - .stream() - .mapToInt(Integer::intValue).average() - .orElse(1) < 
MULTI_COLUMN_DETECTION_THRESHOLD) { + if (columnSeparatorLines.isEmpty()) { return resolveSingleColumnReadingOrder(zones); } else { - - return resolveMultiColumnReadingOder(zones); + for (Double columnLine : columnSeparatorLines) { + visualizations.addRulingVisualization(List.of(new Ruling(new Point2D.Double(columnLine, 0), new Point2D.Double(columnLine, 1000))), page); + } + return resolveMultiColumnReadingOder(zones, columnSeparatorLines, pageInformation); } } + private static List calculateColumns(List zones) { + + if (zones.isEmpty()) { + return Collections.emptyList(); + } + double min = zones.stream() + .mapToDouble(BoundingBox::getMinX) + .min() + .orElse(0); + double max = zones.stream() + .mapToDouble(BoundingBox::getMaxX) + .max() + .orElse(0); + double minY = zones.stream() + .mapToDouble(BoundingBox::getMinY) + .min() + .orElse(0); + double maxY = zones.stream() + .mapToDouble(BoundingBox::getMaxY) + .max() + .orElse(0); + + var columnResolver = new ColumnDetector(min, max, minY, maxY); + + zones.forEach(columnResolver::add); + columnResolver.kernelSmooth(columnResolver.createGaussianKernel(3, 1)); + + double[] derivative = columnResolver.computeDerivative(); + + return columnResolver.determineColumnsWithDerivative(derivative); + } + + private static List resolveSingleColumnReadingOrder(List zones) { zones.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) @@ -60,109 +89,47 @@ public class ReadingOrderService { } - private List resolveMultiColumnReadingOder(List zones) { + private List resolveMultiColumnReadingOder(List zones, List columnSeparatorLines, PageInformation pageInformation) { - // Simple reading order resolver for multi column page layout as described here : https://pub.towardsai.net/advanced-rag-02-unveiling-pdf-parsing-b84ae866344e - // TODO implement a more fancy reading order resolver see https://github.com/BobLd/DocumentLayoutAnalysis/blob/master/README.md#reading-order - - double minX 
= Double.POSITIVE_INFINITY; - double maxX = Double.NEGATIVE_INFINITY; - - for (Zone zone : zones) { - if (zone.getX() < minX) { - minX = zone.getX(); - } - if (zone.getX() + zone.getWidth() > maxX) { - maxX = zone.getX() + zone.getWidth(); - } + List> zonesPerColumn = new LinkedList<>(); + for (Double ignored : columnSeparatorLines) { + zonesPerColumn.add(new LinkedList<>()); } + zonesPerColumn.add(new LinkedList<>()); - double midLineXCoordinate = (minX + maxX) / 2; - - List leftOf = new ArrayList<>(); - List rightOf = new ArrayList<>(); - List middle = new ArrayList<>(); for (Zone zone : zones) { - if (zone.getX() < midLineXCoordinate && zone.getX() + zone.getWidth() < midLineXCoordinate) { - leftOf.add(zone); - } else if (zone.getX() > midLineXCoordinate && zone.getX() + zone.getWidth() > midLineXCoordinate) { - rightOf.add(zone); + boolean zoneAdded = false; + if (zone.getMinY() < pageInformation.heightRot() * 0.5) { + // above middle sort into column fitting left x value + for (int col = 0; col < columnSeparatorLines.size(); col++) { + if (columnSeparatorLines.get(col) > zone.getMinX()) { + zonesPerColumn.get(col).add(zone); + zoneAdded = true; + break; + } + } } else { - middle.add(zone); + // below middle sort into column fitting right x value + for (int col = 0; col < columnSeparatorLines.size(); col++) { + if (columnSeparatorLines.get(col) > zone.getMaxX()) { + zonesPerColumn.get(col).add(zone); + zoneAdded = true; + break; + } + } } + if (!zoneAdded) { + zonesPerColumn.get(zonesPerColumn.size() - 1).add(zone); + } + } - leftOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) - .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); + zonesPerColumn.forEach(list -> list.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) + .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)))); 
- rightOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) - .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); - - middle.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) - .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); -/* - List leftNotIntersecting = new ArrayList<>(); - for (Zone leftZone : leftOf) { - boolean intersects = false; - for (Zone rightZone : rightOf) { - if (leftZone.intersectsY(rightZone)) { - intersects = true; - break; - } - // early stopping - if (rightZone.getBBox().getMinY() > leftZone.getBBox().getMaxY()) { - break; - } - } - if (!intersects) { - leftNotIntersecting.add(leftZone); - } - } - - List rightNotIntersecting = new ArrayList<>(); - for (Zone rightZone : rightOf) { - boolean intersects = false; - for (Zone leftZone : leftOf) { - if (rightZone.intersectsY(leftZone)) { - intersects = true; - break; - } - // early stopping - if (leftZone.getBBox().getMinY() > rightZone.getBBox().getMaxY()) { - break; - } - } - if (!intersects) { - rightNotIntersecting.add(rightZone); - } - } - - leftOf.removeAll(leftNotIntersecting); - rightOf.removeAll(rightNotIntersecting); - - middle.addAll(leftNotIntersecting); - middle.addAll(rightNotIntersecting); -*/ - List sortedZones = new ArrayList<>(); - sortedZones.addAll(leftOf); - sortedZones.addAll(rightOf); - - ListIterator itty = middle.listIterator(); - - while (itty.hasNext()) { - Zone current = itty.next(); - for (int i = 0; i < sortedZones.size(); i++) { - if (current.getY() < sortedZones.get(i).getY()) { - sortedZones.add(i, current); - itty.remove(); - break; - } - } - } - - sortedZones.addAll(middle); - - return sortedZones; + return zonesPerColumn.stream() + .flatMap(Collection::stream) + .toList(); } } diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ZoneBuilderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ZoneBuilderService.java index 3061bfc..0f70884 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ZoneBuilderService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/docstrum/service/ZoneBuilderService.java @@ -82,7 +82,7 @@ public class ZoneBuilderService { } - public List mergeZonesUntilConvergence(List zones, double characterSpacing, double lineSpacing, CleanRulings rulings) { + public List mergeZonesAgain(List zones, double characterSpacing, double lineSpacing, CleanRulings rulings) { double minHorizontalDistance = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER; double maxVerticalDistance = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/StringFrequencyCounter.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/StringFrequencyCounter.java index 0fd707d..e02161f 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/StringFrequencyCounter.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/StringFrequencyCounter.java @@ -46,7 +46,7 @@ public class StringFrequencyCounter { double total = countPerValue.values() .stream() .mapToDouble(v -> v).sum(); - if ((double) standard / total > 0.85) { 
+ if ((double) standard / total > 0.75) { return mostPopular.getKey(); } countPerValue.remove(mostPopular.getKey()); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java index a5c8959..13e0975 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java @@ -17,6 +17,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRul import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation; import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations; import lombok.RequiredArgsConstructor; @@ -35,11 +36,12 @@ public class DocstrumBlockificationService { CleanRulings rulings, boolean xyOrder, LayoutparsingVisualizations visualizations, - LayoutParsingType layoutParsingType) { + LayoutParsingType layoutParsingType, + PageInformation pageInformation) { CleanRulings usedRulings = rulings.withoutTextRulings(); - var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder, usedRulings, visualizations); + var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder, usedRulings, 
visualizations, pageInformation); if (!textPositions.isEmpty()) { visualizations.addZoneVisualizations(zones, textPositions.get(0).getPage()); @@ -58,8 +60,10 @@ public class DocstrumBlockificationService { mergeIntersectingBlocks(classificationPage, usedRulings, 2f, 2f); - if (layoutParsingType == LayoutParsingType.REDACT_MANAGER) { - combineBlocks(classificationPage); + combineBlocksBasic(classificationPage); + + if (layoutParsingType == LayoutParsingType.REDACT_MANAGER || layoutParsingType == LayoutParsingType.DOCUMINE) { + combineBlocksSpecial(classificationPage); } if (layoutParsingType == LayoutParsingType.CLARIFYND) { @@ -105,7 +109,60 @@ public class DocstrumBlockificationService { } - public void combineBlocks(ClassificationPage page) { + public void combineBlocksBasic(ClassificationPage page) { + + TextPageBlock previous = new TextPageBlock(); + ListIterator itty = page.getTextBlocks().listIterator(); + CleanRulings usedRulings = page.getCleanRulings().withoutTextRulings(); + while (itty.hasNext()) { + + AbstractPageBlock block = itty.next(); + if (block instanceof TablePageBlock) { + previous = new TextPageBlock(); + continue; + } + TextPageBlock current = (TextPageBlock) block; + + if (previous != null && !previous.getSequences().isEmpty()) { + + if (current.getDir() != previous.getDir() || usedRulings.lineBetween(current, previous)) { + previous = current; + continue; + } + + if (current.intersectsY(previous) && current.horizontalDistance(previous) < 20) { + previous = combineBlocksAndResetIterator(previous, current, itty, true); + continue; + } + + if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, page)) { + previous = combineBlocksAndResetIterator(previous, current, itty, true); + continue; + } + + if (previous.intersects(current)) { + previous = combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate()); + continue; + } + + // merge headlines + if (current.getDir() == previous.getDir() && 
(Math.abs(current.getHighestFontSize() - previous.getHighestFontSize()) < 1.1f + && current.getHighestFontSize() > 12 + && previous.getHighestFontSize() > 12 + && current.getMostPopularWordStyle().equals(previous.getMostPopularWordStyle()) + && current.intersects(previous, current.getMostPopularWordHeight(), current.getMostPopularWordHeight()))) { + previous = combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate()); + continue; + } + } + previous = current; + } + + mergeIntersectingBlocks(page, usedRulings, 0, 6.5f); + } + + + public void combineBlocksSpecial(ClassificationPage page) { TextPageBlock previous = new TextPageBlock(); ListIterator itty = page.getTextBlocks().listIterator(); @@ -264,9 +321,7 @@ public class DocstrumBlockificationService { continue; } - if (current.getDir() == inner.getDir() && (Math.abs(current.getHighestFontSize() - inner.getHighestFontSize()) < 1.1f && current.getHighestFontSize() > 12 && inner.getHighestFontSize() > 12 // - && current.getMostPopularWordStyle().equals(inner.getMostPopularWordStyle()) // - && current.intersects(inner, current.getMostPopularWordHeight(), current.getMostPopularWordHeight()))) { + if (current.getDir() == inner.getDir() && current.intersects(inner, xThreshold, yThreshold)){ boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate(); current.getSequences().addAll(inner.getSequences()); @@ -277,8 +332,7 @@ public class DocstrumBlockificationService { itty.set(current); } } - } - var blocksIterator = blocks.iterator(); + } var blocksIterator = blocks.iterator(); while (blocksIterator.hasNext()) { if (blocksIterator.next() == null) { blocksIterator.remove(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java 
b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java index 36ee3eb..17252a5 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java @@ -181,10 +181,7 @@ public class DocumentGraphFactory { Page page = context.getPage(textBlocks.get(0).getPage()); Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build(); - AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), - footer, - context, - page); + AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), footer, context, page); List tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer); footer.setTreeId(tocId); footer.setLeafTextBlock(textBlock); @@ -273,7 +270,8 @@ public class DocumentGraphFactory { return pages.keySet() .stream() .filter(page -> page.getNumber() == pageIndex) - .findFirst().orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex))); + .findFirst() + .orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex))); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PageInformation.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PageInformation.java index 0b53d74..2403255 100644 --- 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PageInformation.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/PageInformation.java @@ -11,8 +11,8 @@ public record PageInformation(Rectangle2D mediabox, int number, int rotationDegr PDRectangle mediaBox = page.getMediaBox(); return new PageInformation(new Rectangle2D.Double(mediaBox.getLowerLeftX(), mediaBox.getLowerLeftY(), mediaBox.getWidth(), mediaBox.getHeight()), - pageNum, - page.getRotation()); + pageNum, + page.getRotation()); } @@ -22,6 +22,15 @@ public record PageInformation(Rectangle2D mediabox, int number, int rotationDegr } + public double heightRot() { + + if (rotationDegrees == 90 || rotationDegrees == 270) { + return width(); + } + return height(); + } + + public double width() { return mediabox.getWidth(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionSequenceComparator.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionSequenceComparator.java index b002dbc..52974d3 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionSequenceComparator.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/utils/TextPositionSequenceComparator.java @@ -39,21 +39,21 @@ public class TextPositionSequenceComparator implements Comparator= pos1YTop && pos2YBottom <= pos1YBottom || pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom) { - return Float.compare(x1, x2); + return Double.compare(x1, x2); } else if (pos1YBottom < pos2YBottom) { return -1; } else { diff --git 
a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutparsingVisualizations.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutparsingVisualizations.java index 1ca34b5..cf8ee8a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutparsingVisualizations.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/visualization/LayoutparsingVisualizations.java @@ -73,11 +73,11 @@ public class LayoutparsingVisualizations { boolean active; final Visualizations words = Visualizations.builder().layer(ContentStreams.WORDS).build(); - final Visualizations lines = Visualizations.builder().layer(ContentStreams.LINES).build(); - final Visualizations zones = Visualizations.builder().layer(ContentStreams.ZONES).layerVisibilityDefaultValue(true).build(); + final Visualizations lines = Visualizations.builder().layer(ContentStreams.LINES).layerVisibilityDefaultValue(false).build(); + final Visualizations zones = Visualizations.builder().layer(ContentStreams.ZONES).layerVisibilityDefaultValue(false).build(); final Visualizations mainBody = Visualizations.builder().layer(ContentStreams.MAIN_BODY).build(); final Visualizations clean_rulings = Visualizations.builder().layer(ContentStreams.CLEAN_RULINGS).build(); - final Visualizations rulings = Visualizations.builder().layer(ContentStreams.RULINGS).build(); + final Visualizations rulings = Visualizations.builder().layer(ContentStreams.RULINGS).layerVisibilityDefaultValue(true).build(); final Visualizations cells = Visualizations.builder().layer(ContentStreams.CELLS).build(); final Visualizations markedContent = Visualizations.builder().layer(ContentStreams.MARKED_CONTENT).build(); final 
Visualizations neighbours = Visualizations.builder().layer(ContentStreams.NEIGHBOURS).build(); @@ -139,7 +139,7 @@ public class LayoutparsingVisualizations { visualizationsOnPage.getColoredLines() .addAll(rulings .stream() - .map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 0.5f)) + .map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 2f)) .toList()); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java index ed044d6..ea92acb 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java @@ -30,21 +30,21 @@ public class LayoutparserEnd2EndTest extends AbstractTest { @Autowired private LayoutParsingPipeline layoutParsingPipeline; - @Disabled @Test +// @Disabled public void testLayoutParserEndToEnd() { - String filePath = "files/Minimal Examples/RotateTextWithRulingsTestFile.pdf"; + String filePath = "/home/kschuettler/Dokumente/TestFiles/certificates/origin/ecm.pdf"; runForFile(filePath); } @Test - @Disabled +// @Disabled @SneakyThrows public void testLayoutParserEndToEndWithFolder() { - String folder = "/home/kschuettler/Dokumente/TestFiles/certificates/certificates-ocred"; + String folder = "/home/kschuettler/Dokumente/TestFiles/certificates/origin"; List pdfFiles = Files.walk(Path.of(folder)) .filter(path -> path.getFileName().toString().endsWith(".pdf")) .sorted(Comparator.comparing(Path::getFileName))