akra-certificates: finetuning for certificates
This commit is contained in:
parent eb2ea755a5
commit e8513d05e2
@@ -266,7 +266,8 @@ public class LayoutParsingPipeline {
classificationDocument.getVisualizations().addRulingVisualization(stripper.getRulings(), pageNumber);
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), stripper.getRulings());

List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), PageInformation.fromPDPage(pageNumber, pdPage));
PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, pdPage);
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), pageInformation);
classificationDocument.getVisualizations().addCellVisualizations(emptyTableCells, pageNumber);

TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings);
@@ -283,9 +284,9 @@ public class LayoutParsingPipeline {
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getVisualizations());
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG ->
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getVisualizations(), layoutParsingType);
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getVisualizations(), layoutParsingType, pageInformation);
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getVisualizations(), layoutParsingType);
docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getVisualizations(), layoutParsingType, pageInformation);
};

classificationPage.setCleanRulings(cleanRulings);
@@ -18,6 +18,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRul
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;

import lombok.RequiredArgsConstructor;
@@ -33,7 +34,11 @@ public class DocstrumSegmentationService {
private final ReadingOrderService readingOrderService;

public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean xyOrder, CleanRulings usedRulings, LayoutparsingVisualizations visualizations) {
public List<Zone> segmentPage(List<TextPositionSequence> textPositions,
boolean xyOrder,
CleanRulings usedRulings,
LayoutparsingVisualizations visualizations,
PageInformation pageInformation) {

List<Zone> zones = new ArrayList<>();
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.ZERO));
@@ -41,7 +46,7 @@ public class DocstrumSegmentationService {
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.HALF_CIRCLE));
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.THREE_QUARTER_CIRCLE));

return readingOrderService.resolve(zones, xyOrder);
return readingOrderService.resolve(zones, xyOrder, visualizations, textPositions.get(0).getPage(), pageInformation);
}
@@ -63,10 +68,9 @@ public class DocstrumSegmentationService {
double lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20);

List<Line> lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing, rulings);
List<Zone> zones = zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing, rulings);
List<Zone> zones = zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing, rulings);
// return zones;
return zoneBuilderService.mergeZonesUntilConvergence(zones, characterSpacing, lineSpacing, rulings);
return zoneBuilderService.mergeZonesAgain(zones, characterSpacing, lineSpacing, rulings);
}

}
@@ -0,0 +1,319 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

public class ColumnDetector {

public static final double MAX_VALUE_THRESHOLD = 0.5;
final static int bins_num = 128;
final static int globalStartIdx = 0; // ignore outer parts completely, we don't expect columns there
final static int globalEndIdx = bins_num; // i chose 7, since thirds seems a likely split for columns, therefore divided by 6 would eliminate those.
public static final double DERIVATIVE_ZERO_THRESHOLD = 1e-10;
public static final double MINIMUM_THRESHOLD_FOR_COLUMNS = 0.05;
double minY;
double maxY;
double midY;
double[] histogram;
double min;
double max;
double resolution;
double sum;
int N;

public ColumnDetector(double min, double max, double minY, double maxY) {

this.min = min;
this.max = max;
this.minY = minY;
this.maxY = maxY;
this.midY = maxY - minY;
this.resolution = (max - min) / bins_num;
this.histogram = new double[bins_num];
}

public void add(BoundingBox zone) {

N++;
double weight = computeWeight(zone);
int start = (int) ((zone.getMinX() - min) / resolution);
int end = (int) ((zone.getMaxX() - min) / resolution);
for (int i = start; i < end; i++) {
histogram[i] += weight;
sum += histogram[i];
}
}

private double computeWeight(BoundingBox zone) {

double areaWeight = zone.getBBox().getHeight();

double relativeDistance = relativeDistanceToMiddle(zone.getBBox().getCenterY());

double distanceWeight;
if (relativeDistance < 0.6) {
distanceWeight = 1;
} else if (relativeDistance < 0.8) {
distanceWeight = 0.8;
} else {
distanceWeight = 0.1;
}

return areaWeight * distanceWeight;
}

private double relativeDistanceToMiddle(double y) {

double range = (maxY - minY) / 2;
double mid = minY + range;

return Math.abs(y - mid) / range;
}

public double[] computeDerivative() {

int length = histogram.length;
double[] derivative = new double[length];

for (int i = 0; i < length; i++) {
if (i == 0) {
derivative[i] = (histogram[i + 1] - histogram[i]) / resolution;
} else if (i == length - 1) {
derivative[i] = (histogram[i] - histogram[i - 1]) / resolution;
} else {
derivative[i] = (histogram[i + 1] - histogram[i - 1]) / (2 * resolution);
}
}

return derivative;
}

public double calcMean(double[] arr, int start, int end) {

if (start == end) {
return 0;
}
double sum = 0;
for (int i = start; i < end; i++) {
sum += arr[i];
}
return sum / (end - start);
}

/*
Find columns by locating all local maxima/minima of the derivative and filtering them for the ones with the biggest values.
For each minimum found, we step to the right until we hit a 0 in the derivative; this indicates a minimum in the main histogram. If this minimum is below a threshold, it is deemed a column divider.
The same goes for maxima, but stepping to the left, since a minimum in the function will always be to the left of a maximum in its derivative.
*/
public List<Double> determineColumnsWithDerivative(double[] derivative) {

assert derivative.length == histogram.length;

Set<Integer> columnIndices = new HashSet<>();
double mean = calcMean(histogram, 0, histogram.length);
double maxDvValue = calcMax(derivative);
double minDvValue = calcMin(derivative);

if (maxDvValue - minDvValue < mean * MAX_VALUE_THRESHOLD) {
return Collections.emptyList();
}

Extrema derivativeExtrema = calculateNearGlobalExtrema(derivative, maxDvValue, minDvValue);

List<Integer> columnsRightOfMinima = findZerosToTheRightOfMinima(derivative, derivativeExtrema.minima(), mean);
columnIndices.addAll(columnsRightOfMinima);

List<Integer> columnsLeftOfMaxima = findZerosToTheLeftOfMaxima(derivative, derivativeExtrema.maxima(), mean);
columnIndices.addAll(columnsLeftOfMaxima);

return columnIndices.stream()
.sorted(Comparator.naturalOrder())
.map(this::calculateXCoordinateFromIdx)
.toList();
}

private List<Integer> findZerosToTheLeftOfMaxima(double[] derivative, List<Integer> derivativeMaxima, double mean) {

List<Integer> columnsLeftOfMaxima = new ArrayList<>();

for (int i = 0; i < derivativeMaxima.size(); i++) {
List<Integer> consecutiveZeroes = new LinkedList<>();
boolean maximumFound = false;
int maximaIdx = derivativeMaxima.get(i) - 1; // the highest derivative will always be at least one step away from the lowest value.
int endIdx = (int) Math.max(globalStartIdx,
Math.min(maximaIdx - 1,
maximaIdx - 0.1 * bins_num)); // search through 10% of the array to the left, but at least one step and at most to the left edge

for (int j = maximaIdx; j >= endIdx; j--) {
if (derivative[j] < DERIVATIVE_ZERO_THRESHOLD) {
maximumFound = true;
consecutiveZeroes.add(j);
} else if (maximumFound) {
break;
}
}
if (maximumFound) {
int midIdx = consecutiveZeroes.size() / 2;
int middleMinimumIdx = consecutiveZeroes.get(midIdx);
if (histogram[middleMinimumIdx] < mean * MINIMUM_THRESHOLD_FOR_COLUMNS) {
columnsLeftOfMaxima.add(middleMinimumIdx);
}
}
}
return columnsLeftOfMaxima;
}

private List<Integer> findZerosToTheRightOfMinima(double[] derivative, List<Integer> derivativeMinima, double mean) {

List<Integer> columnIndices = new LinkedList<>();
for (int i = 0; i < derivativeMinima.size(); i++) {
List<Integer> consecutiveZeroes = new LinkedList<>();
boolean minimumFound = false;
int minimaIdx = derivativeMinima.get(i) + 1; // the lowest derivative will always be at least one step earlier than the lowest value.
int endIdx = (int) Math.min(globalEndIdx,
Math.max(minimaIdx + 1,
minimaIdx + 0.1 * bins_num)); // search through 10% of the array to the right, but at least one step and at most to the right edge

for (int j = minimaIdx; j < endIdx; j++) {
if (derivative[j] < DERIVATIVE_ZERO_THRESHOLD) {
minimumFound = true;
consecutiveZeroes.add(j);
} else if (minimumFound) {
break;
}
}
if (minimumFound) {
int midIdx = consecutiveZeroes.size() / 2;
int middleMinimumIdx = consecutiveZeroes.get(midIdx);
if (histogram[middleMinimumIdx] < mean * MINIMUM_THRESHOLD_FOR_COLUMNS) {
columnIndices.add(middleMinimumIdx);
}
}
}
return columnIndices;
}

private double calcMax(double[] array) {

double max = Double.NEGATIVE_INFINITY;
for (int i = 0; i < array.length; i++) {
if (array[i] > max) {
max = array[i];
}
}
return max;
}

private double calcMin(double[] array) {

double min = Double.POSITIVE_INFINITY;
for (int i = 0; i < array.length; i++) {
if (array[i] < min) {
min = array[i];
}
}
return min;
}

private Extrema calculateNearGlobalExtrema(double[] derivative, double maxDvValue, double minDvValue) {

List<Integer> nearGlobalDvMaximaIdx = new LinkedList<>();
List<Integer> nearGlobalDvMinimaIdx = new LinkedList<>();
for (int i = globalStartIdx; i < globalEndIdx; i++) {
if (derivative[i] <= minDvValue * 0.8) {
nearGlobalDvMinimaIdx.add(i);
}
if (derivative[i] >= maxDvValue * 0.8) {
nearGlobalDvMaximaIdx.add(i);
}
}

nearGlobalDvMinimaIdx = removeConsecutive(nearGlobalDvMinimaIdx);
nearGlobalDvMaximaIdx = removeConsecutive(nearGlobalDvMaximaIdx);

return new Extrema(nearGlobalDvMaximaIdx, nearGlobalDvMinimaIdx);
}

private record Extrema(List<Integer> maxima, List<Integer> minima) {

}

private Double calculateXCoordinateFromIdx(int globalMinIdx) {

return min + ((globalMinIdx + 1) * resolution);
}

public static List<Integer> removeConsecutive(List<Integer> numbers) {

List<Integer> result = new ArrayList<>();
if (numbers == null || numbers.isEmpty()) {
return result;
}

result.add(numbers.get(0)); // Add the first number

for (int i = 1; i < numbers.size(); i++) {
if (numbers.get(i) != numbers.get(i - 1) + 1) {
result.add(numbers.get(i)); // Add non-consecutive numbers
}
}

return result;
}

public void kernelSmooth(double[] kernel) {

double[] newFrequencies = new double[histogram.length];
int shift = (kernel.length - 1) / 2;
for (int i = 0; i < kernel.length; i++) {
int jStart = Math.max(0, i - shift);
int jEnd = Math.min(histogram.length, histogram.length + i - shift);
for (int j = jStart; j < jEnd; j++) {
newFrequencies[j - i + shift] += kernel[i] * histogram[j];
}
}
histogram = newFrequencies;
}

public double[] createGaussianKernel(int length, double stdDeviation) {

int r = length / 2;

int size = 2 * r + 1;
double[] kernel = new double[size];
double sum = 0;
double b = 2 * (stdDeviation) * (stdDeviation);
double a = 1 / Math.sqrt(Math.PI * b);
for (int i = 0; i < size; i++) {
kernel[i] = a * Math.exp(-(i - r) * (i - r) / b);
sum += kernel[i];
}
for (int i = 0; i < size; i++) {
kernel[i] /= sum;
}
return kernel;
}

}
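
For orientation, a minimal sketch (not part of the commit) of how the detector above is driven, mirroring ReadingOrderService.calculateColumns later in this diff; the page bounds are hypothetical example values, and `zones` stands for the segmented Zone objects, which expose the BoundingBox accessors used by add():

// Sketch only, illustrative values; mirrors calculateColumns below.
ColumnDetector detector = new ColumnDetector(0, 595, 0, 842); // assumed page: x in [0, 595], y in [0, 842]
zones.forEach(detector::add); // weighted histogram over each zone's x extent
detector.kernelSmooth(detector.createGaussianKernel(3, 1)); // kernel is roughly {0.27, 0.45, 0.27}
double[] derivative = detector.computeDerivative(); // central differences over the smoothed histogram
List<Double> separators = detector.determineColumnsWithDerivative(derivative);
// separators holds the x coordinates of detected column dividers; empty means single-column.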
@@ -1,17 +1,21 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;

import java.util.ArrayList;
import java.awt.geom.Point2D;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;

import org.springframework.stereotype.Service;

import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.ColumnDetector;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;

@Service
public class ReadingOrderService {
@@ -20,7 +24,7 @@ public class ReadingOrderService {
public static final double MULTI_COLUMN_DETECTION_THRESHOLD = 1.5;

public List<Zone> resolve(List<Zone> zones, boolean xyReadingOrder) {
public List<Zone> resolve(List<Zone> zones, boolean xyReadingOrder, LayoutparsingVisualizations visualizations, int page, PageInformation pageInformation) {

if (zones.isEmpty() || zones.size() == 1) {
return zones;
@@ -30,28 +34,53 @@ public class ReadingOrderService {
return resolveSingleColumnReadingOrder(zones);
}

Map<Long, Integer> histogram = new HashMap<>();
for (Zone zone : zones) {
long minY = Math.round(zone.getBBox().getMinY());
long maxY = Math.round(zone.getBBox().getMaxY());
for (long i = minY; i <= maxY; i++) {
histogram.put(i, histogram.getOrDefault(i, 0) + 1);
}
}
var columnSeparatorLines = calculateColumns(zones);

if (histogram.values()
.stream()
.mapToInt(Integer::intValue).average()
.orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) {
if (columnSeparatorLines.isEmpty()) {
return resolveSingleColumnReadingOrder(zones);
} else {

return resolveMultiColumnReadingOder(zones);
for (Double columnLine : columnSeparatorLines) {
visualizations.addRulingVisualization(List.of(new Ruling(new Point2D.Double(columnLine, 0), new Point2D.Double(columnLine, 1000))), page);
}
return resolveMultiColumnReadingOder(zones, columnSeparatorLines, pageInformation);
}

}

private static List<Double> calculateColumns(List<Zone> zones) {

if (zones.isEmpty()) {
return Collections.emptyList();
}
double min = zones.stream()
.mapToDouble(BoundingBox::getMinX)
.min()
.orElse(0);
double max = zones.stream()
.mapToDouble(BoundingBox::getMaxX)
.max()
.orElse(0);
double minY = zones.stream()
.mapToDouble(BoundingBox::getMinY)
.min()
.orElse(0);
double maxY = zones.stream()
.mapToDouble(BoundingBox::getMaxY)
.max()
.orElse(0);

var columnResolver = new ColumnDetector(min, max, minY, maxY);

zones.forEach(columnResolver::add);
columnResolver.kernelSmooth(columnResolver.createGaussianKernel(3, 1));

double[] derivative = columnResolver.computeDerivative();

return columnResolver.determineColumnsWithDerivative(derivative);
}

private static List<Zone> resolveSingleColumnReadingOrder(List<Zone> zones) {

zones.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
@@ -60,109 +89,47 @@ public class ReadingOrderService {
}

private List<Zone> resolveMultiColumnReadingOder(List<Zone> zones) {
private List<Zone> resolveMultiColumnReadingOder(List<Zone> zones, List<Double> columnSeparatorLines, PageInformation pageInformation) {

// Simple reading order resolver for multi column page layout as described here : https://pub.towardsai.net/advanced-rag-02-unveiling-pdf-parsing-b84ae866344e
// TODO implement a more fancy reading order resolver see https://github.com/BobLd/DocumentLayoutAnalysis/blob/master/README.md#reading-order

double minX = Double.POSITIVE_INFINITY;
double maxX = Double.NEGATIVE_INFINITY;

for (Zone zone : zones) {
if (zone.getX() < minX) {
minX = zone.getX();
}
if (zone.getX() + zone.getWidth() > maxX) {
maxX = zone.getX() + zone.getWidth();
}
List<List<Zone>> zonesPerColumn = new LinkedList<>();
for (Double ignored : columnSeparatorLines) {
zonesPerColumn.add(new LinkedList<>());
}
zonesPerColumn.add(new LinkedList<>());

double midLineXCoordinate = (minX + maxX) / 2;

List<Zone> leftOf = new ArrayList<>();
List<Zone> rightOf = new ArrayList<>();
List<Zone> middle = new ArrayList<>();
for (Zone zone : zones) {
if (zone.getX() < midLineXCoordinate && zone.getX() + zone.getWidth() < midLineXCoordinate) {
leftOf.add(zone);
} else if (zone.getX() > midLineXCoordinate && zone.getX() + zone.getWidth() > midLineXCoordinate) {
rightOf.add(zone);
boolean zoneAdded = false;
if (zone.getMinY() < pageInformation.heightRot() * 0.5) {
// above middle sort into column fitting left x value
for (int col = 0; col < columnSeparatorLines.size(); col++) {
if (columnSeparatorLines.get(col) > zone.getMinX()) {
zonesPerColumn.get(col).add(zone);
zoneAdded = true;
break;
}
}
} else {
middle.add(zone);
// below middle sort into column fitting right x value
for (int col = 0; col < columnSeparatorLines.size(); col++) {
if (columnSeparatorLines.get(col) > zone.getMaxX()) {
zonesPerColumn.get(col).add(zone);
zoneAdded = true;
break;
}
}
}
if (!zoneAdded) {
zonesPerColumn.get(zonesPerColumn.size() - 1).add(zone);
}

}

leftOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
zonesPerColumn.forEach(list -> list.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))));

rightOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));

middle.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
/*
List<Zone> leftNotIntersecting = new ArrayList<>();
for (Zone leftZone : leftOf) {
boolean intersects = false;
for (Zone rightZone : rightOf) {
if (leftZone.intersectsY(rightZone)) {
intersects = true;
break;
}
// early stopping
if (rightZone.getBBox().getMinY() > leftZone.getBBox().getMaxY()) {
break;
}
}
if (!intersects) {
leftNotIntersecting.add(leftZone);
}
}

List<Zone> rightNotIntersecting = new ArrayList<>();
for (Zone rightZone : rightOf) {
boolean intersects = false;
for (Zone leftZone : leftOf) {
if (rightZone.intersectsY(leftZone)) {
intersects = true;
break;
}
// early stopping
if (leftZone.getBBox().getMinY() > rightZone.getBBox().getMaxY()) {
break;
}
}
if (!intersects) {
rightNotIntersecting.add(rightZone);
}
}

leftOf.removeAll(leftNotIntersecting);
rightOf.removeAll(rightNotIntersecting);

middle.addAll(leftNotIntersecting);
middle.addAll(rightNotIntersecting);
*/
List<Zone> sortedZones = new ArrayList<>();
sortedZones.addAll(leftOf);
sortedZones.addAll(rightOf);

ListIterator<Zone> itty = middle.listIterator();

while (itty.hasNext()) {
Zone current = itty.next();
for (int i = 0; i < sortedZones.size(); i++) {
if (current.getY() < sortedZones.get(i).getY()) {
sortedZones.add(i, current);
itty.remove();
break;
}
}
}

sortedZones.addAll(middle);

return sortedZones;
return zonesPerColumn.stream()
.flatMap(Collection::stream)
.toList();
}

}
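
A short worked example (illustrative values only, not part of the commit) of the column bucketing implemented above, with one separator line and a page where heightRot() is 842:

// columnSeparatorLines = [300.0], upper/lower split at y = 421
// Zone A (minX 60, maxX 280, minY 100): upper half, 300 > minX, goes into bucket 0
// Zone B (minX 320, maxX 540, minY 100): upper half, 300 < minX, falls through to the last bucket
// Zone C (minX 280, maxX 520, minY 600): lower half, compared by maxX, 300 < 520, last bucket
// Zone D (minX 60, maxX 250, minY 600): lower half, 300 > maxX, goes into bucket 0
// Each bucket is then sorted top to bottom (y, then x) and the buckets are concatenated: A, D, B, C.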
@@ -82,7 +82,7 @@ public class ZoneBuilderService {
}

public List<Zone> mergeZonesUntilConvergence(List<Zone> zones, double characterSpacing, double lineSpacing, CleanRulings rulings) {
public List<Zone> mergeZonesAgain(List<Zone> zones, double characterSpacing, double lineSpacing, CleanRulings rulings) {

double minHorizontalDistance = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER;
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER;
@@ -46,7 +46,7 @@ public class StringFrequencyCounter {
double total = countPerValue.values()
.stream()
.mapToDouble(v -> v).sum();
if ((double) standard / total > 0.85) {
if ((double) standard / total > 0.75) {
return mostPopular.getKey();
}
countPerValue.remove(mostPopular.getKey());
@@ -17,6 +17,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRul
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;

import lombok.RequiredArgsConstructor;
@@ -35,11 +36,12 @@ public class DocstrumBlockificationService {
CleanRulings rulings,
boolean xyOrder,
LayoutparsingVisualizations visualizations,
LayoutParsingType layoutParsingType) {
LayoutParsingType layoutParsingType,
PageInformation pageInformation) {

CleanRulings usedRulings = rulings.withoutTextRulings();

var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder, usedRulings, visualizations);
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder, usedRulings, visualizations, pageInformation);

if (!textPositions.isEmpty()) {
visualizations.addZoneVisualizations(zones, textPositions.get(0).getPage());
@@ -58,8 +60,10 @@ public class DocstrumBlockificationService {

mergeIntersectingBlocks(classificationPage, usedRulings, 2f, 2f);

if (layoutParsingType == LayoutParsingType.REDACT_MANAGER) {
combineBlocks(classificationPage);
combineBlocksBasic(classificationPage);

if (layoutParsingType == LayoutParsingType.REDACT_MANAGER || layoutParsingType == LayoutParsingType.DOCUMINE) {
combineBlocksSpecial(classificationPage);
}

if (layoutParsingType == LayoutParsingType.CLARIFYND) {
@@ -105,7 +109,60 @@ public class DocstrumBlockificationService {
}

public void combineBlocks(ClassificationPage page) {
public void combineBlocksBasic(ClassificationPage page) {

TextPageBlock previous = new TextPageBlock();
ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator();
CleanRulings usedRulings = page.getCleanRulings().withoutTextRulings();
while (itty.hasNext()) {

AbstractPageBlock block = itty.next();
if (block instanceof TablePageBlock) {
previous = new TextPageBlock();
continue;
}
TextPageBlock current = (TextPageBlock) block;

if (previous != null && !previous.getSequences().isEmpty()) {

if (current.getDir() != previous.getDir() || usedRulings.lineBetween(current, previous)) {
previous = current;
continue;
}

if (current.intersectsY(previous) && current.horizontalDistance(previous) < 20) {
previous = combineBlocksAndResetIterator(previous, current, itty, true);
continue;
}

if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, page)) {
previous = combineBlocksAndResetIterator(previous, current, itty, true);
continue;
}

if (previous.intersects(current)) {
previous = combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
continue;
}

// merge headlines
if (current.getDir() == previous.getDir() && (Math.abs(current.getHighestFontSize() - previous.getHighestFontSize()) < 1.1f
&& current.getHighestFontSize() > 12
&& previous.getHighestFontSize() > 12
&& current.getMostPopularWordStyle().equals(previous.getMostPopularWordStyle())
&& current.intersects(previous, current.getMostPopularWordHeight(), current.getMostPopularWordHeight()))) {
previous = combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
continue;
}
}
previous = current;
}

mergeIntersectingBlocks(page, usedRulings, 0, 6.5f);
}

public void combineBlocksSpecial(ClassificationPage page) {

TextPageBlock previous = new TextPageBlock();
ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator();
@@ -264,9 +321,7 @@ public class DocstrumBlockificationService {
continue;
}

if (current.getDir() == inner.getDir() && (Math.abs(current.getHighestFontSize() - inner.getHighestFontSize()) < 1.1f && current.getHighestFontSize() > 12 && inner.getHighestFontSize() > 12 //
&& current.getMostPopularWordStyle().equals(inner.getMostPopularWordStyle()) //
&& current.intersects(inner, current.getMostPopularWordHeight(), current.getMostPopularWordHeight()))) {
if (current.getDir() == inner.getDir() && current.intersects(inner, xThreshold, yThreshold)) {

boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();
current.getSequences().addAll(inner.getSequences());
@@ -277,8 +332,7 @@ public class DocstrumBlockificationService {
itty.set(current);
}
}
}
var blocksIterator = blocks.iterator();
} var blocksIterator = blocks.iterator();
while (blocksIterator.hasNext()) {
if (blocksIterator.next() == null) {
blocksIterator.remove();
@@ -181,10 +181,7 @@ public class DocumentGraphFactory {

Page page = context.getPage(textBlocks.get(0).getPage());
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks),
footer,
context,
page);
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), footer, context, page);
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
footer.setTreeId(tocId);
footer.setLeafTextBlock(textBlock);
@@ -273,7 +270,8 @@ public class DocumentGraphFactory {
return pages.keySet()
.stream()
.filter(page -> page.getNumber() == pageIndex)
.findFirst().orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex)));
.findFirst()
.orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex)));
}

}
@@ -11,8 +11,8 @@ public record PageInformation(Rectangle2D mediabox, int number, int rotationDegr

PDRectangle mediaBox = page.getMediaBox();
return new PageInformation(new Rectangle2D.Double(mediaBox.getLowerLeftX(), mediaBox.getLowerLeftY(), mediaBox.getWidth(), mediaBox.getHeight()),
pageNum,
page.getRotation());
pageNum,
page.getRotation());
}
@@ -22,6 +22,15 @@ public record PageInformation(Rectangle2D mediabox, int number, int rotationDegr
}

public double heightRot() {

if (rotationDegrees == 90 || rotationDegrees == 270) {
return width();
}
return height();
}

public double width() {

return mediabox.getWidth();
@@ -39,21 +39,21 @@ public class TextPositionSequenceComparator implements Comparator<TextPositionSe
}

// get the text direction adjusted coordinates
float x1 = pos1.getMinXDirAdj();
float x2 = pos2.getMinXDirAdj();
double x1 = pos1.getBBox().getX();
double x2 = pos2.getBBox().getX();

float pos1YBottom = pos1.getMaxYDirAdj();
float pos2YBottom = pos2.getMaxYDirAdj();
double pos1YBottom = pos1.getBBox().getMaxY();
double pos2YBottom = pos2.getBBox().getMaxY();

// note that the coordinates have been adjusted so 0,0 is in upper left
float pos1YTop = pos1YBottom - pos1.getTextHeightNoPadding();
float pos2YTop = pos2YBottom - pos2.getTextHeightNoPadding();
double pos1YTop = pos1YBottom - pos1.getBBox().getHeight();
double pos2YTop = pos2YBottom - pos2.getBBox().getHeight();

float yDifference = Math.abs(pos1YBottom - pos2YBottom);
double yDifference = Math.abs(pos1YBottom - pos2YBottom);

// we will do a simple tolerance comparison
if (yDifference < .1 || pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom || pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom) {
return Float.compare(x1, x2);
return Double.compare(x1, x2);
} else if (pos1YBottom < pos2YBottom) {
return -1;
} else {
@@ -73,11 +73,11 @@ public class LayoutparsingVisualizations {
boolean active;

final Visualizations words = Visualizations.builder().layer(ContentStreams.WORDS).build();
final Visualizations lines = Visualizations.builder().layer(ContentStreams.LINES).build();
final Visualizations zones = Visualizations.builder().layer(ContentStreams.ZONES).layerVisibilityDefaultValue(true).build();
final Visualizations lines = Visualizations.builder().layer(ContentStreams.LINES).layerVisibilityDefaultValue(false).build();
final Visualizations zones = Visualizations.builder().layer(ContentStreams.ZONES).layerVisibilityDefaultValue(false).build();
final Visualizations mainBody = Visualizations.builder().layer(ContentStreams.MAIN_BODY).build();
final Visualizations clean_rulings = Visualizations.builder().layer(ContentStreams.CLEAN_RULINGS).build();
final Visualizations rulings = Visualizations.builder().layer(ContentStreams.RULINGS).build();
final Visualizations rulings = Visualizations.builder().layer(ContentStreams.RULINGS).layerVisibilityDefaultValue(true).build();
final Visualizations cells = Visualizations.builder().layer(ContentStreams.CELLS).build();
final Visualizations markedContent = Visualizations.builder().layer(ContentStreams.MARKED_CONTENT).build();
final Visualizations neighbours = Visualizations.builder().layer(ContentStreams.NEIGHBOURS).build();
@@ -139,7 +139,7 @@ public class LayoutparsingVisualizations {
visualizationsOnPage.getColoredLines()
.addAll(rulings
.stream()
.map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 0.5f))
.map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 2f))
.toList());
}
@@ -30,21 +30,21 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
@Autowired
private LayoutParsingPipeline layoutParsingPipeline;

@Disabled
@Test
// @Disabled
public void testLayoutParserEndToEnd() {

String filePath = "files/Minimal Examples/RotateTextWithRulingsTestFile.pdf";
String filePath = "/home/kschuettler/Dokumente/TestFiles/certificates/origin/ecm.pdf";

runForFile(filePath);
}

@Test
@Disabled
// @Disabled
@SneakyThrows
public void testLayoutParserEndToEndWithFolder() {

String folder = "/home/kschuettler/Dokumente/TestFiles/certificates/certificates-ocred";
String folder = "/home/kschuettler/Dokumente/TestFiles/certificates/origin";
List<Path> pdfFiles = Files.walk(Path.of(folder))
.filter(path -> path.getFileName().toString().endsWith(".pdf"))
.sorted(Comparator.comparing(Path::getFileName))