Compare commits

...

5 Commits

Author SHA1 Message Date
Kilian Schuettler
81231ae486 akra-certificate: change column detector slightly
* introduce settings
2024-05-07 15:39:35 +02:00
Dominique Eifländer
8e7bed8b52 some fixes 2024-05-07 11:01:49 +02:00
Kilian Schuettler
fda3f1001f akra-certificate: wip 2024-05-06 14:53:47 +02:00
Kilian Schuettler
e8513d05e2 akra-certificates: finetuning for certificates 2024-05-06 13:26:17 +02:00
Kilian Schuettler
eb2ea755a5 akra-certificates: finetuninng for certificates 2024-05-03 14:07:22 +02:00
19 changed files with 725 additions and 166 deletions

1
.gitattributes vendored
View File

@ -1 +0,0 @@
*.pdf filter=lfs diff=lfs merge=lfs -text

View File

@ -96,6 +96,7 @@ public class LayoutParsingPipeline {
VisualLayoutParsingAdapter visualLayoutParsingAdapter; VisualLayoutParsingAdapter visualLayoutParsingAdapter;
ClarifyndClassificationService clarifyndClassificationService; ClarifyndClassificationService clarifyndClassificationService;
GraphicExtractorService graphicExtractorService; GraphicExtractorService graphicExtractorService;
LayoutparserSettings settings;
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException { public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
@ -104,24 +105,32 @@ public class LayoutParsingPipeline {
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier()); log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId()); File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile); File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId())
.orElse(originFile);
VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse(); VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse();
if (layoutParsingRequest.visualLayoutParsingFileId().isPresent()) { if (layoutParsingRequest.visualLayoutParsingFileId()
visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId().get()); .isPresent()) {
visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId()
.get());
} }
ImageServiceResponse imageServiceResponse = new ImageServiceResponse(); ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
if (layoutParsingRequest.imagesFileStorageId().isPresent()) { if (layoutParsingRequest.imagesFileStorageId()
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get()); .isPresent()) {
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
.get());
} }
TableServiceResponse tableServiceResponse = new TableServiceResponse(); TableServiceResponse tableServiceResponse = new TableServiceResponse();
if (layoutParsingRequest.tablesFileStorageId().isPresent()) { if (layoutParsingRequest.tablesFileStorageId()
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get()); .isPresent()) {
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId()
.get());
} }
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(), ClassificationDocument classificationDocument = parseLayout(settings.getLayoutParsingTypeOverride() == null //
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(),
originFile, originFile,
imageServiceResponse, imageServiceResponse,
tableServiceResponse, tableServiceResponse,
@ -130,11 +139,17 @@ public class LayoutParsingPipeline {
log.info("Building document graph for {}", layoutParsingRequest.identifier()); log.info("Building document graph for {}", layoutParsingRequest.identifier());
Document documentGraph = observeBuildDocumentGraph(layoutParsingRequest.layoutParsingType(), classificationDocument); Document documentGraph = observeBuildDocumentGraph(settings.getLayoutParsingTypeOverride() == null //
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(), classificationDocument);
log.info("Creating viewer document for {}", layoutParsingRequest.identifier()); log.info("Creating viewer document for {}", layoutParsingRequest.identifier());
layoutGridService.addLayoutGrid(viewerDocumentFile, documentGraph, viewerDocumentFile, false, layoutParsingRequest.visualLayoutParsingFileId().isPresent()); layoutGridService.addLayoutGrid(viewerDocumentFile,
documentGraph,
viewerDocumentFile,
false,
layoutParsingRequest.visualLayoutParsingFileId()
.isPresent());
log.info("Storing resulting files for {}", layoutParsingRequest.identifier()); log.info("Storing resulting files for {}", layoutParsingRequest.identifier());
@ -224,7 +239,9 @@ public class LayoutParsingPipeline {
Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse); Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);
ClassificationDocument classificationDocument = new ClassificationDocument(); ClassificationDocument classificationDocument = new ClassificationDocument();
classificationDocument.getVisualizations().setActive(identifier.containsKey("debug")); if (settings.isDebug() || identifier.containsKey("debug")) {
classificationDocument.getVisualizations().setActive(true);
}
List<ClassificationPage> classificationPages = new ArrayList<>(); List<ClassificationPage> classificationPages = new ArrayList<>();
@ -266,7 +283,8 @@ public class LayoutParsingPipeline {
classificationDocument.getVisualizations().addRulingVisualization(stripper.getRulings(), pageNumber); classificationDocument.getVisualizations().addRulingVisualization(stripper.getRulings(), pageNumber);
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), stripper.getRulings()); CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), PageInformation.fromPDPage(pageNumber, pdPage)); PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, pdPage);
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), pageInformation);
classificationDocument.getVisualizations().addCellVisualizations(emptyTableCells, pageNumber); classificationDocument.getVisualizations().addCellVisualizations(emptyTableCells, pageNumber);
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings); TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings);
@ -283,9 +301,9 @@ public class LayoutParsingPipeline {
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getVisualizations()); redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getVisualizations());
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings); case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG -> case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG ->
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getVisualizations(), layoutParsingType); docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getVisualizations(), layoutParsingType, pageInformation);
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getVisualizations(), layoutParsingType); docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getVisualizations(), layoutParsingType, pageInformation);
}; };
classificationPage.setCleanRulings(cleanRulings); classificationPage.setCleanRulings(cleanRulings);

View File

@ -0,0 +1,20 @@
package com.knecon.fforesight.service.layoutparser.processor;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Configuration;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import lombok.AccessLevel;
import lombok.Data;
import lombok.experimental.FieldDefaults;
/**
 * Settings bound from configuration properties under the {@code layoutparser} prefix.
 * Getters and setters are generated by Lombok's {@code @Data}; all fields are made
 * private through {@code @FieldDefaults}.
 */
@Data
@Configuration
@ConfigurationProperties("layoutparser")
@FieldDefaults(level = AccessLevel.PRIVATE)
public class LayoutparserSettings {
// When true, LayoutParsingPipeline activates the layout visualizations even if the
// request identifier does not contain a "debug" key.
boolean debug;
// Optional override: when non-null, LayoutParsingPipeline uses this type instead of the
// layout parsing type carried by the request; null leaves the request's type in effect.
LayoutParsingType layoutParsingTypeOverride;
}

View File

@ -18,6 +18,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRul
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations; import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
@ -33,7 +34,11 @@ public class DocstrumSegmentationService {
private final ReadingOrderService readingOrderService; private final ReadingOrderService readingOrderService;
public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean xyOrder, CleanRulings usedRulings, LayoutparsingVisualizations visualizations) { public List<Zone> segmentPage(List<TextPositionSequence> textPositions,
boolean xyOrder,
CleanRulings usedRulings,
LayoutparsingVisualizations visualizations,
PageInformation pageInformation) {
List<Zone> zones = new ArrayList<>(); List<Zone> zones = new ArrayList<>();
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.ZERO)); zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.ZERO));
@ -41,7 +46,7 @@ public class DocstrumSegmentationService {
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.HALF_CIRCLE)); zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.HALF_CIRCLE));
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.THREE_QUARTER_CIRCLE)); zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.THREE_QUARTER_CIRCLE));
return readingOrderService.resolve(zones, xyOrder); return readingOrderService.resolve(zones, xyOrder, visualizations, textPositions.get(0).getPage(), pageInformation);
} }
@ -63,7 +68,9 @@ public class DocstrumSegmentationService {
double lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20); double lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20);
List<Line> lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing, rulings); List<Line> lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing, rulings);
return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing, rulings); List<Zone> zones = zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing, rulings);
// return zones;
return zoneBuilderService.mergeZonesAgain(zones, characterSpacing, lineSpacing, rulings);
} }
} }

View File

@ -135,6 +135,12 @@ public abstract class BoundingBox {
} }
/**
 * Checks whether the vertical extent of this box overlaps the vertical extent of
 * {@code other}, based on {@code getY()} / {@code getMaxY()}. Touching edges count as
 * overlapping.
 *
 * @param other the box to compare against
 * @return {@code true} if the two y-ranges share at least one point
 */
public boolean intersectsYJava(BoundingBox other) {

    if (this.getY() > other.getMaxY()) {
        // this box starts after the other one has already ended
        return false;
    }
    return this.getMaxY() >= other.getY();
}
public boolean intersectsY(BoundingBox other, float threshold) { public boolean intersectsY(BoundingBox other, float threshold) {
return this.getPdfMinY() - threshold <= other.getPdfMaxY() && this.getPdfMaxY() + threshold >= other.getPdfMinY(); return this.getPdfMinY() - threshold <= other.getPdfMaxY() && this.getPdfMaxY() + threshold >= other.getPdfMinY();
@ -143,7 +149,13 @@ public abstract class BoundingBox {
public boolean intersectsX(BoundingBox other) { public boolean intersectsX(BoundingBox other) {
return this.getPdfMinX() <= other.getMaxX() && this.getMaxX() >= other.getPdfMinX(); return this.getPdfMinX() <= other.getPdfMaxX() && this.getPdfMaxX() >= other.getPdfMinX();
}
public boolean intersectsXJava(BoundingBox other) {
return this.getX() <= other.getMaxX() && this.getMaxX() >= other.getMinX();
} }
@ -182,4 +194,60 @@ public abstract class BoundingBox {
} }
}; };
/**
 * Computes the horizontal gap between this box and {@code other}.
 * If this box is {@link #leftOf(BoundingBox) left of} the other one it is treated as the
 * left box, otherwise the other box is treated as the left box.
 *
 * @param other the box to measure against
 * @return the gap between the left box's max x and the right box's min x, never negative
 */
public double horizontalDistance(BoundingBox other) {

    boolean thisIsLeft = this.leftOf(other);
    Rectangle2D leftBox = thisIsLeft ? this.getBBox() : other.getBBox();
    Rectangle2D rightBox = thisIsLeft ? other.getBBox() : this.getBBox();

    double gap = rightBox.getMinX() - leftBox.getMaxX();
    return Math.max(0, gap);
}
/**
 * Computes the vertical gap between this box and {@code other}.
 * If {@link #isAbove(BoundingBox)} holds for this box it is treated as the top box,
 * otherwise the other box is treated as the top box.
 *
 * @param other the box to measure against
 * @return the gap between the top box's max y and the bottom box's min y, never negative
 */
public double verticalDistance(BoundingBox other) {

    boolean thisIsTop = this.isAbove(other);
    Rectangle2D topBox = thisIsTop ? this.getBBox() : other.getBBox();
    Rectangle2D bottomBox = thisIsTop ? other.getBBox() : this.getBBox();

    double gap = bottomBox.getMinY() - topBox.getMaxY();
    return Math.max(0, gap);
}
/**
 * @param other the box to compare against
 * @return {@code true} if both boxes overlap vertically and {@code other} ends (max x)
 *         at or before the start (min x) of this box
 */
public boolean rightOf(BoundingBox other) {

    if (!this.intersectsYJava(other)) {
        return false;
    }
    return other.getMaxX() <= this.getMinX();
}
/**
 * @param other the box to compare against
 * @return {@code true} if both boxes overlap vertically and {@code other} starts (min x)
 *         at or after the end (max x) of this box
 */
public boolean leftOf(BoundingBox other) {

    if (!this.intersectsYJava(other)) {
        return false;
    }
    return other.getMinX() >= this.getMaxX();
}
/**
 * @param other the box to compare against
 * @return {@code true} if both boxes overlap horizontally and {@code other} starts (min y)
 *         at or after the end (max y) of this box
 */
public boolean isAbove(BoundingBox other) {

    if (!this.intersectsXJava(other)) {
        return false;
    }
    return other.getMinY() >= this.getMaxY();
}
/**
 * @param other the box to compare against
 * @return {@code true} if both boxes overlap horizontally and this box starts (min y)
 *         at or after the end (max y) of {@code other}
 */
public boolean isBelow(BoundingBox other) {

    if (!this.intersectsXJava(other)) {
        return false;
    }
    return this.getMinY() >= other.getMaxY();
}
} }

View File

@ -0,0 +1,320 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
/**
 * Detects column separators on a page from a weighted histogram over the x-axis.
 * <p>
 * Usage as seen in this class: {@link #add(BoundingBox)} accumulates every box into a histogram of
 * {@link #bins_num} bins spanning {@code [min, max]}, {@link #kernelSmooth(double[])} optionally smooths it,
 * {@link #computeDerivative()} differentiates it and {@link #determineColumnsWithDerivative(double[])}
 * turns low valleys of the histogram into x-coordinates of column separators.
 */
public class ColumnDetector {

    // If (max derivative - min derivative) is below mean * this factor, the histogram is considered too flat to contain columns.
    public static final double MAX_VALUE_THRESHOLD = 0.5;
    // Number of equally wide histogram bins between min and max.
    final static int bins_num = 512;
    // First bin index (inclusive) considered when searching for extrema; 0 means the left edge is not excluded.
    final static int globalStartIdx = 0;
    // Last bin index (exclusive) considered when searching for extrema; bins_num means the right edge is not excluded.
    final static int globalEndIdx = bins_num;
    // Derivative values below this threshold are treated as "zero" when searching for histogram minima.
    public static final double DERIVATIVE_ZERO_THRESHOLD = 1e-10;
    // A histogram valley only counts as a column gap if its value is below mean * this factor.
    public static final double MINIMUM_THRESHOLD_FOR_COLUMNS = 0.05;
    // Derivative values reaching this fraction of the global max/min are treated as near-global extrema.
    public static final double NEAR_GLOBAL_THRESHOLD = 0.5;

    double minY;
    double maxY;
    double midY;
    double[] histogram;
    double min;
    double max;
    double resolution;
    double sum;
    int N;


    /**
     * @param min  smallest x-coordinate covered by the histogram
     * @param max  largest x-coordinate covered by the histogram
     * @param minY smallest y-coordinate of the observed area, used for weighting
     * @param maxY largest y-coordinate of the observed area, used for weighting
     */
    public ColumnDetector(double min, double max, double minY, double maxY) {

        this.min = min;
        this.max = max;
        this.minY = minY;
        this.maxY = maxY;
        // midpoint of the y-range (previously this stored the height of the range)
        this.midY = (minY + maxY) / 2;
        this.resolution = (max - min) / bins_num;
        this.histogram = new double[bins_num];
    }


    /**
     * Adds the weight of the given box to every bin between its min x and max x.
     * Bin indices are clamped to the histogram so boxes reaching outside {@code [min, max]} cannot
     * cause an index error.
     *
     * @param zone the box to accumulate
     */
    public void add(BoundingBox zone) {

        N++;
        double weight = computeWeight(zone);
        int start = Math.max(0, (int) ((zone.getMinX() - min) / resolution));
        int end = Math.min(histogram.length, (int) ((zone.getMaxX() - min) / resolution));
        for (int i = start; i < end; i++) {
            histogram[i] += weight;
            // total mass added to the histogram (previously the running bin value was added instead of the weight)
            sum += weight;
        }
    }


    /**
     * Weight of a box: its height, scaled down the further its vertical center is from the middle of the y-range.
     */
    private double computeWeight(BoundingBox zone) {

        double areaWeight = zone.getBBox().getHeight();
        double relativeDistance = relativeDistanceToMiddle(zone.getBBox().getCenterY());

        double distanceWeight;
        if (relativeDistance < 0.6) {
            distanceWeight = 1;
        } else if (relativeDistance < 0.8) {
            distanceWeight = 0.8;
        } else {
            distanceWeight = 0.1;
        }

        return areaWeight * distanceWeight;
    }


    /**
     * Distance of y to the middle of [minY, maxY], expressed as a fraction of half the range.
     */
    private double relativeDistanceToMiddle(double y) {

        double range = (maxY - minY) / 2;
        double mid = minY + range;
        return Math.abs(y - mid) / range;
    }


    /**
     * Numerically differentiates the histogram: forward/backward difference at the two borders,
     * central difference everywhere else.
     *
     * @return an array of the same length as the histogram
     */
    public double[] computeDerivative() {

        int length = histogram.length;
        double[] derivative = new double[length];

        for (int i = 0; i < length; i++) {
            if (i == 0) {
                derivative[i] = (histogram[i + 1] - histogram[i]) / resolution;
            } else if (i == length - 1) {
                derivative[i] = (histogram[i] - histogram[i - 1]) / resolution;
            } else {
                derivative[i] = (histogram[i + 1] - histogram[i - 1]) / (2 * resolution);
            }
        }

        return derivative;
    }


    /**
     * Arithmetic mean of {@code arr[start..end)}.
     *
     * @return the mean, or 0 if the range is empty ({@code start == end})
     */
    public double calcMean(double[] arr, int start, int end) {

        if (start == end) {
            return 0;
        }
        double total = 0;
        for (int i = start; i < end; i++) {
            total += arr[i];
        }
        return total / (end - start);
    }


    /**
     * Finds columns by locating the near-global maxima/minima of the derivative.
     * For each derivative minimum we step to the right until the derivative is "zero", which indicates a minimum
     * of the histogram; if the histogram is below a threshold there, it is deemed a column divider.
     * The same is done for derivative maxima, stepping to the left, since a minimum of a function always lies
     * left of the following maximum of its derivative.
     *
     * @param derivative derivative of the histogram, same length as the histogram
     * @return sorted x-coordinates of the detected column separators; empty if the derivative is too flat
     */
    public List<Double> determineColumnsWithDerivative(double[] derivative) {

        assert derivative.length == histogram.length;

        double mean = calcMean(histogram, 0, histogram.length);
        double maxDvValue = calcMax(derivative);
        double minDvValue = calcMin(derivative);

        if (maxDvValue - minDvValue < mean * MAX_VALUE_THRESHOLD) {
            // the result was previously discarded because the return statement was missing
            return Collections.emptyList();
        }

        Extrema derivativeExtrema = calculateNearGlobalExtrema(derivative, maxDvValue, minDvValue);

        // a set, since the search from a minimum and from a maximum may end in the same valley
        Set<Integer> columnIndices = new HashSet<>();
        columnIndices.addAll(findZerosToTheRightOfMinima(derivative, derivativeExtrema.minima(), mean));
        columnIndices.addAll(findZerosToTheLeftOfMaxima(derivative, derivativeExtrema.maxima(), mean));

        return columnIndices.stream()
                .sorted(Comparator.naturalOrder())
                .map(this::calculateXCoordinateFromIdx)
                .toList();
    }


    /**
     * For every derivative maximum, walks to the left and collects the first run of indices whose derivative is
     * below {@link #DERIVATIVE_ZERO_THRESHOLD}; the middle of that run is reported if the histogram is low enough.
     * NOTE(review): the check is {@code value < threshold}, not {@code |value| < threshold}, so negative slopes
     * are part of the run as well - confirm this is intended.
     */
    private List<Integer> findZerosToTheLeftOfMaxima(double[] derivative, List<Integer> derivativeMaxima, double mean) {

        List<Integer> columnsLeftOfMaxima = new ArrayList<>();
        for (int maximum : derivativeMaxima) {
            // the highest derivative will always be at least one step away from the lowest value
            int startIdx = maximum - 1;
            // search through 10% of the array to the left, but at least one step and at most to the left edge
            int endIdx = (int) Math.max(globalStartIdx, Math.min(startIdx - 1, startIdx - 0.1 * bins_num));

            List<Integer> zeroRun = new ArrayList<>();
            for (int j = startIdx; j >= endIdx; j--) {
                if (derivative[j] < DERIVATIVE_ZERO_THRESHOLD) {
                    zeroRun.add(j);
                } else if (!zeroRun.isEmpty()) {
                    break;
                }
            }
            addMiddleIfColumnGap(zeroRun, mean, columnsLeftOfMaxima);
        }
        return columnsLeftOfMaxima;
    }


    /**
     * For every derivative minimum, walks to the right and collects the first run of indices whose derivative is
     * below {@link #DERIVATIVE_ZERO_THRESHOLD}; the middle of that run is reported if the histogram is low enough.
     * NOTE(review): the check is {@code value < threshold}, not {@code |value| < threshold}, so negative slopes
     * are part of the run as well - confirm this is intended.
     */
    private List<Integer> findZerosToTheRightOfMinima(double[] derivative, List<Integer> derivativeMinima, double mean) {

        List<Integer> columnIndices = new LinkedList<>();
        for (int minimum : derivativeMinima) {
            // the lowest derivative will always be at least one step earlier than the lowest value
            int startIdx = minimum + 1;
            // search through 10% of the array to the right, but at least one step and at most to the right edge
            int endIdx = (int) Math.min(globalEndIdx, Math.max(startIdx + 1, startIdx + 0.1 * bins_num));

            List<Integer> zeroRun = new ArrayList<>();
            for (int j = startIdx; j < endIdx; j++) {
                if (derivative[j] < DERIVATIVE_ZERO_THRESHOLD) {
                    zeroRun.add(j);
                } else if (!zeroRun.isEmpty()) {
                    break;
                }
            }
            addMiddleIfColumnGap(zeroRun, mean, columnIndices);
        }
        return columnIndices;
    }


    /**
     * Adds the middle index of the given run to target if the histogram value there is below the column threshold.
     */
    private void addMiddleIfColumnGap(List<Integer> zeroRun, double mean, List<Integer> target) {

        if (zeroRun.isEmpty()) {
            return;
        }
        int middleIdx = zeroRun.get(zeroRun.size() / 2);
        if (histogram[middleIdx] < mean * MINIMUM_THRESHOLD_FOR_COLUMNS) {
            target.add(middleIdx);
        }
    }


    private double calcMax(double[] array) {

        double largest = Double.NEGATIVE_INFINITY;
        for (double value : array) {
            if (value > largest) {
                largest = value;
            }
        }
        return largest;
    }


    private double calcMin(double[] array) {

        double smallest = Double.POSITIVE_INFINITY;
        for (double value : array) {
            if (value < smallest) {
                smallest = value;
            }
        }
        return smallest;
    }


    /**
     * Collects the indices whose derivative reaches {@link #NEAR_GLOBAL_THRESHOLD} times the global
     * maximum/minimum and keeps only the first index of each consecutive run.
     */
    private Extrema calculateNearGlobalExtrema(double[] derivative, double maxDvValue, double minDvValue) {

        List<Integer> nearGlobalDvMaximaIdx = new LinkedList<>();
        List<Integer> nearGlobalDvMinimaIdx = new LinkedList<>();
        for (int i = globalStartIdx; i < globalEndIdx; i++) {
            if (derivative[i] <= minDvValue * NEAR_GLOBAL_THRESHOLD) {
                nearGlobalDvMinimaIdx.add(i);
            }
            if (derivative[i] >= maxDvValue * NEAR_GLOBAL_THRESHOLD) {
                nearGlobalDvMaximaIdx.add(i);
            }
        }

        return new Extrema(removeConsecutive(nearGlobalDvMaximaIdx), removeConsecutive(nearGlobalDvMinimaIdx));
    }


    private record Extrema(List<Integer> maxima, List<Integer> minima) {

    }


    /**
     * Converts a bin index to the x-coordinate of the right border of that bin.
     */
    private Double calculateXCoordinateFromIdx(int globalMinIdx) {

        return min + ((globalMinIdx + 1) * resolution);
    }


    /**
     * Keeps the first element of the list and every element that is not exactly its predecessor plus one.
     *
     * @param numbers the indices to filter, may be null or empty
     * @return a new list; empty if the input is null or empty
     */
    public static List<Integer> removeConsecutive(List<Integer> numbers) {

        List<Integer> result = new ArrayList<>();
        if (numbers == null || numbers.isEmpty()) {
            return result;
        }

        Integer previous = null;
        for (Integer current : numbers) {
            if (previous == null || current != previous + 1) {
                result.add(current);
            }
            previous = current;
        }

        return result;
    }


    /**
     * Replaces the histogram by its convolution with the given kernel. Values outside the histogram
     * are treated as missing, i.e. they contribute nothing.
     *
     * @param kernel the smoothing kernel, centered at index {@code (kernel.length - 1) / 2}
     */
    public void kernelSmooth(double[] kernel) {

        double[] newFrequencies = new double[histogram.length];
        int shift = (kernel.length - 1) / 2;
        for (int i = 0; i < kernel.length; i++) {
            int jStart = Math.max(0, i - shift);
            int jEnd = Math.min(histogram.length, histogram.length + i - shift);
            for (int j = jStart; j < jEnd; j++) {
                newFrequencies[j - i + shift] += kernel[i] * histogram[j];
            }
        }
        histogram = newFrequencies;
    }


    /**
     * Creates a Gaussian kernel normalized to sum 1.
     *
     * @param length       requested length; the kernel always has the odd size {@code 2 * (length / 2) + 1}
     * @param stdDeviation standard deviation of the Gaussian, in bins
     * @return the normalized kernel
     */
    public double[] createGaussianKernel(int length, double stdDeviation) {

        int r = length / 2;
        int size = 2 * r + 1;
        double[] kernel = new double[size];
        double kernelSum = 0;
        double b = 2 * (stdDeviation) * (stdDeviation);
        double a = 1 / Math.sqrt(Math.PI * b);
        for (int i = 0; i < size; i++) {
            kernel[i] = a * Math.exp(-(i - r) * (i - r) / b);
            kernelSum += kernel[i];
        }
        for (int i = 0; i < size; i++) {
            kernel[i] /= kernelSum;
        }
        return kernel;
    }

}

View File

@ -29,4 +29,6 @@ public class Zone extends BoundingBox {
return sb.toString().trim(); return sb.toString().trim();
} }
} }

View File

@ -1,17 +1,21 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service; package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
import java.util.ArrayList; import java.awt.geom.Point2D;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator; import java.util.Comparator;
import java.util.HashMap; import java.util.LinkedList;
import java.util.List; import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.ColumnDetector;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone; import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils; import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
@Service @Service
public class ReadingOrderService { public class ReadingOrderService {
@ -20,7 +24,7 @@ public class ReadingOrderService {
public static final double MULTI_COLUMN_DETECTION_THRESHOLD = 1.5; public static final double MULTI_COLUMN_DETECTION_THRESHOLD = 1.5;
public List<Zone> resolve(List<Zone> zones, boolean xyReadingOrder) { public List<Zone> resolve(List<Zone> zones, boolean xyReadingOrder, LayoutparsingVisualizations visualizations, int page, PageInformation pageInformation) {
if (zones.isEmpty() || zones.size() == 1) { if (zones.isEmpty() || zones.size() == 1) {
return zones; return zones;
@ -30,28 +34,53 @@ public class ReadingOrderService {
return resolveSingleColumnReadingOrder(zones); return resolveSingleColumnReadingOrder(zones);
} }
Map<Long, Integer> histogram = new HashMap<>(); var columnSeparatorLines = calculateColumns(zones);
for (Zone zone : zones) {
long minY = Math.round(zone.getBBox().getMinY());
long maxY = Math.round(zone.getBBox().getMaxY());
for (long i = minY; i <= maxY; i++) {
histogram.put(i, histogram.getOrDefault(i, 0) + 1);
}
}
if (histogram.values() if (columnSeparatorLines.isEmpty()) {
.stream()
.mapToInt(Integer::intValue).average()
.orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) {
return resolveSingleColumnReadingOrder(zones); return resolveSingleColumnReadingOrder(zones);
} else { } else {
for (Double columnLine : columnSeparatorLines) {
return resolveMultiColumnReadingOder(zones); visualizations.addRulingVisualization(List.of(new Ruling(new Point2D.Double(columnLine, 0), new Point2D.Double(columnLine, 1000))), page);
}
return resolveMultiColumnReadingOder(zones, columnSeparatorLines, pageInformation);
} }
} }
/**
 * Determines the x-coordinates of column separators for the given zones.
 * The bounding extent of all zones is computed, every zone is fed into a {@link ColumnDetector},
 * the resulting histogram is smoothed with a Gaussian kernel and its derivative is evaluated.
 *
 * @param zones the zones of one page
 * @return the detected separator x-coordinates; empty if there are no zones or no columns are found
 */
private static List<Double> calculateColumns(List<Zone> zones) {

    if (zones.isEmpty()) {
        return Collections.emptyList();
    }

    // bounding extent over all zones
    double left = Double.POSITIVE_INFINITY;
    double right = Double.NEGATIVE_INFINITY;
    double top = Double.POSITIVE_INFINITY;
    double bottom = Double.NEGATIVE_INFINITY;
    for (Zone zone : zones) {
        left = Math.min(left, zone.getMinX());
        right = Math.max(right, zone.getMaxX());
        top = Math.min(top, zone.getMinY());
        bottom = Math.max(bottom, zone.getMaxY());
    }

    ColumnDetector detector = new ColumnDetector(left, right, top, bottom);
    for (Zone zone : zones) {
        detector.add(zone);
    }

    double[] gaussianKernel = detector.createGaussianKernel(3, 1);
    detector.kernelSmooth(gaussianKernel);

    return detector.determineColumnsWithDerivative(detector.computeDerivative());
}
private static List<Zone> resolveSingleColumnReadingOrder(List<Zone> zones) { private static List<Zone> resolveSingleColumnReadingOrder(List<Zone> zones) {
zones.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) zones.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
@ -60,109 +89,55 @@ public class ReadingOrderService {
} }
private List<Zone> resolveMultiColumnReadingOder(List<Zone> zones) { private List<Zone> resolveMultiColumnReadingOder(List<Zone> zones, List<Double> columnSeparatorLines, PageInformation pageInformation) {
// Simple reading order resolver for multi column page layout as described here : https://pub.towardsai.net/advanced-rag-02-unveiling-pdf-parsing-b84ae866344e List<List<Zone>> zonesPerColumn = new LinkedList<>();
// TODO implement a more fancy reading order resolver see https://github.com/BobLd/DocumentLayoutAnalysis/blob/master/README.md#reading-order for (Double ignored : columnSeparatorLines) {
zonesPerColumn.add(new LinkedList<>());
double minX = Double.POSITIVE_INFINITY;
double maxX = Double.NEGATIVE_INFINITY;
for (Zone zone : zones) {
if (zone.getX() < minX) {
minX = zone.getX();
}
if (zone.getX() + zone.getWidth() > maxX) {
maxX = zone.getX() + zone.getWidth();
}
} }
zonesPerColumn.add(new LinkedList<>());
double midLineXCoordinate = (minX + maxX) / 2;
List<Zone> leftOf = new ArrayList<>();
List<Zone> rightOf = new ArrayList<>();
List<Zone> middle = new ArrayList<>();
for (Zone zone : zones) { for (Zone zone : zones) {
if (zone.getX() < midLineXCoordinate && zone.getX() + zone.getWidth() < midLineXCoordinate) { boolean zoneAdded = false;
leftOf.add(zone); if (zone.getMinY() < pageInformation.heightRot() * 0.5) {
} else if (zone.getX() > midLineXCoordinate && zone.getX() + zone.getWidth() > midLineXCoordinate) { // above middle sort into column fitting left x value
rightOf.add(zone); for (int col = 0; col < columnSeparatorLines.size(); col++) {
if (columnSeparatorLines.get(col) > zone.getMinX()) {
zonesPerColumn.get(col).add(zone);
zoneAdded = true;
break;
}
}
} else { } else {
middle.add(zone); // below middle sort into column fitting right x value
for (int col = 0; col < columnSeparatorLines.size(); col++) {
if (columnSeparatorLines.get(col) > zone.getMaxX()) {
zonesPerColumn.get(col).add(zone);
zoneAdded = true;
break;
}
}
} }
if (!zoneAdded) {
zonesPerColumn.get(zonesPerColumn.size() - 1).add(zone);
}
} }
leftOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) zonesPerColumn.forEach(list -> list.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))));
rightOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) zonesPerColumn.forEach(list -> list.sort(new Comparator<Zone>() {
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); @Override
public int compare(Zone o1, Zone o2) {
middle.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) return Math.abs(o1.getMinY() - o2.getMinY()) < 5 && o1.getMinX() < o2.getMinX() == true ? -1 : 0;
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
/*
List<Zone> leftNotIntersecting = new ArrayList<>();
for (Zone leftZone : leftOf) {
boolean intersects = false;
for (Zone rightZone : rightOf) {
if (leftZone.intersectsY(rightZone)) {
intersects = true;
break;
}
// early stopping
if (rightZone.getBBox().getMinY() > leftZone.getBBox().getMaxY()) {
break;
}
} }
if (!intersects) { }));
leftNotIntersecting.add(leftZone);
}
}
List<Zone> rightNotIntersecting = new ArrayList<>(); return zonesPerColumn.stream()
for (Zone rightZone : rightOf) { .flatMap(Collection::stream)
boolean intersects = false; .toList();
for (Zone leftZone : leftOf) {
if (rightZone.intersectsY(leftZone)) {
intersects = true;
break;
}
// early stopping
if (leftZone.getBBox().getMinY() > rightZone.getBBox().getMaxY()) {
break;
}
}
if (!intersects) {
rightNotIntersecting.add(rightZone);
}
}
leftOf.removeAll(leftNotIntersecting);
rightOf.removeAll(rightNotIntersecting);
middle.addAll(leftNotIntersecting);
middle.addAll(rightNotIntersecting);
*/
List<Zone> sortedZones = new ArrayList<>();
sortedZones.addAll(leftOf);
sortedZones.addAll(rightOf);
ListIterator<Zone> itty = middle.listIterator();
while (itty.hasNext()) {
Zone current = itty.next();
for (int i = 0; i < sortedZones.size(); i++) {
if (current.getY() < sortedZones.get(i).getY()) {
sortedZones.add(i, current);
itty.remove();
break;
}
}
}
sortedZones.addAll(middle);
return sortedZones;
} }
} }

View File

@ -1,6 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service; package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator; import java.util.Comparator;
import java.util.HashSet; import java.util.HashSet;
import java.util.List; import java.util.List;
@ -81,6 +82,78 @@ public class ZoneBuilderService {
} }
/**
 * Second merge pass over already built zones: groups zones that belong together with a union-find and rebuilds one
 * zone per group from the lines of its members.
 * <p>
 * A pair of distinct zones that is not yet in the same group is united when {@code intersectsYJava} or
 * {@code intersectsXJava} holds for it, {@code rulings.lineBetween} reports no ruling between the two zones, and one of
 * the following rules matches:
 * <ul>
 * <li>{@code intersectsYJava} holds and the horizontal distance is below 10,</li>
 * <li>{@code intersectsXJava} holds and the vertical distance is below 6,</li>
 * <li>every line of the outer zone ends with ':' while no line of the inner zone does, and the outer zone is either
 * {@code leftOf} the inner zone with less than 250 between their minimum x values, or {@code isAbove} it with a
 * vertical distance below twice {@code lineSpacing}.</li>
 * </ul>
 *
 * @param zones            zones to regroup
 * @param characterSpacing forwarded unchanged to {@code mergeLinesInZone}
 * @param lineSpacing      forwarded to {@code mergeLinesInZone}; also scales the "label above value" distance limit
 * @param rulings          rulings used to veto merges across a line
 * @return one merged zone per group of united zones
 */
public List<Zone> mergeZonesAgain(List<Zone> zones, double characterSpacing, double lineSpacing, CleanRulings rulings) {

    UnionFind<Zone> unionFind = new UnionFind<>(new HashSet<>(zones));

    for (Zone outerZone : zones) {
        // Depends on the outer zone only, so it is evaluated once per outer zone instead of once per pair.
        boolean outerZoneEndsWithColon = outerZone.getLines()
                .stream()
                .allMatch(line -> line.toString().trim().endsWith(":"));

        for (Zone innerZone : zones) {
            if (innerZone == outerZone || unionFind.inSameSet(outerZone, innerZone)) {
                continue;
            }

            // Both checks must compare the outer zone with the inner zone. The previous code tested
            // innerZone.intersectsXJava(innerZone), i.e. a zone against itself, which disabled this guard.
            boolean intersectsY = outerZone.intersectsYJava(innerZone);
            boolean intersectsX = outerZone.intersectsXJava(innerZone);
            if (!intersectsY && !intersectsX) {
                continue;
            }

            if (rulings.lineBetween(outerZone, innerZone)) {
                continue;
            }

            if (intersectsY && outerZone.horizontalDistance(innerZone) < 10) {
                unionFind.union(outerZone, innerZone);
                continue;
            }

            double verticalDistance = outerZone.verticalDistance(innerZone);
            if (intersectsX && verticalDistance < 6) {
                unionFind.union(outerZone, innerZone);
                continue;
            }

            // The remaining rules attach a value zone to a label zone (all lines ending with ':').
            if (!outerZoneEndsWithColon) {
                continue;
            }
            boolean innerZoneDoesNotEndWithColon = innerZone.getLines()
                    .stream()
                    .noneMatch(line -> line.toString().trim().endsWith(":"));
            if (!innerZoneDoesNotEndWithColon) {
                continue;
            }

            boolean labelLeftOfValue = outerZone.leftOf(innerZone) && innerZone.getMinX() - outerZone.getMinX() < 250;
            boolean labelAboveValue = outerZone.isAbove(innerZone) && verticalDistance < lineSpacing * 2;
            if (labelLeftOfValue || labelAboveValue) {
                unionFind.union(outerZone, innerZone);
            }
        }
    }

    // Rebuild one zone per group; the line list stays a mutable list as before.
    return unionFind.getGroups()
            .stream()
            .map(groupOfZones -> groupOfZones.stream()
                    .map(Zone::getLines)
                    .flatMap(Collection::stream)
                    .collect(Collectors.toList()))
            .map(linesInZoneToMerge -> mergeLinesInZone(linesInZoneToMerge, characterSpacing, lineSpacing))
            .toList();
}
private double calculateMeanHeight(List<Line> lines) { private double calculateMeanHeight(List<Line> lines) {
double meanHeight = 0.0; double meanHeight = 0.0;

View File

@ -76,15 +76,14 @@ public class RedTextPosition extends BoundingBox {
pos.setBBoxDirAdj(dirAdjPosition); pos.setBBoxDirAdj(dirAdjPosition);
AffineTransform affineTransform = getRotationMatrix(TextDirection.fromDegrees(textPosition.getDir()), textPosition.getPageWidth(), textPosition.getPageHeight()); AffineTransform affineTransform = getRotationMatrix(TextDirection.fromDegrees(textPosition.getDir()), textPosition.getPageWidth(), textPosition.getPageHeight());
Rectangle2D initialUserSpacePositionRect = affineTransform.createTransformedShape(dirAdjPosition).getBounds2D(); Rectangle2D bBoxInitialUserSpace = affineTransform.createTransformedShape(dirAdjPosition).getBounds2D();
pos.setBBoxInitialUserSpace(initialUserSpacePositionRect); // These are definitely correct pos.setBBoxInitialUserSpace(bBoxInitialUserSpace); // These are definitely correct
return pos; return pos;
} }
private static AffineTransform getRotationMatrix(TextDirection textDirection, float pageWidth, float pageHeight) { private static AffineTransform getRotationMatrix(TextDirection textDirection, float pageWidth, float pageHeight) {
AffineTransform transform = new AffineTransform(); AffineTransform transform = new AffineTransform();

View File

@ -41,6 +41,18 @@ public class StringFrequencyCounter {
mostPopular = entry; mostPopular = entry;
} }
} }
if (mostPopular != null && mostPopular.getKey().equals("standard")) {
int standard = countPerValue.get(mostPopular.getKey());
double total = countPerValue.values()
.stream()
.mapToDouble(v -> v).sum();
if ((double) standard / total > 0.75) {
return mostPopular.getKey();
}
countPerValue.remove(mostPopular.getKey());
return getMostPopular();
}
return mostPopular != null ? mostPopular.getKey() : null; return mostPopular != null ? mostPopular.getKey() : null;
} }

View File

@ -17,6 +17,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRul
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations; import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
@ -35,11 +36,12 @@ public class DocstrumBlockificationService {
CleanRulings rulings, CleanRulings rulings,
boolean xyOrder, boolean xyOrder,
LayoutparsingVisualizations visualizations, LayoutparsingVisualizations visualizations,
LayoutParsingType layoutParsingType) { LayoutParsingType layoutParsingType,
PageInformation pageInformation) {
CleanRulings usedRulings = rulings.withoutTextRulings(); CleanRulings usedRulings = rulings.withoutTextRulings();
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder, usedRulings, visualizations); var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder, usedRulings, visualizations, pageInformation);
if (!textPositions.isEmpty()) { if (!textPositions.isEmpty()) {
visualizations.addZoneVisualizations(zones, textPositions.get(0).getPage()); visualizations.addZoneVisualizations(zones, textPositions.get(0).getPage());
@ -56,10 +58,12 @@ public class DocstrumBlockificationService {
var classificationPage = new ClassificationPage(pageBlocks); var classificationPage = new ClassificationPage(pageBlocks);
classificationPage.setCleanRulings(rulings); classificationPage.setCleanRulings(rulings);
mergeIntersectingBlocks(classificationPage, usedRulings, 0, 0); mergeIntersectingBlocks(classificationPage, usedRulings, 2f, 2f);
if (layoutParsingType == LayoutParsingType.DOCUMINE || layoutParsingType == LayoutParsingType.REDACT_MANAGER) { combineBlocksBasic(classificationPage);
combineBlocks(classificationPage);
if (layoutParsingType == LayoutParsingType.REDACT_MANAGER || layoutParsingType == LayoutParsingType.DOCUMINE) {
combineBlocksSpecial(classificationPage);
} }
if (layoutParsingType == LayoutParsingType.CLARIFYND) { if (layoutParsingType == LayoutParsingType.CLARIFYND) {
@ -105,7 +109,60 @@ public class DocstrumBlockificationService {
} }
/**
 * Walks the text blocks of the page once, in list order, and merges the current text block into the previously
 * visited one whenever one of the rules below matches. Afterwards runs {@code mergeIntersectingBlocks} over the
 * page with the thresholds 0 and 6.5.
 *
 * @param page page whose text blocks are combined; its clean rulings (without text rulings) veto merges
 */
public void combineBlocks(ClassificationPage page) { public void combineBlocksBasic(ClassificationPage page) {
TextPageBlock previous = new TextPageBlock();
ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator();
CleanRulings usedRulings = page.getCleanRulings().withoutTextRulings();
while (itty.hasNext()) {
AbstractPageBlock block = itty.next();
if (block instanceof TablePageBlock) {
// A table resets the merge candidate to a fresh block, so text after the table is not compared with text
// before it — assumes a new TextPageBlock has no sequences and therefore fails the guard below; TODO confirm.
previous = new TextPageBlock();
continue;
}
// NOTE(review): assumes every non-table block is a TextPageBlock; any other subtype would fail this cast.
TextPageBlock current = (TextPageBlock) block;
if (previous != null && !previous.getSequences().isEmpty()) {
// Never merge across a change of text direction or across a ruling; current becomes the new candidate.
if (current.getDir() != previous.getDir() || usedRulings.lineBetween(current, previous)) {
previous = current;
continue;
}
// intersectsY holds and the horizontal distance is below 50. This is the only rule that passes true as the
// last argument; the others pass previous.isToDuplicate() — presumably the toDuplicate flag of the merged
// block, confirm against combineBlocksAndResetIterator.
if (current.intersectsY(previous) && current.horizontalDistance(previous) < 50) {
previous = combineBlocksAndResetIterator(previous, current, itty, true);
continue;
}
if (isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(previous, current, page)) {
previous = combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
continue;
}
// Blocks that intersect (no thresholds given) are merged.
if (previous.intersects(current)) {
previous = combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
continue;
}
// merge headlines
// Highest font sizes differ by less than 1.1 and are both above 12, the most popular word style is equal, and the
// blocks intersect when current's most popular word height is used for both thresholds. The direction
// comparison repeats the check at the top of this branch.
if (current.getDir() == previous.getDir() && (Math.abs(current.getHighestFontSize() - previous.getHighestFontSize()) < 1.1f
&& current.getHighestFontSize() > 12
&& previous.getHighestFontSize() > 12
&& current.getMostPopularWordStyle().equals(previous.getMostPopularWordStyle())
&& current.intersects(previous, current.getMostPopularWordHeight(), current.getMostPopularWordHeight()))) {
previous = combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
continue;
}
}
// No rule matched (or there was no usable candidate): current becomes the candidate for the next block.
previous = current;
}
// Final pass — presumably (xThreshold, yThreshold) = (0, 6.5); confirm the parameter order of mergeIntersectingBlocks.
mergeIntersectingBlocks(page, usedRulings, 0, 6.5f);
}
public void combineBlocksSpecial(ClassificationPage page) {
TextPageBlock previous = new TextPageBlock(); TextPageBlock previous = new TextPageBlock();
ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator(); ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator();
@ -264,7 +321,7 @@ public class DocstrumBlockificationService {
continue; continue;
} }
if (current.getDir() == inner.getDir() && current.intersects(inner, yThreshold, xThreshold)) { if (current.getDir() == inner.getDir() && current.intersects(inner, xThreshold, yThreshold)) {
boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate(); boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();
current.getSequences().addAll(inner.getSequences()); current.getSequences().addAll(inner.getSequences());

View File

@ -62,6 +62,7 @@ public class DocuMineClassificationService {
textBlock.setClassification(PageBlockType.OTHER); textBlock.setClassification(PageBlockType.OTHER);
return; return;
} }
/*
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() || PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular()) .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())
@ -73,7 +74,8 @@ public class DocuMineClassificationService {
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular()) .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())
) { ) {
textBlock.setClassification(PageBlockType.FOOTER); textBlock.setClassification(PageBlockType.FOOTER);
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, } else */
if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks() document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
.size() == 1)) { .size() == 1)) {
if (!Pattern.matches("[0-9]+", textBlock.toString())) { if (!Pattern.matches("[0-9]+", textBlock.toString())) {

View File

@ -181,10 +181,7 @@ public class DocumentGraphFactory {
Page page = context.getPage(textBlocks.get(0).getPage()); Page page = context.getPage(textBlocks.get(0).getPage());
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build(); Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), footer, context, page);
footer,
context,
page);
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer); List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
footer.setTreeId(tocId); footer.setTreeId(tocId);
footer.setLeafTextBlock(textBlock); footer.setLeafTextBlock(textBlock);
@ -273,7 +270,8 @@ public class DocumentGraphFactory {
return pages.keySet() return pages.keySet()
.stream() .stream()
.filter(page -> page.getNumber() == pageIndex) .filter(page -> page.getNumber() == pageIndex)
.findFirst().orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex))); .findFirst()
.orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex)));
} }
} }

View File

@ -117,7 +117,7 @@ public class SectionNodeFactory {
if (abstractPageBlock instanceof TextPageBlock) { if (abstractPageBlock instanceof TextPageBlock) {
switch (layoutParsingType) { switch (layoutParsingType) {
case REDACT_MANAGER, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> { case REDACT_MANAGER, DOCUMINE, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> {
alreadyMerged.add(abstractPageBlock); alreadyMerged.add(abstractPageBlock);
remainingBlocks.remove(abstractPageBlock); remainingBlocks.remove(abstractPageBlock);
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, new ArrayList<>(), layoutParsingType); DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, new ArrayList<>(), layoutParsingType);

View File

@ -11,8 +11,8 @@ public record PageInformation(Rectangle2D mediabox, int number, int rotationDegr
PDRectangle mediaBox = page.getMediaBox(); PDRectangle mediaBox = page.getMediaBox();
return new PageInformation(new Rectangle2D.Double(mediaBox.getLowerLeftX(), mediaBox.getLowerLeftY(), mediaBox.getWidth(), mediaBox.getHeight()), return new PageInformation(new Rectangle2D.Double(mediaBox.getLowerLeftX(), mediaBox.getLowerLeftY(), mediaBox.getWidth(), mediaBox.getHeight()),
pageNum, pageNum,
page.getRotation()); page.getRotation());
} }
@ -22,6 +22,15 @@ public record PageInformation(Rectangle2D mediabox, int number, int rotationDegr
} }
/**
 * Height of the page with the quarter-turn rotations taken into account.
 *
 * @return {@link #width()} when {@code rotationDegrees} is 90 or 270, otherwise {@link #height()}
 */
public double heightRot() {

    return switch (rotationDegrees) {
        case 90, 270 -> width();
        default -> height();
    };
}
public double width() { public double width() {
return mediabox.getWidth(); return mediabox.getWidth();

View File

@ -39,21 +39,21 @@ public class TextPositionSequenceComparator implements Comparator<TextPositionSe
} }
// get the text direction adjusted coordinates // get the text direction adjusted coordinates
float x1 = pos1.getMinXDirAdj(); double x1 = pos1.getBBox().getX();
float x2 = pos2.getMinXDirAdj(); double x2 = pos2.getBBox().getX();
float pos1YBottom = pos1.getMaxYDirAdj(); double pos1YBottom = pos1.getBBox().getMaxY();
float pos2YBottom = pos2.getMaxYDirAdj(); double pos2YBottom = pos2.getBBox().getMaxY();
// note that the coordinates have been adjusted so 0,0 is in upper left // note that the coordinates have been adjusted so 0,0 is in upper left
float pos1YTop = pos1YBottom - pos1.getTextHeightNoPadding(); double pos1YTop = pos1YBottom - pos1.getBBox().getHeight();
float pos2YTop = pos2YBottom - pos2.getTextHeightNoPadding(); double pos2YTop = pos2YBottom - pos2.getBBox().getHeight();
float yDifference = Math.abs(pos1YBottom - pos2YBottom); double yDifference = Math.abs(pos1YBottom - pos2YBottom);
// we will do a simple tolerance comparison // we will do a simple tolerance comparison
if (yDifference < .1 || pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom || pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom) { if (yDifference < .1 || pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom || pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom) {
return Float.compare(x1, x2); return Double.compare(x1, x2);
} else if (pos1YBottom < pos2YBottom) { } else if (pos1YBottom < pos2YBottom) {
return -1; return -1;
} else { } else {

View File

@ -73,11 +73,11 @@ public class LayoutparsingVisualizations {
boolean active; boolean active;
final Visualizations words = Visualizations.builder().layer(ContentStreams.WORDS).build(); final Visualizations words = Visualizations.builder().layer(ContentStreams.WORDS).build();
final Visualizations lines = Visualizations.builder().layer(ContentStreams.LINES).build(); final Visualizations lines = Visualizations.builder().layer(ContentStreams.LINES).layerVisibilityDefaultValue(false).build();
final Visualizations zones = Visualizations.builder().layer(ContentStreams.ZONES).build(); final Visualizations zones = Visualizations.builder().layer(ContentStreams.ZONES).layerVisibilityDefaultValue(false).build();
final Visualizations mainBody = Visualizations.builder().layer(ContentStreams.MAIN_BODY).build(); final Visualizations mainBody = Visualizations.builder().layer(ContentStreams.MAIN_BODY).build();
final Visualizations clean_rulings = Visualizations.builder().layer(ContentStreams.CLEAN_RULINGS).build(); final Visualizations clean_rulings = Visualizations.builder().layer(ContentStreams.CLEAN_RULINGS).build();
final Visualizations rulings = Visualizations.builder().layer(ContentStreams.RULINGS).build(); final Visualizations rulings = Visualizations.builder().layer(ContentStreams.RULINGS).layerVisibilityDefaultValue(true).build();
final Visualizations cells = Visualizations.builder().layer(ContentStreams.CELLS).build(); final Visualizations cells = Visualizations.builder().layer(ContentStreams.CELLS).build();
final Visualizations markedContent = Visualizations.builder().layer(ContentStreams.MARKED_CONTENT).build(); final Visualizations markedContent = Visualizations.builder().layer(ContentStreams.MARKED_CONTENT).build();
final Visualizations neighbours = Visualizations.builder().layer(ContentStreams.NEIGHBOURS).build(); final Visualizations neighbours = Visualizations.builder().layer(ContentStreams.NEIGHBOURS).build();
@ -139,7 +139,7 @@ public class LayoutparsingVisualizations {
visualizationsOnPage.getColoredLines() visualizationsOnPage.getColoredLines()
.addAll(rulings .addAll(rulings
.stream() .stream()
.map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 0.5f)) .map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 2f))
.toList()); .toList());
} }
@ -180,7 +180,7 @@ public class LayoutparsingVisualizations {
visualizationsOnPage.getColoredRectangles() visualizationsOnPage.getColoredRectangles()
.addAll(zones.stream() .addAll(zones.stream()
.map(BoundingBox::getBBoxInitialUserSpace) .map(BoundingBox::getBBoxInitialUserSpace)
.map(zone -> new ColoredRectangle(zone, ZONES_COLOR, 1)) .map(zone -> new ColoredRectangle(zone, ZONES_COLOR, 0.5f))
.toList()); .toList());
} }

View File

@ -30,21 +30,21 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
@Autowired @Autowired
private LayoutParsingPipeline layoutParsingPipeline; private LayoutParsingPipeline layoutParsingPipeline;
@Disabled
@Test @Test
// @Disabled
public void testLayoutParserEndToEnd() { public void testLayoutParserEndToEnd() {
String filePath = "files/Minimal Examples/RotateTextWithRulingsTestFile.pdf"; String filePath = "/home/kschuettler/Dokumente/TestFiles/certificates/origin/ecm.pdf";
runForFile(filePath); runForFile(filePath);
} }
@Test @Test
@Disabled // @Disabled
@SneakyThrows @SneakyThrows
public void testLayoutParserEndToEndWithFolder() { public void testLayoutParserEndToEndWithFolder() {
String folder = "/home/kschuettler/Dokumente/TestFiles/large number of prod files"; String folder = "/home/kschuettler/Dokumente/TestFiles/certificates/origin";
List<Path> pdfFiles = Files.walk(Path.of(folder)) List<Path> pdfFiles = Files.walk(Path.of(folder))
.filter(path -> path.getFileName().toString().endsWith(".pdf")) .filter(path -> path.getFileName().toString().endsWith(".pdf"))
.sorted(Comparator.comparing(Path::getFileName)) .sorted(Comparator.comparing(Path::getFileName))
@ -70,7 +70,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
file = new File(filePath); file = new File(filePath);
} }
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER, true); LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.CLARIFYND_PARAGRAPH_DEBUG, true);
prepareStorage(layoutParsingRequest, file); prepareStorage(layoutParsingRequest, file);
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest); LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);