RED-7141: Fixed missing spaces in ExpansionTest.pdf

This commit is contained in:
Dominique Eifländer 2024-03-05 10:32:21 +01:00
parent 4644803fa8
commit d32182ece8
6 changed files with 18 additions and 16 deletions

View File

@ -263,7 +263,8 @@ public class LayoutParsingPipeline {
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case DOCSTRUM, REDACT_MANAGER -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells);
case DOCSTRUM -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false);
case REDACT_MANAGER -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, true);
};
classificationPage.setCleanRulings(cleanRulings);
classificationPage.setRotation(rotation);

View File

@ -38,7 +38,7 @@ public class DocstrumBlockificationService {
static final float THRESHOLD = 1f;
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells) {
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells, boolean xyOrder) {
// Underlined or strikethrough are also in rulings, but we don't want to split blocks with them, so we use cells.
List<Ruling> usedHorizonalRulings = new ArrayList<>();
@ -52,7 +52,7 @@ public class DocstrumBlockificationService {
});
List<AbstractPageBlock> abstractPageBlocks = new ArrayList<>();
var zones = docstrumSegmentationService.segmentPage(textPositions);
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder);
zones.forEach(zone -> {
List<TextPositionSequence> textPositionSequences = new ArrayList<>();

View File

@ -29,7 +29,7 @@ public class DocstrumSegmentationService {
private final ReadingOrderService readingOrderService;
public List<Zone> segmentPage(List<TextPositionSequence> textPositions) {
public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean xyOrder) {
List<Zone> zones = new ArrayList<>();
zones.addAll(computeZones(textPositions, TextDirection.ZERO));
@ -37,7 +37,7 @@ public class DocstrumSegmentationService {
zones.addAll(computeZones(textPositions, TextDirection.HALF_CIRCLE));
zones.addAll(computeZones(textPositions, TextDirection.THREE_QUARTER_CIRCLE));
return readingOrderService.resolve(zones);
return readingOrderService.resolve(zones, xyOrder);
}

View File

@ -12,7 +12,7 @@ import lombok.Data;
@Data
public class Line extends BoundingBox {
private static final double WORD_DISTANCE_MULTIPLIER = 0.2;
private static final double WORD_DISTANCE_MULTIPLIER = 0.18;
private final double x0;
private final double y0;

View File

@ -20,12 +20,16 @@ public class ReadingOrderService {
public static final double MULTI_COLUMN_DETECTION_THRESHOLD = 1.5;
public List<Zone> resolve(List<Zone> zones) {
public List<Zone> resolve(List<Zone> zones, boolean xyReadingOrder) {
if (zones.isEmpty() || zones.size() == 1) {
return zones;
}
if (xyReadingOrder) {
return resolveSingleColumnReadingOrder(zones);
}
Map<Long, Integer> histogram = new HashMap<>();
for (Zone zone : zones) {
long minY = Math.round(zone.getBBox().getMinY());
@ -35,10 +39,7 @@ public class ReadingOrderService {
}
}
if (histogram.values()
.stream()
.mapToInt(Integer::intValue).average()
.orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) {
if (histogram.values().stream().mapToInt(Integer::intValue).average().orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) {
return resolveSingleColumnReadingOrder(zones);
} else {
@ -51,7 +52,7 @@ public class ReadingOrderService {
private static List<Zone> resolveSingleColumnReadingOrder(List<Zone> zones) {
zones.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
return zones;
}
@ -89,13 +90,13 @@ public class ReadingOrderService {
}
leftOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
rightOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
middle.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
List<Zone> leftNotIntersecting = new ArrayList<>();
for (Zone leftZone : leftOf) {

View File

@ -26,7 +26,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
@SneakyThrows
public void testViewerDocument() {
String fileName = "files/Plenarprotokoll 1 (keine Druchsache!) (1).pdf";
String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
var documentFile = new ClassPathResource(fileName).getFile();