RED-7141: Fixed missing spaces in ExpansionTest.pdf

This commit is contained in:
Dominique Eifländer 2024-03-05 10:32:21 +01:00
parent 4644803fa8
commit d32182ece8
6 changed files with 18 additions and 16 deletions

View File

@ -263,7 +263,8 @@ public class LayoutParsingPipeline {
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical()); case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case DOCSTRUM, REDACT_MANAGER -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells); case DOCSTRUM -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false);
case REDACT_MANAGER -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, true);
}; };
classificationPage.setCleanRulings(cleanRulings); classificationPage.setCleanRulings(cleanRulings);
classificationPage.setRotation(rotation); classificationPage.setRotation(rotation);

View File

@ -38,7 +38,7 @@ public class DocstrumBlockificationService {
static final float THRESHOLD = 1f; static final float THRESHOLD = 1f;
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells) { public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells, boolean xyOrder) {
// Underlines and strikethroughs also appear as rulings, but we don't want them to split blocks, so we use cells. // Underlines and strikethroughs also appear as rulings, but we don't want them to split blocks, so we use cells.
List<Ruling> usedHorizonalRulings = new ArrayList<>(); List<Ruling> usedHorizonalRulings = new ArrayList<>();
@ -52,7 +52,7 @@ public class DocstrumBlockificationService {
}); });
List<AbstractPageBlock> abstractPageBlocks = new ArrayList<>(); List<AbstractPageBlock> abstractPageBlocks = new ArrayList<>();
var zones = docstrumSegmentationService.segmentPage(textPositions); var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder);
zones.forEach(zone -> { zones.forEach(zone -> {
List<TextPositionSequence> textPositionSequences = new ArrayList<>(); List<TextPositionSequence> textPositionSequences = new ArrayList<>();

View File

@ -29,7 +29,7 @@ public class DocstrumSegmentationService {
private final ReadingOrderService readingOrderService; private final ReadingOrderService readingOrderService;
public List<Zone> segmentPage(List<TextPositionSequence> textPositions) { public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean xyOrder) {
List<Zone> zones = new ArrayList<>(); List<Zone> zones = new ArrayList<>();
zones.addAll(computeZones(textPositions, TextDirection.ZERO)); zones.addAll(computeZones(textPositions, TextDirection.ZERO));
@ -37,7 +37,7 @@ public class DocstrumSegmentationService {
zones.addAll(computeZones(textPositions, TextDirection.HALF_CIRCLE)); zones.addAll(computeZones(textPositions, TextDirection.HALF_CIRCLE));
zones.addAll(computeZones(textPositions, TextDirection.THREE_QUARTER_CIRCLE)); zones.addAll(computeZones(textPositions, TextDirection.THREE_QUARTER_CIRCLE));
return readingOrderService.resolve(zones); return readingOrderService.resolve(zones, xyOrder);
} }

View File

@ -12,7 +12,7 @@ import lombok.Data;
@Data @Data
public class Line extends BoundingBox { public class Line extends BoundingBox {
private static final double WORD_DISTANCE_MULTIPLIER = 0.2; private static final double WORD_DISTANCE_MULTIPLIER = 0.18;
private final double x0; private final double x0;
private final double y0; private final double y0;

View File

@ -20,12 +20,16 @@ public class ReadingOrderService {
public static final double MULTI_COLUMN_DETECTION_THRESHOLD = 1.5; public static final double MULTI_COLUMN_DETECTION_THRESHOLD = 1.5;
public List<Zone> resolve(List<Zone> zones) { public List<Zone> resolve(List<Zone> zones, boolean xyReadingOrder) {
if (zones.isEmpty() || zones.size() == 1) { if (zones.isEmpty() || zones.size() == 1) {
return zones; return zones;
} }
if (xyReadingOrder) {
return resolveSingleColumnReadingOrder(zones);
}
Map<Long, Integer> histogram = new HashMap<>(); Map<Long, Integer> histogram = new HashMap<>();
for (Zone zone : zones) { for (Zone zone : zones) {
long minY = Math.round(zone.getBBox().getMinY()); long minY = Math.round(zone.getBBox().getMinY());
@ -35,10 +39,7 @@ public class ReadingOrderService {
} }
} }
if (histogram.values() if (histogram.values().stream().mapToInt(Integer::intValue).average().orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) {
.stream()
.mapToInt(Integer::intValue).average()
.orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) {
return resolveSingleColumnReadingOrder(zones); return resolveSingleColumnReadingOrder(zones);
} else { } else {
@ -51,7 +52,7 @@ public class ReadingOrderService {
private static List<Zone> resolveSingleColumnReadingOrder(List<Zone> zones) { private static List<Zone> resolveSingleColumnReadingOrder(List<Zone> zones) {
zones.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) zones.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
return zones; return zones;
} }
@ -89,13 +90,13 @@ public class ReadingOrderService {
} }
leftOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) leftOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
rightOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) rightOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
middle.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)) middle.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))); .thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
List<Zone> leftNotIntersecting = new ArrayList<>(); List<Zone> leftNotIntersecting = new ArrayList<>();
for (Zone leftZone : leftOf) { for (Zone leftZone : leftOf) {

View File

@ -26,7 +26,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
@SneakyThrows @SneakyThrows
public void testViewerDocument() { public void testViewerDocument() {
String fileName = "files/Plenarprotokoll 1 (keine Druchsache!) (1).pdf"; String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
var documentFile = new ClassPathResource(fileName).getFile(); var documentFile = new ClassPathResource(fileName).getFile();