RED-7141: Fixed missing spaces in ExpansionTest.pdf
This commit is contained in:
parent
4644803fa8
commit
d32182ece8
@ -263,7 +263,8 @@ public class LayoutParsingPipeline {
|
||||
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case DOCSTRUM, REDACT_MANAGER -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells);
|
||||
case DOCSTRUM -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false);
|
||||
case REDACT_MANAGER -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, true);
|
||||
};
|
||||
classificationPage.setCleanRulings(cleanRulings);
|
||||
classificationPage.setRotation(rotation);
|
||||
|
||||
@ -38,7 +38,7 @@ public class DocstrumBlockificationService {
|
||||
static final float THRESHOLD = 1f;
|
||||
|
||||
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells) {
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells, boolean xyOrder) {
|
||||
|
||||
// Underlines and strikethroughs also appear as rulings, but we don't want them to split blocks, so we use cells instead.
|
||||
List<Ruling> usedHorizonalRulings = new ArrayList<>();
|
||||
@ -52,7 +52,7 @@ public class DocstrumBlockificationService {
|
||||
});
|
||||
|
||||
List<AbstractPageBlock> abstractPageBlocks = new ArrayList<>();
|
||||
var zones = docstrumSegmentationService.segmentPage(textPositions);
|
||||
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder);
|
||||
zones.forEach(zone -> {
|
||||
|
||||
List<TextPositionSequence> textPositionSequences = new ArrayList<>();
|
||||
|
||||
@ -29,7 +29,7 @@ public class DocstrumSegmentationService {
|
||||
private final ReadingOrderService readingOrderService;
|
||||
|
||||
|
||||
public List<Zone> segmentPage(List<TextPositionSequence> textPositions) {
|
||||
public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean xyOrder) {
|
||||
|
||||
List<Zone> zones = new ArrayList<>();
|
||||
zones.addAll(computeZones(textPositions, TextDirection.ZERO));
|
||||
@ -37,7 +37,7 @@ public class DocstrumSegmentationService {
|
||||
zones.addAll(computeZones(textPositions, TextDirection.HALF_CIRCLE));
|
||||
zones.addAll(computeZones(textPositions, TextDirection.THREE_QUARTER_CIRCLE));
|
||||
|
||||
return readingOrderService.resolve(zones);
|
||||
return readingOrderService.resolve(zones, xyOrder);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -12,7 +12,7 @@ import lombok.Data;
|
||||
@Data
|
||||
public class Line extends BoundingBox {
|
||||
|
||||
private static final double WORD_DISTANCE_MULTIPLIER = 0.2;
|
||||
private static final double WORD_DISTANCE_MULTIPLIER = 0.18;
|
||||
|
||||
private final double x0;
|
||||
private final double y0;
|
||||
|
||||
@ -20,12 +20,16 @@ public class ReadingOrderService {
|
||||
public static final double MULTI_COLUMN_DETECTION_THRESHOLD = 1.5;
|
||||
|
||||
|
||||
public List<Zone> resolve(List<Zone> zones) {
|
||||
public List<Zone> resolve(List<Zone> zones, boolean xyReadingOrder) {
|
||||
|
||||
if (zones.isEmpty() || zones.size() == 1) {
|
||||
return zones;
|
||||
}
|
||||
|
||||
if (xyReadingOrder) {
|
||||
return resolveSingleColumnReadingOrder(zones);
|
||||
}
|
||||
|
||||
Map<Long, Integer> histogram = new HashMap<>();
|
||||
for (Zone zone : zones) {
|
||||
long minY = Math.round(zone.getBBox().getMinY());
|
||||
@ -35,10 +39,7 @@ public class ReadingOrderService {
|
||||
}
|
||||
}
|
||||
|
||||
if (histogram.values()
|
||||
.stream()
|
||||
.mapToInt(Integer::intValue).average()
|
||||
.orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) {
|
||||
if (histogram.values().stream().mapToInt(Integer::intValue).average().orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) {
|
||||
return resolveSingleColumnReadingOrder(zones);
|
||||
} else {
|
||||
|
||||
@ -51,7 +52,7 @@ public class ReadingOrderService {
|
||||
private static List<Zone> resolveSingleColumnReadingOrder(List<Zone> zones) {
|
||||
|
||||
zones.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
return zones;
|
||||
}
|
||||
|
||||
@ -89,13 +90,13 @@ public class ReadingOrderService {
|
||||
}
|
||||
|
||||
leftOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
|
||||
rightOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
|
||||
middle.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||
|
||||
List<Zone> leftNotIntersecting = new ArrayList<>();
|
||||
for (Zone leftZone : leftOf) {
|
||||
|
||||
@ -26,7 +26,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
@SneakyThrows
|
||||
public void testViewerDocument() {
|
||||
|
||||
String fileName = "files/Plenarprotokoll 1 (keine Druchsache!) (1).pdf";
|
||||
String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
|
||||
var documentFile = new ClassPathResource(fileName).getFile();
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user