RED-7141: Fixed missing spaces in ExpansionTest.pdf
This commit is contained in:
parent
4644803fa8
commit
d32182ece8
@ -263,7 +263,8 @@ public class LayoutParsingPipeline {
|
|||||||
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||||
case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||||
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||||
case DOCSTRUM, REDACT_MANAGER -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells);
|
case DOCSTRUM -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false);
|
||||||
|
case REDACT_MANAGER -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, true);
|
||||||
};
|
};
|
||||||
classificationPage.setCleanRulings(cleanRulings);
|
classificationPage.setCleanRulings(cleanRulings);
|
||||||
classificationPage.setRotation(rotation);
|
classificationPage.setRotation(rotation);
|
||||||
|
|||||||
@ -38,7 +38,7 @@ public class DocstrumBlockificationService {
|
|||||||
static final float THRESHOLD = 1f;
|
static final float THRESHOLD = 1f;
|
||||||
|
|
||||||
|
|
||||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells) {
|
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells, boolean xyOrder) {
|
||||||
|
|
||||||
// Underlines and strikethroughs are also in rulings, but we don't want to split blocks with them, so we use cells.
|
// Underlines and strikethroughs are also in rulings, but we don't want to split blocks with them, so we use cells.
|
||||||
List<Ruling> usedHorizonalRulings = new ArrayList<>();
|
List<Ruling> usedHorizonalRulings = new ArrayList<>();
|
||||||
@ -52,7 +52,7 @@ public class DocstrumBlockificationService {
|
|||||||
});
|
});
|
||||||
|
|
||||||
List<AbstractPageBlock> abstractPageBlocks = new ArrayList<>();
|
List<AbstractPageBlock> abstractPageBlocks = new ArrayList<>();
|
||||||
var zones = docstrumSegmentationService.segmentPage(textPositions);
|
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder);
|
||||||
zones.forEach(zone -> {
|
zones.forEach(zone -> {
|
||||||
|
|
||||||
List<TextPositionSequence> textPositionSequences = new ArrayList<>();
|
List<TextPositionSequence> textPositionSequences = new ArrayList<>();
|
||||||
|
|||||||
@ -29,7 +29,7 @@ public class DocstrumSegmentationService {
|
|||||||
private final ReadingOrderService readingOrderService;
|
private final ReadingOrderService readingOrderService;
|
||||||
|
|
||||||
|
|
||||||
public List<Zone> segmentPage(List<TextPositionSequence> textPositions) {
|
public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean xyOrder) {
|
||||||
|
|
||||||
List<Zone> zones = new ArrayList<>();
|
List<Zone> zones = new ArrayList<>();
|
||||||
zones.addAll(computeZones(textPositions, TextDirection.ZERO));
|
zones.addAll(computeZones(textPositions, TextDirection.ZERO));
|
||||||
@ -37,7 +37,7 @@ public class DocstrumSegmentationService {
|
|||||||
zones.addAll(computeZones(textPositions, TextDirection.HALF_CIRCLE));
|
zones.addAll(computeZones(textPositions, TextDirection.HALF_CIRCLE));
|
||||||
zones.addAll(computeZones(textPositions, TextDirection.THREE_QUARTER_CIRCLE));
|
zones.addAll(computeZones(textPositions, TextDirection.THREE_QUARTER_CIRCLE));
|
||||||
|
|
||||||
return readingOrderService.resolve(zones);
|
return readingOrderService.resolve(zones, xyOrder);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -12,7 +12,7 @@ import lombok.Data;
|
|||||||
@Data
|
@Data
|
||||||
public class Line extends BoundingBox {
|
public class Line extends BoundingBox {
|
||||||
|
|
||||||
private static final double WORD_DISTANCE_MULTIPLIER = 0.2;
|
private static final double WORD_DISTANCE_MULTIPLIER = 0.18;
|
||||||
|
|
||||||
private final double x0;
|
private final double x0;
|
||||||
private final double y0;
|
private final double y0;
|
||||||
|
|||||||
@ -20,12 +20,16 @@ public class ReadingOrderService {
|
|||||||
public static final double MULTI_COLUMN_DETECTION_THRESHOLD = 1.5;
|
public static final double MULTI_COLUMN_DETECTION_THRESHOLD = 1.5;
|
||||||
|
|
||||||
|
|
||||||
public List<Zone> resolve(List<Zone> zones) {
|
public List<Zone> resolve(List<Zone> zones, boolean xyReadingOrder) {
|
||||||
|
|
||||||
if (zones.isEmpty() || zones.size() == 1) {
|
if (zones.isEmpty() || zones.size() == 1) {
|
||||||
return zones;
|
return zones;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (xyReadingOrder) {
|
||||||
|
return resolveSingleColumnReadingOrder(zones);
|
||||||
|
}
|
||||||
|
|
||||||
Map<Long, Integer> histogram = new HashMap<>();
|
Map<Long, Integer> histogram = new HashMap<>();
|
||||||
for (Zone zone : zones) {
|
for (Zone zone : zones) {
|
||||||
long minY = Math.round(zone.getBBox().getMinY());
|
long minY = Math.round(zone.getBBox().getMinY());
|
||||||
@ -35,10 +39,7 @@ public class ReadingOrderService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (histogram.values()
|
if (histogram.values().stream().mapToInt(Integer::intValue).average().orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) {
|
||||||
.stream()
|
|
||||||
.mapToInt(Integer::intValue).average()
|
|
||||||
.orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) {
|
|
||||||
return resolveSingleColumnReadingOrder(zones);
|
return resolveSingleColumnReadingOrder(zones);
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
|
|||||||
@ -26,7 +26,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void testViewerDocument() {
|
public void testViewerDocument() {
|
||||||
|
|
||||||
String fileName = "files/Plenarprotokoll 1 (keine Druchsache!) (1).pdf";
|
String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||||
|
|
||||||
var documentFile = new ClassPathResource(fileName).getFile();
|
var documentFile = new ClassPathResource(fileName).getFile();
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user