RED-7141: Fixed more overlap problems when merging intersecting text blocks in Docstrum blockification

This commit is contained in:
Dominique Eifländer 2024-03-14 16:30:52 +01:00
parent c55984aa67
commit 1d765a6baa
3 changed files with 14 additions and 11 deletions

View File

@ -54,7 +54,11 @@ public class DocstrumBlockificationService {
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder);
var pageBlocks = toAbstractPageBlocks(zones, usedHorizonalRulings, usedVerticalRulings, xyOrder);
return new ClassificationPage(pageBlocks);
var classificationPage = new ClassificationPage(pageBlocks);
mergeIntersectingBlocks(classificationPage.getTextBlocks(), 0, 0);
return classificationPage;
}
@ -91,8 +95,6 @@ public class DocstrumBlockificationService {
public void combineBlocks(ClassificationPage page) {
mergeIntersectingBlocks(page.getTextBlocks(), 0, 0);
TextPageBlock previous = new TextPageBlock();
ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator();
while (itty.hasNext()) {
@ -112,22 +114,22 @@ public class DocstrumBlockificationService {
}
if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, page)) {
combineBlocksAndResetIterator(previous, current, itty, true);
previous = combineBlocksAndResetIterator(previous, current, itty, true);
continue;
}
if (previous.almostIntersects(current, 0, 0)) {
combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
previous = combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
continue;
}
if (isSameTopOrBottomWithPreviousHavingMaxTwoLinesAndCurrentThanOneAndMax4OtherBlocksOnHeight(previous, current, page)) {
combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
previous = combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
continue;
}
if (isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(previous, current, page)) {
combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
previous = combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
continue;
}
@ -135,7 +137,7 @@ public class DocstrumBlockificationService {
previous = current;
}
mergeIntersectingBlocks(page.getTextBlocks(), 0, 0);
mergeIntersectingBlocks(page.getTextBlocks(), 0, 6.5f);
}
@ -165,7 +167,7 @@ public class DocstrumBlockificationService {
}
private void combineBlocksAndResetIterator(TextPageBlock previous, TextPageBlock current, ListIterator<AbstractPageBlock> itty, boolean toDuplicate) {
private TextPageBlock combineBlocksAndResetIterator(TextPageBlock previous, TextPageBlock current, ListIterator<AbstractPageBlock> itty, boolean toDuplicate) {
previous.getSequences().addAll(current.getSequences());
previous = buildTextBlock(previous.getSequences(), 0);
@ -174,6 +176,7 @@ public class DocstrumBlockificationService {
itty.previous();
itty.set(previous);
itty.next();
return previous;
}

View File

@ -91,7 +91,7 @@ public class SectionNodeFactory {
if (abstractPageBlock instanceof TextPageBlock) {
switch (layoutParsingType) {
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> {
case REDACT_MANAGER, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> {
alreadyMerged.add(abstractPageBlock);
remainingBlocks.remove(abstractPageBlock);
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, new ArrayList<>());

View File

@ -27,7 +27,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
@SneakyThrows
public void testViewerDocument() {
String fileName = "files/new/270 rotated text on non rotated pages.pdf";
String fileName = "files/100 Trinexapac-ethyl_RAR_20_Volume_3CP_B-9_ 2018-01-10.pdf";
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
var documentFile = new ClassPathResource(fileName).getFile();