RED-7141: Fixed more overlap problems
This commit is contained in:
parent
c55984aa67
commit
1d765a6baa
@ -54,7 +54,11 @@ public class DocstrumBlockificationService {
|
||||
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder);
|
||||
var pageBlocks = toAbstractPageBlocks(zones, usedHorizonalRulings, usedVerticalRulings, xyOrder);
|
||||
|
||||
return new ClassificationPage(pageBlocks);
|
||||
var classificationPage = new ClassificationPage(pageBlocks);
|
||||
|
||||
mergeIntersectingBlocks(classificationPage.getTextBlocks(), 0, 0);
|
||||
|
||||
return classificationPage;
|
||||
}
|
||||
|
||||
|
||||
@ -91,8 +95,6 @@ public class DocstrumBlockificationService {
|
||||
|
||||
public void combineBlocks(ClassificationPage page) {
|
||||
|
||||
mergeIntersectingBlocks(page.getTextBlocks(), 0, 0);
|
||||
|
||||
TextPageBlock previous = new TextPageBlock();
|
||||
ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator();
|
||||
while (itty.hasNext()) {
|
||||
@ -112,22 +114,22 @@ public class DocstrumBlockificationService {
|
||||
}
|
||||
|
||||
if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, page)) {
|
||||
combineBlocksAndResetIterator(previous, current, itty, true);
|
||||
previous = combineBlocksAndResetIterator(previous, current, itty, true);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (previous.almostIntersects(current, 0, 0)) {
|
||||
combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
|
||||
previous = combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
|
||||
continue;
|
||||
}
|
||||
|
||||
if (isSameTopOrBottomWithPreviousHavingMaxTwoLinesAndCurrentThanOneAndMax4OtherBlocksOnHeight(previous, current, page)) {
|
||||
combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
|
||||
previous = combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
|
||||
continue;
|
||||
}
|
||||
|
||||
if (isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(previous, current, page)) {
|
||||
combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
|
||||
previous = combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -135,7 +137,7 @@ public class DocstrumBlockificationService {
|
||||
previous = current;
|
||||
}
|
||||
|
||||
mergeIntersectingBlocks(page.getTextBlocks(), 0, 0);
|
||||
mergeIntersectingBlocks(page.getTextBlocks(), 0, 6.5f);
|
||||
}
|
||||
|
||||
|
||||
@ -165,7 +167,7 @@ public class DocstrumBlockificationService {
|
||||
}
|
||||
|
||||
|
||||
private void combineBlocksAndResetIterator(TextPageBlock previous, TextPageBlock current, ListIterator<AbstractPageBlock> itty, boolean toDuplicate) {
|
||||
private TextPageBlock combineBlocksAndResetIterator(TextPageBlock previous, TextPageBlock current, ListIterator<AbstractPageBlock> itty, boolean toDuplicate) {
|
||||
|
||||
previous.getSequences().addAll(current.getSequences());
|
||||
previous = buildTextBlock(previous.getSequences(), 0);
|
||||
@ -174,6 +176,7 @@ public class DocstrumBlockificationService {
|
||||
itty.previous();
|
||||
itty.set(previous);
|
||||
itty.next();
|
||||
return previous;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -91,7 +91,7 @@ public class SectionNodeFactory {
|
||||
if (abstractPageBlock instanceof TextPageBlock) {
|
||||
|
||||
switch (layoutParsingType) {
|
||||
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> {
|
||||
case REDACT_MANAGER, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> {
|
||||
alreadyMerged.add(abstractPageBlock);
|
||||
remainingBlocks.remove(abstractPageBlock);
|
||||
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, new ArrayList<>());
|
||||
|
||||
@ -27,7 +27,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
@SneakyThrows
|
||||
public void testViewerDocument() {
|
||||
|
||||
String fileName = "files/new/270 rotated text on non rotated pages.pdf";
|
||||
String fileName = "files/100 Trinexapac-ethyl_RAR_20_Volume_3CP_B-9_ 2018-01-10.pdf";
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
|
||||
var documentFile = new ClassPathResource(fileName).getFile();
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user