Merge branch 'RED-7141' into 'main'
RED-7141: Fixed more overlap problems. See merge request fforesight/layout-parser!119
This commit is contained in:
commit
ac850c2626
@ -54,7 +54,11 @@ public class DocstrumBlockificationService {
|
|||||||
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder);
|
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder);
|
||||||
var pageBlocks = toAbstractPageBlocks(zones, usedHorizonalRulings, usedVerticalRulings, xyOrder);
|
var pageBlocks = toAbstractPageBlocks(zones, usedHorizonalRulings, usedVerticalRulings, xyOrder);
|
||||||
|
|
||||||
return new ClassificationPage(pageBlocks);
|
var classificationPage = new ClassificationPage(pageBlocks);
|
||||||
|
|
||||||
|
mergeIntersectingBlocks(classificationPage.getTextBlocks(), 0, 0);
|
||||||
|
|
||||||
|
return classificationPage;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -91,8 +95,6 @@ public class DocstrumBlockificationService {
|
|||||||
|
|
||||||
public void combineBlocks(ClassificationPage page) {
|
public void combineBlocks(ClassificationPage page) {
|
||||||
|
|
||||||
mergeIntersectingBlocks(page.getTextBlocks(), 0, 0);
|
|
||||||
|
|
||||||
TextPageBlock previous = new TextPageBlock();
|
TextPageBlock previous = new TextPageBlock();
|
||||||
ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator();
|
ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator();
|
||||||
while (itty.hasNext()) {
|
while (itty.hasNext()) {
|
||||||
@ -112,22 +114,22 @@ public class DocstrumBlockificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, page)) {
|
if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, page)) {
|
||||||
combineBlocksAndResetIterator(previous, current, itty, true);
|
previous = combineBlocksAndResetIterator(previous, current, itty, true);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (previous.almostIntersects(current, 0, 0)) {
|
if (previous.almostIntersects(current, 0, 0)) {
|
||||||
combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
|
previous = combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (isSameTopOrBottomWithPreviousHavingMaxTwoLinesAndCurrentThanOneAndMax4OtherBlocksOnHeight(previous, current, page)) {
|
if (isSameTopOrBottomWithPreviousHavingMaxTwoLinesAndCurrentThanOneAndMax4OtherBlocksOnHeight(previous, current, page)) {
|
||||||
combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
|
previous = combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(previous, current, page)) {
|
if (isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(previous, current, page)) {
|
||||||
combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
|
previous = combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -135,7 +137,7 @@ public class DocstrumBlockificationService {
|
|||||||
previous = current;
|
previous = current;
|
||||||
}
|
}
|
||||||
|
|
||||||
mergeIntersectingBlocks(page.getTextBlocks(), 0, 0);
|
mergeIntersectingBlocks(page.getTextBlocks(), 0, 6.5f);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -165,7 +167,7 @@ public class DocstrumBlockificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void combineBlocksAndResetIterator(TextPageBlock previous, TextPageBlock current, ListIterator<AbstractPageBlock> itty, boolean toDuplicate) {
|
private TextPageBlock combineBlocksAndResetIterator(TextPageBlock previous, TextPageBlock current, ListIterator<AbstractPageBlock> itty, boolean toDuplicate) {
|
||||||
|
|
||||||
previous.getSequences().addAll(current.getSequences());
|
previous.getSequences().addAll(current.getSequences());
|
||||||
previous = buildTextBlock(previous.getSequences(), 0);
|
previous = buildTextBlock(previous.getSequences(), 0);
|
||||||
@ -174,6 +176,7 @@ public class DocstrumBlockificationService {
|
|||||||
itty.previous();
|
itty.previous();
|
||||||
itty.set(previous);
|
itty.set(previous);
|
||||||
itty.next();
|
itty.next();
|
||||||
|
return previous;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -91,7 +91,7 @@ public class SectionNodeFactory {
|
|||||||
if (abstractPageBlock instanceof TextPageBlock) {
|
if (abstractPageBlock instanceof TextPageBlock) {
|
||||||
|
|
||||||
switch (layoutParsingType) {
|
switch (layoutParsingType) {
|
||||||
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> {
|
case REDACT_MANAGER, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> {
|
||||||
alreadyMerged.add(abstractPageBlock);
|
alreadyMerged.add(abstractPageBlock);
|
||||||
remainingBlocks.remove(abstractPageBlock);
|
remainingBlocks.remove(abstractPageBlock);
|
||||||
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, new ArrayList<>());
|
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, new ArrayList<>());
|
||||||
|
|||||||
@ -27,7 +27,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void testViewerDocument() {
|
public void testViewerDocument() {
|
||||||
|
|
||||||
String fileName = "files/new/270 rotated text on non rotated pages.pdf";
|
String fileName = "files/100 Trinexapac-ethyl_RAR_20_Volume_3CP_B-9_ 2018-01-10.pdf";
|
||||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||||
|
|
||||||
var documentFile = new ClassPathResource(fileName).getFile();
|
var documentFile = new ClassPathResource(fileName).getFile();
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user