RED-8773 - Fix images not appearing on a specific file

This commit is contained in:
Andrei Isvoran 2024-04-03 10:21:45 +03:00
parent 1ca02f72c8
commit 34b260bb60
3 changed files with 122 additions and 51 deletions

View File

@ -49,9 +49,14 @@ public class DocumentGraphFactory {
Document documentGraph = new Document();
Context context = new Context(documentGraph);
document.getPages().forEach(context::buildAndAddPageWithCounter);
document.getSections().stream().flatMap(section -> section.getImages().stream()).forEach(image -> context.getImages().add(image));
addSections(document, context);
document.getPages()
.forEach(context::buildAndAddPageWithCounter);
document.getSections()
.stream()
.flatMap(section -> section.getImages()
.stream())
.forEach(image -> context.getImages().add(image));
addSections(document, context, documentGraph);
addHeaderAndFooterToEachPage(document, context);
documentGraph.setNumberOfPages(context.pages.size());
@ -62,9 +67,10 @@ public class DocumentGraphFactory {
}
/**
 * Walks every section of the classification document and hands its non-empty page blocks and its
 * images to {@code SectionNodeFactory.addSection}, passing {@code null} as the parent node.
 *
 * NOTE(review): this span comes from a diff view without +/- markers; each changed line appears
 * twice (pre-change line first, then its replacement). The replacement additionally forwards the
 * {@code Document} graph node to {@code addSection}.
 */
private void addSections(ClassificationDocument document, Context context) {
private void addSections(ClassificationDocument classificationDocument, Context context, Document document) {
document.getSections().forEach(section -> SectionNodeFactory.addSection(null, section.getNonEmptyPageBlocks(), section.getImages(), context));
classificationDocument.getSections()
.forEach(section -> SectionNodeFactory.addSection(null, section.getNonEmptyPageBlocks(), section.getImages(), context, document));
}
@ -74,9 +80,11 @@ public class DocumentGraphFactory {
GenericSemanticNode node;
if (originalTextBlock.isHeadline()) {
node = Headline.builder().documentTree(context.getDocumentTree()).build();
node = Headline.builder().documentTree(context.getDocumentTree())
.build();
} else {
node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
node = Paragraph.builder().documentTree(context.getDocumentTree())
.build();
}
page.getMainBody().add(node);
@ -93,6 +101,22 @@ public class DocumentGraphFactory {
/**
 * Creates an image node for the classified image (via {@code createImage}) and registers it in the
 * document tree as a child of the given section; the tree id returned by the document tree is
 * stored on the image node.
 *
 * @param section parent node under which the image is registered
 * @param image   classified image to convert into a graph node
 * @param context build context providing the document tree
 */
public void addImage(Section section, ClassifiedImage image, Context context) {
Image imageNode = createImage(image, context);
List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(section, imageNode);
imageNode.setTreeId(treeId);
}
/**
 * Creates an image node for the classified image (via {@code createImage}) and registers it in the
 * document tree as a child of the document node itself rather than of a section; the returned
 * tree id is stored on the image node.
 *
 * @param document parent node under which the image is registered
 * @param image    classified image to convert into a graph node
 * @param context  build context providing the document tree
 */
public void addImage(Document document, ClassifiedImage image, Context context) {
Image imageNode = createImage(image, context);
List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(document, imageNode);
imageNode.setTreeId(treeId);
}
private Image createImage(ClassifiedImage image, Context context) {
Rectangle2D position = image.getPosition();
Page page = context.getPage(image.getPage());
Image imageNode = Image.builder()
@ -104,9 +128,7 @@ public class DocumentGraphFactory {
.documentTree(context.getDocumentTree())
.build();
page.getMainBody().add(imageNode);
List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(section, imageNode);
imageNode.setTreeId(treeId);
return imageNode;
}
@ -145,11 +167,12 @@ public class DocumentGraphFactory {
private void addFooter(List<TextPageBlock> textBlocks, Context context) {
Page page = context.getPage(textBlocks.get(0).getPage());
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
Footer footer = Footer.builder().documentTree(context.getDocumentTree())
.build();
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks),
footer,
context,
page);
footer,
context,
page);
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
footer.setTreeId(tocId);
footer.setLeafTextBlock(textBlock);
@ -160,7 +183,8 @@ public class DocumentGraphFactory {
public void addHeader(List<TextPageBlock> textBlocks, Context context) {
Page page = context.getPage(textBlocks.get(0).getPage());
Header header = Header.builder().documentTree(context.getDocumentTree()).build();
Header header = Header.builder().documentTree(context.getDocumentTree())
.build();
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), header, 0, page);
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
header.setTreeId(tocId);
@ -172,7 +196,8 @@ public class DocumentGraphFactory {
private void addEmptyFooter(int pageIndex, Context context) {
Page page = context.getPage(pageIndex);
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
Footer footer = Footer.builder().documentTree(context.getDocumentTree())
.build();
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(footer, context, page);
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
footer.setTreeId(tocId);
@ -184,7 +209,8 @@ public class DocumentGraphFactory {
private void addEmptyHeader(int pageIndex, Context context) {
Page page = context.getPage(pageIndex);
Header header = Header.builder().documentTree(context.getDocumentTree()).build();
Header header = Header.builder().documentTree(context.getDocumentTree())
.build();
AtomicTextBlock textBlock = context.textBlockFactory.emptyTextBlock(header, 0, page);
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(header);
header.setTreeId(tocId);

View File

@ -11,6 +11,7 @@ import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
@ -24,27 +25,46 @@ import lombok.experimental.UtilityClass;
@UtilityClass
public class SectionNodeFactory {
/**
 * Builds a {@code Section} node from the given page blocks and attaches the given images.
 *
 * Behaviour visible in this block:
 * - images present but no page blocks: every distinct image is added directly under the document
 *   node and the method returns without creating a section;
 * - no page blocks (and no images): nothing happens;
 * - otherwise a section is created, added to {@code context.getSections()}, registered for every
 *   page number that occurs in the blocks, and given a tree id obtained from {@code getTreeId};
 *   a leading headline is added first, then the remaining blocks are either split into
 *   sub-sections (when tables and text blocks are mixed) or added directly; finally every
 *   distinct image is added under the new section.
 *
 * The {@code pageBlocks} list may be modified: {@code addFirstHeadlineDirectlyToSection} removes
 * its first element when that element is a headline.
 *
 * NOTE(review): this span comes from a diff view without +/- markers; each changed line appears
 * twice (pre-change line first, then its replacement).
 *
 * @param parentNode passed to {@code getTreeId} and, on recursion, set to the enclosing section
 * @param pageBlocks blocks that make up the section
 * @param images     images belonging to the section; sub-section recursion passes an empty list
 * @param context    build context holding the document tree and the section list
 * @param document   document node used as parent for images that have no accompanying blocks
 */
public void addSection(GenericSemanticNode parentNode, List<AbstractPageBlock> pageBlocks, List<ClassifiedImage> images, DocumentGraphFactory.Context context) {
public void addSection(GenericSemanticNode parentNode,
List<AbstractPageBlock> pageBlocks,
List<ClassifiedImage> images,
DocumentGraphFactory.Context context,
Document document) {
// This is for the case where we have images on a page without any text/footer/header.
// The pageBlocks list is empty, but we still need to add those images to the document.
if (!images.isEmpty() && pageBlocks.isEmpty()) {
images.stream()
.distinct()
.forEach(image -> DocumentGraphFactory.addImage(document, image, context));
return;
}
// Nothing to build: no blocks and (given the check above) no images either.
if (pageBlocks.isEmpty()) {
return;
}
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream().collect(groupingBy(AbstractPageBlock::getPage));
Section section = Section.builder().documentTree(context.getDocumentTree()).build();
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream()
.collect(groupingBy(AbstractPageBlock::getPage));
Section section = Section.builder().documentTree(context.getDocumentTree())
.build();
context.getSections().add(section);
// Only the page numbers of the grouping are used; the grouped block lists themselves are not read here.
blocksPerPage.keySet().forEach(pageNumber -> addSectionNodeToPageNode(context, section, pageNumber));
blocksPerPage.keySet()
.forEach(pageNumber -> addSectionNodeToPageNode(context, section, pageNumber));
section.setTreeId(getTreeId(parentNode, context, section));
// May remove the first element of pageBlocks (see addFirstHeadlineDirectlyToSection).
addFirstHeadlineDirectlyToSection(pageBlocks, context, section);
addFirstHeadlineDirectlyToSection(pageBlocks, context, section, document);
if (containsTablesAndTextBlocks(pageBlocks)) {
// Mixed content: recurse once per sub-list, with the new section as parent and without images.
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(section, subSectionPageBlocks, emptyList(), context));
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(section, subSectionPageBlocks, emptyList(), context, document));
} else {
addTablesAndParagraphsAndHeadlinesToSection(pageBlocks, context, section);
addTablesAndParagraphsAndHeadlinesToSection(pageBlocks, context, section, document);
}
// Images are attached to the section after all block content has been added.
images.stream().distinct().forEach(image -> DocumentGraphFactory.addImage(section, image, context));
images.stream()
.distinct()
.forEach(image -> DocumentGraphFactory.addImage(section, image, context));
}
@ -58,16 +78,16 @@ public class SectionNodeFactory {
}
/**
 * If the first page block is a headline, adds it to the section on its own and then removes it
 * from {@code pageBlocks}, so later processing of the list no longer sees it.
 *
 * The list is accessed with {@code get(0)} unconditionally, so it must not be empty, and it is
 * modified in place with {@code remove(0)}, so it must be modifiable whenever the first block is
 * a headline.
 *
 * NOTE(review): the TableNodeFactory call to {@code SectionNodeFactory.addSection} visible in this
 * commit passes a list produced by {@code Stream.toList()}, which is unmodifiable, and does so only
 * when the first text block is a headline; confirm that {@code remove(0)} cannot throw
 * {@code UnsupportedOperationException} on that path.
 *
 * NOTE(review): this span comes from a diff view without +/- markers; each changed line appears
 * twice (pre-change line first, then its replacement).
 */
private void addFirstHeadlineDirectlyToSection(List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section) {
private void addFirstHeadlineDirectlyToSection(List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section, Document document) {
if (pageBlocks.get(0).isHeadline()) {
addTablesAndParagraphsAndHeadlinesToSection(List.of(pageBlocks.get(0)), context, section);
addTablesAndParagraphsAndHeadlinesToSection(List.of(pageBlocks.get(0)), context, section, document);
pageBlocks.remove(0);
}
}
private void addTablesAndParagraphsAndHeadlinesToSection(List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section) {
private void addTablesAndParagraphsAndHeadlinesToSection(List<AbstractPageBlock> pageBlocks, DocumentGraphFactory.Context context, Section section, Document document) {
Set<AbstractPageBlock> alreadyMerged = new HashSet<>();
List<AbstractPageBlock> remainingBlocks = new LinkedList<>(pageBlocks);
@ -86,7 +106,7 @@ public class SectionNodeFactory {
} else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) {
List<TablePageBlock> tablesToMerge = TableMergingUtility.findConsecutiveTablesWithSameColCountAndSameHeaders(tablePageBlock, remainingBlocks);
alreadyMerged.addAll(tablesToMerge);
TableNodeFactory.addTable(section, tablesToMerge, context);
TableNodeFactory.addTable(section, tablesToMerge, context, document);
} else {
throw new RuntimeException(format("Unhandled AbstractPageBlockType %s!", abstractPageBlock.getClass()));
}
@ -96,7 +116,9 @@ public class SectionNodeFactory {
/**
 * Returns {@code true} only when the list holds at least one {@code TablePageBlock} and at least
 * one {@code TextPageBlock}; a list with only one of the two kinds (or an empty list) yields
 * {@code false}.
 *
 * NOTE(review): this span comes from a diff view without +/- markers; the single-line return is
 * the pre-change line and the wrapped return below it is its replacement.
 */
private boolean containsTablesAndTextBlocks(List<AbstractPageBlock> pageBlocks) {
return pageBlocks.stream().anyMatch(pageBlock -> pageBlock instanceof TablePageBlock) && pageBlocks.stream().anyMatch(pageBlock -> pageBlock instanceof TextPageBlock);
return pageBlocks.stream()
.anyMatch(pageBlock -> pageBlock instanceof TablePageBlock) && pageBlocks.stream()
.anyMatch(pageBlock -> pageBlock instanceof TextPageBlock);
}
@ -112,7 +134,9 @@ public class SectionNodeFactory {
List<List<AbstractPageBlock>> splitList = splitIntoCoherentList(pageBlocks);
movePrecedingHeadlineToTableList(splitList);
return splitList.stream().filter(list -> !list.isEmpty()).toList();
return splitList.stream()
.filter(list -> !list.isEmpty())
.toList();
}
@ -133,7 +157,8 @@ public class SectionNodeFactory {
/**
 * Returns {@code true} when every element of the list is a {@code TablePageBlock}. Because
 * {@code allMatch} is used, an empty list also yields {@code true}.
 *
 * NOTE(review): this span comes from a diff view without +/- markers; the single-line return is
 * the pre-change line and the wrapped return below it is its replacement.
 */
private boolean listIsTablesOnly(List<AbstractPageBlock> abstractPageBlocks) {
return abstractPageBlocks.stream().allMatch(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock);
return abstractPageBlocks.stream()
.allMatch(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock);
}

View File

@ -8,6 +8,7 @@ import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
@ -27,23 +28,26 @@ public class TableNodeFactory {
public final double TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD = 0.05;
/**
 * Merges the given table blocks into a single {@code Table} node and registers it in the document
 * tree as a child of {@code parentNode}.
 *
 * Steps visible in this block: page numbers are set in the cells, the pages touched by the tables
 * are resolved through the context, the rows of all tables are concatenated in list order, the
 * table node is built (column count taken from the first merged row, or 0 when there are no
 * rows), the table is added to every affected page, its cells are created, and finally the first
 * row is marked as header row if no cell is a header.
 *
 * NOTE(review): this span comes from a diff view without +/- markers; each changed statement
 * appears twice (pre-change form first, then its replacement).
 *
 * @param parentNode    node under which the table is registered in the document tree
 * @param tablesToMerge table blocks whose rows are combined into one table
 * @param context       build context providing pages and the document tree
 * @param document      document node, forwarded to cell creation
 */
public void addTable(GenericSemanticNode parentNode, List<TablePageBlock> tablesToMerge, DocumentGraphFactory.Context context) {
public void addTable(GenericSemanticNode parentNode, List<TablePageBlock> tablesToMerge, DocumentGraphFactory.Context context, Document document) {
setPageNumberInCells(tablesToMerge);
Set<Page> pages = tablesToMerge.stream().map(AbstractPageBlock::getPage).map(context::getPage).collect(Collectors.toSet());
List<List<Cell>> mergedRows = tablesToMerge.stream().map(TablePageBlock::getRows).flatMap(Collection::stream).toList();
Set<Page> pages = tablesToMerge.stream()
.map(AbstractPageBlock::getPage)
.map(context::getPage)
.collect(Collectors.toSet());
List<List<Cell>> mergedRows = tablesToMerge.stream()
.map(TablePageBlock::getRows)
.flatMap(Collection::stream)
.toList();
Table table = Table.builder()
.documentTree(context.getDocumentTree())
.numberOfCols(mergedRows.isEmpty() ? 0 : mergedRows.get(0).size())
.numberOfRows(mergedRows.size())
Table table = Table.builder().documentTree(context.getDocumentTree()).numberOfCols(mergedRows.isEmpty() ? 0 : mergedRows.get(0).size()).numberOfRows(mergedRows.size())
.build();
pages.forEach(page -> addTableToPage(page, parentNode, table));
List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, table);
table.setTreeId(treeId);
addTableCells(mergedRows, table, context);
addTableCells(mergedRows, table, context, document);
// Runs after the cells exist, so the header check sees the cells created above.
ifTableHasNoHeadersSetFirstRowAsHeaders(table);
}
@ -63,7 +67,8 @@ public class TableNodeFactory {
/**
 * For every text block of the cell whose page number is 0, overwrites the page number with the
 * page number of the enclosing table; text blocks with any other page number are left untouched.
 *
 * NOTE(review): this span comes from a diff view without +/- markers; the first stream line is
 * the pre-change form and the two lines after it are its replacement.
 */
private void setPageNumberInTextBlocksWithPageNumberSetTo0(TablePageBlock table, Cell cell) {
cell.getTextBlocks().stream()//
cell.getTextBlocks()
.stream()//
.filter(tb -> tb.getPage() == 0)//
.forEach(tb -> tb.setPage(table.getPage()));
}
@ -82,28 +87,32 @@ public class TableNodeFactory {
/**
 * If {@code table.streamHeaders()} yields no element, marks every cell returned by
 * {@code table.streamRow(0)} as a header cell; otherwise the table is left unchanged.
 *
 * NOTE(review): this span comes from a diff view without +/- markers; each changed statement
 * appears twice (pre-change form first, then its replacement).
 */
private void ifTableHasNoHeadersSetFirstRowAsHeaders(Table table) {
if (table.streamHeaders().findAny().isEmpty()) {
table.streamRow(0).forEach(tableCellNode -> tableCellNode.setHeader(true));
if (table.streamHeaders()
.findAny().isEmpty()) {
table.streamRow(0)
.forEach(tableCellNode -> tableCellNode.setHeader(true));
}
}
/**
 * Iterates the rows in order and, within each row, the cells in order, calling
 * {@code addTableCell} with the cell and its zero-based row and column index. The inner bound is
 * the size of the current row, so rows of different lengths are handled.
 *
 * NOTE(review): this span comes from a diff view without +/- markers; each changed line appears
 * twice (pre-change line first, then its replacement, which also forwards {@code document}).
 */
private void addTableCells(List<List<Cell>> rows, Table table, DocumentGraphFactory.Context context) {
private void addTableCells(List<List<Cell>> rows, Table table, DocumentGraphFactory.Context context, Document document) {
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
for (int colIndex = 0; colIndex < rows.get(rowIndex).size(); colIndex++) {
addTableCell(rows.get(rowIndex).get(colIndex), rowIndex, colIndex, table, context);
addTableCell(rows.get(rowIndex)
.get(colIndex), rowIndex, colIndex, table, context, document);
}
}
}
@SuppressWarnings("PMD.UnusedPrivateMethod") // PMD actually flags this wrong
private void addTableCell(Cell cell, int rowIndex, int colIndex, Table tableNode, DocumentGraphFactory.Context context) {
private void addTableCell(Cell cell, int rowIndex, int colIndex, Table tableNode, DocumentGraphFactory.Context context, Document document) {
Page page = context.getPage(cell.getPageNumber());
TableCell tableCell = TableCell.builder().documentTree(context.getDocumentTree()).row(rowIndex).col(colIndex).header(cell.isHeaderCell()).bBox(cell.getBounds2D()).build();
TableCell tableCell = TableCell.builder().documentTree(context.getDocumentTree()).row(rowIndex).col(colIndex).header(cell.isHeaderCell()).bBox(cell.getBounds2D())
.build();
page.getMainBody().add(tableCell);
List<Integer> treeId = context.getDocumentTree().createNewTableChildEntryAndReturnId(tableNode, tableCell);
@ -113,16 +122,26 @@ public class TableNodeFactory {
if (cell.getTextBlocks().isEmpty()) {
tableCell.setLeafTextBlock(context.getTextBlockFactory().emptyTextBlock(tableNode, context, page));
} else if (cell.getTextBlocks().size() == 1) {
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(cell.getTextBlocks().get(0).getSequences(), tableCell, context, page);
textBlock = context.getTextBlockFactory()
.buildAtomicTextBlock(cell.getTextBlocks()
.get(0).getSequences(), tableCell, context, page);
tableCell.setLeafTextBlock(textBlock);
} else if (firstTextBlockIsHeadline(cell)) {
SectionNodeFactory.addSection(tableCell, cell.getTextBlocks().stream().map(tb -> (AbstractPageBlock) tb).toList(), emptyList(), context);
SectionNodeFactory.addSection(tableCell,
cell.getTextBlocks()
.stream()
.map(tb -> (AbstractPageBlock) tb)
.toList(),
emptyList(),
context,
document);
} else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) {
List<TextPositionSequence> sequences = TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(cell.getTextBlocks());
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(sequences, tableCell, context, page);
tableCell.setLeafTextBlock(textBlock);
} else {
cell.getTextBlocks().forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList()));
cell.getTextBlocks()
.forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList()));
}
}
@ -135,7 +154,8 @@ public class TableNodeFactory {
/**
 * Reports whether the first text block of the cell is a headline. The first element is read with
 * {@code get(0)} without an emptiness check, so the cell must contain at least one text block;
 * the call site visible in this file only reaches this method after the empty and
 * single-text-block cases have been handled.
 *
 * NOTE(review): this span comes from a diff view without +/- markers; the single-line return is
 * the pre-change line and the wrapped return below it is its replacement.
 */
private boolean firstTextBlockIsHeadline(Cell cell) {
return cell.getTextBlocks().get(0).isHeadline();
return cell.getTextBlocks()
.get(0).isHeadline();
}
}